diff --git a/openmp/runtime/CMakeLists.txt b/openmp/runtime/CMakeLists.txt
--- a/openmp/runtime/CMakeLists.txt
+++ b/openmp/runtime/CMakeLists.txt
@@ -288,6 +288,13 @@
   set (LIBOMP_USE_VERSION_SYMBOLS FALSE)
 endif()
 
+# Unshackled task support defaults to OFF
+set(LIBOMP_USE_UNSHACKLED_TASK FALSE CACHE BOOL "Use unshackled task?")
+if(WIN32)
+  message(STATUS "Unshackled task is not yet supported on Windows - forcing off")
+  set(LIBOMP_USE_UNSHACKLED_TASK FALSE)
+endif()
+
 # OMPT-support defaults to ON for OpenMP 5.0+ and if the requirements in
 # cmake/config-ix.cmake are fulfilled.
 set(OMPT_DEFAULT FALSE)
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -2236,7 +2236,12 @@
   unsigned priority_specified : 1; /* set if the compiler provides priority
                                       setting for the task */
   unsigned detachable : 1; /* 1 == can detach */
+#if USE_UNSHACKLED_TASK
+  unsigned unshackled : 1; /* 1 == unshackled task */
+  unsigned reserved : 8; /* reserved for compiler use */
+#else
   unsigned reserved : 9; /* reserved for compiler use */
+#endif
 
   /* Library flags */ /* Total library flags must be 16 bits */
   unsigned tasktype : 1; /* task is either explicit(1) or implicit (0) */
@@ -2284,6 +2289,23 @@
   kmp_depnode_t
       *td_depnode; // Pointer to graph node if this task has dependencies
   kmp_task_team_t *td_task_team;
+#if USE_UNSHACKLED_TASK
+  // The task team of the parent task. Usually we could access it via
+  // parent_task->td_task_team, but it is possible that
+  // parent_task->td_task_team is nullptr because of late initialization.
+  // Sometimes we must use this pointer anyway, and since the td_task_team of
+  // the encountering thread is never nullptr, we record it here when the task
+  // is created.
+  kmp_task_team_t *td_parent_task_team;
+  // The global thread id of the encountering thread. We need it because when
+  // a regular task depends on an unshackled task, and the unshackled task
+  // finishes on an unshackled thread, it calls __kmp_release_deps to release
+  // all dependences. If the successor is a regular task, we need to pass the
+  // encountering gtid so that the successor is picked up and executed by its
+  // encountering team instead of the unshackled team.
+  kmp_int32 encountering_gtid;
+#endif
   kmp_int32 td_size_alloc; // The size of task structure, including shareds etc.
 #if defined(KMP_GOMP_COMPAT)
   // 4 or 8 byte integers for the loop bounds in GOMP_taskloop
@@ -2351,10 +2373,20 @@
   kmp_int32 tt_max_threads; // # entries allocated for threads_data array
   kmp_int32 tt_found_proxy_tasks; // found proxy tasks since last barrier
   kmp_int32 tt_untied_task_encountered;
+#if USE_UNSHACKLED_TASK
+  // An unshackled task was encountered in this task team, so we must wait
+  // when waiting on this task team
+  kmp_int32 tt_unshackled_task_encountered;
+#endif
 
   KMP_ALIGN_CACHE
   std::atomic<kmp_int32> tt_unfinished_threads; /* #threads still active */
 
+#if USE_UNSHACKLED_TASK
+  KMP_ALIGN_CACHE
+  std::atomic<kmp_int32> tt_unfinished_unshackled_tasks;
+#endif
+
   KMP_ALIGN_CACHE
   volatile kmp_uint32 tt_active; /* is the team still actively executing tasks */
@@ -2819,6 +2851,9 @@
 extern volatile int __kmp_init_monitor;
 #endif
 extern volatile int __kmp_init_user_locks;
+#if USE_UNSHACKLED_TASK
+extern volatile int __kmp_init_unshackled_threads;
+#endif
 extern int __kmp_init_counter;
 extern int __kmp_root_counter;
 extern int __kmp_version;
@@ -3905,6 +3940,40 @@
 extern void __kmp_omp_display_env(int verbose);
 
+#if USE_UNSHACKLED_TASK
+// 1: the unshackled team is being initialized
+extern volatile int __kmp_init_unshackled;
+// Master thread of the unshackled team
+extern kmp_info_t *__kmp_unshackled_master_thread;
+// Descriptors for the unshackled threads
+extern kmp_info_t **__kmp_unshackled_threads;
+// Number of unshackled threads
+extern int __kmp_unshackled_threads_num;
+// Number of unshackled tasks that have not been executed yet
+extern std::atomic<kmp_int32> __kmp_unexecuted_unshackled_tasks;
+
+extern void __kmp_unshackled_initialize();
+extern void __kmp_unshackled_threads_initz_routine();
+extern void __kmp_do_initialize_unshackled_threads();
+extern void __kmp_unshackled_threads_initz_wait();
+extern void __kmp_unshackled_initz_release();
+extern void __kmp_unshackled_master_thread_wait();
+extern void __kmp_unshackled_worker_thread_wait();
+extern void __kmp_unshackled_worker_thread_signal();
+
+// Check whether a given thread is an unshackled thread
+#define KMP_UNSHACKLED_THREAD(gtid)                                            \
+  ((gtid) >= 1 && (gtid) <= __kmp_unshackled_threads_num)
+
+// Check whether a given thread is an unshackled worker thread, i.e. any
+// unshackled thread except their master
+#define KMP_UNSHACKLED_WORKER_THREAD(gtid)                                     \
+  ((gtid) > 1 && (gtid) <= __kmp_unshackled_threads_num)
+
+// Map a gtid to an unshackled thread. The first unshackled thread, a.k.a. the
+// master thread, is skipped.
+#define KMP_GTID_TO_SHADOW_GTID(gtid)                                         \
+  ((gtid) % (__kmp_unshackled_threads_num - 1) + 2)
+#endif
+
 #ifdef __cplusplus
 }
 #endif
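
As a concrete illustration of the gtid layout described in the comments above (an editorial sketch, not part of the patch): slot 0 holds the initial thread, slots 1 through __kmp_unshackled_threads_num hold the unshackled threads (slot 1 being their master), and KMP_GTID_TO_SHADOW_GTID cycles a regular thread's pushes over the worker slots 2 through __kmp_unshackled_threads_num. The constant 8 mirrors the default the patch hard-codes in kmp_runtime.cpp.

#include <cassert>
#include <iostream>

// Mirrors the runtime's current default; the real value lives in
// __kmp_unshackled_threads_num.
static const int unshackled_threads_num = 8;

// Mirrors KMP_GTID_TO_SHADOW_GTID: skip gtid 0 (initial thread) and gtid 1
// (unshackled master), cycling over the worker slots [2, 8].
static int gtid_to_shadow_gtid(int gtid) {
  return gtid % (unshackled_threads_num - 1) + 2;
}

int main() {
  // Regular OpenMP threads occupy gtids above the unshackled slots.
  for (int gtid = unshackled_threads_num + 1; gtid < 16; ++gtid) {
    int shadow = gtid_to_shadow_gtid(gtid);
    assert(shadow >= 2 && shadow <= unshackled_threads_num);
    std::cout << "regular gtid " << gtid << " -> shadow gtid " << shadow
              << "\n";
  }
  return 0;
}
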
diff --git a/openmp/runtime/src/kmp_config.h.cmake b/openmp/runtime/src/kmp_config.h.cmake
--- a/openmp/runtime/src/kmp_config.h.cmake
+++ b/openmp/runtime/src/kmp_config.h.cmake
@@ -44,6 +44,8 @@
 #define OMPT_DEBUG LIBOMP_OMPT_DEBUG
 #cmakedefine01 LIBOMP_OMPT_SUPPORT
 #define OMPT_SUPPORT LIBOMP_OMPT_SUPPORT
+#cmakedefine01 LIBOMP_USE_UNSHACKLED_TASK
+#define USE_UNSHACKLED_TASK LIBOMP_USE_UNSHACKLED_TASK
 #cmakedefine01 LIBOMP_OMPT_OPTIONAL
 #define OMPT_OPTIONAL LIBOMP_OMPT_OPTIONAL
 #cmakedefine01 LIBOMP_USE_ADAPTIVE_LOCKS
diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp
--- a/openmp/runtime/src/kmp_global.cpp
+++ b/openmp/runtime/src/kmp_global.cpp
@@ -46,11 +46,17 @@
 volatile int __kmp_init_common = FALSE;
 volatile int __kmp_init_middle = FALSE;
 volatile int __kmp_init_parallel = FALSE;
+#if USE_UNSHACKLED_TASK
+volatile int __kmp_init_unshackled = FALSE;
+#endif
 #if KMP_USE_MONITOR
 volatile int __kmp_init_monitor =
     0; /* 1 - launched, 2 - actually started (Windows* OS only) */
 #endif
 volatile int __kmp_init_user_locks = FALSE;
+#if USE_UNSHACKLED_TASK
+volatile int __kmp_init_unshackled_threads = FALSE;
+#endif
 /* list of address of allocated caches for commons */
 kmp_cached_addr_t *__kmp_threadpriv_cache_list = NULL;
diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp
--- a/openmp/runtime/src/kmp_runtime.cpp
+++ b/openmp/runtime/src/kmp_runtime.cpp
@@ -3630,6 +3630,39 @@
     }
   }
 
+#if USE_UNSHACKLED_TASK
+  // When unshackled task is enabled, __kmp_threads is organized as follows:
+  // 0: initial thread, also a regular OpenMP thread.
+  // [1, __kmp_unshackled_threads_num]: slots for unshackled threads.
+  // [__kmp_unshackled_threads_num + 1, __kmp_threads_capacity): slots for
+  // regular OpenMP threads.
+  if (TCR_4(__kmp_init_unshackled_threads)) {
+    // Find an available thread slot for an unshackled thread. Slots for
+    // unshackled threads run from 1 to __kmp_unshackled_threads_num.
+    for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
+                   gtid <= __kmp_unshackled_threads_num;
+         gtid++)
+      ;
+    KMP_ASSERT(gtid <= __kmp_unshackled_threads_num);
+    KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
+                 "unshackled thread: T#%d\n",
+                 gtid));
+  } else {
+    /* find an available thread slot */
+    // Don't reassign the zero slot since we need that to only be used by the
+    // initial thread. Slots for unshackled threads should also be skipped.
+    if (initial_thread && __kmp_threads[0] == NULL) {
+      gtid = 0;
+    } else {
+      for (gtid = __kmp_unshackled_threads_num + 1;
+           TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
+        ;
+    }
+    KA_TRACE(
+        1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
+    KMP_ASSERT(gtid < __kmp_threads_capacity);
+  }
+#else
   /* find an available thread slot */
   /* Don't reassign the zero slot since we need that to only be used by
      initial thread */
@@ -3639,6 +3672,7 @@
   KA_TRACE(1,
            ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
   KMP_ASSERT(gtid < __kmp_threads_capacity);
+#endif
 
   /* update global accounting */
   __kmp_all_nth++;
@@ -4295,9 +4329,22 @@
 #endif
   KMP_MB();
 
+#if USE_UNSHACKLED_TASK
+  for (new_gtid = TCR_4(__kmp_init_unshackled_threads)
+                      ? 1
+                      : __kmp_unshackled_threads_num + 1;
+       TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
+    KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
+  }
+  if (TCR_4(__kmp_init_unshackled_threads)) {
+    KMP_DEBUG_ASSERT(new_gtid <= __kmp_unshackled_threads_num);
+  }
+#else
   for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
     KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
   }
+#endif
 
   /* allocate space for it. */
   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
@@ -6693,6 +6740,11 @@
   __kmp_env_free(&val);
 #endif
 
+#if USE_UNSHACKLED_TASK
+  // TODO: Initialize this value from an environment variable
+  __kmp_unshackled_threads_num = 8;
+#endif
+
   __kmp_threads_capacity =
       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
@@ -6992,6 +7044,45 @@
   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
 }
 
+#if USE_UNSHACKLED_TASK
+void __kmp_unshackled_initialize() {
+  if (TCR_4(__kmp_init_unshackled))
+    return;
+
+  // __kmp_parallel_initialize is required before we initialize unshackled
+  if (!TCR_4(__kmp_init_parallel))
+    __kmp_parallel_initialize();
+
+  // Double check. Note that this double check should not be placed before
+  // __kmp_parallel_initialize, as that would cause a deadlock.
+  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
+  if (TCR_4(__kmp_init_unshackled)) {
+    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
+    return;
+  }
+
+  // Set the count of unshackled tasks to be executed to zero
+  KMP_ATOMIC_ST_REL(&__kmp_unexecuted_unshackled_tasks, 0);
+
+  // Set the global variable indicating that we're initializing the unshackled
+  // team/threads
+  TCW_SYNC_4(__kmp_init_unshackled_threads, TRUE);
+
+  // Platform-independent initialization
+  __kmp_do_initialize_unshackled_threads();
+
+  // Wait here until the unshackled team has finished its initialization
+  if (TCR_4(__kmp_init_parallel)) {
+    __kmp_unshackled_threads_initz_wait();
+  }
+
+  // We have finished unshackled initialization
+  TCW_SYNC_4(__kmp_init_unshackled, TRUE);
+
+  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
+}
+#endif
+
 /* ------------------------------------------------------------------------ */
 
 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
@@ -8336,7 +8427,6 @@
   }
 }
 
-
 void __kmp_omp_display_env(int verbose) {
   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
   if (__kmp_init_serial == 0)
@@ -8344,3 +8434,47 @@
   __kmp_display_env_impl(!verbose, verbose);
   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
 }
+
+#if USE_UNSHACKLED_TASK
+kmp_info_t **__kmp_unshackled_threads;
+kmp_info_t *__kmp_unshackled_master_thread;
+int __kmp_unshackled_threads_num;
+std::atomic<kmp_int32> __kmp_unexecuted_unshackled_tasks;
+
+namespace {
+std::atomic<kmp_int32> __kmp_hit_unshackled_threads_num;
+
+void __kmp_unshackled_wrapper_fn(int *gtid, int *, ...) {
+  // This is an explicit synchronization of all unshackled threads. It is
+  // needed in case a regular thread pushes an unshackled task to an
+  // unshackled thread that has not been awakened even once since the master
+  // thread released the workers after creating the team.
+  KMP_ATOMIC_INC(&__kmp_hit_unshackled_threads_num);
+  while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_unshackled_threads_num) !=
+         __kmp_unshackled_threads_num)
+    ;
+
+  // If master thread, then wait for the signal
+  if (__kmpc_master(nullptr, *gtid)) {
+    // First, unset the initial state and release the initial thread
+    TCW_4(__kmp_init_unshackled_threads, FALSE);
+    __kmp_unshackled_initz_release();
+    __kmp_unshackled_master_thread_wait();
+  }
+}
+} // namespace
+
+void __kmp_unshackled_threads_initz_routine() {
+  // Create a new root for the unshackled team/threads
+  const int gtid = __kmp_register_root(TRUE);
+  __kmp_unshackled_master_thread = __kmp_threads[gtid];
+  __kmp_unshackled_threads = &__kmp_threads[gtid];
+  __kmp_unshackled_master_thread->th.th_set_nproc =
+      __kmp_unshackled_threads_num;
+
+  KMP_ATOMIC_ST_REL(&__kmp_hit_unshackled_threads_num, 0);
+
+  __kmpc_fork_call(nullptr, 0, __kmp_unshackled_wrapper_fn);
+}
+
+#endif
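
__kmp_unshackled_initialize above follows the classic double-checked initialization pattern: an unlocked fast-path check, then the bootstrap lock, then a re-check before doing the real work. A minimal standalone sketch of that pattern (an editorial illustration, with std::mutex and std::atomic standing in for the runtime's bootstrap lock and TCR/TCW macros):

#include <atomic>
#include <mutex>

// Stand-ins for __kmp_init_unshackled and __kmp_initz_lock.
static std::atomic<bool> initialized{false};
static std::mutex init_lock;

static void do_platform_init() { /* create threads, semaphores, etc. */ }

void ensure_initialized() {
  // Fast path: already initialized, no lock taken.
  if (initialized.load(std::memory_order_acquire))
    return;
  std::lock_guard<std::mutex> guard(init_lock);
  // Re-check under the lock: another thread may have won the race between
  // our first check and acquiring the lock.
  if (initialized.load(std::memory_order_relaxed))
    return;
  do_platform_init();
  initialized.store(true, std::memory_order_release);
}
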
diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp
--- a/openmp/runtime/src/kmp_settings.cpp
+++ b/openmp/runtime/src/kmp_settings.cpp
@@ -503,6 +503,12 @@
   if (nth < (4 * __kmp_xproc))
     nth = (4 * __kmp_xproc);
 
+#if USE_UNSHACKLED_TASK
+  // If unshackled task is enabled, increase the initial thread capacity by an
+  // extra __kmp_unshackled_threads_num slots.
+  nth += __kmp_unshackled_threads_num;
+#endif
+
   if (nth > __kmp_max_nth)
     nth = __kmp_max_nth;
 
diff --git a/openmp/runtime/src/kmp_taskdeps.h b/openmp/runtime/src/kmp_taskdeps.h
--- a/openmp/runtime/src/kmp_taskdeps.h
+++ b/openmp/runtime/src/kmp_taskdeps.h
@@ -121,7 +121,11 @@
         KA_TRACE(20, ("__kmp_release_deps: T#%d successor %p of %p scheduled "
                       "for execution.\n",
                       gtid, successor->dn.task, task));
+#if USE_UNSHACKLED_TASK
+        __kmp_omp_task(task->encountering_gtid, successor->dn.task, false);
+#else
         __kmp_omp_task(gtid, successor->dn.task, false);
+#endif
       }
     }
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -325,6 +325,14 @@
 static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
   kmp_info_t *thread = __kmp_threads[gtid];
   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+
+#if USE_UNSHACKLED_TASK
+  if (taskdata->td_flags.unshackled) {
+    gtid = KMP_GTID_TO_SHADOW_GTID(gtid);
+    thread = __kmp_threads[gtid];
+  }
+#endif
+
   kmp_task_team_t *task_team = thread->th.th_task_team;
   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
   kmp_thread_data_t *thread_data;
@@ -363,7 +371,8 @@
   // Find tasking deque specific to encountering thread
   thread_data = &task_team->tt.tt_threads_data[tid];
 
-  // No lock needed since only owner can allocate
+  // No lock needed even if the task is unshackled, because the deque for
+  // unshackled thread data has already been initialized
   if (thread_data->td.td_deque == NULL) {
     __kmp_alloc_task_deque(thread, thread_data);
   }
@@ -429,6 +438,16 @@
 
   __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
 
+#if USE_UNSHACKLED_TASK
+  // Signal one worker thread to execute the task
+  if (taskdata->td_flags.unshackled) {
+    // Increment the number of unshackled tasks to be executed
+    KMP_ATOMIC_INC(&__kmp_unexecuted_unshackled_tasks);
+    // Wake unshackled threads up if they're sleeping
+    __kmp_unshackled_worker_thread_signal();
+  }
+#endif
+
   return TASK_SUCCESSFULLY_PUSHED;
 }
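
The push path above pairs an atomic counter with a semaphore: the producer counts the task and posts the semaphore, and each post wakes exactly one sleeping worker. A minimal standalone sketch of this handoff (Linux, POSIX semaphores; the names are editorial stand-ins, not the runtime's):

#include <pthread.h>
#include <semaphore.h>
#include <atomic>
#include <cstdio>

// Stand-ins for __kmp_unexecuted_unshackled_tasks and the task semaphore.
static std::atomic<int> pending_tasks{0};
static sem_t task_sem;

// Producer side, mirroring the end of __kmp_push_task: count the task,
// then post the semaphore so exactly one sleeping worker wakes up.
static void push_task() {
  pending_tasks.fetch_add(1, std::memory_order_release);
  sem_post(&task_sem);
}

// Worker side: sleep until a task is available, then "execute" it.
static void *worker(void *) {
  sem_wait(&task_sem);
  pending_tasks.fetch_sub(1, std::memory_order_acq_rel);
  std::puts("executed one task");
  return nullptr;
}

int main() {
  sem_init(&task_sem, /*pshared=*/0, /*value=*/0);
  pthread_t t;
  pthread_create(&t, nullptr, worker, nullptr);
  push_task();
  pthread_join(t, nullptr);
  sem_destroy(&task_sem);
  return 0;
}
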
@@ -721,7 +740,6 @@
 #else /* ! USE_FAST_MEMORY */
   __kmp_thread_free(thread, taskdata);
 #endif
-
   KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
 }
@@ -930,8 +948,16 @@
     __ompt_task_finish(task, resumed_task, ompt_task_complete);
 #endif
 
+#if USE_UNSHACKLED_TASK
+  if (taskdata->td_flags.unshackled) {
+    KMP_DEBUG_ASSERT(taskdata->td_parent_task_team);
+    KMP_ATOMIC_DEC(
+        &taskdata->td_parent_task_team->tt.tt_unfinished_unshackled_tasks);
+  }
+#endif
+
   // Only need to keep track of count if team parallel and tasking not
-  // serialized, or task is detachable and event has already been fulfilled
+  // serialized, or task is detachable and event has already been fulfilled
   if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) ||
       taskdata->td_flags.detachable == TASK_DETACHABLE) {
     // Predecrement simulated by "- 1" calculation
@@ -1182,6 +1208,10 @@
   kmp_task_t *task;
   kmp_taskdata_t *taskdata;
   kmp_info_t *thread = __kmp_threads[gtid];
+#if USE_UNSHACKLED_TASK
+  kmp_info_t *encountering_thread = thread;
+  kmp_int32 encountering_gtid = gtid;
+#endif
   kmp_team_t *team = thread->th.th_team;
   kmp_taskdata_t *parent_task = thread->th.th_current_task;
   size_t shareds_offset;
@@ -1189,6 +1219,23 @@
   if (!TCR_4(__kmp_init_middle))
     __kmp_middle_initialize();
 
+#if USE_UNSHACKLED_TASK
+  if (flags->unshackled) {
+    if (!TCR_4(__kmp_init_unshackled)) {
+      __kmp_unshackled_initialize();
+    }
+
+    // For an unshackled task encountered by a regular thread, we push the
+    // task to the unshackled thread whose gtid is KMP_GTID_TO_SHADOW_GTID(gtid)
+    if (!KMP_UNSHACKLED_THREAD(gtid)) {
+      thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)];
+      team = thread->th.th_team;
+      // We don't change the parent-child relation for unshackled tasks, as we
+      // need it for per-task-region synchronization
+    }
+  }
+#endif
+
   KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
                 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                 gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
@@ -1199,6 +1246,7 @@
     }
     flags->final = 1;
   }
+
   if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
     // Untied task encountered causes the TSC algorithm to check entire deque of
     // the victim thread. If no untied task encountered, then checking the head
@@ -1261,11 +1309,21 @@
   // Avoid double allocation here by combining shareds with taskdata
 #if USE_FAST_MEMORY
+#if USE_UNSHACKLED_TASK
+  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(
+      encountering_thread, shareds_offset + sizeof_shareds);
+#else
   taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
                                                        sizeof_shareds);
+#endif
 #else /* ! USE_FAST_MEMORY */
+#if USE_UNSHACKLED_TASK
+  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(
+      encountering_thread, shareds_offset + sizeof_shareds);
+#else
   taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
                                                        sizeof_shareds);
+#endif
 #endif /* USE_FAST_MEMORY */
   ANNOTATE_HAPPENS_AFTER(taskdata);
@@ -1293,7 +1351,11 @@
   taskdata->td_task_id = KMP_GEN_TASK_ID();
   taskdata->td_team = team;
+#if USE_UNSHACKLED_TASK
+  taskdata->td_alloc_thread = encountering_thread;
+#else
   taskdata->td_alloc_thread = thread;
+#endif
   taskdata->td_parent = parent_task;
   taskdata->td_level = parent_task->td_level + 1; // increment nesting level
   KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
@@ -1312,6 +1374,11 @@
   taskdata->td_flags.destructors_thunk = flags->destructors_thunk;
   taskdata->td_flags.proxy = flags->proxy;
   taskdata->td_flags.detachable = flags->detachable;
+#if USE_UNSHACKLED_TASK
+  taskdata->td_flags.unshackled = flags->unshackled;
+  taskdata->td_parent_task_team = encountering_thread->th.th_task_team;
+  taskdata->encountering_gtid = encountering_gtid;
+#endif
   taskdata->td_task_team = thread->th.th_task_team;
   taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
   taskdata->td_flags.tasktype = TASK_EXPLICIT;
@@ -1369,6 +1436,18 @@
     }
   }
 
+#if USE_UNSHACKLED_TASK
+  {
+    kmp_task_team_t *parent_team = taskdata->td_parent_task_team;
+    if (flags->unshackled && parent_team) {
+      KMP_ATOMIC_INC(&parent_team->tt.tt_unfinished_unshackled_tasks);
+      if (!parent_team->tt.tt_unshackled_task_encountered) {
+        TCW_4(parent_team->tt.tt_unshackled_task_encountered, TRUE);
+      }
+    }
+  }
+#endif
+
   KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
                 gtid, taskdata, taskdata->td_parent));
   ANNOTATE_HAPPENS_BEFORE(task);
@@ -1406,6 +1485,13 @@
                                           size_t sizeof_shareds,
                                           kmp_routine_entry_t task_entry,
                                           kmp_int64 device_id) {
+#if USE_UNSHACKLED_TASK
+  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
+  input_flags->unshackled = TRUE;
+  // An unshackled task is always final for now, because it is created by the
+  // compiler and used only for async offloading
+  input_flags->final = TRUE;
+#endif
   return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
                                sizeof_shareds, task_entry);
 }
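
__kmpc_omp_target_task_alloc above reinterprets the plain 32-bit flags argument as the kmp_tasking_flags_t bitfield in order to force the unshackled and final bits. A simplified sketch of that trick (editorial; the struct below is a trimmed stand-in rather than the full kmp_tasking_flags_t, and bitfield layout is implementation-defined):

#include <cstdint>
#include <cstdio>

// Trimmed stand-in for the low bits of kmp_tasking_flags_t.
struct tasking_flags {
  unsigned tiedness : 1;
  unsigned final : 1;
  unsigned merged_if0 : 1;
  unsigned destructors_thunk : 1;
  unsigned proxy : 1;
  unsigned priority_specified : 1;
  unsigned detachable : 1;
  unsigned unshackled : 1;
  unsigned reserved : 24;
};
static_assert(sizeof(tasking_flags) == 4, "flags must fit in 32 bits");

int main() {
  std::int32_t flags = 1; // the compiler passed only the "tied" bit
  // Same trick as __kmpc_omp_target_task_alloc: view the integer as flags
  // and force the bits the runtime requires for target tasks.
  auto *input_flags = reinterpret_cast<tasking_flags *>(&flags);
  input_flags->unshackled = 1;
  input_flags->final = 1;
  std::printf("flags word is now 0x%x\n", flags);
  return 0;
}
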
@@ -1478,6 +1564,15 @@
   }
 #endif
 
+#if USE_UNSHACKLED_TASK
+  // Decrement the counter of unshackled tasks to be executed
+  if (taskdata->td_flags.unshackled) {
+    // Unshackled tasks can only be executed by unshackled threads
+    KMP_ASSERT(KMP_UNSHACKLED_THREAD(gtid));
+    KMP_ATOMIC_DEC(&__kmp_unexecuted_unshackled_tasks);
+  }
+#endif
+
   // Proxy tasks are not handled by the runtime
   if (taskdata->td_flags.proxy != TASK_PROXY) {
     ANNOTATE_HAPPENS_AFTER(task);
@@ -1875,6 +1970,14 @@
   must_wait =
       must_wait || (thread->th.th_task_team != NULL &&
                     thread->th.th_task_team->tt.tt_found_proxy_tasks);
+
+#if USE_UNSHACKLED_TASK
+  // If an unshackled task has been encountered, we must enable waiting here.
+  must_wait = must_wait ||
+              (thread->th.th_task_team != NULL &&
+               thread->th.th_task_team->tt.tt_unshackled_task_encountered);
+#endif
+
   if (must_wait) {
     kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *,
                            &(taskdata->td_incomplete_child_tasks)),
@@ -2840,7 +2943,13 @@
   thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
 
   threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
+#if USE_UNSHACKLED_TASK
+  // threads_data can be nullptr when unshackled task is enabled
+  if (threads_data == nullptr)
+    return FALSE;
+#else
   KMP_DEBUG_ASSERT(threads_data != NULL);
+#endif
 
   nthreads = task_team->tt.tt_nproc;
   unfinished_threads = &(task_team->tt.tt_unfinished_threads);
@@ -2924,8 +3033,8 @@
       }
     }
 
-    if (task == NULL) // break out of tasking loop
-      break;
+    if (task == NULL)
+      break; // break out of tasking loop
 
     // Found a task; execute it
 #if USE_ITT_BUILD && USE_ITT_NOTIFY
@@ -3380,6 +3489,10 @@
     task_team->tt.tt_nproc = nthreads = team->t.t_nproc;
 
     KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads);
+#if USE_UNSHACKLED_TASK
+    KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_unshackled_tasks, 0);
+    TCW_4(task_team->tt.tt_unshackled_task_encountered, FALSE);
+#endif
     TCW_4(task_team->tt.tt_active, TRUE);
 
     KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
@@ -3552,6 +3665,28 @@
                     ((team != NULL) ? team->t.t_id : -1), other_team));
     }
   }
+
+#if USE_UNSHACKLED_TASK
+  // For a regular thread, task enabling is done when a task is about to be
+  // pushed to a deque. For the unshackled master thread, however, it must be
+  // done ahead of time so that some operations can be performed without race
+  // conditions.
+  if (this_thr == __kmp_unshackled_master_thread) {
+    for (int i = 0; i < 2; ++i) {
+      kmp_task_team_t *task_team = team->t.t_task_team[i];
+      if (KMP_TASKING_ENABLED(task_team)) {
+        continue;
+      }
+      __kmp_enable_tasking(task_team, this_thr);
+      for (int j = 0; j < task_team->tt.tt_nproc; ++j) {
+        kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[j];
+        if (thread_data->td.td_deque == NULL) {
+          __kmp_alloc_task_deque(__kmp_unshackled_threads[j], thread_data);
+        }
+      }
+    }
+  }
+#endif
 }
 
 // __kmp_task_team_sync: Propagation of task team data from team to threads
@@ -3618,6 +3753,12 @@
     TCW_PTR(this_thr->th.th_task_team, NULL);
   }
+
+#if USE_UNSHACKLED_TASK
+  // Wait until all unshackled tasks tracked by this task team have finished
+  if (task_team && task_team->tt.tt_unshackled_task_encountered)
+    while (KMP_ATOMIC_LD_ACQ(&task_team->tt.tt_unfinished_unshackled_tasks))
+      ;
+#endif
 }
 
 // __kmp_tasking_barrier:
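
Taken together, the per-task-team accounting above forms a simple protocol: tt_unfinished_unshackled_tasks is incremented when an unshackled task is created (__kmp_task_alloc), decremented when it finishes (__kmp_task_finish), and spun on in __kmp_task_team_wait once tt_unshackled_task_encountered has been set. A standalone sketch of that protocol (editorial; std::thread and stand-in names, not the runtime's types):

#include <atomic>
#include <thread>
#include <vector>

// Stand-in for tt_unfinished_unshackled_tasks: incremented at task creation
// (cf. __kmp_task_alloc), decremented at task completion
// (cf. __kmp_task_finish), spun on when the task team winds down
// (cf. __kmp_task_team_wait).
static std::atomic<int> unfinished_unshackled_tasks{0};

static void create_task() { unfinished_unshackled_tasks.fetch_add(1); }
static void finish_task() { unfinished_unshackled_tasks.fetch_sub(1); }

static void task_team_wait() {
  // The runtime uses the same plain spin; it only reaches this point after
  // observing tt_unshackled_task_encountered.
  while (unfinished_unshackled_tasks.load(std::memory_order_acquire))
    ;
}

int main() {
  std::vector<std::thread> workers;
  for (int i = 0; i < 4; ++i) {
    create_task();
    workers.emplace_back([] { finish_task(); });
  }
  task_team_wait();
  for (auto &w : workers)
    w.join();
  return 0;
}
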
diff --git a/openmp/runtime/src/kmp_wait_release.h b/openmp/runtime/src/kmp_wait_release.h
--- a/openmp/runtime/src/kmp_wait_release.h
+++ b/openmp/runtime/src/kmp_wait_release.h
@@ -381,6 +381,27 @@
       break;
     }
 
+#if USE_UNSHACKLED_TASK
+    // For an unshackled thread, a nullptr task_team means the master thread
+    // has not yet released the barrier. We cannot simply wait in that case:
+    // once the master thread releases all children at the barrier, the
+    // unshackled threads would still be sleeping, so follow-up configuration
+    // such as task team sync would never be performed for this thread,
+    // leaving it without a task team. Usually that is harmless. However, in
+    // the corner case where the first task encountered is an untied task,
+    // the check in __kmp_task_alloc crashes, because it uses the task team
+    // pointer without checking it for nullptr, presumably on the assumption
+    // that the task team has already been set up.
+    if (task_team && KMP_UNSHACKLED_WORKER_THREAD(th_gtid)) {
+      // If there are still unshackled tasks to be executed, the unshackled
+      // thread does not enter a wait state.
+      if (KMP_ATOMIC_LD_ACQ(&__kmp_unexecuted_unshackled_tasks) == 0) {
+        __kmp_unshackled_worker_thread_wait();
+      }
+      continue;
+    }
+#endif
+
     // Don't suspend if KMP_BLOCKTIME is set to "infinite"
     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
         __kmp_pause_status != kmp_soft_paused)
diff --git a/openmp/runtime/src/z_Linux_util.cpp b/openmp/runtime/src/z_Linux_util.cpp
--- a/openmp/runtime/src/z_Linux_util.cpp
+++ b/openmp/runtime/src/z_Linux_util.cpp
@@ -25,6 +25,7 @@
 #include <alloca.h>
 #endif
 #include <math.h> // HUGE_VAL.
+#include <semaphore.h>
 #include <sys/resource.h>
 #include <sys/syscall.h>
 #include <sys/time.h>
@@ -2439,7 +2440,7 @@
                            ,
                            void **exit_frame_ptr
 #endif
-                           ) {
+) {
 #if OMPT_SUPPORT
   *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
 #endif
@@ -2518,4 +2519,107 @@
 
 #endif
 
+#if USE_UNSHACKLED_TASK
+
+namespace {
+pthread_t __kmp_unshackled_master_thread_handle;
+
+// Condition variable for initializing the unshackled team
+pthread_cond_t __kmp_unshackled_threads_initz_cond_var;
+pthread_mutex_t __kmp_unshackled_threads_initz_lock;
+
+// Condition variable for the wrapper function of the master thread
+pthread_cond_t __kmp_unshackled_master_thread_cond_var;
+pthread_mutex_t __kmp_unshackled_master_thread_lock;
+
+// Semaphore for worker threads. We don't use a condition variable here,
+// because if multiple signals were sent at the same time, only one thread
+// might be woken up.
+sem_t __kmp_unshackled_task_sem;
+} // namespace
+
+void __kmp_unshackled_worker_thread_wait() {
+  int status = sem_wait(&__kmp_unshackled_task_sem);
+  KMP_CHECK_SYSFAIL("sem_wait", status);
+}
+
+void __kmp_do_initialize_unshackled_threads() {
+  // Initialize the condition variables
+  int status =
+      pthread_cond_init(&__kmp_unshackled_threads_initz_cond_var, nullptr);
+  KMP_CHECK_SYSFAIL("pthread_cond_init", status);
+
+  status =
+      pthread_cond_init(&__kmp_unshackled_master_thread_cond_var, nullptr);
+  KMP_CHECK_SYSFAIL("pthread_cond_init", status);
+
+  status = pthread_mutex_init(&__kmp_unshackled_threads_initz_lock, nullptr);
+  KMP_CHECK_SYSFAIL("pthread_mutex_init", status);
+
+  status = pthread_mutex_init(&__kmp_unshackled_master_thread_lock, nullptr);
+  KMP_CHECK_SYSFAIL("pthread_mutex_init", status);
+
+  // Initialize the semaphore
+  status = sem_init(&__kmp_unshackled_task_sem, 0, 0);
+  KMP_CHECK_SYSFAIL("sem_init", status);
+
+  // Create a new thread to finish the initialization
+  status = pthread_create(
+      &__kmp_unshackled_master_thread_handle, nullptr,
+      [](void *) -> void * {
+        __kmp_unshackled_threads_initz_routine();
+        return nullptr;
+      },
+      nullptr);
+  KMP_CHECK_SYSFAIL("pthread_create", status);
+}
+
+void __kmp_unshackled_threads_initz_wait() {
+  // The initial thread waits here for the completion of the initialization.
+  // The condition variable will be notified by the master thread of the
+  // unshackled team.
+  int status = pthread_mutex_lock(&__kmp_unshackled_threads_initz_lock);
+  KMP_CHECK_SYSFAIL("pthread_mutex_lock", status);
+
+  if (TCR_4(__kmp_init_unshackled_threads)) {
+    status = pthread_cond_wait(&__kmp_unshackled_threads_initz_cond_var,
+                               &__kmp_unshackled_threads_initz_lock);
+    KMP_CHECK_SYSFAIL("pthread_cond_wait", status);
+  }
+
+  status = pthread_mutex_unlock(&__kmp_unshackled_threads_initz_lock);
+  KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status);
+}
+
+void __kmp_unshackled_initz_release() {
+  // After all initialization, reset __kmp_init_unshackled_threads to FALSE
+  int status = pthread_mutex_lock(&__kmp_unshackled_threads_initz_lock);
+  KMP_CHECK_SYSFAIL("pthread_mutex_lock", status);
+
+  status = pthread_cond_signal(&__kmp_unshackled_threads_initz_cond_var);
+  KMP_CHECK_SYSFAIL("pthread_cond_signal", status);
+
+  TCW_SYNC_4(__kmp_init_unshackled_threads, FALSE);
+
+  status = pthread_mutex_unlock(&__kmp_unshackled_threads_initz_lock);
+  KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status);
+}
+
+void __kmp_unshackled_master_thread_wait() {
+  // The master thread of the unshackled team is blocked here. The condition
+  // variable can only be signaled in the destructor of the RTL.
+  int status = pthread_mutex_lock(&__kmp_unshackled_master_thread_lock);
+  KMP_CHECK_SYSFAIL("pthread_mutex_lock", status);
+  status = pthread_cond_wait(&__kmp_unshackled_master_thread_cond_var,
+                             &__kmp_unshackled_master_thread_lock);
+  KMP_CHECK_SYSFAIL("pthread_cond_wait", status);
+  status = pthread_mutex_unlock(&__kmp_unshackled_master_thread_lock);
+  KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status);
+}
+
+void __kmp_unshackled_worker_thread_signal() {
+  int status = sem_post(&__kmp_unshackled_task_sem);
+  KMP_CHECK_SYSFAIL("sem_post", status);
+}
+
+#endif
+
 // end of file //
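
The initialization handshake above is a standard condition-variable rendezvous: the initial thread blocks in __kmp_unshackled_threads_initz_wait until the unshackled master signals it from __kmp_unshackled_initz_release. A minimal standalone sketch (editorial, stand-in names; note it uses a while loop around pthread_cond_wait to guard against spurious wakeups, where the patch relies on an if plus the __kmp_init_unshackled_threads flag):

#include <pthread.h>
#include <cstdio>

// Stand-ins for the initz lock/cond var and the "still initializing" flag.
static pthread_mutex_t initz_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t initz_cond = PTHREAD_COND_INITIALIZER;
static bool initializing = true;

// Runs on the unshackled master once the team is up (cf. initz_release).
static void *unshackled_master(void *) {
  pthread_mutex_lock(&initz_lock);
  initializing = false;
  pthread_cond_signal(&initz_cond);
  pthread_mutex_unlock(&initz_lock);
  return nullptr;
}

int main() {
  pthread_t t;
  pthread_create(&t, nullptr, unshackled_master, nullptr);
  // Initial thread side (cf. initz_wait): sleep until signaled.
  pthread_mutex_lock(&initz_lock);
  while (initializing)
    pthread_cond_wait(&initz_cond, &initz_lock);
  pthread_mutex_unlock(&initz_lock);
  pthread_join(t, nullptr);
  std::puts("unshackled team initialized");
  return 0;
}
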
diff --git a/openmp/runtime/test/tasking/unshackled_task/depend.cpp b/openmp/runtime/test/tasking/unshackled_task/depend.cpp
new file mode 100644
--- /dev/null
+++ b/openmp/runtime/test/tasking/unshackled_task/depend.cpp
@@ -0,0 +1,168 @@
+// RUN: %libomp-cxx-compile-and-run
+
+/*
+ * This test aims to check whether unshackled tasks can work with regular
+ * tasks in terms of dependences. It is equivalent to the following code:
+ *
+ * #pragma omp parallel for
+ * for (int i = 0; i < N; ++i) {
+ *   int data = 0;
+ *   #pragma omp task shared(data) depend(out: data)
+ *   {
+ *     data = 1;
+ *   }
+ *   #pragma omp task shared(data) depend(inout: data)
+ *   {
+ *     data += 2;
+ *   }
+ *   #pragma omp task shared(data) depend(inout: data)
+ *   {
+ *     data += 4;
+ *   }
+ *   #pragma omp taskwait
+ *   assert(data == 7);
+ * }
+ */
+
+#include <cassert>
+#include <cstdint>
+#include <iostream>
+
+extern "C" {
+struct ident_t;
+
+using kmp_int32 = int32_t;
+using kmp_int64 = int64_t;
+using kmp_routine_entry_t = kmp_int32 (*)(kmp_int32, void *);
+using kmp_intptr_t = intptr_t;
+
+typedef struct kmp_depend_info {
+  kmp_intptr_t base_addr;
+  size_t len;
+  struct {
+    bool in : 1;
+    bool out : 1;
+    bool mtx : 1;
+  } flags;
+} kmp_depend_info_t;
+
+typedef union kmp_cmplrdata {
+  kmp_int32 priority;
+  kmp_routine_entry_t destructors;
+} kmp_cmplrdata_t;
+
+typedef struct kmp_task {
+  void *shareds;
+  kmp_routine_entry_t routine;
+  kmp_int32 part_id;
+  kmp_cmplrdata_t data1;
+  kmp_cmplrdata_t data2;
+} kmp_task_t;
+
+struct kmp_task_t_with_privates {
+  kmp_task_t task;
+};
+
+struct anon {
+  int32_t *data;
+};
+
+int32_t __kmpc_global_thread_num(void *);
+kmp_task_t *__kmpc_omp_task_alloc(ident_t *, kmp_int32, kmp_int32, size_t,
+                                  size_t, kmp_routine_entry_t);
+kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *, kmp_int32, kmp_int32,
+                                         size_t, size_t, kmp_routine_entry_t,
+                                         kmp_int64);
+kmp_int32 __kmpc_omp_taskwait(ident_t *, kmp_int32);
+kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid,
+                                    kmp_task_t *new_task, kmp_int32 ndeps,
+                                    kmp_depend_info_t *dep_list,
+                                    kmp_int32 ndeps_noalias,
+                                    kmp_depend_info_t *noalias_dep_list);
+}
+
+kmp_int32 omp_task_entry_1(kmp_int32 gtid, kmp_task_t_with_privates *task) {
+  auto shareds = reinterpret_cast<anon *>(task->task.shareds);
+  auto p = shareds->data;
+  *p = 1;
+  return 0;
+}
+
+kmp_int32 omp_task_entry_2(kmp_int32 gtid, kmp_task_t_with_privates *task) {
+  auto shareds = reinterpret_cast<anon *>(task->task.shareds);
+  auto p = shareds->data;
+  *p += 2;
+  return 0;
+}
+
+kmp_int32 omp_task_entry_3(kmp_int32 gtid, kmp_task_t_with_privates *task) {
+  auto shareds = reinterpret_cast<anon *>(task->task.shareds);
+  auto p = shareds->data;
+  *p += 4;
+  return 0;
+}
+
+int main(int argc, char *argv[]) {
+  constexpr const int N = 1024;
+#pragma omp parallel for
+  for (int i = 0; i < N; ++i) {
+    int32_t gtid = __kmpc_global_thread_num(nullptr);
+    int32_t data = 0;
+
+    // Task 1: a regular task that writes data
+    auto task1 = __kmpc_omp_task_alloc(
+        nullptr, gtid, 1, sizeof(kmp_task_t_with_privates), sizeof(anon),
+        reinterpret_cast<kmp_routine_entry_t>(omp_task_entry_1));
+
+    auto shareds = reinterpret_cast<anon *>(task1->shareds);
+    shareds->data = &data;
+
+    kmp_depend_info_t depinfo1;
+    depinfo1.base_addr = reinterpret_cast<kmp_intptr_t>(&data);
+    depinfo1.flags.out = 1;
+    depinfo1.len = 4;
+
+    __kmpc_omp_task_with_deps(nullptr, gtid, task1, 1, &depinfo1, 0, nullptr);
+
+    // Task 2: an unshackled task depending on task 1
+    auto task2 = __kmpc_omp_target_task_alloc(
+        nullptr, gtid, 1, sizeof(kmp_task_t_with_privates), sizeof(anon),
+        reinterpret_cast<kmp_routine_entry_t>(omp_task_entry_2), -1);
+
+    shareds = reinterpret_cast<anon *>(task2->shareds);
+    shareds->data = &data;
+
+    kmp_depend_info_t depinfo2;
+    depinfo2.base_addr = reinterpret_cast<kmp_intptr_t>(&data);
+    depinfo2.flags.in = 1;
+    depinfo2.flags.out = 1;
+    depinfo2.len = 4;
+
+    __kmpc_omp_task_with_deps(nullptr, gtid, task2, 1, &depinfo2, 0, nullptr);
+
+    // Task 3: a regular task depending on task 2
+    auto task3 = __kmpc_omp_task_alloc(
+        nullptr, gtid, 1, sizeof(kmp_task_t_with_privates), sizeof(anon),
+        reinterpret_cast<kmp_routine_entry_t>(omp_task_entry_3));
+
+    shareds = reinterpret_cast<anon *>(task3->shareds);
+    shareds->data = &data;
+
+    kmp_depend_info_t depinfo3;
+    depinfo3.base_addr = reinterpret_cast<kmp_intptr_t>(&data);
+    depinfo3.flags.in = 1;
+    depinfo3.flags.out = 1;
+    depinfo3.len = 4;
+
+    __kmpc_omp_task_with_deps(nullptr, gtid, task3, 1, &depinfo3, 0, nullptr);
+
+    // Wait for all tasks
+    __kmpc_omp_taskwait(nullptr, gtid);
+
+    assert(data == 7);
+  }
+
+  std::cout << "PASS\n";
+  return 0;
+}
+
+// CHECK: PASS
diff --git a/openmp/runtime/test/tasking/unshackled_task/gtid.cpp b/openmp/runtime/test/tasking/unshackled_task/gtid.cpp
new file mode 100644
--- /dev/null
+++ b/openmp/runtime/test/tasking/unshackled_task/gtid.cpp
@@ -0,0 +1,86 @@
+// RUN: %libomp-cxx-compile-and-run
+
+/*
+ * This test aims to check whether an unshackled thread gets the right gtid.
+ * It is equivalent to the following code:
+ *
+ * #pragma omp parallel for
+ * for (int i = 0; i < N; ++i) {
+ *   int data = -1;
+ *   #pragma omp task unshackled shared(data)
+ *   {
+ *     data = omp_get_global_thread_id();
+ *   }
+ *   #pragma omp taskwait
+ *   assert(data > 0 && data <= __kmp_unshackled_threads_num);
+ * }
+ */
+
+#include <cassert>
+#include <cstdint>
+#include <iostream>
+
+extern "C" {
+struct ident_t;
+
+using kmp_int32 = int32_t;
+using kmp_int64 = int64_t;
+using kmp_routine_entry_t = kmp_int32 (*)(kmp_int32, void *);
+
+typedef union kmp_cmplrdata {
+  kmp_int32 priority;
+  kmp_routine_entry_t destructors;
+} kmp_cmplrdata_t;
+
+typedef struct kmp_task {
+  void *shareds;
+  kmp_routine_entry_t routine;
+  kmp_int32 part_id;
+  kmp_cmplrdata_t data1;
+  kmp_cmplrdata_t data2;
+} kmp_task_t;
+
+struct kmp_task_t_with_privates {
+  kmp_task_t task;
+};
+
+struct anon {
+  int32_t *data;
+};
+
+int32_t __kmpc_global_thread_num(void *) __attribute__((weak));
+kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *, kmp_int32, kmp_int32,
+                                         size_t, size_t, kmp_routine_entry_t,
+                                         kmp_int64);
+kmp_int32 __kmpc_omp_task(ident_t *, kmp_int32, kmp_task_t *);
+kmp_int32 __kmpc_omp_taskwait(ident_t *, kmp_int32);
+}
+
+kmp_int32 omp_task_entry(kmp_int32 gtid, kmp_task_t_with_privates *task) {
+  auto shareds = reinterpret_cast<anon *>(task->task.shareds);
+  auto p = shareds->data;
+  *p = __kmpc_global_thread_num(nullptr);
+  return 0;
+}
+
+int main(int argc, char *argv[]) {
+  constexpr const int N = 1024;
+#pragma omp parallel for
+  for (int i = 0; i < N; ++i) {
+    int32_t gtid = __kmpc_global_thread_num(nullptr);
+    auto task = __kmpc_omp_target_task_alloc(
+        nullptr, gtid, 1, sizeof(kmp_task_t_with_privates), sizeof(anon),
+        reinterpret_cast<kmp_routine_entry_t>(omp_task_entry), -1);
+    auto shareds = reinterpret_cast<anon *>(task->shareds);
+    int32_t data = -1;
+    shareds->data = &data;
+    __kmpc_omp_task(nullptr, gtid, task);
+    __kmpc_omp_taskwait(nullptr, gtid);
+    // FIXME: 8 here is not accurate; it should match the number of
+    // unshackled threads configured in the runtime
+    assert(data > 0 && data <= 8);
+  }
+
+  std::cout << "PASS\n";
+  return 0;
+}
+
+// CHECK: PASS
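
For orientation, this is roughly what the feature serves at the user level (an editorial sketch, not part of the patch, assuming a compiler that lowers target nowait regions to __kmpc_omp_target_task_alloc): each deferred target task is marked unshackled and executed by the unshackled team, so host OpenMP threads are not consumed by offloading bookkeeping. Without device offloading support, the regions simply run on the host:

#include <cassert>

int main() {
  int x = 0, y = 0;
#pragma omp parallel
#pragma omp single
  {
    // Each target nowait region becomes a deferred target task; with this
    // patch, such tasks run on the unshackled team.
#pragma omp target map(tofrom : x) nowait
    x = 1;
#pragma omp target map(tofrom : y) nowait
    y = 2;
    // taskwait also covers the deferred target tasks created above.
#pragma omp taskwait
  }
  assert(x == 1 && y == 2);
  return 0;
}
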