diff --git a/openmp/runtime/src/i18n/en_US.txt b/openmp/runtime/src/i18n/en_US.txt --- a/openmp/runtime/src/i18n/en_US.txt +++ b/openmp/runtime/src/i18n/en_US.txt @@ -269,7 +269,7 @@ Using_uint_Value "%1$s value \"%2$u\" will be used." Using_uint64_Value "%1$s value \"%2$s\" will be used." Using_str_Value "%1$s value \"%2$s\" will be used." -BarrierPatternOverride "Mixing other barrier patterns with dist is prohibited. Using dist for all barrier patterns." +BarrierPatternOverride "Mixing other barrier patterns with %1$s is prohibited. Using %1$s for all barrier patterns." MaxValueUsing "%1$s maximum value \"%2$d\" will be used." MinValueUsing "%1$s minimum value \"%2$d\" will be used." MemoryAllocFailed "Memory allocation failed." @@ -479,6 +479,8 @@ AffHWSubsetAllFiltered "KMP_HW_SUBSET ignored: all hardware resources would be filtered, please reduce the filter." AffHWSubsetAttrsNonHybrid "KMP_HW_SUBSET ignored: Too many attributes specified. This machine is not a hybrid architecutre." AffHWSubsetIgnoringAttr "KMP_HW_SUBSET: ignoring %1$s attribute. This machine is not a hybrid architecutre." +HardBarrierNotSupported "%1$s: Falling back to the software barrier for this entire run." +HardBarrierCannotBeUsed "Hard barrier cannot be used for this team. Falling back to the software barrier." # -------------------------------------------------------------------------------------------------- -*- HINTS -*- diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -115,7 +115,6 @@ #include "kmp_debug.h" #include "kmp_lock.h" #include "kmp_version.h" -#include "kmp_barrier.h" #if USE_DEBUGGER #include "kmp_debugger.h" #endif @@ -730,6 +729,8 @@ virtual void clear(int i) {} // Zero out entire mask virtual void zero() {} + // Count the number of bits set in the mask + virtual int count() const { return 0; } // Copy src into this mask virtual void copy(const Mask *src) {} // this &= rhs @@ -1991,6 +1992,7 @@ branching factor 2^n */ bp_hierarchical_bar = 3, /* Machine hierarchy tree */ bp_dist_bar = 4, /* Distributed barrier */ + bp_hard_bar = 5, /* Hardware barrier */ bp_last_bar /* Placeholder to mark the end */ } kmp_bar_pat_e; @@ -2789,6 +2791,7 @@ std::atomic<bool> th_blocking; #endif kmp_cg_root_t *th_cg_roots; // list of cg_roots associated with this thread + int th_hard_barrier_window; // used for hardware barrier } kmp_base_info_t; typedef union KMP_ALIGN_CACHE kmp_info { @@ -2836,6 +2839,9 @@ #endif #define KMP_INLINE_ARGV_ENTRIES (int)(KMP_INLINE_ARGV_BYTES / KMP_PTR_SKIP) +class distributedBarrier; +class hardBarrier; + typedef struct KMP_ALIGN_CACHE kmp_base_team { // Synchronization Data // --------------------------------------------------------------------------- @@ -2929,6 +2935,7 @@ void *t_stack_id; // team specific stack stitching id (for ittnotify) #endif /* USE_ITT_BUILD */ distributedBarrier *b; // Distributed barrier data associated with team + hardBarrier *h; // Hard barrier data associated with team } kmp_base_team_t; union KMP_ALIGN_CACHE kmp_team { @@ -3632,6 +3639,8 @@ extern void __kmp_cleanup_hierarchy(); extern void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar); +extern bool __kmp_check_places_for_hard_barrier(); + #if KMP_USE_FUTEX extern int __kmp_futex_determine_capable(void); @@ -3721,6 +3730,8 @@ extern void __kmp_end_split_barrier(enum barrier_type bt, int gtid); extern int __kmp_barrier_gomp_cancel(int gtid); +extern void __kmp_hard_barrier_wakeup_soft(kmp_info_t *master); + /*! 
* Tell the fork call which compiler generated the fork call, and therefore how * to deal with the call. @@ -4199,6 +4210,8 @@ extern void __kmp_hidden_helper_worker_thread_signal(); extern void __kmp_hidden_helper_main_thread_release(); +extern int __kmp_get_online_cores(); + // Check whether a given thread is a hidden helper thread #define KMP_HIDDEN_HELPER_THREAD(gtid) \ ((gtid) >= 1 && (gtid) <= __kmp_hidden_helper_threads_num) diff --git a/openmp/runtime/src/kmp_affinity.h b/openmp/runtime/src/kmp_affinity.h --- a/openmp/runtime/src/kmp_affinity.h +++ b/openmp/runtime/src/kmp_affinity.h @@ -52,6 +52,7 @@ int next(int previous) const override { return hwloc_bitmap_next(mask, previous); } + int count() const override { return 0; } int get_system_affinity(bool abort_on_error) override { KMP_ASSERT2(KMP_AFFINITY_CAPABLE(), "Illegal get affinity operation when not capable"); @@ -291,6 +292,13 @@ for (mask_size_type i = 0; i < e; ++i) mask[i] = (mask_t)0; } + int count() const override { + int count = 0; + for (int i = begin(); i < end(); i = next(i)) { + count++; + } + return count; + } void copy(const KMPAffinity::Mask *src) override { const Mask *convert = static_cast<const Mask *>(src); mask_size_type e = get_num_mask_types(); diff --git a/openmp/runtime/src/kmp_affinity.cpp b/openmp/runtime/src/kmp_affinity.cpp --- a/openmp/runtime/src/kmp_affinity.cpp +++ b/openmp/runtime/src/kmp_affinity.cpp @@ -4475,6 +4475,40 @@ KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex + 1); } +// return true if the affinity setting is eligible for the hard barrier, +// i.e. each place contains exactly one core and the places cover consecutive cores +bool __kmp_check_places_for_hard_barrier() { + int prev = -1; + + KA_TRACE(20, ("__kmp_check_places_for_hard_barrier: enter. num_masks: %d\n", + __kmp_affinity_num_masks)); + + for (int i = 0; i < __kmp_affinity_num_masks; i++) { + kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); + + if (mask->count() != 1) { + KA_TRACE(20, + ("__kmp_check_places_for_hard_barrier: place %d has %d cores\n", + i, mask->count())); + return false; + } + + int current = mask->begin(); + if (prev != -1 && current != (prev + 1)) { + KA_TRACE(20, ("__kmp_check_places_for_hard_barrier: place %d is on core " + "%d, not next to previous core %d\n", + i, current, prev)); + return false; + } + + prev = current; + } + + KA_TRACE( + 20, ("__kmp_check_places_for_hard_barrier: exit. ok for hard barrier\n")); + return true; +} + void __kmp_affinity_initialize(void) { // Much of the code above was written assuming that if a machine was not // affinity capable, then __kmp_affinity_type == affinity_none. 
We now diff --git a/openmp/runtime/src/kmp_barrier.h b/openmp/runtime/src/kmp_barrier.h --- a/openmp/runtime/src/kmp_barrier.h +++ b/openmp/runtime/src/kmp_barrier.h @@ -14,8 +14,12 @@ #define KMP_BARRIER_H #include "kmp.h" +#include "kmp_itt.h" #include "kmp_i18n.h" +#include <fcntl.h> +#include "kmp_safe_c_api.h" + #if KMP_HAVE_XMMINTRIN_H && KMP_HAVE__MM_MALLOC #include <xmmintrin.h> #define KMP_ALIGNED_ALLOCATE(size, alignment) _mm_malloc(size, alignment) @@ -138,4 +142,147 @@ void go_reset(); }; +class hardBarrier { +private: + /* group number */ + int *group; + /* barrier blade number */ + int *bb; + /* barrier window number on each core (indexed by tid) */ + int **barrier_window; + + int primary_core; + int get_window_index_from_tid(int tid) { return tid % CORES_PER_GROUP; } + +public: + static constexpr const char *SYSFS_ROOT = "/sys/class/misc/hard_barrier"; + static const int BUF_SIZE = 64; + // const values read from sysfs + static int NUM_GROUPS; + static int CORES_PER_GROUP; + + int num_groups; + int *threads_in_group; + bool is_hybrid; + void *team_icvs; + + static hardBarrier *allocate() { + if (NUM_GROUPS == 0) { + // This only happens when allocating the hot team in register_root(). + // At that time affinity has not been initialized and + // we cannot determine if the hard barrier can be used or not. + // So, return NULL here to defer allocation + return NULL; + } + + hardBarrier *h = (hardBarrier *)KMP_ALIGNED_ALLOCATE(sizeof(hardBarrier), + 4 * CACHE_LINE); + if (!h) { + KMP_FATAL(MemoryAllocFailed); + } + h->team_icvs = __kmp_allocate(sizeof(kmp_internal_control)); + h->threads_in_group = (int *)__kmp_allocate(sizeof(int) * NUM_GROUPS); + h->group = (int *)__kmp_allocate(sizeof(int) * NUM_GROUPS); + h->bb = (int *)__kmp_allocate(sizeof(int) * NUM_GROUPS); + h->barrier_window = (int **)__kmp_allocate(sizeof(int *) * NUM_GROUPS); + + h->is_hybrid = false; + h->num_groups = 0; + for (int i = 0; i < NUM_GROUPS; i++) { + h->group[i] = -1; + h->bb[i] = -1; + h->threads_in_group[i] = 0; + h->barrier_window[i] = + (int *)__kmp_allocate(sizeof(int) * hardBarrier::CORES_PER_GROUP); + for (int j = 0; j < hardBarrier::CORES_PER_GROUP; j++) { + h->barrier_window[i][j] = -1; + } + } + return h; + } + static void deallocate(hardBarrier *h) { + h->barrier_free(); + for (int i = 0; i < hardBarrier::NUM_GROUPS; i++) { + __kmp_free(h->barrier_window[i]); + } + __kmp_free(h->barrier_window); + __kmp_free(h->bb); + __kmp_free(h->group); + __kmp_free(h->threads_in_group); + __kmp_free(h->team_icvs); + KMP_ALIGNED_FREE(h); + } + + hardBarrier() = delete; + ~hardBarrier() = delete; + + int barrier_alloc(kmp_info_t *this_thr, int nthreads); + void barrier_free(); + bool is_barrier_allocated() { + if (num_groups > 0) { + return true; + } + return false; + }; + void get_window(kmp_info_t *this_thr, int tid); + void sync(kmp_info_t *this_thr, int final_spin, int gtid, + int tid USE_ITT_BUILD_ARG(void *itt_sync_obj)); + static void wakeup() { + // wake up threads that went to sleep via wfe + asm volatile("sevl" :::); + } + static bool system_supports_hard_barrier() { + int ret, fd; + char buf[BUF_SIZE]; + + KMP_SNPRINTF(buf, BUF_SIZE, "%s/group0/barrier0/masks", SYSFS_ROOT); + fd = open(buf, O_RDWR); + if (fd < 0) { + // module not loaded or permission denied + return false; + } + close(fd); + + // check available barriers + for (int group = 0;; group++) { + KMP_SNPRINTF(buf, BUF_SIZE, "%s/group%d/available_cpus", SYSFS_ROOT, + group); + fd = open(buf, O_RDONLY); + if (fd < 0) { + NUM_GROUPS = group; + break; + } + + ret = 
read(fd, buf, BUF_SIZE); + KMP_ASSERT(ret > 0); + buf[ret] = '\0'; + int start, end; + ret = KMP_SSCANF(buf, "%d-%d", &start, &end); + if (CORES_PER_GROUP == 0) { + CORES_PER_GROUP = end - start + 1; + } else { + // XXX: Assume all groups have the same number of cores for now + KMP_ASSERT(CORES_PER_GROUP == end - start + 1); + } + close(fd); + } + + KMP_ASSERT(hardBarrier::NUM_GROUPS > 0 && hardBarrier::CORES_PER_GROUP > 0); + + return true; + } + + // the functions below also work when the primary's core is not the + // first one of the group + bool is_group_leader(int tid) { + return tid == 0 || ((tid + primary_core) % CORES_PER_GROUP) == 0; + } + int get_group_from_tid(int tid) { + return (tid + (primary_core % CORES_PER_GROUP)) / CORES_PER_GROUP; + } + int get_tid_of_group_leader(int group) { + return (group * CORES_PER_GROUP) - (primary_core % CORES_PER_GROUP); + } +}; + #endif // KMP_BARRIER_H diff --git a/openmp/runtime/src/kmp_barrier.cpp b/openmp/runtime/src/kmp_barrier.cpp --- a/openmp/runtime/src/kmp_barrier.cpp +++ b/openmp/runtime/src/kmp_barrier.cpp @@ -1734,6 +1734,817 @@ gtid, team->t.t_id, tid, bt)); } +//*****************************************************// +// hardware barrier +int hardBarrier::NUM_GROUPS = 0; +int hardBarrier::CORES_PER_GROUP = 0; + +int hardBarrier::barrier_alloc(kmp_info_t *this_thr, int nthreads) { + // can be reallocated when team size is changed + if (is_barrier_allocated()) { + this->barrier_free(); + } + + int offset = this_thr->th.th_affin_mask->begin(); + KMP_DEBUG_ASSERT(this_thr->th.th_affin_mask->count() == 1); + + this->primary_core = offset; + if ((offset % CORES_PER_GROUP) + nthreads > CORES_PER_GROUP) { + this->is_hybrid = true; + } else { + this->is_hybrid = false; + } + + // we already checked that all threads are packed closely + // from the primary core in __kmp_can_use_hard_barrier + int group; + int first_group = (offset / CORES_PER_GROUP); + for (group = 0; group < NUM_GROUPS && nthreads > 0; group++) { + int nthreads_in_group; + if (group == 0) { + // handling for when the primary's core is not the first one in the group + nthreads_in_group = CORES_PER_GROUP - (offset % CORES_PER_GROUP); + if (nthreads_in_group > nthreads) + nthreads_in_group = nthreads; + } else { + nthreads_in_group = + (nthreads > CORES_PER_GROUP) ? CORES_PER_GROUP : nthreads; + } + + int _group = (group + first_group) % NUM_GROUPS; + + // search for a free bb + int bb; + for (bb = 0;; bb++) { + int ret, fd; + char buf[BUF_SIZE]; + ssize_t n; + + KMP_SNPRINTF(buf, BUF_SIZE, "%s/group%d/barrier%d/masks", SYSFS_ROOT, + _group, bb); + fd = open(buf, O_WRONLY); + if (fd < 0) { + // no more available barriers + this->num_groups = group; + barrier_free(); + return -1; + } + + n = KMP_SNPRINTF(buf, BUF_SIZE, "%d-%d", offset, + offset + nthreads_in_group - 1); + ret = write(fd, buf, n + 1); + close(fd); + if (ret > 0) { + // success + break; + } + } + + KA_TRACE(10, + ("hardBarrier::barrier_alloc. 
allocate bb: %d/%d\n", bb, _group)); + + this->group[group] = _group; + this->bb[group] = bb; + this->threads_in_group[group] = nthreads_in_group; + + // set next start position + offset += nthreads_in_group; + nthreads -= nthreads_in_group; + } + this->num_groups = group; + + return 0; +} + +void hardBarrier::barrier_free() { + if (!is_barrier_allocated()) { + return; + } + + for (int group = 0; group < this->num_groups && this->group[group] >= 0; + group++) { + char buf[BUF_SIZE]; + int ret, fd; + KMP_SNPRINTF(buf, BUF_SIZE, "%s/group%d/barrier%d/masks", SYSFS_ROOT, + this->group[group], this->bb[group]); + fd = open(buf, O_WRONLY); + // open error should not happen here + KMP_ASSERT(fd > 0); + + buf[0] = '\0'; + ret = write(fd, buf, 1); + // write error should not happen here + KMP_ASSERT(ret > 0); + close(fd); + + KA_TRACE(10, ("hardBarrier::barrier_free. free bb: %d/%d\n", + this->group[group], this->bb[group])); + + this->bb[group] = -1; + this->group[group] = -1; + this->threads_in_group[group] = 0; + for (int i = 0; i < CORES_PER_GROUP; i++) { + this->barrier_window[group][i] = -1; + } + } + this->num_groups = -1; +} + +void hardBarrier::get_window(kmp_info_t *this_thr, int tid) { + int group = get_group_from_tid(tid); + int id = get_window_index_from_tid(tid); + + int bb = this->bb[group]; + group = this->group[group]; + + if (this->barrier_window[group][id] != -1) { + // already assigned. restore window number + this_thr->th.th_hard_barrier_window = this->barrier_window[group][id]; + return; + } + + KMP_DEBUG_ASSERT(group >= 0 && bb >= 0); + + char buf[BUF_SIZE]; + int ret, fd; + + // get window number + KMP_SNPRINTF(buf, BUF_SIZE, "%s/group%d/barrier%d/user", SYSFS_ROOT, group, + bb); + fd = open(buf, O_RDONLY); + // open error should not happen here + KMP_ASSERT(fd > 0); + + ret = read(fd, buf, BUF_SIZE); + // read error should not happen here + KMP_ASSERT(ret > 0); + buf[ret] = '\0'; + + int window; + ret = KMP_SSCANF(buf, "%d", &window); + KMP_ASSERT(ret > 0); + + close(fd); + + KA_TRACE(10, ("hardBarrier::get_window. bb: %d/%d, tid: %d, window: %d\n", + group, bb, window)); + + /* + * The barrier window number to be used for this + * thread might change when nesting is used. 
Keep window number in team + */ + this->barrier_window[group][id] = window; + this_thr->th.th_hard_barrier_window = window; +} + +#define SYNC(reg) \ + asm volatile("mrs x1, " #reg "\n\t" \ + "mvn x1, x1\n\t" \ + "and x1, x1, #1\n\t" \ + "msr " #reg ", x1\n\t" \ + \ + "sevl\n\t" \ + \ + "1:mrs x2, " #reg "\n\t" \ + "and x2, x2, #1\n\t" \ + "cmp x1, x2\n\t" \ + \ + "beq 2f\n\t" \ + "b 1b\n\t" \ + \ + "2:\n\t" \ + : \ + : \ + : "x1", "x2", "cc", "memory") + +// same as above but wait event by wfe +#define SYNC_WFE(reg) \ + asm volatile("mrs x1, " #reg "\n\t" \ + "mvn x1, x1\n\t" \ + "and x1, x1, #1\n\t" \ + "msr " #reg ", x1\n\t" \ + \ + "sevl\n\t" \ + \ + "1:mrs x2, " #reg "\n\t" \ + "and x2, x2, #1\n\t" \ + "cmp x1, x2\n\t" \ + \ + "beq 2f\n\t" \ + "wfe\n\t" \ + "b 1b\n\t" \ + \ + "2:\n\t" \ + : \ + : \ + : "x1", "x2", "cc", "memory") + +// update (negate) own's BST bit +#define UPDATE_BST(reg, val) \ + asm volatile("mrs x1, " #reg "\n\t" \ + "mvn x1, x1\n\t" \ + "and x1, x1, #1\n\t" \ + "msr " #reg ", x1\n\t" \ + \ + "sevl\n\t" \ + "mov %x[val], x1\n\t" \ + : [val] "=r"(val) \ + : \ + : "x1", "cc", "memory") + +// check if LBSY bit has been changed +// (when all core has udpated BST, LBSY changes) +#define CHECK_LBSY(reg, val, completed) \ + asm volatile("mrs x2, " #reg "\n\t" \ + "and x2, x2, #1\n\t" \ + "cmp %x[val], x2\n\t" \ + "beq 1f\n\t" \ + \ + "mov %x[comp], #0\n\t" \ + "b 2f\n\t" \ + \ + "1:\n\t" \ + "mov %x[comp], #1\n\t" \ + \ + "2:\n\t" \ + : [comp] "=r"(completed) \ + : [val] "r"(val) \ + : "x2", "cc", "memory") + +// same as above but first wait event by wfe +#define CHECK_LBSY_WFE(reg, val, completed) \ + asm volatile("wfe\n\t" \ + "mrs x2, " #reg "\n\t" \ + "and x2, x2, #1\n\t" \ + "cmp %x[val], x2\n\t" \ + "beq 1f\n\t" \ + \ + "mov %x[comp], #0\n\t" \ + "b 2f\n\t" \ + \ + "1:\n\t" \ + "mov %x[comp], #1\n\t" \ + \ + "2:\n\t" \ + : [comp] "=r"(completed) \ + : [val] "r"(val) \ + : "x2", "cc", "memory") + +// task handling is based on __kmp_wait_template +void hardBarrier::sync(kmp_info_t *this_thr, int final_spin, int gtid, + int tid USE_ITT_BUILD_ARG(void *itt_sync_obj)) { + + int window = this_thr->th.th_hard_barrier_window; + +#if KMP_STATS_ENABLED + stats_state_e thread_state = KMP_GET_THREAD_STATE(); +#endif + +#if OMPT_SUPPORT + ompt_state_t ompt_entry_state; + ompt_data_t *tId; + if (ompt_enabled.enabled) { + ompt_entry_state = this_thr->th.ompt_thread_info.state; + if (!final_spin || ompt_entry_state != ompt_state_wait_barrier_implicit || + KMP_MASTER_TID(this_thr->th.th_info.ds.ds_tid)) { + ompt_lw_taskteam_t *team = NULL; + if (this_thr->th.th_team) + team = this_thr->th.th_team->t.ompt_serialized_team_info; + if (team) { + tId = &(team->ompt_task_info.task_data); + } else { + tId = OMPT_CUR_TASK_DATA(this_thr); + } + } else { + tId = &(this_thr->th.ompt_thread_info.task_data); + } + if (final_spin && this_thr->th.th_task_team == NULL) { + // implicit task is done. 
Either no taskqueue, or task-team finished + __ompt_implicit_task_end(this_thr, ompt_entry_state, tId); + } + } +#endif + + if (__kmp_tasking_mode == tskm_immediate_exec) { + // shortcut route when there is notask + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { + switch (window) { + case 0: + SYNC_WFE(s3_3_c15_c15_0); + break; + case 1: + SYNC_WFE(s3_3_c15_c15_1); + break; + case 2: + SYNC_WFE(s3_3_c15_c15_2); + break; + case 3: + SYNC_WFE(s3_3_c15_c15_3); + break; + default: + // should not happen + KMP_ASSERT(0); + break; + } + } else { + switch (window) { + case 0: + SYNC(s3_3_c15_c15_0); + break; + case 1: + SYNC(s3_3_c15_c15_1); + break; + case 2: + SYNC(s3_3_c15_c15_2); + break; + case 3: + SYNC(s3_3_c15_c15_3); + break; + default: + // should not happen + KMP_ASSERT(0); + break; + } + } + } else { + uint64_t val = 0; + // updagte BST bit + switch (window) { + case 0: + UPDATE_BST(s3_3_c15_c15_0, val); + break; + case 1: + UPDATE_BST(s3_3_c15_c15_1, val); + break; + case 2: + UPDATE_BST(s3_3_c15_c15_2, val); + break; + case 3: + UPDATE_BST(s3_3_c15_c15_3, val); + break; + default: + // should not happen + KMP_ASSERT(0); + break; + } + KA_TRACE(50, ("hardBarrier::sync_with_task: T#%d(%d:%d) new bst: %d, " + "final_spin: %d\n", + gtid, this_thr->th.th_team->t.t_id, tid, val, final_spin)); + + kmp_task_team_t *task_team = this_thr->th.th_task_team; + int tasks_completed = FALSE; + uint64_t completed = 0; + do { + // execute tasks + if (task_team != NULL) { + if (TCR_SYNC_4(task_team->tt.tt_active)) { + if (KMP_TASKING_ENABLED(task_team)) { + if (final_spin) { + do { + __kmp_atomic_execute_tasks_64( + this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, final_spin, + &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0); + } while (!tasks_completed); + } else { + __kmp_atomic_execute_tasks_64( + this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, final_spin, + &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0); + } + } else + this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; + } else { + KMP_DEBUG_ASSERT(!KMP_MASTER_TID(this_thr->th.th_info.ds.ds_tid)); +#if OMPT_SUPPORT + // task-team is done now, other cases should be catched above + if (final_spin && ompt_enabled.enabled) + __ompt_implicit_task_end(this_thr, ompt_entry_state, tId); +#endif + this_thr->th.th_task_team = NULL; + this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; + task_team = NULL; + } + } else { + this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; + } + + if (TCR_4(__kmp_global.g.g_done)) { + if (__kmp_global.g.g_abort) + __kmp_abort_thread(); + } + +#if KMP_STATS_ENABLED + // Check if thread has been signalled to idle state + // This indicates that the logical "join-barrier" has finished + if (this_thr->th.th_stats->isIdle() && + KMP_GET_THREAD_STATE() == FORK_JOIN_BARRIER) { + KMP_SET_THREAD_STATE(IDLE); + KMP_PUSH_PARTITIONED_TIMER(OMP_idle); + } +#endif + + if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME || + ((task_team != NULL) && TCR_4(task_team->tt.tt_found_tasks))) { + // if user requests busy loop or remaining tasks exist, + // do not sleep by wfe + switch (window) { + case 0: + CHECK_LBSY(s3_3_c15_c15_0, val, completed); + break; + case 1: + CHECK_LBSY(s3_3_c15_c15_1, val, completed); + break; + case 2: + CHECK_LBSY(s3_3_c15_c15_2, val, completed); + break; + case 3: + CHECK_LBSY(s3_3_c15_c15_3, val, completed); + break; + } + } else { + switch (window) { + case 0: + CHECK_LBSY_WFE(s3_3_c15_c15_0, val, completed); + break; + case 1: + CHECK_LBSY_WFE(s3_3_c15_c15_1, val, completed); + break; + case 2: + 
CHECK_LBSY_WFE(s3_3_c15_c15_2, val, completed); + break; + case 3: + CHECK_LBSY_WFE(s3_3_c15_c15_3, val, completed); + break; + } + } + } while (!completed); + } + +#if OMPT_SUPPORT + ompt_state_t ompt_exit_state = this_thr->th.ompt_thread_info.state; + if (ompt_enabled.enabled && ompt_exit_state != ompt_state_undefined) { +#if OMPT_OPTIONAL + if (final_spin) { + __ompt_implicit_task_end(this_thr, ompt_exit_state, tId); + ompt_exit_state = this_thr->th.ompt_thread_info.state; + } +#endif + if (ompt_exit_state == ompt_state_idle) { + this_thr->th.ompt_thread_info.state = ompt_state_overhead; + } + } +#endif +#if KMP_STATS_ENABLED + // If we were put into idle state, pop that off the state stack + if (KMP_GET_THREAD_STATE() == IDLE) { + KMP_POP_PARTITIONED_TIMER(); + KMP_SET_THREAD_STATE(thread_state); + this_thr->th.th_stats->resetIdleFlag(); + } +#endif +} + +static void __kmp_hard_barrier_gather( + enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hard_gather); + kmp_team_t *team = this_thr->th.th_team; + kmp_info_t **other_threads = team->t.t_threads; + + KA_TRACE(20, ("__kmp_hard_barrier_gather: T#%d(%d:%d) enter for " + "barrier type %d\n", + gtid, team->t.t_id, tid, bt)); + KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); + + // Optimized route for intra-group only barrier in plain barrier. + // Release barrier (one synchronization) is enough in that case + // if no tasking. When tasking is enabled, workers still need to wait + // the primary to setup task team in gather barrier + if (bt == bs_plain_barrier && __kmp_tasking_mode == tskm_immediate_exec && + team->t.h && team->t.h->is_barrier_allocated() && !team->t.h->is_hybrid) { + KMP_DEBUG_ASSERT(reduce == NULL); + KA_TRACE(20, ("__kmp_hard_barrier_gather: T#%d(%d:%d) skip barrier in" + "no tasking mode\n", + gtid, team->t.t_id, tid)); + return; + } + + if (!team->t.h || !team->t.h->is_barrier_allocated()) { + // fallback to soft barrier + KA_TRACE(20, ("__kmp_hard_barrier_gather: T#%d(%d:%d) fall back to hyper\n", + gtid, team->t.t_id, tid)); + return __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, + reduce USE_ITT_BUILD_ARG(itt_sync_obj)); + } + + if (team->t.h->is_hybrid) { + // sync within group first + team->t.h->sync(this_thr, FALSE, gtid, tid USE_ITT_BUILD_ARG(itt_sync_obj)); + + // then, group leader sync using soft barrier + // (simple linear barrier scheme for now) + if (team->t.h->is_group_leader(tid)) { + if (KMP_MASTER_TID(tid)) { + // reduce in my group + if (reduce) { + OMPT_REDUCTION_DECL(this_thr, gtid); + OMPT_REDUCTION_BEGIN; + // reduce in my group + int group = team->t.h->get_group_from_tid(tid); + for (int i = 1; i < team->t.h->threads_in_group[group]; i++) { + int child_tid = tid + i; + kmp_info_t *child_thr = other_threads[child_tid]; + (*reduce)(this_thr->th.th_local.reduce_data, + child_thr->th.th_local.reduce_data); + } + OMPT_REDUCTION_END; + } + + // wait and reduce other group leader + kmp_balign_team_t *team_bar = &team->t.t_bar[bt]; + kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP; + for (int i = 1; i < team->t.h->num_groups; i++) { + int child_tid = team->t.h->get_tid_of_group_leader(i); + kmp_flag_64<> flag( + &other_threads[child_tid]->th.th_bar[bt].bb.b_arrived, new_state); + flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); + if (reduce) { + OMPT_REDUCTION_DECL(this_thr, gtid); + OMPT_REDUCTION_BEGIN; + 
(*reduce)(this_thr->th.th_local.reduce_data, + other_threads[child_tid]->th.th_local.reduce_data); + OMPT_REDUCTION_END; + } + } + team_bar->b_arrived = new_state; + } else { + if (reduce) { + OMPT_REDUCTION_DECL(this_thr, gtid); + OMPT_REDUCTION_BEGIN; + // reduce in my group + int group = team->t.h->get_group_from_tid(tid); + for (int i = 1; i < team->t.h->threads_in_group[group]; i++) { + int child_tid = tid + i; + kmp_info_t *child_thr = other_threads[child_tid]; + (*reduce)(this_thr->th.th_local.reduce_data, + child_thr->th.th_local.reduce_data); + } + OMPT_REDUCTION_END; + } + // mark arrival to primary thread + kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; + kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]); + flag.release(); + } + } + + KA_TRACE(20, + ("__kmp_hard_barrier_gather: T#%d(%d:%d) exit hybrid barrier for " + "barrier type %d\n", + gtid, team->t.t_id, tid, bt)); + return; + } + + // non-hybrid barrier case. just sync within group + team->t.h->sync(this_thr, FALSE, gtid, tid USE_ITT_BUILD_ARG(itt_sync_obj)); + + if (KMP_MASTER_TID(tid)) { + // primary performes all reduce operation + if (reduce) { + OMPT_REDUCTION_DECL(this_thr, gtid); + OMPT_REDUCTION_BEGIN; + for (int child_tid = 1; child_tid < team->t.t_nproc; child_tid++) { + kmp_info_t *child_thr = other_threads[child_tid]; + (*reduce)(this_thr->th.th_local.reduce_data, + child_thr->th.th_local.reduce_data); + } + OMPT_REDUCTION_END; + } + } + + KA_TRACE(20, ("__kmp_hard_barrier_gather: T#%d(%d:%d) exit for " + "barrier type %d\n", + gtid, team->t.t_id, tid, bt)); +} + +static void __kmp_hard_barrier_release( + enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hard_release); + kmp_bstate_t *thr_bar; + kmp_team_t *team; + + KA_TRACE(20, ("__kmp_hard_barrier_release: T#%d(%d:%d)" + "enter for barrier type %d\n", + gtid, NULL, tid, bt)); + thr_bar = &this_thr->th.th_bar[bt].bb; + + // If this is not a fork barrier, everything is already set up. + // Provide the optimized route for intra-group only barrier which performs + // best + if (bt != bs_forkjoin_barrier) { + team = this_thr->th.th_team; + if (team->t.h && team->t.h->is_barrier_allocated() && + !team->t.h->is_hybrid) { + team->t.h->sync(this_thr, TRUE, gtid, + tid USE_ITT_BUILD_ARG(itt_sync_obj)); + return; + } + } + + if (!KMP_MASTER_TID(tid)) { + // workers and non-master group leaders need to check their presence in team + do { + KA_TRACE( + 20, + ("__kmp_hard_barrier_release: T#%d(%d:%d) worker. wait for team is " + "set up. th_used_in_team: %d\n", + gtid, NULL, tid, this_thr->th.th_used_in_team.load())); + // if this thread is not in team yet + if (this_thr->th.th_used_in_team.load() != 1 && + this_thr->th.th_used_in_team.load() != 3 && + this_thr->th.th_used_in_team.load() != 4) { + + // wait th_used_in_team will become 3 (by master in __kmp_add_threads) + kmp_flag_32 my_flag(&(this_thr->th.th_used_in_team), 3); + if (KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 2, + 0) || + this_thr->th.th_used_in_team.load() == 0) { + my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj)); + } + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + } + + if (this_thr->th.th_used_in_team.load() != 1 && + this_thr->th.th_used_in_team.load() != 3 && + this_thr->th.th_used_in_team.load() != 4) // spurious wake-up? 
+ continue; + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + + tid = __kmp_tid_from_gtid(gtid); + team = this_thr->th.th_team; + KMP_DEBUG_ASSERT(tid >= 0); + KMP_DEBUG_ASSERT(team); + KA_TRACE(20, ("__kmp_hard_barrier_release: T#%d(%d:%d) worker. woke up\n", + gtid, team, tid)); + + if (this_thr->th.th_used_in_team.load() == 3) { + KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 3, 1); + } + // if th_used_in_team is 4, then the team this thread currently belongs + // to will change the number of threads. We need to perform + // synchronization before that as hard barrier cannot wakeup a part of + // thread + + // if hard barrier cannot be used, fallback to softbarrier + if (!team->t.h || !team->t.h->is_barrier_allocated()) { + KA_TRACE(20, ("__kmp_hard_barrier_release: T#%d(%d:%d) worker. fall " + "back to hyper\n", + gtid, team, tid)); + __kmp_hyper_barrier_release( + bt, this_thr, gtid, tid, + propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj)); + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) { + // master is waiting th_used_in_team to become 0 in kmp_reap_thread + KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 4, 0); + return; + } + + // this thread will no longer belong to team + if (this_thr->th.th_used_in_team.load() == 4) { + KA_TRACE(20, ("__kmp_hard_barrier_release: T#%d(%d:%d) worker. this " + "thread becomes unused\n", + gtid, team, tid)); + KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 4, 2); + continue; + } + + return; + } + + // if this is a fork barrier, allocate a barrier window for this thread + if (bt == bs_forkjoin_barrier) { + // setup affinity early since hard barrier requires it + if (this_thr->th.th_new_place != this_thr->th.th_current_place) { + __kmp_affinity_set_place(gtid); + // we should call __kmp_aux_display_affinity here but it will be + // resulted in displaying the affinity information twice (here and + // in __kmp_fork_barrier after this function returns). skip here. + } + team->t.h->get_window(this_thr, tid); + } + + if (team->t.h->is_hybrid && team->t.h->is_group_leader(tid)) { + // group leader waits primary to wake me up + kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); + flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); + KMP_MB(); + + // intra group sync + team->t.h->sync(this_thr, FALSE, gtid, + tid USE_ITT_BUILD_ARG(itt_sync_obj)); + } else { + // intra group sync + team->t.h->sync(this_thr, TRUE, gtid, + tid USE_ITT_BUILD_ARG(itt_sync_obj)); + } + + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + + // check if this thread still belongs to the team + if (this_thr->th.th_used_in_team.load() == 1) { +#if KMP_BARRIER_ICV_PUSH + if (propagate_icvs) { + __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], + team, tid, FALSE); + // get icvs stored in team's struct + copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, + (kmp_internal_control_t *)team->t.h->team_icvs); + copy_icvs(&thr_bar->th_fixed_icvs, + &team->t.t_implicit_task_taskdata[tid].td_icvs); + } +#endif // KMP_BARRIER_ICV_PUSH + // success, exit from barrier + break; + } + + if (this_thr->th.th_used_in_team.load() == 4) { + KA_TRACE(20, ("__kmp_hard_barrier_release: T#%d(%d:%d) worker. 
this " + "thread becomes unused\n", + gtid, team, tid)); + KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 4, 2); + } + // next team may use soft barrier, reset status + for (int bs = 0; bs < bs_last_barrier; bs++) { + kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bs].bb; + thr_bar->b_arrived = KMP_INIT_BARRIER_STATE; + } + // thread now becomes unused, retry from start + } while (1); + } else { + // primary + team = this_thr->th.th_team; + tid = __kmp_tid_from_gtid(gtid); + KA_TRACE(20, ("__kmp_hard_barrier_release: T#%d(%d:%d) primary.\n", gtid, + team, tid)); + + // fallback to softbarrier; + if (!team->t.h || !team->t.h->is_barrier_allocated()) { + KA_TRACE(20, ("__kmp_hard_barrier_release: T#%d(%d:%d) primary. fallback " + "to hyper\n", + gtid, team, tid)); + return __kmp_hyper_barrier_release( + bt, this_thr, gtid, tid, + propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj)); + } + + if (bt == bs_forkjoin_barrier) { + if (this_thr->th.th_new_place != this_thr->th.th_current_place) { + __kmp_affinity_set_place(gtid); + } + team->t.h->get_window(this_thr, tid); + } + +#if KMP_BARRIER_ICV_PUSH + if (propagate_icvs) { + __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, + tid, FALSE); + copy_icvs(&thr_bar->th_fixed_icvs, + &team->t.t_implicit_task_taskdata[tid].td_icvs); + } +#endif + + if (team->t.h->is_hybrid) { + // wakeup group leader + kmp_info_t **other_threads = team->t.t_threads; + for (int i = 1; i < team->t.h->num_groups; i++) { + int child_tid = team->t.h->get_tid_of_group_leader(i); + kmp_flag_64<> flag(&other_threads[child_tid]->th.th_bar[bt].bb.b_go, + other_threads[child_tid]); + flag.release(); + } + } + + // intra group sync + team->t.h->sync(this_thr, TRUE, gtid, tid USE_ITT_BUILD_ARG(itt_sync_obj)); + } + + KA_TRACE(20, ("__kmp_hard_barrier_release: T#%d(%d:%d) exit for " + "barrier type %d\n", + gtid, team->t.t_id, tid, bt)); +} + +void __kmp_hard_barrier_wakeup_soft(kmp_info_t *master) { + __kmp_hyper_barrier_release(bs_forkjoin_barrier, master, + __kmp_gtid_from_thread(master), 0, + true USE_ITT_BUILD_ARG(NULL)); +} +//*****************************************************// // End of Barrier Algorithms // type traits for cancellable value @@ -1871,6 +2682,11 @@ reduce USE_ITT_BUILD_ARG(itt_sync_obj)); break; } + case bp_hard_bar: { + __kmp_hard_barrier_gather(bt, this_thr, gtid, tid, + reduce USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } case bp_hyper_bar: { // don't set branch bits to 0; use linear KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); @@ -1990,6 +2806,11 @@ FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); break; } + case bp_hard_bar: { + __kmp_hard_barrier_release(bt, this_thr, gtid, tid, + FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } case bp_hyper_bar: { KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, @@ -2143,6 +2964,11 @@ FALSE USE_ITT_BUILD_ARG(NULL)); break; } + case bp_hard_bar: { + __kmp_hard_barrier_release(bt, this_thr, gtid, tid, + FALSE USE_ITT_BUILD_ARG(NULL)); + break; + } default: { __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(NULL)); @@ -2276,6 +3102,11 @@ NULL USE_ITT_BUILD_ARG(itt_sync_obj)); break; } + case bp_hard_bar: { + __kmp_hard_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, + NULL USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } case bp_hyper_bar: { KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]); __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, @@ -2469,6 +3300,11 @@ 
TRUE USE_ITT_BUILD_ARG(NULL)); break; } + case bp_hard_bar: { + __kmp_hard_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, + TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } case bp_hyper_bar: { KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]); __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp --- a/openmp/runtime/src/kmp_global.cpp +++ b/openmp/runtime/src/kmp_global.cpp @@ -111,7 +111,7 @@ #endif // KMP_FAST_REDUCTION_BARRIER }; char const *__kmp_barrier_pattern_name[bp_last_bar] = { - "linear", "tree", "hyper", "hierarchical", "dist"}; + "linear", "tree", "hyper", "hierarchical", "dist", "hard"}; int __kmp_allThreadsSpecified = 0; size_t __kmp_align_alloc = CACHE_LINE; diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp --- a/openmp/runtime/src/kmp_runtime.cpp +++ b/openmp/runtime/src/kmp_runtime.cpp @@ -13,6 +13,7 @@ #include "kmp.h" #include "kmp_affinity.h" #include "kmp_atomic.h" +#include "kmp_barrier.h" #include "kmp_environment.h" #include "kmp_error.h" #include "kmp_i18n.h" @@ -111,6 +112,10 @@ int new_nthreads); void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads); +static void __kmp_resize_hard_barrier(kmp_team_t *team, kmp_info_t *master, + int new_nproc, int new_proc_bind, + bool force = false); + /* Calculate the identifier of the current thread */ /* fast (and somewhat portable) way to get unique identifier of executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */ @@ -1011,6 +1016,15 @@ __kmp_partition_places(team); } #endif + if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_hard_bar) { + // affinity is not setup for teams construct therefore we cannot use hard + // barrier for teams construct + if (!fork_teams_workers && team->t.t_nproc > 1) { + __kmp_resize_hard_barrier(team, master_th, team->t.t_nproc, + team->t.t_proc_bind); + } + __kmp_add_threads_to_team(team, team->t.t_nproc); + } } if (__kmp_display_affinity && team->t.t_display_affinity != 1) { @@ -1350,6 +1364,45 @@ #endif } +// return true if hard barrier can be used for this team +static bool __kmp_can_use_hard_barrier(kmp_team_t *team, kmp_info_t *master, + int nthreads, int proc_bind) { + // TODO: add support of KMP_AFFINITY + if (proc_bind != proc_bind_close) { + KA_TRACE(20, ("__kmp_can_use_hard_barrier: proc bind is not close: %d/%d\n", + proc_bind, __kmp_affinity_type)); + return false; + } + + if (master->th.th_affin_mask->count() != 1) { + KA_TRACE( + 20, + ("__kmp_can_use_hard_barrier: master affinity is not appropriate: %d\n", + master->th.th_affin_mask->count())); + return false; + } + + int primary_core = master->th.th_affin_mask->begin(); + if (((primary_core % hardBarrier::CORES_PER_GROUP) + nthreads) > + hardBarrier::CORES_PER_GROUP * hardBarrier::NUM_GROUPS) { + KA_TRACE(20, ("__kmp_can_use_hard_barrier: too many workers: %d/%d/%d\n", + primary_core, nthreads, team->t.t_nproc)); + return false; + } + + if ((primary_core + nthreads) % hardBarrier::CORES_PER_GROUP == 1) { + KA_TRACE(20, ("__kmp_can_use_hard_barrier: each group needs at least 1 " + "threads: %d/%d/%d\n", + primary_core, nthreads, team->t.t_nproc)); + return false; + } + + KA_TRACE(20, ("__kmp_can_use_hard_barrier: hard barrier can be used for this " + "team:%p\n", + team)); + return true; +} + /* most of the work for a fork */ /* return true if we really went parallel, false if serialized */ int __kmp_fork_call(ident_t 
*loc, int gtid, @@ -1658,6 +1711,14 @@ __kmp_partition_places(parent_team); #endif + if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_hard_bar) { + // force resize since this team may be able to use hard barrier + __kmp_resize_hard_barrier(parent_team, parent_team->t.t_threads[0], + parent_team->t.t_nproc, + parent_team->t.t_proc_bind, true); + __kmp_add_threads_to_team(parent_team, parent_team->t.t_nproc); + } + KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, " "master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid)); @@ -2051,6 +2112,8 @@ argc USE_NESTED_HOT_ARG(master_th)); if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs); + if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_hard_bar) + copy_icvs((kmp_internal_control_t *)team->t.h->team_icvs, &new_icvs); } else { /* allocate a new parallel team */ KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); @@ -2064,6 +2127,10 @@ if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &master_th->th.th_current_task->td_icvs); + if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_hard_bar) { + copy_icvs((kmp_internal_control_t *)team->t.h->team_icvs, + &master_th->th.th_current_task->td_icvs); + } } KF_TRACE( 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team)); @@ -2400,6 +2467,12 @@ } #endif + if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_hard_bar && + parent_team->t.h && parent_team->t.h->is_barrier_allocated()) { + // reset barrier window status for parent team + parent_team->t.h->get_window(master_th, 0); + } + return; } @@ -2435,6 +2508,10 @@ __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { team->t.b->update_num_threads(team->t.t_nproc); __kmp_add_threads_to_team(team, team->t.t_nproc); + } else if (team->t.t_nproc > 1 && + __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == + bp_hard_bar) { + __kmp_add_threads_to_team(team, team->t.t_nproc); } } @@ -2602,6 +2679,10 @@ master_th->th.th_team_nproc = parent_team->t.t_nproc; master_th->th.th_team_master = parent_team->t.t_threads[0]; master_th->th.th_team_serialized = parent_team->t.t_serialized; + if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_hard_bar && + parent_team->t.h && parent_team->t.h->is_barrier_allocated()) { + parent_team->t.h->get_window(master_th, 0); + } /* restore serialized team, if need be */ if (parent_team->t.t_serialized && @@ -2729,6 +2810,10 @@ if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth); } + if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_hard_bar) { + __kmp_resize_hard_barrier(hot_team, hot_team->t.t_threads[0], new_nth, + hot_team->t.t_proc_bind); + } // Release the extra threads we don't need any more. 
for (f = new_nth; f < hot_team->t.t_nproc; f++) { KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); @@ -2751,6 +2836,9 @@ if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { hot_team->t.b->update_num_threads(new_nth); __kmp_add_threads_to_team(hot_team, new_nth); + } else if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == + bp_hard_bar) { + __kmp_add_threads_to_team(hot_team, new_nth); } __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); @@ -4369,7 +4457,10 @@ new_thr->th.th_task_state_top = 0; new_thr->th.th_task_state_stack_sz = 4; - if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar || + (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_hard_bar && + new_thr->th.th_team->t.h && + new_thr->th.th_team->t.h->is_barrier_allocated())) { // Make sure pool thread has transitioned to waiting on own thread struct KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0); // Thread activated in __kmp_allocate_team when increasing team size @@ -5058,6 +5149,117 @@ #endif // KMP_AFFINITY_SUPPORTED +// check if we can use hard barrier in this team and setup bb. +// otherwise just fallback to use soft barrier +static void __kmp_resize_hard_barrier(kmp_team_t *team, kmp_info_t *master, + int new_nproc, int new_proc_bind, + bool force) { + + int old_nproc = team->t.t_nproc; + + KA_TRACE(10, ("__kmp_resize_hard_barrier: enter T#%d, nthreads: %d -> %d, " + "proc_bind %d -> %d\n", + team->t.t_id, team->t.t_nproc, new_nproc, team->t.t_proc_bind, + new_proc_bind)); + + // below logic is mostly the same as distribution barrier + if (force || (old_nproc > 1 && old_nproc != new_nproc)) { + if (team->t.h->is_barrier_allocated()) { + // set all workers to transition state + for (int f = 1; f < old_nproc; ++f) { + KMP_DEBUG_ASSERT(team->t.t_threads[f] != NULL); + // Ignore threads that are already inactive or not present in the team + if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) { + // teams construct causes thread_limit to get passed in, and some of + // those could be inactive; just ignore them + continue; + } + // If thread is transitioning still to in_use state, wait for it + if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) { + while (team->t.t_threads[f]->th.th_used_in_team.load() == 3) + KMP_CPU_PAUSE(); + } + // The thread should be in_use now + KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1); + // Transition to unused state (but sync before leaving) + team->t.t_threads[f]->th.th_used_in_team.store(4); + } + + // wakeup old thread waiting in release barrier + KA_TRACE(20, + ("__kmp_resize_hard_barrier: T#%d, wake up old hard barrier\n", + team->t.t_id)); + if (team->t.h->is_hybrid) { + for (int i = 1; i < team->t.h->num_groups; i++) { + int child_tid = team->t.h->get_tid_of_group_leader(i); + kmp_flag_64<> flag(&team->t.t_threads[child_tid] + ->th.th_bar[bs_forkjoin_barrier] + .bb.b_go, + team->t.t_threads[child_tid]); + flag.release(); + } + } + team->t.h->sync(master, FALSE, 0, 0 USE_ITT_BUILD_ARG(NULL)); + + // next team may use soft barrier, reset status + for (int bs = 0; bs < bs_last_barrier; bs++) { + kmp_balign_team_t *team_bar = &team->t.t_bar[bs]; + team_bar->b_arrived = KMP_INIT_BARRIER_STATE; + } + } else { + // thread is sleeping in softbarrier + for (int f = 1; f < old_nproc; ++f) { + team->t.t_threads[f]->th.th_used_in_team.store(4); + } + KA_TRACE(20, + ("__kmp_resize_hard_barrier: T#%d, wake up old soft barrier\n", + 
team->t.t_id)); + + // XXX: it seems master->th.th_team points to the task team at this point. + // Is there a better way to handle this? + int temp = master->th.th_team_nproc; + kmp_team_t *temp_team = master->th.th_team; + master->th.th_team_nproc = team->t.t_nproc; + master->th.th_team = team; + __kmp_hard_barrier_wakeup_soft(master); + master->th.th_team_nproc = temp; + master->th.th_team = temp_team; + } + + // wait for threads to be removed from team + for (int f = 1; f < old_nproc; ++f) { + while (team->t.t_threads[f]->th.th_used_in_team.load() != 0) + KMP_CPU_PAUSE(); + } + } + + // resize main logic + if (team->t.h->is_barrier_allocated() && team->t.t_nproc == new_nproc && + team->t.t_proc_bind == new_proc_bind) { + KA_TRACE(10, ("__kmp_resize_hard_barrier: T#%d, hard barrier is already set " + "up for team\n", + team->t.t_id)); + } else { + if (__kmp_can_use_hard_barrier(team, master, new_nproc, new_proc_bind) && + team->t.h->barrier_alloc(master, new_nproc) == 0) { + KA_TRACE(10, ("__kmp_resize_hard_barrier: T#%d, allocate hard barrier. " + "nthreads: %d, " + "proc_bind: %d\n", + team->t.t_id, new_nproc, new_proc_bind)); + } else { + // in some cases this is expected (e.g. the outer team of nested parallelism + // when OMP_PROC_BIND=spread,close is used) + KMP_INFORM(HardBarrierCannotBeUsed); + team->t.h->barrier_free(); + KA_TRACE( + 10, + ("__kmp_resize_hard_barrier: T#%d, cannot use hard barrier. use soft " + "barrier. nthreads: %d, proc_bind: %d\n", + team->t.t_id, new_nproc, new_proc_bind)); + } + } +} + /* allocate a new team data structure to use. take one off of the free pool if available */ kmp_team_t * @@ -5138,6 +5340,19 @@ __kmp_resize_dist_barrier(team, old_nthr, new_nproc); } + if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_hard_bar) { + if (!team->t.h) { + // See comments in hardBarrier::allocate() for why this is needed + team->t.h = hardBarrier::allocate(); + } + // if do_place_partition == 0 then this team cannot use the hard barrier + // since affinity will not be set up for this team. 
Skip hard barrier + // initialization and use software barrier + if (do_place_partition) { + __kmp_resize_hard_barrier(team, master, new_nproc, new_proc_bind); + } + } + // If not doing the place partition, then reset the team's proc bind // to indicate that partitioning of all threads still needs to take place if (do_place_partition == 0) @@ -5267,6 +5482,12 @@ __kmp_partition_places(team); #endif } + + if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_hard_bar) { + // call __kmp_add_thread_to_team after __kmp_partition_places + // since hard barrier needs affinity + __kmp_add_threads_to_team(team, new_nproc); + } } else { // team->t.t_nproc < new_nproc #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED kmp_affin_mask_t *old_mask; @@ -5408,6 +5629,12 @@ __kmp_partition_places(team); #endif } + + if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_hard_bar) { + // call __kmp_add_thread_to_team after __kmp_partition_places + // since hard barrier needs affinity + __kmp_add_threads_to_team(team, new_nproc); + } } // Check changes in number of threads kmp_info_t *master = team->t.t_threads[0]; @@ -5480,6 +5707,13 @@ } } + if (max_nproc > 1 && + __kmp_barrier_gather_pattern[bs_plain_barrier] == bp_hard_bar) { + if (!team->t.h) { + team->t.h = hardBarrier::allocate(); + } + } + /* setup the team for fresh use */ __kmp_initialize_team(team, new_nproc, new_icvs, NULL); @@ -5541,6 +5775,12 @@ team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub); } + // if max_nproc == 1 then there is no need to use barrier at all + if (max_nproc > 1 && + __kmp_barrier_gather_pattern[bs_plain_barrier] == bp_hard_bar) { + team->t.h = hardBarrier::allocate(); + } + /* NOTE well, for some reason allocating one big buffer and dividing it up seems to really hurt performance a lot on the P4, so, let's not use this */ __kmp_allocate_team_arrays(team, max_nproc); @@ -5702,7 +5942,10 @@ KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 1, 2); } - __kmp_free_thread(team->t.t_threads[f]); + if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_hard_bar) { + KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), + 1, 4); + } } if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { @@ -5726,7 +5969,52 @@ } } + if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_hard_bar) { + if (team->t.h && team->t.h->is_barrier_allocated()) { + // wakeup sleeping threads + if (team->t.h->is_hybrid) { + for (int i = 1; i < team->t.h->num_groups; i++) { + int child_tid = team->t.h->get_tid_of_group_leader(i); + kmp_flag_64<> flag(&team->t.t_threads[child_tid] + ->th.th_bar[bs_forkjoin_barrier] + .bb.b_go, + team->t.t_threads[child_tid]); + flag.release(); + } + } +#if KMP_NESTED_HOT_TEAMS + if (team->t.t_threads[0]->th.th_hot_teams) { + // If nested hot teams is used, __kmp_free_hot_teams will call + // this function in for loop during final cleanup and therefore + // current affinity may not match the primary thread's affinity + // for this team. 
Restore the affinity to perform below sync + int gtid = team->t.t_threads[0]->th.th_info.ds.ds_gtid; + __kmp_affinity_set_place(gtid); + } +#endif + team->t.h->sync(team->t.t_threads[0], FALSE, 0, + 0 USE_ITT_BUILD_ARG(NULL)); + } else { + // same comments as in __kmp_can_use_hard_barrier + int temp_nproc = team->t.t_threads[0]->th.th_team_nproc; + kmp_team_t *temp_team = team->t.t_threads[0]->th.th_team; + + team->t.t_threads[0]->th.th_team_nproc = team->t.t_nproc; + team->t.t_threads[0]->th.th_team = team; + __kmp_hard_barrier_wakeup_soft(team->t.t_threads[0]); + team->t.t_threads[0]->th.th_team_nproc = temp_nproc; + team->t.t_threads[0]->th.th_team = temp_team; + } + + // Wait for threads to be removed from team + for (int f = 1; f < team->t.t_nproc; ++f) { + while (team->t.t_threads[f]->th.th_used_in_team.load() != 0) + KMP_CPU_PAUSE(); + } + } + for (f = 1; f < team->t.t_nproc; ++f) { + __kmp_free_thread(team->t.t_threads[f]); team->t.t_threads[f] = NULL; } @@ -5735,6 +6023,13 @@ distributedBarrier::deallocate(team->t.b); team->t.b = NULL; } + if (team->t.t_max_nproc > 1 && + __kmp_barrier_gather_pattern[bs_plain_barrier] == bp_hard_bar) { + if (team->t.h) { + hardBarrier::deallocate(team->t.h); + team->t.h = NULL; + } + } /* put the team back in the team pool */ /* TODO limit size of team pool, call reap_team if pool too large */ team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool); @@ -6133,7 +6428,8 @@ KA_TRACE( 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", gtid)); - if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar || + __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_hard_bar) { while ( !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3)) KMP_CPU_PAUSE(); @@ -7261,6 +7557,32 @@ #endif /* KMP_AFFINITY_SUPPORTED */ + // If we cannot use hardware barrier at all, fallback to hyper barrier here + // As hardware barrier has some restriction on place setting and it will not + // change during runtime, check the condition after + // __kmp_affinity_initialize() + if (__kmp_barrier_release_pattern[bs_plain_barrier] == bp_hard_bar) { + bool fallback = false; + + if (!hardBarrier::system_supports_hard_barrier()) { + KMP_WARNING(HardBarrierNotSupported, + "System does not support hardware barrier"); + fallback = true; + } + if (!__kmp_check_places_for_hard_barrier()) { + KMP_WARNING(HardBarrierNotSupported, + "Place setting is not appropraite for hardware barrier"); + fallback = true; + } + + if (fallback) { + for (int i = bs_plain_barrier; i < bs_last_barrier; i++) { + __kmp_barrier_release_pattern[i] = bp_hyper_bar; + __kmp_barrier_gather_pattern[i] = bp_hyper_bar; + } + } + } + KMP_ASSERT(__kmp_xproc > 0); if (__kmp_avail_proc == 0) { __kmp_avail_proc = __kmp_xproc; diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp --- a/openmp/runtime/src/kmp_settings.cpp +++ b/openmp/runtime/src/kmp_settings.cpp @@ -13,6 +13,7 @@ #include "kmp.h" #include "kmp_affinity.h" #include "kmp_atomic.h" +#include "kmp_barrier.h" #if KMP_USE_HIER_SCHED #include "kmp_dispatch_hier.h" #endif @@ -1714,7 +1715,9 @@ const char *var; /* ---------- Barrier method control ------------ */ - static int dist_req = 0, non_dist_req = 0; + static int dist_req = 0; + static int hard_req = 0; + static int other_req = 0; static bool warn = 1; for (int i = bs_plain_barrier; i < bs_last_barrier; i++) { var = __kmp_barrier_pattern_env_name[i]; @@ -1729,8 
+1732,10 @@ ',')) { if (j == bp_dist_bar) { dist_req++; + } else if (j == bp_hard_bar) { + hard_req++; } else { - non_dist_req++; + other_req++; } __kmp_barrier_gather_pattern[i] = (kmp_bar_pat_e)j; break; @@ -1748,8 +1753,10 @@ if (__kmp_str_match(__kmp_barrier_pattern_name[j], 1, comma + 1)) { if (j == bp_dist_bar) { dist_req++; + } else if (j == bp_hard_bar) { + hard_req++; } else { - non_dist_req++; + other_req++; } __kmp_barrier_release_pattern[i] = (kmp_bar_pat_e)j; break; @@ -1767,9 +1774,8 @@ } if (dist_req != 0) { // set all barriers to dist - if ((non_dist_req != 0) && warn) { - KMP_INFORM(BarrierPatternOverride, name, - __kmp_barrier_pattern_name[bp_dist_bar]); + if ((hard_req + other_req != 0) && warn) { + KMP_INFORM(BarrierPatternOverride, "dist"); warn = 0; } for (int i = bs_plain_barrier; i < bs_last_barrier; i++) { @@ -1778,6 +1784,16 @@ if (__kmp_barrier_gather_pattern[i] != bp_dist_bar) __kmp_barrier_gather_pattern[i] = bp_dist_bar; } + } else if (hard_req != 0) { + // set all barriers to hard + if (other_req != 0 && warn) { + KMP_INFORM(BarrierPatternOverride, "hard"); + warn = 0; + } + for (int i = bs_plain_barrier; i < bs_last_barrier; i++) { + __kmp_barrier_release_pattern[i] = bp_hard_bar; + __kmp_barrier_gather_pattern[i] = bp_hard_bar; + } } } // __kmp_stg_parse_barrier_pattern diff --git a/openmp/runtime/src/kmp_stats.h b/openmp/runtime/src/kmp_stats.h --- a/openmp/runtime/src/kmp_stats.h +++ b/openmp/runtime/src/kmp_stats.h @@ -248,6 +248,8 @@ // KMP_hyper_release -- time in __kmp_hyper_barrier_release // KMP_dist_gather -- time in __kmp_dist_barrier_gather // KMP_dist_release -- time in __kmp_dist_barrier_release +// KMP_hard_gather -- time in __kmp_hard_barrier_gather +// KMP_hard_release -- time in __kmp_hard_barrier_release // clang-format off #define KMP_FOREACH_DEVELOPER_TIMER(macro, arg) \ macro(KMP_fork_call, 0, arg) \ @@ -263,6 +265,8 @@ macro(KMP_linear_release, 0, arg) \ macro(KMP_tree_gather, 0, arg) \ macro(KMP_tree_release, 0, arg) \ + macro(KMP_hard_gather, 0, arg) \ + macro(KMP_hard_release, 0, arg) \ macro(USER_resume, 0, arg) \ macro(USER_suspend, 0, arg) \ macro(USER_mwait, 0, arg) \ diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp --- a/openmp/runtime/src/kmp_tasking.cpp +++ b/openmp/runtime/src/kmp_tasking.cpp @@ -16,6 +16,7 @@ #include "kmp_stats.h" #include "kmp_wait_release.h" #include "kmp_taskdeps.h" +#include "kmp_barrier.h" #if OMPT_SUPPORT #include "ompt-specific.h" @@ -584,6 +585,11 @@ gtid, taskdata, thread_data->td.td_deque_ntasks, thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); + if (__kmp_barrier_gather_pattern[bs_plain_barrier] == bp_hard_bar) { + // wakeup sleeping thread if any + hardBarrier::wakeup(); + } + __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); return TASK_SUCCESSFULLY_PUSHED; @@ -3261,7 +3267,7 @@ // proceed. If this thread is in the last spin loop in the barrier, // waiting to be released, we know that the termination condition will not // be satisfied, so don't waste any cycles checking it. 
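+ // Note: hardBarrier::sync() executes tasks with flag == NULL even on the final spin, so a NULL flag by itself must not satisfy the spin condition; bail out early only when this is not the final spin. 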
- if (flag == NULL || (!final_spin && flag->done_check())) { + if (!final_spin && (flag == NULL || flag->done_check())) { KA_TRACE( 15, ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n", diff --git a/openmp/runtime/src/z_Linux_util.cpp b/openmp/runtime/src/z_Linux_util.cpp --- a/openmp/runtime/src/z_Linux_util.cpp +++ b/openmp/runtime/src/z_Linux_util.cpp @@ -2729,4 +2729,11 @@ } #endif // KMP_OS_LINUX +#ifdef KMP_OS_LINUX +int __kmp_get_online_cores() { + int r; + __kmp_type_convert(sysconf(_SC_NPROCESSORS_ONLN), &(r)); + return r; +} +#endif // end of file //
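
Note on the UPDATE_BST/CHECK_LBSY macros in kmp_barrier.cpp: they implement a sense-reversing barrier in hardware. Each core flips its own BST bit in the barrier window register, and the LBSY bit flips once every participating core has flipped its BST bit. The sketch below is only a software analogy of that handshake (an illustration, not the actual register protocol): it mirrors the control flow hardBarrier::sync() follows, where the runtime keeps polling LBSY, draining the task queue or sleeping in wfe, until the parity matches.

#include <atomic>

// Software analogy of the BST/LBSY handshake (illustration only).
// Each thread toggles its own "BST" parity; the "LBSY" parity toggles once
// all threads of the episode have toggled. In hardware the barrier blade
// flips LBSY by itself; here the last arriving thread emulates that.
class SenseReversingBarrier {
public:
  explicit SenseReversingBarrier(int nthreads)
      : nthreads_(nthreads), arrived_(0), lbsy_(false) {}

  void sync() {
    // Per-thread parity; assumes each thread uses a single barrier instance.
    static thread_local bool my_bst = false;
    my_bst = !my_bst; // UPDATE_BST: flip own bit
    if (arrived_.fetch_add(1, std::memory_order_acq_rel) == nthreads_ - 1) {
      arrived_.store(0, std::memory_order_relaxed);
      lbsy_.store(my_bst, std::memory_order_release); // "LBSY" follows the new parity
    } else {
      // CHECK_LBSY loop: hardBarrier::sync() additionally executes queued
      // tasks here, and uses CHECK_LBSY_WFE to sleep between polls when
      // blocktime is not KMP_MAX_BLOCKTIME and no tasks remain.
      while (lbsy_.load(std::memory_order_acquire) != my_bst) {
      }
    }
  }

private:
  const int nthreads_;
  std::atomic<int> arrived_;
  std::atomic<bool> lbsy_;
};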
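
The is_group_leader / get_group_from_tid / get_tid_of_group_leader helpers deliberately allow the primary thread's core to sit in the middle of a barrier group. Below is a small standalone worked example of that arithmetic, using assumed values CORES_PER_GROUP = 4 and primary_core = 6 (i.e. the primary occupies the third core of its hardware group); both values are illustrative, not taken from real hardware.

#include <cassert>

// Standalone copy of the tid-to-group arithmetic from hardBarrier, with
// assumed example values CORES_PER_GROUP = 4 and primary_core = 6.
constexpr int CORES_PER_GROUP = 4; // assumption for the example
constexpr int primary_core = 6;    // assumption for the example

constexpr bool is_group_leader(int tid) {
  return tid == 0 || ((tid + primary_core) % CORES_PER_GROUP) == 0;
}
constexpr int get_group_from_tid(int tid) {
  return (tid + (primary_core % CORES_PER_GROUP)) / CORES_PER_GROUP;
}
constexpr int get_tid_of_group_leader(int group) {
  return (group * CORES_PER_GROUP) - (primary_core % CORES_PER_GROUP);
}

int main() {
  // With close binding and one core per place, thread tid runs on core 6+tid.
  // Threads 0 and 1 share the primary's (partial) logical group 0: cores 6-7.
  assert(get_group_from_tid(0) == 0 && get_group_from_tid(1) == 0);
  // Thread 2 lands on core 8, the first core of the next barrier group.
  assert(is_group_leader(2) && get_group_from_tid(2) == 1);
  assert(get_tid_of_group_leader(1) == 2);
  // Threads 2..5 fill logical group 1 (cores 8-11); thread 6 leads group 2.
  assert(get_group_from_tid(5) == 1 && is_group_leader(6));
  return 0;
}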
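
Finally, a minimal usage sketch of how the new pattern would be selected at run time, assuming the hard_barrier sysfs driver described above is loaded and the runtime is built with this patch. The KMP_*_BARRIER_PATTERN variable is the existing barrier-pattern knob; "hard" is the value added here, and the place/bind settings follow the restrictions checked by __kmp_check_places_for_hard_barrier and __kmp_can_use_hard_barrier (one CPU per place, consecutive places, proc_bind close).

// test_hard_barrier.cpp -- usage sketch (assumes a machine where each place
// holds a single CPU, e.g. no SMT, and the hard_barrier driver is loaded).
//
//   clang++ -fopenmp test_hard_barrier.cpp -o test_hard_barrier
//   export OMP_PROC_BIND=close        # required by __kmp_can_use_hard_barrier
//   export OMP_PLACES=cores           # one CPU per place, consecutive cores
//   export KMP_PLAIN_BARRIER_PATTERN=hard,hard  # settings code then applies
//                                               # "hard" to all barrier types
//   ./test_hard_barrier
//
// If a check fails, the runtime prints HardBarrierNotSupported (or
// HardBarrierCannotBeUsed per team) and falls back to the hyper barrier.
#include <cstdio>

int main() {
  long sum = 0;
  // The implicit barriers of this region go through __kmp_hard_barrier_gather
  // and __kmp_hard_barrier_release when the hard pattern is active.
#pragma omp parallel for reduction(+ : sum)
  for (int i = 0; i < 1000000; i++)
    sum += i;
  std::printf("sum = %ld\n", sum);
  return 0;
}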