diff --git a/openmp/runtime/src/i18n/en_US.txt b/openmp/runtime/src/i18n/en_US.txt --- a/openmp/runtime/src/i18n/en_US.txt +++ b/openmp/runtime/src/i18n/en_US.txt @@ -269,7 +269,7 @@ Using_uint_Value "%1$s value \"%2$u\" will be used." Using_uint64_Value "%1$s value \"%2$s\" will be used." Using_str_Value "%1$s value \"%2$s\" will be used." -BarrierPatternOverride "Mixing other barrier patterns with dist is prohibited. Using dist for all barrier patterns." +BarrierPatternOverride "Mixing other barrier patterns with %1$s is prohibited. Using %1$s for all barrier patterns." MaxValueUsing "%1$s maximum value \"%2$d\" will be used." MinValueUsing "%1$s minimum value \"%2$d\" will be used." MemoryAllocFailed "Memory allocation failed." @@ -479,6 +479,8 @@ AffHWSubsetAllFiltered "KMP_HW_SUBSET ignored: all hardware resources would be filtered, please reduce the filter." AffHWSubsetAttrsNonHybrid "KMP_HW_SUBSET ignored: Too many attributes specified. This machine is not a hybrid architecutre." AffHWSubsetIgnoringAttr "KMP_HW_SUBSET: ignoring %1$s attribute. This machine is not a hybrid architecutre." +HardBarrierNotSupported "%1$s: Falling back to the software barrier for this entire run." +HardBarrierCannotBeUsed "Hard barrier cannot be used for this team. Falling back to the software barrier." # -------------------------------------------------------------------------------------------------- -*- HINTS -*- diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -115,7 +115,6 @@ #include "kmp_debug.h" #include "kmp_lock.h" #include "kmp_version.h" -#include "kmp_barrier.h" #if USE_DEBUGGER #include "kmp_debugger.h" #endif @@ -730,6 +729,8 @@ virtual void clear(int i) {} // Zero out entire mask virtual void zero() {} + // Count the number of bits set in the mask + virtual int count() const { return 0; } // Copy src into this mask virtual void copy(const Mask *src) {} // this &= rhs @@ -1991,6 +1992,7 @@ branching factor 2^n */ bp_hierarchical_bar = 3, /* Machine hierarchy tree */ bp_dist_bar = 4, /* Distributed barrier */ + bp_hard_bar = 5, /* Hardware barrier */ bp_last_bar /* Placeholder to mark the end */ } kmp_bar_pat_e; @@ -2789,6 +2791,7 @@ std::atomic<bool> th_blocking; #endif kmp_cg_root_t *th_cg_roots; // list of cg_roots associated with this thread + int th_hard_barrier_window; // used for hardware barrier } kmp_base_info_t; typedef union KMP_ALIGN_CACHE kmp_info { @@ -2836,6 +2839,9 @@ #endif #define KMP_INLINE_ARGV_ENTRIES (int)(KMP_INLINE_ARGV_BYTES / KMP_PTR_SKIP) +class distributedBarrier; +class hardBarrier; + typedef struct KMP_ALIGN_CACHE kmp_base_team { // Synchronization Data // --------------------------------------------------------------------------- @@ -2929,6 +2935,7 @@ void *t_stack_id; // team specific stack stitching id (for ittnotify) #endif /* USE_ITT_BUILD */ distributedBarrier *b; // Distributed barrier data associated with team + hardBarrier *h; // Hard barrier data associated with team } kmp_base_team_t; union KMP_ALIGN_CACHE kmp_team { @@ -3632,6 +3639,8 @@ extern void __kmp_cleanup_hierarchy(); extern void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar); +extern bool __kmp_check_places_for_hard_barrier(); + #if KMP_USE_FUTEX extern int __kmp_futex_determine_capable(void); @@ -3721,6 +3730,8 @@ extern void __kmp_end_split_barrier(enum barrier_type bt, int gtid); extern int __kmp_barrier_gomp_cancel(int gtid); +extern void __kmp_hard_barrier_wakeup_soft(kmp_info_t *master); + /*! 
* Tell the fork call which compiler generated the fork call, and therefore how * to deal with the call. @@ -4199,6 +4210,8 @@ extern void __kmp_hidden_helper_worker_thread_signal(); extern void __kmp_hidden_helper_main_thread_release(); +extern int __kmp_get_online_cores(); + // Check whether a given thread is a hidden helper thread #define KMP_HIDDEN_HELPER_THREAD(gtid) \ ((gtid) >= 1 && (gtid) <= __kmp_hidden_helper_threads_num) diff --git a/openmp/runtime/src/kmp_affinity.h b/openmp/runtime/src/kmp_affinity.h --- a/openmp/runtime/src/kmp_affinity.h +++ b/openmp/runtime/src/kmp_affinity.h @@ -52,6 +52,7 @@ int next(int previous) const override { return hwloc_bitmap_next(mask, previous); } + int count() const override { return 0; } int get_system_affinity(bool abort_on_error) override { KMP_ASSERT2(KMP_AFFINITY_CAPABLE(), "Illegal get affinity operation when not capable"); @@ -291,6 +292,13 @@ for (mask_size_type i = 0; i < e; ++i) mask[i] = (mask_t)0; } + int count() const override { + int count = 0; + for (int i = begin(); i < end(); i = next(i)) { + count++; + } + return count; + } void copy(const KMPAffinity::Mask *src) override { const Mask *convert = static_cast<const Mask *>(src); mask_size_type e = get_num_mask_types(); diff --git a/openmp/runtime/src/kmp_affinity.cpp b/openmp/runtime/src/kmp_affinity.cpp --- a/openmp/runtime/src/kmp_affinity.cpp +++ b/openmp/runtime/src/kmp_affinity.cpp @@ -4475,6 +4475,40 @@ KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex + 1); } +// return true if the affinity setting is eligible for the hard barrier, +// i.e. each place contains exactly one core and the places cover consecutive cores +bool __kmp_check_places_for_hard_barrier() { + int prev = -1; + + KA_TRACE(20, ("__kmp_check_places_for_hard_barrier: enter. num_masks: %d\n", + __kmp_affinity_num_masks)); + + for (int i = 0; i < __kmp_affinity_num_masks; i++) { + kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); + + if (mask->count() != 1) { + KA_TRACE(20, + ("__kmp_check_places_for_hard_barrier: place %d has %d cores\n", + i, mask->count())); + return false; + } + + int current = mask->begin(); + if (prev != -1 && current != (prev + 1)) { + KA_TRACE(20, ("__kmp_check_places_for_hard_barrier: place %d is on core " + "%d, not next to previous core %d\n", + i, current, prev)); + return false; + } + + prev = current; + } + + KA_TRACE( + 20, ("__kmp_check_places_for_hard_barrier: exit. ok for hard barrier\n")); + return true; +} + void __kmp_affinity_initialize(void) { // Much of the code above was written assuming that if a machine was not // affinity capable, then __kmp_affinity_type == affinity_none. 
We now diff --git a/openmp/runtime/src/kmp_barrier.h b/openmp/runtime/src/kmp_barrier.h --- a/openmp/runtime/src/kmp_barrier.h +++ b/openmp/runtime/src/kmp_barrier.h @@ -14,8 +14,12 @@ #define KMP_BARRIER_H #include "kmp.h" +#include "kmp_itt.h" #include "kmp_i18n.h" +#include <fcntl.h> +#include "kmp_safe_c_api.h" + #if KMP_HAVE_XMMINTRIN_H && KMP_HAVE__MM_MALLOC #include <xmmintrin.h> #define KMP_ALIGNED_ALLOCATE(size, alignment) _mm_malloc(size, alignment) @@ -138,4 +142,147 @@ void go_reset(); }; +class hardBarrier { +private: + /* group number */ + int *group; + /* barrier blade number */ + int *bb; + /* barrier window number on each core (indexed by tid) */ + int **barrier_window; + + int primary_core; + int get_window_index_from_tid(int tid) { return tid % CORES_PER_GROUP; } + +public: + static constexpr const char *SYSFS_ROOT = "/sys/class/misc/hard_barrier"; + static const int BUF_SIZE = 64; + // const values read from sysfs + static int NUM_GROUPS; + static int CORES_PER_GROUP; + + int num_groups; + int *threads_in_group; + bool is_hybrid; + void *team_icvs; + + static hardBarrier *allocate() { + if (NUM_GROUPS == 0) { + // This only happens when allocating the hot team in register_root(). + // At that time affinity has not been initialized and + // we cannot determine if the hard barrier can be used or not. + // So, return NULL here to defer allocation + return NULL; + } + + hardBarrier *h = (hardBarrier *)KMP_ALIGNED_ALLOCATE(sizeof(hardBarrier), + 4 * CACHE_LINE); + if (!h) { + KMP_FATAL(MemoryAllocFailed); + } + h->team_icvs = __kmp_allocate(sizeof(kmp_internal_control)); + h->threads_in_group = (int *)__kmp_allocate(sizeof(int) * NUM_GROUPS); + h->group = (int *)__kmp_allocate(sizeof(int) * NUM_GROUPS); + h->bb = (int *)__kmp_allocate(sizeof(int) * NUM_GROUPS); + h->barrier_window = (int **)__kmp_allocate(sizeof(int *) * NUM_GROUPS); + + h->is_hybrid = false; + h->num_groups = 0; + for (int i = 0; i < NUM_GROUPS; i++) { + h->group[i] = -1; + h->bb[i] = -1; + h->threads_in_group[i] = 0; + h->barrier_window[i] = + (int *)__kmp_allocate(sizeof(int) * hardBarrier::CORES_PER_GROUP); + for (int j = 0; j < hardBarrier::CORES_PER_GROUP; j++) { + h->barrier_window[i][j] = -1; + } + } + return h; + } + static void deallocate(hardBarrier *h) { + h->barrier_free(); + for (int i = 0; i < hardBarrier::NUM_GROUPS; i++) { + __kmp_free(h->barrier_window[i]); + } + __kmp_free(h->barrier_window); + __kmp_free(h->bb); + __kmp_free(h->group); + __kmp_free(h->threads_in_group); + __kmp_free(h->team_icvs); + KMP_ALIGNED_FREE(h); + } + + hardBarrier() = delete; + ~hardBarrier() = delete; + + int barrier_alloc(kmp_info_t *this_thr, int nthreads); + void barrier_free(); + bool is_barrier_allocated() { + if (num_groups > 0) { + return true; + } + return false; + }; + void get_window(kmp_info_t *this_thr, int tid); + void sync(kmp_info_t *this_thr, int final_spin, int gtid, + int tid USE_ITT_BUILD_ARG(void *itt_sync_obj)); + static void wakeup() { + // wake up threads that went to sleep via wfe + asm volatile("sevl" :::); + } + static bool system_supports_hard_barrier() { + int ret, fd; + char buf[BUF_SIZE]; + + KMP_SNPRINTF(buf, BUF_SIZE, "%s/group0/barrier0/masks", SYSFS_ROOT); + fd = open(buf, O_RDWR); + if (fd < 0) { + // module not loaded or permission denied + return false; + } + close(fd); + + // check available barriers + for (int group = 0;; group++) { + KMP_SNPRINTF(buf, BUF_SIZE, "%s/group%d/available_cpus", SYSFS_ROOT, + group); + fd = open(buf, O_RDONLY); + if (fd < 0) { + NUM_GROUPS = group; + break; + } + + ret = 
read(fd, buf, BUF_SIZE); + KMP_ASSERT(ret > 0); + buf[ret] = '\0'; + int start, end; + ret = KMP_SSCANF(buf, "%d-%d", &start, &end); + if (CORES_PER_GROUP == 0) { + CORES_PER_GROUP = end - start + 1; + } else { + // XXX: Assume all groups have the same number of cores for now + KMP_ASSERT(CORES_PER_GROUP == end - start + 1); + } + close(fd); + } + + KMP_ASSERT(hardBarrier::NUM_GROUPS > 0 && hardBarrier::CORES_PER_GROUP > 0); + + return true; + } + + // the functions below also work when the primary's core is not the + // first one of the group + bool is_group_leader(int tid) { + return tid == 0 || ((tid + primary_core) % CORES_PER_GROUP) == 0; + } + int get_group_from_tid(int tid) { + return (tid + (primary_core % CORES_PER_GROUP)) / CORES_PER_GROUP; + } + int get_tid_of_group_leader(int group) { + return (group * CORES_PER_GROUP) - (primary_core % CORES_PER_GROUP); + } +}; + #endif // KMP_BARRIER_H diff --git a/openmp/runtime/src/kmp_barrier.cpp b/openmp/runtime/src/kmp_barrier.cpp --- a/openmp/runtime/src/kmp_barrier.cpp +++ b/openmp/runtime/src/kmp_barrier.cpp @@ -1734,6 +1734,817 @@ gtid, team->t.t_id, tid, bt)); } +//*****************************************************// +// hardware barrier +int hardBarrier::NUM_GROUPS = 0; +int hardBarrier::CORES_PER_GROUP = 0; + +int hardBarrier::barrier_alloc(kmp_info_t *this_thr, int nthreads) { + // can be reallocated when team size is changed + if (is_barrier_allocated()) { + this->barrier_free(); + } + + int offset = this_thr->th.th_affin_mask->begin(); + KMP_DEBUG_ASSERT(this_thr->th.th_affin_mask->count() == 1); + + this->primary_core = offset; + if ((offset % CORES_PER_GROUP) + nthreads > CORES_PER_GROUP) { + this->is_hybrid = true; + } else { + this->is_hybrid = false; + } + + // we already checked that all threads are packed closely + // from the primary core in __kmp_can_use_hard_barrier + int group; + int first_group = (offset / CORES_PER_GROUP); + for (group = 0; group < NUM_GROUPS && nthreads > 0; group++) { + int nthreads_in_group; + if (group == 0) { + // handling for when the primary's core is not the first one in the group + nthreads_in_group = CORES_PER_GROUP - (offset % CORES_PER_GROUP); + if (nthreads_in_group > nthreads) + nthreads_in_group = nthreads; + } else { + nthreads_in_group = + (nthreads > CORES_PER_GROUP) ? CORES_PER_GROUP : nthreads; + } + + int _group = (group + first_group) % NUM_GROUPS; + + // search for a free bb + int bb; + for (bb = 0;; bb++) { + int ret, fd; + char buf[BUF_SIZE]; + ssize_t n; + + KMP_SNPRINTF(buf, BUF_SIZE, "%s/group%d/barrier%d/masks", SYSFS_ROOT, + _group, bb); + fd = open(buf, O_WRONLY); + if (fd < 0) { + // no more available barriers + this->num_groups = group; + barrier_free(); + return -1; + } + + n = KMP_SNPRINTF(buf, BUF_SIZE, "%d-%d", offset, + offset + nthreads_in_group - 1); + ret = write(fd, buf, n + 1); + close(fd); + if (ret > 0) { + // success + break; + } + } + + KA_TRACE(10, + ("hardBarrier::barrier_alloc. 
allocate bb: %d/%d\n", bb, _group)); + + this->group[group] = _group; + this->bb[group] = bb; + this->threads_in_group[group] = nthreads_in_group; + + // set next start position + offset += nthreads_in_group; + nthreads -= nthreads_in_group; + } + this->num_groups = group; + + return 0; +} + +void hardBarrier::barrier_free() { + if (!is_barrier_allocated()) { + return; + } + + for (int group = 0; group < this->num_groups && this->group[group] >= 0; + group++) { + char buf[BUF_SIZE]; + int ret, fd; + KMP_SNPRINTF(buf, BUF_SIZE, "%s/group%d/barrier%d/masks", SYSFS_ROOT, + this->group[group], this->bb[group]); + fd = open(buf, O_WRONLY); + // open error should not happen here + KMP_ASSERT(fd > 0); + + buf[0] = '\0'; + ret = write(fd, buf, 1); + // write error should not happen here + KMP_ASSERT(ret > 0); + close(fd); + + KA_TRACE(10, ("hardBarrier::barrier_free. free bb: %d/%d\n", + this->group[group], this->bb[group])); + + this->bb[group] = -1; + this->group[group] = -1; + this->threads_in_group[group] = 0; + for (int i = 0; i < CORES_PER_GROUP; i++) { + this->barrier_window[group][i] = -1; + } + } + this->num_groups = -1; +} + +void hardBarrier::get_window(kmp_info_t *this_thr, int tid) { + int group = get_group_from_tid(tid); + int id = get_window_index_from_tid(tid); + + int bb = this->bb[group]; + group = this->group[group]; + + if (this->barrier_window[group][id] != -1) { + // already assigned. restore window number + this_thr->th.th_hard_barrier_window = this->barrier_window[group][id]; + return; + } + + KMP_DEBUG_ASSERT(group >= 0 && bb >= 0); + + char buf[BUF_SIZE]; + int ret, fd; + + // get window number + KMP_SNPRINTF(buf, BUF_SIZE, "%s/group%d/barrier%d/user", SYSFS_ROOT, group, + bb); + fd = open(buf, O_RDONLY); + // open error should not happen here + KMP_ASSERT(fd > 0); + + ret = read(fd, buf, BUF_SIZE); + // read error should not happen here + KMP_ASSERT(ret > 0); + buf[ret] = '\0'; + + int window; + ret = KMP_SSCANF(buf, "%d", &window); + KMP_ASSERT(ret > 0); + + close(fd); + + KA_TRACE(10, ("hardBarrier::get_window. bb: %d/%d, tid: %d, window: %d\n", + group, bb, window)); + + /* + * The barrier window number to be used for this + * thread might change when nesting is used. 
Keep window number in team + */ + this->barrier_window[group][id] = window; + this_thr->th.th_hard_barrier_window = window; +} + +#define SYNC(reg) \ + asm volatile("mrs x1, " #reg "\n\t" \ + "mvn x1, x1\n\t" \ + "and x1, x1, #1\n\t" \ + "msr " #reg ", x1\n\t" \ + \ + "sevl\n\t" \ + \ + "1:mrs x2, " #reg "\n\t" \ + "and x2, x2, #1\n\t" \ + "cmp x1, x2\n\t" \ + \ + "beq 2f\n\t" \ + "b 1b\n\t" \ + \ + "2:\n\t" \ + : \ + : \ + : "x1", "x2", "cc", "memory") + +// same as above but wait event by wfe +#define SYNC_WFE(reg) \ + asm volatile("mrs x1, " #reg "\n\t" \ + "mvn x1, x1\n\t" \ + "and x1, x1, #1\n\t" \ + "msr " #reg ", x1\n\t" \ + \ + "sevl\n\t" \ + \ + "1:mrs x2, " #reg "\n\t" \ + "and x2, x2, #1\n\t" \ + "cmp x1, x2\n\t" \ + \ + "beq 2f\n\t" \ + "wfe\n\t" \ + "b 1b\n\t" \ + \ + "2:\n\t" \ + : \ + : \ + : "x1", "x2", "cc", "memory") + +// update (negate) own's BST bit +#define UPDATE_BST(reg, val) \ + asm volatile("mrs x1, " #reg "\n\t" \ + "mvn x1, x1\n\t" \ + "and x1, x1, #1\n\t" \ + "msr " #reg ", x1\n\t" \ + \ + "sevl\n\t" \ + "mov %x[val], x1\n\t" \ + : [val] "=r"(val) \ + : \ + : "x1", "cc", "memory") + +// check if LBSY bit has been changed +// (when all core has udpated BST, LBSY changes) +#define CHECK_LBSY(reg, val, completed) \ + asm volatile("mrs x2, " #reg "\n\t" \ + "and x2, x2, #1\n\t" \ + "cmp %x[val], x2\n\t" \ + "beq 1f\n\t" \ + \ + "mov %x[comp], #0\n\t" \ + "b 2f\n\t" \ + \ + "1:\n\t" \ + "mov %x[comp], #1\n\t" \ + \ + "2:\n\t" \ + : [comp] "=r"(completed) \ + : [val] "r"(val) \ + : "x2", "cc", "memory") + +// same as above but first wait event by wfe +#define CHECK_LBSY_WFE(reg, val, completed) \ + asm volatile("wfe\n\t" \ + "mrs x2, " #reg "\n\t" \ + "and x2, x2, #1\n\t" \ + "cmp %x[val], x2\n\t" \ + "beq 1f\n\t" \ + \ + "mov %x[comp], #0\n\t" \ + "b 2f\n\t" \ + \ + "1:\n\t" \ + "mov %x[comp], #1\n\t" \ + \ + "2:\n\t" \ + : [comp] "=r"(completed) \ + : [val] "r"(val) \ + : "x2", "cc", "memory") + +// task handling is based on __kmp_wait_template +void hardBarrier::sync(kmp_info_t *this_thr, int final_spin, int gtid, + int tid USE_ITT_BUILD_ARG(void *itt_sync_obj)) { + + int window = this_thr->th.th_hard_barrier_window; + +#if KMP_STATS_ENABLED + stats_state_e thread_state = KMP_GET_THREAD_STATE(); +#endif + +#if OMPT_SUPPORT + ompt_state_t ompt_entry_state; + ompt_data_t *tId; + if (ompt_enabled.enabled) { + ompt_entry_state = this_thr->th.ompt_thread_info.state; + if (!final_spin || ompt_entry_state != ompt_state_wait_barrier_implicit || + KMP_MASTER_TID(this_thr->th.th_info.ds.ds_tid)) { + ompt_lw_taskteam_t *team = NULL; + if (this_thr->th.th_team) + team = this_thr->th.th_team->t.ompt_serialized_team_info; + if (team) { + tId = &(team->ompt_task_info.task_data); + } else { + tId = OMPT_CUR_TASK_DATA(this_thr); + } + } else { + tId = &(this_thr->th.ompt_thread_info.task_data); + } + if (final_spin && this_thr->th.th_task_team == NULL) { + // implicit task is done. 
Either no taskqueue, or task-team finished + __ompt_implicit_task_end(this_thr, ompt_entry_state, tId); + } + } +#endif + + if (__kmp_tasking_mode == tskm_immediate_exec) { + // shortcut route when there is notask + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { + switch (window) { + case 0: + SYNC_WFE(s3_3_c15_c15_0); + break; + case 1: + SYNC_WFE(s3_3_c15_c15_1); + break; + case 2: + SYNC_WFE(s3_3_c15_c15_2); + break; + case 3: + SYNC_WFE(s3_3_c15_c15_3); + break; + default: + // should not happen + KMP_ASSERT(0); + break; + } + } else { + switch (window) { + case 0: + SYNC(s3_3_c15_c15_0); + break; + case 1: + SYNC(s3_3_c15_c15_1); + break; + case 2: + SYNC(s3_3_c15_c15_2); + break; + case 3: + SYNC(s3_3_c15_c15_3); + break; + default: + // should not happen + KMP_ASSERT(0); + break; + } + } + } else { + uint64_t val = 0; + // updagte BST bit + switch (window) { + case 0: + UPDATE_BST(s3_3_c15_c15_0, val); + break; + case 1: + UPDATE_BST(s3_3_c15_c15_1, val); + break; + case 2: + UPDATE_BST(s3_3_c15_c15_2, val); + break; + case 3: + UPDATE_BST(s3_3_c15_c15_3, val); + break; + default: + // should not happen + KMP_ASSERT(0); + break; + } + KA_TRACE(50, ("hardBarrier::sync_with_task: T#%d(%d:%d) new bst: %d, " + "final_spin: %d\n", + gtid, this_thr->th.th_team->t.t_id, tid, val, final_spin)); + + kmp_task_team_t *task_team = this_thr->th.th_task_team; + int tasks_completed = FALSE; + uint64_t completed = 0; + do { + // execute tasks + if (task_team != NULL) { + if (TCR_SYNC_4(task_team->tt.tt_active)) { + if (KMP_TASKING_ENABLED(task_team)) { + if (final_spin) { + do { + __kmp_atomic_execute_tasks_64( + this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, final_spin, + &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0); + } while (!tasks_completed); + } else { + __kmp_atomic_execute_tasks_64( + this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, final_spin, + &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0); + } + } else + this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; + } else { + KMP_DEBUG_ASSERT(!KMP_MASTER_TID(this_thr->th.th_info.ds.ds_tid)); +#if OMPT_SUPPORT + // task-team is done now, other cases should be catched above + if (final_spin && ompt_enabled.enabled) + __ompt_implicit_task_end(this_thr, ompt_entry_state, tId); +#endif + this_thr->th.th_task_team = NULL; + this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; + task_team = NULL; + } + } else { + this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; + } + + if (TCR_4(__kmp_global.g.g_done)) { + if (__kmp_global.g.g_abort) + __kmp_abort_thread(); + } + +#if KMP_STATS_ENABLED + // Check if thread has been signalled to idle state + // This indicates that the logical "join-barrier" has finished + if (this_thr->th.th_stats->isIdle() && + KMP_GET_THREAD_STATE() == FORK_JOIN_BARRIER) { + KMP_SET_THREAD_STATE(IDLE); + KMP_PUSH_PARTITIONED_TIMER(OMP_idle); + } +#endif + + if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME || + ((task_team != NULL) && TCR_4(task_team->tt.tt_found_tasks))) { + // if user requests busy loop or remaining tasks exist, + // do not sleep by wfe + switch (window) { + case 0: + CHECK_LBSY(s3_3_c15_c15_0, val, completed); + break; + case 1: + CHECK_LBSY(s3_3_c15_c15_1, val, completed); + break; + case 2: + CHECK_LBSY(s3_3_c15_c15_2, val, completed); + break; + case 3: + CHECK_LBSY(s3_3_c15_c15_3, val, completed); + break; + } + } else { + switch (window) { + case 0: + CHECK_LBSY_WFE(s3_3_c15_c15_0, val, completed); + break; + case 1: + CHECK_LBSY_WFE(s3_3_c15_c15_1, val, completed); + break; + case 2: + 
CHECK_LBSY_WFE(s3_3_c15_c15_2, val, completed); + break; + case 3: + CHECK_LBSY_WFE(s3_3_c15_c15_3, val, completed); + break; + } + } + } while (!completed); + } + +#if OMPT_SUPPORT + ompt_state_t ompt_exit_state = this_thr->th.ompt_thread_info.state; + if (ompt_enabled.enabled && ompt_exit_state != ompt_state_undefined) { +#if OMPT_OPTIONAL + if (final_spin) { + __ompt_implicit_task_end(this_thr, ompt_exit_state, tId); + ompt_exit_state = this_thr->th.ompt_thread_info.state; + } +#endif + if (ompt_exit_state == ompt_state_idle) { + this_thr->th.ompt_thread_info.state = ompt_state_overhead; + } + } +#endif +#if KMP_STATS_ENABLED + // If we were put into idle state, pop that off the state stack + if (KMP_GET_THREAD_STATE() == IDLE) { + KMP_POP_PARTITIONED_TIMER(); + KMP_SET_THREAD_STATE(thread_state); + this_thr->th.th_stats->resetIdleFlag(); + } +#endif +} + +static void __kmp_hard_barrier_gather( + enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hard_gather); + kmp_team_t *team = this_thr->th.th_team; + kmp_info_t **other_threads = team->t.t_threads; + + KA_TRACE(20, ("__kmp_hard_barrier_gather: T#%d(%d:%d) enter for " + "barrier type %d\n", + gtid, team->t.t_id, tid, bt)); + KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); + + // Optimized route for intra-group only barrier in plain barrier. + // Release barrier (one synchronization) is enough in that case + // if no tasking. When tasking is enabled, workers still need to wait + // the primary to setup task team in gather barrier + if (bt == bs_plain_barrier && __kmp_tasking_mode == tskm_immediate_exec && + team->t.h && team->t.h->is_barrier_allocated() && !team->t.h->is_hybrid) { + KMP_DEBUG_ASSERT(reduce == NULL); + KA_TRACE(20, ("__kmp_hard_barrier_gather: T#%d(%d:%d) skip barrier in" + "no tasking mode\n", + gtid, team->t.t_id, tid)); + return; + } + + if (!team->t.h || !team->t.h->is_barrier_allocated()) { + // fallback to soft barrier + KA_TRACE(20, ("__kmp_hard_barrier_gather: T#%d(%d:%d) fall back to hyper\n", + gtid, team->t.t_id, tid)); + return __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, + reduce USE_ITT_BUILD_ARG(itt_sync_obj)); + } + + if (team->t.h->is_hybrid) { + // sync within group first + team->t.h->sync(this_thr, FALSE, gtid, tid USE_ITT_BUILD_ARG(itt_sync_obj)); + + // then, group leader sync using soft barrier + // (simple linear barrier scheme for now) + if (team->t.h->is_group_leader(tid)) { + if (KMP_MASTER_TID(tid)) { + // reduce in my group + if (reduce) { + OMPT_REDUCTION_DECL(this_thr, gtid); + OMPT_REDUCTION_BEGIN; + // reduce in my group + int group = team->t.h->get_group_from_tid(tid); + for (int i = 1; i < team->t.h->threads_in_group[group]; i++) { + int child_tid = tid + i; + kmp_info_t *child_thr = other_threads[child_tid]; + (*reduce)(this_thr->th.th_local.reduce_data, + child_thr->th.th_local.reduce_data); + } + OMPT_REDUCTION_END; + } + + // wait and reduce other group leader + kmp_balign_team_t *team_bar = &team->t.t_bar[bt]; + kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP; + for (int i = 1; i < team->t.h->num_groups; i++) { + int child_tid = team->t.h->get_tid_of_group_leader(i); + kmp_flag_64<> flag( + &other_threads[child_tid]->th.th_bar[bt].bb.b_arrived, new_state); + flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); + if (reduce) { + OMPT_REDUCTION_DECL(this_thr, gtid); + OMPT_REDUCTION_BEGIN; + 
(*reduce)(this_thr->th.th_local.reduce_data, + other_threads[child_tid]->th.th_local.reduce_data); + OMPT_REDUCTION_END; + } + } + team_bar->b_arrived = new_state; + } else { + if (reduce) { + OMPT_REDUCTION_DECL(this_thr, gtid); + OMPT_REDUCTION_BEGIN; + // reduce in my group + int group = team->t.h->get_group_from_tid(tid); + for (int i = 1; i < team->t.h->threads_in_group[group]; i++) { + int child_tid = tid + i; + kmp_info_t *child_thr = other_threads[child_tid]; + (*reduce)(this_thr->th.th_local.reduce_data, + child_thr->th.th_local.reduce_data); + } + OMPT_REDUCTION_END; + } + // mark arrival to primary thread + kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; + kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]); + flag.release(); + } + } + + KA_TRACE(20, + ("__kmp_hard_barrier_gather: T#%d(%d:%d) exit hybrid barrier for " + "barrier type %d\n", + gtid, team->t.t_id, tid, bt)); + return; + } + + // non-hybrid barrier case. just sync within group + team->t.h->sync(this_thr, FALSE, gtid, tid USE_ITT_BUILD_ARG(itt_sync_obj)); + + if (KMP_MASTER_TID(tid)) { + // primary performes all reduce operation + if (reduce) { + OMPT_REDUCTION_DECL(this_thr, gtid); + OMPT_REDUCTION_BEGIN; + for (int child_tid = 1; child_tid < team->t.t_nproc; child_tid++) { + kmp_info_t *child_thr = other_threads[child_tid]; + (*reduce)(this_thr->th.th_local.reduce_data, + child_thr->th.th_local.reduce_data); + } + OMPT_REDUCTION_END; + } + } + + KA_TRACE(20, ("__kmp_hard_barrier_gather: T#%d(%d:%d) exit for " + "barrier type %d\n", + gtid, team->t.t_id, tid, bt)); +} + +static void __kmp_hard_barrier_release( + enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hard_release); + kmp_bstate_t *thr_bar; + kmp_team_t *team; + + KA_TRACE(20, ("__kmp_hard_barrier_release: T#%d(%d:%d)" + "enter for barrier type %d\n", + gtid, NULL, tid, bt)); + thr_bar = &this_thr->th.th_bar[bt].bb; + + // If this is not a fork barrier, everything is already set up. + // Provide the optimized route for intra-group only barrier which performs + // best + if (bt != bs_forkjoin_barrier) { + team = this_thr->th.th_team; + if (team->t.h && team->t.h->is_barrier_allocated() && + !team->t.h->is_hybrid) { + team->t.h->sync(this_thr, TRUE, gtid, + tid USE_ITT_BUILD_ARG(itt_sync_obj)); + return; + } + } + + if (!KMP_MASTER_TID(tid)) { + // workers and non-master group leaders need to check their presence in team + do { + KA_TRACE( + 20, + ("__kmp_hard_barrier_release: T#%d(%d:%d) worker. wait for team is " + "set up. th_used_in_team: %d\n", + gtid, NULL, tid, this_thr->th.th_used_in_team.load())); + // if this thread is not in team yet + if (this_thr->th.th_used_in_team.load() != 1 && + this_thr->th.th_used_in_team.load() != 3 && + this_thr->th.th_used_in_team.load() != 4) { + + // wait th_used_in_team will become 3 (by master in __kmp_add_threads) + kmp_flag_32 my_flag(&(this_thr->th.th_used_in_team), 3); + if (KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 2, + 0) || + this_thr->th.th_used_in_team.load() == 0) { + my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj)); + } + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + } + + if (this_thr->th.th_used_in_team.load() != 1 && + this_thr->th.th_used_in_team.load() != 3 && + this_thr->th.th_used_in_team.load() != 4) // spurious wake-up? 
+ continue; + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + + tid = __kmp_tid_from_gtid(gtid); + team = this_thr->th.th_team; + KMP_DEBUG_ASSERT(tid >= 0); + KMP_DEBUG_ASSERT(team); + KA_TRACE(20, ("__kmp_hard_barrier_release: T#%d(%d:%d) worker. woke up\n", + gtid, team, tid)); + + if (this_thr->th.th_used_in_team.load() == 3) { + KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 3, 1); + } + // if th_used_in_team is 4, then the team this thread currently belongs + // to will change the number of threads. We need to perform + // synchronization before that as hard barrier cannot wakeup a part of + // thread + + // if hard barrier cannot be used, fallback to softbarrier + if (!team->t.h || !team->t.h->is_barrier_allocated()) { + KA_TRACE(20, ("__kmp_hard_barrier_release: T#%d(%d:%d) worker. fall " + "back to hyper\n", + gtid, team, tid)); + __kmp_hyper_barrier_release( + bt, this_thr, gtid, tid, + propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj)); + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) { + // master is waiting th_used_in_team to become 0 in kmp_reap_thread + KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 4, 0); + return; + } + + // this thread will no longer belong to team + if (this_thr->th.th_used_in_team.load() == 4) { + KA_TRACE(20, ("__kmp_hard_barrier_release: T#%d(%d:%d) worker. this " + "thread becomes unused\n", + gtid, team, tid)); + KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 4, 2); + continue; + } + + return; + } + + // if this is a fork barrier, allocate a barrier window for this thread + if (bt == bs_forkjoin_barrier) { + // setup affinity early since hard barrier requires it + if (this_thr->th.th_new_place != this_thr->th.th_current_place) { + __kmp_affinity_set_place(gtid); + // we should call __kmp_aux_display_affinity here but it will be + // resulted in displaying the affinity information twice (here and + // in __kmp_fork_barrier after this function returns). skip here. + } + team->t.h->get_window(this_thr, tid); + } + + if (team->t.h->is_hybrid && team->t.h->is_group_leader(tid)) { + // group leader waits primary to wake me up + kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); + flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); + KMP_MB(); + + // intra group sync + team->t.h->sync(this_thr, FALSE, gtid, + tid USE_ITT_BUILD_ARG(itt_sync_obj)); + } else { + // intra group sync + team->t.h->sync(this_thr, TRUE, gtid, + tid USE_ITT_BUILD_ARG(itt_sync_obj)); + } + + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + + // check if this thread still belongs to the team + if (this_thr->th.th_used_in_team.load() == 1) { +#if KMP_BARRIER_ICV_PUSH + if (propagate_icvs) { + __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], + team, tid, FALSE); + // get icvs stored in team's struct + copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, + (kmp_internal_control_t *)team->t.h->team_icvs); + copy_icvs(&thr_bar->th_fixed_icvs, + &team->t.t_implicit_task_taskdata[tid].td_icvs); + } +#endif // KMP_BARRIER_ICV_PUSH + // success, exit from barrier + break; + } + + if (this_thr->th.th_used_in_team.load() == 4) { + KA_TRACE(20, ("__kmp_hard_barrier_release: T#%d(%d:%d) worker. 
this " + "thread becomes unused\n", + gtid, team, tid)); + KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 4, 2); + } + // next team may use soft barrier, reset status + for (int bs = 0; bs < bs_last_barrier; bs++) { + kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bs].bb; + thr_bar->b_arrived = KMP_INIT_BARRIER_STATE; + } + // thread now becomes unused, retry from start + } while (1); + } else { + // primary + team = this_thr->th.th_team; + tid = __kmp_tid_from_gtid(gtid); + KA_TRACE(20, ("__kmp_hard_barrier_release: T#%d(%d:%d) primary.\n", gtid, + team, tid)); + + // fallback to softbarrier; + if (!team->t.h || !team->t.h->is_barrier_allocated()) { + KA_TRACE(20, ("__kmp_hard_barrier_release: T#%d(%d:%d) primary. fallback " + "to hyper\n", + gtid, team, tid)); + return __kmp_hyper_barrier_release( + bt, this_thr, gtid, tid, + propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj)); + } + + if (bt == bs_forkjoin_barrier) { + if (this_thr->th.th_new_place != this_thr->th.th_current_place) { + __kmp_affinity_set_place(gtid); + } + team->t.h->get_window(this_thr, tid); + } + +#if KMP_BARRIER_ICV_PUSH + if (propagate_icvs) { + __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, + tid, FALSE); + copy_icvs(&thr_bar->th_fixed_icvs, + &team->t.t_implicit_task_taskdata[tid].td_icvs); + } +#endif + + if (team->t.h->is_hybrid) { + // wakeup group leader + kmp_info_t **other_threads = team->t.t_threads; + for (int i = 1; i < team->t.h->num_groups; i++) { + int child_tid = team->t.h->get_tid_of_group_leader(i); + kmp_flag_64<> flag(&other_threads[child_tid]->th.th_bar[bt].bb.b_go, + other_threads[child_tid]); + flag.release(); + } + } + + // intra group sync + team->t.h->sync(this_thr, TRUE, gtid, tid USE_ITT_BUILD_ARG(itt_sync_obj)); + } + + KA_TRACE(20, ("__kmp_hard_barrier_release: T#%d(%d:%d) exit for " + "barrier type %d\n", + gtid, team->t.t_id, tid, bt)); +} + +void __kmp_hard_barrier_wakeup_soft(kmp_info_t *master) { + __kmp_hyper_barrier_release(bs_forkjoin_barrier, master, + __kmp_gtid_from_thread(master), 0, + true USE_ITT_BUILD_ARG(NULL)); +} +//*****************************************************// // End of Barrier Algorithms // type traits for cancellable value @@ -1871,6 +2682,11 @@ reduce USE_ITT_BUILD_ARG(itt_sync_obj)); break; } + case bp_hard_bar: { + __kmp_hard_barrier_gather(bt, this_thr, gtid, tid, + reduce USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } case bp_hyper_bar: { // don't set branch bits to 0; use linear KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); @@ -1990,6 +2806,11 @@ FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); break; } + case bp_hard_bar: { + __kmp_hard_barrier_release(bt, this_thr, gtid, tid, + FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } case bp_hyper_bar: { KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, @@ -2143,6 +2964,11 @@ FALSE USE_ITT_BUILD_ARG(NULL)); break; } + case bp_hard_bar: { + __kmp_hard_barrier_release(bt, this_thr, gtid, tid, + FALSE USE_ITT_BUILD_ARG(NULL)); + break; + } default: { __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(NULL)); @@ -2276,6 +3102,11 @@ NULL USE_ITT_BUILD_ARG(itt_sync_obj)); break; } + case bp_hard_bar: { + __kmp_hard_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, + NULL USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } case bp_hyper_bar: { KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]); __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, @@ -2469,6 +3300,11 @@ 
TRUE USE_ITT_BUILD_ARG(NULL)); break; } + case bp_hard_bar: { + __kmp_hard_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, + TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } case bp_hyper_bar: { KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]); __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp --- a/openmp/runtime/src/kmp_global.cpp +++ b/openmp/runtime/src/kmp_global.cpp @@ -111,7 +111,7 @@ #endif // KMP_FAST_REDUCTION_BARRIER }; char const *__kmp_barrier_pattern_name[bp_last_bar] = { - "linear", "tree", "hyper", "hierarchical", "dist"}; + "linear", "tree", "hyper", "hierarchical", "dist", "hard"}; int __kmp_allThreadsSpecified = 0; size_t __kmp_align_alloc = CACHE_LINE; diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp --- a/openmp/runtime/src/kmp_runtime.cpp +++ b/openmp/runtime/src/kmp_runtime.cpp @@ -13,6 +13,7 @@ #include "kmp.h" #include "kmp_affinity.h" #include "kmp_atomic.h" +#include "kmp_barrier.h" #include "kmp_environment.h" #include "kmp_error.h" #include "kmp_i18n.h" @@ -111,6 +112,10 @@ int new_nthreads); void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads); +static void __kmp_resize_hard_barrier(kmp_team_t *team, kmp_info_t *master, + int new_nproc, int new_proc_bind, + bool force = false); + /* Calculate the identifier of the current thread */ /* fast (and somewhat portable) way to get unique identifier of executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */ @@ -1011,6 +1016,15 @@ __kmp_partition_places(team); } #endif + if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_hard_bar) { + // affinity is not setup for teams construct therefore we cannot use hard + // barrier for teams construct + if (!fork_teams_workers && team->t.t_nproc > 1) { + __kmp_resize_hard_barrier(team, master_th, team->t.t_nproc, + team->t.t_proc_bind); + } + __kmp_add_threads_to_team(team, team->t.t_nproc); + } } if (__kmp_display_affinity && team->t.t_display_affinity != 1) { @@ -1350,6 +1364,45 @@ #endif } +// return true if hard barrier can be used for this team +static bool __kmp_can_use_hard_barrier(kmp_team_t *team, kmp_info_t *master, + int nthreads, int proc_bind) { + // TODO: add support of KMP_AFFINITY + if (proc_bind != proc_bind_close) { + KA_TRACE(20, ("__kmp_can_use_hard_barrier: proc bind is not close: %d/%d\n", + proc_bind, __kmp_affinity_type)); + return false; + } + + if (master->th.th_affin_mask->count() != 1) { + KA_TRACE( + 20, + ("__kmp_can_use_hard_barrier: master affinity is not appropriate: %d\n", + master->th.th_affin_mask->count())); + return false; + } + + int primary_core = master->th.th_affin_mask->begin(); + if (((primary_core % hardBarrier::CORES_PER_GROUP) + nthreads) > + hardBarrier::CORES_PER_GROUP * hardBarrier::NUM_GROUPS) { + KA_TRACE(20, ("__kmp_can_use_hard_barrier: too many workers: %d/%d/%d\n", + primary_core, nthreads, team->t.t_nproc)); + return false; + } + + if ((primary_core + nthreads) % hardBarrier::CORES_PER_GROUP == 1) { + KA_TRACE(20, ("__kmp_can_use_hard_barrier: each group needs at least 1 " + "threads: %d/%d/%d\n", + primary_core, nthreads, team->t.t_nproc)); + return false; + } + + KA_TRACE(20, ("__kmp_can_use_hard_barrier: hard barrier can be used for this " + "team:%p\n", + team)); + return true; +} + /* most of the work for a fork */ /* return true if we really went parallel, false if serialized */ int __kmp_fork_call(ident_t 
*loc, int gtid, @@ -1658,6 +1711,14 @@ __kmp_partition_places(parent_team); #endif + if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_hard_bar) { + // force resize since this team may be able to use hard barrier + __kmp_resize_hard_barrier(parent_team, parent_team->t.t_threads[0], + parent_team->t.t_nproc, + parent_team->t.t_proc_bind, true); + __kmp_add_threads_to_team(parent_team, parent_team->t.t_nproc); + } + KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, " "master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid)); @@ -2051,6 +2112,8 @@ argc USE_NESTED_HOT_ARG(master_th)); if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs); + if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_hard_bar) + copy_icvs((kmp_internal_control_t *)team->t.h->team_icvs, &new_icvs); } else { /* allocate a new parallel team */ KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); @@ -2064,6 +2127,10 @@ if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &master_th->th.th_current_task->td_icvs); + if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_hard_bar) { + copy_icvs((kmp_internal_control_t *)team->t.h->team_icvs, + &master_th->th.th_current_task->td_icvs); + } } KF_TRACE( 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team)); @@ -2400,6 +2467,12 @@ } #endif + if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_hard_bar && + parent_team->t.h && parent_team->t.h->is_barrier_allocated()) { + // reset barrier window status for parent team + parent_team->t.h->get_window(master_th, 0); + } + return; } @@ -2435,6 +2508,10 @@ __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { team->t.b->update_num_threads(team->t.t_nproc); __kmp_add_threads_to_team(team, team->t.t_nproc); + } else if (team->t.t_nproc > 1 && + __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == + bp_hard_bar) { + __kmp_add_threads_to_team(team, team->t.t_nproc); } } @@ -2602,6 +2679,10 @@ master_th->th.th_team_nproc = parent_team->t.t_nproc; master_th->th.th_team_master = parent_team->t.t_threads[0]; master_th->th.th_team_serialized = parent_team->t.t_serialized; + if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_hard_bar && + parent_team->t.h && parent_team->t.h->is_barrier_allocated()) { + parent_team->t.h->get_window(master_th, 0); + } /* restore serialized team, if need be */ if (parent_team->t.t_serialized && @@ -2729,6 +2810,10 @@ if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth); } + if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_hard_bar) { + __kmp_resize_hard_barrier(hot_team, hot_team->t.t_threads[0], new_nth, + hot_team->t.t_proc_bind); + } // Release the extra threads we don't need any more. 
for (f = new_nth; f < hot_team->t.t_nproc; f++) { KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); @@ -2751,6 +2836,9 @@ if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { hot_team->t.b->update_num_threads(new_nth); __kmp_add_threads_to_team(hot_team, new_nth); + } else if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == + bp_hard_bar) { + __kmp_add_threads_to_team(hot_team, new_nth); } __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); @@ -4369,7 +4457,10 @@ new_thr->th.th_task_state_top = 0; new_thr->th.th_task_state_stack_sz = 4; - if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar || + (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_hard_bar && + new_thr->th.th_team->t.h && + new_thr->th.th_team->t.h->is_barrier_allocated())) { // Make sure pool thread has transitioned to waiting on own thread struct KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0); // Thread activated in __kmp_allocate_team when increasing team size @@ -5058,6 +5149,117 @@ #endif // KMP_AFFINITY_SUPPORTED +// check if we can use hard barrier in this team and setup bb. +// otherwise just fallback to use soft barrier +static void __kmp_resize_hard_barrier(kmp_team_t *team, kmp_info_t *master, + int new_nproc, int new_proc_bind, + bool force) { + + int old_nproc = team->t.t_nproc; + + KA_TRACE(10, ("__kmp_resize_hard_barrier: enter T#%d, nthreads: %d -> %d, " + "proc_bind %d -> %d\n", + team->t.t_id, team->t.t_nproc, new_nproc, team->t.t_proc_bind, + new_proc_bind)); + + // below logic is mostly the same as distribution barrier + if (force || (old_nproc > 1 && old_nproc != new_nproc)) { + if (team->t.h->is_barrier_allocated()) { + // set all workers to transition state + for (int f = 1; f < old_nproc; ++f) { + KMP_DEBUG_ASSERT(team->t.t_threads[f] != NULL); + // Ignore threads that are already inactive or not present in the team + if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) { + // teams construct causes thread_limit to get passed in, and some of + // those could be inactive; just ignore them + continue; + } + // If thread is transitioning still to in_use state, wait for it + if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) { + while (team->t.t_threads[f]->th.th_used_in_team.load() == 3) + KMP_CPU_PAUSE(); + } + // The thread should be in_use now + KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1); + // Transition to unused state (but sync before leaving) + team->t.t_threads[f]->th.th_used_in_team.store(4); + } + + // wakeup old thread waiting in release barrier + KA_TRACE(20, + ("__kmp_resize_hard_barrier: T#%d, wake up old hard barrier\n", + team->t.t_id)); + if (team->t.h->is_hybrid) { + for (int i = 1; i < team->t.h->num_groups; i++) { + int child_tid = team->t.h->get_tid_of_group_leader(i); + kmp_flag_64<> flag(&team->t.t_threads[child_tid] + ->th.th_bar[bs_forkjoin_barrier] + .bb.b_go, + team->t.t_threads[child_tid]); + flag.release(); + } + } + team->t.h->sync(master, FALSE, 0, 0 USE_ITT_BUILD_ARG(NULL)); + + // next team may use soft barrier, reset status + for (int bs = 0; bs < bs_last_barrier; bs++) { + kmp_balign_team_t *team_bar = &team->t.t_bar[bs]; + team_bar->b_arrived = KMP_INIT_BARRIER_STATE; + } + } else { + // thread is sleeping in softbarrier + for (int f = 1; f < old_nproc; ++f) { + team->t.t_threads[f]->th.th_used_in_team.store(4); + } + KA_TRACE(20, + ("__kmp_resize_hard_barrier: T#%d, wake up old soft barrier\n", + 
team->t.t_id)); + + // XXX: it seems master->th.th_team points to the task team at this point. + // Is there a better way to handle this? + int temp = master->th.th_team_nproc; + kmp_team_t *temp_team = master->th.th_team; + master->th.th_team_nproc = team->t.t_nproc; + master->th.th_team = team; + __kmp_hard_barrier_wakeup_soft(master); + master->th.th_team_nproc = temp; + master->th.th_team = temp_team; + } + + // wait for threads to be removed from team + for (int f = 1; f < old_nproc; ++f) { + while (team->t.t_threads[f]->th.th_used_in_team.load() != 0) + KMP_CPU_PAUSE(); + } + } + + // resize main logic + if (team->t.h->is_barrier_allocated() && team->t.t_nproc == new_nproc && + team->t.t_proc_bind == new_proc_bind) { + KA_TRACE(10, ("__kmp_resize_hard_barrier: T#%d, hard barrier is already set " + "up for team\n", + team->t.t_id)); + } else { + if (__kmp_can_use_hard_barrier(team, master, new_nproc, new_proc_bind) && + team->t.h->barrier_alloc(master, new_nproc) == 0) { + KA_TRACE(10, ("__kmp_resize_hard_barrier: T#%d, allocate hard barrier. " + "nthreads: %d, " + "proc_bind: %d\n", + team->t.t_id, new_nproc, new_proc_bind)); + } else { + // in some cases this is expected (e.g. the outer team of nested parallelism + // when OMP_PROC_BIND=spread,close is used) + KMP_INFORM(HardBarrierCannotBeUsed); + team->t.h->barrier_free(); + KA_TRACE( + 10, + ("__kmp_resize_hard_barrier: T#%d, cannot use hard barrier. use soft " + "barrier. nthreads: %d, proc_bind: %d\n", + team->t.t_id, new_nproc, new_proc_bind)); + } + } +} + /* allocate a new team data structure to use. take one off of the free pool if available */ kmp_team_t * @@ -5138,6 +5340,19 @@ __kmp_resize_dist_barrier(team, old_nthr, new_nproc); } + if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_hard_bar) { + if (!team->t.h) { + // See comments in hardBarrier::allocate() for why this is needed + team->t.h = hardBarrier::allocate(); + } + // if do_place_partition == 0 then this team cannot use the hard barrier + // since affinity will not be set up for this team. 
Skip hard barrier + // initialization and use software barrier + if (do_place_partition) { + __kmp_resize_hard_barrier(team, master, new_nproc, new_proc_bind); + } + } + // If not doing the place partition, then reset the team's proc bind // to indicate that partitioning of all threads still needs to take place if (do_place_partition == 0) @@ -5267,6 +5482,12 @@ __kmp_partition_places(team); #endif } + + if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_hard_bar) { + // call __kmp_add_thread_to_team after __kmp_partition_places + // since hard barrier needs affinity + __kmp_add_threads_to_team(team, new_nproc); + } } else { // team->t.t_nproc < new_nproc #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED kmp_affin_mask_t *old_mask; @@ -5408,6 +5629,12 @@ __kmp_partition_places(team); #endif } + + if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_hard_bar) { + // call __kmp_add_thread_to_team after __kmp_partition_places + // since hard barrier needs affinity + __kmp_add_threads_to_team(team, new_nproc); + } } // Check changes in number of threads kmp_info_t *master = team->t.t_threads[0]; @@ -5480,6 +5707,13 @@ } } + if (max_nproc > 1 && + __kmp_barrier_gather_pattern[bs_plain_barrier] == bp_hard_bar) { + if (!team->t.h) { + team->t.h = hardBarrier::allocate(); + } + } + /* setup the team for fresh use */ __kmp_initialize_team(team, new_nproc, new_icvs, NULL); @@ -5541,6 +5775,12 @@ team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub); } + // if max_nproc == 1 then there is no need to use barrier at all + if (max_nproc > 1 && + __kmp_barrier_gather_pattern[bs_plain_barrier] == bp_hard_bar) { + team->t.h = hardBarrier::allocate(); + } + /* NOTE well, for some reason allocating one big buffer and dividing it up seems to really hurt performance a lot on the P4, so, let's not use this */ __kmp_allocate_team_arrays(team, max_nproc); @@ -5702,7 +5942,10 @@ KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 1, 2); } - __kmp_free_thread(team->t.t_threads[f]); + if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_hard_bar) { + KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), + 1, 4); + } } if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { @@ -5726,7 +5969,52 @@ } } + if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_hard_bar) { + if (team->t.h && team->t.h->is_barrier_allocated()) { + // wakeup sleeping threads + if (team->t.h->is_hybrid) { + for (int i = 1; i < team->t.h->num_groups; i++) { + int child_tid = team->t.h->get_tid_of_group_leader(i); + kmp_flag_64<> flag(&team->t.t_threads[child_tid] + ->th.th_bar[bs_forkjoin_barrier] + .bb.b_go, + team->t.t_threads[child_tid]); + flag.release(); + } + } +#if KMP_NESTED_HOT_TEAMS + if (team->t.t_threads[0]->th.th_hot_teams) { + // If nested hot teams is used, __kmp_free_hot_teams will call + // this function in for loop during final cleanup and therefore + // current affinity may not match the primary thread's affinity + // for this team. 
Restore the affinity to perform below sync + int gtid = team->t.t_threads[0]->th.th_info.ds.ds_gtid; + __kmp_affinity_set_place(gtid); + } +#endif + team->t.h->sync(team->t.t_threads[0], FALSE, 0, + 0 USE_ITT_BUILD_ARG(NULL)); + } else { + // same comments as in __kmp_can_use_hard_barrier + int temp_nproc = team->t.t_threads[0]->th.th_team_nproc; + kmp_team_t *temp_team = team->t.t_threads[0]->th.th_team; + + team->t.t_threads[0]->th.th_team_nproc = team->t.t_nproc; + team->t.t_threads[0]->th.th_team = team; + __kmp_hard_barrier_wakeup_soft(team->t.t_threads[0]); + team->t.t_threads[0]->th.th_team_nproc = temp_nproc; + team->t.t_threads[0]->th.th_team = temp_team; + } + + // Wait for threads to be removed from team + for (int f = 1; f < team->t.t_nproc; ++f) { + while (team->t.t_threads[f]->th.th_used_in_team.load() != 0) + KMP_CPU_PAUSE(); + } + } + for (f = 1; f < team->t.t_nproc; ++f) { + __kmp_free_thread(team->t.t_threads[f]); team->t.t_threads[f] = NULL; } @@ -5735,6 +6023,13 @@ distributedBarrier::deallocate(team->t.b); team->t.b = NULL; } + if (team->t.t_max_nproc > 1 && + __kmp_barrier_gather_pattern[bs_plain_barrier] == bp_hard_bar) { + if (team->t.h) { + hardBarrier::deallocate(team->t.h); + team->t.h = NULL; + } + } /* put the team back in the team pool */ /* TODO limit size of team pool, call reap_team if pool too large */ team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool); @@ -6133,7 +6428,8 @@ KA_TRACE( 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", gtid)); - if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar || + __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_hard_bar) { while ( !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3)) KMP_CPU_PAUSE(); @@ -7261,6 +7557,32 @@ #endif /* KMP_AFFINITY_SUPPORTED */ + // If we cannot use hardware barrier at all, fallback to hyper barrier here + // As hardware barrier has some restriction on place setting and it will not + // change during runtime, check the condition after + // __kmp_affinity_initialize() + if (__kmp_barrier_release_pattern[bs_plain_barrier] == bp_hard_bar) { + bool fallback = false; + + if (!hardBarrier::system_supports_hard_barrier()) { + KMP_WARNING(HardBarrierNotSupported, + "System does not support hardware barrier"); + fallback = true; + } + if (!__kmp_check_places_for_hard_barrier()) { + KMP_WARNING(HardBarrierNotSupported, + "Place setting is not appropraite for hardware barrier"); + fallback = true; + } + + if (fallback) { + for (int i = bs_plain_barrier; i < bs_last_barrier; i++) { + __kmp_barrier_release_pattern[i] = bp_hyper_bar; + __kmp_barrier_gather_pattern[i] = bp_hyper_bar; + } + } + } + KMP_ASSERT(__kmp_xproc > 0); if (__kmp_avail_proc == 0) { __kmp_avail_proc = __kmp_xproc; diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp --- a/openmp/runtime/src/kmp_settings.cpp +++ b/openmp/runtime/src/kmp_settings.cpp @@ -13,6 +13,7 @@ #include "kmp.h" #include "kmp_affinity.h" #include "kmp_atomic.h" +#include "kmp_barrier.h" #if KMP_USE_HIER_SCHED #include "kmp_dispatch_hier.h" #endif @@ -1714,7 +1715,9 @@ const char *var; /* ---------- Barrier method control ------------ */ - static int dist_req = 0, non_dist_req = 0; + static int dist_req = 0; + static int hard_req = 0; + static int other_req = 0; static bool warn = 1; for (int i = bs_plain_barrier; i < bs_last_barrier; i++) { var = __kmp_barrier_pattern_env_name[i]; @@ -1729,8 
+1732,10 @@ ',')) { if (j == bp_dist_bar) { dist_req++; + } else if (j == bp_hard_bar) { + hard_req++; } else { - non_dist_req++; + other_req++; } __kmp_barrier_gather_pattern[i] = (kmp_bar_pat_e)j; break; @@ -1748,8 +1753,10 @@ if (__kmp_str_match(__kmp_barrier_pattern_name[j], 1, comma + 1)) { if (j == bp_dist_bar) { dist_req++; + } else if (j == bp_hard_bar) { + hard_req++; } else { - non_dist_req++; + other_req++; } __kmp_barrier_release_pattern[i] = (kmp_bar_pat_e)j; break; @@ -1767,9 +1774,8 @@ } if (dist_req != 0) { // set all barriers to dist - if ((non_dist_req != 0) && warn) { - KMP_INFORM(BarrierPatternOverride, name, - __kmp_barrier_pattern_name[bp_dist_bar]); + if ((hard_req + other_req != 0) && warn) { + KMP_INFORM(BarrierPatternOverride, "dist"); warn = 0; } for (int i = bs_plain_barrier; i < bs_last_barrier; i++) { @@ -1778,6 +1784,16 @@ if (__kmp_barrier_gather_pattern[i] != bp_dist_bar) __kmp_barrier_gather_pattern[i] = bp_dist_bar; } + } else if (hard_req != 0) { + // set all barriers to hard + if (other_req != 0 && warn) { + KMP_INFORM(BarrierPatternOverride, "hard"); + warn = 0; + } + for (int i = bs_plain_barrier; i < bs_last_barrier; i++) { + __kmp_barrier_release_pattern[i] = bp_hard_bar; + __kmp_barrier_gather_pattern[i] = bp_hard_bar; + } } } // __kmp_stg_parse_barrier_pattern diff --git a/openmp/runtime/src/kmp_stats.h b/openmp/runtime/src/kmp_stats.h --- a/openmp/runtime/src/kmp_stats.h +++ b/openmp/runtime/src/kmp_stats.h @@ -248,6 +248,8 @@ // KMP_hyper_release -- time in __kmp_hyper_barrier_release // KMP_dist_gather -- time in __kmp_dist_barrier_gather // KMP_dist_release -- time in __kmp_dist_barrier_release +// KMP_hard_gather -- time in __kmp_hard_barrier_gather +// KMP_hard_release -- time in __kmp_hard_barrier_release // clang-format off #define KMP_FOREACH_DEVELOPER_TIMER(macro, arg) \ macro(KMP_fork_call, 0, arg) \ @@ -263,6 +265,8 @@ macro(KMP_linear_release, 0, arg) \ macro(KMP_tree_gather, 0, arg) \ macro(KMP_tree_release, 0, arg) \ + macro(KMP_hard_gather, 0, arg) \ + macro(KMP_hard_release, 0, arg) \ macro(USER_resume, 0, arg) \ macro(USER_suspend, 0, arg) \ macro(USER_mwait, 0, arg) \ diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp --- a/openmp/runtime/src/kmp_tasking.cpp +++ b/openmp/runtime/src/kmp_tasking.cpp @@ -16,6 +16,7 @@ #include "kmp_stats.h" #include "kmp_wait_release.h" #include "kmp_taskdeps.h" +#include "kmp_barrier.h" #if OMPT_SUPPORT #include "ompt-specific.h" @@ -584,6 +585,11 @@ gtid, taskdata, thread_data->td.td_deque_ntasks, thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); + if (__kmp_barrier_gather_pattern[bs_plain_barrier] == bp_hard_bar) { + // wakeup sleeping thread if any + hardBarrier::wakeup(); + } + __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); return TASK_SUCCESSFULLY_PUSHED; @@ -3261,7 +3267,7 @@ // proceed. If this thread is in the last spin loop in the barrier, // waiting to be released, we know that the termination condition will not // be satisfied, so don't waste any cycles checking it. 
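+ // Note: hardBarrier::sync() executes tasks with flag == NULL even on the final spin, so a NULL flag by itself must not satisfy the spin condition; bail out early only when this is not the final spin. 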
- if (flag == NULL || (!final_spin && flag->done_check())) { + if (!final_spin && (flag == NULL || flag->done_check())) { KA_TRACE( 15, ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n", diff --git a/openmp/runtime/src/z_Linux_util.cpp b/openmp/runtime/src/z_Linux_util.cpp --- a/openmp/runtime/src/z_Linux_util.cpp +++ b/openmp/runtime/src/z_Linux_util.cpp @@ -2729,4 +2729,11 @@ } #endif // KMP_OS_LINUX +#ifdef KMP_OS_LINUX +int __kmp_get_online_cores() { + int r; + __kmp_type_convert(sysconf(_SC_NPROCESSORS_ONLN), &(r)); + return r; +} +#endif // end of file //
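
Note on the UPDATE_BST/CHECK_LBSY macros in kmp_barrier.cpp: they implement a sense-reversing barrier in hardware. Each core flips its own BST bit in the barrier window register, and the LBSY bit flips once every participating core has flipped its BST bit. The sketch below is only a software analogy of that handshake (an illustration, not the actual register protocol): it mirrors the control flow hardBarrier::sync() follows, where the runtime keeps polling LBSY, draining the task queue or sleeping in wfe, until the parity matches.

#include <atomic>

// Software analogy of the BST/LBSY handshake (illustration only).
// Each thread toggles its own "BST" parity; the "LBSY" parity toggles once
// all threads of the episode have toggled. In hardware the barrier blade
// flips LBSY by itself; here the last arriving thread emulates that.
class SenseReversingBarrier {
public:
  explicit SenseReversingBarrier(int nthreads)
      : nthreads_(nthreads), arrived_(0), lbsy_(false) {}

  void sync() {
    // Per-thread parity; assumes each thread uses a single barrier instance.
    static thread_local bool my_bst = false;
    my_bst = !my_bst; // UPDATE_BST: flip own bit
    if (arrived_.fetch_add(1, std::memory_order_acq_rel) == nthreads_ - 1) {
      arrived_.store(0, std::memory_order_relaxed);
      lbsy_.store(my_bst, std::memory_order_release); // "LBSY" follows the new parity
    } else {
      // CHECK_LBSY loop: hardBarrier::sync() additionally executes queued
      // tasks here, and uses CHECK_LBSY_WFE to sleep between polls when
      // blocktime is not KMP_MAX_BLOCKTIME and no tasks remain.
      while (lbsy_.load(std::memory_order_acquire) != my_bst) {
      }
    }
  }

private:
  const int nthreads_;
  std::atomic<int> arrived_;
  std::atomic<bool> lbsy_;
};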
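
The is_group_leader / get_group_from_tid / get_tid_of_group_leader helpers deliberately allow the primary thread's core to sit in the middle of a barrier group. Below is a small standalone worked example of that arithmetic, using assumed values CORES_PER_GROUP = 4 and primary_core = 6 (i.e. the primary occupies the third core of its hardware group); both values are illustrative, not taken from real hardware.

#include <cassert>

// Standalone copy of the tid-to-group arithmetic from hardBarrier, with
// assumed example values CORES_PER_GROUP = 4 and primary_core = 6.
constexpr int CORES_PER_GROUP = 4; // assumption for the example
constexpr int primary_core = 6;    // assumption for the example

constexpr bool is_group_leader(int tid) {
  return tid == 0 || ((tid + primary_core) % CORES_PER_GROUP) == 0;
}
constexpr int get_group_from_tid(int tid) {
  return (tid + (primary_core % CORES_PER_GROUP)) / CORES_PER_GROUP;
}
constexpr int get_tid_of_group_leader(int group) {
  return (group * CORES_PER_GROUP) - (primary_core % CORES_PER_GROUP);
}

int main() {
  // With close binding and one core per place, thread tid runs on core 6+tid.
  // Threads 0 and 1 share the primary's (partial) logical group 0: cores 6-7.
  assert(get_group_from_tid(0) == 0 && get_group_from_tid(1) == 0);
  // Thread 2 lands on core 8, the first core of the next barrier group.
  assert(is_group_leader(2) && get_group_from_tid(2) == 1);
  assert(get_tid_of_group_leader(1) == 2);
  // Threads 2..5 fill logical group 1 (cores 8-11); thread 6 leads group 2.
  assert(get_group_from_tid(5) == 1 && is_group_leader(6));
  return 0;
}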
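
Finally, a minimal usage sketch of how the new pattern would be selected at run time, assuming the hard_barrier sysfs driver described above is loaded and the runtime is built with this patch. The KMP_*_BARRIER_PATTERN variable is the existing barrier-pattern knob; "hard" is the value added here, and the place/bind settings follow the restrictions checked by __kmp_check_places_for_hard_barrier and __kmp_can_use_hard_barrier (one CPU per place, consecutive places, proc_bind close).

// test_hard_barrier.cpp -- usage sketch (assumes a machine where each place
// holds a single CPU, e.g. no SMT, and the hard_barrier driver is loaded).
//
//   clang++ -fopenmp test_hard_barrier.cpp -o test_hard_barrier
//   export OMP_PROC_BIND=close        # required by __kmp_can_use_hard_barrier
//   export OMP_PLACES=cores           # one CPU per place, consecutive cores
//   export KMP_PLAIN_BARRIER_PATTERN=hard,hard  # settings code then applies
//                                               # "hard" to all barrier types
//   ./test_hard_barrier
//
// If a check fails, the runtime prints HardBarrierNotSupported (or
// HardBarrierCannotBeUsed per team) and falls back to the hyper barrier.
#include <cstdio>

int main() {
  long sum = 0;
  // The implicit barriers of this region go through __kmp_hard_barrier_gather
  // and __kmp_hard_barrier_release when the hard pattern is active.
#pragma omp parallel for reduction(+ : sum)
  for (int i = 0; i < 1000000; i++)
    sum += i;
  std::printf("sum = %ld\n", sum);
  return 0;
}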