diff --git a/openmp/runtime/src/i18n/en_US.txt b/openmp/runtime/src/i18n/en_US.txt --- a/openmp/runtime/src/i18n/en_US.txt +++ b/openmp/runtime/src/i18n/en_US.txt @@ -417,6 +417,8 @@ AffIgnoringHwloc "%1$s: Ignoring hwloc mechanism." AffHwlocErrorOccurred "%1$s: Hwloc failed in %2$s. Relying on internal affinity mechanisms." EnvSerialWarn "%1$s must be set prior to OpenMP runtime library initialization; ignored." +EnvMwaitWarn "You have enabled the use of umonitor/umwait. If the CPU doesn't have that enabled " + "you'll get an illegal instruction exception." EnvVarDeprecated "%1$s variable deprecated, please use %2$s instead." RedMethodNotSupported "KMP_FORCE_REDUCTION: %1$s method is not supported; using critical." AffHWSubsetNoHWLOC "KMP_HW_SUBSET ignored: unsupported item requested for non-HWLOC topology method (KMP_TOPOLOGY_METHOD)" diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -255,6 +255,10 @@ typedef union kmp_info kmp_info_p; typedef union kmp_root kmp_root_p; +template class kmp_flag_32; +template class kmp_flag_64; +class kmp_flag_oncore; + #ifdef __cplusplus extern "C" { #endif @@ -1313,6 +1317,96 @@ } \ } +// User-level Monitor/Mwait +#if KMP_HAVE_UMWAIT +// We always try for UMWAIT first +#if (KMP_COMPILER_ICC && __INTEL_COMPILER >= 1300) || \ + (KMP_COMPILER_MSVC && _MSC_VER >= 1700) || \ + (KMP_COMPILER_CLANG && (KMP_MSVC_COMPAT || __MINGW32__)) || \ + (KMP_COMPILER_GCC && __MINGW32__) +#if KMP_OS_UNIX +#include +#else +#include +#endif // KMP_OS_UNIX +#else +#define USE_MWAIT_ASM \ + KMP_OS_UNIX && (!KMP_COMPILER_ICC || __INTEL_COMPILER < 1900) +#endif // KMP_COMPILER_ICC etc. +#if KMP_OS_UNIX && 0 // "waitpkg" not recognized yet +__attribute__((target("waitpkg"))) +#endif +static inline int +__kmp_tpause(uint32_t hint, uint64_t counter) { +#if (USE_MWAIT_ASM) + uint32_t timeHi = uint32_t(counter >> 32); + uint32_t timeLo = uint32_t(counter & 0xffffffff); + char flag; + __asm__ volatile("#tpause\n.byte 0x66, 0x0F, 0xAE, 0xF1\n" + "setb %0" + : "=r"(flag) + : "a"(timeLo), "d"(timeHi), "c"(hint) + :); + return flag; +#else + return _tpause(hint, counter); +#endif +} +#if KMP_OS_UNIX && 0 // "waitpkg" not recognized on our build machine +__attribute__((target("waitpkg"))) +#endif +static inline void +__kmp_umonitor(void *cacheline) { +#if (USE_MWAIT_ASM) + __asm__ volatile("# umonitor\n.byte 0xF3, 0x0F, 0xAE, 0x01 " + : + : "a"(cacheline) + :); +#else + _umonitor(cacheline); +#endif +} +#if KMP_OS_UNIX && 0 // "waitpkg" not recognized on our build machine +__attribute__((target("waitpkg"))) +#endif +static inline int +__kmp_umwait(uint32_t hint, uint64_t counter) { +#if (USE_MWAIT_ASM) + uint32_t timeHi = uint32_t(counter >> 32); + uint32_t timeLo = uint32_t(counter & 0xffffffff); + char flag; + __asm__ volatile("#umwait\n.byte 0xF2, 0x0F, 0xAE, 0xF1\n" + "setb %0" + : "=r"(flag) + : "a"(timeLo), "d"(timeHi), "c"(hint) + :); + return flag; +#else + return _umwait(hint, counter); +#endif +} +#elif KMP_HAVE_MWAIT +#if KMP_OS_UNIX +#include +#else +#include +#endif +#if KMP_OS_UNIX +__attribute__((target("sse3"))) +#endif +static inline void +__kmp_mm_monitor(void *cacheline, unsigned extensions, unsigned hints) { + _mm_monitor(cacheline, extensions, hints); +} +#if KMP_OS_UNIX +__attribute__((target("sse3"))) +#endif +static inline void +__kmp_mm_mwait(unsigned extensions, unsigned hints) { + _mm_mwait(extensions, hints); +} +#endif // KMP_HAVE_UMWAIT + /* ------------------------------------------------------------------------ */ /* Support datatypes for the orphaned construct nesting checks. */ /* ------------------------------------------------------------------------ */ @@ -3089,6 +3183,13 @@ KMP_FATAL(ThreadIdentInvalid); } +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT +extern int __kmp_user_level_mwait; // TRUE or FALSE; from KMP_USER_LEVEL_MWAIT +extern int __kmp_umwait_enabled; // Runtime check if user-level mwait enabled +extern int __kmp_mwait_enabled; // Runtime check if ring3 mwait is enabled +extern int __kmp_mwait_hints; // Hints to pass in to mwait +#endif + /* ------------------------------------------------------------------------- */ extern kmp_global_t __kmp_global; /* global status */ @@ -3290,17 +3391,14 @@ extern void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32), void *obj); -class kmp_flag_32; -class kmp_flag_64; -class kmp_flag_oncore; -extern void __kmp_wait_64(kmp_info_t *this_thr, kmp_flag_64 *flag, +extern void __kmp_wait_64(kmp_info_t *this_thr, kmp_flag_64<> *flag, int final_spin #if USE_ITT_BUILD , void *itt_sync_obj #endif ); -extern void __kmp_release_64(kmp_flag_64 *flag); +extern void __kmp_release_64(kmp_flag_64<> *flag); extern void __kmp_infinite_loop(void); @@ -3398,13 +3496,6 @@ extern void __kmp_lock_suspend_mx(kmp_info_t *th); extern void __kmp_unlock_suspend_mx(kmp_info_t *th); -extern void __kmp_suspend_32(int th_gtid, kmp_flag_32 *flag); -extern void __kmp_suspend_64(int th_gtid, kmp_flag_64 *flag); -extern void __kmp_suspend_oncore(int th_gtid, kmp_flag_oncore *flag); -extern void __kmp_resume_32(int target_gtid, kmp_flag_32 *flag); -extern void __kmp_resume_64(int target_gtid, kmp_flag_64 *flag); -extern void __kmp_resume_oncore(int target_gtid, kmp_flag_oncore *flag); - extern void __kmp_elapsed(double *); extern void __kmp_elapsed_tick(double *); @@ -3529,28 +3620,6 @@ kmp_task_t *task); extern void __kmp_fulfill_event(kmp_event_t *event); -int __kmp_execute_tasks_32(kmp_info_t *thread, kmp_int32 gtid, - kmp_flag_32 *flag, int final_spin, - int *thread_finished, -#if USE_ITT_BUILD - void *itt_sync_obj, -#endif /* USE_ITT_BUILD */ - kmp_int32 is_constrained); -int __kmp_execute_tasks_64(kmp_info_t *thread, kmp_int32 gtid, - kmp_flag_64 *flag, int final_spin, - int *thread_finished, -#if USE_ITT_BUILD - void *itt_sync_obj, -#endif /* USE_ITT_BUILD */ - kmp_int32 is_constrained); -int __kmp_execute_tasks_oncore(kmp_info_t *thread, kmp_int32 gtid, - kmp_flag_oncore *flag, int final_spin, - int *thread_finished, -#if USE_ITT_BUILD - void *itt_sync_obj, -#endif /* USE_ITT_BUILD */ - kmp_int32 is_constrained); - extern void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team); extern void __kmp_reap_task_teams(void); @@ -3914,4 +3983,46 @@ } #endif +template +extern void __kmp_suspend_32(int th_gtid, kmp_flag_32 *flag); +template +extern void __kmp_suspend_64(int th_gtid, kmp_flag_64 *flag); +extern void __kmp_suspend_oncore(int th_gtid, kmp_flag_oncore *flag); +template +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT +extern void __kmp_mwait_32(int th_gtid, kmp_flag_32 *flag); +template +extern void __kmp_mwait_64(int th_gtid, kmp_flag_64 *flag); +extern void __kmp_mwait_oncore(int th_gtid, kmp_flag_oncore *flag); +template +#endif +extern void __kmp_resume_32(int target_gtid, kmp_flag_32 *flag); +template +extern void __kmp_resume_64(int target_gtid, kmp_flag_64 *flag); +extern void __kmp_resume_oncore(int target_gtid, kmp_flag_oncore *flag); + +template +int __kmp_execute_tasks_32(kmp_info_t *thread, kmp_int32 gtid, + kmp_flag_32 *flag, int final_spin, + int *thread_finished, +#if USE_ITT_BUILD + void *itt_sync_obj, +#endif /* USE_ITT_BUILD */ + kmp_int32 is_constrained); +template +int __kmp_execute_tasks_64(kmp_info_t *thread, kmp_int32 gtid, + kmp_flag_64 *flag, int final_spin, + int *thread_finished, +#if USE_ITT_BUILD + void *itt_sync_obj, +#endif /* USE_ITT_BUILD */ + kmp_int32 is_constrained); +int __kmp_execute_tasks_oncore(kmp_info_t *thread, kmp_int32 gtid, + kmp_flag_oncore *flag, int final_spin, + int *thread_finished, +#if USE_ITT_BUILD + void *itt_sync_obj, +#endif /* USE_ITT_BUILD */ + kmp_int32 is_constrained); + #endif /* KMP_H */ diff --git a/openmp/runtime/src/kmp_barrier.cpp b/openmp/runtime/src/kmp_barrier.cpp --- a/openmp/runtime/src/kmp_barrier.cpp +++ b/openmp/runtime/src/kmp_barrier.cpp @@ -78,7 +78,7 @@ is valid any more - it could be deallocated by the master thread at any time. */ ANNOTATE_BARRIER_BEGIN(this_thr); - kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]); + kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]); flag.release(); } else { kmp_balign_team_t *team_bar = &team->t.t_bar[bt]; @@ -101,14 +101,14 @@ &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state)); // Wait for worker thread to arrive - kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, - new_state); if (cancellable) { - bool cancelled = flag.wait_cancellable_nosleep( - this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); - if (cancelled) + kmp_flag_64 flag( + &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state); + if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj))) return true; } else { + kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, + new_state); flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); } ANNOTATE_BARRIER_END(other_threads[i]); @@ -203,22 +203,20 @@ other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP)); ANNOTATE_BARRIER_BEGIN(other_threads[i]); - kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, - other_threads[i]); + kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go, + other_threads[i]); flag.release(); } } } else { // Wait for the MASTER thread to release us KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n", gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); - kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); if (cancellable) { - bool cancelled = flag.wait_cancellable_nosleep( - this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); - if (cancelled) { + kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); + if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj))) return true; - } } else { + kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); } ANNOTATE_BARRIER_END(this_thr); @@ -339,7 +337,7 @@ gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid, &child_bar->b_arrived, new_state)); // Wait for child to arrive - kmp_flag_64 flag(&child_bar->b_arrived, new_state); + kmp_flag_64<> flag(&child_bar->b_arrived, new_state); flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); ANNOTATE_BARRIER_END(child_thr); #if USE_ITT_BUILD && USE_ITT_NOTIFY @@ -384,7 +382,7 @@ is valid any more - it could be deallocated by the master thread at any time. */ ANNOTATE_BARRIER_BEGIN(this_thr); - kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]); + kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]); flag.release(); } else { // Need to update the team arrived pointer if we are the master thread @@ -420,7 +418,7 @@ KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); // Wait for parent thread to release us - kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); + kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); ANNOTATE_BARRIER_END(this_thr); #if USE_ITT_BUILD && USE_ITT_NOTIFY @@ -498,7 +496,7 @@ child_bar->b_go + KMP_BARRIER_STATE_BUMP)); // Release child from barrier ANNOTATE_BARRIER_BEGIN(child_thr); - kmp_flag_64 flag(&child_bar->b_go, child_thr); + kmp_flag_64<> flag(&child_bar->b_go, child_thr); flag.release(); child++; child_tid++; @@ -540,7 +538,7 @@ #endif /* Perform a hypercube-embedded tree gather to wait until all of the threads have arrived, and reduce any required data as we go. */ - kmp_flag_64 p_flag(&thr_bar->b_arrived); + kmp_flag_64<> p_flag(&thr_bar->b_arrived); for (level = 0, offset = 1; offset < num_threads; level += branch_bits, offset <<= branch_bits) { kmp_uint32 child; @@ -588,7 +586,7 @@ gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid, &child_bar->b_arrived, new_state)); // Wait for child to arrive - kmp_flag_64 c_flag(&child_bar->b_arrived, new_state); + kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state); c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); ANNOTATE_BARRIER_END(child_thr); KMP_MB(); // Synchronize parent and child threads. @@ -670,7 +668,7 @@ KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); // Wait for parent thread to release us - kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); + kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); ANNOTATE_BARRIER_END(this_thr); #if USE_ITT_BUILD && USE_ITT_NOTIFY @@ -772,7 +770,7 @@ child_bar->b_go + KMP_BARRIER_STATE_BUMP)); // Release child from barrier ANNOTATE_BARRIER_BEGIN(child_thr); - kmp_flag_64 flag(&child_bar->b_go, child_thr); + kmp_flag_64<> flag(&child_bar->b_go, child_thr); flag.release(); } } @@ -917,7 +915,7 @@ KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting " "for leaf kids\n", gtid, team->t.t_id, tid)); - kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state); + kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state); flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); if (reduce) { ANNOTATE_REDUCE_AFTER(reduce); @@ -957,7 +955,7 @@ gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid, &child_bar->b_arrived, new_state)); - kmp_flag_64 flag(&child_bar->b_arrived, new_state); + kmp_flag_64<> flag(&child_bar->b_arrived, new_state); flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); ANNOTATE_BARRIER_END(child_thr); if (reduce) { @@ -990,7 +988,7 @@ gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid, &child_bar->b_arrived, new_state)); - kmp_flag_64 flag(&child_bar->b_arrived, new_state); + kmp_flag_64<> flag(&child_bar->b_arrived, new_state); flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); ANNOTATE_BARRIER_END(child_thr); if (reduce) { @@ -1025,7 +1023,8 @@ !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived // flag; release it ANNOTATE_BARRIER_BEGIN(this_thr); - kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]); + kmp_flag_64<> flag(&thr_bar->b_arrived, + other_threads[thr_bar->parent_tid]); flag.release(); } else { // Leaf does special release on "offset" bits of parent's b_arrived flag @@ -1069,7 +1068,7 @@ thr_bar->team == NULL) { // Use traditional method of waiting on my own b_go flag thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG; - kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); + kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); ANNOTATE_BARRIER_END(this_thr); TCW_8(thr_bar->b_go, @@ -1218,7 +1217,7 @@ child_bar->b_go + KMP_BARRIER_STATE_BUMP)); // Release child using child's b_go flag ANNOTATE_BARRIER_BEGIN(child_thr); - kmp_flag_64 flag(&child_bar->b_go, child_thr); + kmp_flag_64<> flag(&child_bar->b_go, child_thr); flag.release(); } } else { // Release all children at once with leaf_state bits on my own @@ -1244,7 +1243,7 @@ child_bar->b_go + KMP_BARRIER_STATE_BUMP)); // Release child using child's b_go flag ANNOTATE_BARRIER_BEGIN(child_thr); - kmp_flag_64 flag(&child_bar->b_go, child_thr); + kmp_flag_64<> flag(&child_bar->b_go, child_thr); flag.release(); } } diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp --- a/openmp/runtime/src/kmp_global.cpp +++ b/openmp/runtime/src/kmp_global.cpp @@ -206,6 +206,13 @@ int __kmp_display_env_verbose = FALSE; int __kmp_omp_cancellation = FALSE; +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT +int __kmp_user_level_mwait = FALSE; +int __kmp_umwait_enabled = FALSE; +int __kmp_mwait_enabled = FALSE; +int __kmp_mwait_hints = 0; +#endif + /* map OMP 3.0 schedule types with our internal schedule types */ enum sched_type __kmp_sch_map[kmp_sched_upper - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2] = { diff --git a/openmp/runtime/src/kmp_os.h b/openmp/runtime/src/kmp_os.h --- a/openmp/runtime/src/kmp_os.h +++ b/openmp/runtime/src/kmp_os.h @@ -281,6 +281,16 @@ #define __forceinline __inline #endif +/* Check if the OS/arch can support user-level mwait */ +// All mwait code tests for UMWAIT first, so it should only fall back to ring3 +// MWAIT for KNL. +#define KMP_HAVE_MWAIT \ + ((KMP_ARCH_X86 || KMP_ARCH_X86_64) && (KMP_OS_LINUX || KMP_OS_WINDOWS) && \ + !KMP_MIC2) +#define KMP_HAVE_UMWAIT \ + ((KMP_ARCH_X86 || KMP_ARCH_X86_64) && (KMP_OS_LINUX || KMP_OS_WINDOWS) && \ + !KMP_MIC) + #if KMP_OS_WINDOWS #include diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp --- a/openmp/runtime/src/kmp_runtime.cpp +++ b/openmp/runtime/src/kmp_runtime.cpp @@ -5463,7 +5463,7 @@ } #endif // first check if thread is sleeping - kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th); + kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th); if (fl.is_sleeping()) fl.resume(__kmp_gtid_from_thread(th)); KMP_CPU_PAUSE(); @@ -5904,7 +5904,7 @@ /* Need release fence here to prevent seg faults for tree forkjoin barrier * (GEH) */ ANNOTATE_HAPPENS_BEFORE(thread); - kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread); + kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread); __kmp_release_64(&flag); } @@ -6597,6 +6597,48 @@ #endif /* KMP_MIC_SUPPORTED */ +#if KMP_HAVE_UMWAIT +static void __kmp_user_level_mwait_init() { + struct kmp_cpuid buf; + __kmp_x86_cpuid(7, 0, &buf); + __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait; + KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n", + __kmp_umwait_enabled)); +} +#elif KMP_HAVE_MWAIT +#ifndef AT_INTELPHIUSERMWAIT +// Spurious, non-existent value that should always fail to return anything. +// Will be replaced with the correct value when we know that. +#define AT_INTELPHIUSERMWAIT 10000 +#endif +// getauxval() function is available in RHEL7 and SLES12. If a system with an +// earlier OS is used to build the RTL, we'll use the following internal +// function when the entry is not found. +unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL; +unsigned long getauxval(unsigned long) { return 0; } + +static void __kmp_user_level_mwait_init() { + // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available + // use them to find if the user-level mwait is enabled. Otherwise, forcibly + // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable + // KMP_USER_LEVEL_MWAIT was set to TRUE. + if (__kmp_mic_type == mic3) { + unsigned long res = getauxval(AT_INTELPHIUSERMWAIT); + if ((res & 0x1) || __kmp_user_level_mwait) { + __kmp_mwait_enabled = TRUE; + if (__kmp_user_level_mwait) { + KMP_INFORM(EnvMwaitWarn); + } + } else { + __kmp_mwait_enabled = FALSE; + } + } + KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, " + "__kmp_mwait_enabled = %d\n", + __kmp_mic_type, __kmp_mwait_enabled)); +} +#endif /* KMP_HAVE_UMWAIT */ + static void __kmp_do_serial_initialize(void) { int i, gtid; int size; @@ -6771,6 +6813,9 @@ __kmp_env_initialize(NULL); +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT + __kmp_user_level_mwait_init(); +#endif // Print all messages in message catalog for testing purposes. #ifdef KMP_DEBUG char const *val = __kmp_env_get("KMP_DUMP_CATALOG"); @@ -8371,7 +8416,8 @@ for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) { kmp_info_t *thread = __kmp_threads[gtid]; if (thread) { // Wake it if sleeping - kmp_flag_64 fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread); + kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, + thread); if (fl.is_sleeping()) fl.resume(gtid); else if (__kmp_try_suspend_mx(thread)) { // got suspend lock diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp --- a/openmp/runtime/src/kmp_settings.cpp +++ b/openmp/runtime/src/kmp_settings.cpp @@ -4621,6 +4621,35 @@ __kmp_stg_print_bool(buffer, name, __kmp_enable_task_throttling); } // __kmp_stg_print_task_throttling +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT +// ----------------------------------------------------------------------------- +// KMP_USER_LEVEL_MWAIT + +static void __kmp_stg_parse_user_level_mwait(char const *name, + char const *value, void *data) { + __kmp_stg_parse_bool(name, value, &__kmp_user_level_mwait); +} // __kmp_stg_parse_user_level_mwait + +static void __kmp_stg_print_user_level_mwait(kmp_str_buf_t *buffer, + char const *name, void *data) { + __kmp_stg_print_bool(buffer, name, __kmp_user_level_mwait); +} // __kmp_stg_print_user_level_mwait + +// ----------------------------------------------------------------------------- +// KMP_MWAIT_HINTS + +static void __kmp_stg_parse_mwait_hints(char const *name, char const *value, + void *data) { + __kmp_stg_parse_int(name, value, 0, INT_MAX, &__kmp_mwait_hints); +} // __kmp_stg_parse_mwait_hints + +static void __kmp_stg_print_mwait_hints(kmp_str_buf_t *buffer, char const *name, + void *data) { + __kmp_stg_print_int(buffer, name, __kmp_mwait_hints); +} // __kmp_stg_print_mwait_hints + +#endif // KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT + // ----------------------------------------------------------------------------- // OMP_DISPLAY_ENV @@ -4939,6 +4968,12 @@ __kmp_stg_print_omp_tool_libraries, NULL, 0, 0}, #endif +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT + {"KMP_USER_LEVEL_MWAIT", __kmp_stg_parse_user_level_mwait, + __kmp_stg_print_user_level_mwait, NULL, 0, 0}, + {"KMP_MWAIT_HINTS", __kmp_stg_parse_mwait_hints, + __kmp_stg_print_mwait_hints, NULL, 0, 0}, +#endif {"", NULL, NULL, NULL, 0, 0}}; // settings static int const __kmp_stg_count = diff --git a/openmp/runtime/src/kmp_stats.h b/openmp/runtime/src/kmp_stats.h --- a/openmp/runtime/src/kmp_stats.h +++ b/openmp/runtime/src/kmp_stats.h @@ -258,6 +258,7 @@ macro(KMP_tree_release, 0, arg) \ macro(USER_resume, 0, arg) \ macro(USER_suspend, 0, arg) \ + macro(USER_mwait, 0, arg) \ macro(KMP_allocate_team, 0, arg) \ macro(KMP_setup_icv_copy, 0, arg) \ macro(USER_icv_copy, 0, arg) \ diff --git a/openmp/runtime/src/kmp_taskdeps.cpp b/openmp/runtime/src/kmp_taskdeps.cpp --- a/openmp/runtime/src/kmp_taskdeps.cpp +++ b/openmp/runtime/src/kmp_taskdeps.cpp @@ -786,7 +786,8 @@ } int thread_finished = FALSE; - kmp_flag_32 flag((std::atomic *)&node.dn.npredecessors, 0U); + kmp_flag_32 flag( + (std::atomic *)&node.dn.npredecessors, 0U); while (node.dn.npredecessors > 0) { flag.execute_tasks(thread, gtid, FALSE, &thread_finished USE_ITT_BUILD_ARG(NULL), diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp --- a/openmp/runtime/src/kmp_tasking.cpp +++ b/openmp/runtime/src/kmp_tasking.cpp @@ -1876,9 +1876,10 @@ must_wait = must_wait || (thread->th.th_task_team != NULL && thread->th.th_task_team->tt.tt_found_proxy_tasks); if (must_wait) { - kmp_flag_32 flag(RCAST(std::atomic *, - &(taskdata->td_incomplete_child_tasks)), - 0U); + kmp_flag_32 flag( + RCAST(std::atomic *, + &(taskdata->td_incomplete_child_tasks)), + 0U); while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) { flag.execute_tasks(thread, gtid, FALSE, &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), @@ -1984,7 +1985,7 @@ thread->th.ompt_thread_info.ompt_task_yielded = 1; #endif __kmp_execute_tasks_32( - thread, gtid, NULL, FALSE, + thread, gtid, (kmp_flag_32<> *)NULL, FALSE, &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), __kmp_task_stealing_constraint); #if OMPT_SUPPORT @@ -2512,8 +2513,8 @@ if (!taskdata->td_flags.team_serial || (thread->th.th_task_team != NULL && thread->th.th_task_team->tt.tt_found_proxy_tasks)) { - kmp_flag_32 flag(RCAST(std::atomic *, &(taskgroup->count)), - 0U); + kmp_flag_32 flag( + RCAST(std::atomic *, &(taskgroup->count)), 0U); while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) { flag.execute_tasks(thread, gtid, FALSE, &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), @@ -3021,8 +3022,9 @@ } } +template int __kmp_execute_tasks_32( - kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin, + kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), kmp_int32 is_constrained) { return __kmp_execute_tasks_template( @@ -3030,8 +3032,9 @@ thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); } +template int __kmp_execute_tasks_64( - kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin, + kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), kmp_int32 is_constrained) { return __kmp_execute_tasks_template( @@ -3048,6 +3051,23 @@ thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); } +template int +__kmp_execute_tasks_32(kmp_info_t *, kmp_int32, + kmp_flag_32 *, int, + int *USE_ITT_BUILD_ARG(void *), kmp_int32); + +template int __kmp_execute_tasks_64(kmp_info_t *, kmp_int32, + kmp_flag_64 *, + int, + int *USE_ITT_BUILD_ARG(void *), + kmp_int32); + +template int __kmp_execute_tasks_64(kmp_info_t *, kmp_int32, + kmp_flag_64 *, + int, + int *USE_ITT_BUILD_ARG(void *), + kmp_int32); + // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the // next barrier so they can assist in executing enqueued tasks. // First thread in allocates the task team atomically. @@ -3597,9 +3617,10 @@ // Worker threads may have dropped through to release phase, but could // still be executing tasks. Wait here for tasks to complete. To avoid // memory contention, only master thread checks termination condition. - kmp_flag_32 flag(RCAST(std::atomic *, - &task_team->tt.tt_unfinished_threads), - 0U); + kmp_flag_32 flag( + RCAST(std::atomic *, + &task_team->tt.tt_unfinished_threads), + 0U); flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); } // Deactivate the old task team, so that the worker threads will stop @@ -3621,7 +3642,7 @@ } // __kmp_tasking_barrier: -// This routine may only called when __kmp_tasking_mode == tskm_extra_barrier. +// This routine is called only when __kmp_tasking_mode == tskm_extra_barrier. // Internal function to execute all tasks prior to a regular barrier or a join // barrier. It is a full barrier itself, which unfortunately turns regular // barriers into double barriers and join barriers into 1 1/2 barriers. @@ -3635,7 +3656,7 @@ #if USE_ITT_BUILD KMP_FSYNC_SPIN_INIT(spin, NULL); #endif /* USE_ITT_BUILD */ - kmp_flag_32 spin_flag(spin, 0U); + kmp_flag_32 spin_flag(spin, 0U); while (!spin_flag.execute_tasks(thread, gtid, TRUE, &flag USE_ITT_BUILD_ARG(NULL), 0)) { #if USE_ITT_BUILD diff --git a/openmp/runtime/src/kmp_wait_release.h b/openmp/runtime/src/kmp_wait_release.h --- a/openmp/runtime/src/kmp_wait_release.h +++ b/openmp/runtime/src/kmp_wait_release.h @@ -42,20 +42,26 @@ flag_oncore /**< special 64-bit flag for on-core barrier (hierarchical) */ }; +struct flag_properties { + unsigned int type : 16; + unsigned int reserved : 16; +}; + /*! * Base class for wait/release volatile flag */ template class kmp_flag_native { volatile P *loc; - flag_type t; + flag_properties t; public: typedef P flag_t; - kmp_flag_native(volatile P *p, flag_type ft) : loc(p), t(ft) {} + kmp_flag_native(volatile P *p, flag_type ft) + : loc(p), t({(unsigned int)ft, 0U}) {} volatile P *get() { return loc; } void *get_void_p() { return RCAST(void *, CCAST(P *, loc)); } void set(volatile P *new_loc) { loc = new_loc; } - flag_type get_type() { return t; } + flag_type get_type() { return (flag_type)(t.type); } P load() { return *loc; } void store(P val) { *loc = val; } }; @@ -67,10 +73,12 @@ std::atomic

*loc; /**< Pointer to the flag storage that is modified by another thread */ - flag_type t; /**< "Type" of the flag in loc */ + flag_properties t; /**< "Type" of the flag in loc */ + public: typedef P flag_t; - kmp_flag(std::atomic

*p, flag_type ft) : loc(p), t(ft) {} + kmp_flag(std::atomic

*p, flag_type ft) + : loc(p), t({(unsigned int)ft, 0U}) {} /*! * @result the pointer to the actual flag */ @@ -86,7 +94,7 @@ /*! * @result the flag_type */ - flag_type get_type() { return t; } + flag_type get_type() { return (flag_type)(t.type); } /*! * @result flag value */ @@ -104,6 +112,7 @@ bool notdone_check(); P internal_release(); void suspend(int th_gtid); + void mwait(int th_gtid); void resume(int th_gtid); P set_sleeping(); P unset_sleeping(); @@ -160,8 +169,8 @@ to wake it back up to prevent deadlocks! NOTE: We may not belong to a team at this point. */ -template +template static inline bool __kmp_wait_template(kmp_info_t *this_thr, C *flag USE_ITT_BUILD_ARG(void *itt_sync_obj)) { @@ -185,7 +194,7 @@ return false; } th_gtid = this_thr->th.th_info.ds.ds_gtid; - if (cancellable) { + if (Cancellable) { kmp_team_t *team = this_thr->th.th_team; if (team && team->t.t_cancel_request == cancel_parallel) return true; @@ -375,7 +384,7 @@ } #endif // Check if the barrier surrounding this wait loop has been cancelled - if (cancellable) { + if (Cancellable) { kmp_team_t *team = this_thr->th.th_team; if (team && team->t.t_cancel_request == cancel_parallel) break; @@ -400,23 +409,31 @@ #endif // Don't suspend if wait loop designated non-sleepable // in template parameters - if (!sleepable) + if (!Sleepable) continue; if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && __kmp_pause_status != kmp_soft_paused) continue; - KF_TRACE(50, ("__kmp_wait_sleep: T#%d suspend time reached\n", th_gtid)); - +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT + if (__kmp_mwait_enabled || __kmp_umwait_enabled) { + KF_TRACE(50, ("__kmp_wait_sleep: T#%d using monitor/mwait\n", th_gtid)); + flag->mwait(th_gtid); + } else { +#endif + KF_TRACE(50, ("__kmp_wait_sleep: T#%d suspend time reached\n", th_gtid)); #if KMP_OS_UNIX - if (final_spin) - KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, false); + if (final_spin) + KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, false); #endif - flag->suspend(th_gtid); + flag->suspend(th_gtid); #if KMP_OS_UNIX - if (final_spin) - KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, true); + if (final_spin) + KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, true); +#endif +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT + } #endif if (TCR_4(__kmp_global.g.g_done)) { @@ -458,7 +475,7 @@ KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, false); #endif KMP_FSYNC_SPIN_ACQUIRED(CCAST(void *, spin)); - if (cancellable) { + if (Cancellable) { kmp_team_t *team = this_thr->th.th_team; if (team && team->t.t_cancel_request == cancel_parallel) { if (tasks_completed) { @@ -475,6 +492,83 @@ return false; } +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT +// Set up a monitor on the flag variable causing the calling thread to wait in +// a less active state until the flag variable is modified. +template +static inline void __kmp_mwait_template(int th_gtid, C *flag) { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_mwait); + kmp_info_t *th = __kmp_threads[th_gtid]; + + KF_TRACE(30, ("__kmp_mwait_template: T#%d enter for flag = %p\n", th_gtid, + flag->get())); + + // User-level mwait is available + KMP_DEBUG_ASSERT(__kmp_mwait_enabled || __kmp_umwait_enabled); + + __kmp_suspend_initialize_thread(th); + __kmp_lock_suspend_mx(th); + + volatile void *spin = flag->get(); + void *cacheline = (void *)(kmp_uint64(spin) & ~(CACHE_LINE - 1)); + + if (!flag->done_check()) { + // Mark thread as no longer active + th->th.th_active = FALSE; + if (th->th.th_active_in_pool) { + th->th.th_active_in_pool = FALSE; + KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); + KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) >= 0); + } + flag->set_sleeping(); + KF_TRACE(50, ("__kmp_mwait_template: T#%d calling monitor\n", th_gtid)); +#if KMP_HAVE_UMWAIT + if (__kmp_umwait_enabled) { + __kmp_umonitor(cacheline); + } +#elif KMP_HAVE_MWAIT + if (__kmp_mwait_enabled) { + __kmp_mm_monitor(cacheline, 0, 0); + } +#endif + // To avoid a race, check flag between 'monitor' and 'mwait'. A write to + // the address could happen after the last time we checked and before + // monitoring started, in which case monitor can't detect the change. + if (flag->done_check()) + flag->unset_sleeping(); + else { + // if flag changes here, wake-up happens immediately + TCW_PTR(th->th.th_sleep_loc, (void *)flag); + __kmp_unlock_suspend_mx(th); + KF_TRACE(50, ("__kmp_mwait_template: T#%d calling mwait\n", th_gtid)); +#if KMP_HAVE_UMWAIT + if (__kmp_umwait_enabled) { + __kmp_umwait(1, 100); // to do: enable ctrl via hints, backoff counter + } +#elif KMP_HAVE_MWAIT + if (__kmp_mwait_enabled) { + __kmp_mm_mwait(0, __kmp_mwait_hints); + } +#endif + KF_TRACE(50, ("__kmp_mwait_template: T#%d mwait done\n", th_gtid)); + __kmp_lock_suspend_mx(th); + // Clean up sleep info; doesn't matter how/why this thread stopped waiting + if (flag->is_sleeping()) + flag->unset_sleeping(); + TCW_PTR(th->th.th_sleep_loc, NULL); + } + // Mark thread as active again + th->th.th_active = TRUE; + if (TCR_4(th->th.th_in_pool)) { + KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth); + th->th.th_active_in_pool = TRUE; + } + } // Drop out to main wait loop to check flag, handle tasks, etc. + __kmp_unlock_suspend_mx(th); + KF_TRACE(30, ("__kmp_mwait_template: T#%d exit\n", th_gtid)); +} +#endif // KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT + /* Release any threads specified as waiting on the flag by releasing the flag and resume the waiting thread if indicated by the sleep bit(s). A thread that calls __kmp_wait_template must call this function to wake up the potentially @@ -545,7 +639,7 @@ }; // Basic flag that does not use C11 Atomics -template +template class kmp_basic_flag_native : public kmp_flag_native { typedef flag_traits traits_type; FlagType checker; /**< Value to compare flag to to check if flag has been @@ -588,7 +682,13 @@ /*! * @result true if the flag object has been released. */ - bool done_check() { return traits_type::tcr(*(this->get())) == checker; } + bool done_check() { + if (Sleepable) + return (traits_type::tcr(*(this->get())) & ~KMP_BARRIER_SLEEP_STATE) == + checker; + else + return traits_type::tcr(*(this->get())) == checker; + } /*! * @param old_loc in old value of flag * @result true if the flag's old value indicates it was released. @@ -643,7 +743,8 @@ enum barrier_type get_bt() { return bs_last_barrier; } }; -template class kmp_basic_flag : public kmp_flag { +template +class kmp_basic_flag : public kmp_flag { typedef flag_traits traits_type; FlagType checker; /**< Value to compare flag to to check if flag has been released. */ @@ -685,7 +786,12 @@ /*! * @result true if the flag object has been released. */ - bool done_check() { return this->load() == checker; } + bool done_check() { + if (Sleepable) + return (this->load() & ~KMP_BARRIER_SLEEP_STATE) == checker; + else + return this->load() == checker; + } /*! * @param old_loc in old value of flag * @result true if the flag's old value indicates it was released. @@ -736,14 +842,19 @@ enum barrier_type get_bt() { return bs_last_barrier; } }; -class kmp_flag_32 : public kmp_basic_flag { +template +class kmp_flag_32 : public kmp_basic_flag { public: - kmp_flag_32(std::atomic *p) : kmp_basic_flag(p) {} + kmp_flag_32(std::atomic *p) + : kmp_basic_flag(p) {} kmp_flag_32(std::atomic *p, kmp_info_t *thr) - : kmp_basic_flag(p, thr) {} + : kmp_basic_flag(p, thr) {} kmp_flag_32(std::atomic *p, kmp_uint32 c) - : kmp_basic_flag(p, c) {} + : kmp_basic_flag(p, c) {} void suspend(int th_gtid) { __kmp_suspend_32(th_gtid, this); } +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT + void mwait(int th_gtid) { __kmp_mwait_32(th_gtid, this); } +#endif void resume(int th_gtid) { __kmp_resume_32(th_gtid, this); } int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), @@ -752,27 +863,32 @@ this_thr, gtid, this, final_spin, thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); } - void wait(kmp_info_t *this_thr, + bool wait(kmp_info_t *this_thr, int final_spin USE_ITT_BUILD_ARG(void *itt_sync_obj)) { if (final_spin) - __kmp_wait_template( + return __kmp_wait_template( this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj)); else - __kmp_wait_template( + return __kmp_wait_template( this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj)); } void release() { __kmp_release_template(this); } flag_type get_ptr_type() { return flag32; } }; -class kmp_flag_64 : public kmp_basic_flag_native { +template +class kmp_flag_64 : public kmp_basic_flag_native { public: - kmp_flag_64(volatile kmp_uint64 *p) : kmp_basic_flag_native(p) {} + kmp_flag_64(volatile kmp_uint64 *p) + : kmp_basic_flag_native(p) {} kmp_flag_64(volatile kmp_uint64 *p, kmp_info_t *thr) - : kmp_basic_flag_native(p, thr) {} + : kmp_basic_flag_native(p, thr) {} kmp_flag_64(volatile kmp_uint64 *p, kmp_uint64 c) - : kmp_basic_flag_native(p, c) {} + : kmp_basic_flag_native(p, c) {} void suspend(int th_gtid) { __kmp_suspend_64(th_gtid, this); } +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT + void mwait(int th_gtid) { __kmp_mwait_64(th_gtid, this); } +#endif void resume(int th_gtid) { __kmp_resume_64(th_gtid, this); } int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), @@ -781,27 +897,15 @@ this_thr, gtid, this, final_spin, thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); } - void wait(kmp_info_t *this_thr, + bool wait(kmp_info_t *this_thr, int final_spin USE_ITT_BUILD_ARG(void *itt_sync_obj)) { if (final_spin) - __kmp_wait_template( + return __kmp_wait_template( this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj)); else - __kmp_wait_template( + return __kmp_wait_template( this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj)); } - bool wait_cancellable_nosleep(kmp_info_t *this_thr, - int final_spin - USE_ITT_BUILD_ARG(void *itt_sync_obj)) { - bool retval = false; - if (final_spin) - retval = __kmp_wait_template( - this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj)); - else - retval = __kmp_wait_template( - this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj)); - return retval; - } void release() { __kmp_release_template(this); } flag_type get_ptr_type() { return flag64; } }; @@ -859,8 +963,8 @@ return true; else if (flag_switch) { this_thr->th.th_bar[bt].bb.wait_flag = KMP_BARRIER_SWITCHING; - kmp_flag_64 flag(&this_thr->th.th_bar[bt].bb.b_go, - (kmp_uint64)KMP_BARRIER_STATE_BUMP); + kmp_flag_64<> flag(&this_thr->th.th_bar[bt].bb.b_go, + (kmp_uint64)KMP_BARRIER_STATE_BUMP); __kmp_wait_64(this_thr, &flag, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); } return false; @@ -896,6 +1000,9 @@ } void release() { __kmp_release_template(this); } void suspend(int th_gtid) { __kmp_suspend_oncore(th_gtid, this); } +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT + void mwait(int th_gtid) { __kmp_mwait_oncore(th_gtid, this); } +#endif void resume(int th_gtid) { __kmp_resume_oncore(th_gtid, this); } int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), @@ -915,15 +1022,15 @@ if (!flag) return; - switch (RCAST(kmp_flag_64 *, CCAST(void *, flag))->get_type()) { + switch (RCAST(kmp_flag_64<> *, CCAST(void *, flag))->get_type()) { case flag32: - __kmp_resume_32(gtid, NULL); + __kmp_resume_32(gtid, (kmp_flag_32<> *)NULL); break; case flag64: - __kmp_resume_64(gtid, NULL); + __kmp_resume_64(gtid, (kmp_flag_64<> *)NULL); break; case flag_oncore: - __kmp_resume_oncore(gtid, NULL); + __kmp_resume_oncore(gtid, (kmp_flag_oncore *)NULL); break; } } diff --git a/openmp/runtime/src/kmp_wait_release.cpp b/openmp/runtime/src/kmp_wait_release.cpp --- a/openmp/runtime/src/kmp_wait_release.cpp +++ b/openmp/runtime/src/kmp_wait_release.cpp @@ -12,14 +12,32 @@ #include "kmp_wait_release.h" -void __kmp_wait_64(kmp_info_t *this_thr, kmp_flag_64 *flag, +void __kmp_wait_64(kmp_info_t *this_thr, kmp_flag_64<> *flag, int final_spin USE_ITT_BUILD_ARG(void *itt_sync_obj)) { if (final_spin) - __kmp_wait_template( + __kmp_wait_template, TRUE>( this_thr, flag USE_ITT_BUILD_ARG(itt_sync_obj)); else - __kmp_wait_template( + __kmp_wait_template, FALSE>( this_thr, flag USE_ITT_BUILD_ARG(itt_sync_obj)); } -void __kmp_release_64(kmp_flag_64 *flag) { __kmp_release_template(flag); } +void __kmp_release_64(kmp_flag_64<> *flag) { __kmp_release_template(flag); } + +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT +template +void __kmp_mwait_32(int th_gtid, kmp_flag_32 *flag) { + __kmp_mwait_template(th_gtid, flag); +} +template +void __kmp_mwait_64(int th_gtid, kmp_flag_64 *flag) { + __kmp_mwait_template(th_gtid, flag); +} +void __kmp_mwait_oncore(int th_gtid, kmp_flag_oncore *flag) { + __kmp_mwait_template(th_gtid, flag); +} + +template void __kmp_mwait_32(int, kmp_flag_32 *); +template void __kmp_mwait_64(int, kmp_flag_64 *); +template void __kmp_mwait_64(int, kmp_flag_64 *); +#endif diff --git a/openmp/runtime/src/z_Linux_util.cpp b/openmp/runtime/src/z_Linux_util.cpp --- a/openmp/runtime/src/z_Linux_util.cpp +++ b/openmp/runtime/src/z_Linux_util.cpp @@ -1459,8 +1459,7 @@ __kmp_suspend_initialize_thread(th); - status = pthread_mutex_lock(&th->th.th_suspend_mx.m_mutex); - KMP_CHECK_SYSFAIL("pthread_mutex_lock", status); + __kmp_lock_suspend_mx(th); KF_TRACE(10, ("__kmp_suspend_template: T#%d setting sleep bit for spin(%p)\n", th_gtid, flag->get())); @@ -1471,8 +1470,7 @@ if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && __kmp_pause_status != kmp_soft_paused) { flag->unset_sleeping(); - status = pthread_mutex_unlock(&th->th.th_suspend_mx.m_mutex); - KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status); + __kmp_unlock_suspend_mx(th); return; } KF_TRACE(5, ("__kmp_suspend_template: T#%d set sleep bit for spin(%p)==%x," @@ -1535,7 +1533,7 @@ th_gtid)); status = pthread_cond_wait(&th->th.th_suspend_cv.c_cond, &th->th.th_suspend_mx.m_mutex); -#endif +#endif // USE_SUSPEND_TIMEOUT if ((status != 0) && (status != EINTR) && (status != ETIMEDOUT)) { KMP_SYSFAIL("pthread_cond_wait", status); @@ -1575,21 +1573,26 @@ } #endif - status = pthread_mutex_unlock(&th->th.th_suspend_mx.m_mutex); - KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status); + __kmp_unlock_suspend_mx(th); KF_TRACE(30, ("__kmp_suspend_template: T#%d exit\n", th_gtid)); } -void __kmp_suspend_32(int th_gtid, kmp_flag_32 *flag) { +template +void __kmp_suspend_32(int th_gtid, kmp_flag_32 *flag) { __kmp_suspend_template(th_gtid, flag); } -void __kmp_suspend_64(int th_gtid, kmp_flag_64 *flag) { +template +void __kmp_suspend_64(int th_gtid, kmp_flag_64 *flag) { __kmp_suspend_template(th_gtid, flag); } void __kmp_suspend_oncore(int th_gtid, kmp_flag_oncore *flag) { __kmp_suspend_template(th_gtid, flag); } +template void __kmp_suspend_32(int, kmp_flag_32 *); +template void __kmp_suspend_64(int, kmp_flag_64 *); +template void __kmp_suspend_64(int, kmp_flag_64 *); + /* This routine signals the thread specified by target_gtid to wake up after setting the sleep bit indicated by the flag argument to FALSE. The target thread must already have called __kmp_suspend_template() */ @@ -1608,9 +1611,7 @@ KMP_DEBUG_ASSERT(gtid != target_gtid); __kmp_suspend_initialize_thread(th); - - status = pthread_mutex_lock(&th->th.th_suspend_mx.m_mutex); - KMP_CHECK_SYSFAIL("pthread_mutex_lock", status); + __kmp_lock_suspend_mx(th); if (!flag) { // coming from __kmp_null_resume_wrapper flag = (C *)CCAST(void *, th->th.th_sleep_loc); @@ -1619,13 +1620,11 @@ // First, check if the flag is null or its type has changed. If so, someone // else woke it up. if (!flag || flag->get_type() != flag->get_ptr_type()) { // get_ptr_type - // simply shows what - // flag was cast to + // simply shows what flag was cast to KF_TRACE(5, ("__kmp_resume_template: T#%d exiting, thread T#%d already " "awake: flag(%p)\n", gtid, target_gtid, NULL)); - status = pthread_mutex_unlock(&th->th.th_suspend_mx.m_mutex); - KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status); + __kmp_unlock_suspend_mx(th); return; } else { // if multiple threads are sleeping, flag should be internally // referring to a specific thread here @@ -1635,8 +1634,7 @@ "awake: flag(%p): " "%u => %u\n", gtid, target_gtid, flag->get(), old_spin, flag->load())); - status = pthread_mutex_unlock(&th->th.th_suspend_mx.m_mutex); - KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status); + __kmp_unlock_suspend_mx(th); return; } KF_TRACE(5, ("__kmp_resume_template: T#%d about to wakeup T#%d, reset " @@ -1656,23 +1654,27 @@ #endif status = pthread_cond_signal(&th->th.th_suspend_cv.c_cond); KMP_CHECK_SYSFAIL("pthread_cond_signal", status); - status = pthread_mutex_unlock(&th->th.th_suspend_mx.m_mutex); - KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status); + __kmp_unlock_suspend_mx(th); KF_TRACE(30, ("__kmp_resume_template: T#%d exiting after signaling wake up" " for T#%d\n", gtid, target_gtid)); } -void __kmp_resume_32(int target_gtid, kmp_flag_32 *flag) { +template +void __kmp_resume_32(int target_gtid, kmp_flag_32 *flag) { __kmp_resume_template(target_gtid, flag); } -void __kmp_resume_64(int target_gtid, kmp_flag_64 *flag) { +template +void __kmp_resume_64(int target_gtid, kmp_flag_64 *flag) { __kmp_resume_template(target_gtid, flag); } void __kmp_resume_oncore(int target_gtid, kmp_flag_oncore *flag) { __kmp_resume_template(target_gtid, flag); } +template void __kmp_resume_32(int, kmp_flag_32 *); +template void __kmp_resume_64(int, kmp_flag_64 *); + #if KMP_USE_MONITOR void __kmp_resume_monitor() { KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_resume); diff --git a/openmp/runtime/src/z_Windows_NT_util.cpp b/openmp/runtime/src/z_Windows_NT_util.cpp --- a/openmp/runtime/src/z_Windows_NT_util.cpp +++ b/openmp/runtime/src/z_Windows_NT_util.cpp @@ -363,7 +363,7 @@ th_gtid, flag->get())); __kmp_suspend_initialize_thread(th); - __kmp_win32_mutex_lock(&th->th.th_suspend_mx); + __kmp_lock_suspend_mx(th); KF_TRACE(10, ("__kmp_suspend_template: T#%d setting sleep bit for flag's" " loc(%p)\n", @@ -375,7 +375,7 @@ if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && __kmp_pause_status != kmp_soft_paused) { flag->unset_sleeping(); - __kmp_win32_mutex_unlock(&th->th.th_suspend_mx); + __kmp_unlock_suspend_mx(th); return; } @@ -437,21 +437,26 @@ } } - __kmp_win32_mutex_unlock(&th->th.th_suspend_mx); - + __kmp_unlock_suspend_mx(th); KF_TRACE(30, ("__kmp_suspend_template: T#%d exit\n", th_gtid)); } -void __kmp_suspend_32(int th_gtid, kmp_flag_32 *flag) { +template +void __kmp_suspend_32(int th_gtid, kmp_flag_32 *flag) { __kmp_suspend_template(th_gtid, flag); } -void __kmp_suspend_64(int th_gtid, kmp_flag_64 *flag) { +template +void __kmp_suspend_64(int th_gtid, kmp_flag_64 *flag) { __kmp_suspend_template(th_gtid, flag); } void __kmp_suspend_oncore(int th_gtid, kmp_flag_oncore *flag) { __kmp_suspend_template(th_gtid, flag); } +template void __kmp_suspend_32(int, kmp_flag_32 *); +template void __kmp_suspend_64(int, kmp_flag_64 *); +template void __kmp_suspend_64(int, kmp_flag_64 *); + /* This routine signals the thread specified by target_gtid to wake up after setting the sleep bit indicated by the flag argument to FALSE */ template @@ -467,7 +472,7 @@ gtid, target_gtid)); __kmp_suspend_initialize_thread(th); - __kmp_win32_mutex_lock(&th->th.th_suspend_mx); + __kmp_lock_suspend_mx(th); if (!flag) { // coming from __kmp_null_resume_wrapper flag = (C *)th->th.th_sleep_loc; @@ -481,7 +486,7 @@ KF_TRACE(5, ("__kmp_resume_template: T#%d exiting, thread T#%d already " "awake: flag's loc(%p)\n", gtid, target_gtid, NULL)); - __kmp_win32_mutex_unlock(&th->th.th_suspend_mx); + __kmp_unlock_suspend_mx(th); return; } else { typename C::flag_t old_spin = flag->unset_sleeping(); @@ -489,7 +494,7 @@ KF_TRACE(5, ("__kmp_resume_template: T#%d exiting, thread T#%d already " "awake: flag's loc(%p): %u => %u\n", gtid, target_gtid, flag->get(), old_spin, *(flag->get()))); - __kmp_win32_mutex_unlock(&th->th.th_suspend_mx); + __kmp_unlock_suspend_mx(th); return; } } @@ -499,23 +504,28 @@ gtid, target_gtid, flag->get())); __kmp_win32_cond_signal(&th->th.th_suspend_cv); - __kmp_win32_mutex_unlock(&th->th.th_suspend_mx); + __kmp_unlock_suspend_mx(th); KF_TRACE(30, ("__kmp_resume_template: T#%d exiting after signaling wake up" " for T#%d\n", gtid, target_gtid)); } -void __kmp_resume_32(int target_gtid, kmp_flag_32 *flag) { +template +void __kmp_resume_32(int target_gtid, kmp_flag_32 *flag) { __kmp_resume_template(target_gtid, flag); } -void __kmp_resume_64(int target_gtid, kmp_flag_64 *flag) { +template +void __kmp_resume_64(int target_gtid, kmp_flag_64 *flag) { __kmp_resume_template(target_gtid, flag); } void __kmp_resume_oncore(int target_gtid, kmp_flag_oncore *flag) { __kmp_resume_template(target_gtid, flag); } +template void __kmp_resume_32(int, kmp_flag_32 *); +template void __kmp_resume_64(int, kmp_flag_64 *); + void __kmp_yield() { Sleep(0); } void __kmp_gtid_set_specific(int gtid) {