Index: openmp/runtime/cmake/config-ix.cmake =================================================================== --- openmp/runtime/cmake/config-ix.cmake +++ openmp/runtime/cmake/config-ix.cmake @@ -10,6 +10,7 @@ include(CheckCCompilerFlag) include(CheckCSourceCompiles) +include(CheckCXXSourceCompiles) include(CheckCXXCompilerFlag) include(CheckIncludeFile) include(CheckLibraryExists) @@ -141,6 +142,53 @@ endif() endif() +# Checking for x86-specific waitpkg and rtm attribute and intrinsics +if (IA32 OR INTEL64) + check_include_file(immintrin.h LIBOMP_HAVE_IMMINTRIN_H) + if (NOT LIBOMP_HAVE_IMMINTRIN_H) + check_include_file(intrin.h LIBOMP_HAVE_INTRIN_H) + endif() + check_cxx_source_compiles("__attribute__((target(\"rtm\"))) + int main() {return 0;}" LIBOMP_HAVE_ATTRIBUTE_RTM) + check_cxx_source_compiles("__attribute__((target(\"waitpkg\"))) + int main() {return 0;}" LIBOMP_HAVE_ATTRIBUTE_WAITPKG) + libomp_append(CMAKE_REQUIRED_DEFINITIONS -DIMMINTRIN_H LIBOMP_HAVE_IMMINTRIN_H) + libomp_append(CMAKE_REQUIRED_DEFINITIONS -DINTRIN_H LIBOMP_HAVE_INTRIN_H) + libomp_append(CMAKE_REQUIRED_DEFINITIONS -DATTRIBUTE_WAITPKG LIBOMP_HAVE_ATTRIBUTE_WAITPKG) + libomp_append(CMAKE_REQUIRED_DEFINITIONS -DATTRIBUTE_RTM LIBOMP_HAVE_ATTRIBUTE_RTM) + set(source_code "// check for attribute and wait pkg intrinsics + #ifdef IMMINTRIN_H + #include + #endif + #ifdef INTRIN_H + #include + #endif + #ifdef ATTRIBUTE_WAITPKG + __attribute__((target(\"waitpkg\"))) + #endif + static inline int __kmp_umwait(unsigned hint, unsigned long long counter) { + return _umwait(hint, counter); + } + int main() { int a = __kmp_umwait(0, 1000); return a; }") + check_cxx_source_compiles("${source_code}" LIBOMP_HAVE_WAITPKG_INTRINSICS) + set(source_code "// check for attribute rtm and rtm intrinsics + #ifdef IMMINTRIN_H + #include + #endif + #ifdef INTRIN_H + #include + #endif + #ifdef ATTRIBUTE_RTM + __attribute__((target(\"rtm\"))) + #endif + static inline int __kmp_xbegin() { + return _xbegin(); + } + int main() { int a = __kmp_xbegin(); return a; }") + check_cxx_source_compiles("${source_code}" LIBOMP_HAVE_RTM_INTRINSICS) + set(CMAKE_REQUIRED_DEFINITIONS) +endif() + # Find perl executable # Perl is used to create omp.h (and other headers) along with kmp_i18n_id.inc and kmp_i18n_default.inc find_package(Perl REQUIRED) Index: openmp/runtime/src/i18n/en_US.txt =================================================================== --- openmp/runtime/src/i18n/en_US.txt +++ openmp/runtime/src/i18n/en_US.txt @@ -417,6 +417,8 @@ AffIgnoringHwloc "%1$s: Ignoring hwloc mechanism." AffHwlocErrorOccurred "%1$s: Hwloc failed in %2$s. Relying on internal affinity mechanisms." EnvSerialWarn "%1$s must be set prior to OpenMP runtime library initialization; ignored." +EnvMwaitWarn "You have enabled the use of umonitor/umwait. If the CPU doesn't have that enabled " + "you'll get an illegal instruction exception." EnvVarDeprecated "%1$s variable deprecated, please use %2$s instead." RedMethodNotSupported "KMP_FORCE_REDUCTION: %1$s method is not supported; using critical." AffHWSubsetNoHWLOC "KMP_HW_SUBSET ignored: unsupported item requested for non-HWLOC topology method (KMP_TOPOLOGY_METHOD)" Index: openmp/runtime/src/kmp.h =================================================================== --- openmp/runtime/src/kmp.h +++ openmp/runtime/src/kmp.h @@ -255,6 +255,10 @@ typedef union kmp_info kmp_info_p; typedef union kmp_root kmp_root_p; +template class kmp_flag_32; +template class kmp_flag_64; +class kmp_flag_oncore; + #ifdef __cplusplus extern "C" { #endif @@ -1318,6 +1322,84 @@ } \ } +// User-level Monitor/Mwait +#if KMP_HAVE_UMWAIT +// We always try for UMWAIT first +#if KMP_HAVE_WAITPKG_INTRINSICS +#if KMP_HAVE_IMMINTRIN_H +#include +#elif KMP_HAVE_INTRIN_H +#include +#endif +#endif // KMP_HAVE_WAITPKG_INTRINSICS +KMP_ATTRIBUTE_TARGET_WAITPKG +static inline int +__kmp_tpause(uint32_t hint, uint64_t counter) { +#if !KMP_HAVE_WAITPKG_INTRINSICS + uint32_t timeHi = uint32_t(counter >> 32); + uint32_t timeLo = uint32_t(counter & 0xffffffff); + char flag; + __asm__ volatile("#tpause\n.byte 0x66, 0x0F, 0xAE, 0xF1\n" + "setb %0" + : "=r"(flag) + : "a"(timeLo), "d"(timeHi), "c"(hint) + :); + return flag; +#else + return _tpause(hint, counter); +#endif +} +KMP_ATTRIBUTE_TARGET_WAITPKG +static inline void +__kmp_umonitor(void *cacheline) { +#if !KMP_HAVE_WAITPKG_INTRINSICS + __asm__ volatile("# umonitor\n.byte 0xF3, 0x0F, 0xAE, 0x01 " + : + : "a"(cacheline) + :); +#else + _umonitor(cacheline); +#endif +} +KMP_ATTRIBUTE_TARGET_WAITPKG +static inline int +__kmp_umwait(uint32_t hint, uint64_t counter) { +#if !KMP_HAVE_WAITPKG_INTRINSICS + uint32_t timeHi = uint32_t(counter >> 32); + uint32_t timeLo = uint32_t(counter & 0xffffffff); + char flag; + __asm__ volatile("#umwait\n.byte 0xF2, 0x0F, 0xAE, 0xF1\n" + "setb %0" + : "=r"(flag) + : "a"(timeLo), "d"(timeHi), "c"(hint) + :); + return flag; +#else + return _umwait(hint, counter); +#endif +} +#elif KMP_HAVE_MWAIT +#if KMP_OS_UNIX +#include +#else +#include +#endif +#if KMP_OS_UNIX +__attribute__((target("sse3"))) +#endif +static inline void +__kmp_mm_monitor(void *cacheline, unsigned extensions, unsigned hints) { + _mm_monitor(cacheline, extensions, hints); +} +#if KMP_OS_UNIX +__attribute__((target("sse3"))) +#endif +static inline void +__kmp_mm_mwait(unsigned extensions, unsigned hints) { + _mm_mwait(extensions, hints); +} +#endif // KMP_HAVE_UMWAIT + /* ------------------------------------------------------------------------ */ /* Support datatypes for the orphaned construct nesting checks. */ /* ------------------------------------------------------------------------ */ @@ -3094,6 +3176,13 @@ KMP_FATAL(ThreadIdentInvalid); } +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT +extern int __kmp_user_level_mwait; // TRUE or FALSE; from KMP_USER_LEVEL_MWAIT +extern int __kmp_umwait_enabled; // Runtime check if user-level mwait enabled +extern int __kmp_mwait_enabled; // Runtime check if ring3 mwait is enabled +extern int __kmp_mwait_hints; // Hints to pass in to mwait +#endif + /* ------------------------------------------------------------------------- */ extern kmp_global_t __kmp_global; /* global status */ @@ -3295,17 +3384,14 @@ extern void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32), void *obj); -class kmp_flag_32; -class kmp_flag_64; -class kmp_flag_oncore; -extern void __kmp_wait_64(kmp_info_t *this_thr, kmp_flag_64 *flag, +extern void __kmp_wait_64(kmp_info_t *this_thr, kmp_flag_64<> *flag, int final_spin #if USE_ITT_BUILD , void *itt_sync_obj #endif ); -extern void __kmp_release_64(kmp_flag_64 *flag); +extern void __kmp_release_64(kmp_flag_64<> *flag); extern void __kmp_infinite_loop(void); @@ -3403,13 +3489,6 @@ extern void __kmp_lock_suspend_mx(kmp_info_t *th); extern void __kmp_unlock_suspend_mx(kmp_info_t *th); -extern void __kmp_suspend_32(int th_gtid, kmp_flag_32 *flag); -extern void __kmp_suspend_64(int th_gtid, kmp_flag_64 *flag); -extern void __kmp_suspend_oncore(int th_gtid, kmp_flag_oncore *flag); -extern void __kmp_resume_32(int target_gtid, kmp_flag_32 *flag); -extern void __kmp_resume_64(int target_gtid, kmp_flag_64 *flag); -extern void __kmp_resume_oncore(int target_gtid, kmp_flag_oncore *flag); - extern void __kmp_elapsed(double *); extern void __kmp_elapsed_tick(double *); @@ -3534,28 +3613,6 @@ kmp_task_t *task); extern void __kmp_fulfill_event(kmp_event_t *event); -int __kmp_execute_tasks_32(kmp_info_t *thread, kmp_int32 gtid, - kmp_flag_32 *flag, int final_spin, - int *thread_finished, -#if USE_ITT_BUILD - void *itt_sync_obj, -#endif /* USE_ITT_BUILD */ - kmp_int32 is_constrained); -int __kmp_execute_tasks_64(kmp_info_t *thread, kmp_int32 gtid, - kmp_flag_64 *flag, int final_spin, - int *thread_finished, -#if USE_ITT_BUILD - void *itt_sync_obj, -#endif /* USE_ITT_BUILD */ - kmp_int32 is_constrained); -int __kmp_execute_tasks_oncore(kmp_info_t *thread, kmp_int32 gtid, - kmp_flag_oncore *flag, int final_spin, - int *thread_finished, -#if USE_ITT_BUILD - void *itt_sync_obj, -#endif /* USE_ITT_BUILD */ - kmp_int32 is_constrained); - extern void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team); extern void __kmp_reap_task_teams(void); @@ -3919,4 +3976,46 @@ } #endif +template +extern void __kmp_suspend_32(int th_gtid, kmp_flag_32 *flag); +template +extern void __kmp_suspend_64(int th_gtid, kmp_flag_64 *flag); +extern void __kmp_suspend_oncore(int th_gtid, kmp_flag_oncore *flag); +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT +template +extern void __kmp_mwait_32(int th_gtid, kmp_flag_32 *flag); +template +extern void __kmp_mwait_64(int th_gtid, kmp_flag_64 *flag); +extern void __kmp_mwait_oncore(int th_gtid, kmp_flag_oncore *flag); +#endif +template +extern void __kmp_resume_32(int target_gtid, kmp_flag_32 *flag); +template +extern void __kmp_resume_64(int target_gtid, kmp_flag_64 *flag); +extern void __kmp_resume_oncore(int target_gtid, kmp_flag_oncore *flag); + +template +int __kmp_execute_tasks_32(kmp_info_t *thread, kmp_int32 gtid, + kmp_flag_32 *flag, int final_spin, + int *thread_finished, +#if USE_ITT_BUILD + void *itt_sync_obj, +#endif /* USE_ITT_BUILD */ + kmp_int32 is_constrained); +template +int __kmp_execute_tasks_64(kmp_info_t *thread, kmp_int32 gtid, + kmp_flag_64 *flag, int final_spin, + int *thread_finished, +#if USE_ITT_BUILD + void *itt_sync_obj, +#endif /* USE_ITT_BUILD */ + kmp_int32 is_constrained); +int __kmp_execute_tasks_oncore(kmp_info_t *thread, kmp_int32 gtid, + kmp_flag_oncore *flag, int final_spin, + int *thread_finished, +#if USE_ITT_BUILD + void *itt_sync_obj, +#endif /* USE_ITT_BUILD */ + kmp_int32 is_constrained); + #endif /* KMP_H */ Index: openmp/runtime/src/kmp_barrier.cpp =================================================================== --- openmp/runtime/src/kmp_barrier.cpp +++ openmp/runtime/src/kmp_barrier.cpp @@ -78,7 +78,7 @@ is valid any more - it could be deallocated by the master thread at any time. */ ANNOTATE_BARRIER_BEGIN(this_thr); - kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]); + kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]); flag.release(); } else { kmp_balign_team_t *team_bar = &team->t.t_bar[bt]; @@ -101,14 +101,14 @@ &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state)); // Wait for worker thread to arrive - kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, - new_state); if (cancellable) { - bool cancelled = flag.wait_cancellable_nosleep( - this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); - if (cancelled) + kmp_flag_64 flag( + &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state); + if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj))) return true; } else { + kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, + new_state); flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); } ANNOTATE_BARRIER_END(other_threads[i]); @@ -203,7 +203,7 @@ other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP)); ANNOTATE_BARRIER_BEGIN(other_threads[i]); - kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, + kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]); flag.release(); } @@ -211,14 +211,12 @@ } else { // Wait for the MASTER thread to release us KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n", gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); - kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); if (cancellable) { - bool cancelled = flag.wait_cancellable_nosleep( - this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); - if (cancelled) { + kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); + if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj))) return true; - } } else { + kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); } ANNOTATE_BARRIER_END(this_thr); @@ -339,7 +337,7 @@ gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid, &child_bar->b_arrived, new_state)); // Wait for child to arrive - kmp_flag_64 flag(&child_bar->b_arrived, new_state); + kmp_flag_64<> flag(&child_bar->b_arrived, new_state); flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); ANNOTATE_BARRIER_END(child_thr); #if USE_ITT_BUILD && USE_ITT_NOTIFY @@ -384,7 +382,7 @@ is valid any more - it could be deallocated by the master thread at any time. */ ANNOTATE_BARRIER_BEGIN(this_thr); - kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]); + kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]); flag.release(); } else { // Need to update the team arrived pointer if we are the master thread @@ -420,7 +418,7 @@ KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); // Wait for parent thread to release us - kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); + kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); ANNOTATE_BARRIER_END(this_thr); #if USE_ITT_BUILD && USE_ITT_NOTIFY @@ -498,7 +496,7 @@ child_bar->b_go + KMP_BARRIER_STATE_BUMP)); // Release child from barrier ANNOTATE_BARRIER_BEGIN(child_thr); - kmp_flag_64 flag(&child_bar->b_go, child_thr); + kmp_flag_64<> flag(&child_bar->b_go, child_thr); flag.release(); child++; child_tid++; @@ -540,7 +538,7 @@ #endif /* Perform a hypercube-embedded tree gather to wait until all of the threads have arrived, and reduce any required data as we go. */ - kmp_flag_64 p_flag(&thr_bar->b_arrived); + kmp_flag_64<> p_flag(&thr_bar->b_arrived); for (level = 0, offset = 1; offset < num_threads; level += branch_bits, offset <<= branch_bits) { kmp_uint32 child; @@ -588,7 +586,7 @@ gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid, &child_bar->b_arrived, new_state)); // Wait for child to arrive - kmp_flag_64 c_flag(&child_bar->b_arrived, new_state); + kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state); c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); ANNOTATE_BARRIER_END(child_thr); KMP_MB(); // Synchronize parent and child threads. @@ -670,7 +668,7 @@ KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); // Wait for parent thread to release us - kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); + kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); ANNOTATE_BARRIER_END(this_thr); #if USE_ITT_BUILD && USE_ITT_NOTIFY @@ -772,7 +770,7 @@ child_bar->b_go + KMP_BARRIER_STATE_BUMP)); // Release child from barrier ANNOTATE_BARRIER_BEGIN(child_thr); - kmp_flag_64 flag(&child_bar->b_go, child_thr); + kmp_flag_64<> flag(&child_bar->b_go, child_thr); flag.release(); } } @@ -917,7 +915,7 @@ KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting " "for leaf kids\n", gtid, team->t.t_id, tid)); - kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state); + kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state); flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); if (reduce) { ANNOTATE_REDUCE_AFTER(reduce); @@ -957,7 +955,7 @@ gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid, &child_bar->b_arrived, new_state)); - kmp_flag_64 flag(&child_bar->b_arrived, new_state); + kmp_flag_64<> flag(&child_bar->b_arrived, new_state); flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); ANNOTATE_BARRIER_END(child_thr); if (reduce) { @@ -990,7 +988,7 @@ gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid, &child_bar->b_arrived, new_state)); - kmp_flag_64 flag(&child_bar->b_arrived, new_state); + kmp_flag_64<> flag(&child_bar->b_arrived, new_state); flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); ANNOTATE_BARRIER_END(child_thr); if (reduce) { @@ -1025,7 +1023,8 @@ !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived // flag; release it ANNOTATE_BARRIER_BEGIN(this_thr); - kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]); + kmp_flag_64<> flag(&thr_bar->b_arrived, + other_threads[thr_bar->parent_tid]); flag.release(); } else { // Leaf does special release on "offset" bits of parent's b_arrived flag @@ -1069,7 +1068,7 @@ thr_bar->team == NULL) { // Use traditional method of waiting on my own b_go flag thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG; - kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); + kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); ANNOTATE_BARRIER_END(this_thr); TCW_8(thr_bar->b_go, @@ -1218,7 +1217,7 @@ child_bar->b_go + KMP_BARRIER_STATE_BUMP)); // Release child using child's b_go flag ANNOTATE_BARRIER_BEGIN(child_thr); - kmp_flag_64 flag(&child_bar->b_go, child_thr); + kmp_flag_64<> flag(&child_bar->b_go, child_thr); flag.release(); } } else { // Release all children at once with leaf_state bits on my own @@ -1244,7 +1243,7 @@ child_bar->b_go + KMP_BARRIER_STATE_BUMP)); // Release child using child's b_go flag ANNOTATE_BARRIER_BEGIN(child_thr); - kmp_flag_64 flag(&child_bar->b_go, child_thr); + kmp_flag_64<> flag(&child_bar->b_go, child_thr); flag.release(); } } Index: openmp/runtime/src/kmp_config.h.cmake =================================================================== --- openmp/runtime/src/kmp_config.h.cmake +++ openmp/runtime/src/kmp_config.h.cmake @@ -70,6 +70,18 @@ #endif #cmakedefine01 MSVC #define KMP_MSVC_COMPAT MSVC +#cmakedefine01 LIBOMP_HAVE_WAITPKG_INTRINSICS +#define KMP_HAVE_WAITPKG_INTRINSICS LIBOMP_HAVE_WAITPKG_INTRINSICS +#cmakedefine01 LIBOMP_HAVE_RTM_INTRINSICS +#define KMP_HAVE_RTM_INTRINSICS LIBOMP_HAVE_RTM_INTRINSICS +#cmakedefine01 LIBOMP_HAVE_IMMINTRIN_H +#define KMP_HAVE_IMMINTRIN_H LIBOMP_HAVE_IMMINTRIN_H +#cmakedefine01 LIBOMP_HAVE_INTRIN_H +#define KMP_HAVE_INTRIN_H LIBOMP_HAVE_INTRIN_H +#cmakedefine01 LIBOMP_HAVE_ATTRIBUTE_WAITPKG +#define KMP_HAVE_ATTRIBUTE_WAITPKG LIBOMP_HAVE_ATTRIBUTE_WAITPKG +#cmakedefine01 LIBOMP_HAVE_ATTRIBUTE_RTM +#define KMP_HAVE_ATTRIBUTE_RTM LIBOMP_HAVE_ATTRIBUTE_RTM // Configured cache line based on architecture #if KMP_ARCH_PPC64 Index: openmp/runtime/src/kmp_global.cpp =================================================================== --- openmp/runtime/src/kmp_global.cpp +++ openmp/runtime/src/kmp_global.cpp @@ -206,6 +206,13 @@ int __kmp_display_env_verbose = FALSE; int __kmp_omp_cancellation = FALSE; +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT +int __kmp_user_level_mwait = FALSE; +int __kmp_umwait_enabled = FALSE; +int __kmp_mwait_enabled = FALSE; +int __kmp_mwait_hints = 0; +#endif + /* map OMP 3.0 schedule types with our internal schedule types */ enum sched_type __kmp_sch_map[kmp_sched_upper - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2] = { Index: openmp/runtime/src/kmp_lock.cpp =================================================================== --- openmp/runtime/src/kmp_lock.cpp +++ openmp/runtime/src/kmp_lock.cpp @@ -1704,11 +1704,7 @@ /* RTM Adaptive locks */ -#if (KMP_COMPILER_ICC && __INTEL_COMPILER >= 1300) || \ - (KMP_COMPILER_MSVC && _MSC_VER >= 1700) || \ - (KMP_COMPILER_CLANG && (KMP_MSVC_COMPAT || __MINGW32__)) || \ - (KMP_COMPILER_GCC && __MINGW32__) - +#if KMP_HAVE_RTM_INTRINSICS #include #define SOFT_ABORT_MASK (_XABORT_RETRY | _XABORT_CONFLICT | _XABORT_EXPLICIT) @@ -2003,6 +1999,7 @@ } // Check whether speculation should be attempted. +KMP_ATTRIBUTE_TARGET_RTM static __inline int __kmp_should_speculate(kmp_adaptive_lock_t *lck, kmp_int32 gtid) { kmp_uint32 badness = lck->lk.adaptive.badness; @@ -2013,6 +2010,7 @@ // Attempt to acquire only the speculative lock. // Does not back off to the non-speculative lock. +KMP_ATTRIBUTE_TARGET_RTM static int __kmp_test_adaptive_lock_only(kmp_adaptive_lock_t *lck, kmp_int32 gtid) { int retries = lck->lk.adaptive.max_soft_retries; @@ -2154,6 +2152,7 @@ lck->lk.qlk.owner_id = gtid + 1; } +KMP_ATTRIBUTE_TARGET_RTM static int __kmp_release_adaptive_lock(kmp_adaptive_lock_t *lck, kmp_int32 gtid) { if (__kmp_is_unlocked_queuing_lock(GET_QLK_PTR( @@ -2777,6 +2776,9 @@ __kmp_destroy_queuing_lock_with_checks(lck); } +#if KMP_HAVE_ATTRIBUTE_RTM +__attribute__((target("rtm"))) +#endif static void __kmp_acquire_rtm_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) { unsigned retries = 3, status; do { @@ -2804,6 +2806,7 @@ __kmp_acquire_rtm_lock(lck, gtid); } +KMP_ATTRIBUTE_TARGET_RTM static int __kmp_release_rtm_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) { if (__kmp_is_unlocked_queuing_lock(lck)) { // Releasing from speculation @@ -2820,6 +2823,7 @@ return __kmp_release_rtm_lock(lck, gtid); } +KMP_ATTRIBUTE_TARGET_RTM static int __kmp_test_rtm_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) { unsigned retries = 3, status; do { Index: openmp/runtime/src/kmp_os.h =================================================================== --- openmp/runtime/src/kmp_os.h +++ openmp/runtime/src/kmp_os.h @@ -281,6 +281,16 @@ #define __forceinline __inline #endif +/* Check if the OS/arch can support user-level mwait */ +// All mwait code tests for UMWAIT first, so it should only fall back to ring3 +// MWAIT for KNL. +#define KMP_HAVE_MWAIT \ + ((KMP_ARCH_X86 || KMP_ARCH_X86_64) && (KMP_OS_LINUX || KMP_OS_WINDOWS) && \ + !KMP_MIC2) +#define KMP_HAVE_UMWAIT \ + ((KMP_ARCH_X86 || KMP_ARCH_X86_64) && (KMP_OS_LINUX || KMP_OS_WINDOWS) && \ + !KMP_MIC) + #if KMP_OS_WINDOWS #include @@ -332,6 +342,18 @@ # define KMP_FALLTHROUGH() ((void)0) #endif +#if KMP_HAVE_ATTRIBUTE_WAITPKG +#define KMP_ATTRIBUTE_TARGET_WAITPKG __attribute__((target("waitpkg"))) +#else +#define KMP_ATTRIBUTE_TARGET_WAITPKG /* Nothing */ +#endif + +#if KMP_HAVE_ATTRIBUTE_RTM +#define KMP_ATTRIBUTE_TARGET_RTM __attribute__((target("rtm"))) +#else +#define KMP_ATTRIBUTE_TARGET_RTM /* Nothing */ +#endif + // Define attribute that indicates a function does not return #if __cplusplus >= 201103L #define KMP_NORETURN [[noreturn]] Index: openmp/runtime/src/kmp_runtime.cpp =================================================================== --- openmp/runtime/src/kmp_runtime.cpp +++ openmp/runtime/src/kmp_runtime.cpp @@ -5458,7 +5458,7 @@ } #endif // first check if thread is sleeping - kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th); + kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th); if (fl.is_sleeping()) fl.resume(__kmp_gtid_from_thread(th)); KMP_CPU_PAUSE(); @@ -5885,7 +5885,8 @@ /* Need release fence here to prevent seg faults for tree forkjoin barrier * (GEH) */ ANNOTATE_HAPPENS_BEFORE(thread); - kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread); + kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, + thread); __kmp_release_64(&flag); } @@ -6579,6 +6580,48 @@ #endif /* KMP_MIC_SUPPORTED */ +#if KMP_HAVE_UMWAIT +static void __kmp_user_level_mwait_init() { + struct kmp_cpuid buf; + __kmp_x86_cpuid(7, 0, &buf); + __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait; + KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n", + __kmp_umwait_enabled)); +} +#elif KMP_HAVE_MWAIT +#ifndef AT_INTELPHIUSERMWAIT +// Spurious, non-existent value that should always fail to return anything. +// Will be replaced with the correct value when we know that. +#define AT_INTELPHIUSERMWAIT 10000 +#endif +// getauxval() function is available in RHEL7 and SLES12. If a system with an +// earlier OS is used to build the RTL, we'll use the following internal +// function when the entry is not found. +unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL; +unsigned long getauxval(unsigned long) { return 0; } + +static void __kmp_user_level_mwait_init() { + // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available + // use them to find if the user-level mwait is enabled. Otherwise, forcibly + // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable + // KMP_USER_LEVEL_MWAIT was set to TRUE. + if (__kmp_mic_type == mic3) { + unsigned long res = getauxval(AT_INTELPHIUSERMWAIT); + if ((res & 0x1) || __kmp_user_level_mwait) { + __kmp_mwait_enabled = TRUE; + if (__kmp_user_level_mwait) { + KMP_INFORM(EnvMwaitWarn); + } + } else { + __kmp_mwait_enabled = FALSE; + } + } + KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, " + "__kmp_mwait_enabled = %d\n", + __kmp_mic_type, __kmp_mwait_enabled)); +} +#endif /* KMP_HAVE_UMWAIT */ + static void __kmp_do_serial_initialize(void) { int i, gtid; int size; @@ -6753,6 +6796,9 @@ __kmp_env_initialize(NULL); +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT + __kmp_user_level_mwait_init(); +#endif // Print all messages in message catalog for testing purposes. #ifdef KMP_DEBUG char const *val = __kmp_env_get("KMP_DUMP_CATALOG"); @@ -8353,7 +8399,8 @@ for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) { kmp_info_t *thread = __kmp_threads[gtid]; if (thread) { // Wake it if sleeping - kmp_flag_64 fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread); + kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, + thread); if (fl.is_sleeping()) fl.resume(gtid); else if (__kmp_try_suspend_mx(thread)) { // got suspend lock Index: openmp/runtime/src/kmp_settings.cpp =================================================================== --- openmp/runtime/src/kmp_settings.cpp +++ openmp/runtime/src/kmp_settings.cpp @@ -4621,6 +4621,35 @@ __kmp_stg_print_bool(buffer, name, __kmp_enable_task_throttling); } // __kmp_stg_print_task_throttling +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT +// ----------------------------------------------------------------------------- +// KMP_USER_LEVEL_MWAIT + +static void __kmp_stg_parse_user_level_mwait(char const *name, + char const *value, void *data) { + __kmp_stg_parse_bool(name, value, &__kmp_user_level_mwait); +} // __kmp_stg_parse_user_level_mwait + +static void __kmp_stg_print_user_level_mwait(kmp_str_buf_t *buffer, + char const *name, void *data) { + __kmp_stg_print_bool(buffer, name, __kmp_user_level_mwait); +} // __kmp_stg_print_user_level_mwait + +// ----------------------------------------------------------------------------- +// KMP_MWAIT_HINTS + +static void __kmp_stg_parse_mwait_hints(char const *name, char const *value, + void *data) { + __kmp_stg_parse_int(name, value, 0, INT_MAX, &__kmp_mwait_hints); +} // __kmp_stg_parse_mwait_hints + +static void __kmp_stg_print_mwait_hints(kmp_str_buf_t *buffer, char const *name, + void *data) { + __kmp_stg_print_int(buffer, name, __kmp_mwait_hints); +} // __kmp_stg_print_mwait_hints + +#endif // KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT + // ----------------------------------------------------------------------------- // OMP_DISPLAY_ENV @@ -4939,6 +4968,12 @@ __kmp_stg_print_omp_tool_libraries, NULL, 0, 0}, #endif +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT + {"KMP_USER_LEVEL_MWAIT", __kmp_stg_parse_user_level_mwait, + __kmp_stg_print_user_level_mwait, NULL, 0, 0}, + {"KMP_MWAIT_HINTS", __kmp_stg_parse_mwait_hints, + __kmp_stg_print_mwait_hints, NULL, 0, 0}, +#endif {"", NULL, NULL, NULL, 0, 0}}; // settings static int const __kmp_stg_count = Index: openmp/runtime/src/kmp_stats.h =================================================================== --- openmp/runtime/src/kmp_stats.h +++ openmp/runtime/src/kmp_stats.h @@ -258,6 +258,7 @@ macro(KMP_tree_release, 0, arg) \ macro(USER_resume, 0, arg) \ macro(USER_suspend, 0, arg) \ + macro(USER_mwait, 0, arg) \ macro(KMP_allocate_team, 0, arg) \ macro(KMP_setup_icv_copy, 0, arg) \ macro(USER_icv_copy, 0, arg) \ Index: openmp/runtime/src/kmp_taskdeps.cpp =================================================================== --- openmp/runtime/src/kmp_taskdeps.cpp +++ openmp/runtime/src/kmp_taskdeps.cpp @@ -786,7 +786,8 @@ } int thread_finished = FALSE; - kmp_flag_32 flag((std::atomic *)&node.dn.npredecessors, 0U); + kmp_flag_32 flag( + (std::atomic *)&node.dn.npredecessors, 0U); while (node.dn.npredecessors > 0) { flag.execute_tasks(thread, gtid, FALSE, &thread_finished USE_ITT_BUILD_ARG(NULL), Index: openmp/runtime/src/kmp_tasking.cpp =================================================================== --- openmp/runtime/src/kmp_tasking.cpp +++ openmp/runtime/src/kmp_tasking.cpp @@ -931,7 +931,7 @@ #endif // Only need to keep track of count if team parallel and tasking not - // serialized, or task is detachable and event has already been fulfilled + // serialized, or task is detachable and event has already been fulfilled if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) || taskdata->td_flags.detachable == TASK_DETACHABLE) { // Predecrement simulated by "- 1" calculation @@ -1876,9 +1876,10 @@ must_wait = must_wait || (thread->th.th_task_team != NULL && thread->th.th_task_team->tt.tt_found_proxy_tasks); if (must_wait) { - kmp_flag_32 flag(RCAST(std::atomic *, - &(taskdata->td_incomplete_child_tasks)), - 0U); + kmp_flag_32 flag( + RCAST(std::atomic *, + &(taskdata->td_incomplete_child_tasks)), + 0U); while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) { flag.execute_tasks(thread, gtid, FALSE, &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), @@ -1984,7 +1985,7 @@ thread->th.ompt_thread_info.ompt_task_yielded = 1; #endif __kmp_execute_tasks_32( - thread, gtid, NULL, FALSE, + thread, gtid, (kmp_flag_32<> *)NULL, FALSE, &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), __kmp_task_stealing_constraint); #if OMPT_SUPPORT @@ -2512,8 +2513,8 @@ if (!taskdata->td_flags.team_serial || (thread->th.th_task_team != NULL && thread->th.th_task_team->tt.tt_found_proxy_tasks)) { - kmp_flag_32 flag(RCAST(std::atomic *, &(taskgroup->count)), - 0U); + kmp_flag_32 flag( + RCAST(std::atomic *, &(taskgroup->count)), 0U); while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) { flag.execute_tasks(thread, gtid, FALSE, &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), @@ -3021,8 +3022,9 @@ } } +template int __kmp_execute_tasks_32( - kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin, + kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), kmp_int32 is_constrained) { return __kmp_execute_tasks_template( @@ -3030,8 +3032,9 @@ thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); } +template int __kmp_execute_tasks_64( - kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin, + kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), kmp_int32 is_constrained) { return __kmp_execute_tasks_template( @@ -3048,6 +3051,23 @@ thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); } +template int +__kmp_execute_tasks_32(kmp_info_t *, kmp_int32, + kmp_flag_32 *, int, + int *USE_ITT_BUILD_ARG(void *), kmp_int32); + +template int __kmp_execute_tasks_64(kmp_info_t *, kmp_int32, + kmp_flag_64 *, + int, + int *USE_ITT_BUILD_ARG(void *), + kmp_int32); + +template int __kmp_execute_tasks_64(kmp_info_t *, kmp_int32, + kmp_flag_64 *, + int, + int *USE_ITT_BUILD_ARG(void *), + kmp_int32); + // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the // next barrier so they can assist in executing enqueued tasks. // First thread in allocates the task team atomically. @@ -3597,9 +3617,10 @@ // Worker threads may have dropped through to release phase, but could // still be executing tasks. Wait here for tasks to complete. To avoid // memory contention, only master thread checks termination condition. - kmp_flag_32 flag(RCAST(std::atomic *, - &task_team->tt.tt_unfinished_threads), - 0U); + kmp_flag_32 flag( + RCAST(std::atomic *, + &task_team->tt.tt_unfinished_threads), + 0U); flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); } // Deactivate the old task team, so that the worker threads will stop @@ -3621,7 +3642,7 @@ } // __kmp_tasking_barrier: -// This routine may only called when __kmp_tasking_mode == tskm_extra_barrier. +// This routine is called only when __kmp_tasking_mode == tskm_extra_barrier. // Internal function to execute all tasks prior to a regular barrier or a join // barrier. It is a full barrier itself, which unfortunately turns regular // barriers into double barriers and join barriers into 1 1/2 barriers. @@ -3635,7 +3656,7 @@ #if USE_ITT_BUILD KMP_FSYNC_SPIN_INIT(spin, NULL); #endif /* USE_ITT_BUILD */ - kmp_flag_32 spin_flag(spin, 0U); + kmp_flag_32 spin_flag(spin, 0U); while (!spin_flag.execute_tasks(thread, gtid, TRUE, &flag USE_ITT_BUILD_ARG(NULL), 0)) { #if USE_ITT_BUILD Index: openmp/runtime/src/kmp_wait_release.h =================================================================== --- openmp/runtime/src/kmp_wait_release.h +++ openmp/runtime/src/kmp_wait_release.h @@ -42,20 +42,26 @@ flag_oncore /**< special 64-bit flag for on-core barrier (hierarchical) */ }; +struct flag_properties { + unsigned int type : 16; + unsigned int reserved : 16; +}; + /*! * Base class for wait/release volatile flag */ template class kmp_flag_native { volatile P *loc; - flag_type t; + flag_properties t; public: typedef P flag_t; - kmp_flag_native(volatile P *p, flag_type ft) : loc(p), t(ft) {} + kmp_flag_native(volatile P *p, flag_type ft) + : loc(p), t({(unsigned int)ft, 0U}) {} volatile P *get() { return loc; } void *get_void_p() { return RCAST(void *, CCAST(P *, loc)); } void set(volatile P *new_loc) { loc = new_loc; } - flag_type get_type() { return t; } + flag_type get_type() { return (flag_type)(t.type); } P load() { return *loc; } void store(P val) { *loc = val; } }; @@ -67,10 +73,11 @@ std::atomic

*loc; /**< Pointer to the flag storage that is modified by another thread */ - flag_type t; /**< "Type" of the flag in loc */ + flag_properties t; /**< "Type" of the flag in loc */ public: typedef P flag_t; - kmp_flag(std::atomic

*p, flag_type ft) : loc(p), t(ft) {} + kmp_flag(std::atomic

*p, flag_type ft) + : loc(p), t({(unsigned int)ft, 0U}) {} /*! * @result the pointer to the actual flag */ @@ -86,7 +93,7 @@ /*! * @result the flag_type */ - flag_type get_type() { return t; } + flag_type get_type() { return (flag_type)(t.type); } /*! * @result flag value */ @@ -104,6 +111,7 @@ bool notdone_check(); P internal_release(); void suspend(int th_gtid); + void mwait(int th_gtid); void resume(int th_gtid); P set_sleeping(); P unset_sleeping(); @@ -160,8 +168,8 @@ to wake it back up to prevent deadlocks! NOTE: We may not belong to a team at this point. */ -template +template static inline bool __kmp_wait_template(kmp_info_t *this_thr, C *flag USE_ITT_BUILD_ARG(void *itt_sync_obj)) { @@ -185,7 +193,7 @@ return false; } th_gtid = this_thr->th.th_info.ds.ds_gtid; - if (cancellable) { + if (Cancellable) { kmp_team_t *team = this_thr->th.th_team; if (team && team->t.t_cancel_request == cancel_parallel) return true; @@ -375,7 +383,7 @@ } #endif // Check if the barrier surrounding this wait loop has been cancelled - if (cancellable) { + if (Cancellable) { kmp_team_t *team = this_thr->th.th_team; if (team && team->t.t_cancel_request == cancel_parallel) break; @@ -400,23 +408,31 @@ #endif // Don't suspend if wait loop designated non-sleepable // in template parameters - if (!sleepable) + if (!Sleepable) continue; if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && __kmp_pause_status != kmp_soft_paused) continue; - KF_TRACE(50, ("__kmp_wait_sleep: T#%d suspend time reached\n", th_gtid)); - +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT + if (__kmp_mwait_enabled || __kmp_umwait_enabled) { + KF_TRACE(50, ("__kmp_wait_sleep: T#%d using monitor/mwait\n", th_gtid)); + flag->mwait(th_gtid); + } else { +#endif + KF_TRACE(50, ("__kmp_wait_sleep: T#%d suspend time reached\n", th_gtid)); #if KMP_OS_UNIX - if (final_spin) - KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, false); + if (final_spin) + KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, false); #endif - flag->suspend(th_gtid); + flag->suspend(th_gtid); #if KMP_OS_UNIX - if (final_spin) - KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, true); + if (final_spin) + KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, true); +#endif +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT + } #endif if (TCR_4(__kmp_global.g.g_done)) { @@ -458,7 +474,7 @@ KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, false); #endif KMP_FSYNC_SPIN_ACQUIRED(CCAST(void *, spin)); - if (cancellable) { + if (Cancellable) { kmp_team_t *team = this_thr->th.th_team; if (team && team->t.t_cancel_request == cancel_parallel) { if (tasks_completed) { @@ -475,6 +491,83 @@ return false; } +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT +// Set up a monitor on the flag variable causing the calling thread to wait in +// a less active state until the flag variable is modified. +template +static inline void __kmp_mwait_template(int th_gtid, C *flag) { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_mwait); + kmp_info_t *th = __kmp_threads[th_gtid]; + + KF_TRACE(30, ("__kmp_mwait_template: T#%d enter for flag = %p\n", th_gtid, + flag->get())); + + // User-level mwait is available + KMP_DEBUG_ASSERT(__kmp_mwait_enabled || __kmp_umwait_enabled); + + __kmp_suspend_initialize_thread(th); + __kmp_lock_suspend_mx(th); + + volatile void *spin = flag->get(); + void *cacheline = (void *)(kmp_uint64(spin) & ~(CACHE_LINE - 1)); + + if (!flag->done_check()) { + // Mark thread as no longer active + th->th.th_active = FALSE; + if (th->th.th_active_in_pool) { + th->th.th_active_in_pool = FALSE; + KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); + KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) >= 0); + } + flag->set_sleeping(); + KF_TRACE(50, ("__kmp_mwait_template: T#%d calling monitor\n", th_gtid)); +#if KMP_HAVE_UMWAIT + if (__kmp_umwait_enabled) { + __kmp_umonitor(cacheline); + } +#elif KMP_HAVE_MWAIT + if (__kmp_mwait_enabled) { + __kmp_mm_monitor(cacheline, 0, 0); + } +#endif + // To avoid a race, check flag between 'monitor' and 'mwait'. A write to + // the address could happen after the last time we checked and before + // monitoring started, in which case monitor can't detect the change. + if (flag->done_check()) + flag->unset_sleeping(); + else { + // if flag changes here, wake-up happens immediately + TCW_PTR(th->th.th_sleep_loc, (void *)flag); + __kmp_unlock_suspend_mx(th); + KF_TRACE(50, ("__kmp_mwait_template: T#%d calling mwait\n", th_gtid)); +#if KMP_HAVE_UMWAIT + if (__kmp_umwait_enabled) { + __kmp_umwait(1, 100); // to do: enable ctrl via hints, backoff counter + } +#elif KMP_HAVE_MWAIT + if (__kmp_mwait_enabled) { + __kmp_mm_mwait(0, __kmp_mwait_hints); + } +#endif + KF_TRACE(50, ("__kmp_mwait_template: T#%d mwait done\n", th_gtid)); + __kmp_lock_suspend_mx(th); + // Clean up sleep info; doesn't matter how/why this thread stopped waiting + if (flag->is_sleeping()) + flag->unset_sleeping(); + TCW_PTR(th->th.th_sleep_loc, NULL); + } + // Mark thread as active again + th->th.th_active = TRUE; + if (TCR_4(th->th.th_in_pool)) { + KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth); + th->th.th_active_in_pool = TRUE; + } + } // Drop out to main wait loop to check flag, handle tasks, etc. + __kmp_unlock_suspend_mx(th); + KF_TRACE(30, ("__kmp_mwait_template: T#%d exit\n", th_gtid)); +} +#endif // KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT + /* Release any threads specified as waiting on the flag by releasing the flag and resume the waiting thread if indicated by the sleep bit(s). A thread that calls __kmp_wait_template must call this function to wake up the potentially @@ -545,7 +638,7 @@ }; // Basic flag that does not use C11 Atomics -template +template class kmp_basic_flag_native : public kmp_flag_native { typedef flag_traits traits_type; FlagType checker; /**< Value to compare flag to to check if flag has been @@ -588,7 +681,13 @@ /*! * @result true if the flag object has been released. */ - bool done_check() { return traits_type::tcr(*(this->get())) == checker; } + bool done_check() { + if (Sleepable) + return (traits_type::tcr(*(this->get())) & ~KMP_BARRIER_SLEEP_STATE) == + checker; + else + return traits_type::tcr(*(this->get())) == checker; + } /*! * @param old_loc in old value of flag * @result true if the flag's old value indicates it was released. @@ -643,7 +742,8 @@ enum barrier_type get_bt() { return bs_last_barrier; } }; -template class kmp_basic_flag : public kmp_flag { +template +class kmp_basic_flag : public kmp_flag { typedef flag_traits traits_type; FlagType checker; /**< Value to compare flag to to check if flag has been released. */ @@ -685,7 +785,12 @@ /*! * @result true if the flag object has been released. */ - bool done_check() { return this->load() == checker; } + bool done_check() { + if (Sleepable) + return (this->load() & ~KMP_BARRIER_SLEEP_STATE) == checker; + else + return this->load() == checker; + } /*! * @param old_loc in old value of flag * @result true if the flag's old value indicates it was released. @@ -736,14 +841,19 @@ enum barrier_type get_bt() { return bs_last_barrier; } }; -class kmp_flag_32 : public kmp_basic_flag { +template +class kmp_flag_32 : public kmp_basic_flag { public: - kmp_flag_32(std::atomic *p) : kmp_basic_flag(p) {} + kmp_flag_32(std::atomic *p) + : kmp_basic_flag(p) {} kmp_flag_32(std::atomic *p, kmp_info_t *thr) - : kmp_basic_flag(p, thr) {} + : kmp_basic_flag(p, thr) {} kmp_flag_32(std::atomic *p, kmp_uint32 c) - : kmp_basic_flag(p, c) {} + : kmp_basic_flag(p, c) {} void suspend(int th_gtid) { __kmp_suspend_32(th_gtid, this); } +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT + void mwait(int th_gtid) { __kmp_mwait_32(th_gtid, this); } +#endif void resume(int th_gtid) { __kmp_resume_32(th_gtid, this); } int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), @@ -752,27 +862,32 @@ this_thr, gtid, this, final_spin, thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); } - void wait(kmp_info_t *this_thr, + bool wait(kmp_info_t *this_thr, int final_spin USE_ITT_BUILD_ARG(void *itt_sync_obj)) { if (final_spin) - __kmp_wait_template( + return __kmp_wait_template( this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj)); else - __kmp_wait_template( + return __kmp_wait_template( this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj)); } void release() { __kmp_release_template(this); } flag_type get_ptr_type() { return flag32; } }; -class kmp_flag_64 : public kmp_basic_flag_native { +template +class kmp_flag_64 : public kmp_basic_flag_native { public: - kmp_flag_64(volatile kmp_uint64 *p) : kmp_basic_flag_native(p) {} + kmp_flag_64(volatile kmp_uint64 *p) + : kmp_basic_flag_native(p) {} kmp_flag_64(volatile kmp_uint64 *p, kmp_info_t *thr) - : kmp_basic_flag_native(p, thr) {} + : kmp_basic_flag_native(p, thr) {} kmp_flag_64(volatile kmp_uint64 *p, kmp_uint64 c) - : kmp_basic_flag_native(p, c) {} + : kmp_basic_flag_native(p, c) {} void suspend(int th_gtid) { __kmp_suspend_64(th_gtid, this); } +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT + void mwait(int th_gtid) { __kmp_mwait_64(th_gtid, this); } +#endif void resume(int th_gtid) { __kmp_resume_64(th_gtid, this); } int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), @@ -781,27 +896,15 @@ this_thr, gtid, this, final_spin, thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); } - void wait(kmp_info_t *this_thr, + bool wait(kmp_info_t *this_thr, int final_spin USE_ITT_BUILD_ARG(void *itt_sync_obj)) { if (final_spin) - __kmp_wait_template( + return __kmp_wait_template( this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj)); else - __kmp_wait_template( + return __kmp_wait_template( this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj)); } - bool wait_cancellable_nosleep(kmp_info_t *this_thr, - int final_spin - USE_ITT_BUILD_ARG(void *itt_sync_obj)) { - bool retval = false; - if (final_spin) - retval = __kmp_wait_template( - this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj)); - else - retval = __kmp_wait_template( - this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj)); - return retval; - } void release() { __kmp_release_template(this); } flag_type get_ptr_type() { return flag64; } }; @@ -859,7 +962,7 @@ return true; else if (flag_switch) { this_thr->th.th_bar[bt].bb.wait_flag = KMP_BARRIER_SWITCHING; - kmp_flag_64 flag(&this_thr->th.th_bar[bt].bb.b_go, + kmp_flag_64<> flag(&this_thr->th.th_bar[bt].bb.b_go, (kmp_uint64)KMP_BARRIER_STATE_BUMP); __kmp_wait_64(this_thr, &flag, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); } @@ -896,6 +999,9 @@ } void release() { __kmp_release_template(this); } void suspend(int th_gtid) { __kmp_suspend_oncore(th_gtid, this); } +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT + void mwait(int th_gtid) { __kmp_mwait_oncore(th_gtid, this); } +#endif void resume(int th_gtid) { __kmp_resume_oncore(th_gtid, this); } int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), @@ -915,15 +1021,15 @@ if (!flag) return; - switch (RCAST(kmp_flag_64 *, CCAST(void *, flag))->get_type()) { + switch (RCAST(kmp_flag_64<> *, CCAST(void *, flag))->get_type()) { case flag32: - __kmp_resume_32(gtid, NULL); + __kmp_resume_32(gtid, (kmp_flag_32<> *)NULL); break; case flag64: - __kmp_resume_64(gtid, NULL); + __kmp_resume_64(gtid, (kmp_flag_64<> *)NULL); break; case flag_oncore: - __kmp_resume_oncore(gtid, NULL); + __kmp_resume_oncore(gtid, (kmp_flag_oncore *)NULL); break; } } Index: openmp/runtime/src/kmp_wait_release.cpp =================================================================== --- openmp/runtime/src/kmp_wait_release.cpp +++ openmp/runtime/src/kmp_wait_release.cpp @@ -12,14 +12,32 @@ #include "kmp_wait_release.h" -void __kmp_wait_64(kmp_info_t *this_thr, kmp_flag_64 *flag, +void __kmp_wait_64(kmp_info_t *this_thr, kmp_flag_64<> *flag, int final_spin USE_ITT_BUILD_ARG(void *itt_sync_obj)) { if (final_spin) - __kmp_wait_template( + __kmp_wait_template, TRUE>( this_thr, flag USE_ITT_BUILD_ARG(itt_sync_obj)); else - __kmp_wait_template( + __kmp_wait_template, FALSE>( this_thr, flag USE_ITT_BUILD_ARG(itt_sync_obj)); } -void __kmp_release_64(kmp_flag_64 *flag) { __kmp_release_template(flag); } +void __kmp_release_64(kmp_flag_64<> *flag) { __kmp_release_template(flag); } + +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT +template +void __kmp_mwait_32(int th_gtid, kmp_flag_32 *flag) { + __kmp_mwait_template(th_gtid, flag); +} +template +void __kmp_mwait_64(int th_gtid, kmp_flag_64 *flag) { + __kmp_mwait_template(th_gtid, flag); +} +void __kmp_mwait_oncore(int th_gtid, kmp_flag_oncore *flag) { + __kmp_mwait_template(th_gtid, flag); +} + +template void __kmp_mwait_32(int, kmp_flag_32 *); +template void __kmp_mwait_64(int, kmp_flag_64 *); +template void __kmp_mwait_64(int, kmp_flag_64 *); +#endif Index: openmp/runtime/src/z_Linux_util.cpp =================================================================== --- openmp/runtime/src/z_Linux_util.cpp +++ openmp/runtime/src/z_Linux_util.cpp @@ -1459,8 +1459,7 @@ __kmp_suspend_initialize_thread(th); - status = pthread_mutex_lock(&th->th.th_suspend_mx.m_mutex); - KMP_CHECK_SYSFAIL("pthread_mutex_lock", status); + __kmp_lock_suspend_mx(th); KF_TRACE(10, ("__kmp_suspend_template: T#%d setting sleep bit for spin(%p)\n", th_gtid, flag->get())); @@ -1471,8 +1470,7 @@ if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && __kmp_pause_status != kmp_soft_paused) { flag->unset_sleeping(); - status = pthread_mutex_unlock(&th->th.th_suspend_mx.m_mutex); - KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status); + __kmp_unlock_suspend_mx(th); return; } KF_TRACE(5, ("__kmp_suspend_template: T#%d set sleep bit for spin(%p)==%x," @@ -1535,7 +1533,7 @@ th_gtid)); status = pthread_cond_wait(&th->th.th_suspend_cv.c_cond, &th->th.th_suspend_mx.m_mutex); -#endif +#endif // USE_SUSPEND_TIMEOUT if ((status != 0) && (status != EINTR) && (status != ETIMEDOUT)) { KMP_SYSFAIL("pthread_cond_wait", status); @@ -1575,21 +1573,26 @@ } #endif - status = pthread_mutex_unlock(&th->th.th_suspend_mx.m_mutex); - KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status); + __kmp_unlock_suspend_mx(th); KF_TRACE(30, ("__kmp_suspend_template: T#%d exit\n", th_gtid)); } -void __kmp_suspend_32(int th_gtid, kmp_flag_32 *flag) { +template +void __kmp_suspend_32(int th_gtid, kmp_flag_32 *flag) { __kmp_suspend_template(th_gtid, flag); } -void __kmp_suspend_64(int th_gtid, kmp_flag_64 *flag) { +template +void __kmp_suspend_64(int th_gtid, kmp_flag_64 *flag) { __kmp_suspend_template(th_gtid, flag); } void __kmp_suspend_oncore(int th_gtid, kmp_flag_oncore *flag) { __kmp_suspend_template(th_gtid, flag); } +template void __kmp_suspend_32(int, kmp_flag_32 *); +template void __kmp_suspend_64(int, kmp_flag_64 *); +template void __kmp_suspend_64(int, kmp_flag_64 *); + /* This routine signals the thread specified by target_gtid to wake up after setting the sleep bit indicated by the flag argument to FALSE. The target thread must already have called __kmp_suspend_template() */ @@ -1609,8 +1612,7 @@ __kmp_suspend_initialize_thread(th); - status = pthread_mutex_lock(&th->th.th_suspend_mx.m_mutex); - KMP_CHECK_SYSFAIL("pthread_mutex_lock", status); + __kmp_lock_suspend_mx(th); if (!flag) { // coming from __kmp_null_resume_wrapper flag = (C *)CCAST(void *, th->th.th_sleep_loc); @@ -1619,13 +1621,11 @@ // First, check if the flag is null or its type has changed. If so, someone // else woke it up. if (!flag || flag->get_type() != flag->get_ptr_type()) { // get_ptr_type - // simply shows what - // flag was cast to + // simply shows what flag was cast to KF_TRACE(5, ("__kmp_resume_template: T#%d exiting, thread T#%d already " "awake: flag(%p)\n", gtid, target_gtid, NULL)); - status = pthread_mutex_unlock(&th->th.th_suspend_mx.m_mutex); - KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status); + __kmp_unlock_suspend_mx(th); return; } else { // if multiple threads are sleeping, flag should be internally // referring to a specific thread here @@ -1635,8 +1635,7 @@ "awake: flag(%p): " "%u => %u\n", gtid, target_gtid, flag->get(), old_spin, flag->load())); - status = pthread_mutex_unlock(&th->th.th_suspend_mx.m_mutex); - KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status); + __kmp_unlock_suspend_mx(th); return; } KF_TRACE(5, ("__kmp_resume_template: T#%d about to wakeup T#%d, reset " @@ -1656,23 +1655,27 @@ #endif status = pthread_cond_signal(&th->th.th_suspend_cv.c_cond); KMP_CHECK_SYSFAIL("pthread_cond_signal", status); - status = pthread_mutex_unlock(&th->th.th_suspend_mx.m_mutex); - KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status); + __kmp_unlock_suspend_mx(th); KF_TRACE(30, ("__kmp_resume_template: T#%d exiting after signaling wake up" " for T#%d\n", gtid, target_gtid)); } -void __kmp_resume_32(int target_gtid, kmp_flag_32 *flag) { +template +void __kmp_resume_32(int target_gtid, kmp_flag_32 *flag) { __kmp_resume_template(target_gtid, flag); } -void __kmp_resume_64(int target_gtid, kmp_flag_64 *flag) { +template +void __kmp_resume_64(int target_gtid, kmp_flag_64 *flag) { __kmp_resume_template(target_gtid, flag); } void __kmp_resume_oncore(int target_gtid, kmp_flag_oncore *flag) { __kmp_resume_template(target_gtid, flag); } +template void __kmp_resume_32(int, kmp_flag_32 *); +template void __kmp_resume_64(int, kmp_flag_64 *); + #if KMP_USE_MONITOR void __kmp_resume_monitor() { KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_resume); Index: openmp/runtime/src/z_Windows_NT_util.cpp =================================================================== --- openmp/runtime/src/z_Windows_NT_util.cpp +++ openmp/runtime/src/z_Windows_NT_util.cpp @@ -363,7 +363,7 @@ th_gtid, flag->get())); __kmp_suspend_initialize_thread(th); - __kmp_win32_mutex_lock(&th->th.th_suspend_mx); + __kmp_lock_suspend_mx(th); KF_TRACE(10, ("__kmp_suspend_template: T#%d setting sleep bit for flag's" " loc(%p)\n", @@ -375,7 +375,7 @@ if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && __kmp_pause_status != kmp_soft_paused) { flag->unset_sleeping(); - __kmp_win32_mutex_unlock(&th->th.th_suspend_mx); + __kmp_unlock_suspend_mx(th); return; } @@ -437,21 +437,26 @@ } } - __kmp_win32_mutex_unlock(&th->th.th_suspend_mx); - + __kmp_unlock_suspend_mx(th); KF_TRACE(30, ("__kmp_suspend_template: T#%d exit\n", th_gtid)); } -void __kmp_suspend_32(int th_gtid, kmp_flag_32 *flag) { +template +void __kmp_suspend_32(int th_gtid, kmp_flag_32 *flag) { __kmp_suspend_template(th_gtid, flag); } -void __kmp_suspend_64(int th_gtid, kmp_flag_64 *flag) { +template +void __kmp_suspend_64(int th_gtid, kmp_flag_64 *flag) { __kmp_suspend_template(th_gtid, flag); } void __kmp_suspend_oncore(int th_gtid, kmp_flag_oncore *flag) { __kmp_suspend_template(th_gtid, flag); } +template void __kmp_suspend_32(int, kmp_flag_32 *); +template void __kmp_suspend_64(int, kmp_flag_64 *); +template void __kmp_suspend_64(int, kmp_flag_64 *); + /* This routine signals the thread specified by target_gtid to wake up after setting the sleep bit indicated by the flag argument to FALSE */ template @@ -467,7 +472,7 @@ gtid, target_gtid)); __kmp_suspend_initialize_thread(th); - __kmp_win32_mutex_lock(&th->th.th_suspend_mx); + __kmp_lock_suspend_mx(th); if (!flag) { // coming from __kmp_null_resume_wrapper flag = (C *)th->th.th_sleep_loc; @@ -481,7 +486,7 @@ KF_TRACE(5, ("__kmp_resume_template: T#%d exiting, thread T#%d already " "awake: flag's loc(%p)\n", gtid, target_gtid, NULL)); - __kmp_win32_mutex_unlock(&th->th.th_suspend_mx); + __kmp_unlock_suspend_mx(th); return; } else { typename C::flag_t old_spin = flag->unset_sleeping(); @@ -489,7 +494,7 @@ KF_TRACE(5, ("__kmp_resume_template: T#%d exiting, thread T#%d already " "awake: flag's loc(%p): %u => %u\n", gtid, target_gtid, flag->get(), old_spin, *(flag->get()))); - __kmp_win32_mutex_unlock(&th->th.th_suspend_mx); + __kmp_unlock_suspend_mx(th); return; } } @@ -499,23 +504,28 @@ gtid, target_gtid, flag->get())); __kmp_win32_cond_signal(&th->th.th_suspend_cv); - __kmp_win32_mutex_unlock(&th->th.th_suspend_mx); + __kmp_unlock_suspend_mx(th); KF_TRACE(30, ("__kmp_resume_template: T#%d exiting after signaling wake up" " for T#%d\n", gtid, target_gtid)); } -void __kmp_resume_32(int target_gtid, kmp_flag_32 *flag) { +template +void __kmp_resume_32(int target_gtid, kmp_flag_32 *flag) { __kmp_resume_template(target_gtid, flag); } -void __kmp_resume_64(int target_gtid, kmp_flag_64 *flag) { +template +void __kmp_resume_64(int target_gtid, kmp_flag_64 *flag) { __kmp_resume_template(target_gtid, flag); } void __kmp_resume_oncore(int target_gtid, kmp_flag_oncore *flag) { __kmp_resume_template(target_gtid, flag); } +template void __kmp_resume_32(int, kmp_flag_32 *); +template void __kmp_resume_64(int, kmp_flag_64 *); + void __kmp_yield() { Sleep(0); } void __kmp_gtid_set_specific(int gtid) {