diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp
--- a/openmp/runtime/src/kmp_csupport.cpp
+++ b/openmp/runtime/src/kmp_csupport.cpp
@@ -1249,7 +1249,7 @@
   if (hint & kmp_lock_hint_hle)
     return KMP_TSX_LOCK(hle);
   if (hint & kmp_lock_hint_rtm)
-    return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
+    return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_queuing) : __kmp_user_lock_seq;
   if (hint & kmp_lock_hint_adaptive)
     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;

@@ -1268,9 +1268,9 @@
   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
     return lockseq_tas;

-  // HLE lock for speculation
+  // Use RTM lock for speculation
   if (hint & omp_lock_hint_speculative)
-    return KMP_TSX_LOCK(hle);
+    return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_spin) : __kmp_user_lock_seq;

   return __kmp_user_lock_seq;
 }
@@ -1291,6 +1291,7 @@
     return kmp_mutex_impl_spin;
 #if KMP_USE_TSX
   case locktag_hle:
+  case locktag_rtm_spin:
     return kmp_mutex_impl_speculative;
 #endif
   default:
@@ -1302,7 +1303,7 @@
   switch (ilock->type) {
 #if KMP_USE_TSX
   case locktag_adaptive:
-  case locktag_rtm:
+  case locktag_rtm_queuing:
     return kmp_mutex_impl_speculative;
 #endif
   case locktag_nested_tas:
@@ -1336,7 +1337,8 @@
     return kmp_mutex_impl_queuing;
 #if KMP_USE_TSX
   case lk_hle:
-  case lk_rtm:
+  case lk_rtm_queuing:
+  case lk_rtm_spin:
   case lk_adaptive:
     return kmp_mutex_impl_speculative;
 #endif
@@ -2144,7 +2146,8 @@
                                kmp_dyna_lockseq_t seq) {
 #if KMP_USE_TSX
   // Don't have nested lock implementation for speculative locks
-  if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
+  if (seq == lockseq_hle || seq == lockseq_rtm_queuing ||
+      seq == lockseq_rtm_spin || seq == lockseq_adaptive)
     seq = __kmp_user_lock_seq;
 #endif
   switch (seq) {
diff --git a/openmp/runtime/src/kmp_lock.h b/openmp/runtime/src/kmp_lock.h
--- a/openmp/runtime/src/kmp_lock.h
+++ b/openmp/runtime/src/kmp_lock.h
@@ -587,7 +587,8 @@
 #endif
 #if KMP_USE_DYNAMIC_LOCK && KMP_USE_TSX
   lk_hle,
-  lk_rtm,
+  lk_rtm_queuing,
+  lk_rtm_spin,
 #endif
   lk_ticket,
   lk_queuing,
@@ -1041,19 +1042,19 @@
 // All nested locks are indirect lock types.
 #if KMP_USE_TSX
 #if KMP_USE_FUTEX
-#define KMP_FOREACH_D_LOCK(m, a) m(tas, a) m(futex, a) m(hle, a)
+#define KMP_FOREACH_D_LOCK(m, a) m(tas, a) m(futex, a) m(hle, a) m(rtm_spin, a)
 #define KMP_FOREACH_I_LOCK(m, a)                                               \
-  m(ticket, a) m(queuing, a) m(adaptive, a) m(drdpa, a) m(rtm, a)              \
+  m(ticket, a) m(queuing, a) m(adaptive, a) m(drdpa, a) m(rtm_queuing, a)      \
       m(nested_tas, a) m(nested_futex, a) m(nested_ticket, a)                  \
           m(nested_queuing, a) m(nested_drdpa, a)
 #else
-#define KMP_FOREACH_D_LOCK(m, a) m(tas, a) m(hle, a)
+#define KMP_FOREACH_D_LOCK(m, a) m(tas, a) m(hle, a) m(rtm_spin, a)
 #define KMP_FOREACH_I_LOCK(m, a)                                               \
-  m(ticket, a) m(queuing, a) m(adaptive, a) m(drdpa, a) m(rtm, a)              \
+  m(ticket, a) m(queuing, a) m(adaptive, a) m(drdpa, a) m(rtm_queuing, a)      \
       m(nested_tas, a) m(nested_ticket, a) m(nested_queuing, a)                \
           m(nested_drdpa, a)
 #endif // KMP_USE_FUTEX
-#define KMP_LAST_D_LOCK lockseq_hle
+#define KMP_LAST_D_LOCK lockseq_rtm_spin
 #else
 #if KMP_USE_FUTEX
 #define KMP_FOREACH_D_LOCK(m, a) m(tas, a) m(futex, a)
diff --git a/openmp/runtime/src/kmp_lock.cpp b/openmp/runtime/src/kmp_lock.cpp
--- a/openmp/runtime/src/kmp_lock.cpp
+++ b/openmp/runtime/src/kmp_lock.cpp
@@ -2764,20 +2764,22 @@
   return __kmp_test_hle_lock(lck, gtid); // TODO: add checks
 }

-static void __kmp_init_rtm_lock(kmp_queuing_lock_t *lck) {
+static void __kmp_init_rtm_queuing_lock(kmp_queuing_lock_t *lck) {
   __kmp_init_queuing_lock(lck);
 }

-static void __kmp_destroy_rtm_lock(kmp_queuing_lock_t *lck) {
+static void __kmp_destroy_rtm_queuing_lock(kmp_queuing_lock_t *lck) {
   __kmp_destroy_queuing_lock(lck);
 }

-static void __kmp_destroy_rtm_lock_with_checks(kmp_queuing_lock_t *lck) {
+static void
+__kmp_destroy_rtm_queuing_lock_with_checks(kmp_queuing_lock_t *lck) {
   __kmp_destroy_queuing_lock_with_checks(lck);
 }

 KMP_ATTRIBUTE_TARGET_RTM
-static void __kmp_acquire_rtm_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
+static void __kmp_acquire_rtm_queuing_lock(kmp_queuing_lock_t *lck,
+                                           kmp_int32 gtid) {
   unsigned retries = 3, status;
   do {
     status = _xbegin();
@@ -2799,13 +2801,14 @@
   __kmp_acquire_queuing_lock(lck, gtid);
 }

-static void __kmp_acquire_rtm_lock_with_checks(kmp_queuing_lock_t *lck,
-                                               kmp_int32 gtid) {
-  __kmp_acquire_rtm_lock(lck, gtid);
+static void __kmp_acquire_rtm_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
+                                                        kmp_int32 gtid) {
+  __kmp_acquire_rtm_queuing_lock(lck, gtid);
 }

 KMP_ATTRIBUTE_TARGET_RTM
-static int __kmp_release_rtm_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
+static int __kmp_release_rtm_queuing_lock(kmp_queuing_lock_t *lck,
+                                          kmp_int32 gtid) {
   if (__kmp_is_unlocked_queuing_lock(lck)) {
     // Releasing from speculation
     _xend();
@@ -2816,13 +2819,14 @@
   return KMP_LOCK_RELEASED;
 }

-static int __kmp_release_rtm_lock_with_checks(kmp_queuing_lock_t *lck,
-                                              kmp_int32 gtid) {
-  return __kmp_release_rtm_lock(lck, gtid);
+static int __kmp_release_rtm_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
+                                                      kmp_int32 gtid) {
+  return __kmp_release_rtm_queuing_lock(lck, gtid);
 }

 KMP_ATTRIBUTE_TARGET_RTM
-static int __kmp_test_rtm_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
+static int __kmp_test_rtm_queuing_lock(kmp_queuing_lock_t *lck,
+                                       kmp_int32 gtid) {
   unsigned retries = 3, status;
   do {
     status = _xbegin();
@@ -2833,12 +2837,108 @@
       break;
   } while (retries--);

-  return (__kmp_is_unlocked_queuing_lock(lck)) ? 1 : 0;
+  return __kmp_test_queuing_lock(lck, gtid);
 }

-static int __kmp_test_rtm_lock_with_checks(kmp_queuing_lock_t *lck,
-                                           kmp_int32 gtid) {
-  return __kmp_test_rtm_lock(lck, gtid);
+static int __kmp_test_rtm_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
+                                                   kmp_int32 gtid) {
+  return __kmp_test_rtm_queuing_lock(lck, gtid);
+}
+
+// Reuse kmp_tas_lock_t for the TSX lock which uses RTM with a fall-back spin lock.
+typedef kmp_tas_lock_t kmp_rtm_spin_lock_t;
+
+static void __kmp_destroy_rtm_spin_lock(kmp_rtm_spin_lock_t *lck) {
+  KMP_ATOMIC_ST_REL(&lck->lk.poll, 0);
+}
+
+static void __kmp_destroy_rtm_spin_lock_with_checks(kmp_rtm_spin_lock_t *lck) {
+  __kmp_destroy_rtm_spin_lock(lck);
+}
+
+KMP_ATTRIBUTE_TARGET_RTM
+static int __kmp_acquire_rtm_spin_lock(kmp_rtm_spin_lock_t *lck,
+                                       kmp_int32 gtid) {
+  unsigned retries = 3, status;
+  kmp_int32 lock_free = KMP_LOCK_FREE(rtm_spin);
+  kmp_int32 lock_busy = KMP_LOCK_BUSY(1, rtm_spin);
+  do {
+    status = _xbegin();
+    if (status == _XBEGIN_STARTED) {
+      if (KMP_ATOMIC_LD_RLX(&lck->lk.poll) == lock_free)
+        return KMP_LOCK_ACQUIRED_FIRST;
+      _xabort(0xff);
+    }
+    if ((status & _XABORT_EXPLICIT) && _XABORT_CODE(status) == 0xff) {
+      // Wait until lock becomes free
+      while (KMP_ATOMIC_LD_RLX(&lck->lk.poll) != lock_free) {
+        KMP_YIELD(TRUE);
+      }
+    } else if (!(status & _XABORT_RETRY))
+      break;
+  } while (retries--);
+
+  // Fall-back spin lock
+  KMP_FSYNC_PREPARE(lck);
+  kmp_backoff_t backoff = __kmp_spin_backoff_params;
+  while (KMP_ATOMIC_LD_RLX(&lck->lk.poll) != lock_free ||
+         !__kmp_atomic_compare_store_acq(&lck->lk.poll, lock_free, lock_busy)) {
+    __kmp_spin_backoff(&backoff);
+  }
+  KMP_FSYNC_ACQUIRED(lck);
+  return KMP_LOCK_ACQUIRED_FIRST;
+}
+
+static int __kmp_acquire_rtm_spin_lock_with_checks(kmp_rtm_spin_lock_t *lck,
+                                                   kmp_int32 gtid) {
+  return __kmp_acquire_rtm_spin_lock(lck, gtid);
+}
+
+KMP_ATTRIBUTE_TARGET_RTM
+static int __kmp_release_rtm_spin_lock(kmp_rtm_spin_lock_t *lck,
+                                       kmp_int32 gtid) {
+  if (KMP_ATOMIC_LD_RLX(&lck->lk.poll) == KMP_LOCK_FREE(rtm_spin)) {
+    // Releasing from speculation
+    _xend();
+  } else {
+    // Releasing from a real lock
+    KMP_FSYNC_RELEASING(lck);
+    KMP_ATOMIC_ST_REL(&lck->lk.poll, KMP_LOCK_FREE(rtm_spin));
+  }
+  return KMP_LOCK_RELEASED;
+}
+
+static int __kmp_release_rtm_spin_lock_with_checks(kmp_rtm_spin_lock_t *lck,
+                                                   kmp_int32 gtid) {
+  return __kmp_release_rtm_spin_lock(lck, gtid);
+}
+
+KMP_ATTRIBUTE_TARGET_RTM
+static int __kmp_test_rtm_spin_lock(kmp_rtm_spin_lock_t *lck, kmp_int32 gtid) {
+  unsigned retries = 3, status;
+  kmp_int32 lock_free = KMP_LOCK_FREE(rtm_spin);
+  kmp_int32 lock_busy = KMP_LOCK_BUSY(1, rtm_spin);
+  do {
+    status = _xbegin();
+    if (status == _XBEGIN_STARTED &&
+        KMP_ATOMIC_LD_RLX(&lck->lk.poll) == lock_free) {
+      return TRUE;
+    }
+    if (!(status & _XABORT_RETRY))
+      break;
+  } while (retries--);
+
+  if (KMP_ATOMIC_LD_RLX(&lck->lk.poll) == lock_free &&
+      __kmp_atomic_compare_store_acq(&lck->lk.poll, lock_free, lock_busy)) {
+    KMP_FSYNC_ACQUIRED(lck);
+    return TRUE;
+  }
+  return FALSE;
+}
+
+static int __kmp_test_rtm_spin_lock_with_checks(kmp_rtm_spin_lock_t *lck,
+                                                kmp_int32 gtid) {
+  return __kmp_test_rtm_spin_lock(lck, gtid);
 }

 #endif // KMP_USE_TSX
@@ -3124,7 +3224,7 @@
   }
 #endif
 #if KMP_USE_TSX
-  if (seq == lockseq_rtm && !__kmp_cpuinfo.rtm) {
+  if (seq == lockseq_rtm_queuing && !__kmp_cpuinfo.rtm) {
     seq = lockseq_queuing;
   }
 #endif
@@ -3266,7 +3366,7 @@
 #endif
   __kmp_indirect_lock_size[locktag_drdpa] = sizeof(kmp_drdpa_lock_t);
 #if KMP_USE_TSX
-  __kmp_indirect_lock_size[locktag_rtm] = sizeof(kmp_queuing_lock_t);
+  __kmp_indirect_lock_size[locktag_rtm_queuing] = sizeof(kmp_queuing_lock_t);
 #endif
   __kmp_indirect_lock_size[locktag_nested_tas] = sizeof(kmp_tas_lock_t);
 #if KMP_USE_FUTEX
diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp
--- a/openmp/runtime/src/kmp_runtime.cpp
+++ b/openmp/runtime/src/kmp_runtime.cpp
@@ -1630,7 +1630,7 @@
   }
 #endif

-#if USE_ITT_BUILD
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
   if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
        KMP_ITT_DEBUG) &&
       __kmp_forkjoin_frames_mode == 3 &&
@@ -1644,7 +1644,7 @@
     // create new stack stitching id before entering fork barrier
     parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
   }
-#endif /* USE_ITT_BUILD */
+#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */

   KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
                 "master_th=%p, gtid=%d\n",
diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp
--- a/openmp/runtime/src/kmp_settings.cpp
+++ b/openmp/runtime/src/kmp_settings.cpp
@@ -4102,15 +4102,24 @@
   }
 #endif // KMP_USE_ADAPTIVE_LOCKS
 #if KMP_USE_DYNAMIC_LOCK && KMP_USE_TSX
-  else if (__kmp_str_match("rtm", 1, value)) {
+  else if (__kmp_str_match("rtm_queuing", 1, value)) {
     if (__kmp_cpuinfo.rtm) {
-      __kmp_user_lock_kind = lk_rtm;
-      KMP_STORE_LOCK_SEQ(rtm);
+      __kmp_user_lock_kind = lk_rtm_queuing;
+      KMP_STORE_LOCK_SEQ(rtm_queuing);
     } else {
       KMP_WARNING(AdaptiveNotSupported, name, value);
       __kmp_user_lock_kind = lk_queuing;
       KMP_STORE_LOCK_SEQ(queuing);
     }
+  } else if (__kmp_str_match("rtm_spin", 1, value)) {
+    if (__kmp_cpuinfo.rtm) {
+      __kmp_user_lock_kind = lk_rtm_spin;
+      KMP_STORE_LOCK_SEQ(rtm_spin);
+    } else {
+      KMP_WARNING(AdaptiveNotSupported, name, value);
+      __kmp_user_lock_kind = lk_tas;
+      KMP_STORE_LOCK_SEQ(queuing);
+    }
   } else if (__kmp_str_match("hle", 1, value)) {
     __kmp_user_lock_kind = lk_hle;
     KMP_STORE_LOCK_SEQ(hle);
@@ -4141,8 +4150,12 @@
 #endif

 #if KMP_USE_DYNAMIC_LOCK && KMP_USE_TSX
-  case lk_rtm:
-    value = "rtm";
+  case lk_rtm_queuing:
+    value = "rtm_queuing";
+    break;
+
+  case lk_rtm_spin:
+    value = "rtm_spin";
     break;

   case lk_hle:
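
Reviewer note (not part of the patch): the rtm_spin functions added in kmp_lock.cpp follow the usual TSX lock-elision pattern: try _xbegin() a few times while only reading the lock word, and fall back to a real test-and-set lock when speculation keeps aborting. The sketch below illustrates that pattern outside of libomp. The rtm_spin_lock type and the acquire/release helpers are invented for the illustration; it assumes a TSX-capable CPU and compilation with -mrtm (e.g. g++ -O2 -mrtm rtm_spin_sketch.cpp), and is not the libomp implementation itself.

// rtm_spin_sketch.cpp -- illustration only; names here are not libomp APIs.
#include <immintrin.h> // _xbegin/_xend/_xabort, _XBEGIN_STARTED, _XABORT_*
#include <atomic>
#include <cstdio>

struct rtm_spin_lock {
  std::atomic<int> poll{0}; // 0 = free, nonzero = held (mirrors lk.poll above)
};

static void acquire(rtm_spin_lock *l) {
  // Speculative path: a few RTM attempts, as in __kmp_acquire_rtm_spin_lock.
  for (int retries = 3; retries >= 0; --retries) {
    unsigned status = _xbegin();
    if (status == _XBEGIN_STARTED) {
      // Only *read* the lock word: a committed transaction never writes it,
      // and a concurrent real acquire aborts us via a read-set conflict.
      if (l->poll.load(std::memory_order_relaxed) == 0)
        return;      // still inside the transaction; release() will _xend()
      _xabort(0xff); // lock is held: abort with an explicit code
    }
    if ((status & _XABORT_EXPLICIT) && _XABORT_CODE(status) == 0xff) {
      // Lock was busy: wait until it looks free before retrying speculation.
      while (l->poll.load(std::memory_order_relaxed) != 0) {
      }
    } else if (!(status & _XABORT_RETRY)) {
      break; // abort status says retrying is pointless
    }
  }
  // Fall-back path: ordinary test-and-test-and-set spin lock.
  int expected = 0;
  while (l->poll.load(std::memory_order_relaxed) != 0 ||
         !l->poll.compare_exchange_weak(expected, 1,
                                        std::memory_order_acquire)) {
    expected = 0; // CAS failure overwrote expected with the observed value
  }
}

static void release(rtm_spin_lock *l) {
  if (l->poll.load(std::memory_order_relaxed) == 0)
    _xend(); // lock word untouched, so we are still inside the transaction
  else
    l->poll.store(0, std::memory_order_release); // real unlock
}

int main() {
  rtm_spin_lock l;
  acquire(&l);
  // A syscall here (e.g. printf) aborts an active transaction; execution then
  // rolls back to _xbegin() in acquire() and eventually takes the real lock.
  std::printf("in critical section\n");
  release(&l);
  return 0;
}

In libomp itself, the two new kinds added by the kmp_settings.cpp hunks are requested at run time with KMP_LOCK_KIND=rtm_queuing or KMP_LOCK_KIND=rtm_spin; on hardware without RTM the parser falls back to a non-speculative lock kind, as the else branches in that hunk show.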