Index: runtime/src/kmp.h
===================================================================
--- runtime/src/kmp.h
+++ runtime/src/kmp.h
@@ -1444,6 +1444,8 @@
 /* keeps tracked of threadprivate cache allocations for cleanup later */
 typedef struct kmp_cached_addr {
   void **addr; /* address of allocated cache */
+  void ***compiler_cache; /* pointer to compiler's cache */
+  void *data; /* pointer to global data */
   struct kmp_cached_addr *next; /* pointer to next cached address */
 } kmp_cached_addr_t;
 
@@ -3774,6 +3776,8 @@
 struct private_common *kmp_threadprivate_insert(int gtid, void *pc_addr,
                                                 void *data_addr,
                                                 size_t pc_size);
+void __kmp_threadprivate_resize_cache(int newCapacity);
+void __kmp_cleanup_threadprivate_caches();
 
 // ompc_, kmpc_ entries moved from omp.h.
 #if KMP_OS_WINDOWS
Index: runtime/src/kmp_runtime.cpp
===================================================================
--- runtime/src/kmp_runtime.cpp
+++ runtime/src/kmp_runtime.cpp
@@ -3508,8 +3508,14 @@
    If any argument is negative, the behavior is undefined. */
 static int __kmp_expand_threads(int nNeed) {
   int added = 0;
-  int old_tp_cached;
-  int __kmp_actual_max_nth;
+  int minimumRequiredCapacity;
+  int newCapacity;
+  kmp_info_t **newThreads;
+  kmp_root_t **newRoot;
+
+// All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
+// resizing __kmp_threads does not need additional protection if foreign
+// threads are present
 
 #if KMP_OS_WINDOWS && !defined KMP_DYNAMIC_LIB
   /* only for Windows static library */
@@ -3525,91 +3531,64 @@
   if (nNeed <= 0)
     return added;
 
-  while (1) {
-    int nTarget;
-    int minimumRequiredCapacity;
-    int newCapacity;
-    kmp_info_t **newThreads;
-    kmp_root_t **newRoot;
-
-    // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
-    // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
-    // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
-    // > __kmp_max_nth in one of two ways:
-    //
-    // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
-    // may not be resused by another thread, so we may need to increase
-    // __kmp_threads_capacity to __kmp_max_nth + 1.
-    //
-    // 2) New foreign root(s) are encountered. We always register new foreign
-    // roots. This may cause a smaller # of threads to be allocated at
-    // subsequent parallel regions, but the worker threads hang around (and
-    // eventually go to sleep) and need slots in the __kmp_threads[] array.
-    //
-    // Anyway, that is the reason for moving the check to see if
-    // __kmp_max_nth was exceeded into __kmp_reserve_threads()
-    // instead of having it performed here. -BB
-    old_tp_cached = __kmp_tp_cached;
-    __kmp_actual_max_nth =
-        old_tp_cached ? __kmp_tp_capacity : __kmp_sys_max_nth;
-    KMP_DEBUG_ASSERT(__kmp_actual_max_nth >= __kmp_threads_capacity);
-
-    /* compute expansion headroom to check if we can expand */
-    nTarget = nNeed;
-    if (__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
-      /* possible expansion too small -- give up */
-      break;
-    }
-    minimumRequiredCapacity = __kmp_threads_capacity + nTarget;
+  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
+  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
+  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
+  // > __kmp_max_nth in one of two ways:
+  //
+  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
+  // may not be reused by another thread, so we may need to increase
+  // __kmp_threads_capacity to __kmp_max_nth + 1.
+  //
+  // 2) New foreign root(s) are encountered. We always register new foreign
+  // roots. This may cause a smaller # of threads to be allocated at
+  // subsequent parallel regions, but the worker threads hang around (and
+  // eventually go to sleep) and need slots in the __kmp_threads[] array.
+  //
+  // Anyway, that is the reason for moving the check to see if
+  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
+  // instead of having it performed here. -BB
+
+  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
+
+  /* compute expansion headroom to check if we can expand */
+  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
+    /* possible expansion too small -- give up */
+    return added;
+  }
+  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
 
-    newCapacity = __kmp_threads_capacity;
-    do {
-      newCapacity = newCapacity <= (__kmp_actual_max_nth >> 1)
-                        ? (newCapacity << 1)
-                        : __kmp_actual_max_nth;
-    } while (newCapacity < minimumRequiredCapacity);
-    newThreads = (kmp_info_t **)__kmp_allocate(
-        (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity +
-        CACHE_LINE);
-    newRoot = (kmp_root_t **)((char *)newThreads +
-                              sizeof(kmp_info_t *) * newCapacity);
-    KMP_MEMCPY(newThreads, __kmp_threads,
-               __kmp_threads_capacity * sizeof(kmp_info_t *));
-    KMP_MEMCPY(newRoot, __kmp_root,
-               __kmp_threads_capacity * sizeof(kmp_root_t *));
-    memset(newThreads + __kmp_threads_capacity, 0,
-           (newCapacity - __kmp_threads_capacity) * sizeof(kmp_info_t *));
-    memset(newRoot + __kmp_threads_capacity, 0,
-           (newCapacity - __kmp_threads_capacity) * sizeof(kmp_root_t *));
-
-    if (!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
-      /* __kmp_tp_cached has changed, i.e. __kmpc_threadprivate_cached has
-         allocated a threadprivate cache while we were allocating the expanded
-         array, and our new capacity is larger than the threadprivate cache
-         capacity, so we should deallocate the expanded arrays and try again.
-         This is the first check of a double-check pair. */
-      __kmp_free(newThreads);
-      continue; /* start over and try again */
-    }
+  newCapacity = __kmp_threads_capacity;
+  do {
+    newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
+                                                          : __kmp_sys_max_nth;
+  } while (newCapacity < minimumRequiredCapacity);
+  newThreads = (kmp_info_t **)__kmp_allocate(
+      (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
+  newRoot =
+      (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
+  KMP_MEMCPY(newThreads, __kmp_threads,
+             __kmp_threads_capacity * sizeof(kmp_info_t *));
+  KMP_MEMCPY(newRoot, __kmp_root,
+             __kmp_threads_capacity * sizeof(kmp_root_t *));
+
+  kmp_info_t **temp_threads = __kmp_threads;
+  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
+  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
+  __kmp_free(temp_threads);
+  added += newCapacity - __kmp_threads_capacity;
+  *(volatile int *)&__kmp_threads_capacity = newCapacity;
+
+  if (newCapacity > __kmp_tp_capacity) {
     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
-    if (!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
-      /* Same check as above, but this time with the lock so we can be sure if
-         we can succeed. */
-      __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
-      __kmp_free(newThreads);
-      continue; /* start over and try again */
-    } else {
-      /* success */
-      // __kmp_free( __kmp_threads ); // ATT: It leads to crash. Need to be
-      // investigated.
-      *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
-      *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
-      added += newCapacity - __kmp_threads_capacity;
-      *(volatile int *)&__kmp_threads_capacity = newCapacity;
-      __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
-      break; /* succeeded, so we can exit the loop */
+    if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
+      __kmp_threadprivate_resize_cache(newCapacity);
+    } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
+      *(volatile int *)&__kmp_tp_capacity = newCapacity;
     }
+    __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
   }
+  return added;
 }
 
@@ -7335,6 +7314,8 @@
     __kmp_init_serial = FALSE;
   }
 
+  __kmp_cleanup_threadprivate_caches();
+
   for (f = 0; f < __kmp_threads_capacity; f++) {
     if (__kmp_root[f] != NULL) {
       __kmp_free(__kmp_root[f]);
Index: runtime/src/kmp_threadprivate.cpp
===================================================================
--- runtime/src/kmp_threadprivate.cpp
+++ runtime/src/kmp_threadprivate.cpp
@@ -594,6 +594,13 @@
   return ret;
 }
 
+static kmp_cached_addr_t *__kmp_find_cache(void *data) {
+  kmp_cached_addr_t *ptr = __kmp_threadpriv_cache_list;
+  while (ptr && ptr->data != data)
+    ptr = ptr->next;
+  return ptr;
+}
+
 /*!
 @ingroup THREADPRIVATE
 @param loc source location information
@@ -620,35 +627,40 @@
   if (TCR_PTR(*cache) == 0) {
     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
-    __kmp_tp_cached = 1;
-    __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
+    // Compiler often passes in NULL cache, even if it's already been created
     void **my_cache;
-    KMP_ITT_IGNORE(
-        my_cache = (void **)__kmp_allocate(
-            sizeof(void *) * __kmp_tp_capacity + sizeof(kmp_cached_addr_t)););
-    // No need to zero the allocated memory; __kmp_allocate does that.
-    KC_TRACE(
-        50,
-        ("__kmpc_threadprivate_cached: T#%d allocated cache at address %p\n",
-         global_tid, my_cache));
-
-    /* TODO: free all this memory in __kmp_common_destroy using
-     * __kmp_threadpriv_cache_list */
-    /* Add address of mycache to linked list for cleanup later */
     kmp_cached_addr_t *tp_cache_addr;
-
-    tp_cache_addr = (kmp_cached_addr_t *)&my_cache[__kmp_tp_capacity];
-    tp_cache_addr->addr = my_cache;
-    tp_cache_addr->next = __kmp_threadpriv_cache_list;
-    __kmp_threadpriv_cache_list = tp_cache_addr;
-
+    // Look for an existing cache
+    tp_cache_addr = __kmp_find_cache(data);
+    if (!tp_cache_addr) { // Cache was never created; do it now
+      __kmp_tp_cached = 1;
+      KMP_ITT_IGNORE(my_cache = (void **)__kmp_allocate(
+                         sizeof(void *) * __kmp_tp_capacity +
+                         sizeof(kmp_cached_addr_t)););
+      // No need to zero the allocated memory; __kmp_allocate does that.
+      KC_TRACE(50, ("__kmpc_threadprivate_cached: T#%d allocated cache at "
+                    "address %p\n",
+                    global_tid, my_cache));
+      /* TODO: free all this memory in __kmp_common_destroy using
+       * __kmp_threadpriv_cache_list */
+      /* Add address of mycache to linked list for cleanup later */
+      tp_cache_addr = (kmp_cached_addr_t *)&my_cache[__kmp_tp_capacity];
+      tp_cache_addr->addr = my_cache;
+      tp_cache_addr->data = data;
+      tp_cache_addr->compiler_cache = cache;
+      tp_cache_addr->next = __kmp_threadpriv_cache_list;
+      __kmp_threadpriv_cache_list = tp_cache_addr;
+    } else { // A cache was already created; use it
+      my_cache = tp_cache_addr->addr;
+      tp_cache_addr->compiler_cache = cache;
+    }
     KMP_MB();
 
     TCW_PTR(*cache, my_cache);
+    __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
 
     KMP_MB();
   }
-
   __kmp_release_lock(&__kmp_global_lock, global_tid);
 }
@@ -661,10 +673,66 @@
   KC_TRACE(10,
            ("__kmpc_threadprivate_cached: T#%d exiting; return value = %p\n",
             global_tid, ret));
-
   return ret;
 }
 
+// This function should only be called when both __kmp_tp_cached_lock and
+// kmp_forkjoin_lock are held.
+void __kmp_threadprivate_resize_cache(int newCapacity) {
+  KC_TRACE(10, ("__kmp_threadprivate_resize_cache: called with size: %d\n",
+                newCapacity));
+
+  kmp_cached_addr_t *ptr = __kmp_threadpriv_cache_list;
+
+  while (ptr) {
+    if (ptr->data) { // this location has an active cache; resize it
+      void **my_cache;
+      KMP_ITT_IGNORE(my_cache =
+                         (void **)__kmp_allocate(sizeof(void *) * newCapacity +
+                                                 sizeof(kmp_cached_addr_t)););
+      // No need to zero the allocated memory; __kmp_allocate does that.
+      KC_TRACE(50, ("__kmp_threadprivate_resize_cache: allocated cache at %p\n",
+                    my_cache));
+      // Now copy old cache into new cache
+      void **old_cache = ptr->addr;
+      for (int i = 0; i < __kmp_tp_capacity; ++i) {
+        my_cache[i] = old_cache[i];
+      }
+
+      // Add address of new my_cache to linked list for cleanup later
+      kmp_cached_addr_t *tp_cache_addr;
+      tp_cache_addr = (kmp_cached_addr_t *)&my_cache[newCapacity];
+      tp_cache_addr->addr = my_cache;
+      tp_cache_addr->data = ptr->data;
+      tp_cache_addr->compiler_cache = ptr->compiler_cache;
+      tp_cache_addr->next = __kmp_threadpriv_cache_list;
+      __kmp_threadpriv_cache_list = tp_cache_addr;
+
+      // Copy new cache to compiler's location:
+      // We can only copy directly to (*compiler_cache) after the compiler
+      // guarantees it will keep using the same location for the cache.
+      // Assuming that change makes it into the 19.0 compiler, we can switch to
+      // the following line when 18.0 is no longer supported:
+      // TCW_PTR(*(tp_cache_addr->compiler_cache), my_cache);
+      // Meanwhile, check if compiler_cache is still pointing at the old cache,
+      // and if so, point it at the new cache with an atomic compare&swap
+      // operation.
+      (void)KMP_COMPARE_AND_STORE_PTR(tp_cache_addr->compiler_cache, old_cache,
+                                      my_cache);
+      // If the store doesn't happen here, the compiler's old behavior will
+      // inevitably call __kmpc_threadprivate_cached with a new location for
+      // the cache, and that function will store the resized cache there at
+      // that point.
+
+      // Nullify old cache's data pointer so we skip it next time
+      ptr->data = NULL;
+    }
+    ptr = ptr->next;
+  }
+  // After all caches are resized, update __kmp_tp_capacity to the new size
+  *(volatile int *)&__kmp_tp_capacity = newCapacity;
+}
+
 /*!
 @ingroup THREADPRIVATE
 @param loc source location information
@@ -701,14 +769,30 @@
     d_tn->dt.dtorv = dtor;
     d_tn->is_vec = TRUE;
     d_tn->vec_len = (size_t)vector_length;
-    /*
-            d_tn->obj_init = 0;  // AC: commented out because __kmp_allocate
-       zeroes the memory
-            d_tn->pod_init = 0;
-    */
+    // d_tn->obj_init = 0; // AC: __kmp_allocate zeroes the memory
+    // d_tn->pod_init = 0;
     lnk_tn = &(__kmp_threadprivate_d_table.data[KMP_HASH(data)]);
 
     d_tn->next = *lnk_tn;
     *lnk_tn = d_tn;
   }
 }
+
+void __kmp_cleanup_threadprivate_caches() {
+  kmp_cached_addr_t *ptr = __kmp_threadpriv_cache_list;
+
+  while (ptr) {
+    void **cache = ptr->addr;
+    __kmp_threadpriv_cache_list = ptr->next;
+    if (*ptr->compiler_cache)
+      *ptr->compiler_cache = NULL;
+    ptr->compiler_cache = NULL;
+    ptr->data = NULL;
+    ptr->addr = NULL;
+    ptr->next = NULL;
+    // Threadprivate data pointed at by cache entries are destroyed at end of
+    // __kmp_launch_thread with __kmp_common_destroy_gtid.
+    __kmp_free(cache); // implicitly frees ptr too
+    ptr = __kmp_threadpriv_cache_list;
+  }
+}
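
Note: the growth policy in __kmp_expand_threads and the hand-off of a resized cache to the compiler-visible cache pointer in __kmp_threadprivate_resize_cache can be exercised in isolation. The sketch below is illustrative only and not part of the patch; grow_capacity, publish_cache, kmp_sys_max_nth_sim, and the std::atomic stand-in for KMP_COMPARE_AND_STORE_PTR are names invented for the example, not runtime symbols.

    #include <atomic>
    #include <cassert>
    #include <cstdio>
    #include <vector>

    // Illustrative stand-in for __kmp_sys_max_nth.
    static const int kmp_sys_max_nth_sim = 1 << 16;

    // Mirrors the doubling loop in __kmp_expand_threads: grow the capacity by
    // powers of two until the minimum requirement is met, clamped at the
    // system maximum.
    static int grow_capacity(int capacity, int minimumRequiredCapacity) {
      do {
        capacity = capacity <= (kmp_sys_max_nth_sim >> 1) ? (capacity << 1)
                                                          : kmp_sys_max_nth_sim;
      } while (capacity < minimumRequiredCapacity);
      return capacity;
    }

    // Mirrors the hand-off in __kmp_threadprivate_resize_cache: publish the
    // resized cache only if the compiler-visible slot still holds the old one;
    // otherwise leave it alone and let the next cache-creation call reseed it.
    static void publish_cache(std::atomic<void **> &compiler_cache,
                              void **old_cache, void **new_cache) {
      void **expected = old_cache;
      compiler_cache.compare_exchange_strong(expected, new_cache);
    }

    int main() {
      int capacity = 32;
      capacity = grow_capacity(capacity, capacity + 5); // 32 -> 64
      std::printf("new capacity: %d\n", capacity);

      std::vector<void *> old_cache(32, nullptr), new_cache(64, nullptr);
      std::atomic<void **> compiler_cache(old_cache.data());
      publish_cache(compiler_cache, old_cache.data(), new_cache.data());
      assert(compiler_cache.load() == new_cache.data());
      return 0;
    }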