diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -2504,6 +2504,25 @@ char tt_pad[KMP_PAD(kmp_base_task_team_t, CACHE_LINE)]; }; +// Declarations for custom fast memory allocator +// NOTE: bufsize must be a signed datatype +#if KMP_OS_WINDOWS +#if KMP_ARCH_X86 || KMP_ARCH_ARM +typedef kmp_int32 bufsize; +#else +typedef kmp_int64 bufsize; +#endif +#else +typedef ssize_t bufsize; +#endif // KMP_OS_WINDOWS + +typedef struct kmp_mem_descr { // Memory block descriptor. + void *ptr_allocated; // Pointer returned by malloc(), subject for free(). + size_t size_allocated; // Size of allocated memory block. + void *ptr_aligned; // Pointer to aligned memory, to be used by client code. + size_t size_aligned; // Size of aligned memory block. +} kmp_mem_descr_t; + #if (USE_FAST_MEMORY == 3) || (USE_FAST_MEMORY == 5) // Free lists keep same-size free memory slots for fast memory allocation // routines @@ -2696,6 +2715,11 @@ kmp_base_info_t th; } kmp_info_t; +// some memory allocation wrappers declaration +extern void *kmp_b_alloc(kmp_info_t *th, bufsize s); +extern void kmp_b_free(kmp_info_t *th, void *buf); +extern void kmp_b_dequeue(kmp_info_t *th); + // OpenMP thread team data structures typedef struct kmp_base_data { volatile kmp_uint32 t_value; } kmp_base_data_t; diff --git a/openmp/runtime/src/kmp_alloc.cpp b/openmp/runtime/src/kmp_alloc.cpp --- a/openmp/runtime/src/kmp_alloc.cpp +++ b/openmp/runtime/src/kmp_alloc.cpp @@ -23,18 +23,6 @@ typedef void *(*bget_acquire_t)(size_t); typedef void (*bget_release_t)(void *); -/* NOTE: bufsize must be a signed datatype */ - -#if KMP_OS_WINDOWS -#if KMP_ARCH_X86 || KMP_ARCH_ARM -typedef kmp_int32 bufsize; -#else -typedef kmp_int64 bufsize; -#endif -#else -typedef ssize_t bufsize; -#endif // KMP_OS_WINDOWS - /* The three modes of operation are, fifo search, lifo search, and best-fit */ typedef enum bget_mode { @@ -1734,13 +1722,6 @@ memory leaks, but it may 
be useful for debugging memory corruptions, used freed pointers, etc. */ /* #define LEAK_MEMORY */ -struct kmp_mem_descr { // Memory block descriptor. - void *ptr_allocated; // Pointer returned by malloc(), subject for free(). - size_t size_allocated; // Size of allocated memory block. - void *ptr_aligned; // Pointer to aligned memory, to be used by client code. - size_t size_aligned; // Size of aligned memory block. -}; -typedef struct kmp_mem_descr kmp_mem_descr_t; /* Allocate memory on requested boundary, fill allocated memory with 0x00. NULL is NEVER returned, __kmp_abort() is called in case of memory allocation @@ -1900,216 +1881,12 @@ KE_TRACE(25, ("<- __kmp_free() returns\n")); } // func ___kmp_free -#if USE_FAST_MEMORY == 3 -// Allocate fast memory by first scanning the thread's free lists -// If a chunk the right size exists, grab it off the free list. -// Otherwise allocate normally using kmp_thread_malloc. - -// AC: How to choose the limit? Just get 16 for now... -#define KMP_FREE_LIST_LIMIT 16 - -// Always use 128 bytes for determining buckets for caching memory blocks -#define DCACHE_LINE 128 - -void *___kmp_fast_allocate(kmp_info_t *this_thr, size_t size KMP_SRC_LOC_DECL) { - void *ptr; - size_t num_lines, idx; - int index; - void *alloc_ptr; - size_t alloc_size; - kmp_mem_descr_t *descr; - - KE_TRACE(25, ("-> __kmp_fast_allocate( T#%d, %d ) called from %s:%d\n", - __kmp_gtid_from_thread(this_thr), (int)size KMP_SRC_LOC_PARM)); - - num_lines = (size + DCACHE_LINE - 1) / DCACHE_LINE; - idx = num_lines - 1; - KMP_DEBUG_ASSERT(idx >= 0); - if (idx < 2) { - index = 0; // idx is [ 0, 1 ], use first free list - num_lines = 2; // 1, 2 cache lines or less than cache line - } else if ((idx >>= 2) == 0) { - index = 1; // idx is [ 2, 3 ], use second free list - num_lines = 4; // 3, 4 cache lines - } else if ((idx >>= 2) == 0) { - index = 2; // idx is [ 4, 15 ], use third free list - num_lines = 16; // 5, 6, ..., 16 cache lines - } else if ((idx >>= 2) == 0) { - 
index = 3; // idx is [ 16, 63 ], use fourth free list - num_lines = 64; // 17, 18, ..., 64 cache lines - } else { - goto alloc_call; // 65 or more cache lines ( > 8KB ), don't use free lists - } - - ptr = this_thr->th.th_free_lists[index].th_free_list_self; - if (ptr != NULL) { - // pop the head of no-sync free list - this_thr->th.th_free_lists[index].th_free_list_self = *((void **)ptr); - KMP_DEBUG_ASSERT( - this_thr == - ((kmp_mem_descr_t *)((kmp_uintptr_t)ptr - sizeof(kmp_mem_descr_t))) - ->ptr_aligned); - goto end; - } - ptr = TCR_SYNC_PTR(this_thr->th.th_free_lists[index].th_free_list_sync); - if (ptr != NULL) { - // no-sync free list is empty, use sync free list (filled in by other - // threads only) - // pop the head of the sync free list, push NULL instead - while (!KMP_COMPARE_AND_STORE_PTR( - &this_thr->th.th_free_lists[index].th_free_list_sync, ptr, nullptr)) { - KMP_CPU_PAUSE(); - ptr = TCR_SYNC_PTR(this_thr->th.th_free_lists[index].th_free_list_sync); - } - // push the rest of chain into no-sync free list (can be NULL if there was - // the only block) - this_thr->th.th_free_lists[index].th_free_list_self = *((void **)ptr); - KMP_DEBUG_ASSERT( - this_thr == - ((kmp_mem_descr_t *)((kmp_uintptr_t)ptr - sizeof(kmp_mem_descr_t))) - ->ptr_aligned); - goto end; - } - -alloc_call: - // haven't found block in the free lists, thus allocate it - size = num_lines * DCACHE_LINE; - - alloc_size = size + sizeof(kmp_mem_descr_t) + DCACHE_LINE; - KE_TRACE(25, ("__kmp_fast_allocate: T#%d Calling __kmp_thread_malloc with " - "alloc_size %d\n", - __kmp_gtid_from_thread(this_thr), alloc_size)); - alloc_ptr = bget(this_thr, (bufsize)alloc_size); - - // align ptr to DCACHE_LINE - ptr = (void *)((((kmp_uintptr_t)alloc_ptr) + sizeof(kmp_mem_descr_t) + - DCACHE_LINE) & - ~(DCACHE_LINE - 1)); - descr = (kmp_mem_descr_t *)(((kmp_uintptr_t)ptr) - sizeof(kmp_mem_descr_t)); - - descr->ptr_allocated = alloc_ptr; // remember allocated pointer - // we don't need size_allocated - 
descr->ptr_aligned = (void *)this_thr; // remember allocating thread - // (it is already saved in bget buffer, - // but we may want to use another allocator in future) - descr->size_aligned = size; - -end: - KE_TRACE(25, ("<- __kmp_fast_allocate( T#%d ) returns %p\n", - __kmp_gtid_from_thread(this_thr), ptr)); - return ptr; -} // func __kmp_fast_allocate - -// Free fast memory and place it on the thread's free list if it is of -// the correct size. -void ___kmp_fast_free(kmp_info_t *this_thr, void *ptr KMP_SRC_LOC_DECL) { - kmp_mem_descr_t *descr; - kmp_info_t *alloc_thr; - size_t size; - size_t idx; - int index; - - KE_TRACE(25, ("-> __kmp_fast_free( T#%d, %p ) called from %s:%d\n", - __kmp_gtid_from_thread(this_thr), ptr KMP_SRC_LOC_PARM)); - KMP_ASSERT(ptr != NULL); - - descr = (kmp_mem_descr_t *)(((kmp_uintptr_t)ptr) - sizeof(kmp_mem_descr_t)); +// some memory allocation wrappers definition +void *kmp_b_alloc(kmp_info_t *th, bufsize s) { return bget(th, s); } +void kmp_b_free(kmp_info_t *th, void *buf) { brel(th, buf); } +void kmp_b_dequeue(kmp_info_t *th) { __kmp_bget_dequeue(th); } - KE_TRACE(26, (" __kmp_fast_free: size_aligned=%d\n", - (int)descr->size_aligned)); - - size = descr->size_aligned; // 2, 4, 16, 64, 65, 66, ... 
cache lines - - idx = DCACHE_LINE * 2; // 2 cache lines is minimal size of block - if (idx == size) { - index = 0; // 2 cache lines - } else if ((idx <<= 1) == size) { - index = 1; // 4 cache lines - } else if ((idx <<= 2) == size) { - index = 2; // 16 cache lines - } else if ((idx <<= 2) == size) { - index = 3; // 64 cache lines - } else { - KMP_DEBUG_ASSERT(size > DCACHE_LINE * 64); - goto free_call; // 65 or more cache lines ( > 8KB ) - } - - alloc_thr = (kmp_info_t *)descr->ptr_aligned; // get thread owning the block - if (alloc_thr == this_thr) { - // push block to self no-sync free list, linking previous head (LIFO) - *((void **)ptr) = this_thr->th.th_free_lists[index].th_free_list_self; - this_thr->th.th_free_lists[index].th_free_list_self = ptr; - } else { - void *head = this_thr->th.th_free_lists[index].th_free_list_other; - if (head == NULL) { - // Create new free list - this_thr->th.th_free_lists[index].th_free_list_other = ptr; - *((void **)ptr) = NULL; // mark the tail of the list - descr->size_allocated = (size_t)1; // head of the list keeps its length - } else { - // need to check existed "other" list's owner thread and size of queue - kmp_mem_descr_t *dsc = - (kmp_mem_descr_t *)((char *)head - sizeof(kmp_mem_descr_t)); - // allocating thread, same for all queue nodes - kmp_info_t *q_th = (kmp_info_t *)(dsc->ptr_aligned); - size_t q_sz = - dsc->size_allocated + 1; // new size in case we add current task - if (q_th == alloc_thr && q_sz <= KMP_FREE_LIST_LIMIT) { - // we can add current task to "other" list, no sync needed - *((void **)ptr) = head; - descr->size_allocated = q_sz; - this_thr->th.th_free_lists[index].th_free_list_other = ptr; - } else { - // either queue blocks owner is changing or size limit exceeded - // return old queue to allocating thread (q_th) synchronously, - // and start new list for alloc_thr's tasks - void *old_ptr; - void *tail = head; - void *next = *((void **)head); - while (next != NULL) { - KMP_DEBUG_ASSERT( - // queue 
size should decrease by 1 each step through the list - ((kmp_mem_descr_t *)((char *)next - sizeof(kmp_mem_descr_t))) - ->size_allocated + - 1 == - ((kmp_mem_descr_t *)((char *)tail - sizeof(kmp_mem_descr_t))) - ->size_allocated); - tail = next; // remember tail node - next = *((void **)next); - } - KMP_DEBUG_ASSERT(q_th != NULL); - // push block to owner's sync free list - old_ptr = TCR_PTR(q_th->th.th_free_lists[index].th_free_list_sync); - /* the next pointer must be set before setting free_list to ptr to avoid - exposing a broken list to other threads, even for an instant. */ - *((void **)tail) = old_ptr; - - while (!KMP_COMPARE_AND_STORE_PTR( - &q_th->th.th_free_lists[index].th_free_list_sync, old_ptr, head)) { - KMP_CPU_PAUSE(); - old_ptr = TCR_PTR(q_th->th.th_free_lists[index].th_free_list_sync); - *((void **)tail) = old_ptr; - } - - // start new list of not-selt tasks - this_thr->th.th_free_lists[index].th_free_list_other = ptr; - *((void **)ptr) = NULL; - descr->size_allocated = (size_t)1; // head of queue keeps its length - } - } - } - goto end; - -free_call: - KE_TRACE(25, ("__kmp_fast_free: T#%d Calling __kmp_thread_free for size %d\n", - __kmp_gtid_from_thread(this_thr), size)); - __kmp_bget_dequeue(this_thr); /* Release any queued buffers */ - brel(this_thr, descr->ptr_allocated); - -end: - KE_TRACE(25, ("<- __kmp_fast_free() returns\n")); - -} // func __kmp_fast_free +#if USE_FAST_MEMORY == 3 // Initialize the thread free lists related to fast memory // Only do this when a thread is initially created. diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp --- a/openmp/runtime/src/kmp_tasking.cpp +++ b/openmp/runtime/src/kmp_tasking.cpp @@ -1453,6 +1453,226 @@ sizeof_shareds, task_entry); } +#if USE_FAST_MEMORY == 3 +// Allocate fast memory by first scanning the thread's free lists +// If a chunk the right size exists, grab it off the free list. +// Otherwise allocate normally using kmp_thread_malloc. 
+ +// AC: How to choose the limit? Just get 16 for now... +#define KMP_FREE_LIST_LIMIT 16 + +// Always use 128 bytes for determining buckets for caching memory blocks +#define DCACHE_LINE 128 + +void *___kmp_fast_allocate(kmp_info_t *this_thr, size_t size KMP_SRC_LOC_DECL) { + void *ptr; + size_t num_lines, idx; + int index; + void *alloc_ptr; + size_t alloc_size; + kmp_mem_descr_t *descr; + + KE_TRACE(25, ("-> __kmp_fast_allocate( T#%d, %d ) called from %s:%d\n", + __kmp_gtid_from_thread(this_thr), (int)size KMP_SRC_LOC_PARM)); + + num_lines = (size + DCACHE_LINE - 1) / DCACHE_LINE; + idx = num_lines - 1; + KMP_DEBUG_ASSERT(idx >= 0); + if (idx < 2) { + index = 0; // idx is [ 0, 1 ], use first free list + num_lines = 2; // 1, 2 cache lines or less than cache line + } else if ((idx >>= 2) == 0) { + index = 1; // idx is [ 2, 3 ], use second free list + num_lines = 4; // 3, 4 cache lines + } else if ((idx >>= 2) == 0) { + index = 2; // idx is [ 4, 15 ], use third free list + num_lines = 16; // 5, 6, ..., 16 cache lines + } else if ((idx >>= 2) == 0) { + index = 3; // idx is [ 16, 63 ], use fourth free list + num_lines = 64; // 17, 18, ..., 64 cache lines + } else { + goto alloc_call; // 65 or more cache lines ( > 8KB ), don't use free lists + } + + ptr = this_thr->th.th_free_lists[index].th_free_list_self; + if (ptr != NULL) { + // pop the head of no-sync free list + this_thr->th.th_free_lists[index].th_free_list_self = *((void **)ptr); + KMP_DEBUG_ASSERT( + this_thr == + ((kmp_mem_descr_t *)((kmp_uintptr_t)ptr - sizeof(kmp_mem_descr_t))) + ->ptr_aligned); + goto end; + } + ptr = TCR_SYNC_PTR(this_thr->th.th_free_lists[index].th_free_list_sync); + if (ptr != NULL) { + // no-sync free list is empty, use sync free list (filled in by other + // threads only) + // pop the head of the sync free list, push NULL instead + while (!KMP_COMPARE_AND_STORE_PTR( + &this_thr->th.th_free_lists[index].th_free_list_sync, ptr, nullptr)) { + KMP_CPU_PAUSE(); + ptr = 
TCR_SYNC_PTR(this_thr->th.th_free_lists[index].th_free_list_sync); + } + // push the rest of chain into no-sync free list (can be NULL if there was + // the only block) + this_thr->th.th_free_lists[index].th_free_list_self = *((void **)ptr); + KMP_DEBUG_ASSERT( + this_thr == + ((kmp_mem_descr_t *)((kmp_uintptr_t)ptr - sizeof(kmp_mem_descr_t))) + ->ptr_aligned); + goto end; + } + +alloc_call: + // haven't found block in the free lists, thus allocate it + size = num_lines * DCACHE_LINE; + + alloc_size = size + sizeof(kmp_mem_descr_t) + DCACHE_LINE; + KE_TRACE(25, ("__kmp_fast_allocate: T#%d Calling __kmp_thread_malloc with " + "alloc_size %d\n", + __kmp_gtid_from_thread(this_thr), alloc_size)); +#if INTEL_PRIVATE +// AC: replace tbbmalloc with BGET for big blocks until the issues with +// 358.botsalgn test is fixed: +// CQ180918: slowdown, +// CQ292223: memory consumption, +// CQ291050: unneeded allocations. +// alloc_ptr = ___kmp_thread_malloc( this_thr, alloc_size KMP_SRC_LOC_PARM ); +#endif /* INTEL_PRIVATE */ + alloc_ptr = kmp_b_alloc(this_thr, (bufsize)alloc_size); + + // align ptr to DCACHE_LINE + ptr = (void *)((((kmp_uintptr_t)alloc_ptr) + sizeof(kmp_mem_descr_t) + + DCACHE_LINE) & + ~(DCACHE_LINE - 1)); + descr = (kmp_mem_descr_t *)(((kmp_uintptr_t)ptr) - sizeof(kmp_mem_descr_t)); + + descr->ptr_allocated = alloc_ptr; // remember allocated pointer + // we don't need size_allocated + descr->ptr_aligned = (void *)this_thr; // remember allocating thread + // (it is already saved in bget buffer, + // but we may want to use another allocator in future) + descr->size_aligned = size; + +end: + KE_TRACE(25, ("<- __kmp_fast_allocate( T#%d ) returns %p\n", + __kmp_gtid_from_thread(this_thr), ptr)); + return ptr; +} // func __kmp_fast_allocate + +// Free fast memory and place it on the thread's free list if it is of +// the correct size. 
+void ___kmp_fast_free(kmp_info_t *this_thr, void *ptr KMP_SRC_LOC_DECL) { + kmp_mem_descr_t *descr; + kmp_info_t *alloc_thr; + size_t size; + size_t idx; + int index; + + KE_TRACE(25, ("-> __kmp_fast_free( T#%d, %p ) called from %s:%d\n", + __kmp_gtid_from_thread(this_thr), ptr KMP_SRC_LOC_PARM)); + KMP_ASSERT(ptr != NULL); + + descr = (kmp_mem_descr_t *)(((kmp_uintptr_t)ptr) - sizeof(kmp_mem_descr_t)); + + KE_TRACE(26, (" __kmp_fast_free: size_aligned=%d\n", + (int)descr->size_aligned)); + + size = descr->size_aligned; // 2, 4, 16, 64, 65, 66, ... cache lines + + idx = DCACHE_LINE * 2; // 2 cache lines is minimal size of block + if (idx == size) { + index = 0; // 2 cache lines + } else if ((idx <<= 1) == size) { + index = 1; // 4 cache lines + } else if ((idx <<= 2) == size) { + index = 2; // 16 cache lines + } else if ((idx <<= 2) == size) { + index = 3; // 64 cache lines + } else { + KMP_DEBUG_ASSERT(size > DCACHE_LINE * 64); + goto free_call; // 65 or more cache lines ( > 8KB ) + } + + alloc_thr = (kmp_info_t *)descr->ptr_aligned; // get thread owning the block + if (alloc_thr == this_thr) { + // push block to self no-sync free list, linking previous head (LIFO) + *((void **)ptr) = this_thr->th.th_free_lists[index].th_free_list_self; + this_thr->th.th_free_lists[index].th_free_list_self = ptr; + } else { + void *head = this_thr->th.th_free_lists[index].th_free_list_other; + if (head == NULL) { + // Create new free list + this_thr->th.th_free_lists[index].th_free_list_other = ptr; + *((void **)ptr) = NULL; // mark the tail of the list + descr->size_allocated = (size_t)1; // head of the list keeps its length + } else { + // need to check existed "other" list's owner thread and size of queue + kmp_mem_descr_t *dsc = + (kmp_mem_descr_t *)((char *)head - sizeof(kmp_mem_descr_t)); + // allocating thread, same for all queue nodes + kmp_info_t *q_th = (kmp_info_t *)(dsc->ptr_aligned); + size_t q_sz = + dsc->size_allocated + 1; // new size in case we add current task + 
      if (q_th == alloc_thr && q_sz <= KMP_FREE_LIST_LIMIT) { + // we can add current task to "other" list, no sync needed + *((void **)ptr) = head; + descr->size_allocated = q_sz; + this_thr->th.th_free_lists[index].th_free_list_other = ptr; + } else { + // either queue blocks owner is changing or size limit exceeded + // return old queue to allocating thread (q_th) synchronously, + // and start new list for alloc_thr's tasks + void *old_ptr; + void *tail = head; + void *next = *((void **)head); + while (next != NULL) { + KMP_DEBUG_ASSERT( + // queue size should decrease by 1 each step through the list + ((kmp_mem_descr_t *)((char *)next - sizeof(kmp_mem_descr_t))) + ->size_allocated + + 1 == + ((kmp_mem_descr_t *)((char *)tail - sizeof(kmp_mem_descr_t))) + ->size_allocated); + tail = next; // remember tail node + next = *((void **)next); + } + KMP_DEBUG_ASSERT(q_th != NULL); + // push block to owner's sync free list + old_ptr = TCR_PTR(q_th->th.th_free_lists[index].th_free_list_sync); + /* the next pointer must be set before setting free_list to ptr to avoid + exposing a broken list to other threads, even for an instant. */ + *((void **)tail) = old_ptr; + + while (!KMP_COMPARE_AND_STORE_PTR( + &q_th->th.th_free_lists[index].th_free_list_sync, old_ptr, head)) { + KMP_CPU_PAUSE(); + old_ptr = TCR_PTR(q_th->th.th_free_lists[index].th_free_list_sync); + *((void **)tail) = old_ptr; + } + + // start new list of not-self tasks + this_thr->th.th_free_lists[index].th_free_list_other = ptr; + *((void **)ptr) = NULL; + descr->size_allocated = (size_t)1; // head of queue keeps its length + } + } + } + goto end; + +free_call: + KE_TRACE(25, ("__kmp_fast_free: T#%d Calling __kmp_thread_free for size %d\n", + __kmp_gtid_from_thread(this_thr), size)); + kmp_b_dequeue(this_thr); /* Release any queued buffers */ + kmp_b_free(this_thr, descr->ptr_allocated); + +end: + KE_TRACE(25, ("<- __kmp_fast_free() returns\n")); + +} // func __kmp_fast_free +#endif // USE_FAST_MEMORY + + /*! 
@ingroup TASKING @param loc_ref location of the original task directive