Index: llvm/CMakeLists.txt
===================================================================
--- llvm/CMakeLists.txt
+++ llvm/CMakeLists.txt
@@ -544,6 +544,16 @@
 option (LLVM_VERSION_PRINTER_SHOW_HOST_TARGET_INFO
   "Show target and host info when tools are invoked with --version." ON)
 
+option(LLVM_ENABLE_RPMALLOC "Replace the Windows CRT allocator with rpmalloc. Only works with /MT enabled." OFF)
+if(LLVM_ENABLE_RPMALLOC)
+  if (CMAKE_BUILD_TYPE AND NOT ${LLVM_USE_CRT_${uppercase_CMAKE_BUILD_TYPE}} MATCHES "^(MT|MTd)$")
+    message(FATAL_ERROR "LLVM_ENABLE_RPMALLOC only works with /MT or /MTd. Use LLVM_USE_CRT_${uppercase_CMAKE_BUILD_TYPE} to set the appropriate option.")
+  endif()
+  if (LLVM_USE_SANITIZER)
+    message(FATAL_ERROR "LLVM_ENABLE_RPMALLOC cannot be used along with LLVM_USE_SANITIZER!")
+  endif()
+endif()
+
 # You can configure which libraries from LLVM you want to include in the
 # shared library by setting LLVM_DYLIB_COMPONENTS to a semi-colon delimited
 # list of LLVM components. All component names handled by llvm-config are valid.
Index: llvm/include/llvm/Config/config.h.cmake
===================================================================
--- llvm/include/llvm/Config/config.h.cmake
+++ llvm/include/llvm/Config/config.h.cmake
@@ -347,4 +347,7 @@
 /* Whether Timers signpost passes in Xcode Instruments */
 #cmakedefine01 LLVM_SUPPORT_XCODE_SIGNPOSTS
 
+/* When enabled, overrides the Windows CRT allocator with rpmalloc. */
+#cmakedefine01 LLVM_ENABLE_RPMALLOC
+
 #endif
Index: llvm/lib/Support/CMakeLists.txt
===================================================================
--- llvm/lib/Support/CMakeLists.txt
+++ llvm/lib/Support/CMakeLists.txt
@@ -51,6 +51,12 @@
   set(Z3_LINK_FILES "")
 endif()
 
+if(LLVM_ENABLE_RPMALLOC)
+  set(RPMALLOC_FILES rpmalloc/rpmalloc.c)
+else()
+  set(RPMALLOC_FILES "")
+endif()
+
 add_llvm_component_library(LLVMSupport
   AArch64TargetParser.cpp
   ABIBreak.cpp
@@ -163,6 +169,8 @@
   xxhash.cpp
   Z3Solver.cpp
 
+  ${RPMALLOC_FILES}
+
 # System
   Atomic.cpp
   DynamicLibrary.cpp
Index: llvm/lib/Support/Windows/Memory.inc
===================================================================
--- llvm/lib/Support/Windows/Memory.inc
+++ llvm/lib/Support/Windows/Memory.inc
@@ -11,6 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/Config/config.h"
 #include "llvm/Support/DataTypes.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Process.h"
@@ -198,3 +199,74 @@
 
 } // namespace sys
 } // namespace llvm
+
+#if defined(_MT) && !defined(_DLL) && !defined(_DEBUG) && LLVM_ENABLE_RPMALLOC
+
+#include "../rpmalloc/rpmalloc.h"
+
+// This function is registered as a TLS callback and is called before anything
+// else in the process. Its purpose is to set up the memory page size if the
+// environment variable LLVM_RPMALLOC_PAGESIZE is provided.
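+//
+// A minimal usage sketch (illustrative values, assuming a build configured
+// with -DLLVM_ENABLE_RPMALLOC=ON and a static CRT, e.g.
+// -DLLVM_USE_CRT_RELEASE=MT):
+//
+//   > set LLVM_RPMALLOC_PAGESIZE=2M
+//   > clang-cl.exe ...
+//
+// A value of "2M" makes the callback below request 2 MiB pages with
+// enable_huge_pages set; "1G" requests 1 GiB pages; "4k", any other value, or
+// an unset variable keeps rpmalloc's default 4 KiB pages. Note that large
+// pages on Windows additionally require the SeLockMemoryPrivilege ("Lock
+// pages in memory") for the user running the process.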
+static void NTAPI InitializeRPMalloc(PVOID DllHandle, DWORD dwReason, PVOID) {
+  if (dwReason == DLL_PROCESS_ATTACH) {
+    rpmalloc_config_t Config;
+    memset(&Config, 0, sizeof(rpmalloc_config_t));
+
+    char PageSizeStr[16];
+    // Use a Win32 API call because ::getenv uses the CRT, and at this point
+    // the CRT isn't initialized yet.
+    DWORD R = ::GetEnvironmentVariableA("LLVM_RPMALLOC_PAGESIZE", PageSizeStr,
+                                        sizeof(PageSizeStr));
+    if (R > 0) {
+      llvm::StringRef AsRef(PageSizeStr);
+      if (AsRef.startswith_lower("4k")) {
+        // RPMalloc already defaults to 4k pages.
+      } else if (AsRef.startswith_lower("2M")) {
+        Config.page_size = 2 * 1024 * 1024;
+        Config.enable_huge_pages = 1;
+      } else if (AsRef.startswith_lower("1G")) {
+        Config.page_size = 1024 * 1024 * 1024;
+        Config.enable_huge_pages = 1;
+      }
+    }
+    rpmalloc_initialize_config(&Config);
+  } else if (dwReason == DLL_PROCESS_DETACH) {
+    rpmalloc_finalize();
+  }
+}
+
+// Ensure the symbols are not dropped by the linker.
+#ifdef _WIN64
+// _tls_used is defined in %VCToolsInstallDir%\crt\src\vcruntime\tlssup.cpp.
+#pragma comment(linker, "/INCLUDE:_tls_used")
+#pragma comment(linker, "/INCLUDE:InitRPMallocTLSCallback")
+#else
+#pragma comment(linker, "/INCLUDE:__tls_used")
+#pragma comment(linker, "/INCLUDE:_InitRPMallocTLSCallback")
+#endif
+
+// Add our callback to the CRT TLS callbacks list.
+#ifdef _WIN64
+#pragma const_seg(".CRT$XLF")
+EXTERN_C const
+#else
+#pragma data_seg(".CRT$XLF")
+EXTERN_C
+#endif
+    PIMAGE_TLS_CALLBACK InitRPMallocTLSCallback = InitializeRPMalloc;
+#ifdef _WIN64
+#pragma const_seg()
+#else
+#pragma data_seg()
+#endif //_WIN64
+
+// Bypass the CRT debug allocator.
+#ifdef _DEBUG
+void *operator new(std::size_t n) noexcept(false) { return malloc(n); }
+void __CRTDECL operator delete(void *const block) noexcept { free(block); }
+
+void *operator new[](std::size_t s) noexcept(false) { return malloc(s); }
+void operator delete[](void *p) noexcept { free(p); }
+#endif
+
+#endif // _MT && !_DLL && !_DEBUG && LLVM_ENABLE_RPMALLOC
Index: llvm/lib/Support/rpmalloc/LICENSE
===================================================================
--- /dev/null
+++ llvm/lib/Support/rpmalloc/LICENSE
@@ -0,0 +1,24 @@
+This is free and unencumbered software released into the public domain.
+
+Anyone is free to copy, modify, publish, use, compile, sell, or
+distribute this software, either in source code form or as a compiled
+binary, for any purpose, commercial or non-commercial, and by any
+means.
+
+In jurisdictions that recognize copyright laws, the author or authors
+of this software dedicate any and all copyright interest in the
+software to the public domain. We make this dedication for the benefit
+of the public at large and to the detriment of our heirs and
+successors. We intend this dedication to be an overt act of
+relinquishment in perpetuity of all present and future rights to this
+software under copyright law.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+ +For more information, please refer to Index: llvm/lib/Support/rpmalloc/malloc.c =================================================================== --- /dev/null +++ llvm/lib/Support/rpmalloc/malloc.c @@ -0,0 +1,371 @@ +/* malloc.c - Memory allocator - Public Domain - 2016 Mattias Jansson + * + * This library provides a cross-platform lock free thread caching malloc implementation in C11. + * The latest source code is always available at + * + * https://github.com/mjansson/rpmalloc + * + * This library is put in the public domain; you can redistribute it and/or modify it without any restrictions. + * + */ + +// +// This file provides overrides for the standard library malloc entry points for C and new/delete operators for C++ +// It also provides automatic initialization/finalization of process and threads +// + +#ifndef ARCH_64BIT +# if defined(__LLP64__) || defined(__LP64__) || defined(_WIN64) +# define ARCH_64BIT 1 +_Static_assert(sizeof(size_t) == 8, "Data type size mismatch"); +_Static_assert(sizeof(void*) == 8, "Data type size mismatch"); +# else +# define ARCH_64BIT 0 +_Static_assert(sizeof(size_t) == 4, "Data type size mismatch"); +_Static_assert(sizeof(void*) == 4, "Data type size mismatch"); +# endif +#endif + +#if (defined(__GNUC__) || defined(__clang__)) && !defined(__MACH__) +#pragma GCC visibility push(default) +#endif + +#define USE_IMPLEMENT 1 +#define USE_INTERPOSE 0 +#define USE_ALIAS 0 + +#if defined(__APPLE__) && ENABLE_PRELOAD +#undef USE_INTERPOSE +#define USE_INTERPOSE 1 + +typedef struct interpose_t { + void* new_func; + void* orig_func; +} interpose_t; + +#define MAC_INTERPOSE_PAIR(newf, oldf) { (void*)newf, (void*)oldf } +#define MAC_INTERPOSE_SINGLE(newf, oldf) \ +__attribute__((used)) static const interpose_t macinterpose##newf##oldf \ +__attribute__ ((section("__DATA, __interpose"))) = MAC_INTERPOSE_PAIR(newf, oldf) + +#endif + +#if !defined(_WIN32) && !USE_INTERPOSE +#undef USE_IMPLEMENT +#undef USE_ALIAS +#define USE_IMPLEMENT 0 +#define USE_ALIAS 1 +#endif + +#ifdef _MSC_VER +#pragma warning (disable : 4100) +#undef malloc +#undef free +#undef calloc +#endif + +#if ENABLE_OVERRIDE + +#if USE_IMPLEMENT + +extern inline void* RPMALLOC_CDECL malloc(size_t size) { return rpmalloc(size); } +extern inline void* RPMALLOC_CDECL calloc(size_t count, size_t size) { return rpcalloc(count, size); } +extern inline void* RPMALLOC_CDECL realloc(void* ptr, size_t size) { return rprealloc(ptr, size); } +extern inline void* RPMALLOC_CDECL reallocf(void* ptr, size_t size) { return rprealloc(ptr, size); } +extern inline void* RPMALLOC_CDECL aligned_alloc(size_t alignment, size_t size) { return rpaligned_alloc(alignment, size); } +extern inline void* RPMALLOC_CDECL memalign(size_t alignment, size_t size) { return rpmemalign(alignment, size); } +extern inline int RPMALLOC_CDECL posix_memalign(void** memptr, size_t alignment, size_t size) { return rpposix_memalign(memptr, alignment, size); } +extern inline void RPMALLOC_CDECL free(void* ptr) { rpfree(ptr); } +extern inline void RPMALLOC_CDECL cfree(void* ptr) { rpfree(ptr); } +extern inline size_t RPMALLOC_CDECL malloc_usable_size(void* ptr) { return rpmalloc_usable_size(ptr); } +extern inline size_t RPMALLOC_CDECL malloc_size(void* ptr) { return rpmalloc_usable_size(ptr); } + +// Overload the C++ operators using the mangled names (https://itanium-cxx-abi.github.io/cxx-abi/abi.html#mangling) +// operators delete and delete[] +extern void _ZdlPv(void* p); void _ZdlPv(void* p) { rpfree(p); } +extern void _ZdaPv(void* p); void 
_ZdaPv(void* p) { rpfree(p); } +#if ARCH_64BIT +// 64-bit operators new and new[], normal and aligned +extern void* _Znwm(uint64_t size); void* _Znwm(uint64_t size) { return rpmalloc(size); } +extern void* _Znam(uint64_t size); void* _Znam(uint64_t size) { return rpmalloc(size); } +extern void* _Znwmm(uint64_t size, uint64_t align); void* _Znwmm(uint64_t size, uint64_t align) { return rpaligned_alloc(align, size); } +extern void* _Znamm(uint64_t size, uint64_t align); void* _Znamm(uint64_t size, uint64_t align) { return rpaligned_alloc(align, size); } +#else +// 32-bit operators new and new[], normal and aligned +extern void* _Znwj(uint32_t size); void* _Znwj(uint32_t size) { return rpmalloc(size); } +extern void* _Znaj(uint32_t size); void* _Znaj(uint32_t size) { return rpmalloc(size); } +extern void* _Znwjj(uint64_t size, uint64_t align); void* _Znwjj(uint64_t size, uint64_t align) { return rpaligned_alloc(align, size); } +extern void* _Znajj(uint64_t size, uint64_t align); void* _Znajj(uint64_t size, uint64_t align) { return rpaligned_alloc(align, size); } +#endif + +#endif + +#if USE_INTERPOSE + +__attribute__((used)) static const interpose_t macinterpose_malloc[] +__attribute__ ((section("__DATA, __interpose"))) = { + //new and new[] + MAC_INTERPOSE_PAIR(rpmalloc, _Znwm), + MAC_INTERPOSE_PAIR(rpmalloc, _Znam), + //delete and delete[] + MAC_INTERPOSE_PAIR(rpfree, _ZdlPv), + MAC_INTERPOSE_PAIR(rpfree, _ZdaPv), + MAC_INTERPOSE_PAIR(rpmalloc, malloc), + MAC_INTERPOSE_PAIR(rpmalloc, calloc), + MAC_INTERPOSE_PAIR(rprealloc, realloc), + MAC_INTERPOSE_PAIR(rprealloc, reallocf), + MAC_INTERPOSE_PAIR(rpaligned_alloc, aligned_alloc), + MAC_INTERPOSE_PAIR(rpmemalign, memalign), + MAC_INTERPOSE_PAIR(rpposix_memalign, posix_memalign), + MAC_INTERPOSE_PAIR(rpfree, free), + MAC_INTERPOSE_PAIR(rpfree, cfree), + MAC_INTERPOSE_PAIR(rpmalloc_usable_size, malloc_usable_size), + MAC_INTERPOSE_PAIR(rpmalloc_usable_size, malloc_size) +}; + +#endif + +#if USE_ALIAS + +#define RPALIAS(fn) __attribute__((alias(#fn), used, visibility("default"))); + +// Alias the C++ operators using the mangled names (https://itanium-cxx-abi.github.io/cxx-abi/abi.html#mangling) + +// operators delete and delete[] +void _ZdlPv(void* p) RPALIAS(rpfree) +void _ZdaPv(void* p) RPALIAS(rpfree) + +#if ARCH_64BIT +// 64-bit operators new and new[], normal and aligned +void* _Znwm(uint64_t size) RPALIAS(rpmalloc) +void* _Znam(uint64_t size) RPALIAS(rpmalloc) +extern inline void* _Znwmm(uint64_t size, uint64_t align) { return rpaligned_alloc(align, size); } +extern inline void* _Znamm(uint64_t size, uint64_t align) { return rpaligned_alloc(align, size); } +#else +// 32-bit operators new and new[], normal and aligned +void* _Znwj(uint32_t size) RPALIAS(rpmalloc) +void* _Znaj(uint32_t size) RPALIAS(rpmalloc) +extern inline void* _Znwjj(uint32_t size, uint32_t align) { return rpaligned_alloc(align, size); } +extern inline void* _Znajj(uint32_t size, uint32_t align) { return rpaligned_alloc(align, size); } +#endif + +void* malloc(size_t size) RPALIAS(rpmalloc) +void* calloc(size_t count, size_t size) RPALIAS(rpcalloc) +void* realloc(void* ptr, size_t size) RPALIAS(rprealloc) +void* reallocf(void* ptr, size_t size) RPALIAS(rprealloc) +void* aligned_alloc(size_t alignment, size_t size) RPALIAS(rpaligned_alloc) +void* memalign(size_t alignment, size_t size) RPALIAS(rpmemalign) +int posix_memalign(void** memptr, size_t alignment, size_t size) RPALIAS(rpposix_memalign) +void free(void* ptr) RPALIAS(rpfree) +void cfree(void* ptr) RPALIAS(rpfree) 
+size_t malloc_usable_size(void* ptr) RPALIAS(rpmalloc_usable_size) +size_t malloc_size(void* ptr) RPALIAS(rpmalloc_usable_size) + +#endif + +#if defined(__GLIBC__) && defined(__linux__) + +extern inline void* RPMALLOC_CDECL +reallocarray(void* ptr, size_t count, size_t size) { + size_t total; +#if ENABLE_VALIDATE_ARGS +#ifdef _MSC_VER + int err = SizeTMult(count, size, &total); + if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#else + int err = __builtin_umull_overflow(count, size, &total); + if (err || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#endif +#else + total = count * size; +#endif + return realloc(ptr, total); +} + +extern inline void* RPMALLOC_CDECL +valloc(size_t size) { + get_thread_heap(); + if (!size) + size = _memory_page_size; + size_t total_size = size + _memory_page_size; +#if ENABLE_VALIDATE_ARGS + if (total_size < size) { + errno = EINVAL; + return 0; + } +#endif + void* buffer = rpmalloc(total_size); + if ((uintptr_t)buffer & (_memory_page_size - 1)) + return (void*)(((uintptr_t)buffer & ~(_memory_page_size - 1)) + _memory_page_size); + return buffer; +} + +extern inline void* RPMALLOC_CDECL +pvalloc(size_t size) { + get_thread_heap(); + size_t aligned_size = size; + if (aligned_size % _memory_page_size) + aligned_size = (1 + (aligned_size / _memory_page_size)) * _memory_page_size; +#if ENABLE_VALIDATE_ARGS + if (aligned_size < size) { + errno = EINVAL; + return 0; + } +#endif + return valloc(size); +} + +#endif + +#endif // ENABLE_OVERRIDE + +#if ENABLE_PRELOAD + +#ifdef _WIN32 + +#if defined(BUILD_DYNAMIC_LINK) && BUILD_DYNAMIC_LINK + +__declspec(dllexport) BOOL WINAPI +DllMain(HINSTANCE instance, DWORD reason, LPVOID reserved) { + (void)sizeof(reserved); + (void)sizeof(instance); + if (reason == DLL_PROCESS_ATTACH) + rpmalloc_initialize(); + else if (reason == DLL_PROCESS_DETACH) + rpmalloc_finalize(); + else if (reason == DLL_THREAD_ATTACH) + rpmalloc_thread_initialize(); + else if (reason == DLL_THREAD_DETACH) + rpmalloc_thread_finalize(); + return TRUE; +} + +#endif + +#else + +#include +#include +#include +#include + +static pthread_key_t destructor_key; + +static void +thread_destructor(void*); + +static void __attribute__((constructor)) +initializer(void) { + rpmalloc_initialize(); + pthread_key_create(&destructor_key, thread_destructor); +} + +static void __attribute__((destructor)) +finalizer(void) { + rpmalloc_finalize(); +} + +typedef struct { + void* (*real_start)(void*); + void* real_arg; +} thread_starter_arg; + +static void* +thread_starter(void* argptr) { + thread_starter_arg* arg = argptr; + void* (*real_start)(void*) = arg->real_start; + void* real_arg = arg->real_arg; + rpmalloc_thread_initialize(); + rpfree(argptr); + pthread_setspecific(destructor_key, (void*)1); + return (*real_start)(real_arg); +} + +static void +thread_destructor(void* value) { + (void)sizeof(value); + rpmalloc_thread_finalize(); +} + +#ifdef __APPLE__ + +static int +pthread_create_proxy(pthread_t* thread, + const pthread_attr_t* attr, + void* (*start_routine)(void*), + void* arg) { + rpmalloc_initialize(); + thread_starter_arg* starter_arg = rpmalloc(sizeof(thread_starter_arg)); + starter_arg->real_start = start_routine; + starter_arg->real_arg = arg; + return pthread_create(thread, attr, thread_starter, starter_arg); +} + +MAC_INTERPOSE_SINGLE(pthread_create_proxy, pthread_create); + +#else + +#include + +int +pthread_create(pthread_t* thread, + const pthread_attr_t* attr, + void* (*start_routine)(void*), + void* 
arg) { +#if defined(__linux__) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__APPLE__) || defined(__HAIKU__) + char fname[] = "pthread_create"; +#else + char fname[] = "_pthread_create"; +#endif + void* real_pthread_create = dlsym(RTLD_NEXT, fname); + rpmalloc_thread_initialize(); + thread_starter_arg* starter_arg = rpmalloc(sizeof(thread_starter_arg)); + starter_arg->real_start = start_routine; + starter_arg->real_arg = arg; + return (*(int (*)(pthread_t*, const pthread_attr_t*, void* (*)(void*), void*))real_pthread_create)(thread, attr, thread_starter, starter_arg); +} + +#endif + +#endif + +#endif + +#if ENABLE_OVERRIDE + +#if defined(__GLIBC__) && defined(__linux__) + +void* __libc_malloc(size_t size) RPALIAS(rpmalloc) +void* __libc_calloc(size_t count, size_t size) RPALIAS(rpcalloc) +void* __libc_realloc(void* p, size_t size) RPALIAS(rprealloc) +void __libc_free(void* p) RPALIAS(rpfree) +void __libc_cfree(void* p) RPALIAS(rpfree) +void* __libc_memalign(size_t align, size_t size) RPALIAS(rpmemalign) +int __posix_memalign(void** p, size_t align, size_t size) RPALIAS(rpposix_memalign) + +extern void* __libc_valloc(size_t size); +extern void* __libc_pvalloc(size_t size); + +void* +__libc_valloc(size_t size) { + return valloc(size); +} + +void* +__libc_pvalloc(size_t size) { + return pvalloc(size); +} + +#endif + +#endif + +#if (defined(__GNUC__) || defined(__clang__)) && !defined(__MACH__) +#pragma GCC visibility pop +#endif Index: llvm/lib/Support/rpmalloc/rpmalloc.h =================================================================== --- /dev/null +++ llvm/lib/Support/rpmalloc/rpmalloc.h @@ -0,0 +1,263 @@ +/* rpmalloc.h - Memory allocator - Public Domain - 2016 Mattias Jansson + * + * This library provides a cross-platform lock free thread caching malloc implementation in C11. + * The latest source code is always available at + * + * https://github.com/mjansson/rpmalloc + * + * This library is put in the public domain; you can redistribute it and/or modify it without any restrictions. + * + */ + +#pragma once + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(__clang__) || defined(__GNUC__) +# define RPMALLOC_EXPORT __attribute__((visibility("default"))) +# define RPMALLOC_ALLOCATOR +# define RPMALLOC_ATTRIB_MALLOC __attribute__((__malloc__)) +# if defined(__clang_major__) && (__clang_major__ < 4) +# define RPMALLOC_ATTRIB_ALLOC_SIZE(size) +# define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size) +# else +# define RPMALLOC_ATTRIB_ALLOC_SIZE(size) __attribute__((alloc_size(size))) +# define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size) __attribute__((alloc_size(count, size))) +# endif +# define RPMALLOC_CDECL +#elif defined(_MSC_VER) +# define RPMALLOC_EXPORT +# define RPMALLOC_ALLOCATOR __declspec(allocator) __declspec(restrict) +# define RPMALLOC_ATTRIB_MALLOC +# define RPMALLOC_ATTRIB_ALLOC_SIZE(size) +# define RPMALLOC_ATTRIB_ALLOC_SIZE2(count,size) +# define RPMALLOC_CDECL __cdecl +#else +# define RPMALLOC_EXPORT +# define RPMALLOC_ALLOCATOR +# define RPMALLOC_ATTRIB_MALLOC +# define RPMALLOC_ATTRIB_ALLOC_SIZE(size) +# define RPMALLOC_ATTRIB_ALLOC_SIZE2(count,size) +# define RPMALLOC_CDECL +#endif + +//! Define RPMALLOC_CONFIGURABLE to enable configuring sizes +#ifndef RPMALLOC_CONFIGURABLE +#define RPMALLOC_CONFIGURABLE 1 +#endif + +//! Flag to rpaligned_realloc to not preserve content in reallocation +#define RPMALLOC_NO_PRESERVE 1 + +typedef struct rpmalloc_global_statistics_t { + //! 
Current amount of virtual memory mapped, all of which might not have been committed (only if ENABLE_STATISTICS=1) + size_t mapped; + //! Peak amount of virtual memory mapped, all of which might not have been committed (only if ENABLE_STATISTICS=1) + size_t mapped_peak; + //! Current amount of memory in global caches for small and medium sizes (<32KiB) + size_t cached; + //! Current amount of memory allocated in huge allocations, i.e larger than LARGE_SIZE_LIMIT which is 2MiB by default (only if ENABLE_STATISTICS=1) + size_t huge_alloc; + //! Peak amount of memory allocated in huge allocations, i.e larger than LARGE_SIZE_LIMIT which is 2MiB by default (only if ENABLE_STATISTICS=1) + size_t huge_alloc_peak; + //! Total amount of memory mapped since initialization (only if ENABLE_STATISTICS=1) + size_t mapped_total; + //! Total amount of memory unmapped since initialization (only if ENABLE_STATISTICS=1) + size_t unmapped_total; +} rpmalloc_global_statistics_t; + +typedef struct rpmalloc_thread_statistics_t { + //! Current number of bytes available in thread size class caches for small and medium sizes (<32KiB) + size_t sizecache; + //! Current number of bytes available in thread span caches for small and medium sizes (<32KiB) + size_t spancache; + //! Total number of bytes transitioned from thread cache to global cache (only if ENABLE_STATISTICS=1) + size_t thread_to_global; + //! Total number of bytes transitioned from global cache to thread cache (only if ENABLE_STATISTICS=1) + size_t global_to_thread; + //! Per span count statistics (only if ENABLE_STATISTICS=1) + struct { + //! Currently used number of spans + size_t current; + //! High water mark of spans used + size_t peak; + //! Number of spans transitioned to global cache + size_t to_global; + //! Number of spans transitioned from global cache + size_t from_global; + //! Number of spans transitioned to thread cache + size_t to_cache; + //! Number of spans transitioned from thread cache + size_t from_cache; + //! Number of spans transitioned to reserved state + size_t to_reserved; + //! Number of spans transitioned from reserved state + size_t from_reserved; + //! Number of raw memory map calls (not hitting the reserve spans but resulting in actual OS mmap calls) + size_t map_calls; + } span_use[32]; + //! Per size class statistics (only if ENABLE_STATISTICS=1) + struct { + //! Current number of allocations + size_t alloc_current; + //! Peak number of allocations + size_t alloc_peak; + //! Total number of allocations + size_t alloc_total; + //! Total number of frees + size_t free_total; + //! Number of spans transitioned to cache + size_t spans_to_cache; + //! Number of spans transitioned from cache + size_t spans_from_cache; + //! Number of spans transitioned from reserved state + size_t spans_from_reserved; + //! Number of raw memory map calls (not hitting the reserve spans but resulting in actual OS mmap calls) + size_t map_calls; + } size_use[128]; +} rpmalloc_thread_statistics_t; + +typedef struct rpmalloc_config_t { + //! Map memory pages for the given number of bytes. The returned address MUST be + // aligned to the rpmalloc span size, which will always be a power of two. + // Optionally the function can store an alignment offset in the offset variable + // in case it performs alignment and the returned pointer is offset from the + // actual start of the memory region due to this alignment. The alignment offset + // will be passed to the memory unmap function. 
The alignment offset MUST NOT be + // larger than 65535 (storable in an uint16_t), if it is you must use natural + // alignment to shift it into 16 bits. If you set a memory_map function, you + // must also set a memory_unmap function or else the default implementation will + // be used for both. + void* (*memory_map)(size_t size, size_t* offset); + //! Unmap the memory pages starting at address and spanning the given number of bytes. + // If release is set to non-zero, the unmap is for an entire span range as returned by + // a previous call to memory_map and that the entire range should be released. The + // release argument holds the size of the entire span range. If release is set to 0, + // the unmap is a partial decommit of a subset of the mapped memory range. + // If you set a memory_unmap function, you must also set a memory_map function or + // else the default implementation will be used for both. + void (*memory_unmap)(void* address, size_t size, size_t offset, size_t release); + //! Size of memory pages. The page size MUST be a power of two. All memory mapping + // requests to memory_map will be made with size set to a multiple of the page size. + // Used if RPMALLOC_CONFIGURABLE is defined to 1, otherwise system page size is used. + size_t page_size; + //! Size of a span of memory blocks. MUST be a power of two, and in [4096,262144] + // range (unless 0 - set to 0 to use the default span size). Used if RPMALLOC_CONFIGURABLE + // is defined to 1. + size_t span_size; + //! Number of spans to map at each request to map new virtual memory blocks. This can + // be used to minimize the system call overhead at the cost of virtual memory address + // space. The extra mapped pages will not be written until actually used, so physical + // committed memory should not be affected in the default implementation. Will be + // aligned to a multiple of spans that match memory page size in case of huge pages. + size_t span_map_count; + //! Enable use of large/huge pages. If this flag is set to non-zero and page size is + // zero, the allocator will try to enable huge pages and auto detect the configuration. + // If this is set to non-zero and page_size is also non-zero, the allocator will + // assume huge pages have been configured and enabled prior to initializing the + // allocator. + // For Windows, see https://docs.microsoft.com/en-us/windows/desktop/memory/large-page-support + // For Linux, see https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt + int enable_huge_pages; +} rpmalloc_config_t; + +//! Initialize allocator with default configuration +RPMALLOC_EXPORT int +rpmalloc_initialize(void); + +//! Initialize allocator with given configuration +RPMALLOC_EXPORT int +rpmalloc_initialize_config(const rpmalloc_config_t* config); + +//! Get allocator configuration +RPMALLOC_EXPORT const rpmalloc_config_t* +rpmalloc_config(void); + +//! Finalize allocator +RPMALLOC_EXPORT void +rpmalloc_finalize(void); + +//! Initialize allocator for calling thread +RPMALLOC_EXPORT void +rpmalloc_thread_initialize(void); + +//! Finalize allocator for calling thread +RPMALLOC_EXPORT void +rpmalloc_thread_finalize(void); + +//! Perform deferred deallocations pending for the calling thread heap +RPMALLOC_EXPORT void +rpmalloc_thread_collect(void); + +//! Query if allocator is initialized for calling thread +RPMALLOC_EXPORT int +rpmalloc_is_thread_initialized(void); + +//! Get per-thread statistics +RPMALLOC_EXPORT void +rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats); + +//! 
Get global statistics +RPMALLOC_EXPORT void +rpmalloc_global_statistics(rpmalloc_global_statistics_t* stats); + +//! Dump all statistics in human readable format to file (should be a FILE*) +RPMALLOC_EXPORT void +rpmalloc_dump_statistics(void* file); + +//! Allocate a memory block of at least the given size +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* +rpmalloc(size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(1); + +//! Free the given memory block +RPMALLOC_EXPORT void +rpfree(void* ptr); + +//! Allocate a memory block of at least the given size and zero initialize it +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* +rpcalloc(size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE2(1, 2); + +//! Reallocate the given block to at least the given size +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* +rprealloc(void* ptr, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(2); + +//! Reallocate the given block to at least the given size and alignment, +// with optional control flags (see RPMALLOC_NO_PRESERVE). +// Alignment must be a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. A caveat of rpmalloc +// internals is that this must also be strictly less than the span size (default 64KiB) +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* +rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize, unsigned int flags) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(3); + +//! Allocate a memory block of at least the given size and alignment. +// Alignment must be a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. A caveat of rpmalloc +// internals is that this must also be strictly less than the span size (default 64KiB) +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* +rpaligned_alloc(size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(2); + +//! Allocate a memory block of at least the given size and alignment. +// Alignment must be a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. A caveat of rpmalloc +// internals is that this must also be strictly less than the span size (default 64KiB) +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* +rpmemalign(size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(2); + +//! Allocate a memory block of at least the given size and alignment. +// Alignment must be a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. A caveat of rpmalloc +// internals is that this must also be strictly less than the span size (default 64KiB) +RPMALLOC_EXPORT int +rpposix_memalign(void **memptr, size_t alignment, size_t size); + +//! Query the usable size of the given memory block (from given pointer to the end of block) +RPMALLOC_EXPORT size_t +rpmalloc_usable_size(void* ptr); + +#ifdef __cplusplus +} +#endif Index: llvm/lib/Support/rpmalloc/rpmalloc.c =================================================================== --- /dev/null +++ llvm/lib/Support/rpmalloc/rpmalloc.c @@ -0,0 +1,2602 @@ +/* rpmalloc.c - Memory allocator - Public Domain - 2016 Mattias Jansson + * + * This library provides a cross-platform lock free thread caching malloc implementation in C11. + * The latest source code is always available at + * + * https://github.com/mjansson/rpmalloc + * + * This library is put in the public domain; you can redistribute it and/or modify it without any restrictions. 
+ * + */ + +#include "rpmalloc.h" + +/// Build time configurable limits +#ifndef HEAP_ARRAY_SIZE +//! Size of heap hashmap +#define HEAP_ARRAY_SIZE 47 +#endif +#ifndef ENABLE_THREAD_CACHE +//! Enable per-thread cache +#define ENABLE_THREAD_CACHE 1 +#endif +#ifndef ENABLE_GLOBAL_CACHE +//! Enable global cache shared between all threads, requires thread cache +#define ENABLE_GLOBAL_CACHE 1 +#endif +#ifndef ENABLE_VALIDATE_ARGS +//! Enable validation of args to public entry points +#define ENABLE_VALIDATE_ARGS 0 +#endif +#ifndef ENABLE_STATISTICS +//! Enable statistics collection +#define ENABLE_STATISTICS 0 +#endif +#ifndef ENABLE_ASSERTS +//! Enable asserts +#define ENABLE_ASSERTS 0 +#endif +#ifndef ENABLE_OVERRIDE +//! Override standard library malloc/free and new/delete entry points +#define ENABLE_OVERRIDE 1 +#endif +#ifndef ENABLE_PRELOAD +//! Support preloading +#define ENABLE_PRELOAD 1 +#endif +#ifndef DISABLE_UNMAP +//! Disable unmapping memory pages +#define DISABLE_UNMAP 0 +#endif +#ifndef DEFAULT_SPAN_MAP_COUNT +//! Default number of spans to map in call to map more virtual memory (default values yield 4MiB here) +#define DEFAULT_SPAN_MAP_COUNT 64 +#endif + +#if ENABLE_THREAD_CACHE +#ifndef ENABLE_UNLIMITED_CACHE +//! Unlimited thread and global cache +#define ENABLE_UNLIMITED_CACHE 0 +#endif +#ifndef ENABLE_UNLIMITED_THREAD_CACHE +//! Unlimited cache disables any thread cache limitations +#define ENABLE_UNLIMITED_THREAD_CACHE ENABLE_UNLIMITED_CACHE +#endif +#if !ENABLE_UNLIMITED_THREAD_CACHE +#ifndef THREAD_CACHE_MULTIPLIER +//! Multiplier for thread cache (cache limit will be span release count multiplied by this value) +#define THREAD_CACHE_MULTIPLIER 16 +#endif +#ifndef ENABLE_ADAPTIVE_THREAD_CACHE +//! Enable adaptive size of per-thread cache (still bounded by THREAD_CACHE_MULTIPLIER hard limit) +#define ENABLE_ADAPTIVE_THREAD_CACHE 0 +#endif +#endif +#endif + +#if ENABLE_GLOBAL_CACHE && ENABLE_THREAD_CACHE +#ifndef ENABLE_UNLIMITED_GLOBAL_CACHE +//! Unlimited cache disables any global cache limitations +#define ENABLE_UNLIMITED_GLOBAL_CACHE ENABLE_UNLIMITED_CACHE +#endif +#if !ENABLE_UNLIMITED_GLOBAL_CACHE +//! 
Multiplier for global cache (cache limit will be span release count multiplied by this value) +#define GLOBAL_CACHE_MULTIPLIER (THREAD_CACHE_MULTIPLIER * 6) +#endif +#else +# undef ENABLE_GLOBAL_CACHE +# define ENABLE_GLOBAL_CACHE 0 +#endif + +#if !ENABLE_THREAD_CACHE || ENABLE_UNLIMITED_THREAD_CACHE +# undef ENABLE_ADAPTIVE_THREAD_CACHE +# define ENABLE_ADAPTIVE_THREAD_CACHE 0 +#endif + +#if DISABLE_UNMAP && !ENABLE_GLOBAL_CACHE +# error Must use global cache if unmap is disabled +#endif + +#if defined( _WIN32 ) || defined( __WIN32__ ) || defined( _WIN64 ) +# define PLATFORM_WINDOWS 1 +# define PLATFORM_POSIX 0 +#else +# define PLATFORM_WINDOWS 0 +# define PLATFORM_POSIX 1 +#endif + +/// Platform and arch specifics +#if defined(_MSC_VER) && !defined(__clang__) +# define FORCEINLINE inline __forceinline +# define _Static_assert static_assert +#else +# ifndef FORCEINLINE +# define FORCEINLINE inline __attribute__((__always_inline__)) +# endif +#endif +#if PLATFORM_WINDOWS +# ifndef WIN32_LEAN_AND_MEAN +# define WIN32_LEAN_AND_MEAN +# endif +# include +# if ENABLE_VALIDATE_ARGS +# include +# endif +#else +# include +# include +# include +# if defined(__APPLE__) +# include +# include +# include +# endif +# if defined(__HAIKU__) +# include +# include +# endif +#endif + +#include +#include + +#if ENABLE_ASSERTS +# undef NDEBUG +# if defined(_MSC_VER) && !defined(_DEBUG) +# define _DEBUG +# endif +# include +#else +# undef assert +# define assert(x) do {} while(0) +#endif +#if ENABLE_STATISTICS +# include +#endif + +/// Atomic access abstraction +#if defined(_MSC_VER) && !defined(__clang__) + +typedef volatile long atomic32_t; +typedef volatile long long atomic64_t; +typedef volatile void* atomicptr_t; + +#define atomic_thread_fence_acquire() +#define atomic_thread_fence_release() + +static FORCEINLINE int32_t atomic_load32(atomic32_t* src) { return *src; } +static FORCEINLINE void atomic_store32(atomic32_t* dst, int32_t val) { *dst = val; } +static FORCEINLINE int32_t atomic_incr32(atomic32_t* val) { return (int32_t)_InterlockedExchangeAdd(val, 1) + 1; } +#if ENABLE_STATISTICS || ENABLE_ADAPTIVE_THREAD_CACHE +static FORCEINLINE int32_t atomic_decr32(atomic32_t* val) { return (int32_t)_InterlockedExchangeAdd(val, -1) - 1; } +#endif +static FORCEINLINE int32_t atomic_add32(atomic32_t* val, int32_t add) { return (int32_t)_InterlockedExchangeAdd(val, add) + add; } +static FORCEINLINE void* atomic_load_ptr(atomicptr_t* src) { return (void*)*src; } +static FORCEINLINE void atomic_store_ptr(atomicptr_t* dst, void* val) { *dst = val; } +# if defined(__LLP64__) || defined(__LP64__) || defined(_WIN64) +static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { return (_InterlockedCompareExchange64((volatile long long*)dst, (long long)val, (long long)ref) == (long long)ref) ? 1 : 0; } +#else +static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { return (_InterlockedCompareExchange((volatile long*)dst, (long)val, (long)ref) == (long)ref) ? 
1 : 0; } +#endif + +#define EXPECTED(x) (x) +#define UNEXPECTED(x) (x) + +#else + +#include + +typedef volatile _Atomic(int32_t) atomic32_t; +typedef volatile _Atomic(int64_t) atomic64_t; +typedef volatile _Atomic(void*) atomicptr_t; + +#define atomic_thread_fence_acquire() atomic_thread_fence(memory_order_acquire) +#define atomic_thread_fence_release() atomic_thread_fence(memory_order_release) + +static FORCEINLINE int32_t atomic_load32(atomic32_t* src) { return atomic_load_explicit(src, memory_order_relaxed); } +static FORCEINLINE void atomic_store32(atomic32_t* dst, int32_t val) { atomic_store_explicit(dst, val, memory_order_relaxed); } +static FORCEINLINE int32_t atomic_incr32(atomic32_t* val) { return atomic_fetch_add_explicit(val, 1, memory_order_relaxed) + 1; } +#if ENABLE_STATISTICS || ENABLE_ADAPTIVE_THREAD_CACHE +static FORCEINLINE int32_t atomic_decr32(atomic32_t* val) { return atomic_fetch_add_explicit(val, -1, memory_order_relaxed) - 1; } +#endif +static FORCEINLINE int32_t atomic_add32(atomic32_t* val, int32_t add) { return atomic_fetch_add_explicit(val, add, memory_order_relaxed) + add; } +static FORCEINLINE void* atomic_load_ptr(atomicptr_t* src) { return atomic_load_explicit(src, memory_order_relaxed); } +static FORCEINLINE void atomic_store_ptr(atomicptr_t* dst, void* val) { atomic_store_explicit(dst, val, memory_order_relaxed); } +static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { return atomic_compare_exchange_weak_explicit(dst, &ref, val, memory_order_release, memory_order_acquire); } + +#define EXPECTED(x) __builtin_expect((x), 1) +#define UNEXPECTED(x) __builtin_expect((x), 0) + +#endif + +/// Preconfigured limits and sizes +//! Granularity of a small allocation block +#define SMALL_GRANULARITY 16 +//! Small granularity shift count +#define SMALL_GRANULARITY_SHIFT 4 +//! Number of small block size classes +#define SMALL_CLASS_COUNT 65 +//! Maximum size of a small block +#define SMALL_SIZE_LIMIT (SMALL_GRANULARITY * (SMALL_CLASS_COUNT - 1)) +//! Granularity of a medium allocation block +#define MEDIUM_GRANULARITY 512 +//! Medium granularity shift count +#define MEDIUM_GRANULARITY_SHIFT 9 +//! Number of medium block size classes +#define MEDIUM_CLASS_COUNT 61 +//! Total number of small + medium size classes +#define SIZE_CLASS_COUNT (SMALL_CLASS_COUNT + MEDIUM_CLASS_COUNT) +//! Number of large block size classes +#define LARGE_CLASS_COUNT 32 +//! Maximum size of a medium block +#define MEDIUM_SIZE_LIMIT (SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY * MEDIUM_CLASS_COUNT)) +//! Maximum size of a large block +#define LARGE_SIZE_LIMIT ((LARGE_CLASS_COUNT * _memory_span_size) - SPAN_HEADER_SIZE) +//! Size of a span header (must be a multiple of SMALL_GRANULARITY) +#define SPAN_HEADER_SIZE 96 +//! ABA protection size in orhpan heap list (also becomes limit of smallest page size) +#define HEAP_ORPHAN_ABA_SIZE 512 + +#if ENABLE_VALIDATE_ARGS +//! Maximum allocation size to avoid integer overflow +#undef MAX_ALLOC_SIZE +#define MAX_ALLOC_SIZE (((size_t)-1) - _memory_span_size) +#endif + +#define pointer_offset(ptr, ofs) (void*)((char*)(ptr) + (ptrdiff_t)(ofs)) +#define pointer_diff(first, second) (ptrdiff_t)((const char*)(first) - (const char*)(second)) + +#define INVALID_POINTER ((void*)((uintptr_t)-1)) + +/// Data types +//! A memory heap, per thread +typedef struct heap_t heap_t; +//! Heap spans per size class +typedef struct heap_class_t heap_class_t; +//! Span of memory pages +typedef struct span_t span_t; +//! 
Span list +typedef struct span_list_t span_list_t; +//! Span active data +typedef struct span_active_t span_active_t; +//! Size class definition +typedef struct size_class_t size_class_t; +//! Global cache +typedef struct global_cache_t global_cache_t; + +//! Flag indicating span is the first (master) span of a split superspan +#define SPAN_FLAG_MASTER 1U +//! Flag indicating span is a secondary (sub) span of a split superspan +#define SPAN_FLAG_SUBSPAN 2U +//! Flag indicating span has blocks with increased alignment +#define SPAN_FLAG_ALIGNED_BLOCKS 4U + +#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS +struct span_use_t { + //! Current number of spans used (actually used, not in cache) + atomic32_t current; + //! High water mark of spans used + uint32_t high; +#if ENABLE_STATISTICS + //! Number of spans transitioned to global cache + uint32_t spans_to_global; + //! Number of spans transitioned from global cache + uint32_t spans_from_global; + //! Number of spans transitioned to thread cache + uint32_t spans_to_cache; + //! Number of spans transitioned from thread cache + uint32_t spans_from_cache; + //! Number of spans transitioned to reserved state + uint32_t spans_to_reserved; + //! Number of spans transitioned from reserved state + uint32_t spans_from_reserved; + //! Number of raw memory map calls + uint32_t spans_map_calls; +#endif +}; +typedef struct span_use_t span_use_t; +#endif + +#if ENABLE_STATISTICS +struct size_class_use_t { + //! Current number of allocations + atomic32_t alloc_current; + //! Peak number of allocations + int32_t alloc_peak; + //! Total number of allocations + int32_t alloc_total; + //! Total number of frees + atomic32_t free_total; + //! Number of spans in use + uint32_t spans_current; + //! Number of spans transitioned to cache + uint32_t spans_peak; + //! Number of spans transitioned to cache + uint32_t spans_to_cache; + //! Number of spans transitioned from cache + uint32_t spans_from_cache; + //! Number of spans transitioned from reserved state + uint32_t spans_from_reserved; + //! Number of spans mapped + uint32_t spans_map_calls; +}; +typedef struct size_class_use_t size_class_use_t; +#endif + +typedef enum span_state_t { + SPAN_STATE_ACTIVE = 0, + SPAN_STATE_PARTIAL, + SPAN_STATE_FULL +} span_state_t; + +//A span can either represent a single span of memory pages with size declared by span_map_count configuration variable, +//or a set of spans in a continuous region, a super span. Any reference to the term "span" usually refers to both a single +//span or a super span. A super span can further be divided into multiple spans (or this, super spans), where the first +//(super)span is the master and subsequent (super)spans are subspans. The master span keeps track of how many subspans +//that are still alive and mapped in virtual memory, and once all subspans and master have been unmapped the entire +//superspan region is released and unmapped (on Windows for example, the entire superspan range has to be released +//in the same call to release the virtual memory range, but individual subranges can be decommitted individually +//to reduce physical memory use). +struct span_t { + //! Free list + void* free_list; + //! State + uint32_t state; + //! Used count when not active (not including deferred free list) + uint32_t used_count; + //! Block count + uint32_t block_count; + //! Size class + uint32_t size_class; + //! Index of last block initialized in free list + uint32_t free_list_limit; + //! 
Span list size when part of a cache list, or size of deferred free list when partial/full + uint32_t list_size; + //! Deferred free list + atomicptr_t free_list_deferred; + //! Size of a block + uint32_t block_size; + //! Flags and counters + uint32_t flags; + //! Number of spans + uint32_t span_count; + //! Total span counter for master spans, distance for subspans + uint32_t total_spans_or_distance; + //! Remaining span counter, for master spans + atomic32_t remaining_spans; + //! Alignment offset + uint32_t align_offset; + //! Owning heap + heap_t* heap; + //! Next span + span_t* next; + //! Previous span + span_t* prev; +}; +_Static_assert(sizeof(span_t) <= SPAN_HEADER_SIZE, "span size mismatch"); + +struct heap_class_t { + //! Free list of active span + void* free_list; + //! Double linked list of partially used spans with free blocks for each size class. + // Current active span is at head of list. Previous span pointer in head points to tail span of list. + span_t* partial_span; +}; + +struct heap_t { + //! Active and semi-used span data per size class + heap_class_t span_class[SIZE_CLASS_COUNT]; +#if ENABLE_THREAD_CACHE + //! List of free spans (single linked list) + span_t* span_cache[LARGE_CLASS_COUNT]; + //! List of deferred free spans of class 0 (single linked list) + atomicptr_t span_cache_deferred; +#endif +#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS + //! Current and high water mark of spans used per span count + span_use_t span_use[LARGE_CLASS_COUNT]; +#endif + //! Mapped but unused spans + span_t* span_reserve; + //! Master span for mapped but unused spans + span_t* span_reserve_master; + //! Number of mapped but unused spans + size_t spans_reserved; + //! Next heap in id list + heap_t* next_heap; + //! Next heap in orphan list + heap_t* next_orphan; + //! Memory pages alignment offset + size_t align_offset; + //! Heap ID + int32_t id; +#if ENABLE_STATISTICS + //! Number of bytes transitioned thread -> global + size_t thread_to_global; + //! Number of bytes transitioned global -> thread + size_t global_to_thread; + //! Allocation stats per size class + size_class_use_t size_class_use[SIZE_CLASS_COUNT + 1]; +#endif + //! Master heap owning the memory pages + heap_t* master_heap; +}; + +struct size_class_t { + //! Size of blocks in this class + uint32_t block_size; + //! Number of blocks in each chunk + uint16_t block_count; + //! Class index this class is merged with + uint16_t class_idx; +}; +_Static_assert(sizeof(size_class_t) == 8, "Size class size mismatch"); + +struct global_cache_t { + //! Cache list pointer + atomicptr_t cache; + //! Cache size + atomic32_t size; + //! ABA counter + atomic32_t counter; +}; + +/// Global data +//! Initialized flag +static int _rpmalloc_initialized; +//! Configuration +static rpmalloc_config_t _memory_config; +//! Memory page size +static size_t _memory_page_size; +//! Shift to divide by page size +static size_t _memory_page_size_shift; +//! Granularity at which memory pages are mapped by OS +static size_t _memory_map_granularity; +#if RPMALLOC_CONFIGURABLE +//! Size of a span of memory pages +static size_t _memory_span_size; +//! Shift to divide by span size +static size_t _memory_span_size_shift; +//! Mask to get to start of a memory span +static uintptr_t _memory_span_mask; +#else +//! Hardwired span size (64KiB) +#define _memory_span_size (64 * 1024) +#define _memory_span_size_shift 16 +#define _memory_span_mask (~((uintptr_t)(_memory_span_size - 1))) +#endif +//! 
Number of spans to map in each map call +static size_t _memory_span_map_count; +//! Number of spans to release from thread cache to global cache (single spans) +static size_t _memory_span_release_count; +//! Number of spans to release from thread cache to global cache (large multiple spans) +static size_t _memory_span_release_count_large; +//! Global size classes +static size_class_t _memory_size_class[SIZE_CLASS_COUNT]; +//! Run-time size limit of medium blocks +static size_t _memory_medium_size_limit; +//! Heap ID counter +static atomic32_t _memory_heap_id; +//! Huge page support +static int _memory_huge_pages; +#if ENABLE_GLOBAL_CACHE +//! Global span cache +static global_cache_t _memory_span_cache[LARGE_CLASS_COUNT]; +#endif +//! All heaps +static atomicptr_t _memory_heaps[HEAP_ARRAY_SIZE]; +//! Orphaned heaps +static atomicptr_t _memory_orphan_heaps; +//! Running orphan counter to avoid ABA issues in linked list +static atomic32_t _memory_orphan_counter; +#if ENABLE_STATISTICS +//! Active heap count +static atomic32_t _memory_active_heaps; +//! Number of currently mapped memory pages +static atomic32_t _mapped_pages; +//! Peak number of concurrently mapped memory pages +static int32_t _mapped_pages_peak; +//! Number of currently unused spans +static atomic32_t _reserved_spans; +//! Running counter of total number of mapped memory pages since start +static atomic32_t _mapped_total; +//! Running counter of total number of unmapped memory pages since start +static atomic32_t _unmapped_total; +//! Number of currently mapped memory pages in OS calls +static atomic32_t _mapped_pages_os; +//! Number of currently allocated pages in huge allocations +static atomic32_t _huge_pages_current; +//! Peak number of currently allocated pages in huge allocations +static int32_t _huge_pages_peak; +#endif + +//! Current thread heap +#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD +static pthread_key_t _memory_thread_heap; +#else +# ifdef _MSC_VER +# define _Thread_local __declspec(thread) +# define TLS_MODEL +# else +# define TLS_MODEL __attribute__((tls_model("initial-exec"))) +# if !defined(__clang__) && defined(__GNUC__) +# define _Thread_local __thread +# endif +# endif +static _Thread_local heap_t* _memory_thread_heap TLS_MODEL; +#endif + +static inline heap_t* +get_thread_heap_raw(void) { +#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD + return pthread_getspecific(_memory_thread_heap); +#else + return _memory_thread_heap; +#endif +} + +//! Get the current thread heap +static inline heap_t* +get_thread_heap(void) { + heap_t* heap = get_thread_heap_raw(); +#if ENABLE_PRELOAD + if (EXPECTED(heap != 0)) + return heap; + rpmalloc_initialize(); + return get_thread_heap_raw(); +#else + return heap; +#endif +} + +//! Set the current thread heap +static void +set_thread_heap(heap_t* heap) { +#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD + pthread_setspecific(_memory_thread_heap, heap); +#else + _memory_thread_heap = heap; +#endif +} + +//! Default implementation to map more virtual memory +static void* +_memory_map_os(size_t size, size_t* offset); + +//! Default implementation to unmap virtual memory +static void +_memory_unmap_os(void* address, size_t size, size_t offset, size_t release); + +#if ENABLE_ASSERTS +//! 
Lookup a memory heap from heap ID +static heap_t* +_memory_heap_lookup(int32_t id) { + uint32_t list_idx = id % HEAP_ARRAY_SIZE; + heap_t* heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); + while (heap && (heap->id != id)) + heap = heap->next_heap; + return heap; +} +#endif + +#if ENABLE_STATISTICS +# define _memory_statistics_inc(counter, value) counter += value +# define _memory_statistics_dec(counter, value) counter -= value +# define _memory_statistics_add(atomic_counter, value) atomic_add32(atomic_counter, (int32_t)(value)) +# define _memory_statistics_add_peak(atomic_counter, value, peak) do { int32_t _cur_count = atomic_add32(atomic_counter, (int32_t)(value)); if (_cur_count > (peak)) peak = _cur_count; } while (0) +# define _memory_statistics_sub(atomic_counter, value) atomic_add32(atomic_counter, -(int32_t)(value)) +# define _memory_statistics_inc_alloc(heap, class_idx) do { \ + int32_t alloc_current = atomic_incr32(&heap->size_class_use[class_idx].alloc_current); \ + if (alloc_current > heap->size_class_use[class_idx].alloc_peak) \ + heap->size_class_use[class_idx].alloc_peak = alloc_current; \ + heap->size_class_use[class_idx].alloc_total++; \ +} while(0) +# define _memory_statistics_inc_free(heap, class_idx) do { \ + atomic_decr32(&heap->size_class_use[class_idx].alloc_current); \ + atomic_incr32(&heap->size_class_use[class_idx].free_total); \ +} while(0) +#else +# define _memory_statistics_inc(counter, value) do {} while(0) +# define _memory_statistics_dec(counter, value) do {} while(0) +# define _memory_statistics_add(atomic_counter, value) do {} while(0) +# define _memory_statistics_add_peak(atomic_counter, value, peak) do {} while (0) +# define _memory_statistics_sub(atomic_counter, value) do {} while(0) +# define _memory_statistics_inc_alloc(heap, class_idx) do {} while(0) +# define _memory_statistics_inc_free(heap, class_idx) do {} while(0) +#endif + +static void +_memory_heap_cache_insert(heap_t* heap, span_t* span); + +//! Map more virtual memory +static void* +_memory_map(size_t size, size_t* offset) { + assert(!(size % _memory_page_size)); + assert(size >= _memory_page_size); + _memory_statistics_add_peak(&_mapped_pages, (size >> _memory_page_size_shift), _mapped_pages_peak); + _memory_statistics_add(&_mapped_total, (size >> _memory_page_size_shift)); + return _memory_config.memory_map(size, offset); +} + +//! Unmap virtual memory +static void +_memory_unmap(void* address, size_t size, size_t offset, size_t release) { + assert(!release || (release >= size)); + assert(!release || (release >= _memory_page_size)); + if (release) { + assert(!(release % _memory_page_size)); + _memory_statistics_sub(&_mapped_pages, (release >> _memory_page_size_shift)); + _memory_statistics_add(&_unmapped_total, (release >> _memory_page_size_shift)); + } + _memory_config.memory_unmap(address, size, offset, release); +} + +//! Declare the span to be a subspan and store distance from master span and span count +static void +_memory_span_mark_as_subspan_unless_master(span_t* master, span_t* subspan, size_t span_count) { + assert((subspan != master) || (subspan->flags & SPAN_FLAG_MASTER)); + if (subspan != master) { + subspan->flags = SPAN_FLAG_SUBSPAN; + subspan->total_spans_or_distance = (uint32_t)((uintptr_t)pointer_diff(subspan, master) >> _memory_span_size_shift); + subspan->align_offset = 0; + } + subspan->span_count = (uint32_t)span_count; +} + +//! 
Use reserved spans to fulfill a memory map request (reserve size must be checked by caller) +static span_t* +_memory_map_from_reserve(heap_t* heap, size_t span_count) { + //Update the heap span reserve + span_t* span = heap->span_reserve; + heap->span_reserve = (span_t*)pointer_offset(span, span_count * _memory_span_size); + heap->spans_reserved -= span_count; + + _memory_span_mark_as_subspan_unless_master(heap->span_reserve_master, span, span_count); + if (span_count <= LARGE_CLASS_COUNT) + _memory_statistics_inc(heap->span_use[span_count - 1].spans_from_reserved, 1); + + return span; +} + +//! Get the aligned number of spans to map in based on wanted count, configured mapping granularity and the page size +static size_t +_memory_map_align_span_count(size_t span_count) { + size_t request_count = (span_count > _memory_span_map_count) ? span_count : _memory_span_map_count; + if ((_memory_page_size > _memory_span_size) && ((request_count * _memory_span_size) % _memory_page_size)) + request_count += _memory_span_map_count - (request_count % _memory_span_map_count); + return request_count; +} + +//! Store the given spans as reserve in the given heap +static void +_memory_heap_set_reserved_spans(heap_t* heap, span_t* master, span_t* reserve, size_t reserve_span_count) { + heap->span_reserve_master = master; + heap->span_reserve = reserve; + heap->spans_reserved = reserve_span_count; +} + +//! Setup a newly mapped span +static void +_memory_span_initialize(span_t* span, size_t total_span_count, size_t span_count, size_t align_offset) { + span->total_spans_or_distance = (uint32_t)total_span_count; + span->span_count = (uint32_t)span_count; + span->align_offset = (uint32_t)align_offset; + span->flags = SPAN_FLAG_MASTER; + atomic_store32(&span->remaining_spans, (int32_t)total_span_count); +} + +//! Map a akigned set of spans, taking configured mapping granularity and the page size into account +static span_t* +_memory_map_aligned_span_count(heap_t* heap, size_t span_count) { + //If we already have some, but not enough, reserved spans, release those to heap cache and map a new + //full set of spans. Otherwise we would waste memory if page size > span size (huge pages) + size_t aligned_span_count = _memory_map_align_span_count(span_count); + size_t align_offset = 0; + span_t* span = (span_t*)_memory_map(aligned_span_count * _memory_span_size, &align_offset); + if (!span) + return 0; + _memory_span_initialize(span, aligned_span_count, span_count, align_offset); + _memory_statistics_add(&_reserved_spans, aligned_span_count); + if (span_count <= LARGE_CLASS_COUNT) + _memory_statistics_inc(heap->span_use[span_count - 1].spans_map_calls, 1); + if (aligned_span_count > span_count) { + if (heap->spans_reserved) { + _memory_span_mark_as_subspan_unless_master(heap->span_reserve_master, heap->span_reserve, heap->spans_reserved); + _memory_heap_cache_insert(heap, heap->span_reserve); + } + _memory_heap_set_reserved_spans(heap, span, (span_t*)pointer_offset(span, span_count * _memory_span_size), aligned_span_count - span_count); + } + return span; +} + +//! Map in memory pages for the given number of spans (or use previously reserved pages) +static span_t* +_memory_map_spans(heap_t* heap, size_t span_count) { + if (span_count <= heap->spans_reserved) + return _memory_map_from_reserve(heap, span_count); + return _memory_map_aligned_span_count(heap, span_count); +} + +//! 
Unmap memory pages for the given number of spans (or mark as unused if no partial unmappings) +static void +_memory_unmap_span(span_t* span) { + assert((span->flags & SPAN_FLAG_MASTER) || (span->flags & SPAN_FLAG_SUBSPAN)); + assert(!(span->flags & SPAN_FLAG_MASTER) || !(span->flags & SPAN_FLAG_SUBSPAN)); + + int is_master = !!(span->flags & SPAN_FLAG_MASTER); + span_t* master = is_master ? span : ((span_t*)pointer_offset(span, -(intptr_t)((uintptr_t)span->total_spans_or_distance * _memory_span_size))); + assert(is_master || (span->flags & SPAN_FLAG_SUBSPAN)); + assert(master->flags & SPAN_FLAG_MASTER); + + size_t span_count = span->span_count; + if (!is_master) { + //Directly unmap subspans (unless huge pages, in which case we defer and unmap entire page range with master) + assert(span->align_offset == 0); + if (_memory_span_size >= _memory_page_size) { + _memory_unmap(span, span_count * _memory_span_size, 0, 0); + _memory_statistics_sub(&_reserved_spans, span_count); + } + } else { + //Special double flag to denote an unmapped master + //It must be kept in memory since span header must be used + span->flags |= SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN; + } + + if (atomic_add32(&master->remaining_spans, -(int32_t)span_count) <= 0) { + //Everything unmapped, unmap the master span with release flag to unmap the entire range of the super span + assert(!!(master->flags & SPAN_FLAG_MASTER) && !!(master->flags & SPAN_FLAG_SUBSPAN)); + size_t unmap_count = master->span_count; + if (_memory_span_size < _memory_page_size) + unmap_count = master->total_spans_or_distance; + _memory_statistics_sub(&_reserved_spans, unmap_count); + _memory_unmap(master, unmap_count * _memory_span_size, master->align_offset, master->total_spans_or_distance * _memory_span_size); + } +} + +#if ENABLE_THREAD_CACHE + +//! Unmap a single linked list of spans +static void +_memory_unmap_span_list(span_t* span) { + size_t list_size = span->list_size; + for (size_t ispan = 0; ispan < list_size; ++ispan) { + span_t* next_span = span->next; + _memory_unmap_span(span); + span = next_span; + } + assert(!span); +} + +//! Add span to head of single linked span list +static size_t +_memory_span_list_push(span_t** head, span_t* span) { + span->next = *head; + if (*head) + span->list_size = (*head)->list_size + 1; + else + span->list_size = 1; + *head = span; + return span->list_size; +} + +//! Remove span from head of single linked span list, returns the new list head +static span_t* +_memory_span_list_pop(span_t** head) { + span_t* span = *head; + span_t* next_span = 0; + if (span->list_size > 1) { + assert(span->next); + next_span = span->next; + assert(next_span); + next_span->list_size = span->list_size - 1; + } + *head = next_span; + return span; +} + +//! Split a single linked span list +static span_t* +_memory_span_list_split(span_t* span, size_t limit) { + span_t* next = 0; + if (limit < 2) + limit = 2; + if (span->list_size > limit) { + uint32_t list_size = 1; + span_t* last = span; + next = span->next; + while (list_size < limit) { + last = next; + next = next->next; + ++list_size; + } + last->next = 0; + assert(next); + next->list_size = span->list_size - list_size; + span->list_size = list_size; + span->prev = 0; + } + return next; +} + +#endif + +//! 
Add a span to partial span double linked list at the head +static void +_memory_span_partial_list_add(span_t** head, span_t* span) { + if (*head) { + span->next = *head; + //Maintain pointer to tail span + span->prev = (*head)->prev; + (*head)->prev = span; + } else { + span->next = 0; + span->prev = span; + } + *head = span; +} + +//! Add a span to partial span double linked list at the tail +static void +_memory_span_partial_list_add_tail(span_t** head, span_t* span) { + span->next = 0; + if (*head) { + span_t* tail = (*head)->prev; + tail->next = span; + span->prev = tail; + //Maintain pointer to tail span + (*head)->prev = span; + } else { + span->prev = span; + *head = span; + } +} + +//! Pop head span from partial span double linked list +static void +_memory_span_partial_list_pop_head(span_t** head) { + span_t* span = *head; + *head = span->next; + if (*head) { + //Maintain pointer to tail span + (*head)->prev = span->prev; + } +} + +//! Remove a span from partial span double linked list +static void +_memory_span_partial_list_remove(span_t** head, span_t* span) { + if (UNEXPECTED(*head == span)) { + _memory_span_partial_list_pop_head(head); + } else { + span_t* next_span = span->next; + span_t* prev_span = span->prev; + prev_span->next = next_span; + if (EXPECTED(next_span != 0)) { + next_span->prev = prev_span; + } else { + //Update pointer to tail span + (*head)->prev = prev_span; + } + } +} + +#if ENABLE_GLOBAL_CACHE + +//! Insert the given list of memory page spans in the global cache +static void +_memory_cache_insert(global_cache_t* cache, span_t* span, size_t cache_limit) { + assert((span->list_size == 1) || (span->next != 0)); + int32_t list_size = (int32_t)span->list_size; + //Unmap if cache has reached the limit + if (atomic_add32(&cache->size, list_size) > (int32_t)cache_limit) { +#if !ENABLE_UNLIMITED_GLOBAL_CACHE + _memory_unmap_span_list(span); + atomic_add32(&cache->size, -list_size); + return; +#endif + } + void* current_cache, *new_cache; + do { + current_cache = atomic_load_ptr(&cache->cache); + span->prev = (span_t*)((uintptr_t)current_cache & _memory_span_mask); + new_cache = (void*)((uintptr_t)span | ((uintptr_t)atomic_incr32(&cache->counter) & ~_memory_span_mask)); + } while (!atomic_cas_ptr(&cache->cache, new_cache, current_cache)); +} + +//! Extract a number of memory page spans from the global cache +static span_t* +_memory_cache_extract(global_cache_t* cache) { + uintptr_t span_ptr; + do { + void* global_span = atomic_load_ptr(&cache->cache); + span_ptr = (uintptr_t)global_span & _memory_span_mask; + if (span_ptr) { + span_t* span = (span_t*)span_ptr; + //By accessing the span ptr before it is swapped out of list we assume that a contending thread + //does not manage to traverse the span to being unmapped before we access it + void* new_cache = (void*)((uintptr_t)span->prev | ((uintptr_t)atomic_incr32(&cache->counter) & ~_memory_span_mask)); + if (atomic_cas_ptr(&cache->cache, new_cache, global_span)) { + atomic_add32(&cache->size, -(int32_t)span->list_size); + return span; + } + } + } while (span_ptr); + return 0; +} + +//! 
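Note on the tagged pointers above: spans are aligned to the span size, so the
+// low bits of the cache head (the bits cleared by _memory_span_mask) are
+// unused by the pointer itself. Insert and extract pack an ever-increasing
+// counter into those bits, which keeps the compare-and-swap loops safe from
+// ABA when the same span address is pushed and popped concurrently.
+
+//! 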
Finalize a global cache, only valid from allocator finalization (not thread safe) +static void +_memory_cache_finalize(global_cache_t* cache) { + void* current_cache = atomic_load_ptr(&cache->cache); + span_t* span = (span_t*)((uintptr_t)current_cache & _memory_span_mask); + while (span) { + span_t* skip_span = (span_t*)((uintptr_t)span->prev & _memory_span_mask); + atomic_add32(&cache->size, -(int32_t)span->list_size); + _memory_unmap_span_list(span); + span = skip_span; + } + assert(!atomic_load32(&cache->size)); + atomic_store_ptr(&cache->cache, 0); + atomic_store32(&cache->size, 0); +} + +//! Insert the given list of memory page spans in the global cache +static void +_memory_global_cache_insert(span_t* span) { + size_t span_count = span->span_count; +#if ENABLE_UNLIMITED_GLOBAL_CACHE + _memory_cache_insert(&_memory_span_cache[span_count - 1], span, 0); +#else + const size_t cache_limit = (GLOBAL_CACHE_MULTIPLIER * ((span_count == 1) ? _memory_span_release_count : _memory_span_release_count_large)); + _memory_cache_insert(&_memory_span_cache[span_count - 1], span, cache_limit); +#endif +} + +//! Extract a number of memory page spans from the global cache for large blocks +static span_t* +_memory_global_cache_extract(size_t span_count) { + span_t* span = _memory_cache_extract(&_memory_span_cache[span_count - 1]); + assert(!span || (span->span_count == span_count)); + return span; +} + +#endif + +#if ENABLE_THREAD_CACHE +//! Adopt the deferred span cache list +static void +_memory_heap_cache_adopt_deferred(heap_t* heap) { + atomic_thread_fence_acquire(); + span_t* span = (span_t*)atomic_load_ptr(&heap->span_cache_deferred); + if (!span) + return; + do { + span = (span_t*)atomic_load_ptr(&heap->span_cache_deferred); + } while (!atomic_cas_ptr(&heap->span_cache_deferred, 0, span)); + while (span) { + span_t* next_span = span->next; + _memory_span_list_push(&heap->span_cache[0], span); +#if ENABLE_STATISTICS + atomic_decr32(&heap->span_use[span->span_count - 1].current); + ++heap->size_class_use[span->size_class].spans_to_cache; + --heap->size_class_use[span->size_class].spans_current; +#endif + span = next_span; + } +} +#endif + +//! Insert a single span into thread heap cache, releasing to global cache if overflow +static void +_memory_heap_cache_insert(heap_t* heap, span_t* span) { +#if ENABLE_THREAD_CACHE + size_t span_count = span->span_count; + size_t idx = span_count - 1; + _memory_statistics_inc(heap->span_use[idx].spans_to_cache, 1); + if (!idx) + _memory_heap_cache_adopt_deferred(heap); +#if ENABLE_UNLIMITED_THREAD_CACHE + _memory_span_list_push(&heap->span_cache[idx], span); +#else + const size_t release_count = (!idx ? 
_memory_span_release_count : _memory_span_release_count_large); + size_t current_cache_size = _memory_span_list_push(&heap->span_cache[idx], span); + if (current_cache_size <= release_count) + return; + const size_t hard_limit = release_count * THREAD_CACHE_MULTIPLIER; + if (current_cache_size <= hard_limit) { +#if ENABLE_ADAPTIVE_THREAD_CACHE + //Require 25% of high water mark to remain in cache (and at least 1, if use is 0) + const size_t high_mark = heap->span_use[idx].high; + const size_t min_limit = (high_mark >> 2) + release_count + 1; + if (current_cache_size < min_limit) + return; +#else + return; +#endif + } + heap->span_cache[idx] = _memory_span_list_split(span, release_count); + assert(span->list_size == release_count); +#if ENABLE_STATISTICS + heap->thread_to_global += (size_t)span->list_size * span_count * _memory_span_size; + heap->span_use[idx].spans_to_global += span->list_size; +#endif +#if ENABLE_GLOBAL_CACHE + _memory_global_cache_insert(span); +#else + _memory_unmap_span_list(span); +#endif +#endif +#else + (void)sizeof(heap); + _memory_unmap_span(span); +#endif +} + +//! Extract the given number of spans from the different cache levels +static span_t* +_memory_heap_thread_cache_extract(heap_t* heap, size_t span_count) { +#if ENABLE_THREAD_CACHE + size_t idx = span_count - 1; + if (!idx) + _memory_heap_cache_adopt_deferred(heap); + if (heap->span_cache[idx]) { +#if ENABLE_STATISTICS + heap->span_use[idx].spans_from_cache++; +#endif + return _memory_span_list_pop(&heap->span_cache[idx]); + } +#endif + return 0; +} + +static span_t* +_memory_heap_reserved_extract(heap_t* heap, size_t span_count) { + if (heap->spans_reserved >= span_count) + return _memory_map_spans(heap, span_count); + return 0; +} + +//! Extract a span from the global cache +static span_t* +_memory_heap_global_cache_extract(heap_t* heap, size_t span_count) { +#if ENABLE_GLOBAL_CACHE + size_t idx = span_count - 1; + heap->span_cache[idx] = _memory_global_cache_extract(span_count); + if (heap->span_cache[idx]) { +#if ENABLE_STATISTICS + heap->global_to_thread += (size_t)heap->span_cache[idx]->list_size * span_count * _memory_span_size; + heap->span_use[idx].spans_from_global += heap->span_cache[idx]->list_size; +#endif + return _memory_span_list_pop(&heap->span_cache[idx]); + } +#endif + return 0; +} + +//! 
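How the thread cache overflows: cached spans are kept in per-size linked
+// lists. Once a list grows past its release count times
+// THREAD_CACHE_MULTIPLIER (or, with the adaptive cache, past roughly 25% of
+// its recent high-water mark), a batch of release_count spans is split off
+// the head and pushed to the global cache in a single operation, or unmapped
+// when the global cache is disabled.
+
+//! 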
Get a span from one of the cache levels (thread cache, reserved, global cache) or fallback to mapping more memory +static span_t* +_memory_heap_extract_new_span(heap_t* heap, size_t span_count, uint32_t class_idx) { + (void)sizeof(class_idx); +#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS + uint32_t idx = (uint32_t)span_count - 1; + uint32_t current_count = (uint32_t)atomic_incr32(&heap->span_use[idx].current); + if (current_count > heap->span_use[idx].high) + heap->span_use[idx].high = current_count; +#if ENABLE_STATISTICS + uint32_t spans_current = ++heap->size_class_use[class_idx].spans_current; + if (spans_current > heap->size_class_use[class_idx].spans_peak) + heap->size_class_use[class_idx].spans_peak = spans_current; +#endif +#endif + span_t* span = _memory_heap_thread_cache_extract(heap, span_count); + if (EXPECTED(span != 0)) { + _memory_statistics_inc(heap->size_class_use[class_idx].spans_from_cache, 1); + return span; + } + span = _memory_heap_reserved_extract(heap, span_count); + if (EXPECTED(span != 0)) { + _memory_statistics_inc(heap->size_class_use[class_idx].spans_from_reserved, 1); + return span; + } + span = _memory_heap_global_cache_extract(heap, span_count); + if (EXPECTED(span != 0)) { + _memory_statistics_inc(heap->size_class_use[class_idx].spans_from_cache, 1); + return span; + } + //Final fallback, map in more virtual memory + span = _memory_map_spans(heap, span_count); + _memory_statistics_inc(heap->size_class_use[class_idx].spans_map_calls, 1); + return span; +} + +//! Move the span (used for small or medium allocations) to the heap thread cache +static void +_memory_span_release_to_cache(heap_t* heap, span_t* span) { + heap_class_t* heap_class = heap->span_class + span->size_class; + assert(heap_class->partial_span != span); + if (span->state == SPAN_STATE_PARTIAL) + _memory_span_partial_list_remove(&heap_class->partial_span, span); +#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS + atomic_decr32(&heap->span_use[0].current); +#endif + _memory_statistics_inc(heap->span_use[0].spans_to_cache, 1); + _memory_statistics_inc(heap->size_class_use[span->size_class].spans_to_cache, 1); + _memory_statistics_dec(heap->size_class_use[span->size_class].spans_current, 1); + _memory_heap_cache_insert(heap, span); +} + +//! Initialize a (partial) free list up to next system memory page, while reserving the first block +//! as allocated, returning number of blocks in list +static uint32_t +free_list_partial_init(void** list, void** first_block, void* page_start, void* block_start, + uint32_t block_count, uint32_t block_size) { + assert(block_count); + *first_block = block_start; + if (block_count > 1) { + void* free_block = pointer_offset(block_start, block_size); + void* block_end = pointer_offset(block_start, block_size * block_count); + //If block size is less than half a memory page, bound init to next memory page boundary + if (block_size < (_memory_page_size >> 1)) { + void* page_end = pointer_offset(page_start, _memory_page_size); + if (page_end < block_end) + block_end = page_end; + } + *list = free_block; + block_count = 2; + void* next_block = pointer_offset(free_block, block_size); + while (next_block < block_end) { + *((void**)free_block) = next_block; + free_block = next_block; + ++block_count; + next_block = pointer_offset(next_block, block_size); + } + *((void**)free_block) = 0; + } else { + *list = 0; + } + return block_count; +} + +//! 
Initialize an unused span (from cache or mapped) to be new active span +static void* +_memory_span_set_new_active(heap_t* heap, heap_class_t* heap_class, span_t* span, uint32_t class_idx) { + assert(span->span_count == 1); + size_class_t* size_class = _memory_size_class + class_idx; + span->size_class = class_idx; + span->heap = heap; + span->flags &= ~SPAN_FLAG_ALIGNED_BLOCKS; + span->block_count = size_class->block_count; + span->block_size = size_class->block_size; + span->state = SPAN_STATE_ACTIVE; + span->free_list = 0; + + //Setup free list. Only initialize one system page worth of free blocks in list + void* block; + span->free_list_limit = free_list_partial_init(&heap_class->free_list, &block, + span, pointer_offset(span, SPAN_HEADER_SIZE), size_class->block_count, size_class->block_size); + atomic_store_ptr(&span->free_list_deferred, 0); + span->list_size = 0; + atomic_thread_fence_release(); + + _memory_span_partial_list_add(&heap_class->partial_span, span); + return block; +} + +//! Promote a partially used span (from heap used list) to be new active span +static void +_memory_span_set_partial_active(heap_class_t* heap_class, span_t* span) { + assert(span->state == SPAN_STATE_PARTIAL); + assert(span->block_count == _memory_size_class[span->size_class].block_count); + //Move data to heap size class and set span as active + heap_class->free_list = span->free_list; + span->state = SPAN_STATE_ACTIVE; + span->free_list = 0; + assert(heap_class->free_list); +} + +//! Mark span as full (from active) +static void +_memory_span_set_active_full(heap_class_t* heap_class, span_t* span) { + assert(span->state == SPAN_STATE_ACTIVE); + assert(span == heap_class->partial_span); + _memory_span_partial_list_pop_head(&heap_class->partial_span); + span->used_count = span->block_count; + span->state = SPAN_STATE_FULL; + span->free_list = 0; +} + +//! Move span from full to partial state +static void +_memory_span_set_full_partial(heap_t* heap, span_t* span) { + assert(span->state == SPAN_STATE_FULL); + heap_class_t* heap_class = &heap->span_class[span->size_class]; + span->state = SPAN_STATE_PARTIAL; + _memory_span_partial_list_add_tail(&heap_class->partial_span, span); +} + +static void* +_memory_span_extract_deferred(span_t* span) { + void* free_list; + do { + free_list = atomic_load_ptr(&span->free_list_deferred); + } while ((free_list == INVALID_POINTER) || !atomic_cas_ptr(&span->free_list_deferred, INVALID_POINTER, free_list)); + span->list_size = 0; + atomic_store_ptr(&span->free_list_deferred, 0); + atomic_thread_fence_release(); + return free_list; +} + +//! Pop first block from a free list +static void* +free_list_pop(void** list) { + void* block = *list; + *list = *((void**)block); + return block; +} + +//! 
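About free_list_deferred: blocks freed from other threads are pushed onto this
+// per-span list. The INVALID_POINTER value acts as a busy marker: both the
+// producer (_memory_deallocate_defer) and the consumer
+// (_memory_span_extract_deferred) spin while the slot holds INVALID_POINTER
+// and swap it in with a compare-and-swap for the short window in which they
+// update the list head and list_size.
+
+//! 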
Allocate a small/medium sized memory block from the given heap +static void* +_memory_allocate_from_heap_fallback(heap_t* heap, uint32_t class_idx) { + heap_class_t* heap_class = &heap->span_class[class_idx]; + void* block; + + span_t* active_span = heap_class->partial_span; + if (EXPECTED(active_span != 0)) { + assert(active_span->state == SPAN_STATE_ACTIVE); + assert(active_span->block_count == _memory_size_class[active_span->size_class].block_count); + //Swap in free list if not empty + if (active_span->free_list) { + heap_class->free_list = active_span->free_list; + active_span->free_list = 0; + return free_list_pop(&heap_class->free_list); + } + //If the span did not fully initialize free list, link up another page worth of blocks + if (active_span->free_list_limit < active_span->block_count) { + void* block_start = pointer_offset(active_span, SPAN_HEADER_SIZE + (active_span->free_list_limit * active_span->block_size)); + active_span->free_list_limit += free_list_partial_init(&heap_class->free_list, &block, + (void*)((uintptr_t)block_start & ~(_memory_page_size - 1)), block_start, + active_span->block_count - active_span->free_list_limit, active_span->block_size); + return block; + } + //Swap in deferred free list + atomic_thread_fence_acquire(); + if (atomic_load_ptr(&active_span->free_list_deferred)) { + heap_class->free_list = _memory_span_extract_deferred(active_span); + return free_list_pop(&heap_class->free_list); + } + + //If the active span is fully allocated, mark span as free floating (fully allocated and not part of any list) + assert(!heap_class->free_list); + assert(active_span->free_list_limit >= active_span->block_count); + _memory_span_set_active_full(heap_class, active_span); + } + assert(!heap_class->free_list); + + //Try promoting a semi-used span to active + active_span = heap_class->partial_span; + if (EXPECTED(active_span != 0)) { + _memory_span_set_partial_active(heap_class, active_span); + return free_list_pop(&heap_class->free_list); + } + assert(!heap_class->free_list); + assert(!heap_class->partial_span); + + //Find a span in one of the cache levels + active_span = _memory_heap_extract_new_span(heap, 1, class_idx); + + if (!active_span) + return active_span; + + //Mark span as owned by this heap and set base data, return first block + return _memory_span_set_new_active(heap, heap_class, active_span, class_idx); +} + +//! Allocate a small sized memory block from the given heap +static void* +_memory_allocate_small(heap_t* heap, size_t size) { + //Small sizes have unique size classes + const uint32_t class_idx = (uint32_t)((size + (SMALL_GRANULARITY - 1)) >> SMALL_GRANULARITY_SHIFT); + _memory_statistics_inc_alloc(heap, class_idx); + if (EXPECTED(heap->span_class[class_idx].free_list != 0)) + return free_list_pop(&heap->span_class[class_idx].free_list); + return _memory_allocate_from_heap_fallback(heap, class_idx); +} + +//! 
Allocate a medium sized memory block from the given heap +static void* +_memory_allocate_medium(heap_t* heap, size_t size) { + //Calculate the size class index and do a dependent lookup of the final class index (in case of merged classes) + const uint32_t base_idx = (uint32_t)(SMALL_CLASS_COUNT + ((size - (SMALL_SIZE_LIMIT + 1)) >> MEDIUM_GRANULARITY_SHIFT)); + const uint32_t class_idx = _memory_size_class[base_idx].class_idx; + _memory_statistics_inc_alloc(heap, class_idx); + if (EXPECTED(heap->span_class[class_idx].free_list != 0)) + return free_list_pop(&heap->span_class[class_idx].free_list); + return _memory_allocate_from_heap_fallback(heap, class_idx); +} + +//! Allocate a large sized memory block from the given heap +static void* +_memory_allocate_large(heap_t* heap, size_t size) { + //Calculate number of needed max sized spans (including header) + //Since this function is never called if size > LARGE_SIZE_LIMIT + //the span_count is guaranteed to be <= LARGE_CLASS_COUNT + size += SPAN_HEADER_SIZE; + size_t span_count = size >> _memory_span_size_shift; + if (size & (_memory_span_size - 1)) + ++span_count; + size_t idx = span_count - 1; + + //Find a span in one of the cache levels + span_t* span = _memory_heap_extract_new_span(heap, span_count, SIZE_CLASS_COUNT); + + if (!span) + return span; + + //Mark span as owned by this heap and set base data + assert(span->span_count == span_count); + span->size_class = (uint32_t)(SIZE_CLASS_COUNT + idx); + span->heap = heap; + atomic_thread_fence_release(); + + return pointer_offset(span, SPAN_HEADER_SIZE); +} + +//! Allocate a huge block by mapping memory pages directly +static void* +_memory_allocate_huge(size_t size) { + size += SPAN_HEADER_SIZE; + size_t num_pages = size >> _memory_page_size_shift; + if (size & (_memory_page_size - 1)) + ++num_pages; + size_t align_offset = 0; + span_t* span = (span_t*)_memory_map(num_pages * _memory_page_size, &align_offset); + if (!span) + return span; + //Store page count in span_count + span->size_class = (uint32_t)-1; + span->span_count = (uint32_t)num_pages; + span->align_offset = (uint32_t)align_offset; + _memory_statistics_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak); + + return pointer_offset(span, SPAN_HEADER_SIZE); +} + +//! Allocate a block larger than medium size +static void* +_memory_allocate_oversized(heap_t* heap, size_t size) { + if (size <= LARGE_SIZE_LIMIT) + return _memory_allocate_large(heap, size); + return _memory_allocate_huge(size); +} + +//! 
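Size tiers used above: small and medium requests are served from size classes
+// within a single span; anything larger is "large" and takes whole spans (up
+// to LARGE_CLASS_COUNT of them, with size_class set to SIZE_CLASS_COUNT plus
+// the span count minus one); beyond LARGE_SIZE_LIMIT the request is "huge"
+// and mapped directly, storing the page count in span_count and setting
+// size_class to (uint32_t)-1 so deallocation can tell the cases apart.
+
+//! 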
Allocate a block of the given size +static void* +_memory_allocate(heap_t* heap, size_t size) { + if (EXPECTED(size <= SMALL_SIZE_LIMIT)) + return _memory_allocate_small(heap, size); + else if (size <= _memory_medium_size_limit) + return _memory_allocate_medium(heap, size); + return _memory_allocate_oversized(heap, size); +} + +static void +_memory_heap_initialize(heap_t* heap) { + memset(heap, 0, sizeof(heap_t)); + + //Get a new heap ID + heap->id = atomic_incr32(&_memory_heap_id); + assert(heap->id != 0); + //assert(!_memory_heap_lookup(heap->id)); + + //Link in heap in heap ID map + heap_t* next_heap; + size_t list_idx = heap->id % HEAP_ARRAY_SIZE; + do { + next_heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); + heap->next_heap = next_heap; + } while (!atomic_cas_ptr(&_memory_heaps[list_idx], heap, next_heap)); +} + +static void +_memory_heap_orphan(heap_t* heap) { + void* raw_heap; + uintptr_t orphan_counter; + heap_t* last_heap; + do { + last_heap = (heap_t*)atomic_load_ptr(&_memory_orphan_heaps); + heap->next_orphan = (heap_t*)((uintptr_t)last_heap & ~(uintptr_t)(HEAP_ORPHAN_ABA_SIZE - 1)); + orphan_counter = (uintptr_t)atomic_incr32(&_memory_orphan_counter); + raw_heap = (void*)((uintptr_t)heap | (orphan_counter & (uintptr_t)(HEAP_ORPHAN_ABA_SIZE - 1))); + } while (!atomic_cas_ptr(&_memory_orphan_heaps, raw_heap, last_heap)); +} + +//! Allocate a new heap +static heap_t* +_memory_allocate_heap(void) { + void* raw_heap; + void* next_raw_heap; + uintptr_t orphan_counter; + heap_t* heap; + heap_t* next_heap; + //Try getting an orphaned heap + atomic_thread_fence_acquire(); + do { + raw_heap = atomic_load_ptr(&_memory_orphan_heaps); + heap = (heap_t*)((uintptr_t)raw_heap & ~(uintptr_t)(HEAP_ORPHAN_ABA_SIZE - 1)); + if (!heap) + break; + next_heap = heap->next_orphan; + orphan_counter = (uintptr_t)atomic_incr32(&_memory_orphan_counter); + next_raw_heap = (void*)((uintptr_t)next_heap | (orphan_counter & (uintptr_t)(HEAP_ORPHAN_ABA_SIZE - 1))); + } while (!atomic_cas_ptr(&_memory_orphan_heaps, next_raw_heap, raw_heap)); + + if (!heap) { + //Map in pages for a new heap + size_t align_offset = 0; + size_t block_size = (1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size; + heap = (heap_t*)_memory_map(block_size, &align_offset); + if (!heap) + return heap; + + _memory_heap_initialize(heap); + heap->align_offset = align_offset; + + //Put extra heaps as orphans, aligning to make sure ABA protection bits fit in pointer low bits + size_t aligned_heap_size = sizeof(heap_t); + if (aligned_heap_size % HEAP_ORPHAN_ABA_SIZE) + aligned_heap_size += HEAP_ORPHAN_ABA_SIZE - (aligned_heap_size % HEAP_ORPHAN_ABA_SIZE); + size_t num_heaps = block_size / aligned_heap_size; + heap_t* extra_heap = (heap_t*)pointer_offset(heap, aligned_heap_size); + while (num_heaps > 1) { + _memory_heap_initialize(extra_heap); + extra_heap->master_heap = heap; + _memory_heap_orphan(extra_heap); + extra_heap = (heap_t*)pointer_offset(extra_heap, aligned_heap_size); + --num_heaps; + } + } + + return heap; +} + +//! 
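Like the global span cache, the orphan heap list guards its lock-free head
+// against ABA with a counter tag: heaps are laid out on HEAP_ORPHAN_ABA_SIZE
+// boundaries, so the low bits of the list head are free to hold the
+// _memory_orphan_counter value, and both producer and consumer mask those
+// bits off before dereferencing the pointer.
+
+//! 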
Deallocate the given small/medium memory block in the current thread local heap +static void +_memory_deallocate_direct(span_t* span, void* block) { + assert(span->heap == get_thread_heap_raw()); + uint32_t state = span->state; + //Add block to free list + *((void**)block) = span->free_list; + span->free_list = block; + if (UNEXPECTED(state == SPAN_STATE_ACTIVE)) + return; + uint32_t used = --span->used_count; + uint32_t free = span->list_size; + if (UNEXPECTED(used == free)) + _memory_span_release_to_cache(span->heap, span); + else if (UNEXPECTED(state == SPAN_STATE_FULL)) + _memory_span_set_full_partial(span->heap, span); +} + +//! Put the block in the deferred free list of the owning span +static void +_memory_deallocate_defer(span_t* span, void* block) { + atomic_thread_fence_acquire(); + if (span->state == SPAN_STATE_FULL) { + if ((span->list_size + 1) == span->block_count) { + //Span will be completely freed by deferred deallocations, no other thread can + //currently touch it. Safe to move to owner heap deferred cache + span_t* last_head; + heap_t* heap = span->heap; + do { + last_head = (span_t*)atomic_load_ptr(&heap->span_cache_deferred); + span->next = last_head; + } while (!atomic_cas_ptr(&heap->span_cache_deferred, span, last_head)); + return; + } + } + + void* free_list; + do { + atomic_thread_fence_acquire(); + free_list = atomic_load_ptr(&span->free_list_deferred); + *((void**)block) = free_list; + } while ((free_list == INVALID_POINTER) || !atomic_cas_ptr(&span->free_list_deferred, INVALID_POINTER, free_list)); + ++span->list_size; + atomic_store_ptr(&span->free_list_deferred, block); +} + +static void +_memory_deallocate_small_or_medium(span_t* span, void* p) { + _memory_statistics_inc_free(span->heap, span->size_class); + if (span->flags & SPAN_FLAG_ALIGNED_BLOCKS) { + //Realign pointer to block start + void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); + uint32_t block_offset = (uint32_t)pointer_diff(p, blocks_start); + p = pointer_offset(p, -(int32_t)(block_offset % span->block_size)); + } + //Check if block belongs to this heap or if deallocation should be deferred + if (span->heap == get_thread_heap_raw()) + _memory_deallocate_direct(span, p); + else + _memory_deallocate_defer(span, p); +} + +//! 
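Cross-thread frees never touch the owning heap directly: the block goes onto
+// the span's deferred free list, and only when such a free would leave a FULL
+// span completely empty is the whole span handed over through the owning
+// heap's span_cache_deferred list, to be adopted later by
+// _memory_heap_cache_adopt_deferred on the owning thread.
+
+//! 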
Deallocate the given large memory block to the current heap +static void +_memory_deallocate_large(span_t* span) { + //Decrease counter + assert(span->span_count == ((size_t)span->size_class - SIZE_CLASS_COUNT + 1)); + assert(span->size_class >= SIZE_CLASS_COUNT); + assert(span->size_class - SIZE_CLASS_COUNT < LARGE_CLASS_COUNT); + assert(!(span->flags & SPAN_FLAG_MASTER) || !(span->flags & SPAN_FLAG_SUBSPAN)); + assert((span->flags & SPAN_FLAG_MASTER) || (span->flags & SPAN_FLAG_SUBSPAN)); + //Large blocks can always be deallocated and transferred between heaps + //Investigate if it is better to defer large spans as well through span_cache_deferred, + //possibly with some heuristics to pick either scheme at runtime per deallocation + heap_t* heap = get_thread_heap(); +#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS + size_t idx = span->span_count - 1; + atomic_decr32(&span->heap->span_use[idx].current); +#endif + if ((span->span_count > 1) && !heap->spans_reserved) { + heap->span_reserve = span; + heap->spans_reserved = span->span_count; + if (span->flags & SPAN_FLAG_MASTER) { + heap->span_reserve_master = span; + } else { //SPAN_FLAG_SUBSPAN + uintptr_t distance = span->total_spans_or_distance; + span_t* master = (span_t*)pointer_offset(span, -(intptr_t)(distance * _memory_span_size)); + heap->span_reserve_master = master; + assert(master->flags & SPAN_FLAG_MASTER); + assert(atomic_load32(&master->remaining_spans) >= (int32_t)span->span_count); + } + _memory_statistics_inc(heap->span_use[idx].spans_to_reserved, 1); + } else { + //Insert into cache list + _memory_heap_cache_insert(heap, span); + } +} + +//! Deallocate the given huge span +static void +_memory_deallocate_huge(span_t* span) { + //Oversized allocation, page count is stored in span_count + size_t num_pages = span->span_count; + _memory_unmap(span, num_pages * _memory_page_size, span->align_offset, num_pages * _memory_page_size); + _memory_statistics_sub(&_huge_pages_current, num_pages); +} + +//! Deallocate the given block +static void +_memory_deallocate(void* p) { + //Grab the span (always at start of span, using span alignment) + span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask); + if (UNEXPECTED(!span)) + return; + if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) + _memory_deallocate_small_or_medium(span, p); + else if (span->size_class != (uint32_t)-1) + _memory_deallocate_large(span); + else + _memory_deallocate_huge(span); +} + +//! 
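Deallocation relies on recovering the owning span from a pointer by masking it
+// with _memory_span_mask, so every span (including huge and aligned mappings)
+// must start on a span-size boundary; this is also why rpaligned_alloc below
+// rejects alignment requests of a span size or larger.
+
+//! 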
Reallocate the given block to the given size +static void* +_memory_reallocate(void* p, size_t size, size_t oldsize, unsigned int flags) { + if (p) { + //Grab the span using guaranteed span alignment + span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask); + if (span->heap) { + if (span->size_class < SIZE_CLASS_COUNT) { + //Small/medium sized block + assert(span->span_count == 1); + void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); + uint32_t block_offset = (uint32_t)pointer_diff(p, blocks_start); + uint32_t block_idx = block_offset / span->block_size; + void* block = pointer_offset(blocks_start, block_idx * span->block_size); + if (!oldsize) + oldsize = span->block_size - (uint32_t)pointer_diff(p, block); + if ((size_t)span->block_size >= size) { + //Still fits in block, never mind trying to save memory, but preserve data if alignment changed + if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE)) + memmove(block, p, oldsize); + return block; + } + } else { + //Large block + size_t total_size = size + SPAN_HEADER_SIZE; + size_t num_spans = total_size >> _memory_span_size_shift; + if (total_size & (_memory_span_mask - 1)) + ++num_spans; + size_t current_spans = span->span_count; + assert(current_spans == ((span->size_class - SIZE_CLASS_COUNT) + 1)); + void* block = pointer_offset(span, SPAN_HEADER_SIZE); + if (!oldsize) + oldsize = (current_spans * _memory_span_size) - (size_t)pointer_diff(p, block) - SPAN_HEADER_SIZE; + if ((current_spans >= num_spans) && (num_spans >= (current_spans / 2))) { + //Still fits in block, never mind trying to save memory, but preserve data if alignment changed + if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE)) + memmove(block, p, oldsize); + return block; + } + } + } else { + //Oversized block + size_t total_size = size + SPAN_HEADER_SIZE; + size_t num_pages = total_size >> _memory_page_size_shift; + if (total_size & (_memory_page_size - 1)) + ++num_pages; + //Page count is stored in span_count + size_t current_pages = span->span_count; + void* block = pointer_offset(span, SPAN_HEADER_SIZE); + if (!oldsize) + oldsize = (current_pages * _memory_page_size) - (size_t)pointer_diff(p, block) - SPAN_HEADER_SIZE; + if ((current_pages >= num_pages) && (num_pages >= (current_pages / 2))) { + //Still fits in block, never mind trying to save memory, but preserve data if alignment changed + if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE)) + memmove(block, p, oldsize); + return block; + } + } + } else { + oldsize = 0; + } + + //Size is greater than block size, need to allocate a new block and deallocate the old + heap_t* heap = get_thread_heap(); + //Avoid hysteresis by overallocating if increase is small (below 37%) + size_t lower_bound = oldsize + (oldsize >> 2) + (oldsize >> 3); + size_t new_size = (size > lower_bound) ? size : ((size > oldsize) ? lower_bound : size); + void* block = _memory_allocate(heap, new_size); + if (p && block) { + if (!(flags & RPMALLOC_NO_PRESERVE)) + memcpy(block, p, oldsize < new_size ? oldsize : new_size); + _memory_deallocate(p); + } + + return block; +} + +//! 
Get the usable size of the given block +static size_t +_memory_usable_size(void* p) { + //Grab the span using guaranteed span alignment + span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask); + if (span->heap) { + //Small/medium block + if (span->size_class < SIZE_CLASS_COUNT) { + void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); + return span->block_size - ((size_t)pointer_diff(p, blocks_start) % span->block_size); + } + + //Large block + size_t current_spans = (span->size_class - SIZE_CLASS_COUNT) + 1; + return (current_spans * _memory_span_size) - (size_t)pointer_diff(p, span); + } + + //Oversized block, page count is stored in span_count + size_t current_pages = span->span_count; + return (current_pages * _memory_page_size) - (size_t)pointer_diff(p, span); +} + +//! Adjust and optimize the size class properties for the given class +static void +_memory_adjust_size_class(size_t iclass) { + size_t block_size = _memory_size_class[iclass].block_size; + size_t block_count = (_memory_span_size - SPAN_HEADER_SIZE) / block_size; + + _memory_size_class[iclass].block_count = (uint16_t)block_count; + _memory_size_class[iclass].class_idx = (uint16_t)iclass; + + //Check if previous size classes can be merged + size_t prevclass = iclass; + while (prevclass > 0) { + --prevclass; + //A class can be merged if number of pages and number of blocks are equal + if (_memory_size_class[prevclass].block_count == _memory_size_class[iclass].block_count) + memcpy(_memory_size_class + prevclass, _memory_size_class + iclass, sizeof(_memory_size_class[iclass])); + else + break; + } +} + +static void +_memory_heap_finalize(void* heapptr) { + heap_t* heap = (heap_t*)heapptr; + if (!heap) + return; + //Release thread cache spans back to global cache +#if ENABLE_THREAD_CACHE + _memory_heap_cache_adopt_deferred(heap); + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + span_t* span = heap->span_cache[iclass]; +#if ENABLE_GLOBAL_CACHE + while (span) { + assert(span->span_count == (iclass + 1)); + size_t release_count = (!iclass ? _memory_span_release_count : _memory_span_release_count_large); + span_t* next = _memory_span_list_split(span, (uint32_t)release_count); +#if ENABLE_STATISTICS + heap->thread_to_global += (size_t)span->list_size * span->span_count * _memory_span_size; + heap->span_use[iclass].spans_to_global += span->list_size; +#endif + _memory_global_cache_insert(span); + span = next; + } +#else + if (span) + _memory_unmap_span_list(span); +#endif + heap->span_cache[iclass] = 0; + } +#endif + + //Orphan the heap + _memory_heap_orphan(heap); + + set_thread_heap(0); + +#if ENABLE_STATISTICS + atomic_decr32(&_memory_active_heaps); + assert(atomic_load32(&_memory_active_heaps) >= 0); +#endif +} + +#if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) +#include <fibersapi.h> +static DWORD fls_key; +static void NTAPI +rp_thread_destructor(void* value) { + if (value) + rpmalloc_thread_finalize(); +} +#endif + +#if PLATFORM_POSIX +# include <sys/mman.h> +# include <sched.h> +# ifdef __FreeBSD__ +# include <sys/sysctl.h> +# define MAP_HUGETLB MAP_ALIGNED_SUPER +# endif +# ifndef MAP_UNINITIALIZED +# define MAP_UNINITIALIZED 0 +# endif +#endif +#include <errno.h> + +//! 
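Thread teardown on Windows static-CRT builds: an FLS slot (fls_key) is
+// allocated with rp_thread_destructor as its callback and each initialized
+// thread stores its heap in that slot, so rpmalloc_thread_finalize runs
+// automatically when the thread exits. On Apple and Haiku builds with
+// ENABLE_PRELOAD the same job is done by a pthread key destructor.
+
+//! 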
Initialize the allocator and setup global data +extern inline int +rpmalloc_initialize(void) { + if (_rpmalloc_initialized) { + rpmalloc_thread_initialize(); + return 0; + } + memset(&_memory_config, 0, sizeof(rpmalloc_config_t)); + return rpmalloc_initialize_config(0); +} + +#define RPMALLOC_CEIL(a, b) ((a / b + (a % b != 0)) * b) + +int +rpmalloc_initialize_config(const rpmalloc_config_t* config) { + if (_rpmalloc_initialized) { + rpmalloc_thread_initialize(); + return 0; + } + _rpmalloc_initialized = 1; + + if (config) + memcpy(&_memory_config, config, sizeof(rpmalloc_config_t)); + + if (!_memory_config.memory_map || !_memory_config.memory_unmap) { + _memory_config.memory_map = _memory_map_os; + _memory_config.memory_unmap = _memory_unmap_os; + } + +#if RPMALLOC_CONFIGURABLE + _memory_page_size = _memory_config.page_size; +#else + _memory_page_size = 0; +#endif + _memory_huge_pages = 0; + _memory_map_granularity = _memory_page_size; + +#if PLATFORM_WINDOWS + SYSTEM_INFO system_info; + memset(&system_info, 0, sizeof(system_info)); + GetSystemInfo(&system_info); + if (!_memory_page_size) { + _memory_page_size = system_info.dwPageSize; + _memory_map_granularity = system_info.dwAllocationGranularity; + } else { + // Align to closest sizes + _memory_page_size = RPMALLOC_CEIL(_memory_page_size, system_info.dwPageSize); + _memory_map_granularity = RPMALLOC_CEIL(_memory_map_granularity, system_info.dwAllocationGranularity); + } + if (config && config->enable_huge_pages) { + HANDLE token = 0; + size_t large_page_minimum = GetLargePageMinimum(); + if (large_page_minimum) + OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &token); + if (token) { + LUID luid; + if (LookupPrivilegeValue(0, SE_LOCK_MEMORY_NAME, &luid)) { + TOKEN_PRIVILEGES token_privileges; + memset(&token_privileges, 0, sizeof(token_privileges)); + token_privileges.PrivilegeCount = 1; + token_privileges.Privileges[0].Luid = luid; + token_privileges.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + if (AdjustTokenPrivileges(token, FALSE, &token_privileges, 0, 0, 0)) { + DWORD err = GetLastError(); + if (err == ERROR_SUCCESS) { + _memory_huge_pages = 1; + } + } + } + CloseHandle(token); + } + if (_memory_huge_pages) { + if (!_memory_page_size) { + _memory_map_granularity = _memory_page_size = large_page_minimum; + } else { + _memory_map_granularity = _memory_page_size = RPMALLOC_CEIL(_memory_page_size, large_page_minimum); + } + } else { + // Revert to default 4k pages if the user doesn't have enough privileges to activate large pages + _memory_page_size = system_info.dwPageSize; + _memory_map_granularity = system_info.dwAllocationGranularity; + } + } +#else + _memory_page_size = (size_t)sysconf(_SC_PAGESIZE); + _memory_map_granularity = _memory_page_size; + if (config && config->enable_huge_pages) { +#if defined(__linux__) + size_t huge_page_size = 0; + FILE* meminfo = fopen("/proc/meminfo", "r"); + if (meminfo) { + char line[128]; + while (!huge_page_size && fgets(line, sizeof(line) - 1, meminfo)) { + line[sizeof(line) - 1] = 0; + if (strstr(line, "Hugepagesize:")) + huge_page_size = (size_t)strtol(line + 13, 0, 10) * 1024; + } + fclose(meminfo); + } + if (huge_page_size) { + _memory_huge_pages = 1; + _memory_page_size = huge_page_size; + _memory_map_granularity = huge_page_size; + } +#elif defined(__FreeBSD__) + int rc; + size_t sz = sizeof(rc); + + if (sysctlbyname("vm.pmap.pg_ps_enabled", &rc, &sz, NULL, 0) == 0 && rc == 1) { + _memory_huge_pages = 1; + _memory_page_size = 2 * 1024 * 1024; + 
_memory_map_granularity = _memory_page_size; + } +#elif defined(__APPLE__) + _memory_huge_pages = 1; + _memory_page_size = 2 * 1024 * 1024; + _memory_map_granularity = _memory_page_size; +#endif + } +#endif + + //The ABA counter in heap orphan list is tied to using HEAP_ORPHAN_ABA_SIZE + size_t min_span_size = HEAP_ORPHAN_ABA_SIZE; + size_t max_page_size = 4 * 1024 * 1024; + const size_t ptrbits = sizeof(void*); + if (ptrbits > 4) + max_page_size = 4096ULL * 1024ULL * 1024ULL; + if (_memory_page_size < min_span_size) + _memory_page_size = min_span_size; + if (_memory_page_size > max_page_size) + _memory_page_size = max_page_size; + _memory_page_size_shift = 0; + size_t page_size_bit = _memory_page_size; + while (page_size_bit != 1) { + ++_memory_page_size_shift; + page_size_bit >>= 1; + } + _memory_page_size = ((size_t)1 << _memory_page_size_shift); + +#if RPMALLOC_CONFIGURABLE + size_t span_size = _memory_config.span_size; + if (!span_size) + span_size = (64 * 1024); + if (span_size > (256 * 1024)) + span_size = (256 * 1024); + _memory_span_size = 4096; + _memory_span_size_shift = 12; + while (_memory_span_size < span_size) { + _memory_span_size <<= 1; + ++_memory_span_size_shift; + } + _memory_span_mask = ~(uintptr_t)(_memory_span_size - 1); +#endif + + _memory_span_map_count = ( _memory_config.span_map_count ? _memory_config.span_map_count : DEFAULT_SPAN_MAP_COUNT); + if ((_memory_span_size * _memory_span_map_count) < _memory_page_size) + _memory_span_map_count = (_memory_page_size / _memory_span_size); + if ((_memory_page_size >= _memory_span_size) && ((_memory_span_map_count * _memory_span_size) % _memory_page_size)) + _memory_span_map_count = (_memory_page_size / _memory_span_size); + + _memory_config.page_size = _memory_page_size; + _memory_config.span_size = _memory_span_size; + _memory_config.span_map_count = _memory_span_map_count; + _memory_config.enable_huge_pages = _memory_huge_pages; + + _memory_span_release_count = (_memory_span_map_count > 4 ? ((_memory_span_map_count < 64) ? _memory_span_map_count : 64) : 4); + _memory_span_release_count_large = (_memory_span_release_count > 8 ? 
(_memory_span_release_count / 4) : 2); + +#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD + if (pthread_key_create(&_memory_thread_heap, _memory_heap_finalize)) + return -1; +#endif +#if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) + fls_key = FlsAlloc(&rp_thread_destructor); +#endif + + atomic_store32(&_memory_heap_id, 1); + atomic_store32(&_memory_orphan_counter, 0); +#if ENABLE_STATISTICS + atomic_store32(&_memory_active_heaps, 0); + atomic_store32(&_reserved_spans, 0); + atomic_store32(&_mapped_pages, 0); + _mapped_pages_peak = 0; + atomic_store32(&_mapped_total, 0); + atomic_store32(&_unmapped_total, 0); + atomic_store32(&_mapped_pages_os, 0); + atomic_store32(&_huge_pages_current, 0); + _huge_pages_peak = 0; +#endif + + //Setup all small and medium size classes + size_t iclass = 0; + _memory_size_class[iclass].block_size = SMALL_GRANULARITY; + _memory_adjust_size_class(iclass); + for (iclass = 1; iclass < SMALL_CLASS_COUNT; ++iclass) { + size_t size = iclass * SMALL_GRANULARITY; + _memory_size_class[iclass].block_size = (uint32_t)size; + _memory_adjust_size_class(iclass); + } + //At least two blocks per span, then fall back to large allocations + _memory_medium_size_limit = (_memory_span_size - SPAN_HEADER_SIZE) >> 1; + if (_memory_medium_size_limit > MEDIUM_SIZE_LIMIT) + _memory_medium_size_limit = MEDIUM_SIZE_LIMIT; + for (iclass = 0; iclass < MEDIUM_CLASS_COUNT; ++iclass) { + size_t size = SMALL_SIZE_LIMIT + ((iclass + 1) * MEDIUM_GRANULARITY); + if (size > _memory_medium_size_limit) + break; + _memory_size_class[SMALL_CLASS_COUNT + iclass].block_size = (uint32_t)size; + _memory_adjust_size_class(SMALL_CLASS_COUNT + iclass); + } + + for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) + atomic_store_ptr(&_memory_heaps[list_idx], 0); + + //Initialize this thread + rpmalloc_thread_initialize(); + return 0; +} + +//! 
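Configuration sanity rules applied above: the page size is clamped and rounded
+// down to a power of two, a configurable span size is forced to a power of
+// two between 4KiB and 256KiB, and span_map_count is adjusted so a full
+// mapping unit covers at least one page (and a whole number of pages when
+// pages are larger than spans). The effective values are written back into
+// _memory_config so callers can inspect what was actually chosen.
+
+//! 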
Finalize the allocator +void +rpmalloc_finalize(void) { + atomic_thread_fence_acquire(); + + rpmalloc_thread_finalize(); +#if ENABLE_STATISTICS + rpmalloc_dump_statistics(stderr); +#endif + //Free all thread caches + heap_t* master_heaps = 0; + for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { + heap_t* heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); + while (heap) { + if (heap->spans_reserved) { + span_t* span = _memory_map_spans(heap, heap->spans_reserved); + _memory_unmap_span(span); + } + + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + heap_class_t* heap_class = heap->span_class + iclass; + span_t* span = heap_class->partial_span; + while (span) { + span_t* next = span->next; + if (span->state == SPAN_STATE_ACTIVE) { + uint32_t used_blocks = span->block_count; + if (span->free_list_limit < span->block_count) + used_blocks = span->free_list_limit; + uint32_t free_blocks = 0; + void* block = heap_class->free_list; + while (block) { + ++free_blocks; + block = *((void**)block); + } + block = span->free_list; + while (block) { + ++free_blocks; + block = *((void**)block); + } + if (used_blocks == (free_blocks + span->list_size)) + _memory_heap_cache_insert(heap, span); + } else { + if (span->used_count == span->list_size) + _memory_heap_cache_insert(heap, span); + } + span = next; + } + } + +#if ENABLE_THREAD_CACHE + //Free span caches (other thread might have deferred after the thread using this heap finalized) + _memory_heap_cache_adopt_deferred(heap); + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + if (heap->span_cache[iclass]) + _memory_unmap_span_list(heap->span_cache[iclass]); + } +#endif + heap_t* next_heap = heap->next_heap; + if (!heap->master_heap) { + heap->next_heap = master_heaps; + master_heaps = heap; + } + heap = next_heap; + } + } + + //Finally free all master heaps pages + heap_t* master_heap = master_heaps; + while (master_heap) { + heap_t* next_heap = master_heap->next_heap; + size_t block_size = (1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size; + _memory_unmap(master_heap, block_size, master_heap->align_offset, block_size); + master_heap = next_heap; + } + +#if ENABLE_GLOBAL_CACHE + //Free global caches + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) + _memory_cache_finalize(&_memory_span_cache[iclass]); +#endif + + atomic_store_ptr(&_memory_orphan_heaps, 0); + atomic_thread_fence_release(); + +#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD + pthread_key_delete(_memory_thread_heap); +#endif +#if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) + FlsFree(fls_key); +#endif + +#if ENABLE_STATISTICS + //If you hit these asserts you probably have memory leaks or double frees in your code + assert(!atomic_load32(&_mapped_pages)); + assert(!atomic_load32(&_reserved_spans)); + assert(!atomic_load32(&_mapped_pages_os)); +#endif + + _rpmalloc_initialized = 0; +} + +//! Initialize thread, assign heap +extern inline void +rpmalloc_thread_initialize(void) { + if (!get_thread_heap_raw()) { + heap_t* heap = _memory_allocate_heap(); + if (heap) { + atomic_thread_fence_acquire(); +#if ENABLE_STATISTICS + atomic_incr32(&_memory_active_heaps); +#endif + set_thread_heap(heap); +#if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) + FlsSetValue(fls_key, heap); +#endif + } + } +} + +//! 
Finalize thread, orphan heap +void +rpmalloc_thread_finalize(void) { + heap_t* heap = get_thread_heap_raw(); + if (heap) + _memory_heap_finalize(heap); +} + +int +rpmalloc_is_thread_initialized(void) { + return (get_thread_heap_raw() != 0) ? 1 : 0; +} + +const rpmalloc_config_t* +rpmalloc_config(void) { + return &_memory_config; +} + +//! Map new pages to virtual memory +static void* +_memory_map_os(size_t size, size_t* offset) { + //Either size is a heap (a single page) or a (multiple) span - we only need to align spans, and only if larger than map granularity + size_t padding = ((size >= _memory_span_size) && (_memory_span_size > _memory_map_granularity)) ? _memory_span_size : 0; + assert(size >= _memory_page_size); +#if PLATFORM_WINDOWS + //Ok to MEM_COMMIT - according to MSDN, "actual physical pages are not allocated unless/until the virtual addresses are actually accessed" + void* ptr = VirtualAlloc(0, size + padding, (_memory_huge_pages ? MEM_LARGE_PAGES : 0) | MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); + if (!ptr) { + assert(!"Failed to map virtual memory block"); + return 0; + } +#else + int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED; +# if defined(__APPLE__) + int fd = (int)VM_MAKE_TAG(240U); + if (_memory_huge_pages) + fd |= VM_FLAGS_SUPERPAGE_SIZE_2MB; + void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, fd, 0); +# elif defined(MAP_HUGETLB) + void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, (_memory_huge_pages ? MAP_HUGETLB : 0) | flags, -1, 0); +# else + void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, -1, 0); +# endif + if ((ptr == MAP_FAILED) || !ptr) { + assert("Failed to map virtual memory block" == 0); + return 0; + } +#endif +#if ENABLE_STATISTICS + atomic_add32(&_mapped_pages_os, (int32_t)((size + padding) >> _memory_page_size_shift)); +#endif + if (padding) { + size_t final_padding = padding - ((uintptr_t)ptr & ~_memory_span_mask); + assert(final_padding <= _memory_span_size); + assert(final_padding <= padding); + assert(!(final_padding % 8)); + ptr = pointer_offset(ptr, final_padding); + *offset = final_padding >> 3; + } + assert((size < _memory_span_size) || !((uintptr_t)ptr & ~_memory_span_mask)); + return ptr; +} + +//! Unmap pages from virtual memory +static void +_memory_unmap_os(void* address, size_t size, size_t offset, size_t release) { + assert(release || (offset == 0)); + assert(!release || (release >= _memory_page_size)); + assert(size >= _memory_page_size); + if (release && offset) { + offset <<= 3; + address = pointer_offset(address, -(int32_t)offset); +#if PLATFORM_POSIX + //Padding is always one span size + release += _memory_span_size; +#endif + } +#if !DISABLE_UNMAP +#if PLATFORM_WINDOWS + if (!VirtualFree(address, release ? 0 : size, release ? 
MEM_RELEASE : MEM_DECOMMIT)) { + assert(!"Failed to unmap virtual memory block"); + } +#else + if (release) { + if (munmap(address, release)) { + assert("Failed to unmap virtual memory block" == 0); + } + } + else { +#if defined(POSIX_MADV_FREE) + if (posix_madvise(address, size, POSIX_MADV_FREE)) +#endif + if (posix_madvise(address, size, POSIX_MADV_DONTNEED)) { + assert("Failed to madvise virtual memory block as free" == 0); + } + } +#endif +#endif +#if ENABLE_STATISTICS + if (release) + atomic_add32(&_mapped_pages_os, -(int32_t)(release >> _memory_page_size_shift)); +#endif +} + +// Extern interface + +extern inline RPMALLOC_ALLOCATOR void* +rpmalloc(size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return 0; + } +#endif + heap_t* heap = get_thread_heap(); + return _memory_allocate(heap, size); +} + +extern inline void +rpfree(void* ptr) { + _memory_deallocate(ptr); +} + +extern inline RPMALLOC_ALLOCATOR void* +rpcalloc(size_t num, size_t size) { + size_t total; +#if ENABLE_VALIDATE_ARGS +#if PLATFORM_WINDOWS + int err = SizeTMult(num, size, &total); + if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#else + int err = __builtin_umull_overflow(num, size, &total); + if (err || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#endif +#else + total = num * size; +#endif + heap_t* heap = get_thread_heap(); + void* block = _memory_allocate(heap, total); + memset(block, 0, total); + return block; +} + +extern inline RPMALLOC_ALLOCATOR void* +rprealloc(void* ptr, size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return ptr; + } +#endif + return _memory_reallocate(ptr, size, 0, 0); +} + +extern RPMALLOC_ALLOCATOR void* +rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize, + unsigned int flags) { +#if ENABLE_VALIDATE_ARGS + if ((size + alignment < size) || (alignment > _memory_page_size)) { + errno = EINVAL; + return 0; + } +#endif + void* block; + if (alignment > 32) { + size_t usablesize = _memory_usable_size(ptr); + if ((usablesize >= size) && (size >= (usablesize / 2)) && !((uintptr_t)ptr & (alignment - 1))) + return ptr; + + block = rpaligned_alloc(alignment, size); + if (ptr) { + if (!oldsize) + oldsize = usablesize; + if (!(flags & RPMALLOC_NO_PRESERVE)) + memcpy(block, ptr, oldsize < size ? oldsize : size); + rpfree(ptr); + } + //Mark as having aligned blocks + span_t* span = (span_t*)((uintptr_t)block & _memory_span_mask); + span->flags |= SPAN_FLAG_ALIGNED_BLOCKS; + } else { + block = _memory_reallocate(ptr, size, oldsize, flags); + } + return block; +} + +extern RPMALLOC_ALLOCATOR void* +rpaligned_alloc(size_t alignment, size_t size) { + if (alignment <= 16) + return rpmalloc(size); + +#if ENABLE_VALIDATE_ARGS + if ((size + alignment) < size) { + errno = EINVAL; + return 0; + } + if (alignment & (alignment - 1)) { + errno = EINVAL; + return 0; + } +#endif + + void* ptr = 0; + size_t align_mask = alignment - 1; + if (alignment < _memory_page_size) { + ptr = rpmalloc(size + alignment); + if ((uintptr_t)ptr & align_mask) + ptr = (void*)(((uintptr_t)ptr & ~(uintptr_t)align_mask) + alignment); + //Mark as having aligned blocks + span_t* span = (span_t*)((uintptr_t)ptr & _memory_span_mask); + span->flags |= SPAN_FLAG_ALIGNED_BLOCKS; + return ptr; + } + + // Fallback to mapping new pages for this request. 
Since pointers passed + // to rpfree must be able to reach the start of the span by bitmasking of + // the address with the span size, the returned aligned pointer from this + // function must be with a span size of the start of the mapped area. + // In worst case this requires us to loop and map pages until we get a + // suitable memory address. It also means we can never align to span size + // or greater, since the span header will push alignment more than one + // span size away from span start (thus causing pointer mask to give us + // an invalid span start on free) + if (alignment & align_mask) { + errno = EINVAL; + return 0; + } + if (alignment >= _memory_span_size) { + errno = EINVAL; + return 0; + } + + size_t extra_pages = alignment / _memory_page_size; + + // Since each span has a header, we will at least need one extra memory page + size_t num_pages = 1 + (size / _memory_page_size); + if (size & (_memory_page_size - 1)) + ++num_pages; + + if (extra_pages > num_pages) + num_pages = 1 + extra_pages; + + size_t original_pages = num_pages; + size_t limit_pages = (_memory_span_size / _memory_page_size) * 2; + if (limit_pages < (original_pages * 2)) + limit_pages = original_pages * 2; + + size_t mapped_size, align_offset; + span_t* span; + +retry: + align_offset = 0; + mapped_size = num_pages * _memory_page_size; + + span = (span_t*)_memory_map(mapped_size, &align_offset); + if (!span) { + errno = ENOMEM; + return 0; + } + ptr = pointer_offset(span, SPAN_HEADER_SIZE); + + if ((uintptr_t)ptr & align_mask) + ptr = (void*)(((uintptr_t)ptr & ~(uintptr_t)align_mask) + alignment); + + if (((size_t)pointer_diff(ptr, span) >= _memory_span_size) || + (pointer_offset(ptr, size) > pointer_offset(span, mapped_size)) || + (((uintptr_t)ptr & _memory_span_mask) != (uintptr_t)span)) { + _memory_unmap(span, mapped_size, align_offset, mapped_size); + ++num_pages; + if (num_pages > limit_pages) { + errno = EINVAL; + return 0; + } + goto retry; + } + + //Store page count in span_count + span->size_class = (uint32_t)-1; + span->span_count = (uint32_t)num_pages; + span->align_offset = (uint32_t)align_offset; + _memory_statistics_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak); + + return ptr; +} + +extern inline RPMALLOC_ALLOCATOR void* +rpmemalign(size_t alignment, size_t size) { + return rpaligned_alloc(alignment, size); +} + +extern inline int +rpposix_memalign(void **memptr, size_t alignment, size_t size) { + if (memptr) + *memptr = rpaligned_alloc(alignment, size); + else + return EINVAL; + return *memptr ? 0 : ENOMEM; +} + +extern inline size_t +rpmalloc_usable_size(void* ptr) { + return (ptr ? 
_memory_usable_size(ptr) : 0); +} + +extern inline void +rpmalloc_thread_collect(void) { +} + +void +rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats) { + memset(stats, 0, sizeof(rpmalloc_thread_statistics_t)); + heap_t* heap = get_thread_heap_raw(); + if (!heap) + return; + + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + size_class_t* size_class = _memory_size_class + iclass; + heap_class_t* heap_class = heap->span_class + iclass; + span_t* span = heap_class->partial_span; + while (span) { + atomic_thread_fence_acquire(); + size_t free_count = span->list_size; + if (span->state == SPAN_STATE_PARTIAL) + free_count += (size_class->block_count - span->used_count); + stats->sizecache = free_count * size_class->block_size; + span = span->next; + } + } + +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + if (heap->span_cache[iclass]) + stats->spancache = (size_t)heap->span_cache[iclass]->list_size * (iclass + 1) * _memory_span_size; + span_t* deferred_list = !iclass ? (span_t*)atomic_load_ptr(&heap->span_cache_deferred) : 0; + //TODO: Incorrect, for deferred lists the size is NOT stored in list_size + if (deferred_list) + stats->spancache = (size_t)deferred_list->list_size * (iclass + 1) * _memory_span_size; + } +#endif +#if ENABLE_STATISTICS + stats->thread_to_global = heap->thread_to_global; + stats->global_to_thread = heap->global_to_thread; + + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + stats->span_use[iclass].current = (size_t)atomic_load32(&heap->span_use[iclass].current); + stats->span_use[iclass].peak = (size_t)heap->span_use[iclass].high; + stats->span_use[iclass].to_global = (size_t)heap->span_use[iclass].spans_to_global; + stats->span_use[iclass].from_global = (size_t)heap->span_use[iclass].spans_from_global; + stats->span_use[iclass].to_cache = (size_t)heap->span_use[iclass].spans_to_cache; + stats->span_use[iclass].from_cache = (size_t)heap->span_use[iclass].spans_from_cache; + stats->span_use[iclass].to_reserved = (size_t)heap->span_use[iclass].spans_to_reserved; + stats->span_use[iclass].from_reserved = (size_t)heap->span_use[iclass].spans_from_reserved; + stats->span_use[iclass].map_calls = (size_t)heap->span_use[iclass].spans_map_calls; + } + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + stats->size_use[iclass].alloc_current = (size_t)atomic_load32(&heap->size_class_use[iclass].alloc_current); + stats->size_use[iclass].alloc_peak = (size_t)heap->size_class_use[iclass].alloc_peak; + stats->size_use[iclass].alloc_total = (size_t)heap->size_class_use[iclass].alloc_total; + stats->size_use[iclass].free_total = (size_t)atomic_load32(&heap->size_class_use[iclass].free_total); + stats->size_use[iclass].spans_to_cache = (size_t)heap->size_class_use[iclass].spans_to_cache; + stats->size_use[iclass].spans_from_cache = (size_t)heap->size_class_use[iclass].spans_from_cache; + stats->size_use[iclass].spans_from_reserved = (size_t)heap->size_class_use[iclass].spans_from_reserved; + stats->size_use[iclass].map_calls = (size_t)heap->size_class_use[iclass].spans_map_calls; + } +#endif +} + +void +rpmalloc_global_statistics(rpmalloc_global_statistics_t* stats) { + memset(stats, 0, sizeof(rpmalloc_global_statistics_t)); +#if ENABLE_STATISTICS + stats->mapped = (size_t)atomic_load32(&_mapped_pages) * _memory_page_size; + stats->mapped_peak = (size_t)_mapped_pages_peak * _memory_page_size; + stats->mapped_total = (size_t)atomic_load32(&_mapped_total) * _memory_page_size; + 
stats->unmapped_total = (size_t)atomic_load32(&_unmapped_total) * _memory_page_size; + stats->huge_alloc = (size_t)atomic_load32(&_huge_pages_current) * _memory_page_size; + stats->huge_alloc_peak = (size_t)_huge_pages_peak * _memory_page_size; +#endif +#if ENABLE_GLOBAL_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + stats->cached += (size_t)atomic_load32(&_memory_span_cache[iclass].size) * (iclass + 1) * _memory_span_size; + } +#endif +} + +#if ENABLE_STATISTICS + +static void +_memory_heap_dump_statistics(heap_t* heap, void* file) { + fprintf(file, "Heap %d stats:\n", heap->id); + fprintf(file, "Class CurAlloc PeakAlloc TotAlloc TotFree BlkSize BlkCount SpansCur SpansPeak PeakAllocMiB ToCacheMiB FromCacheMiB FromReserveMiB MmapCalls\n"); + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + if (!heap->size_class_use[iclass].alloc_total) + continue; + fprintf(file, "%3u: %10u %10u %10u %10u %8u %8u %8d %9d %13zu %11zu %12zu %14zu %9u\n", (uint32_t)iclass, + atomic_load32(&heap->size_class_use[iclass].alloc_current), + heap->size_class_use[iclass].alloc_peak, + heap->size_class_use[iclass].alloc_total, + atomic_load32(&heap->size_class_use[iclass].free_total), + _memory_size_class[iclass].block_size, + _memory_size_class[iclass].block_count, + heap->size_class_use[iclass].spans_current, + heap->size_class_use[iclass].spans_peak, + ((size_t)heap->size_class_use[iclass].alloc_peak * (size_t)_memory_size_class[iclass].block_size) / (size_t)(1024 * 1024), + ((size_t)heap->size_class_use[iclass].spans_to_cache * _memory_span_size) / (size_t)(1024 * 1024), + ((size_t)heap->size_class_use[iclass].spans_from_cache * _memory_span_size) / (size_t)(1024 * 1024), + ((size_t)heap->size_class_use[iclass].spans_from_reserved * _memory_span_size) / (size_t)(1024 * 1024), + heap->size_class_use[iclass].spans_map_calls); + } + fprintf(file, "Spans Current Peak PeakMiB Cached ToCacheMiB FromCacheMiB ToReserveMiB FromReserveMiB ToGlobalMiB FromGlobalMiB MmapCalls\n"); + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + if (!heap->span_use[iclass].high && !heap->span_use[iclass].spans_map_calls) + continue; + fprintf(file, "%4u: %8d %8u %8zu %7u %11zu %12zu %12zu %14zu %11zu %13zu %10u\n", (uint32_t)(iclass + 1), + atomic_load32(&heap->span_use[iclass].current), + heap->span_use[iclass].high, + ((size_t)heap->span_use[iclass].high * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024), + heap->span_cache[iclass] ? 
heap->span_cache[iclass]->list_size : 0, + ((size_t)heap->span_use[iclass].spans_to_cache * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), + ((size_t)heap->span_use[iclass].spans_from_cache * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), + ((size_t)heap->span_use[iclass].spans_to_reserved * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), + ((size_t)heap->span_use[iclass].spans_from_reserved * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), + ((size_t)heap->span_use[iclass].spans_to_global * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024), + ((size_t)heap->span_use[iclass].spans_from_global * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024), + heap->span_use[iclass].spans_map_calls); + } + fprintf(file, "ThreadToGlobalMiB GlobalToThreadMiB\n"); + fprintf(file, "%17zu %17zu\n", (size_t)heap->thread_to_global / (size_t)(1024 * 1024), (size_t)heap->global_to_thread / (size_t)(1024 * 1024)); +} + +#endif + +void +rpmalloc_dump_statistics(void* file) { +#if ENABLE_STATISTICS + //If you hit this assert, you still have active threads or forgot to finalize some thread(s) + assert(atomic_load32(&_memory_active_heaps) == 0); + for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { + heap_t* heap = atomic_load_ptr(&_memory_heaps[list_idx]); + while (heap) { + int need_dump = 0; + for (size_t iclass = 0; !need_dump && (iclass < SIZE_CLASS_COUNT); ++iclass) { + if (!heap->size_class_use[iclass].alloc_total) { + assert(!atomic_load32(&heap->size_class_use[iclass].free_total)); + assert(!heap->size_class_use[iclass].spans_map_calls); + continue; + } + need_dump = 1; + } + for (size_t iclass = 0; !need_dump && (iclass < LARGE_CLASS_COUNT); ++iclass) { + if (!heap->span_use[iclass].high && !heap->span_use[iclass].spans_map_calls) + continue; + need_dump = 1; + } + if (need_dump) + _memory_heap_dump_statistics(heap, file); + heap = heap->next_heap; + } + } + fprintf(file, "Global stats:\n"); + size_t huge_current = (size_t)atomic_load32(&_huge_pages_current) * _memory_page_size; + size_t huge_peak = (size_t)_huge_pages_peak * _memory_page_size; + fprintf(file, "HugeCurrentMiB HugePeakMiB\n"); + fprintf(file, "%14zu %11zu\n", huge_current / (size_t)(1024 * 1024), huge_peak / (size_t)(1024 * 1024)); + + size_t mapped = (size_t)atomic_load32(&_mapped_pages) * _memory_page_size; + size_t mapped_os = (size_t)atomic_load32(&_mapped_pages_os) * _memory_page_size; + size_t mapped_peak = (size_t)_mapped_pages_peak * _memory_page_size; + size_t mapped_total = (size_t)atomic_load32(&_mapped_total) * _memory_page_size; + size_t unmapped_total = (size_t)atomic_load32(&_unmapped_total) * _memory_page_size; + size_t reserved_total = (size_t)atomic_load32(&_reserved_spans) * _memory_span_size; + fprintf(file, "MappedMiB MappedOSMiB MappedPeakMiB MappedTotalMiB UnmappedTotalMiB ReservedTotalMiB\n"); + fprintf(file, "%9zu %11zu %13zu %14zu %16zu %16zu\n", + mapped / (size_t)(1024 * 1024), + mapped_os / (size_t)(1024 * 1024), + mapped_peak / (size_t)(1024 * 1024), + mapped_total / (size_t)(1024 * 1024), + unmapped_total / (size_t)(1024 * 1024), + reserved_total / (size_t)(1024 * 1024)); + + fprintf(file, "\n"); +#else + (void)sizeof(file); +#endif +} + +#if ENABLE_PRELOAD || ENABLE_OVERRIDE + +#include "malloc.c" + +#endif Index: llvm/tools/llvm-shlib/CMakeLists.txt =================================================================== --- llvm/tools/llvm-shlib/CMakeLists.txt +++ llvm/tools/llvm-shlib/CMakeLists.txt @@ -172,4 +172,9 @@ 
   # Finally link the target.
   add_llvm_library(LLVM-C SHARED INSTALL_WITH_TOOLCHAIN ${SOURCES} DEPENDS intrinsics_gen)
 
+  if (LLVM_ENABLE_RPMALLOC AND MSVC)
+    # Force the malloc symbol to be resolved from LLVMSupport before the CRT libraries.
+    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -INCLUDE:malloc")
+  endif()
+
 endif()
Index: llvm/tools/remarks-shlib/CMakeLists.txt
===================================================================
--- llvm/tools/remarks-shlib/CMakeLists.txt
+++ llvm/tools/remarks-shlib/CMakeLists.txt
@@ -10,6 +10,11 @@
 
 add_llvm_library(Remarks SHARED INSTALL_WITH_TOOLCHAIN ${SOURCES})
 
+if (LLVM_ENABLE_RPMALLOC AND MSVC)
+  # Force the malloc symbol to be resolved from LLVMSupport before the CRT libraries.
+  set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -INCLUDE:malloc")
+endif()
+
 install(FILES ${LLVM_MAIN_INCLUDE_DIR}/llvm-c/Remarks.h
   DESTINATION include/llvm-c
   COMPONENT Remarks)
Index: llvm/unittests/Support/DynamicLibrary/CMakeLists.txt
===================================================================
--- llvm/unittests/Support/DynamicLibrary/CMakeLists.txt
+++ llvm/unittests/Support/DynamicLibrary/CMakeLists.txt
@@ -38,6 +38,14 @@
     )
 
   add_dependencies(DynamicLibraryTests ${NAME})
+
+  # Link in the Support library so these modules use the memory allocator
+  # override; otherwise the DynamicLibrary.Shutdown test fails because memory
+  # allocated with the CRT allocator would be freed by our custom allocator
+  # (see llvm/lib/Support/Windows/Memory.inc).
+  # /INCLUDE:malloc forces the linker to search LLVMSupport before libucrt.
+  llvm_map_components_to_libnames(llvm_libs Support)
+  target_link_libraries(${NAME} ${llvm_libs} "-INCLUDE:malloc")
 endfunction(dynlib_add_module)
 
 # Revert -Wl,-z,nodelete on this test since it relies on the file
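
Note on the statistics entry points above (rpmalloc_thread_statistics, rpmalloc_global_statistics, rpmalloc_dump_statistics): most of their counters are only populated when rpmalloc is compiled with ENABLE_STATISTICS; the cache figures only require ENABLE_GLOBAL_CACHE / ENABLE_THREAD_CACHE. A minimal sketch of a standalone C harness that queries the global counters follows; it is illustrative only and not part of the patch, and the main() driver, the 1 MiB test allocation and the use of stderr are assumptions made for the example.

// Hypothetical harness (not part of this patch) exercising the statistics API
// from llvm/lib/Support/rpmalloc. Most counters read as zero unless rpmalloc
// was compiled with ENABLE_STATISTICS.
#include <stdio.h>
#include "rpmalloc.h"

int main(void) {
  // LLVM tools initialize the allocator through the TLS callback in
  // Windows/Memory.inc; a standalone harness must do it explicitly.
  rpmalloc_initialize();

  void *p = rpmalloc(1024 * 1024); // force at least one span to be mapped
  rpfree(p);

  rpmalloc_global_statistics_t stats;
  rpmalloc_global_statistics(&stats);
  fprintf(stderr, "mapped=%zu peak=%zu total=%zu cached=%zu (bytes)\n",
          stats.mapped, stats.mapped_peak, stats.mapped_total, stats.cached);

  // rpmalloc_dump_statistics(stderr) additionally prints the per-heap tables,
  // but it asserts that every thread heap has already been finalized.
  rpmalloc_finalize();
  return 0;
}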