Index: lib/scudo/standalone/CMakeLists.txt
===================================================================
--- lib/scudo/standalone/CMakeLists.txt
+++ lib/scudo/standalone/CMakeLists.txt
@@ -80,6 +80,9 @@
   size_class_map.h
   stats.h
   string_utils.h
+  tsd.h
+  tsd_exclusive.h
+  tsd_shared.h
   vector.h)
 
 if(COMPILER_RT_HAS_SCUDO_STANDALONE)
Index: lib/scudo/standalone/internal_defs.h
===================================================================
--- lib/scudo/standalone/internal_defs.h
+++ lib/scudo/standalone/internal_defs.h
@@ -17,7 +17,7 @@
 #define SCUDO_DEBUG 0
 #endif
 
-#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
+#define ARRAY_SIZE(A) (sizeof(A) / sizeof((A)[0]))
 
 // String related macros.
 
Index: lib/scudo/standalone/mutex.h
===================================================================
--- lib/scudo/standalone/mutex.h
+++ lib/scudo/standalone/mutex.h
@@ -16,7 +16,6 @@
 
 class StaticSpinMutex {
 public:
-  void initLinkerInitialized() {}
   void init() { atomic_store_relaxed(&State, 0); }
 
   void lock() {
Index: lib/scudo/standalone/quarantine.h
===================================================================
--- lib/scudo/standalone/quarantine.h
+++ lib/scudo/standalone/quarantine.h
@@ -185,8 +185,6 @@
     atomic_store_relaxed(&MaxCacheSize, CacheSize);
 
     Cache.initLinkerInitialized();
-    CacheMutex.initLinkerInitialized();
-    RecyleMutex.initLinkerInitialized();
   }
   void init(uptr Size, uptr CacheSize) {
     memset(this, 0, sizeof(*this));
Index: lib/scudo/standalone/tests/CMakeLists.txt
===================================================================
--- lib/scudo/standalone/tests/CMakeLists.txt
+++ lib/scudo/standalone/tests/CMakeLists.txt
@@ -65,6 +65,7 @@
   size_class_map_test.cc
   stats_test.cc
   strings_test.cc
+  tsd_test.cc
   vector_test.cc
   scudo_unit_test_main.cc)
 
Index: lib/scudo/standalone/tests/primary_test.cc
===================================================================
--- lib/scudo/standalone/tests/primary_test.cc
+++ lib/scudo/standalone/tests/primary_test.cc
@@ -47,7 +47,7 @@
 
 TEST(ScudoPrimaryTest, BasicPrimary) {
   using SizeClassMap = scudo::DefaultSizeClassMap;
-  testPrimary>();
+  testPrimary>();
   testPrimary>();
 }
 
@@ -121,10 +121,14 @@
 
 TEST(ScudoPrimaryTest, PrimaryIterate) {
   using SizeClassMap = scudo::DefaultSizeClassMap;
-  testIteratePrimary>();
+  testIteratePrimary>();
   testIteratePrimary>();
 }
 
+// TODO(kostyak): reenable on 32-bit after implementing unmapTestOnly for the
+// primary: we are running out of addressable space without.
+#if SCUDO_WORDSIZE == 64U
+
 static std::mutex Mutex;
 static std::condition_variable Cv;
 static bool Ready = false;
 
@@ -142,7 +146,8 @@
     const scudo::uptr Size = std::rand() % Primary::SizeClassMap::MaxSize;
     const scudo::uptr ClassId = Primary::SizeClassMap::getClassIdBySize(Size);
     void *P = Cache.allocate(ClassId);
-    V.push_back(std::make_pair(ClassId, P));
+    if (P)
+      V.push_back(std::make_pair(ClassId, P));
   }
   while (!V.empty()) {
     auto Pair = V.back();
@@ -155,8 +160,8 @@
 template <typename Primary> static void testPrimaryThreaded() {
   std::unique_ptr<Primary> Allocator(new Primary);
   Allocator->init(/*ReleaseToOsInterval=*/-1);
-  std::thread Threads[10];
-  for (scudo::uptr I = 0; I < 10U; I++)
+  std::thread Threads[32];
+  for (scudo::uptr I = 0; I < ARRAY_SIZE(Threads); I++)
     Threads[I] = std::thread(performAllocations<Primary>, Allocator.get());
   {
     std::unique_lock<std::mutex> Lock(Mutex);
@@ -171,6 +176,8 @@
 
 TEST(ScudoPrimaryTest, PrimaryThreaded) {
   using SizeClassMap = scudo::SvelteSizeClassMap;
-  testPrimaryThreaded>();
-  testPrimaryThreaded>();
+  testPrimaryThreaded>();
+  testPrimaryThreaded>();
 }
+
+#endif // SCUDO_WORDSIZE == 64U
Index: lib/scudo/standalone/tests/tsd_test.cc
===================================================================
--- /dev/null
+++ lib/scudo/standalone/tests/tsd_test.cc
@@ -0,0 +1,152 @@
+//===-- tsd_test.cc ---------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "tsd_exclusive.h"
+#include "tsd_shared.h"
+
+#include "gtest/gtest.h"
+
+#include <condition_variable>
+#include <mutex>
+#include <thread>
+
+// We mock out an allocator with a TSD registry, mostly using empty stubs. The
+// cache contains a single volatile uptr, to be able to test that several
+// concurrent threads will not access or modify the same cache at the same time.
+template <class Config> class MockAllocator {
+public:
+  using ThisT = MockAllocator<Config>;
+  using TSDRegistryT = typename Config::template TSDRegistryT<ThisT>;
+  using CacheT = struct MockCache { volatile scudo::uptr Canary; };
+  using QuarantineCacheT = struct MockQuarantine {};
+
+  void initLinkerInitialized() {
+    // This should only be called once by the registry.
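+    // Both registries guard against repeat calls with an atomic Initialized
+    // flag and a mutex-protected initOnce() path.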
+    EXPECT_FALSE(Initialized);
+    Initialized = true;
+  }
+  void reset() { memset(this, 0, sizeof(*this)); }
+
+  void initCache(CacheT *Cache) { memset(Cache, 0, sizeof(*Cache)); }
+  void commitBack(scudo::TSD<ThisT> *TSD) {}
+  TSDRegistryT *getTSDRegistry() { return &TSDRegistry; }
+
+  bool isInitialized() { return Initialized; }
+
+private:
+  bool Initialized;
+  TSDRegistryT TSDRegistry;
+};
+
+struct OneCache {
+  template <class Allocator>
+  using TSDRegistryT = scudo::TSDRegistrySharedT;
+};
+
+struct SharedCaches {
+  template <class Allocator>
+  using TSDRegistryT = scudo::TSDRegistrySharedT;
+};
+
+struct ExclusiveCaches {
+  template <class Allocator>
+  using TSDRegistryT = scudo::TSDRegistryExT<Allocator>;
+};
+
+TEST(ScudoTSDTest, TSDRegistryInit) {
+  using AllocatorT = MockAllocator;
+  std::unique_ptr<AllocatorT> Allocator(new AllocatorT);
+  Allocator->reset();
+  EXPECT_FALSE(Allocator->isInitialized());
+
+  auto Registry = Allocator->getTSDRegistry();
+  Registry->initLinkerInitialized(Allocator.get());
+  EXPECT_TRUE(Allocator->isInitialized());
+}
+
+template <class AllocatorT> static void testRegistry() {
+  std::unique_ptr<AllocatorT> Allocator(new AllocatorT);
+  Allocator->reset();
+  EXPECT_FALSE(Allocator->isInitialized());
+
+  auto Registry = Allocator->getTSDRegistry();
+  Registry->initThreadMaybe(Allocator.get(), /*MinimalInit=*/true);
+  EXPECT_TRUE(Allocator->isInitialized());
+
+  bool UnlockRequired;
+  auto TSD = Registry->getTSDAndLock(&UnlockRequired);
+  EXPECT_NE(TSD, nullptr);
+  EXPECT_EQ(TSD->Cache.Canary, 0U);
+  if (UnlockRequired)
+    TSD->unlock();
+
+  Registry->initThreadMaybe(Allocator.get(), /*MinimalInit=*/false);
+  TSD = Registry->getTSDAndLock(&UnlockRequired);
+  EXPECT_NE(TSD, nullptr);
+  EXPECT_EQ(TSD->Cache.Canary, 0U);
+  memset(&TSD->Cache, 0x42, sizeof(TSD->Cache));
+  if (UnlockRequired)
+    TSD->unlock();
+}
+
+TEST(ScudoTSDTest, TSDRegistryBasic) {
+  testRegistry<MockAllocator<OneCache>>();
+  testRegistry<MockAllocator<SharedCaches>>();
+  testRegistry<MockAllocator<ExclusiveCaches>>();
+}
+
+static std::mutex Mutex;
+static std::condition_variable Cv;
+static bool Ready = false;
+
+template <typename AllocatorT> static void stressCache(AllocatorT *Allocator) {
+  auto Registry = Allocator->getTSDRegistry();
+  {
+    std::unique_lock<std::mutex> Lock(Mutex);
+    while (!Ready)
+      Cv.wait(Lock);
+  }
+  Registry->initThreadMaybe(Allocator, /*MinimalInit=*/false);
+  bool UnlockRequired;
+  auto TSD = Registry->getTSDAndLock(&UnlockRequired);
+  EXPECT_NE(TSD, nullptr);
+  // For an exclusive TSD, the cache should be empty. We cannot guarantee the
+  // same for a shared TSD.
+  if (!UnlockRequired)
+    EXPECT_EQ(TSD->Cache.Canary, 0U);
+  // Transform the thread id to a uptr to use it as canary.
+  const scudo::uptr Canary = static_cast<scudo::uptr>(
+      std::hash<std::thread::id>{}(std::this_thread::get_id()));
+  TSD->Cache.Canary = Canary;
+  // Loop a few times to make sure that a concurrent thread isn't modifying it.
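+  // A change of the canary here would mean the registry handed this locked
+  // TSD to another running thread as well, which would be a registry bug.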
+  for (scudo::uptr I = 0; I < 4096U; I++)
+    EXPECT_EQ(TSD->Cache.Canary, Canary);
+  if (UnlockRequired)
+    TSD->unlock();
+}
+
+template <typename AllocatorT> static void testRegistryThreaded() {
+  std::unique_ptr<AllocatorT> Allocator(new AllocatorT);
+  Allocator->reset();
+  std::thread Threads[32];
+  for (scudo::uptr I = 0; I < ARRAY_SIZE(Threads); I++)
+    Threads[I] = std::thread(stressCache<AllocatorT>, Allocator.get());
+  {
+    std::unique_lock<std::mutex> Lock(Mutex);
+    Ready = true;
+    Cv.notify_all();
+  }
+  for (auto &T : Threads)
+    T.join();
+}
+
+TEST(ScudoTSDTest, TSDRegistryThreaded) {
+  testRegistryThreaded<MockAllocator<OneCache>>();
+  testRegistryThreaded<MockAllocator<SharedCaches>>();
+  testRegistryThreaded<MockAllocator<ExclusiveCaches>>();
+}
Index: lib/scudo/standalone/tsd.h
===================================================================
--- /dev/null
+++ lib/scudo/standalone/tsd.h
@@ -0,0 +1,61 @@
+//===-- tsd.h ---------------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SCUDO_TSD_H_
+#define SCUDO_TSD_H_
+
+#include "atomic_helpers.h"
+#include "common.h"
+#include "mutex.h"
+
+#include <limits.h> // for PTHREAD_DESTRUCTOR_ITERATIONS
+
+namespace scudo {
+
+template <class Allocator> struct ALIGNED(SCUDO_CACHE_LINE_SIZE) TSD {
+  typename Allocator::CacheT Cache;
+  typename Allocator::QuarantineCacheT QuarantineCache;
+  u8 DestructorIterations;
+
+  void initLinkerInitialized(Allocator *Instance) {
+    Instance->initCache(&Cache);
+    DestructorIterations = PTHREAD_DESTRUCTOR_ITERATIONS;
+  }
+  void init(Allocator *Instance) {
+    memset(this, 0, sizeof(*this));
+    initLinkerInitialized(Instance);
+  }
+
+  void commitBack(Allocator *Instance) { Instance->commitBack(this); }
+
+  INLINE bool tryLock() {
+    if (Mutex.tryLock()) {
+      atomic_store_relaxed(&Precedence, 0);
+      return true;
+    }
+    if (atomic_load_relaxed(&Precedence) == 0)
+      atomic_store_relaxed(
+          &Precedence,
+          static_cast<uptr>(getMonotonicTime() >> FIRST_32_SECOND_64(16, 0)));
+    return false;
+  }
+  INLINE void lock() {
+    atomic_store_relaxed(&Precedence, 0);
+    Mutex.lock();
+  }
+  INLINE void unlock() { Mutex.unlock(); }
+  INLINE uptr getPrecedence() { return atomic_load_relaxed(&Precedence); }
+
+private:
+  StaticSpinMutex Mutex;
+  atomic_uptr Precedence;
+};
+
+} // namespace scudo
+
+#endif // SCUDO_TSD_H_
Index: lib/scudo/standalone/tsd_exclusive.h
===================================================================
--- /dev/null
+++ lib/scudo/standalone/tsd_exclusive.h
@@ -0,0 +1,115 @@
+//===-- tsd_exclusive.h -----------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SCUDO_TSD_EXCLUSIVE_H_
+#define SCUDO_TSD_EXCLUSIVE_H_
+
+#include "tsd.h"
+
+#include <pthread.h>
+
+namespace scudo {
+
+enum class ThreadState : u8 {
+  NotInitialized = 0,
+  Initialized,
+  TornDown,
+};
+
+template <class Allocator> void teardownThread(void *Ptr);
+
+template <class Allocator> struct TSDRegistryExT {
+  void initLinkerInitialized(Allocator *Instance) {
+    Instance->initLinkerInitialized();
+    CHECK_EQ(pthread_key_create(&PThreadKey, teardownThread<Allocator>), 0);
+    FallbackTSD = reinterpret_cast<TSD<Allocator> *>(
+        map(nullptr, sizeof(TSD<Allocator>), "scudo:tsd"));
+    FallbackTSD->initLinkerInitialized(Instance);
+    atomic_store(&Initialized, 1U, memory_order_seq_cst);
+  }
+  void init(Allocator *Instance) {
+    memset(this, 0, sizeof(*this));
+    initLinkerInitialized(Instance);
+  }
+
+  ALWAYS_INLINE void initThreadMaybe(Allocator *Instance, bool MinimalInit) {
+    if (LIKELY(State != ThreadState::NotInitialized))
+      return;
+    initThread(Instance, MinimalInit);
+  }
+
+  ALWAYS_INLINE TSD<Allocator> *getTSDAndLock(bool *UnlockRequired) {
+    if (LIKELY(State == ThreadState::Initialized)) {
+      *UnlockRequired = false;
+      return &ThreadTSD;
+    }
+    DCHECK(FallbackTSD);
+    FallbackTSD->lock();
+    *UnlockRequired = true;
+    return FallbackTSD;
+  }
+
+private:
+  void initOnce(Allocator *Instance) {
+    SpinMutexLock L(&Mutex);
+    if (atomic_load(&Initialized, memory_order_seq_cst))
+      return;
+    initLinkerInitialized(Instance); // Sets Initialized.
+  }
+
+  // Using minimal initialization allows for global initialization while keeping
+  // the thread specific structure untouched. The fallback structure will be
+  // used instead.
+  NOINLINE void initThread(Allocator *Instance, bool MinimalInit) {
+    if (UNLIKELY(!atomic_load(&Initialized, memory_order_seq_cst)))
+      initOnce(Instance);
+    if (MinimalInit)
+      return;
+    CHECK_EQ(
+        pthread_setspecific(PThreadKey, reinterpret_cast<void *>(Instance)), 0);
+    ThreadTSD.initLinkerInitialized(Instance);
+    State = ThreadState::Initialized;
+  }
+
+  pthread_key_t PThreadKey;
+  atomic_u8 Initialized;
+  TSD<Allocator> *FallbackTSD;
+  StaticSpinMutex Mutex;
+  static THREADLOCAL ThreadState State;
+  static THREADLOCAL TSD<Allocator> ThreadTSD;
+
+  friend void teardownThread<Allocator>(void *Ptr);
+};
+
+template <class Allocator>
+THREADLOCAL TSD<Allocator> TSDRegistryExT<Allocator>::ThreadTSD;
+template <class Allocator>
+THREADLOCAL ThreadState TSDRegistryExT<Allocator>::State;
+
+template <class Allocator> void teardownThread(void *Ptr) {
+  typedef TSDRegistryExT<Allocator> TSDRegistryT;
+  Allocator *Instance = reinterpret_cast<Allocator *>(Ptr);
+  // The glibc POSIX thread-local-storage deallocation routine calls user
+  // provided destructors in a loop of PTHREAD_DESTRUCTOR_ITERATIONS.
+  // We want to be called last since other destructors might call free and the
+  // like, so we wait until PTHREAD_DESTRUCTOR_ITERATIONS before draining the
+  // quarantine and swallowing the cache.
+  if (TSDRegistryT::ThreadTSD.DestructorIterations > 1) {
+    TSDRegistryT::ThreadTSD.DestructorIterations--;
+    // If pthread_setspecific fails, we will go ahead with the teardown.
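+    // Setting a non-null value for the key makes the C library invoke this
+    // destructor again on its next iteration, postponing the real teardown.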
+    if (LIKELY(pthread_setspecific(Instance->getTSDRegistry()->PThreadKey,
+                                   Ptr) == 0))
+      return;
+  }
+  TSDRegistryT::ThreadTSD.commitBack(Instance);
+  TSDRegistryT::State = ThreadState::TornDown;
+}
+
+} // namespace scudo
+
+#endif // SCUDO_TSD_EXCLUSIVE_H_
Index: lib/scudo/standalone/tsd_shared.h
===================================================================
--- /dev/null
+++ lib/scudo/standalone/tsd_shared.h
@@ -0,0 +1,167 @@
+//===-- tsd_shared.h --------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SCUDO_TSD_SHARED_H_
+#define SCUDO_TSD_SHARED_H_
+
+#include "linux.h" // for getAndroidTlsPtr()
+#include "tsd.h"
+
+#include <pthread.h>
+
+namespace scudo {
+
+template <class Allocator, u32 MaxTSDCount> struct TSDRegistrySharedT {
+  void initLinkerInitialized(Allocator *Instance) {
+    Instance->initLinkerInitialized();
+    CHECK_EQ(pthread_key_create(&PThreadKey, nullptr), 0); // For non-TLS
+    NumberOfTSDs = Min(Max(1U, getNumberOfCPUs()), MaxTSDCount);
+    TSDs = reinterpret_cast<TSD<Allocator> *>(
+        map(nullptr, sizeof(TSD<Allocator>) * NumberOfTSDs, "scudo:tsd"));
+    for (u32 I = 0; I < NumberOfTSDs; I++)
+      TSDs[I].initLinkerInitialized(Instance);
+    // Compute all the coprimes of NumberOfTSDs. This will be used to walk the
+    // array of TSDs in a random order. For details, see:
+    // https://lemire.me/blog/2017/09/18/visiting-all-values-in-an-array-exactly-once-in-random-order/
+    for (u32 I = 0; I < NumberOfTSDs; I++) {
+      u32 A = I + 1;
+      u32 B = NumberOfTSDs;
+      // Find the GCD between I + 1 and NumberOfTSDs. If 1, they are coprimes.
+      while (B != 0) {
+        const u32 T = A;
+        A = B;
+        B = T % B;
+      }
+      if (A == 1)
+        CoPrimes[NumberOfCoPrimes++] = I + 1;
+    }
+    atomic_store(&Initialized, 1U, memory_order_seq_cst);
+  }
+  void init(Allocator *Instance) {
+    memset(this, 0, sizeof(*this));
+    initLinkerInitialized(Instance);
+  }
+
+  ALWAYS_INLINE void initThreadMaybe(Allocator *Instance,
+                                     UNUSED bool MinimalInit) {
+    if (LIKELY(getCurrentTSD()))
+      return;
+    initThread(Instance);
+  }
+
+  ALWAYS_INLINE TSD<Allocator> *getTSDAndLock(bool *UnlockRequired) {
+    TSD<Allocator> *TSD = getCurrentTSD();
+    DCHECK(TSD);
+    *UnlockRequired = true;
+    // Try to lock the currently associated context.
+    if (TSD->tryLock())
+      return TSD;
+    // If that fails, go down the slow path.
+    return getTSDAndLockSlow(TSD);
+  }
+
+private:
+  ALWAYS_INLINE void setCurrentTSD(TSD<Allocator> *CurrentTSD) {
+#if SCUDO_ANDROID
+    *getAndroidTlsPtr() = reinterpret_cast<uptr>(CurrentTSD);
+#elif SCUDO_LINUX
+    ThreadTSD = CurrentTSD;
+#else
+    CHECK_EQ(
+        pthread_setspecific(PThreadKey, reinterpret_cast<void *>(CurrentTSD)),
+        0);
+#endif
+  }
+
+  ALWAYS_INLINE TSD<Allocator> *getCurrentTSD() {
+#if SCUDO_ANDROID
+    return reinterpret_cast<TSD<Allocator> *>(*getAndroidTlsPtr());
+#elif SCUDO_LINUX
+    return ThreadTSD;
+#else
+    return reinterpret_cast<TSD<Allocator> *>(pthread_getspecific(PThreadKey));
+#endif
+  }
+
+  void initOnce(Allocator *Instance) {
+    SpinMutexLock L(&Mutex);
+    if (atomic_load(&Initialized, memory_order_seq_cst))
+      return;
+    initLinkerInitialized(Instance); // Sets Initialized.
+  }
+
+  NOINLINE void initThread(Allocator *Instance) {
+    if (UNLIKELY(!atomic_load(&Initialized, memory_order_seq_cst)))
+      initOnce(Instance);
+    // Initial context assignment is done in a plain round-robin fashion.
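+    // The relaxed fetch-add is enough: each thread only needs its own counter
+    // value to spread the assignment; no ordering with other operations is
+    // required.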
+    const u32 Index = atomic_fetch_add(&CurrentIndex, 1U, memory_order_relaxed);
+    setCurrentTSD(&TSDs[Index % NumberOfTSDs]);
+  }
+
+  NOINLINE TSD<Allocator> *getTSDAndLockSlow(TSD<Allocator> *CurrentTSD) {
+    if (MaxTSDCount > 1U && NumberOfTSDs > 1U) {
+      // Use the Precedence of the current TSD as our random seed. Since we are
+      // in the slow path, it means that tryLock failed, and as a result it's
+      // very likely that said Precedence is non-zero.
+      u32 RandState = static_cast<u32>(CurrentTSD->getPrecedence());
+      const u32 R = getRandomU32(&RandState);
+      const u32 Inc = CoPrimes[R % NumberOfCoPrimes];
+      u32 Index = R % NumberOfTSDs;
+      uptr LowestPrecedence = UINTPTR_MAX;
+      TSD<Allocator> *CandidateTSD = nullptr;
+      // Go randomly through at most 4 contexts and find a candidate.
+      for (u32 I = 0; I < Min(4U, NumberOfTSDs); I++) {
+        if (TSDs[Index].tryLock()) {
+          setCurrentTSD(&TSDs[Index]);
+          return &TSDs[Index];
+        }
+        const uptr Precedence = TSDs[Index].getPrecedence();
+        // A 0 precedence here means another thread just locked this TSD.
+        if (UNLIKELY(Precedence == 0))
+          continue;
+        if (Precedence < LowestPrecedence) {
+          CandidateTSD = &TSDs[Index];
+          LowestPrecedence = Precedence;
+        }
+        Index += Inc;
+        if (Index >= NumberOfTSDs)
+          Index -= NumberOfTSDs;
+      }
+      if (CandidateTSD) {
+        CandidateTSD->lock();
+        setCurrentTSD(CandidateTSD);
+        return CandidateTSD;
+      }
+    }
+    // Last resort, stick with the current one.
+    CurrentTSD->lock();
+    return CurrentTSD;
+  }
+
+  pthread_key_t PThreadKey;
+  atomic_u32 CurrentIndex;
+  u32 NumberOfTSDs;
+  TSD<Allocator> *TSDs;
+  u32 NumberOfCoPrimes;
+  u32 CoPrimes[MaxTSDCount];
+  atomic_u8 Initialized;
+  StaticSpinMutex Mutex;
+#if SCUDO_LINUX && !SCUDO_ANDROID
+  static THREADLOCAL TSD<Allocator> *ThreadTSD;
+#endif
+};
+
+#if SCUDO_LINUX && !SCUDO_ANDROID
+template <class Allocator, u32 MaxTSDCount>
+THREADLOCAL TSD<Allocator>
+    *TSDRegistrySharedT<Allocator, MaxTSDCount>::ThreadTSD;
+#endif
+
+} // namespace scudo
+
+#endif // SCUDO_TSD_SHARED_H_
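
A note on the shared registry's slow path: Inc comes from CoPrimes, so it is coprime with NumberOfTSDs, and the walk "Index += Inc" with wrap-around therefore visits every TSD exactly once before it repeats; capping the scan at 4 contexts still samples 4 distinct candidates. The sketch below is not part of the patch: it reimplements just the stepping logic, with made-up sizes and increments, to check that property in isolation.

#include <cassert>
#include <cstdint>
#include <set>

// Walk Size slots starting at Start, stepping by Inc (Inc < Size and
// gcd(Inc, Size) == 1), using the same wrap-around as getTSDAndLockSlow().
static void walkOnce(uint32_t Size, uint32_t Start, uint32_t Inc) {
  std::set<uint32_t> Visited;
  uint32_t Index = Start % Size;
  for (uint32_t I = 0; I < Size; I++) {
    Visited.insert(Index);
    Index += Inc;
    if (Index >= Size)
      Index -= Size;
  }
  // Every slot was visited exactly once before the sequence repeats.
  assert(Visited.size() == Size);
}

int main() {
  walkOnce(8, 2, 3); // 3 is coprime with 8.
  walkOnce(8, 5, 5); // 5 is coprime with 8.
  walkOnce(6, 0, 5); // 5 is coprime with 6.
  return 0;
}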
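
For context on how the two registries are meant to be consumed: the combined allocator is not part of this patch, so the following is only a sketch of the expected calling pattern on the allocation path. getTSDRegistry(), initThreadMaybe(), getTSDAndLock() and unlock() mirror the interfaces above; MyAllocator and allocateFromCache() are hypothetical stand-ins.

// Sketch only: drive a TSD registry the way an allocator front end would.
template <class MyAllocator> void *allocateExample(MyAllocator *Instance) {
  auto *Registry = Instance->getTSDRegistry();
  // Make sure this thread has a TSD (exclusive) or an assigned context
  // (shared) before taking the fast path.
  Registry->initThreadMaybe(Instance, /*MinimalInit=*/false);
  bool UnlockRequired;
  // Exclusive registries hand back the calling thread's own TSD unlocked
  // (UnlockRequired == false); shared registries, and the exclusive fallback,
  // hand back a locked TSD that must be unlocked after use.
  auto *TSD = Registry->getTSDAndLock(&UnlockRequired);
  void *P = Instance->allocateFromCache(&TSD->Cache); // hypothetical helper
  if (UnlockRequired)
    TSD->unlock();
  return P;
}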