Index: compiler-rt/lib/xray/CMakeLists.txt
===================================================================
--- compiler-rt/lib/xray/CMakeLists.txt
+++ compiler-rt/lib/xray/CMakeLists.txt
@@ -2,6 +2,7 @@
 # XRay runtime library implementation files.
 set(XRAY_SOURCES
+  xray_buffer_queue.cc
   xray_init.cc
   xray_flags.cc
   xray_interface.cc
@@ -11,7 +12,6 @@
 # Implementation files for all XRay modes.
 set(XRAY_FDR_MODE_SOURCES
   xray_fdr_flags.cc
-  xray_buffer_queue.cc
   xray_fdr_logging.cc)
 
 set(XRAY_BASIC_MODE_SOURCES
Index: compiler-rt/lib/xray/tests/unit/allocator_test.cc
===================================================================
--- compiler-rt/lib/xray/tests/unit/allocator_test.cc
+++ compiler-rt/lib/xray/tests/unit/allocator_test.cc
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 #include "xray_allocator.h"
+#include "xray_buffer_queue.h"
 #include "gtest/gtest.h"
 
 namespace __xray {
@@ -56,5 +57,26 @@
   ASSERT_EQ(C, Expected);
 }
 
+TEST(AllocatorTest, AllocateFromNonOwned) {
+  bool Success = false;
+  BufferQueue BQ(GetPageSizeCached(), 10, Success);
+  ASSERT_TRUE(Success);
+  BufferQueue::Buffer B;
+  ASSERT_EQ(BQ.getBuffer(B), BufferQueue::ErrorCode::Ok);
+  {
+    Allocator<sizeof(OddSizedData)> A(B.Data, B.Size);
+
+    // Keep allocating until we hit a nullptr block.
+    unsigned C = 0;
+    auto Expected =
+        GetPageSizeCached() / RoundUpTo(sizeof(OddSizedData), kCacheLineSize);
+    for (auto B = A.Allocate(); B.Data != nullptr; B = A.Allocate(), ++C)
+      ;
+
+    ASSERT_EQ(C, Expected);
+  }
+  ASSERT_EQ(BQ.releaseBuffer(B), BufferQueue::ErrorCode::Ok);
+}
+
 } // namespace
 } // namespace __xray
Index: compiler-rt/lib/xray/tests/unit/profile_collector_test.cc
===================================================================
--- compiler-rt/lib/xray/tests/unit/profile_collector_test.cc
+++ compiler-rt/lib/xray/tests/unit/profile_collector_test.cc
@@ -110,24 +110,31 @@
 TEST(profileCollectorServiceTest, PostSerializeCollect) {
   profilingFlags()->setDefaults();
-  // The most basic use-case (the one we actually only care about) is the one
-  // where we ensure that we can post FunctionCallTrie instances, which are then
-  // destroyed but serialized properly.
-  //
-  // First, we initialise a set of allocators in the local scope. This ensures
-  // that we're able to copy the contents of the FunctionCallTrie that uses
-  // the local allocators.
-  auto Allocators = FunctionCallTrie::InitAllocators();
+  bool Success = false;
+  BufferQueue BQ(profilingFlags()->per_thread_allocator_max,
+                 profilingFlags()->buffers_max, Success);
+  ASSERT_EQ(Success, true);
+  FunctionCallTrie::Allocators::Buffers Buffers;
+  ASSERT_EQ(BQ.getBuffer(Buffers.NodeBuffer), BufferQueue::ErrorCode::Ok);
+  ASSERT_EQ(BQ.getBuffer(Buffers.RootsBuffer), BufferQueue::ErrorCode::Ok);
+  ASSERT_EQ(BQ.getBuffer(Buffers.ShadowStackBuffer),
+            BufferQueue::ErrorCode::Ok);
+  ASSERT_EQ(BQ.getBuffer(Buffers.NodeIdPairBuffer), BufferQueue::ErrorCode::Ok);
+  auto Allocators = FunctionCallTrie::InitAllocatorsFromBuffers(Buffers);
   FunctionCallTrie T(Allocators);
 
-  // Then, we populate the trie with some data.
+  // Populate the trie with some data.
   T.enterFunction(1, 1, 0);
   T.enterFunction(2, 2, 0);
   T.exitFunction(2, 3, 0);
   T.exitFunction(1, 4, 0);
 
+  // Reset the collector data structures.
+  profileCollectorService::reset();
+
   // Then we post the data to the global profile collector service.
-  profileCollectorService::post(T, 1);
+  profileCollectorService::post(&BQ, std::move(T), std::move(Allocators),
+                                std::move(Buffers), 1);
 
   // Then we serialize the data.
   profileCollectorService::serialize();
@@ -174,7 +181,21 @@
 // profileCollectorService. This simulates what the threads being profiled would
 // be doing anyway, but through the XRay logging implementation.
 void threadProcessing() {
-  thread_local auto Allocators = FunctionCallTrie::InitAllocators();
+  static bool Success = false;
+  static BufferQueue BQ(profilingFlags()->per_thread_allocator_max,
+                        profilingFlags()->buffers_max, Success);
+  thread_local FunctionCallTrie::Allocators::Buffers Buffers = [] {
+    FunctionCallTrie::Allocators::Buffers B;
+    BQ.getBuffer(B.NodeBuffer);
+    BQ.getBuffer(B.RootsBuffer);
+    BQ.getBuffer(B.ShadowStackBuffer);
+    BQ.getBuffer(B.NodeIdPairBuffer);
+    return B;
+  }();
+
+  thread_local auto Allocators =
+      FunctionCallTrie::InitAllocatorsFromBuffers(Buffers);
+
   FunctionCallTrie T(Allocators);
 
   T.enterFunction(1, 1, 0);
@@ -182,11 +203,15 @@
   T.exitFunction(2, 3, 0);
   T.exitFunction(1, 4, 0);
 
-  profileCollectorService::post(T, GetTid());
+  profileCollectorService::post(&BQ, std::move(T), std::move(Allocators),
+                                std::move(Buffers), GetTid());
 }
 
 TEST(profileCollectorServiceTest, PostSerializeCollectMultipleThread) {
   profilingFlags()->setDefaults();
+
+  profileCollectorService::reset();
+
   std::thread t1(threadProcessing);
   std::thread t2(threadProcessing);
Index: compiler-rt/lib/xray/tests/unit/segmented_array_test.cc
===================================================================
--- compiler-rt/lib/xray/tests/unit/segmented_array_test.cc
+++ compiler-rt/lib/xray/tests/unit/segmented_array_test.cc
@@ -2,6 +2,9 @@
 #include "xray_segmented_array.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
+#include <algorithm>
+#include <numeric>
+#include <vector>
 
 namespace __xray {
 namespace {
@@ -307,5 +310,40 @@
   }
 }
 
+TEST(SegmentedArrayTest, ArrayOfPointersIteratorAccess) {
+  using PtrArray = Array<int *>;
+  PtrArray::AllocatorType Alloc(16384);
+  Array<int *> A(Alloc);
+  static constexpr size_t Count = 100;
+  std::vector<int> Integers(Count);
+  std::iota(Integers.begin(), Integers.end(), 0);
+  for (auto &I : Integers)
+    ASSERT_NE(A.Append(&I), nullptr);
+  int V = 0;
+  ASSERT_EQ(A.size(), Count);
+  for (auto P : A) {
+    ASSERT_NE(P, nullptr);
+    ASSERT_EQ(*P, V++);
+  }
+}
+
+TEST(SegmentedArrayTest, ArrayOfPointersIteratorAccessExhaustion) {
+  using PtrArray = Array<int *>;
+  PtrArray::AllocatorType Alloc(4096);
+  Array<int *> A(Alloc);
+  static constexpr size_t Count = 1000;
+  std::vector<int> Integers(Count);
+  std::iota(Integers.begin(), Integers.end(), 0);
+  for (auto &I : Integers)
+    if (A.Append(&I) == nullptr)
+      break;
+  int V = 0;
+  ASSERT_LT(A.size(), Count);
+  for (auto P : A) {
+    ASSERT_NE(P, nullptr);
+    ASSERT_EQ(*P, V++);
+  }
+}
+
 } // namespace
 } // namespace __xray
Index: compiler-rt/lib/xray/xray_allocator.h
===================================================================
--- compiler-rt/lib/xray/xray_allocator.h
+++ compiler-rt/lib/xray/xray_allocator.h
@@ -21,8 +21,8 @@
 #include "sanitizer_common/sanitizer_mutex.h"
 #if SANITIZER_FUCHSIA
 #include <zircon/process.h>
-#include <zircon/syscalls.h>
 #include <zircon/status.h>
+#include <zircon/syscalls.h>
 #else
 #include "sanitizer_common/sanitizer_posix.h"
 #endif
@@ -50,13 +50,13 @@
   }
   uintptr_t B;
   Status =
-    _zx_vmar_map(_zx_vmar_root_self(), ZX_VM_PERM_READ | ZX_VM_PERM_WRITE, 0,
-                 Vmo, 0, sizeof(T), &B);
+      _zx_vmar_map(_zx_vmar_root_self(), ZX_VM_PERM_READ | ZX_VM_PERM_WRITE, 0,
+                   Vmo, 0, sizeof(T), &B);
   _zx_handle_close(Vmo);
   if (Status != ZX_OK) {
     if (Verbosity())
-      Report("XRay Profiling: Failed to map VMAR of size %zu: %s\n",
-             sizeof(T), _zx_status_get_string(Status));
+      Report("XRay Profiling: Failed to map VMAR of size %zu: %s\n", sizeof(T),
+             _zx_status_get_string(Status));
     return nullptr;
   }
   return reinterpret_cast<T *>(B);
@@ -80,8 +80,8 @@
     return;
   uptr RoundedSize = RoundUpTo(sizeof(T), GetPageSizeCached());
 #if SANITIZER_FUCHSIA
-  _zx_vmar_unmap(_zx_vmar_root_self(),
-                 reinterpret_cast<uintptr_t>(B), RoundedSize);
+  _zx_vmar_unmap(_zx_vmar_root_self(), reinterpret_cast<uintptr_t>(B),
+                 RoundedSize);
 #else
   internal_munmap(B, RoundedSize);
 #endif
@@ -95,19 +95,18 @@
   zx_status_t Status = _zx_vmo_create(RoundedSize, 0, &Vmo);
   if (Status != ZX_OK) {
     if (Verbosity())
-      Report("XRay Profiling: Failed to create VMO of size %zu: %s\n",
-             S, _zx_status_get_string(Status));
+      Report("XRay Profiling: Failed to create VMO of size %zu: %s\n", S,
+             _zx_status_get_string(Status));
     return nullptr;
   }
   uintptr_t B;
-  Status =
-      _zx_vmar_map(_zx_vmar_root_self(), ZX_VM_PERM_READ | ZX_VM_PERM_WRITE, 0,
-                   Vmo, 0, S, &B);
+  Status = _zx_vmar_map(_zx_vmar_root_self(),
+                        ZX_VM_PERM_READ | ZX_VM_PERM_WRITE, 0, Vmo, 0, S, &B);
   _zx_handle_close(Vmo);
   if (Status != ZX_OK) {
     if (Verbosity())
-      Report("XRay Profiling: Failed to map VMAR of size %zu: %s\n",
-             S, _zx_status_get_string(Status));
+      Report("XRay Profiling: Failed to map VMAR of size %zu: %s\n", S,
+             _zx_status_get_string(Status));
     return nullptr;
   }
 #else
@@ -130,7 +129,8 @@
     return;
   uptr RoundedSize = RoundUpTo(S * sizeof(T), GetPageSizeCached());
 #if SANITIZER_FUCHSIA
-  _zx_vmar_unmap(_zx_vmar_root_self(), reinterpret_cast<uintptr_t>(B), RoundedSize);
+  _zx_vmar_unmap(_zx_vmar_root_self(), reinterpret_cast<uintptr_t>(B),
+                 RoundedSize);
 #else
   internal_munmap(B, RoundedSize);
 #endif
@@ -175,6 +175,7 @@
   unsigned char *BackingStore = nullptr;
   unsigned char *AlignedNextBlock = nullptr;
   size_t AllocatedBlocks = 0;
+  bool Owned;
   SpinMutex Mutex{};
 
   void *Alloc() XRAY_NEVER_INSTRUMENT {
@@ -209,14 +210,14 @@
           0);
     }
 
-    if ((AllocatedBlocks * Block::Size) >= MaxMemory)
+    if (((AllocatedBlocks + 1) * Block::Size) > MaxMemory)
      return nullptr;
 
     // Align the pointer we'd like to return to an appropriate alignment, then
     // advance the pointer from where to start allocations.
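[Aside, not part of the patch: why the capacity test above now counts the block about to be handed out. When the backing store is not an exact multiple of Block::Size — which becomes common once the store is a BufferQueue buffer rather than a freshly rounded mmap — the old test lets the final allocation run past the end of the store. A minimal standalone sketch with assumed sizes:]

#include <cstddef>
#include <cstdio>

int main() {
  // Assumed for illustration: a 4096-byte buffer handed in by a BufferQueue,
  // carved into 96-byte (cache-line-rounded) blocks.
  constexpr std::size_t MaxMemory = 4096;
  constexpr std::size_t BlockSize = 96;

  // Old check: reject once AllocatedBlocks * BlockSize >= MaxMemory.
  std::size_t OldBlocks = 0;
  while (!((OldBlocks * BlockSize) >= MaxMemory))
    ++OldBlocks; // permits 43 blocks = 4128 bytes, 32 bytes past the store

  // New check: reject once (AllocatedBlocks + 1) * BlockSize > MaxMemory.
  std::size_t NewBlocks = 0;
  while (!(((NewBlocks + 1) * BlockSize) > MaxMemory))
    ++NewBlocks; // permits 42 blocks = 4032 bytes, still inside the store

  std::printf("old: %zu blocks, new: %zu blocks\n", OldBlocks, NewBlocks);
}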
     void *Result = AlignedNextBlock;
-    AlignedNextBlock = reinterpret_cast<unsigned char *>(
-        reinterpret_cast<uintptr_t>(AlignedNextBlock) + N);
+    AlignedNextBlock =
+        reinterpret_cast<unsigned char *>(AlignedNextBlock) + Block::Size;
     ++AllocatedBlocks;
     return Result;
   }
@@ -227,6 +228,15 @@
         BackingStore(nullptr),
         AlignedNextBlock(nullptr),
         AllocatedBlocks(0),
+        Owned(true),
         Mutex() {}
 
+  explicit Allocator(void *P, size_t M) XRAY_NEVER_INSTRUMENT
+      : MaxMemory(M),
+        BackingStore(reinterpret_cast<unsigned char *>(P)),
+        AlignedNextBlock(reinterpret_cast<unsigned char *>(P)),
+        AllocatedBlocks(0),
+        Owned(false),
+        Mutex() {}
+
   Allocator(const Allocator &) = delete;
@@ -243,6 +253,8 @@
     O.AlignedNextBlock = nullptr;
     AllocatedBlocks = O.AllocatedBlocks;
     O.AllocatedBlocks = 0;
+    Owned = O.Owned;
+    O.Owned = false;
   }
 
   Allocator &operator=(Allocator &&O) XRAY_NEVER_INSTRUMENT {
@@ -250,7 +262,7 @@
     SpinMutexLock L1(&O.Mutex);
     MaxMemory = O.MaxMemory;
     O.MaxMemory = 0;
-    if (BackingStore != nullptr)
+    if (Owned && BackingStore != nullptr)
       deallocate(BackingStore, MaxMemory);
     BackingStore = O.BackingStore;
     O.BackingStore = nullptr;
@@ -258,13 +270,15 @@
     O.AlignedNextBlock = nullptr;
     AllocatedBlocks = O.AllocatedBlocks;
     O.AllocatedBlocks = 0;
+    Owned = O.Owned;
+    O.Owned = false;
     return *this;
   }
 
   Block Allocate() XRAY_NEVER_INSTRUMENT { return {Alloc()}; }
 
   ~Allocator() NOEXCEPT XRAY_NEVER_INSTRUMENT {
-    if (BackingStore != nullptr) {
+    if (Owned && BackingStore != nullptr) {
       deallocateBuffer(BackingStore, MaxMemory);
     }
   }
Index: compiler-rt/lib/xray/xray_function_call_trie.h
===================================================================
--- compiler-rt/lib/xray/xray_function_call_trie.h
+++ compiler-rt/lib/xray/xray_function_call_trie.h
@@ -15,6 +15,7 @@
 #ifndef XRAY_FUNCTION_CALL_TRIE_H
 #define XRAY_FUNCTION_CALL_TRIE_H
 
+#include "xray_buffer_queue.h"
 #include "xray_defs.h"
 #include "xray_profiling_flags.h"
 #include "xray_segmented_array.h"
@@ -161,6 +162,35 @@
     Allocators(const Allocators &) = delete;
     Allocators &operator=(const Allocators &) = delete;
 
+    struct Buffers {
+      BufferQueue::Buffer NodeBuffer;
+      BufferQueue::Buffer RootsBuffer;
+      BufferQueue::Buffer ShadowStackBuffer;
+      BufferQueue::Buffer NodeIdPairBuffer;
+    };
+
+    explicit Allocators(Buffers &B) XRAY_NEVER_INSTRUMENT {
+      new (&NodeAllocatorStorage)
+          NodeAllocatorType(B.NodeBuffer.Data, B.NodeBuffer.Size);
+      NodeAllocator =
+          reinterpret_cast<NodeAllocatorType *>(&NodeAllocatorStorage);
+
+      new (&RootAllocatorStorage)
+          RootAllocatorType(B.RootsBuffer.Data, B.RootsBuffer.Size);
+      RootAllocator =
+          reinterpret_cast<RootAllocatorType *>(&RootAllocatorStorage);
+
+      new (&ShadowStackAllocatorStorage) ShadowStackAllocatorType(
+          B.ShadowStackBuffer.Data, B.ShadowStackBuffer.Size);
+      ShadowStackAllocator = reinterpret_cast<ShadowStackAllocatorType *>(
+          &ShadowStackAllocatorStorage);
+
+      new (&NodeIdPairAllocatorStorage) NodeIdPairAllocatorType(
+          B.NodeIdPairBuffer.Data, B.NodeIdPairBuffer.Size);
+      NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>(
+          &NodeIdPairAllocatorStorage);
+    }
+
     explicit Allocators(uptr Max) XRAY_NEVER_INSTRUMENT {
       new (&NodeAllocatorStorage) NodeAllocatorType(Max);
       NodeAllocator =
@@ -283,6 +313,12 @@
     return A;
   }
 
+  static Allocators
+  InitAllocatorsFromBuffers(Allocators::Buffers &Bufs) XRAY_NEVER_INSTRUMENT {
+    Allocators A(Bufs);
+    return A;
+  }
+
 private:
   NodeArray Nodes;
   RootArray Roots;
@@ -323,16 +359,27 @@
   void enterFunction(const int32_t FId, uint64_t TSC,
                      uint16_t CPU) XRAY_NEVER_INSTRUMENT {
     DCHECK_NE(FId, 0);
-    // This function primarily deals with ensuring that the ShadowStack is
-    // consistent and ready for when an exit event is encountered.
+
+    // If we've already overflowed the function call stack, do not bother
+    // attempting to record any more function entries.
+    if (UNLIKELY(OverflowedFunctions)) {
+      ++OverflowedFunctions;
+      return;
+    }
+
+    // If this is the first function we've encountered, we want to set up the
+    // node(s) and treat it as a root.
     if (UNLIKELY(ShadowStack.empty())) {
-      auto NewRoot = Nodes.AppendEmplace(
-          nullptr, NodeIdPairArray{*NodeIdPairAllocator}, 0u, 0u, FId);
+      auto *NewRoot = Nodes.AppendEmplace(
+          nullptr, NodeIdPairArray(*NodeIdPairAllocator), 0u, 0u, FId);
       if (UNLIKELY(NewRoot == nullptr))
         return;
-      if (Roots.Append(NewRoot) == nullptr)
+      if (Roots.AppendEmplace(NewRoot) == nullptr) {
+        Nodes.trim(1);
         return;
+      }
       if (ShadowStack.AppendEmplace(TSC, NewRoot, CPU) == nullptr) {
+        Nodes.trim(1);
         Roots.trim(1);
         ++OverflowedFunctions;
         return;
@@ -340,13 +387,14 @@
       return;
     }
 
-    auto &Top = ShadowStack.back();
-    auto TopNode = Top.NodePtr;
+    // From this point on, we require that the stack is not empty.
+    DCHECK(!ShadowStack.empty());
+    auto TopNode = ShadowStack.back().NodePtr;
     DCHECK_NE(TopNode, nullptr);
 
-    // If we've seen this callee before, then we just access that node and place
-    // that on the top of the stack.
-    auto Callee = TopNode->Callees.find_element(
+    // If we've seen this callee before, then we access that node and place that
+    // on the top of the stack.
+    auto *Callee = TopNode->Callees.find_element(
         [FId](const NodeIdPair &NR) { return NR.FId == FId; });
     if (Callee != nullptr) {
       CHECK_NE(Callee->NodePtr, nullptr);
@@ -356,7 +404,7 @@
     }
 
     // This means we've never seen this stack before, create a new node here.
-    auto NewNode = Nodes.AppendEmplace(
+    auto *NewNode = Nodes.AppendEmplace(
         TopNode, NodeIdPairArray(*NodeIdPairAllocator), 0u, 0u, FId);
     if (UNLIKELY(NewNode == nullptr))
       return;
@@ -364,7 +412,6 @@
     TopNode->Callees.AppendEmplace(NewNode, FId);
     if (ShadowStack.AppendEmplace(TSC, NewNode, CPU) == nullptr)
       ++OverflowedFunctions;
-    DCHECK_NE(ShadowStack.back().NodePtr, nullptr);
     return;
   }
 
@@ -456,11 +503,13 @@
       if (UNLIKELY(NewRoot == nullptr))
         return;
 
-      O.Roots.Append(NewRoot);
+      if (UNLIKELY(O.Roots.Append(NewRoot) == nullptr))
+        return;
 
       // TODO: Figure out what to do if we fail to allocate any more stack
       // space. Maybe warn or report once?
-      DFSStack.AppendEmplace(Root, NewRoot);
+      if (DFSStack.AppendEmplace(Root, NewRoot) == nullptr)
+        return;
       while (!DFSStack.empty()) {
         NodeAndParent NP = DFSStack.back();
         DCHECK_NE(NP.Node, nullptr);
@@ -473,8 +522,12 @@
                                            Callee.FId);
           if (UNLIKELY(NewNode == nullptr))
             return;
-          NP.NewNode->Callees.AppendEmplace(NewNode, Callee.FId);
-          DFSStack.AppendEmplace(Callee.NodePtr, NewNode);
+          if (UNLIKELY(NP.NewNode->Callees.AppendEmplace(NewNode, Callee.FId) ==
+                       nullptr))
+            return;
+          if (UNLIKELY(DFSStack.AppendEmplace(Callee.NodePtr, NewNode) ==
+                       nullptr))
+            return;
         }
       }
     }
Index: compiler-rt/lib/xray/xray_profile_collector.h
===================================================================
--- compiler-rt/lib/xray/xray_profile_collector.h
+++ compiler-rt/lib/xray/xray_profile_collector.h
@@ -33,27 +33,13 @@
 /// Posts the FunctionCallTrie associated with a specific Thread ID. This
 /// will:
 ///
-/// - Make a copy of the FunctionCallTrie and store that against the Thread
-///   ID. This will use the global allocator for the service-managed
-///   FunctionCallTrie instances.
-/// - Queue up a pointer to the FunctionCallTrie.
-/// - If the queue is long enough (longer than some arbitrary threshold) we
-///   then pre-calculate a single FunctionCallTrie for the whole process.
+/// Moves the collection of FunctionCallTrie, Allocators, and Buffers associated
+/// with a thread's data to the queue. This takes ownership of the memory
+/// associated with a thread, and manages those exclusively.
 ///
-///
-/// We are making a copy of the FunctionCallTrie because the intent is to have
-/// this function be called at thread exit, or soon after the profiling
-/// handler is finalized through the XRay APIs. By letting threads each
-/// process their own thread-local FunctionCallTrie instances, we're removing
-/// the need for synchronisation across threads while we're profiling.
-/// However, once we're done profiling, we can then collect copies of these
-/// FunctionCallTrie instances and pay the cost of the copy.
-///
-/// NOTE: In the future, if this turns out to be more costly than "moving" the
-/// FunctionCallTrie instances from the owning thread to the collector
-/// service, then we can change the implementation to do it this way (moving)
-/// instead.
-void post(const FunctionCallTrie &T, tid_t TId);
+void post(BufferQueue *Q, FunctionCallTrie &&T,
+          FunctionCallTrie::Allocators &&A,
+          FunctionCallTrie::Allocators::Buffers &&B, tid_t TId);
 
 /// The serialize will process all FunctionCallTrie instances in memory, and
 /// turn those into specifically formatted blocks, each describing the
Index: compiler-rt/lib/xray/xray_profile_collector.cc
===================================================================
--- compiler-rt/lib/xray/xray_profile_collector.cc
+++ compiler-rt/lib/xray/xray_profile_collector.cc
@@ -57,52 +57,90 @@
   u64 ThreadId;
 };
 
-using ThreadTriesArray = Array<ThreadTrie>;
+struct ThreadData {
+  BufferQueue *BQ;
+  FunctionCallTrie::Allocators::Buffers Buffers;
+  FunctionCallTrie::Allocators Allocators;
+  FunctionCallTrie FCT;
+  tid_t TId;
+};
+
+using ThreadDataArray = Array<ThreadData>;
+using ThreadDataAllocator = ThreadDataArray::AllocatorType;
+
+// We use a separate buffer queue for the backing store for the allocator used
+// by the ThreadData array. This lets us host the buffers, allocators, and tries
+// associated with a thread by moving the data into the array instead of
+// attempting to copy the data to a separately backed set of tries.
+static typename std::aligned_storage<
+    sizeof(BufferQueue), alignof(BufferQueue)>::type BufferQueueStorage;
+static BufferQueue *BQ = nullptr;
+static BufferQueue::Buffer Buffer;
+static typename std::aligned_storage<sizeof(ThreadDataAllocator)>::type
+    ThreadDataAllocatorStorage;
+static typename std::aligned_storage<sizeof(ThreadDataArray)>::type
+    ThreadDataArrayStorage;
+
+static ThreadDataAllocator *TDAllocator = nullptr;
+static ThreadDataArray *TDArray = nullptr;
+
 using ProfileBufferArray = Array<ProfileBuffer>;
-using ThreadTriesArrayAllocator = typename ThreadTriesArray::AllocatorType;
 using ProfileBufferArrayAllocator = typename ProfileBufferArray::AllocatorType;
 
 // These need to be global aligned storage to avoid dynamic initialization. We
 // need these to be aligned to allow us to placement new objects into the
 // storage, and have pointers to those objects be appropriately aligned.
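[Aside, not part of the patch: the idiom the comment above describes, as a minimal standalone sketch. The Collector type and initCollector function are hypothetical; the point is only that a static, trivially-initialized storage blob avoids C++ dynamic initializers in the runtime, the object is constructed into it with placement new, and a typed pointer is kept for access.]

#include <new>
#include <type_traits>

struct Collector {
  explicit Collector(int Max) : Max(Max) {}
  int Max;
};

// A plain byte blob with the right size and alignment; no constructor runs
// at program start.
static typename std::aligned_storage<sizeof(Collector),
                                     alignof(Collector)>::type CollectorStorage;
static Collector *C = nullptr;

void initCollector(int Max) {
  // Construct the object into the preallocated storage instead of relying on
  // a global constructor or heap allocation.
  new (&CollectorStorage) Collector(Max);
  C = reinterpret_cast<Collector *>(&CollectorStorage);
}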
-static typename std::aligned_storage<sizeof(FunctionCallTrie::Allocators)>::type
-    AllocatorStorage;
-static typename std::aligned_storage<sizeof(ThreadTriesArray)>::type
-    ThreadTriesStorage;
 static typename std::aligned_storage<sizeof(ProfileBufferArray)>::type
     ProfileBuffersStorage;
-static typename std::aligned_storage<sizeof(ThreadTriesArrayAllocator)>::type
-    ThreadTriesArrayAllocatorStorage;
 static typename std::aligned_storage<sizeof(ProfileBufferArrayAllocator)>::type
     ProfileBufferArrayAllocatorStorage;
 
-static ThreadTriesArray *ThreadTries = nullptr;
-static ThreadTriesArrayAllocator *ThreadTriesAllocator = nullptr;
-static ProfileBufferArray *ProfileBuffers = nullptr;
 static ProfileBufferArrayAllocator *ProfileBuffersAllocator = nullptr;
-static FunctionCallTrie::Allocators *GlobalAllocators = nullptr;
+static ProfileBufferArray *ProfileBuffers = nullptr;
+
+// Use a global flag to determine whether the collector implementation has been
+// initialized.
+static atomic_uint8_t CollectorInitialized{0};
 
 } // namespace
 
-void post(const FunctionCallTrie &T, tid_t TId) XRAY_NEVER_INSTRUMENT {
-  static pthread_once_t Once = PTHREAD_ONCE_INIT;
-  pthread_once(
-      &Once, +[]() XRAY_NEVER_INSTRUMENT { reset(); });
+void post(BufferQueue *Q, FunctionCallTrie &&T,
+          FunctionCallTrie::Allocators &&A,
+          FunctionCallTrie::Allocators::Buffers &&B,
+          tid_t TId) XRAY_NEVER_INSTRUMENT {
+  DCHECK_NE(Q, nullptr);
+
+  // Bail out early if the collector has not been initialized.
+  if (!atomic_load(&CollectorInitialized, memory_order_acquire)) {
+    T.~FunctionCallTrie();
+    A.~Allocators();
+    Q->releaseBuffer(B.NodeBuffer);
+    Q->releaseBuffer(B.RootsBuffer);
+    Q->releaseBuffer(B.ShadowStackBuffer);
+    Q->releaseBuffer(B.NodeIdPairBuffer);
+    B.~Buffers();
+    return;
+  }
 
-  ThreadTrie *Item = nullptr;
   {
     SpinMutexLock Lock(&GlobalMutex);
-    if (GlobalAllocators == nullptr || ThreadTries == nullptr)
-      return;
-
-    Item = ThreadTries->Append({});
-    if (Item == nullptr)
-      return;
-
-    Item->TId = TId;
-    auto Trie = reinterpret_cast<FunctionCallTrie *>(&Item->TrieStorage);
-    new (Trie) FunctionCallTrie(*GlobalAllocators);
-    T.deepCopyInto(*Trie);
+    DCHECK_NE(TDAllocator, nullptr);
+    DCHECK_NE(TDArray, nullptr);
+
+    if (TDArray->AppendEmplace(Q, std::move(B), std::move(A), std::move(T),
+                               TId) == nullptr) {
+      // If we fail to add the data to the array, we should destroy the objects
+      // handed us.
+      T.~FunctionCallTrie();
+      A.~Allocators();
+      Q->releaseBuffer(B.NodeBuffer);
+      Q->releaseBuffer(B.RootsBuffer);
+      Q->releaseBuffer(B.ShadowStackBuffer);
+      Q->releaseBuffer(B.NodeIdPairBuffer);
+      B.~Buffers();
    }
   }
 }
 
@@ -133,11 +171,13 @@
   using StackAllocator = typename StackArray::AllocatorType;
   StackAllocator StackAlloc(profilingFlags()->stack_allocator_max);
   StackArray DFSStack(StackAlloc);
-  for (const auto R : Trie.getRoots()) {
+  for (const auto *R : Trie.getRoots()) {
     DFSStack.Append(R);
     while (!DFSStack.empty()) {
-      auto Node = DFSStack.back();
+      auto *Node = DFSStack.back();
       DFSStack.trim(1);
+      if (Node == nullptr)
+        continue;
       auto Record = PRs.AppendEmplace(PathArray{PA}, Node);
       if (Record == nullptr)
         return;
@@ -191,40 +231,54 @@
 } // namespace
 
 void serialize() XRAY_NEVER_INSTRUMENT {
-  SpinMutexLock Lock(&GlobalMutex);
-
-  if (GlobalAllocators == nullptr || ThreadTries == nullptr ||
-      ProfileBuffers == nullptr)
+  if (!atomic_load(&CollectorInitialized, memory_order_acquire))
     return;
 
+  SpinMutexLock Lock(&GlobalMutex);
+
   // Clear out the global ProfileBuffers, if it's not empty.
   for (auto &B : *ProfileBuffers)
     deallocateBuffer(reinterpret_cast<unsigned char *>(B.Data), B.Size);
   ProfileBuffers->trim(ProfileBuffers->size());
 
-  if (ThreadTries->empty())
+  DCHECK_NE(TDArray, nullptr);
+  if (TDArray->empty())
     return;
 
   // Then repopulate the global ProfileBuffers.
   u32 I = 0;
-  for (const auto &ThreadTrie : *ThreadTries) {
+  auto MaxSize = profilingFlags()->global_allocator_max;
+  auto ProfileArena = allocateBuffer(MaxSize);
+  if (ProfileArena == nullptr)
+    return;
+
+  auto ProfileArenaCleanup = at_scope_exit(
+      [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(ProfileArena, MaxSize); });
+
+  auto PathArena = allocateBuffer(profilingFlags()->global_allocator_max);
+  if (PathArena == nullptr)
+    return;
+
+  auto PathArenaCleanup = at_scope_exit(
+      [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(PathArena, MaxSize); });
+
+  for (const auto &ThreadTrie : *TDArray) {
     using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
-    ProfileRecordAllocator PRAlloc(profilingFlags()->global_allocator_max);
-    ProfileRecord::PathAllocator PathAlloc(
-        profilingFlags()->global_allocator_max);
+    ProfileRecordAllocator PRAlloc(ProfileArena,
+                                   profilingFlags()->global_allocator_max);
+    ProfileRecord::PathAllocator PathAlloc(
+        PathArena, profilingFlags()->global_allocator_max);
     ProfileRecordArray ProfileRecords(PRAlloc);
 
     // First, we want to compute the amount of space we're going to need. We'll
     // use a local allocator and an __xray::Array<...> to store the intermediary
     // data, then compute the size as we're going along. Then we'll allocate the
    // contiguous space to contain the thread buffer data.
-    const auto &Trie =
-        *reinterpret_cast<const FunctionCallTrie *>(&(ThreadTrie.TrieStorage));
-    if (Trie.getRoots().empty())
+    if (ThreadTrie.FCT.getRoots().empty())
       continue;
-    populateRecords(ProfileRecords, PathAlloc, Trie);
-    DCHECK(!Trie.getRoots().empty());
+
+    populateRecords(ProfileRecords, PathAlloc, ThreadTrie.FCT);
+    DCHECK(!ThreadTrie.FCT.getRoots().empty());
     DCHECK(!ProfileRecords.empty());
 
     // Go through each record, to compute the sizes.
@@ -241,15 +295,16 @@
       CumulativeSizes += 20 + (4 * Record.Path.size());
 
     BlockHeader Header{16 + CumulativeSizes, I++, ThreadTrie.TId};
-    auto Buffer = ProfileBuffers->Append({});
-    Buffer->Size = sizeof(Header) + CumulativeSizes;
-    Buffer->Data = allocateBuffer(Buffer->Size);
-    DCHECK_NE(Buffer->Data, nullptr);
-    serializeRecords(Buffer, Header, ProfileRecords);
+    auto B = ProfileBuffers->Append({});
+    B->Size = sizeof(Header) + CumulativeSizes;
+    B->Data = allocateBuffer(B->Size);
+    DCHECK_NE(B->Data, nullptr);
+    serializeRecords(B, Header, ProfileRecords);
   }
 }
 
 void reset() XRAY_NEVER_INSTRUMENT {
+  atomic_store(&CollectorInitialized, 0, memory_order_release);
   SpinMutexLock Lock(&GlobalMutex);
 
   if (ProfileBuffers != nullptr) {
@@ -257,46 +312,68 @@
     for (auto &B : *ProfileBuffers)
       deallocateBuffer(reinterpret_cast<unsigned char *>(B.Data), B.Size);
     ProfileBuffers->trim(ProfileBuffers->size());
+    ProfileBuffers = nullptr;
   }
 
-  if (ThreadTries != nullptr) {
-    // Clear out the function call tries per thread.
-    for (auto &T : *ThreadTries) {
-      auto Trie = reinterpret_cast<FunctionCallTrie *>(&T.TrieStorage);
-      Trie->~FunctionCallTrie();
+  if (TDArray != nullptr) {
+    // Release the resources as required.
+    for (auto &TD : *TDArray) {
+      TD.BQ->releaseBuffer(TD.Buffers.NodeBuffer);
+      TD.BQ->releaseBuffer(TD.Buffers.RootsBuffer);
+      TD.BQ->releaseBuffer(TD.Buffers.ShadowStackBuffer);
+      TD.BQ->releaseBuffer(TD.Buffers.NodeIdPairBuffer);
     }
-    ThreadTries->trim(ThreadTries->size());
+
+    // We don't bother destroying the array here because we've already
+    // potentially freed the backing store for the array. Instead we're going to
+    // reset the pointer to nullptr, and re-use the storage later instead
+    // (placement-new'ing into the storage as-is).
+    TDArray = nullptr;
   }
 
-  // Reset the global allocators.
-  if (GlobalAllocators != nullptr)
-    GlobalAllocators->~Allocators();
+  if (TDAllocator != nullptr) {
+    TDAllocator->~Allocator();
+    TDAllocator = nullptr;
+  }
 
-  GlobalAllocators =
-      reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorStorage);
-  new (GlobalAllocators)
-      FunctionCallTrie::Allocators(FunctionCallTrie::InitAllocators());
+  if (Buffer.Data != nullptr) {
+    BQ->releaseBuffer(Buffer);
+  }
 
-  if (ThreadTriesAllocator != nullptr)
-    ThreadTriesAllocator->~ThreadTriesArrayAllocator();
+  if (BQ == nullptr) {
+    bool Success = false;
+    new (&BufferQueueStorage)
+        BufferQueue(profilingFlags()->global_allocator_max, 1, Success);
+    if (!Success)
+      return;
+    BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage);
+  } else {
+    BQ->finalize();
 
-  ThreadTriesAllocator = reinterpret_cast<ThreadTriesArrayAllocator *>(
-      &ThreadTriesArrayAllocatorStorage);
-  new (ThreadTriesAllocator)
-      ThreadTriesArrayAllocator(profilingFlags()->global_allocator_max);
-  ThreadTries = reinterpret_cast<ThreadTriesArray *>(&ThreadTriesStorage);
-  new (ThreadTries) ThreadTriesArray(*ThreadTriesAllocator);
+    if (BQ->init(profilingFlags()->global_allocator_max, 1) !=
+        BufferQueue::ErrorCode::Ok)
+      return;
+  }
 
-  if (ProfileBuffersAllocator != nullptr)
-    ProfileBuffersAllocator->~ProfileBufferArrayAllocator();
+  if (BQ->getBuffer(Buffer) != BufferQueue::ErrorCode::Ok)
+    return;
 
+  new (&ProfileBufferArrayAllocatorStorage)
+      ProfileBufferArrayAllocator(profilingFlags()->global_allocator_max);
   ProfileBuffersAllocator = reinterpret_cast<ProfileBufferArrayAllocator *>(
      &ProfileBufferArrayAllocatorStorage);
-  new (ProfileBuffersAllocator)
-      ProfileBufferArrayAllocator(profilingFlags()->global_allocator_max);
+
+  new (&ProfileBuffersStorage) ProfileBufferArray(*ProfileBuffersAllocator);
   ProfileBuffers = reinterpret_cast<ProfileBufferArray *>(&ProfileBuffersStorage);
-  new (ProfileBuffers) ProfileBufferArray(*ProfileBuffersAllocator);
+
+  new (&ThreadDataAllocatorStorage)
+      ThreadDataAllocator(Buffer.Data, Buffer.Size);
+  TDAllocator =
+      reinterpret_cast<ThreadDataAllocator *>(&ThreadDataAllocatorStorage);
+  new (&ThreadDataArrayStorage) ThreadDataArray(*TDAllocator);
+  TDArray = reinterpret_cast<ThreadDataArray *>(&ThreadDataArrayStorage);
+
+  atomic_store(&CollectorInitialized, 1, memory_order_release);
 }
 
 XRayBuffer nextBuffer(XRayBuffer B) XRAY_NEVER_INSTRUMENT {
Index: compiler-rt/lib/xray/xray_profiling.cc
===================================================================
--- compiler-rt/lib/xray/xray_profiling.cc
+++ compiler-rt/lib/xray/xray_profiling.cc
@@ -19,6 +19,7 @@
 #include "sanitizer_common/sanitizer_flags.h"
 #include "xray/xray_interface.h"
 #include "xray/xray_log_interface.h"
+#include "xray_buffer_queue.h"
 #include "xray_flags.h"
 #include "xray_profile_collector.h"
 #include "xray_profiling_flags.h"
@@ -46,6 +47,13 @@
 
 static pthread_key_t ProfilingKey;
 
+// We use a global buffer queue, which gets initialized once at initialisation
+// time, and gets reset when profiling is "done".
+static std::aligned_storage<sizeof(BufferQueue), alignof(BufferQueue)>::type
+    BufferQueueStorage;
+static BufferQueue *BQ = nullptr;
+
+thread_local FunctionCallTrie::Allocators::Buffers ThreadBuffers;
 thread_local std::aligned_storage<sizeof(FunctionCallTrie::Allocators)>::type
     AllocatorsStorage;
@@ -81,17 +89,58 @@
   uintptr_t Allocators = 0;
   if (atomic_compare_exchange_strong(&TLD.Allocators, &Allocators, 1,
                                      memory_order_acq_rel)) {
-    new (&AllocatorsStorage)
-        FunctionCallTrie::Allocators(FunctionCallTrie::InitAllocators());
+    bool Success = false;
+    auto AllocatorsUndo = at_scope_exit([&]() XRAY_NEVER_INSTRUMENT {
+      if (!Success)
+        atomic_store(&TLD.Allocators, 0, memory_order_release);
+    });
+
+    // Acquire a set of buffers for this thread.
+    if (BQ == nullptr)
+      return nullptr;
+
+    if (BQ->getBuffer(ThreadBuffers.NodeBuffer) != BufferQueue::ErrorCode::Ok)
+      return nullptr;
+    auto NodeBufferUndo = at_scope_exit([&]() XRAY_NEVER_INSTRUMENT {
+      if (!Success)
+        BQ->releaseBuffer(ThreadBuffers.NodeBuffer);
+    });
+
+    if (BQ->getBuffer(ThreadBuffers.RootsBuffer) != BufferQueue::ErrorCode::Ok)
+      return nullptr;
+    auto RootsBufferUndo = at_scope_exit([&]() XRAY_NEVER_INSTRUMENT {
+      if (!Success)
+        BQ->releaseBuffer(ThreadBuffers.RootsBuffer);
+    });
+
+    if (BQ->getBuffer(ThreadBuffers.ShadowStackBuffer) !=
+        BufferQueue::ErrorCode::Ok)
+      return nullptr;
+    auto ShadowStackBufferUndo = at_scope_exit([&]() XRAY_NEVER_INSTRUMENT {
+      if (!Success)
+        BQ->releaseBuffer(ThreadBuffers.ShadowStackBuffer);
+    });
+
+    if (BQ->getBuffer(ThreadBuffers.NodeIdPairBuffer) !=
+        BufferQueue::ErrorCode::Ok)
+      return nullptr;
+
+    Success = true;
+    new (&AllocatorsStorage) FunctionCallTrie::Allocators(
+        FunctionCallTrie::InitAllocatorsFromBuffers(ThreadBuffers));
     Allocators = reinterpret_cast<uintptr_t>(
         reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorsStorage));
     atomic_store(&TLD.Allocators, Allocators, memory_order_release);
   }
 
+  if (Allocators == 1)
+    return nullptr;
+
   uintptr_t FCT = 0;
   if (atomic_compare_exchange_strong(&TLD.FCT, &FCT, 1,
                                      memory_order_acq_rel)) {
-    new (&FunctionCallTrieStorage) FunctionCallTrie(
-        *reinterpret_cast<FunctionCallTrie::Allocators *>(Allocators));
+    new (&FunctionCallTrieStorage)
+        FunctionCallTrie(*reinterpret_cast<FunctionCallTrie::Allocators *>(
+            atomic_load_relaxed(&TLD.Allocators)));
     FCT = reinterpret_cast<uintptr_t>(
         reinterpret_cast<FunctionCallTrie *>(&FunctionCallTrieStorage));
     atomic_store(&TLD.FCT, FCT, memory_order_release);
@@ -104,10 +153,6 @@
 }
 
 static void cleanupTLD() XRAY_NEVER_INSTRUMENT {
-  RecursionGuard TLDInit(TLDInitGuard);
-  if (!TLDInit)
-    return;
-
   auto FCT = atomic_exchange(&TLD.FCT, 0, memory_order_acq_rel);
   if (FCT == reinterpret_cast<uintptr_t>(reinterpret_cast<FunctionCallTrie *>(
                  &FunctionCallTrieStorage)))
@@ -125,7 +170,7 @@
   if (!TLDInit)
     return;
 
-  uintptr_t P = atomic_load(&T.FCT, memory_order_acquire);
+  uintptr_t P = atomic_exchange(&T.FCT, 0, memory_order_acq_rel);
   if (P != reinterpret_cast<uintptr_t>(
               reinterpret_cast<FunctionCallTrie *>(&FunctionCallTrieStorage)))
     return;
@@ -133,10 +178,21 @@
   auto FCT = reinterpret_cast<FunctionCallTrie *>(P);
   DCHECK_NE(FCT, nullptr);
 
-  if (!FCT->getRoots().empty())
-    profileCollectorService::post(*FCT, GetTid());
+  uintptr_t A = atomic_exchange(&T.Allocators, 0, memory_order_acq_rel);
+  if (A !=
+      reinterpret_cast<uintptr_t>(
+          reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorsStorage)))
+    return;
 
-  cleanupTLD();
+  auto Allocators = reinterpret_cast<FunctionCallTrie::Allocators *>(A);
+  DCHECK_NE(Allocators, nullptr);
+
+  // Always move the data into the profile collector.
+  profileCollectorService::post(BQ, std::move(*FCT), std::move(*Allocators),
+                                std::move(ThreadBuffers), GetTid());
+
+  // Re-initialize the ThreadBuffers object to a known "default" state.
+  ThreadBuffers = FunctionCallTrie::Allocators::Buffers{};
 }
 
 } // namespace
 
@@ -176,8 +232,6 @@
     return XRayLogFlushStatus::XRAY_LOG_FLUSHING;
   }
 
-  postCurrentThreadFCT(TLD);
-
   // At this point, we'll create the file that will contain the profile, but
   // only if the options say so.
   if (!profilingFlags()->no_flush) {
@@ -205,14 +259,11 @@
     }
   }
 
-  // Clean up the current thread's TLD information as well.
-  cleanupTLD();
-
   profileCollectorService::reset();
 
   atomic_store(&ProfilerLogFlushStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHED,
                memory_order_release);
-  atomic_store(&ProfilerLogStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHED,
+  atomic_store(&ProfilerLogStatus, XRayLogInitStatus::XRAY_LOG_UNINITIALIZED,
                memory_order_release);
 
   return XRayLogFlushStatus::XRAY_LOG_FLUSHED;
@@ -272,6 +323,12 @@
     return static_cast<XRayLogInitStatus>(CurrentStatus);
   }
 
+  // Mark then finalize the current generation of buffers. This allows us to let
+  // the threads currently holding onto new buffers still use them, but let the
+  // last reference do the memory cleanup.
+  DCHECK_NE(BQ, nullptr);
+  BQ->finalize();
+
   // Wait a grace period to allow threads to see that we're finalizing.
   SleepForMillis(profilingFlags()->grace_period_ms);
 
@@ -293,8 +350,8 @@
 }
 
 XRayLogInitStatus
-profilingLoggingInit(UNUSED size_t BufferSize, UNUSED size_t BufferMax,
-                     void *Options, size_t OptionsSize) XRAY_NEVER_INSTRUMENT {
+profilingLoggingInit(size_t, size_t, void *Options,
+                     size_t OptionsSize) XRAY_NEVER_INSTRUMENT {
   RecursionGuard G(ReentranceGuard);
   if (!G)
     return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
@@ -302,7 +359,7 @@
   s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
   if (!atomic_compare_exchange_strong(&ProfilerLogStatus, &CurrentStatus,
                                       XRayLogInitStatus::XRAY_LOG_INITIALIZING,
-                                      memory_order_release)) {
+                                      memory_order_acq_rel)) {
     if (Verbosity())
       Report("Cannot initialize already initialised profiling "
             "implementation.\n");
@@ -331,6 +388,41 @@
   // We need to reset the profile data collection implementation now.
   profileCollectorService::reset();
 
+  // Then also reset the buffer queue implementation.
+  if (BQ == nullptr) {
+    bool Success = false;
+    new (&BufferQueueStorage)
+        BufferQueue(profilingFlags()->per_thread_allocator_max,
+                    profilingFlags()->buffers_max, Success);
+    if (!Success) {
+      if (Verbosity())
+        Report("Failed to initialize preallocated memory buffers!");
+      atomic_store(&ProfilerLogStatus,
+                   XRayLogInitStatus::XRAY_LOG_UNINITIALIZED,
+                   memory_order_release);
+      return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+    }
+
+    // If we've succeeded, set the global pointer to the initialised storage.
+    BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage);
+  } else {
+    BQ->finalize();
+    auto InitStatus = BQ->init(profilingFlags()->per_thread_allocator_max,
+                               profilingFlags()->buffers_max);
+
+    if (InitStatus != BufferQueue::ErrorCode::Ok) {
+      if (Verbosity())
+        Report("Failed to initialize preallocated memory buffers; error: %s",
+               BufferQueue::getErrorString(InitStatus));
+      atomic_store(&ProfilerLogStatus,
+                   XRayLogInitStatus::XRAY_LOG_UNINITIALIZED,
+                   memory_order_release);
+      return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+    }
+
+    DCHECK(!BQ->finalizing());
+  }
+
   // We need to set up the exit handlers.
   static pthread_once_t Once = PTHREAD_ONCE_INIT;
   pthread_once(
Index: compiler-rt/lib/xray/xray_profiling_flags.inc
===================================================================
--- compiler-rt/lib/xray/xray_profiling_flags.inc
+++ compiler-rt/lib/xray/xray_profiling_flags.inc
@@ -14,7 +14,7 @@
 #error "Define XRAY_FLAG prior to including this file!"
 #endif
 
-XRAY_FLAG(uptr, per_thread_allocator_max, 2 << 20,
+XRAY_FLAG(uptr, per_thread_allocator_max, 16384,
           "Maximum size of any single per-thread allocator.")
 XRAY_FLAG(uptr, global_allocator_max, 2 << 24,
           "Maximum size of the global allocator for profile storage.")
@@ -27,3 +27,6 @@
 XRAY_FLAG(bool, no_flush, false,
           "Set to true if we want the profiling implementation to not write "
           "out files.")
+XRAY_FLAG(int, buffers_max, 128,
+          "The number of buffers to pre-allocate used by the profiling "
+          "implementation.")
Index: compiler-rt/lib/xray/xray_segmented_array.h
===================================================================
--- compiler-rt/lib/xray/xray_segmented_array.h
+++ compiler-rt/lib/xray/xray_segmented_array.h
@@ -372,7 +372,7 @@
     auto Base = &Tail->Data;
     auto AlignedOffset = Base + (Offset * AlignedElementStorageSize);
     DCHECK_LE(AlignedOffset + sizeof(T),
-              reinterpret_cast<unsigned char *>(Tail) + SegmentSize);
+              reinterpret_cast<unsigned char *>(Base) + SegmentSize);
 
     // In-place construct at Position.
     new (AlignedOffset) T{std::forward<Args>(args)...};
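[Aside, not part of the patch: taken together, the new interfaces imply the per-thread flow sketched below, modeled on the updated profile_collector_test.cc. The function name exampleThreadLifecycle is hypothetical, the checks on BufferQueue::ErrorCode results are elided for brevity, and a literal thread id stands in for GetTid().]

#include "xray_buffer_queue.h"
#include "xray_function_call_trie.h"
#include "xray_profile_collector.h"
#include <utility>

namespace __xray {

void exampleThreadLifecycle(BufferQueue &BQ) {
  // Draw one buffer per allocator from the preallocated queue.
  FunctionCallTrie::Allocators::Buffers Buffers;
  BQ.getBuffer(Buffers.NodeBuffer);
  BQ.getBuffer(Buffers.RootsBuffer);
  BQ.getBuffer(Buffers.ShadowStackBuffer);
  BQ.getBuffer(Buffers.NodeIdPairBuffer);

  // The allocators and the trie now draw from those buffers instead of
  // mapping their own backing store.
  auto Allocators = FunctionCallTrie::InitAllocatorsFromBuffers(Buffers);
  FunctionCallTrie T(Allocators);

  T.enterFunction(1, 1, 0);
  T.exitFunction(1, 2, 0);

  // Hand everything to the collector by move; the collector releases the
  // buffers back to the queue when it is reset.
  profileCollectorService::post(&BQ, std::move(T), std::move(Allocators),
                                std::move(Buffers), /*TId=*/1);
}

} // namespace __xray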