Index: compiler-rt/trunk/lib/xray/xray_profile_collector.cc
===================================================================
--- compiler-rt/trunk/lib/xray/xray_profile_collector.cc
+++ compiler-rt/trunk/lib/xray/xray_profile_collector.cc
@@ -13,10 +13,10 @@
 //
 //===----------------------------------------------------------------------===//
 #include "xray_profile_collector.h"
-#include "sanitizer_common/sanitizer_allocator_internal.h"
 #include "sanitizer_common/sanitizer_common.h"
-#include "sanitizer_common/sanitizer_vector.h"
+#include "xray_allocator.h"
 #include "xray_profiling_flags.h"
+#include "xray_segmented_array.h"
 #include <memory>
 #include <pthread.h>
 #include <utility>
@@ -29,7 +29,7 @@
 SpinMutex GlobalMutex;
 struct ThreadTrie {
   tid_t TId;
-  FunctionCallTrie *Trie;
+  typename std::aligned_storage<sizeof(FunctionCallTrie)>::type TrieStorage;
 };
 
 struct ProfileBuffer {
@@ -56,65 +56,68 @@
   u64 ThreadId;
 };
 
-// These need to be pointers that point to heap/internal-allocator-allocated
-// objects because these are accessed even at program exit.
-Vector<ThreadTrie> *ThreadTries = nullptr;
-Vector<ProfileBuffer> *ProfileBuffers = nullptr;
-FunctionCallTrie::Allocators *GlobalAllocators = nullptr;
+using ThreadTriesArray = Array<ThreadTrie>;
+using ProfileBufferArray = Array<ProfileBuffer>;
+using ThreadTriesArrayAllocator = typename ThreadTriesArray::AllocatorType;
+using ProfileBufferArrayAllocator = typename ProfileBufferArray::AllocatorType;
+
+// These need to be global aligned storage to avoid dynamic initialization. We
+// need these to be aligned to allow us to placement new objects into the
+// storage, and have pointers to those objects be appropriately aligned.
+static typename std::aligned_storage<sizeof(FunctionCallTrie::Allocators)>::type
+    AllocatorStorage;
+static typename std::aligned_storage<sizeof(ThreadTriesArray)>::type
+    ThreadTriesStorage;
+static typename std::aligned_storage<sizeof(ProfileBufferArray)>::type
+    ProfileBuffersStorage;
+static typename std::aligned_storage<sizeof(ThreadTriesArrayAllocator)>::type
+    ThreadTriesArrayAllocatorStorage;
+static typename std::aligned_storage<sizeof(ProfileBufferArrayAllocator)>::type
+    ProfileBufferArrayAllocatorStorage;
+
+static ThreadTriesArray *ThreadTries = nullptr;
+static ThreadTriesArrayAllocator *ThreadTriesAllocator = nullptr;
+static ProfileBufferArray *ProfileBuffers = nullptr;
+static ProfileBufferArrayAllocator *ProfileBuffersAllocator = nullptr;
+static FunctionCallTrie::Allocators *GlobalAllocators = nullptr;
+
+static void *allocateBuffer(size_t S) {
+  auto B = reinterpret_cast<void *>(internal_mmap(
+      NULL, S, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
+  if (B == MAP_FAILED) {
+    if (Verbosity())
+      Report("XRay Profiling: Failed to allocate memory of size %d.\n", S);
+    return nullptr;
+  }
+  return B;
+}
+
+static void deallocateBuffer(void *B, size_t S) {
+  if (B == nullptr)
+    return;
+  internal_munmap(B, S);
+}
 
 } // namespace
 
 void post(const FunctionCallTrie &T, tid_t TId) {
   static pthread_once_t Once = PTHREAD_ONCE_INIT;
-  pthread_once(&Once, +[] {
-    SpinMutexLock Lock(&GlobalMutex);
-    GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>(
-        InternalAlloc(sizeof(FunctionCallTrie::Allocators)));
-    new (GlobalAllocators) FunctionCallTrie::Allocators();
-    *GlobalAllocators = FunctionCallTrie::InitAllocatorsCustom(
-        profilingFlags()->global_allocator_max);
-    ThreadTries = reinterpret_cast<Vector<ThreadTrie> *>(
-        InternalAlloc(sizeof(Vector<ThreadTrie>)));
-    new (ThreadTries) Vector<ThreadTrie>();
-    ProfileBuffers = reinterpret_cast<Vector<ProfileBuffer> *>(
-        InternalAlloc(sizeof(Vector<ProfileBuffer>)));
-    new (ProfileBuffers) Vector<ProfileBuffer>();
-  });
-  DCHECK_NE(GlobalAllocators, nullptr);
-  DCHECK_NE(ThreadTries, nullptr);
-  DCHECK_NE(ProfileBuffers, nullptr);
+  pthread_once(&Once, +[] { reset(); });
 
   ThreadTrie *Item = nullptr;
   {
     SpinMutexLock Lock(&GlobalMutex);
-    if (GlobalAllocators == nullptr)
+    if (GlobalAllocators == nullptr || ThreadTries == nullptr)
       return;
 
-    Item = ThreadTries->PushBack();
+    Item = ThreadTries->Append({});
     Item->TId = TId;
-
-    // Here we're using the internal allocator instead of the managed allocator
-    // because:
-    //
-    // 1) We're not using the segmented array data structure to host
-    //    FunctionCallTrie objects. We're using a Vector (from sanitizer_common)
-    //    which works like a std::vector<...> keeping elements contiguous in
-    //    memory. The segmented array data structure assumes that elements are
-    //    trivially destructible, where FunctionCallTrie isn't.
-    //
-    // 2) Using a managed allocator means we need to manage that separately,
-    //    which complicates the nature of this code. To get around that, we're
-    //    using the internal allocator instead, which has its own global state
-    //    and is decoupled from the lifetime management required by the managed
-    //    allocator we have in XRay.
-    //
-    Item->Trie = reinterpret_cast<FunctionCallTrie *>(InternalAlloc(
-        sizeof(FunctionCallTrie), nullptr, alignof(FunctionCallTrie)));
-    DCHECK_NE(Item->Trie, nullptr);
-    new (Item->Trie) FunctionCallTrie(*GlobalAllocators);
+    auto Trie = reinterpret_cast<FunctionCallTrie *>(&Item->TrieStorage);
+    new (Trie) FunctionCallTrie(*GlobalAllocators);
   }
 
-  T.deepCopyInto(*Item->Trie);
+  auto Trie = reinterpret_cast<FunctionCallTrie *>(&Item->TrieStorage);
+  T.deepCopyInto(*Trie);
 }
 
 // A PathArray represents the function id's representing a stack trace. In this
@@ -127,18 +130,12 @@
 
   // The Path in this record is the function id's from the leaf to the root of
   // the function call stack as represented from a FunctionCallTrie.
-  PathArray *Path = nullptr;
+  PathArray Path;
   const FunctionCallTrie::Node *Node = nullptr;
 
   // Constructor for in-place construction.
   ProfileRecord(PathAllocator &A, const FunctionCallTrie::Node *N)
-      : Path([&] {
-          auto P =
-              reinterpret_cast<PathArray *>(InternalAlloc(sizeof(PathArray)));
-          new (P) PathArray(A);
-          return P;
-        }()),
-        Node(N) {}
+      : Path(A), Node(N) {}
 };
 
 namespace {
@@ -167,8 +164,8 @@
       // Traverse the Node's parents and as we're doing so, get the FIds in
      // the order they appear.
       for (auto N = Node; N != nullptr; N = N->Parent)
-        Record->Path->Append(N->FId);
-      DCHECK(!Record->Path->empty());
+        Record->Path.Append(N->FId);
+      DCHECK(!Record->Path.empty());
 
       for (const auto C : Node->Callees)
         DFSStack.Append(C.NodePtr);
@@ -183,7 +180,7 @@
                  sizeof(Header);
   for (const auto &Record : ProfileRecords) {
     // List of IDs follow:
-    for (const auto FId : *Record.Path)
+    for (const auto FId : Record.Path)
       NextPtr =
           static_cast<char *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) +
           sizeof(FId);
@@ -213,16 +210,21 @@
 
 void serialize() {
   SpinMutexLock Lock(&GlobalMutex);
-  // Clear out the global ProfileBuffers.
-  for (uptr I = 0; I < ProfileBuffers->Size(); ++I)
-    InternalFree((*ProfileBuffers)[I].Data);
-  ProfileBuffers->Reset();
+  if (GlobalAllocators == nullptr || ThreadTries == nullptr ||
+      ProfileBuffers == nullptr)
+    return;
+
+  // Clear out the global ProfileBuffers, if it's not empty.
+  for (auto &B : *ProfileBuffers)
+    deallocateBuffer(B.Data, B.Size);
+  ProfileBuffers->trim(ProfileBuffers->size());
-  if (ThreadTries->Size() == 0)
+  if (ThreadTries->empty())
     return;
 
   // Then repopulate the global ProfileBuffers.
-  for (u32 I = 0; I < ThreadTries->Size(); ++I) {
+  u32 I = 0;
+  for (const auto &ThreadTrie : *ThreadTries) {
     using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
     ProfileRecordAllocator PRAlloc(profilingFlags()->global_allocator_max);
     ProfileRecord::PathAllocator PathAlloc(
         profilingFlags()->global_allocator_max);
@@ -233,9 +235,11 @@
     // use a local allocator and an __xray::Array<...> to store the intermediary
     // data, then compute the size as we're going along. Then we'll allocate the
     // contiguous space to contain the thread buffer data.
-    const auto &Trie = *(*ThreadTries)[I].Trie;
+    const auto &Trie =
+        *reinterpret_cast<const FunctionCallTrie *>(&(ThreadTrie.TrieStorage));
     if (Trie.getRoots().empty())
       continue;
+
     populateRecords(ProfileRecords, PathAlloc, Trie);
     DCHECK(!Trie.getRoots().empty());
     DCHECK(!ProfileRecords.empty());
@@ -251,68 +255,71 @@
     //   + end of record (8 bytes)
     u32 CumulativeSizes = 0;
     for (const auto &Record : ProfileRecords)
-      CumulativeSizes += 20 + (4 * Record.Path->size());
+      CumulativeSizes += 20 + (4 * Record.Path.size());
 
-    BlockHeader Header{16 + CumulativeSizes, I, (*ThreadTries)[I].TId};
-    auto Buffer = ProfileBuffers->PushBack();
+    BlockHeader Header{16 + CumulativeSizes, I++, ThreadTrie.TId};
+    auto Buffer = ProfileBuffers->Append({});
     Buffer->Size = sizeof(Header) + CumulativeSizes;
-    Buffer->Data = InternalAlloc(Buffer->Size, nullptr, 64);
+    Buffer->Data = allocateBuffer(Buffer->Size);
     DCHECK_NE(Buffer->Data, nullptr);
     serializeRecords(Buffer, Header, ProfileRecords);
-
-    // Now clean up the ProfileRecords array, one at a time.
-    for (auto &Record : ProfileRecords) {
-      Record.Path->~PathArray();
-      InternalFree(Record.Path);
-    }
   }
 }
 
 void reset() {
   SpinMutexLock Lock(&GlobalMutex);
+
   if (ProfileBuffers != nullptr) {
     // Clear out the profile buffers that have been serialized.
-    for (uptr I = 0; I < ProfileBuffers->Size(); ++I)
-      InternalFree((*ProfileBuffers)[I].Data);
-    ProfileBuffers->Reset();
-    InternalFree(ProfileBuffers);
-    ProfileBuffers = nullptr;
+    for (auto &B : *ProfileBuffers)
+      deallocateBuffer(B.Data, B.Size);
+    ProfileBuffers->trim(ProfileBuffers->size());
   }
 
   if (ThreadTries != nullptr) {
     // Clear out the function call tries per thread.
-    for (uptr I = 0; I < ThreadTries->Size(); ++I) {
-      auto &T = (*ThreadTries)[I];
-      T.Trie->~FunctionCallTrie();
-      InternalFree(T.Trie);
+    for (auto &T : *ThreadTries) {
+      auto Trie = reinterpret_cast<FunctionCallTrie *>(&T.TrieStorage);
+      Trie->~FunctionCallTrie();
     }
-    ThreadTries->Reset();
-    InternalFree(ThreadTries);
-    ThreadTries = nullptr;
+    ThreadTries->trim(ThreadTries->size());
  }
 
   // Reset the global allocators.
-  if (GlobalAllocators != nullptr) {
+  if (GlobalAllocators != nullptr)
     GlobalAllocators->~Allocators();
-    InternalFree(GlobalAllocators);
-    GlobalAllocators = nullptr;
-  }
-  GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>(
-      InternalAlloc(sizeof(FunctionCallTrie::Allocators)));
+
+  GlobalAllocators =
+      reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorStorage);
   new (GlobalAllocators) FunctionCallTrie::Allocators();
   *GlobalAllocators = FunctionCallTrie::InitAllocators();
-  ThreadTries = reinterpret_cast<Vector<ThreadTrie> *>(
-      InternalAlloc(sizeof(Vector<ThreadTrie>)));
-  new (ThreadTries) Vector<ThreadTrie>();
-  ProfileBuffers = reinterpret_cast<Vector<ProfileBuffer> *>(
-      InternalAlloc(sizeof(Vector<ProfileBuffer>)));
-  new (ProfileBuffers) Vector<ProfileBuffer>();
+
+  if (ThreadTriesAllocator != nullptr)
+    ThreadTriesAllocator->~ThreadTriesArrayAllocator();
+
+  ThreadTriesAllocator = reinterpret_cast<ThreadTriesArrayAllocator *>(
+      &ThreadTriesArrayAllocatorStorage);
+  new (ThreadTriesAllocator)
+      ThreadTriesArrayAllocator(profilingFlags()->global_allocator_max);
+  ThreadTries = reinterpret_cast<ThreadTriesArray *>(&ThreadTriesStorage);
+  new (ThreadTries) ThreadTriesArray(*ThreadTriesAllocator);
+
+  if (ProfileBuffersAllocator != nullptr)
+    ProfileBuffersAllocator->~ProfileBufferArrayAllocator();
+
+  ProfileBuffersAllocator = reinterpret_cast<ProfileBufferArrayAllocator *>(
+      &ProfileBufferArrayAllocatorStorage);
+  new (ProfileBuffersAllocator)
+      ProfileBufferArrayAllocator(profilingFlags()->global_allocator_max);
+  ProfileBuffers =
+      reinterpret_cast<ProfileBufferArray *>(&ProfileBuffersStorage);
+  new (ProfileBuffers) ProfileBufferArray(*ProfileBuffersAllocator);
 }
 
 XRayBuffer nextBuffer(XRayBuffer B) {
   SpinMutexLock Lock(&GlobalMutex);
 
-  if (ProfileBuffers == nullptr || ProfileBuffers->Size() == 0)
+  if (ProfileBuffers == nullptr || ProfileBuffers->size() == 0)
     return {nullptr, 0};
 
   static pthread_once_t Once = PTHREAD_ONCE_INIT;
@@ -336,7 +343,7 @@
   BlockHeader Header;
   internal_memcpy(&Header, B.Data, sizeof(BlockHeader));
   auto NextBlock = Header.BlockNum + 1;
-  if (NextBlock < ProfileBuffers->Size())
+  if (NextBlock < ProfileBuffers->size())
    return {(*ProfileBuffers)[NextBlock].Data,
             (*ProfileBuffers)[NextBlock].Size};
   return {nullptr, 0};
Index: compiler-rt/trunk/lib/xray/xray_segmented_array.h
===================================================================
--- compiler-rt/trunk/lib/xray/xray_segmented_array.h
+++ compiler-rt/trunk/lib/xray/xray_segmented_array.h
@@ -325,6 +325,9 @@
   /// Remove N Elements from the end. This leaves the blocks behind, and not
   /// require allocation of new blocks for new elements added after trimming.
   void trim(size_t Elements) {
+    if (Elements == 0)
+      return;
+
     DCHECK_LE(Elements, Size);
     DCHECK_GT(Size, 0);
     auto OldSize = Size;
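
The core pattern in this patch is to replace InternalAlloc-backed globals with global std::aligned_storage blocks plus placement new, so no dynamic initializers run at startup and the objects remain usable (and explicitly destroyable) at program exit. The following is a minimal standalone C++ sketch of that pattern; the Widget type and resetWidget() function are hypothetical stand-ins, not part of the XRay code.

#include <new>
#include <type_traits>

struct Widget {
  int Count = 0;
  explicit Widget(int C) : Count(C) {}
};

// Global, zero-initialized storage: no dynamic initializer runs at startup,
// and the object lives until we explicitly destroy it.
static typename std::aligned_storage<sizeof(Widget), alignof(Widget)>::type
    WidgetStorage;
static Widget *GlobalWidget = nullptr;

void resetWidget(int C) {
  // Destroy any previous instance, then construct in-place into the storage.
  if (GlobalWidget != nullptr)
    GlobalWidget->~Widget();
  GlobalWidget = reinterpret_cast<Widget *>(&WidgetStorage);
  new (GlobalWidget) Widget(C);
}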
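
allocateBuffer() and deallocateBuffer() in the patch route buffer memory through the sanitizer-internal internal_mmap/internal_munmap wrappers instead of InternalAlloc/InternalFree. The sketch below shows the same idea with plain POSIX mmap/munmap; the function names here are illustrative only.

#include <cstddef>
#include <cstdio>
#include <sys/mman.h>

static void *allocateBufferSketch(std::size_t S) {
  // Anonymous, private mapping: readable and writable, not backed by a file.
  void *B = mmap(nullptr, S, PROT_READ | PROT_WRITE,
                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (B == MAP_FAILED) {
    std::fprintf(stderr, "Failed to allocate %zu bytes.\n", S);
    return nullptr;
  }
  return B;
}

static void deallocateBufferSketch(void *B, std::size_t S) {
  if (B == nullptr)
    return;
  munmap(B, S); // the length must match the original mapping size
}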
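
serialize() sizes each per-thread buffer up front: a 16-byte block header plus, for every record, 20 fixed bytes and 4 bytes per function id on the path, per the loop `CumulativeSizes += 20 + (4 * Record.Path.size())` and the final `16 + CumulativeSizes`. A small sketch of that computation, using a hypothetical Record type in place of the patch's ProfileRecord:

#include <cstdint>
#include <vector>

struct Record {
  std::vector<std::int32_t> Path; // function ids, leaf to root
};

static std::uint32_t profileBlockSize(const std::vector<Record> &Records) {
  std::uint32_t CumulativeSizes = 0;
  for (const auto &R : Records)
    // 20 fixed bytes per record plus 4 bytes per function id on the path.
    CumulativeSizes += 20 + 4 * static_cast<std::uint32_t>(R.Path.size());
  return 16 + CumulativeSizes; // 16-byte block header + serialized records
}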
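
Rather than freeing the containers, the patch empties them with trim(size()), which drops the elements while leaving the already-allocated segments in place for reuse; the guard added to xray_segmented_array.h makes trim(0) a no-op instead of tripping DCHECK_GT(Size, 0) on an empty array. A toy container (not the real __xray::Array) illustrating that contract:

#include <cassert>
#include <cstddef>
#include <vector>

template <typename T> class ToyArray {
  std::vector<T> Data; // stand-in for the real segmented blocks

public:
  void Append(const T &V) { Data.push_back(V); }
  std::size_t size() const { return Data.size(); }

  void trim(std::size_t Elements) {
    if (Elements == 0) // mirrors the guard added in xray_segmented_array.h
      return;
    assert(Elements <= Data.size());
    Data.resize(Data.size() - Elements); // capacity (storage) is retained
  }
};

int main() {
  ToyArray<int> A;
  A.trim(A.size()); // safe on an empty array thanks to the guard
  A.Append(42);
  A.trim(A.size()); // removes the element, keeps the capacity
  assert(A.size() == 0);
  return 0;
}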