Index: compiler-rt/lib/xray/xray_profile_collector.cc =================================================================== --- compiler-rt/lib/xray/xray_profile_collector.cc +++ compiler-rt/lib/xray/xray_profile_collector.cc @@ -30,13 +30,11 @@ tid_t TId; FunctionCallTrie *Trie; }; -Vector<ThreadTrie> ThreadTries; struct ProfileBuffer { void *Data; size_t Size; }; -Vector<ProfileBuffer> ProfileBuffers; struct BlockHeader { u32 BlockSize; @@ -44,6 +42,10 @@ u64 ThreadId; }; +// These need to be pointers that point to heap/internal-allocator-allocated +// objects because these are accessed even at program exit. +Vector<ThreadTrie> *ThreadTries = nullptr; +Vector<ProfileBuffer> *ProfileBuffers = nullptr; FunctionCallTrie::Allocators *GlobalAllocators = nullptr; } // namespace @@ -57,8 +59,16 @@ new (GlobalAllocators) FunctionCallTrie::Allocators(); *GlobalAllocators = FunctionCallTrie::InitAllocatorsCustom( profilingFlags()->global_allocator_max); + ThreadTries = reinterpret_cast<Vector<ThreadTrie> *>( + InternalAlloc(sizeof(Vector<ThreadTrie>))); + new (ThreadTries) Vector<ThreadTrie>(); + ProfileBuffers = reinterpret_cast<Vector<ProfileBuffer> *>( + InternalAlloc(sizeof(Vector<ProfileBuffer>))); + new (ProfileBuffers) Vector<ProfileBuffer>(); }); DCHECK_NE(GlobalAllocators, nullptr); + DCHECK_NE(ThreadTries, nullptr); + DCHECK_NE(ProfileBuffers, nullptr); ThreadTrie *Item = nullptr; { @@ -66,7 +76,7 @@ if (GlobalAllocators == nullptr) return; - Item = ThreadTries.PushBack(); + Item = ThreadTries->PushBack(); Item->TId = TId; // Here we're using the internal allocator instead of the managed allocator @@ -188,15 +198,15 @@ SpinMutexLock Lock(&GlobalMutex); // Clear out the global ProfileBuffers. - for (uptr I = 0; I < ProfileBuffers.Size(); ++I) - InternalFree(ProfileBuffers[I].Data); - ProfileBuffers.Reset(); + for (uptr I = 0; I < ProfileBuffers->Size(); ++I) + InternalFree((*ProfileBuffers)[I].Data); + ProfileBuffers->Reset(); - if (ThreadTries.Size() == 0) + if (ThreadTries->Size() == 0) return; // Then repopulate the global ProfileBuffers.
- for (u32 I = 0; I < ThreadTries.Size(); ++I) { + for (u32 I = 0; I < ThreadTries->Size(); ++I) { using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType; ProfileRecordAllocator PRAlloc(profilingFlags()->global_allocator_max, 0); ProfileRecord::PathAllocator PathAlloc( @@ -207,7 +217,7 @@ // use a local allocator and an __xray::Array<...> to store the intermediary // data, then compute the size as we're going along. Then we'll allocate the // contiguous space to contain the thread buffer data. - const auto &Trie = *ThreadTries[I].Trie; + const auto &Trie = *(*ThreadTries)[I].Trie; if (Trie.getRoots().empty()) continue; populateRecords(ProfileRecords, PathAlloc, Trie); @@ -227,8 +237,8 @@ for (const auto &Record : ProfileRecords) CumulativeSizes += 20 + (4 * Record.Path->size()); - BlockHeader Header{16 + CumulativeSizes, I, ThreadTries[I].TId}; - auto Buffer = ProfileBuffers.PushBack(); + BlockHeader Header{16 + CumulativeSizes, I, (*ThreadTries)[I].TId}; + auto Buffer = ProfileBuffers->PushBack(); Buffer->Size = sizeof(Header) + CumulativeSizes; Buffer->Data = InternalAlloc(Buffer->Size, nullptr, 64); DCHECK_NE(Buffer->Data, nullptr); @@ -244,18 +254,26 @@ void reset() { SpinMutexLock Lock(&GlobalMutex); - // Clear out the profile buffers that have been serialized. - for (uptr I = 0; I < ProfileBuffers.Size(); ++I) - InternalFree(ProfileBuffers[I].Data); - ProfileBuffers.Reset(); - - // Clear out the function call tries per thread. - for (uptr I = 0; I < ThreadTries.Size(); ++I) { - auto &T = ThreadTries[I]; - T.Trie->~FunctionCallTrie(); - InternalFree(T.Trie); + if (ProfileBuffers != nullptr) { + // Clear out the profile buffers that have been serialized. + for (uptr I = 0; I < ProfileBuffers->Size(); ++I) + InternalFree((*ProfileBuffers)[I].Data); + ProfileBuffers->Reset(); + InternalFree(ProfileBuffers); + ProfileBuffers = nullptr; + } + + if (ThreadTries != nullptr) { + // Clear out the function call tries per thread. 
+ for (uptr I = 0; I < ThreadTries->Size(); ++I) { + auto &T = (*ThreadTries)[I]; + T.Trie->~FunctionCallTrie(); + InternalFree(T.Trie); + } + ThreadTries->Reset(); + InternalFree(ThreadTries); + ThreadTries = nullptr; } - ThreadTries.Reset(); // Reset the global allocators. if (GlobalAllocators != nullptr) { @@ -267,18 +285,29 @@ InternalAlloc(sizeof(FunctionCallTrie::Allocators))); new (GlobalAllocators) FunctionCallTrie::Allocators(); *GlobalAllocators = FunctionCallTrie::InitAllocators(); + ThreadTries = reinterpret_cast<Vector<ThreadTrie> *>( + InternalAlloc(sizeof(Vector<ThreadTrie>))); + new (ThreadTries) Vector<ThreadTrie>(); + ProfileBuffers = reinterpret_cast<Vector<ProfileBuffer> *>( + InternalAlloc(sizeof(Vector<ProfileBuffer>))); + new (ProfileBuffers) Vector<ProfileBuffer>(); } XRayBuffer nextBuffer(XRayBuffer B) { SpinMutexLock Lock(&GlobalMutex); - if (B.Data == nullptr && ProfileBuffers.Size()) - return {ProfileBuffers[0].Data, ProfileBuffers[0].Size}; + + if (ProfileBuffers == nullptr || ProfileBuffers->Size() == 0) + return {nullptr, 0}; + + if (B.Data == nullptr) + return {(*ProfileBuffers)[0].Data, (*ProfileBuffers)[0].Size}; BlockHeader Header; internal_memcpy(&Header, B.Data, sizeof(BlockHeader)); auto NextBlock = Header.BlockNum + 1; - if (NextBlock < ProfileBuffers.Size()) - return {ProfileBuffers[NextBlock].Data, ProfileBuffers[NextBlock].Size}; + if (NextBlock < ProfileBuffers->Size()) + return {(*ProfileBuffers)[NextBlock].Data, + (*ProfileBuffers)[NextBlock].Size}; return {nullptr, 0}; } Index: compiler-rt/lib/xray/xray_profiling.cc =================================================================== --- compiler-rt/lib/xray/xray_profiling.cc +++ compiler-rt/lib/xray/xray_profiling.cc @@ -277,7 +277,7 @@ // We need to reset the profile data collection implementation now. profileCollectorService::reset(); - // We need to set up the at-thread-exit handler. + // We need to set up the exit handlers.
static pthread_once_t Once = PTHREAD_ONCE_INIT; pthread_once(&Once, +[] { pthread_key_create(&ProfilingKey, +[](void *P) { @@ -288,6 +288,19 @@ postCurrentThreadFCT(TLD); }); + + // We also need to set up an exit handler, so that we can get the profile + // information at exit time. We use the C API to do this, to not rely on C++ + // ABI functions for registering exit handlers. + Atexit(+[] { + // Finalize and flush. + if (profilingFinalize() != XRAY_LOG_FINALIZED) + return; + if (profilingFlush() != XRAY_LOG_FLUSHED) + return; + if (Verbosity()) + Report("XRay Profile flushed at exit."); + }); }); __xray_log_set_buffer_iterator(profileCollectorService::nextBuffer); @@ -321,13 +334,16 @@ profilingFlush, }; auto RegistrationResult = __xray_log_register_mode("xray-profiling", Impl); - if (RegistrationResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK && - Verbosity()) - Report("Cannot register XRay Profiling mode to 'xray-profiling'; error = " - "%d\n", - RegistrationResult); + if (RegistrationResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK) { + if (Verbosity()) + Report("Cannot register XRay Profiling mode to 'xray-profiling'; error = " + "%d\n", + RegistrationResult); + return false; + } + if (!internal_strcmp(flags()->xray_mode, "xray-profiling")) - __xray_set_log_impl(Impl); + __xray_log_select_mode("xray-profiling"); return true; } Index: compiler-rt/lib/xray/xray_profiling_flags.inc =================================================================== --- compiler-rt/lib/xray/xray_profiling_flags.inc +++ compiler-rt/lib/xray/xray_profiling_flags.inc @@ -20,7 +20,7 @@ "Maximum size of the global allocator for profile storage.") XRAY_FLAG(uptr, stack_allocator_max, 2 << 24, "Maximum size of the traversal stack allocator.") -XRAY_FLAG(int, grace_period_ms, 100, +XRAY_FLAG(int, grace_period_ms, 1, "Profile collection will wait this much time in milliseconds before " "resetting the global state.
This gives a chance to threads to " "notice that the profiler has been finalized and clean up.") Index: compiler-rt/test/xray/TestCases/Posix/profiling-multi-threaded.cc =================================================================== --- compiler-rt/test/xray/TestCases/Posix/profiling-multi-threaded.cc +++ compiler-rt/test/xray/TestCases/Posix/profiling-multi-threaded.cc @@ -8,7 +8,7 @@ // RUN: XRAY_PROFILING_OPTIONS=no_flush=1 %run %t // RUN: XRAY_OPTIONS=verbosity=1 %run %t // RUN: PROFILES=`ls xray-log.profiling-multi-* | wc -l` -// RUN: [ $PROFILES -eq 1 ] +// RUN: [ $PROFILES -ge 1 ] // RUN: rm -f xray-log.profiling-multi-* // // REQUIRES: x86_64-target-arch Index: compiler-rt/test/xray/TestCases/Posix/profiling-single-threaded.cc =================================================================== --- compiler-rt/test/xray/TestCases/Posix/profiling-single-threaded.cc +++ compiler-rt/test/xray/TestCases/Posix/profiling-single-threaded.cc @@ -8,7 +8,7 @@ // RUN: XRAY_PROFILING_OPTIONS=no_flush=true %run %t // RUN: XRAY_OPTIONS=verbosity=1 %run %t // RUN: PROFILES=`ls xray-log.profiling-single-* | wc -l` -// RUN: [ $PROFILES -eq 2 ] +// RUN: [ $PROFILES -ge 2 ] // RUN: rm -f xray-log.profiling-single-* // // REQUIRES: x86_64-target-arch