Index: compiler-rt/lib/xray/CMakeLists.txt
===================================================================
--- compiler-rt/lib/xray/CMakeLists.txt
+++ compiler-rt/lib/xray/CMakeLists.txt
@@ -17,6 +17,7 @@
   xray_inmemory_log.cc)
 
 set(XRAY_PROFILER_MODE_SOURCES
+  xray_profile_collector.cc
   xray_profiler_flags.cc)
 
 # Implementation files for all XRay architectures.
Index: compiler-rt/lib/xray/tests/unit/CMakeLists.txt
===================================================================
--- compiler-rt/lib/xray/tests/unit/CMakeLists.txt
+++ compiler-rt/lib/xray/tests/unit/CMakeLists.txt
@@ -8,3 +8,5 @@
   segmented_array_test.cc xray_unit_test_main.cc)
 add_xray_unittest(XRayFunctionCallTrieTest SOURCES
   function_call_trie_test.cc xray_unit_test_main.cc)
+add_xray_unittest(XRayProfileCollectorTest SOURCES
+  profile_collector_test.cc xray_unit_test_main.cc)
Index: compiler-rt/lib/xray/tests/unit/profile_collector_test.cc
===================================================================
--- /dev/null
+++ compiler-rt/lib/xray/tests/unit/profile_collector_test.cc
@@ -0,0 +1,111 @@
+//===-- profile_collector_test.cc -----------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a function call tracing system.
+//
+//===----------------------------------------------------------------------===//
+#include "gtest/gtest.h"
+
+#include "xray_profile_collector.h"
+#include "xray_profiler_flags.h"
+#include <cstring>
+#include <thread>
+
+namespace __xray {
+namespace {
+
+static constexpr auto kHeaderSize = 16u;
+
+void ValidateBlock(XRayBuffer B) {
+  profilerFlags()->setDefaults();
+  ASSERT_NE(static_cast<const void *>(B.Data), nullptr);
+  ASSERT_NE(B.Size, 0u);
+  ASSERT_GE(B.Size, kHeaderSize);
+  // We look at the block size, the block number, and the thread ID to ensure
+  // that none of them are zero (and that the header data is laid out as we
+  // expect).
+  char LocalBuffer[kHeaderSize] = {};
+  memcpy(LocalBuffer, B.Data, kHeaderSize);
+  u32 BlockSize = 0;
+  u32 BlockNumber = 0;
+  u64 ThreadId = 0;
+  memcpy(&BlockSize, LocalBuffer, sizeof(u32));
+  memcpy(&BlockNumber, LocalBuffer + sizeof(u32), sizeof(u32));
+  memcpy(&ThreadId, LocalBuffer + (2 * sizeof(u32)), sizeof(u64));
+  EXPECT_NE(BlockSize, 0u);
+  EXPECT_GE(BlockNumber, 0u);
+  EXPECT_NE(ThreadId, 0u);
+}
+
+TEST(profileCollectorServiceTest, PostSerializeCollect) {
+  profilerFlags()->setDefaults();
+  // The most basic use case (and the only one we actually care about) is the
+  // one where we ensure that we can post FunctionCallTrie instances whose
+  // contents are copied and serialized properly, even after the originals
+  // are destroyed.
+  //
+  // First, we initialise a set of allocators in the local scope. This ensures
+  // that we're able to copy the contents of the FunctionCallTrie that uses
+  // the local allocators.
+  auto Allocators = FunctionCallTrie::InitAllocators();
+  FunctionCallTrie Trie(Allocators);
+
+  // Then, we populate the trie with some data.
+  Trie.enterFunction(1, 1);
+  Trie.enterFunction(2, 2);
+  Trie.exitFunction(2, 3);
+  Trie.exitFunction(1, 4);
+
+  // Then we post the data to the global profile collector service.
+  profileCollectorService::post(Trie, 1);
+
+  // Then we serialize the data.
+  profileCollectorService::serialize();
+
+  // Then we go through a single buffer to see whether we're getting the data
+  // we expect.
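+  // Passing an XRayBuffer with a nullptr Data member asks the iterator for
+  // the first serialized block.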
+  auto B = profileCollectorService::nextBuffer({nullptr, 0});
+  ValidateBlock(B);
+}
+
+// We break out a function that will be run in multiple threads, one that will
+// use a thread-local allocator, and will post the FunctionCallTrie to the
+// profileCollectorService. This simulates what the threads being profiled
+// would be doing anyway, but through the XRay logging implementation.
+void threadProcessing() {
+  thread_local auto Allocators = FunctionCallTrie::InitAllocators();
+  FunctionCallTrie Trie(Allocators);
+
+  Trie.enterFunction(1, 1);
+  Trie.enterFunction(2, 2);
+  Trie.exitFunction(2, 3);
+  Trie.exitFunction(1, 4);
+
+  profileCollectorService::post(Trie, __sanitizer::GetTid());
+}
+
+TEST(profileCollectorServiceTest, PostSerializeCollectMultipleThread) {
+  profilerFlags()->setDefaults();
+  std::thread t1(threadProcessing);
+  std::thread t2(threadProcessing);
+
+  t1.join();
+  t2.join();
+
+  // At this point, t1 and t2 are already done with what they were doing.
+  profileCollectorService::serialize();
+
+  // Ensure that we see two buffers.
+  auto B = profileCollectorService::nextBuffer({nullptr, 0});
+  ValidateBlock(B);
+
+  B = profileCollectorService::nextBuffer(B);
+  ValidateBlock(B);
+}
+
+} // namespace
+} // namespace __xray
Index: compiler-rt/lib/xray/xray_profile_collector.h
===================================================================
--- /dev/null
+++ compiler-rt/lib/xray/xray_profile_collector.h
@@ -0,0 +1,88 @@
+//===-- xray_profile_collector.h -------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// This file defines the interface for a data collection service, for XRay
+// profiling. What we implement here is an in-process service where
+// FunctionCallTrie instances can be handed off by threads, to be
+// consolidated/collected.
+//
+//===----------------------------------------------------------------------===//
+#ifndef XRAY_XRAY_PROFILE_COLLECTOR_H
+#define XRAY_XRAY_PROFILE_COLLECTOR_H
+
+#include "xray_function_call_trie.h"
+
+#include "xray/xray_log_interface.h"
+
+namespace __xray {
+
+/// The ProfileCollectorService implements a centralised mechanism for
+/// collecting FunctionCallTrie instances, indexed by thread ID. On demand,
+/// the ProfileCollectorService can be queried for the most recent state of
+/// the data, in a form that allows traversal.
+namespace profileCollectorService {
+
+/// Posts the FunctionCallTrie associated with a specific Thread ID. This
+/// will:
+///
+/// - Make a copy of the FunctionCallTrie and store that against the Thread
+///   ID. This will use the global allocator for the service-managed
+///   FunctionCallTrie instances.
+/// - Queue up a pointer to the FunctionCallTrie.
+/// - If the queue is long enough (longer than some arbitrary threshold) we
+///   then pre-calculate a single FunctionCallTrie for the whole process.
+///
+/// We are making a copy of the FunctionCallTrie because the intent is to have
+/// this function be called at thread exit, or soon after the profiling
+/// handler is finalized through the XRay APIs. By letting threads each
+/// process their own thread-local FunctionCallTrie instances, we're removing
+/// the need for synchronisation across threads while we're profiling.
+/// However, once we're done profiling, we can then collect copies of these
+/// FunctionCallTrie instances and pay the cost of the copy.
+///
+/// NOTE: In the future, if this turns out to be more costly than "moving" the
+/// FunctionCallTrie instances from the owning thread to the collector
+/// service, then we can change the implementation to do it this way (moving)
+/// instead.
+void post(const FunctionCallTrie &T, tid_t TId);
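+
+// A minimal usage sketch (hypothetical caller code, mirroring the unit
+// tests): a profiled thread could post its thread-local trie at exit.
+//
+//   thread_local auto Allocators = FunctionCallTrie::InitAllocators();
+//   FunctionCallTrie Trie(Allocators);
+//   ...
+//   profileCollectorService::post(Trie, __sanitizer::GetTid());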
+
+/// The serialize function will process all FunctionCallTrie instances in
+/// memory, and turn those into specifically formatted blocks, each one
+/// describing the function call trie's contents in a compact form. In
+/// memory, this looks like the following layout:
+///
+/// - block size (32 bits)
+/// - block number (32 bits)
+/// - thread id (64 bits)
+/// - list of records:
+///   - function ids in reverse call order, from leaf to root, terminated by
+///     0 (32 bits per function id)
+///   - call count (64 bits)
+///   - cumulative local time (64 bits)
+///   - record delimiter (64 bits, 0x0)
+///
+void serialize();
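+
+// As an illustration (hypothetical values, assuming a little-endian layout),
+// a record for the call path f1 -> f2 with one call and one unit of
+// cumulative local time attributed to f2 would be encoded as:
+//
+//   02 00 00 00  01 00 00 00  00 00 00 00  // fids 2, 1; 32-bit 0 terminator
+//   01 00 00 00 00 00 00 00                // call count = 1
+//   01 00 00 00 00 00 00 00                // cumulative local time = 1
+//   00 00 00 00 00 00 00 00                // record delimiter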
+
+/// The reset function will clear out any internal memory held by the
+/// service. The intent is to have the resetting be done in calls to the
+/// initialization routine, or explicitly through the flush log API.
+void reset();
+
+/// This nextBuffer function is meant to implement the iterator functionality
+/// provided in the XRay API.
+XRayBuffer nextBuffer(XRayBuffer B);
+
+} // namespace profileCollectorService
+
+} // namespace __xray
+
+#endif // XRAY_XRAY_PROFILE_COLLECTOR_H
Index: compiler-rt/lib/xray/xray_profile_collector.cc
===================================================================
--- /dev/null
+++ compiler-rt/lib/xray/xray_profile_collector.cc
@@ -0,0 +1,275 @@
+//===-- xray_profile_collector.cc ------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// This implements the interface for the profileCollectorService.
+//
+//===----------------------------------------------------------------------===//
+#include "xray_profile_collector.h"
+
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_vector.h"
+#include "xray_profiler_flags.h"
+#include <memory>
+#include <new>
+
+namespace __xray {
+namespace profileCollectorService {
+
+namespace {
+
+SpinMutex GlobalMutex;
+struct ThreadTrie {
+  tid_t TId;
+  FunctionCallTrie *Trie;
+};
+Vector<ThreadTrie> ThreadTries;
+
+struct ProfileBuffer {
+  void *Data;
+  size_t Size;
+};
+Vector<ProfileBuffer> ProfileBuffers;
+
+struct BlockHeader {
+  u32 BlockSize;
+  u32 BlockNum;
+  u64 ThreadId;
+};
+
+FunctionCallTrie::Allocators *GlobalAllocators = nullptr;
+
+} // namespace
+
+void post(const FunctionCallTrie &T, tid_t TId) {
+  static const bool UNUSED Once = [] {
+    SpinMutexLock Lock(&GlobalMutex);
+    GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>(
+        InternalAlloc(sizeof(FunctionCallTrie::Allocators)));
+    new (GlobalAllocators) FunctionCallTrie::Allocators();
+    *GlobalAllocators = FunctionCallTrie::InitAllocators();
+    return false;
+  }();
+  DCHECK_NE(GlobalAllocators, nullptr);
+
+  ThreadTrie *Item = nullptr;
+  {
+    SpinMutexLock Lock(&GlobalMutex);
+    if (GlobalAllocators == nullptr)
+      return;
+
+    Item = ThreadTries.PushBack();
+    Item->TId = TId;
+    Item->Trie = reinterpret_cast<FunctionCallTrie *>(
+        InternalAlloc(sizeof(FunctionCallTrie)));
+    new (Item->Trie) FunctionCallTrie(*GlobalAllocators);
+    DCHECK_NE(Item->Trie, nullptr);
+  }
+  DCHECK_NE(Item, nullptr);
+
+  T.deepCopyInto(*Item->Trie);
+}
+
+using PathArray = Array<int32_t>;
+
+struct ProfileRecord {
+  using PathAllocator = typename PathArray::AllocatorType;
+
+  // The Path in this record is in reverse order.
+  PathArray *Path = nullptr;
+  const FunctionCallTrie::Node *Node = nullptr;
+
+  // Constructor for in-place construction.
+  ProfileRecord(PathAllocator &A, const FunctionCallTrie::Node *N)
+      : Path([&] {
+          auto P =
+              reinterpret_cast<PathArray *>(InternalAlloc(sizeof(PathArray)));
+          new (P) PathArray(A);
+          return P;
+        }()),
+        Node(N) {}
+};
+
+namespace {
+
+using ProfileRecordArray = Array<ProfileRecord>;
+
+// We go through a FunctionCallTrie and traverse from the root, in DFS
+// fashion, to generate the path(s) and output the data.
+void populateRecords(ProfileRecordArray &PRs, ProfileRecord::PathAllocator &PA,
+                     const FunctionCallTrie &Trie) {
+  for (const auto R : Trie.getRoots()) {
+    using StackArray = Array<const FunctionCallTrie::Node *>;
+    using StackAllocator = typename StackArray::AllocatorType;
+    StackAllocator StackAlloc(
+        profilerFlags()->xray_profiling_stack_allocator_max, 0);
+    StackArray DFSStack(StackAlloc);
+    DFSStack.Append(R);
+    while (!DFSStack.empty()) {
+      auto Node = DFSStack.back();
+      DFSStack.trim(1);
+      auto Record = PRs.AppendEmplace(PA, Node);
+      DCHECK_NE(Record, nullptr);
+
+      // Traverse the Node's parents and as we're doing so, get the FIds in
+      // the order they appear.
+      for (auto N = Node; N != nullptr; N = N->Parent)
+        Record->Path->Append(N->FId);
+      DCHECK(!Record->Path->empty());
+
+      for (const auto C : Node->Callees)
+        DFSStack.Append(C.NodePtr);
+    }
+  }
+}
+
+void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header,
+                      const ProfileRecordArray &ProfileRecords) {
+  auto NextPtr = static_cast<char *>(
+                     internal_memcpy(Buffer->Data, &Header, sizeof(Header))) +
+                 sizeof(Header);
+  for (const auto &Record : ProfileRecords) {
+    // The list of function ids follows:
+    for (const auto FId : *Record.Path)
+      NextPtr =
+          static_cast<char *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) +
+          sizeof(FId);
+
+    // Add the sentinel here.
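+    // (The sentinel is a 32-bit zero function id; a reader scanning the path
+    // list knows the record's counters start right after it.)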
+    constexpr int32_t SentinelFId = 0;
+    NextPtr = static_cast<char *>(
+                  internal_memset(NextPtr, SentinelFId, sizeof(SentinelFId))) +
+              sizeof(SentinelFId);
+
+    // Add the node data here.
+    NextPtr =
+        static_cast<char *>(internal_memcpy(NextPtr, &Record.Node->CallCount,
+                                            sizeof(Record.Node->CallCount))) +
+        sizeof(Record.Node->CallCount);
+    NextPtr = static_cast<char *>(
+                  internal_memcpy(NextPtr, &Record.Node->CumulativeLocalTime,
+                                  sizeof(Record.Node->CumulativeLocalTime))) +
+              sizeof(Record.Node->CumulativeLocalTime);
+
+    // Add an end-of-record sentinel here.
+    constexpr uint64_t EndOfRecord = 0x0;
+    NextPtr = static_cast<char *>(
+                  internal_memset(NextPtr, EndOfRecord, sizeof(EndOfRecord))) +
+              sizeof(EndOfRecord);
+  }
+
+  DCHECK_EQ(NextPtr - static_cast<char *>(Buffer->Data), Buffer->Size);
+}
+
+} // namespace
+
+void serialize() {
+  SpinMutexLock Lock(&GlobalMutex);
+
+  // Clear out the global ProfileBuffers.
+  for (uptr I = 0; I < ProfileBuffers.Size(); ++I)
+    InternalFree(ProfileBuffers[I].Data);
+  ProfileBuffers.Reset();
+
+  if (ThreadTries.Size() == 0)
+    return;
+
+  // Then repopulate the global ProfileBuffers.
+  for (u32 I = 0; I < ThreadTries.Size(); ++I) {
+    using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
+    ProfileRecordAllocator PRAlloc(
+        profilerFlags()->xray_profiling_global_allocator_max, 0);
+    ProfileRecord::PathAllocator PathAlloc(
+        profilerFlags()->xray_profiling_global_allocator_max, 0);
+    ProfileRecordArray ProfileRecords(PRAlloc);
+
+    // First, we want to compute the amount of space we're going to need.
+    // We'll use a local allocator and an __xray::Array<...> to store the
+    // intermediary data, then compute the size as we're going along. Then
+    // we'll allocate the contiguous space to contain the thread buffer data.
+    const auto &Trie = *ThreadTries[I].Trie;
+    if (Trie.getRoots().empty())
+      continue;
+    populateRecords(ProfileRecords, PathAlloc, Trie);
+    DCHECK(!Trie.getRoots().empty());
+    DCHECK(!ProfileRecords.empty());
+
+    // Go through each record, to compute the sizes.
+    //
+    // header size = block size (4 bytes)
+    //             + block number (4 bytes)
+    //             + thread id (8 bytes)
+    // record size = path ids (4 bytes * number of ids + sentinel 4 bytes)
+    //             + call count (8 bytes)
+    //             + local time (8 bytes)
+    //             + end of record (8 bytes)
+    u32 CumulativeSizes = 0;
+    for (const auto &Record : ProfileRecords)
+      CumulativeSizes += 28 + (4 * Record.Path->size());
+
+    BlockHeader Header{16 + CumulativeSizes, I, ThreadTries[I].TId};
+    auto Buffer = ProfileBuffers.PushBack();
+    Buffer->Size = sizeof(Header) + CumulativeSizes;
+    Buffer->Data = InternalAlloc(Buffer->Size, nullptr, 64);
+    DCHECK_NE(Buffer->Data, nullptr);
+    serializeRecords(Buffer, Header, ProfileRecords);
+
+    // Now clean up the ProfileRecords array, one at a time.
+    for (auto &Record : ProfileRecords) {
+      Record.Path->~PathArray();
+      InternalFree(Record.Path);
+    }
+  }
+}
+
+void reset() {
+  SpinMutexLock Lock(&GlobalMutex);
+  // Clear out the profile buffers that have been serialized.
+  for (uptr I = 0; I < ProfileBuffers.Size(); ++I)
+    InternalFree(ProfileBuffers[I].Data);
+  ProfileBuffers.Reset();
+
+  // Clear out the function call tries per thread.
+  for (uptr I = 0; I < ThreadTries.Size(); ++I) {
+    auto &T = ThreadTries[I];
+    T.Trie->~FunctionCallTrie();
+    InternalFree(T.Trie);
+  }
+  ThreadTries.Reset();
+
+  // Reset the global allocators, so that subsequent calls to post() start
+  // from a clean slate.
+  if (GlobalAllocators != nullptr) {
+    GlobalAllocators->~Allocators();
+    InternalFree(GlobalAllocators);
+    GlobalAllocators = nullptr;
+  }
+  GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>(
+      InternalAlloc(sizeof(FunctionCallTrie::Allocators)));
+  new (GlobalAllocators) FunctionCallTrie::Allocators();
+  *GlobalAllocators = FunctionCallTrie::InitAllocators();
+}
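+
+// A minimal sketch of how a consumer might drive the iterator below
+// (hypothetical caller code, not part of this change):
+//
+//   auto B = profileCollectorService::nextBuffer({nullptr, 0});
+//   while (B.Data != nullptr) {
+//     // ... consume B.Size bytes starting at B.Data ...
+//     B = profileCollectorService::nextBuffer(B);
+//   }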
+
+XRayBuffer nextBuffer(XRayBuffer B) {
+  // FIXME: Find a way to identify the buffers, somehow, from within this
+  // function.
+  SpinMutexLock Lock(&GlobalMutex);
+  // Bail out early when nothing has been serialized yet; this also guards
+  // the header read below against a nullptr B.Data.
+  if (ProfileBuffers.Size() == 0)
+    return {nullptr, 0};
+
+  if (B.Data == nullptr)
+    return {ProfileBuffers[0].Data, ProfileBuffers[0].Size};
+
+  BlockHeader Header;
+  internal_memcpy(&Header, B.Data, sizeof(BlockHeader));
+  auto NextBlock = Header.BlockNum + 1;
+  if (NextBlock < ProfileBuffers.Size())
+    return {ProfileBuffers[NextBlock].Data, ProfileBuffers[NextBlock].Size};
+  return {nullptr, 0};
+}
+
+} // namespace profileCollectorService
+} // namespace __xray