Index: compiler-rt/lib/xray/tests/unit/allocator_test.cc
===================================================================
--- compiler-rt/lib/xray/tests/unit/allocator_test.cc
+++ compiler-rt/lib/xray/tests/unit/allocator_test.cc
@@ -22,16 +22,16 @@
   s64 Second;
 };
 
-TEST(AllocatorTest, Construction) { Allocator<sizeof(TestData)> A(2 << 11, 0); }
+TEST(AllocatorTest, Construction) { Allocator<sizeof(TestData)> A(2 << 11); }
 
 TEST(AllocatorTest, Allocate) {
-  Allocator<sizeof(TestData)> A(2 << 11, 0);
+  Allocator<sizeof(TestData)> A(2 << 11);
   auto B = A.Allocate();
   ASSERT_NE(B.Data, nullptr);
 }
 
 TEST(AllocatorTest, OverAllocate) {
-  Allocator<sizeof(TestData)> A(sizeof(TestData), 0);
+  Allocator<sizeof(TestData)> A(sizeof(TestData));
   auto B1 = A.Allocate();
   (void)B1;
   auto B2 = A.Allocate();
Index: compiler-rt/lib/xray/tests/unit/function_call_trie_test.cc
===================================================================
--- compiler-rt/lib/xray/tests/unit/function_call_trie_test.cc
+++ compiler-rt/lib/xray/tests/unit/function_call_trie_test.cc
@@ -19,8 +19,6 @@
 namespace {
 
 TEST(FunctionCallTrieTest, ConstructWithTLSAllocators) {
-  // FIXME: Support passing in configuration for allocators in the allocator
-  // constructors.
   profilingFlags()->setDefaults();
   FunctionCallTrie::Allocators Allocators = FunctionCallTrie::InitAllocators();
   FunctionCallTrie Trie(Allocators);
@@ -47,6 +45,7 @@
 }
 
 TEST(FunctionCallTrieTest, MissingFunctionEntry) {
+  profilingFlags()->setDefaults();
   auto A = FunctionCallTrie::InitAllocators();
   FunctionCallTrie Trie(A);
   Trie.exitFunction(1, 1);
@@ -56,6 +55,7 @@
 }
 
 TEST(FunctionCallTrieTest, NoMatchingEntersForExit) {
+  profilingFlags()->setDefaults();
   auto A = FunctionCallTrie::InitAllocators();
   FunctionCallTrie Trie(A);
   Trie.enterFunction(2, 1);
@@ -63,16 +63,19 @@
   Trie.exitFunction(1, 5);
   const auto &R = Trie.getRoots();
 
-  ASSERT_TRUE(R.empty());
+  ASSERT_FALSE(R.empty());
+  EXPECT_EQ(R.size(), size_t{1});
 }
 
 TEST(FunctionCallTrieTest, MissingFunctionExit) {
+  profilingFlags()->setDefaults();
   auto A = FunctionCallTrie::InitAllocators();
   FunctionCallTrie Trie(A);
   Trie.enterFunction(1, 1);
   const auto &R = Trie.getRoots();
 
-  ASSERT_TRUE(R.empty());
+  ASSERT_FALSE(R.empty());
+  EXPECT_EQ(R.size(), size_t{1});
 }
 
 TEST(FunctionCallTrieTest, MultipleRoots) {
Index: compiler-rt/lib/xray/tests/unit/segmented_array_test.cc
===================================================================
--- compiler-rt/lib/xray/tests/unit/segmented_array_test.cc
+++ compiler-rt/lib/xray/tests/unit/segmented_array_test.cc
@@ -12,27 +12,29 @@
   TestData(s64 F, s64 S) : First(F), Second(S) {}
 };
 
-TEST(SegmentedArrayTest, Construction) {
-  Array<TestData> Data;
-  (void)Data;
-}
-
-TEST(SegmentedArrayTest, ConstructWithAllocator) {
+TEST(SegmentedArrayTest, ConstructWithAllocators) {
   using AllocatorType = typename Array<TestData>::AllocatorType;
-  AllocatorType A(1 << 4, 0);
-  Array<TestData> Data(A);
+  AllocatorType A(1 << 4);
+  ChunkAllocator CA(1 << 4);
+  Array<TestData> Data(A, CA);
   (void)Data;
 }
 
 TEST(SegmentedArrayTest, ConstructAndPopulate) {
-  Array<TestData> data;
+  using AllocatorType = typename Array<TestData>::AllocatorType;
+  AllocatorType A(1 << 4);
+  ChunkAllocator CA(1 << 4);
+  Array<TestData> data(A, CA);
   ASSERT_NE(data.Append(TestData{0, 0}), nullptr);
   ASSERT_NE(data.Append(TestData{1, 1}), nullptr);
   ASSERT_EQ(data.size(), 2u);
 }
 
 TEST(SegmentedArrayTest, ConstructPopulateAndLookup) {
-  Array<TestData> data;
+  using AllocatorType = typename Array<TestData>::AllocatorType;
+  AllocatorType A(1 << 4);
+  ChunkAllocator CA(1 << 4);
+  Array<TestData> data(A, CA);
   ASSERT_NE(data.Append(TestData{0, 1}), nullptr);
   ASSERT_EQ(data.size(), 1u);
   ASSERT_EQ(data[0].First, 0);
@@ -40,7 +42,10 @@
 }
 
 TEST(SegmentedArrayTest, PopulateWithMoreElements) {
-  Array<TestData> data;
+  using AllocatorType = typename Array<TestData>::AllocatorType;
+  AllocatorType A(1 << 24);
+  ChunkAllocator CA(1 << 20);
+  Array<TestData> data(A, CA);
   static const auto kMaxElements = 100u;
   for (auto I = 0u; I < kMaxElements; ++I) {
     ASSERT_NE(data.Append(TestData{I, I + 1}), nullptr);
@@ -53,14 +58,20 @@
 }
 
 TEST(SegmentedArrayTest, AppendEmplace) {
-  Array<TestData> data;
+  using AllocatorType = typename Array<TestData>::AllocatorType;
+  AllocatorType A(1 << 4);
+  ChunkAllocator CA(1 << 4);
+  Array<TestData> data(A, CA);
   ASSERT_NE(data.AppendEmplace(1, 1), nullptr);
   ASSERT_EQ(data[0].First, 1);
   ASSERT_EQ(data[0].Second, 1);
 }
 
 TEST(SegmentedArrayTest, AppendAndTrim) {
-  Array<TestData> data;
+  using AllocatorType = typename Array<TestData>::AllocatorType;
+  AllocatorType A(1 << 4);
+  ChunkAllocator CA(1 << 4);
+  Array<TestData> data(A, CA);
   ASSERT_NE(data.AppendEmplace(1, 1), nullptr);
   ASSERT_EQ(data.size(), 1u);
   data.trim(1);
@@ -69,7 +80,10 @@
 }
 
 TEST(SegmentedArrayTest, IteratorAdvance) {
-  Array<TestData> data;
+  using AllocatorType = typename Array<TestData>::AllocatorType;
+  AllocatorType A(1 << 4);
+  ChunkAllocator CA(1 << 4);
+  Array<TestData> data(A, CA);
   ASSERT_TRUE(data.empty());
   ASSERT_EQ(data.begin(), data.end());
   auto I0 = data.begin();
@@ -88,7 +102,10 @@
 }
 
 TEST(SegmentedArrayTest, IteratorRetreat) {
-  Array<TestData> data;
+  using AllocatorType = typename Array<TestData>::AllocatorType;
+  AllocatorType A(1 << 4);
+  ChunkAllocator CA(1 << 4);
+  Array<TestData> data(A, CA);
   ASSERT_TRUE(data.empty());
   ASSERT_EQ(data.begin(), data.end());
   ASSERT_NE(data.AppendEmplace(1, 1), nullptr);
@@ -108,8 +125,9 @@
 
 TEST(SegmentedArrayTest, IteratorTrimBehaviour) {
   using AllocatorType = typename Array<TestData>::AllocatorType;
-  AllocatorType A(1 << 10, 0);
-  Array<TestData> Data(A);
+  AllocatorType A(1 << 20);
+  ChunkAllocator CA(1 << 10);
+  Array<TestData> Data(A, CA);
   ASSERT_TRUE(Data.empty());
   auto I0Begin = Data.begin(), I0End = Data.end();
   // Add enough elements in Data to have more than one chunk.
@@ -160,8 +178,9 @@
 
 TEST(SegmentedArrayTest, SimulateStackBehaviour) {
   using AllocatorType = typename Array<ShadowStackEntry>::AllocatorType;
-  AllocatorType A(1 << 10, 0);
-  Array<ShadowStackEntry> Data(A);
+  AllocatorType A(1 << 10);
+  ChunkAllocator CA(1 << 10);
+  Array<ShadowStackEntry> Data(A, CA);
   static uint64_t Dummy = 0;
   constexpr uint64_t Max = 9;
 
Index: compiler-rt/lib/xray/xray_allocator.h
===================================================================
--- compiler-rt/lib/xray/xray_allocator.h
+++ compiler-rt/lib/xray/xray_allocator.h
@@ -16,10 +16,11 @@
 #ifndef XRAY_ALLOCATOR_H
 #define XRAY_ALLOCATOR_H
 
-#include "sanitizer_common/sanitizer_allocator_internal.h"
 #include "sanitizer_common/sanitizer_common.h"
 #include "sanitizer_common/sanitizer_internal_defs.h"
 #include "sanitizer_common/sanitizer_mutex.h"
+#include "sanitizer_common/sanitizer_posix.h"
+#include "sys/mman.h"
 #include "xray_utils.h"
 #include <cstddef>
 #include <cstdint>
@@ -36,130 +37,87 @@
 /// allocation function. N is used to compute the size of a block, which is
 /// cache-line-size multiples worth of memory. We compute the size of a block by
 /// determining how many cache lines worth of memory is required to subsume N.
+///
+/// The Allocator instance will manage its own memory acquired through mmap.
+/// This severely constrains the platforms on which this can be used to POSIX
+/// systems where mmap semantics are well-defined.
+///
+/// FIXME: Isolate the lower-level memory management to a different abstraction
+/// that can be platform-specific.
 template <size_t N> struct Allocator {
   // The Allocator returns memory as Block instances.
   struct Block {
     /// Compute the minimum cache-line size multiple that is >= N.
     static constexpr auto Size = nearest_boundary(N, kCacheLineSize);
-    void *Data = nullptr;
+    void *Data;
   };
 
 private:
-  // A BlockLink will contain a fixed number of blocks, each with an identifier
-  // to specify whether it's been handed out or not. We keep track of BlockLink
-  // iterators, which are basically a pointer to the link and an offset into
-  // the fixed set of blocks associated with a link. The iterators are
-  // bidirectional.
-  //
-  // We're calling it a "link" in the context of seeing these as a chain of
-  // block pointer containers (i.e. links in a chain).
-  struct BlockLink {
-    static_assert(kCacheLineSize % sizeof(void *) == 0,
-                  "Cache line size is not divisible by size of void*; none of "
-                  "the assumptions of the BlockLink will hold.");
-
-    // We compute the number of pointers to areas in memory where we consider as
-    // individual blocks we've allocated. To ensure that instances of the
-    // BlockLink object are cache-line sized, we deduct two pointers worth
-    // representing the pointer to the previous link and the backing store for
-    // the whole block.
-    //
-    // This structure corresponds to the following layout:
-    //
-    //   Blocks [ 0, 1, 2, .., BlockPtrCount - 2]
-    //
-    static constexpr auto BlockPtrCount =
-        (kCacheLineSize / sizeof(Block *)) - 2;
-
-    BlockLink() {
-      // Zero out Blocks.
-      // FIXME: Use a braced member initializer when we drop support for GCC
-      // 4.8.
-      internal_memset(Blocks, 0, sizeof(Blocks));
-    }
-
-    // FIXME: Align this to cache-line address boundaries?
-    Block Blocks[BlockPtrCount];
-    BlockLink *Prev = nullptr;
-    void *BackingStore = nullptr;
-  };
-
-  static_assert(sizeof(BlockLink) == kCacheLineSize,
-                "BlockLink instances must be cache-line-sized.");
+  const size_t MaxMemory{0};
+  void *BackingStore = nullptr;
+  void *AlignedNextBlock = nullptr;
+  size_t AllocatedBlocks = 0;
+  SpinMutex Mutex{};
 
-  static BlockLink NullLink;
+  void *Alloc() {
+    SpinMutexLock Lock(&Mutex);
+    if (UNLIKELY(BackingStore == nullptr)) {
+      BackingStore = reinterpret_cast<void *>(
+          internal_mmap(NULL, MaxMemory, PROT_READ | PROT_WRITE,
+                        MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, 0, 0));
+      if (BackingStore == MAP_FAILED) {
+        BackingStore = nullptr;
+        if (Verbosity())
+          Report("XRay Profiling: Failed to allocate memory for allocator.\n");
+        return nullptr;
+      }
+
+      AlignedNextBlock = BackingStore;
+
+      // Ensure that NextBlock is aligned appropriately.
+      auto BackingStoreNum = reinterpret_cast<uintptr_t>(BackingStore);
+      auto AlignedNextBlockNum = nearest_boundary(
+          reinterpret_cast<uintptr_t>(AlignedNextBlock), kCacheLineSize);
+      if (diff(AlignedNextBlockNum, BackingStoreNum) > ptrdiff_t(MaxMemory)) {
+        munmap(BackingStore, MaxMemory);
+        AlignedNextBlock = BackingStore = nullptr;
+        if (Verbosity())
+          Report("XRay Profiling: Cannot obtain enough memory from "
+                 "preallocated region.\n");
+        return nullptr;
+      }
+
+      AlignedNextBlock = reinterpret_cast<void *>(AlignedNextBlockNum);
+
+      // Assert that AlignedNextBlock is cache-line aligned.
+      DCHECK_EQ(reinterpret_cast<uintptr_t>(AlignedNextBlock) % kCacheLineSize,
+                0);
+    }
 
-  // FIXME: Implement a freelist, in case we actually do intend to return memory
-  // to the allocator, as opposed to just de-allocating everything in one go?
+    if ((AllocatedBlocks * Block::Size) >= MaxMemory)
+      return nullptr;
 
-  size_t MaxMemory;
-  SpinMutex Mutex{};
-  BlockLink *Tail = &NullLink;
-  size_t Counter = 0;
-
-  BlockLink *NewChainLink(uint64_t Alignment) {
-    auto NewChain = reinterpret_cast<BlockLink *>(
-        InternalAlloc(sizeof(BlockLink), nullptr, kCacheLineSize));
-    auto BackingStore = reinterpret_cast<char *>(InternalAlloc(
-        (BlockLink::BlockPtrCount + 1) * Block::Size, nullptr, Alignment));
-    size_t Offset = 0;
-    DCHECK_NE(NewChain, nullptr);
-    DCHECK_NE(BackingStore, nullptr);
-    NewChain->BackingStore = BackingStore;
-
-    // Here we ensure that the alignment of the pointers we're handing out
-    // adhere to the alignment requirements of the call to Allocate().
-    for (auto &B : NewChain->Blocks) {
-      auto AlignmentAdjustment =
-          nearest_boundary(reinterpret_cast<uintptr_t>(BackingStore + Offset),
-                           Alignment) -
-          reinterpret_cast<uintptr_t>(BackingStore + Offset);
-      B.Data = BackingStore + AlignmentAdjustment + Offset;
-      DCHECK_EQ(reinterpret_cast<uintptr_t>(B.Data) % Alignment, 0);
-      Offset += AlignmentAdjustment + Block::Size;
-    }
-    NewChain->Prev = Tail;
-    return NewChain;
+    // Align the pointer we'd like to return to an appropriate alignment, then
+    // advance the pointer from where to start allocations.
+    void *Result = AlignedNextBlock;
+    AlignedNextBlock = reinterpret_cast<void *>(
+        reinterpret_cast<char *>(AlignedNextBlock) + N);
+    ++AllocatedBlocks;
+    return Result;
   }
 
 public:
-  Allocator(size_t M, size_t PreAllocate) : MaxMemory(M) {
-    // FIXME: Implement PreAllocate support!
-  }
-
-  Block Allocate(uint64_t Alignment = 8) {
-    SpinMutexLock Lock(&Mutex);
-    // Check whether we're over quota.
-    if (Counter * Block::Size >= MaxMemory)
-      return {};
+  explicit Allocator(size_t M)
+      : MaxMemory(nearest_boundary(M, kCacheLineSize)) {}
 
-    size_t ChainOffset = Counter % BlockLink::BlockPtrCount;
-
-    Block B{};
-    BlockLink *Link = Tail;
-    if (UNLIKELY(Counter == 0 || ChainOffset == 0))
-      Tail = Link = NewChainLink(Alignment);
-
-    B = Link->Blocks[ChainOffset];
-    ++Counter;
-    return B;
-  }
+  Block Allocate() { return {Alloc()}; }
 
   ~Allocator() NOEXCEPT {
-    // We need to deallocate all the blocks, including the chain links.
-    for (auto *C = Tail; C != &NullLink;) {
-      // We know that the data block is a large contiguous page, we deallocate
-      // that at once.
-      InternalFree(C->BackingStore);
-      auto Prev = C->Prev;
-      InternalFree(C);
-      C = Prev;
+    if (BackingStore != nullptr) {
+      internal_munmap(BackingStore, MaxMemory);
     }
   }
-}; // namespace __xray
-
-// Storage for the NullLink sentinel.
-template <size_t N> typename Allocator<N>::BlockLink Allocator<N>::NullLink;
+};
 
 } // namespace __xray
Index: compiler-rt/lib/xray/xray_function_call_trie.h
===================================================================
--- compiler-rt/lib/xray/xray_function_call_trie.h
+++ compiler-rt/lib/xray/xray_function_call_trie.h
@@ -15,6 +15,7 @@
 #ifndef XRAY_FUNCTION_CALL_TRIE_H
 #define XRAY_FUNCTION_CALL_TRIE_H
 
+#include "sanitizer_common/sanitizer_allocator_internal.h"
 #include "xray_profiling_flags.h"
 #include "xray_segmented_array.h"
 #include <new> // For placement new.
@@ -118,9 +119,9 @@
 
     // We add a constructor here to allow us to inplace-construct through
     // Array<...>'s AppendEmplace.
-    Node(Node *P, NodeIdPairAllocatorType &A, int64_t CC, int64_t CLT,
-         int32_t F)
-        : Parent(P), Callees(A), CallCount(CC), CumulativeLocalTime(CLT),
+    Node(Node *P, NodeIdPairAllocatorType &A, ChunkAllocator &CA, int64_t CC,
+         int64_t CLT, int32_t F)
+        : Parent(P), Callees(A, CA), CallCount(CC), CumulativeLocalTime(CLT),
           FId(F) {}
 
     // TODO: Include the compact histogram.
@@ -152,6 +153,7 @@
     RootAllocatorType *RootAllocator = nullptr;
     ShadowStackAllocatorType *ShadowStackAllocator = nullptr;
     NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr;
+    ChunkAllocator *ChunkAlloc = nullptr;
 
     Allocators() {}
     Allocators(const Allocators &) = delete;
@@ -160,11 +162,12 @@
     Allocators(Allocators &&O)
         : NodeAllocator(O.NodeAllocator), RootAllocator(O.RootAllocator),
           ShadowStackAllocator(O.ShadowStackAllocator),
-          NodeIdPairAllocator(O.NodeIdPairAllocator) {
+          NodeIdPairAllocator(O.NodeIdPairAllocator), ChunkAlloc(O.ChunkAlloc) {
       O.NodeAllocator = nullptr;
      O.RootAllocator = nullptr;
       O.ShadowStackAllocator = nullptr;
       O.NodeIdPairAllocator = nullptr;
+      O.ChunkAlloc = nullptr;
     }
 
     Allocators &operator=(Allocators &&O) {
@@ -188,6 +191,11 @@
         O.NodeIdPairAllocator = this->NodeIdPairAllocator;
         this->NodeIdPairAllocator = Tmp;
       }
+      {
+        auto Tmp = O.ChunkAlloc;
+        O.ChunkAlloc = this->ChunkAlloc;
+        this->ChunkAlloc = Tmp;
+      }
       return *this;
     }
 
@@ -198,18 +206,27 @@
       if (NodeAllocator != nullptr) {
         NodeAllocator->~NodeAllocatorType();
         InternalFree(NodeAllocator);
+        NodeAllocator = nullptr;
       }
       if (RootAllocator != nullptr) {
         RootAllocator->~RootAllocatorType();
         InternalFree(RootAllocator);
+        RootAllocator = nullptr;
       }
       if (ShadowStackAllocator != nullptr) {
         ShadowStackAllocator->~ShadowStackAllocatorType();
         InternalFree(ShadowStackAllocator);
+        ShadowStackAllocator = nullptr;
       }
       if (NodeIdPairAllocator != nullptr) {
         NodeIdPairAllocator->~NodeIdPairAllocatorType();
         InternalFree(NodeIdPairAllocator);
+        NodeIdPairAllocator = nullptr;
+      }
+      if (ChunkAlloc != nullptr) {
+        ChunkAlloc->~ChunkAllocator();
+        InternalFree(ChunkAlloc);
+        ChunkAlloc = nullptr;
       }
     }
   };
@@ -223,24 +240,29 @@
     Allocators A;
     auto NodeAllocator = reinterpret_cast<Allocators::NodeAllocatorType *>(
        InternalAlloc(sizeof(Allocators::NodeAllocatorType)));
-    new (NodeAllocator) Allocators::NodeAllocatorType(Max, 0);
+    new (NodeAllocator) Allocators::NodeAllocatorType(Max);
     A.NodeAllocator = NodeAllocator;
 
     auto RootAllocator = reinterpret_cast<Allocators::RootAllocatorType *>(
        InternalAlloc(sizeof(Allocators::RootAllocatorType)));
-    new (RootAllocator) Allocators::RootAllocatorType(Max, 0);
+    new (RootAllocator) Allocators::RootAllocatorType(Max);
     A.RootAllocator = RootAllocator;
 
     auto ShadowStackAllocator =
        reinterpret_cast<Allocators::ShadowStackAllocatorType *>(
            InternalAlloc(sizeof(Allocators::ShadowStackAllocatorType)));
-    new (ShadowStackAllocator) Allocators::ShadowStackAllocatorType(Max, 0);
+    new (ShadowStackAllocator) Allocators::ShadowStackAllocatorType(Max);
     A.ShadowStackAllocator = ShadowStackAllocator;
 
    auto NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>(
        InternalAlloc(sizeof(NodeIdPairAllocatorType)));
-    new (NodeIdPairAllocator) NodeIdPairAllocatorType(Max, 0);
+    new (NodeIdPairAllocator) NodeIdPairAllocatorType(Max);
     A.NodeIdPairAllocator = NodeIdPairAllocator;
+
+    auto ChunkAlloc = reinterpret_cast<ChunkAllocator *>(
+       InternalAlloc(sizeof(ChunkAllocator)));
+    new (ChunkAlloc) ChunkAllocator(Max);
+    A.ChunkAlloc = ChunkAlloc;
 
     return A;
   }
 
@@ -249,20 +271,22 @@
   RootArray Roots;
   ShadowStackArray ShadowStack;
   NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr;
+  ChunkAllocator *ChunkAlloc = nullptr;
 
 public:
   explicit FunctionCallTrie(const Allocators &A)
-      : Nodes(*A.NodeAllocator), Roots(*A.RootAllocator),
-        ShadowStack(*A.ShadowStackAllocator),
-        NodeIdPairAllocator(A.NodeIdPairAllocator) {}
+      : Nodes(*A.NodeAllocator, *A.ChunkAlloc),
+        Roots(*A.RootAllocator, *A.ChunkAlloc),
+        ShadowStack(*A.ShadowStackAllocator, *A.ChunkAlloc),
+        NodeIdPairAllocator(A.NodeIdPairAllocator), ChunkAlloc(A.ChunkAlloc) {}
 
   void enterFunction(const int32_t FId, uint64_t TSC) {
     DCHECK_NE(FId, 0);
     // This function primarily deals with ensuring that the ShadowStack is
     // consistent and ready for when an exit event is encountered.
     if (UNLIKELY(ShadowStack.empty())) {
-      auto NewRoot =
-          Nodes.AppendEmplace(nullptr, *NodeIdPairAllocator, 0, 0, FId);
+      auto NewRoot = Nodes.AppendEmplace(nullptr, *NodeIdPairAllocator,
+                                         *ChunkAlloc, 0, 0, FId);
       if (UNLIKELY(NewRoot == nullptr))
         return;
       Roots.Append(NewRoot);
@@ -285,8 +309,8 @@
     }
 
     // This means we've never seen this stack before, create a new node here.
-    auto NewNode =
-        Nodes.AppendEmplace(TopNode, *NodeIdPairAllocator, 0, 0, FId);
+    auto NewNode = Nodes.AppendEmplace(TopNode, *NodeIdPairAllocator,
+                                       *ChunkAlloc, 0, 0, FId);
     if (UNLIKELY(NewNode == nullptr))
       return;
     DCHECK_NE(NewNode, nullptr);
@@ -345,13 +369,14 @@
     using Stack = Array<NodeAndParent>;
 
     typename Stack::AllocatorType StackAllocator(
-        profilingFlags()->stack_allocator_max, 0);
-    Stack DFSStack(StackAllocator);
+        profilingFlags()->stack_allocator_max);
+    ChunkAllocator StackChunkAllocator(profilingFlags()->stack_allocator_max);
+    Stack DFSStack(StackAllocator, StackChunkAllocator);
 
     for (const auto Root : getRoots()) {
       // Add a node in O for this root.
       auto NewRoot = O.Nodes.AppendEmplace(
-          nullptr, *O.NodeIdPairAllocator, Root->CallCount,
+          nullptr, *O.NodeIdPairAllocator, *O.ChunkAlloc, Root->CallCount,
           Root->CumulativeLocalTime, Root->FId);
 
       // Because we cannot allocate more memory we should bail out right away.
@@ -370,8 +395,9 @@
       DFSStack.trim(1);
       for (const auto Callee : NP.Node->Callees) {
         auto NewNode = O.Nodes.AppendEmplace(
-            NP.NewNode, *O.NodeIdPairAllocator, Callee.NodePtr->CallCount,
-            Callee.NodePtr->CumulativeLocalTime, Callee.FId);
+            NP.NewNode, *O.NodeIdPairAllocator, *O.ChunkAlloc,
+            Callee.NodePtr->CallCount, Callee.NodePtr->CumulativeLocalTime,
+            Callee.FId);
         if (UNLIKELY(NewNode == nullptr))
           return;
         NP.NewNode->Callees.AppendEmplace(NewNode, Callee.FId);
@@ -396,16 +422,17 @@
     };
     using Stack = Array<NodeAndTarget>;
 
     typename Stack::AllocatorType StackAllocator(
-        profilingFlags()->stack_allocator_max, 0);
-    Stack DFSStack(StackAllocator);
+        profilingFlags()->stack_allocator_max);
+    ChunkAllocator CA(profilingFlags()->stack_allocator_max);
+    Stack DFSStack(StackAllocator, CA);
 
     for (const auto Root : getRoots()) {
       Node *TargetRoot = nullptr;
       auto R = O.Roots.find_element(
           [&](const Node *Node) { return Node->FId == Root->FId; });
       if (R == nullptr) {
-        TargetRoot = O.Nodes.AppendEmplace(nullptr, *O.NodeIdPairAllocator, 0,
-                                           0, Root->FId);
+        TargetRoot = O.Nodes.AppendEmplace(nullptr, *O.NodeIdPairAllocator,
+                                           *O.ChunkAlloc, 0, 0, Root->FId);
         if (UNLIKELY(TargetRoot == nullptr))
           return;
 
@@ -429,8 +456,9 @@
                return C.FId == Callee.FId;
              });
          if (TargetCallee == nullptr) {
-            auto NewTargetNode = O.Nodes.AppendEmplace(
-                NT.TargetNode, *O.NodeIdPairAllocator, 0, 0, Callee.FId);
+            auto NewTargetNode =
+                O.Nodes.AppendEmplace(NT.TargetNode, *O.NodeIdPairAllocator,
+                                      *O.ChunkAlloc, 0, 0, Callee.FId);
             if (UNLIKELY(NewTargetNode == nullptr))
               return;
 
Index: compiler-rt/lib/xray/xray_profile_collector.cc
===================================================================
--- compiler-rt/lib/xray/xray_profile_collector.cc
+++ compiler-rt/lib/xray/xray_profile_collector.cc
@@ -13,6 +13,7 @@
 //
 //===----------------------------------------------------------------------===//
 #include "xray_profile_collector.h"
+#include "sanitizer_common/sanitizer_allocator_internal.h"
 #include "sanitizer_common/sanitizer_common.h"
 #include "sanitizer_common/sanitizer_vector.h"
 #include "xray_profiling_flags.h"
@@ -117,11 +118,12 @@
   const FunctionCallTrie::Node *Node = nullptr;
 
   // Constructor for in-place construction.
-  ProfileRecord(PathAllocator &A, const FunctionCallTrie::Node *N)
+  ProfileRecord(PathAllocator &A, ChunkAllocator &CA,
+                const FunctionCallTrie::Node *N)
       : Path([&] {
           auto P =
              reinterpret_cast<PathArray *>(InternalAlloc(sizeof(PathArray)));
-          new (P) PathArray(A);
+          new (P) PathArray(A, CA);
          return P;
        }()),
        Node(N) {}
@@ -135,17 +137,20 @@
 // the path(s) and the data associated with the path.
 static void populateRecords(ProfileRecordArray &PRs,
                             ProfileRecord::PathAllocator &PA,
-                            const FunctionCallTrie &Trie) {
+                            ChunkAllocator &CA, const FunctionCallTrie &Trie) {
   using StackArray = Array<const FunctionCallTrie::Node *>;
   using StackAllocator = typename StackArray::AllocatorType;
-  StackAllocator StackAlloc(profilingFlags()->stack_allocator_max, 0);
-  StackArray DFSStack(StackAlloc);
+  StackAllocator StackAlloc(profilingFlags()->stack_allocator_max);
+  ChunkAllocator StackChunkAlloc(profilingFlags()->stack_allocator_max);
+  StackArray DFSStack(StackAlloc, StackChunkAlloc);
   for (const auto R : Trie.getRoots()) {
     DFSStack.Append(R);
     while (!DFSStack.empty()) {
       auto Node = DFSStack.back();
       DFSStack.trim(1);
-      auto Record = PRs.AppendEmplace(PA, Node);
+      auto Record = PRs.AppendEmplace(PA, CA, Node);
+      if (Record == nullptr)
+        return;
       DCHECK_NE(Record, nullptr);
 
       // Traverse the Node's parents and as we're doing so, get the FIds in
@@ -208,10 +213,11 @@
   // Then repopulate the global ProfileBuffers.
   for (u32 I = 0; I < ThreadTries->Size(); ++I) {
     using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
-    ProfileRecordAllocator PRAlloc(profilingFlags()->global_allocator_max, 0);
+    ProfileRecordAllocator PRAlloc(profilingFlags()->global_allocator_max);
+    ChunkAllocator CA(profilingFlags()->global_allocator_max);
     ProfileRecord::PathAllocator PathAlloc(
-        profilingFlags()->global_allocator_max, 0);
-    ProfileRecordArray ProfileRecords(PRAlloc);
+        profilingFlags()->global_allocator_max);
+    ProfileRecordArray ProfileRecords(PRAlloc, CA);
 
     // First, we want to compute the amount of space we're going to need. We'll
     // use a local allocator and an __xray::Array<...> to store the intermediary
@@ -220,7 +226,7 @@
     const auto &Trie = *(*ThreadTries)[I].Trie;
     if (Trie.getRoots().empty())
       continue;
-    populateRecords(ProfileRecords, PathAlloc, Trie);
+    populateRecords(ProfileRecords, PathAlloc, CA, Trie);
     DCHECK(!Trie.getRoots().empty());
     DCHECK(!ProfileRecords.empty());
 
Index: compiler-rt/lib/xray/xray_profiling.cc
===================================================================
--- compiler-rt/lib/xray/xray_profiling.cc
+++ compiler-rt/lib/xray/xray_profiling.cc
@@ -56,8 +56,9 @@
 
 static pthread_key_t ProfilingKey;
 
-ProfilingData &getThreadLocalData() XRAY_NEVER_INSTRUMENT {
-  thread_local std::aligned_storage<sizeof(ProfilingData)>::type ThreadStorage;
+thread_local std::aligned_storage<sizeof(ProfilingData)>::type ThreadStorage{};
+
+static ProfilingData &getThreadLocalData() XRAY_NEVER_INSTRUMENT {
   if (pthread_getspecific(ProfilingKey) == NULL) {
     new (&ThreadStorage) ProfilingData{};
     pthread_setspecific(ProfilingKey, &ThreadStorage);
@@ -86,6 +87,18 @@
   return TLD;
 }
 
+static void cleanupTLD() XRAY_NEVER_INSTRUMENT {
+  auto &TLD = *reinterpret_cast<ProfilingData *>(&ThreadStorage);
+  if (TLD.Allocators != nullptr && TLD.FCT != nullptr) {
+    TLD.FCT->~FunctionCallTrie();
+    TLD.Allocators->~Allocators();
+    InternalFree(TLD.FCT);
+    InternalFree(TLD.Allocators);
+    TLD.FCT = nullptr;
+    TLD.Allocators = nullptr;
+  }
+}
+
 } // namespace
 
 const char *profilingCompilerDefinedFlags() XRAY_NEVER_INSTRUMENT {
@@ -148,6 +161,9 @@
 
   profileCollectorService::reset();
 
+  // Flush the current thread's local data structures as well.
+  cleanupTLD();
+
   atomic_store(&ProfilerLogStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHED,
                memory_order_release);
 
@@ -158,17 +174,12 @@
 
 thread_local atomic_uint8_t ReentranceGuard{0};
 
-void postCurrentThreadFCT(ProfilingData &TLD) {
+static void postCurrentThreadFCT(ProfilingData &TLD) {
   if (TLD.Allocators == nullptr || TLD.FCT == nullptr)
     return;
 
   profileCollectorService::post(*TLD.FCT, GetTid());
-  TLD.FCT->~FunctionCallTrie();
-  TLD.Allocators->~Allocators();
-  InternalFree(TLD.FCT);
-  InternalFree(TLD.Allocators);
-  TLD.FCT = nullptr;
-  TLD.Allocators = nullptr;
+  cleanupTLD();
 }
 
 } // namespace
@@ -294,10 +305,14 @@
   // ABI functions for registering exit handlers.
   Atexit(+[] {
     // Finalize and flush.
-    if (profilingFinalize() != XRAY_LOG_FINALIZED)
+    if (profilingFinalize() != XRAY_LOG_FINALIZED) {
+      cleanupTLD();
       return;
-    if (profilingFlush() != XRAY_LOG_FLUSHED)
+    }
+    if (profilingFlush() != XRAY_LOG_FLUSHED) {
+      cleanupTLD();
       return;
+    }
     if (Verbosity())
       Report("XRay Profile flushed at exit.");
   });
Index: compiler-rt/lib/xray/xray_profiling_flags.inc
===================================================================
--- compiler-rt/lib/xray/xray_profiling_flags.inc
+++ compiler-rt/lib/xray/xray_profiling_flags.inc
@@ -18,7 +18,7 @@
           "Maximum size of any single per-thread allocator.")
 XRAY_FLAG(uptr, global_allocator_max, 2 << 24,
           "Maximum size of the global allocator for profile storage.")
-XRAY_FLAG(uptr, stack_allocator_max, 2 << 24,
+XRAY_FLAG(uptr, stack_allocator_max, 2 << 20,
           "Maximum size of the traversal stack allocator.")
 XRAY_FLAG(int, grace_period_ms, 1,
           "Profile collection will wait this much time in milliseconds before "
Index: compiler-rt/lib/xray/xray_segmented_array.h
===================================================================
--- compiler-rt/lib/xray/xray_segmented_array.h
+++ compiler-rt/lib/xray/xray_segmented_array.h
@@ -24,6 +24,13 @@
 #include <utility>
 
 namespace __xray {
 
+struct Chunk {
+  void *Data;
+  Chunk *Prev;
+  Chunk *Next;
+};
+
+using ChunkAllocator = Allocator<sizeof(Chunk)>;
 /// The Array type provides an interface similar to std::vector<...> but does
 /// not shrink in size. Once constructed, elements can be appended but cannot be
@@ -51,16 +58,10 @@
   // TODO: Consider co-locating the chunk information with the data in the
   // Block, as in an intrusive list -- i.e. putting the next and previous
   // pointer values inside the Block storage.
-  struct Chunk {
-    typename AllocatorType::Block Block;
-    static constexpr size_t Size = N;
-    Chunk *Prev = this;
-    Chunk *Next = this;
-  };
-
   static Chunk SentinelChunk;
 
   AllocatorType *Alloc;
+  ChunkAllocator *ChunkAlloc;
   Chunk *Head = &SentinelChunk;
   Chunk *Tail = &SentinelChunk;
   size_t Size = 0;
@@ -82,30 +83,19 @@
       return FreeChunk;
     }
 
-    auto Block = Alloc->Allocate(ElementStorageSize);
+    auto Block = Alloc->Allocate();
     if (Block.Data == nullptr)
       return nullptr;
 
-    // TODO: Maybe use a separate managed allocator for Chunk instances?
-    auto C = reinterpret_cast<Chunk *>(
-        InternalAlloc(sizeof(Chunk), nullptr, kCacheLineSize));
-    if (C == nullptr)
+    auto ChunkElement = ChunkAlloc->Allocate();
+    if (ChunkElement.Data == nullptr)
       return nullptr;
-    C->Block = Block;
-    C->Prev = &SentinelChunk;
-    C->Next = &SentinelChunk;
-    return C;
-  }
-
-  static AllocatorType &GetGlobalAllocator() {
-    static AllocatorType *const GlobalAllocator = [] {
-      AllocatorType *A = reinterpret_cast<AllocatorType *>(
-          InternalAlloc(sizeof(AllocatorType)));
-      new (A) AllocatorType(2 << 10, 0);
-      return A;
-    }();
-
-    return *GlobalAllocator;
+
+    // Placement-new the Chunk element at the appropriate location.
+    auto C = reinterpret_cast<Chunk *>(ChunkElement.Data);
+    Chunk LocalC{Block.Data, &SentinelChunk, &SentinelChunk};
+    internal_memcpy(C, &LocalC, sizeof(LocalC));
+    return C;
   }
 
   Chunk *InitHeadAndTail() {
@@ -201,21 +191,21 @@
     U &operator*() const {
       DCHECK_NE(C, &SentinelChunk);
       auto RelOff = Offset % N;
-      return *reinterpret_cast<U *>(reinterpret_cast<char *>(C->Block.Data) +
+      return *reinterpret_cast<U *>(reinterpret_cast<char *>(C->Data) +
                                     (RelOff * ElementStorageSize));
     }
 
     U *operator->() const {
       DCHECK_NE(C, &SentinelChunk);
       auto RelOff = Offset % N;
-      return reinterpret_cast<U *>(reinterpret_cast<char *>(C->Block.Data) +
+      return reinterpret_cast<U *>(reinterpret_cast<char *>(C->Data) +
                                    (RelOff * ElementStorageSize));
     }
   };
 
 public:
-  explicit Array(AllocatorType &A) : Alloc(&A) {}
-  Array() : Array(GetGlobalAllocator()) {}
+  explicit Array(AllocatorType &A, ChunkAllocator &CA)
+      : Alloc(&A), ChunkAlloc(&CA) {}
   Array(const Array &) = delete;
   Array(Array &&O) NOEXCEPT : Alloc(O.Alloc),
@@ -246,9 +236,8 @@
     if (AppendNewChunk() == nullptr)
       return nullptr;
 
-    auto Position =
-        reinterpret_cast<T *>(reinterpret_cast<char *>(Tail->Block.Data) +
-                              (Offset * ElementStorageSize));
+    auto Position = reinterpret_cast<T *>(reinterpret_cast<char *>(Tail->Data) +
+                                          (Offset * ElementStorageSize));
     *Position = E;
     ++Size;
     return Position;
@@ -268,7 +257,7 @@
     }
 
     DCHECK_NE(Tail, &SentinelChunk);
-    auto Position = reinterpret_cast<char *>(LatestChunk->Block.Data) +
+    auto Position = reinterpret_cast<char *>(LatestChunk->Data) +
                     (Offset * ElementStorageSize);
     DCHECK_EQ(reinterpret_cast<uintptr_t>(Position) % ElementStorageSize, 0);
     // In-place construct at Position.
@@ -286,7 +275,7 @@
       Offset -= N;
       DCHECK_NE(C, &SentinelChunk);
     }
-    return *reinterpret_cast<T *>(reinterpret_cast<char *>(C->Block.Data) +
+    return *reinterpret_cast<T *>(reinterpret_cast<char *>(C->Data) +
                                   (Offset * ElementStorageSize));
   }
 
@@ -360,7 +349,8 @@
 // ensure that storage for the SentinelChunk is defined and has a single
 // address.
 template <class T, size_t N>
-typename Array<T, N>::Chunk Array<T, N>::SentinelChunk;
+Chunk Array<T, N>::SentinelChunk{nullptr, &Array<T, N>::SentinelChunk,
+                                 &Array<T, N>::SentinelChunk};
 
 } // namespace __xray
Index: compiler-rt/lib/xray/xray_utils.h
===================================================================
--- compiler-rt/lib/xray/xray_utils.h
+++ compiler-rt/lib/xray/xray_utils.h
@@ -15,6 +15,8 @@
 #ifndef XRAY_UTILS_H
 #define XRAY_UTILS_H
 
+#include <cstddef>
+#include <cstdint>
 #include <sys/types.h>
 #include <utility>
 
@@ -54,6 +56,14 @@
   return next_pow2_helper(number, 1);
 }
 
+template <class T> constexpr T &max(T &A, T &B) { return A > B ? A : B; }
+
+template <class T> constexpr T &min(T &A, T &B) { return A <= B ? A : B; }
+
+constexpr ptrdiff_t diff(uintptr_t A, uintptr_t B) {
+  return max(A, B) - min(A, B);
+}
+
 } // namespace __xray
 
 #endif // XRAY_UTILS_H
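Note (not part of the patch): a minimal sketch of how the reworked allocators compose after this change, mirroring the updated unit tests above. The Sample element type and the byte budgets are hypothetical; Allocator, ChunkAllocator, and Array are the __xray types modified in this diff.

  #include "xray_segmented_array.h"
  #include <cstdint>

  namespace {
  // Hypothetical element type, standing in for TestData/NodeIdPair/etc.
  struct Sample {
    int64_t First;
    int64_t Second;
    Sample(int64_t F, int64_t S) : First(F), Second(S) {}
  };
  } // namespace

  void example() {
    using namespace __xray;
    // Each Array<T> now borrows two caller-owned allocators: one sized for the
    // element blocks and one for the Chunk bookkeeping records. Both reserve
    // their backing memory lazily through mmap rather than InternalAlloc.
    typename Array<Sample>::AllocatorType ElementAllocator(1 << 16);
    ChunkAllocator ChunkAlloc(1 << 12);
    Array<Sample> Data(ElementAllocator, ChunkAlloc);
    // Append/AppendEmplace return nullptr once the preallocated budget is spent.
    Data.AppendEmplace(1, 2);
  }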