Index: lib/xray/tests/unit/function_call_trie_test.cc =================================================================== --- lib/xray/tests/unit/function_call_trie_test.cc +++ lib/xray/tests/unit/function_call_trie_test.cc @@ -309,6 +309,36 @@ EXPECT_EQ(F2.Callees.size(), 0u); } +TEST(FunctionCallTrieTest, PlacementNewOnAlignedStorage) { + profilingFlags()->setDefaults(); + typename std::aligned_storage::type + AllocatorsStorage; + new (&AllocatorsStorage) + FunctionCallTrie::Allocators(FunctionCallTrie::InitAllocators()); + auto *A = + reinterpret_cast(&AllocatorsStorage); + + typename std::aligned_storage::type FCTStorage; + new (&FCTStorage) FunctionCallTrie(*A); + auto *T = reinterpret_cast(&FCTStorage); + + // Put some data into it. + T->enterFunction(1, 0, 0); + T->exitFunction(1, 1, 0); + + // Re-initialize the objects in storage. + T->~FunctionCallTrie(); + A->~Allocators(); + new (A) FunctionCallTrie::Allocators(FunctionCallTrie::InitAllocators()); + new (T) FunctionCallTrie(*A); + + // Then put some data into it again. + T->enterFunction(1, 0, 0); + T->exitFunction(1, 1, 0); +} + } // namespace } // namespace __xray Index: lib/xray/tests/unit/segmented_array_test.cc =================================================================== --- lib/xray/tests/unit/segmented_array_test.cc +++ lib/xray/tests/unit/segmented_array_test.cc @@ -221,5 +221,91 @@ } } +TEST(SegmentedArrayTest, PlacementNewOnAlignedStorage) { + using AllocatorType = typename Array::AllocatorType; + typename std::aligned_storage::type AllocatorStorage; + new (&AllocatorStorage) AllocatorType(1 << 10); + auto *A = reinterpret_cast(&AllocatorStorage); + typename std::aligned_storage), + alignof(Array)>::type + ArrayStorage; + new (&ArrayStorage) Array(*A); + auto *Data = reinterpret_cast *>(&ArrayStorage); + + static uint64_t Dummy = 0; + constexpr uint64_t Max = 9; + + for (uint64_t i = 0; i < Max; ++i) { + auto P = Data->Append({i, &Dummy}); + ASSERT_NE(P, nullptr); + ASSERT_EQ(P->NodePtr, &Dummy); + auto &Back = Data->back(); + ASSERT_EQ(Back.NodePtr, &Dummy); + ASSERT_EQ(Back.EntryTSC, i); + } + + // Simulate a stack by checking the data from the end as we're trimming. + auto Counter = Max; + ASSERT_EQ(Data->size(), size_t(Max)); + while (!Data->empty()) { + const auto &Top = Data->back(); + uint64_t *TopNode = Top.NodePtr; + EXPECT_EQ(TopNode, &Dummy) << "Counter = " << Counter; + Data->trim(1); + --Counter; + ASSERT_EQ(Data->size(), size_t(Counter)); + } + + // Once the stack is exhausted, we re-use the storage. + for (uint64_t i = 0; i < Max; ++i) { + auto P = Data->Append({i, &Dummy}); + ASSERT_NE(P, nullptr); + ASSERT_EQ(P->NodePtr, &Dummy); + auto &Back = Data->back(); + ASSERT_EQ(Back.NodePtr, &Dummy); + ASSERT_EQ(Back.EntryTSC, i); + } + + // We re-initialize the storage, by calling the destructor and + // placement-new'ing again. + Data->~Array(); + A->~AllocatorType(); + new (A) AllocatorType(1 << 10); + new (Data) Array(*A); + + // Then re-do the test. + for (uint64_t i = 0; i < Max; ++i) { + auto P = Data->Append({i, &Dummy}); + ASSERT_NE(P, nullptr); + ASSERT_EQ(P->NodePtr, &Dummy); + auto &Back = Data->back(); + ASSERT_EQ(Back.NodePtr, &Dummy); + ASSERT_EQ(Back.EntryTSC, i); + } + + // Simulate a stack by checking the data from the end as we're trimming. 
+ Counter = Max; + ASSERT_EQ(Data->size(), size_t(Max)); + while (!Data->empty()) { + const auto &Top = Data->back(); + uint64_t *TopNode = Top.NodePtr; + EXPECT_EQ(TopNode, &Dummy) << "Counter = " << Counter; + Data->trim(1); + --Counter; + ASSERT_EQ(Data->size(), size_t(Counter)); + } + + // Once the stack is exhausted, we re-use the storage. + for (uint64_t i = 0; i < Max; ++i) { + auto P = Data->Append({i, &Dummy}); + ASSERT_NE(P, nullptr); + ASSERT_EQ(P->NodePtr, &Dummy); + auto &Back = Data->back(); + ASSERT_EQ(Back.NodePtr, &Dummy); + ASSERT_EQ(Back.EntryTSC, i); + } +} + } // namespace } // namespace __xray Index: lib/xray/xray_allocator.h =================================================================== --- lib/xray/xray_allocator.h +++ lib/xray/xray_allocator.h @@ -63,7 +63,7 @@ #else uptr B = internal_mmap(NULL, RoundedSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - int ErrNo; + int ErrNo = 0; if (UNLIKELY(internal_iserror(B, &ErrNo))) { if (Verbosity()) Report( @@ -113,7 +113,7 @@ #else uptr B = internal_mmap(NULL, RoundedSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - int ErrNo; + int ErrNo = 0; if (UNLIKELY(internal_iserror(B, &ErrNo))) { if (Verbosity()) Report( @@ -171,7 +171,7 @@ }; private: - const size_t MaxMemory{0}; + size_t MaxMemory{0}; unsigned char *BackingStore = nullptr; unsigned char *AlignedNextBlock = nullptr; size_t AllocatedBlocks = 0; @@ -223,7 +223,43 @@ public: explicit Allocator(size_t M) XRAY_NEVER_INSTRUMENT - : MaxMemory(RoundUpTo(M, kCacheLineSize)) {} + : MaxMemory(RoundUpTo(M, kCacheLineSize)), + BackingStore(nullptr), + AlignedNextBlock(nullptr), + AllocatedBlocks(0), + Mutex() {} + + Allocator(const Allocator &) = delete; + Allocator &operator=(const Allocator &) = delete; + + Allocator(Allocator &&O) XRAY_NEVER_INSTRUMENT { + SpinMutexLock L0(&Mutex); + SpinMutexLock L1(&O.Mutex); + MaxMemory = O.MaxMemory; + O.MaxMemory = 0; + BackingStore = O.BackingStore; + O.BackingStore = nullptr; + AlignedNextBlock = O.AlignedNextBlock; + O.AlignedNextBlock = nullptr; + AllocatedBlocks = O.AllocatedBlocks; + O.AllocatedBlocks = 0; + } + + Allocator &operator=(Allocator &&O) XRAY_NEVER_INSTRUMENT { + SpinMutexLock L0(&Mutex); + SpinMutexLock L1(&O.Mutex); + MaxMemory = O.MaxMemory; + O.MaxMemory = 0; + if (BackingStore != nullptr) + deallocate(BackingStore, MaxMemory); + BackingStore = O.BackingStore; + O.BackingStore = nullptr; + AlignedNextBlock = O.AlignedNextBlock; + O.AlignedNextBlock = nullptr; + AllocatedBlocks = O.AllocatedBlocks; + O.AllocatedBlocks = 0; + return *this; + } Block Allocate() XRAY_NEVER_INSTRUMENT { return {Alloc()}; } Index: lib/xray/xray_function_call_trie.h =================================================================== --- lib/xray/xray_function_call_trie.h +++ lib/xray/xray_function_call_trie.h @@ -98,9 +98,6 @@ struct NodeIdPair { Node *NodePtr; int32_t FId; - - // Constructor for inplace-construction. - NodeIdPair(Node *N, int32_t F) : NodePtr(N), FId(F) {} }; using NodeIdPairArray = Array; @@ -118,15 +115,6 @@ uint64_t CumulativeLocalTime; // Typically in TSC deltas, not wall-time. int32_t FId; - // We add a constructor here to allow us to inplace-construct through - // Array<...>'s AppendEmplace. - Node(Node *P, NodeIdPairAllocatorType &A, uint64_t CC, uint64_t CLT, - int32_t F) XRAY_NEVER_INSTRUMENT : Parent(P), - Callees(A), - CallCount(CC), - CumulativeLocalTime(CLT), - FId(F) {} - // TODO: Include the compact histogram. 
}; @@ -135,13 +123,6 @@ uint64_t EntryTSC; Node *NodePtr; uint16_t EntryCPU; - - // We add a constructor here to allow us to inplace-construct through - // Array<...>'s AppendEmplace. - ShadowStackEntry(uint64_t T, Node *N, uint16_t C) XRAY_NEVER_INSTRUMENT - : EntryTSC{T}, - NodePtr{N}, - EntryCPU{C} {} }; using NodeArray = Array; @@ -156,20 +137,71 @@ using RootAllocatorType = RootArray::AllocatorType; using ShadowStackAllocatorType = ShadowStackArray::AllocatorType; + // Use hosted aligned storage members to allow for trivial move and init. + // This also allows us to sidestep the potential-failing allocation issue. + typename std::aligned_storage::type + NodeAllocatorStorage; + typename std::aligned_storage::type + RootAllocatorStorage; + typename std::aligned_storage::type + ShadowStackAllocatorStorage; + typename std::aligned_storage::type + NodeIdPairAllocatorStorage; + NodeAllocatorType *NodeAllocator = nullptr; RootAllocatorType *RootAllocator = nullptr; ShadowStackAllocatorType *ShadowStackAllocator = nullptr; NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr; - Allocators() {} + Allocators() = default; Allocators(const Allocators &) = delete; Allocators &operator=(const Allocators &) = delete; - Allocators(Allocators &&O) XRAY_NEVER_INSTRUMENT - : NodeAllocator(O.NodeAllocator), - RootAllocator(O.RootAllocator), - ShadowStackAllocator(O.ShadowStackAllocator), - NodeIdPairAllocator(O.NodeIdPairAllocator) { + explicit Allocators(uptr Max) XRAY_NEVER_INSTRUMENT { + new (&NodeAllocatorStorage) NodeAllocatorType(Max); + NodeAllocator = + reinterpret_cast(&NodeAllocatorStorage); + + new (&RootAllocatorStorage) RootAllocatorType(Max); + RootAllocator = + reinterpret_cast(&RootAllocatorStorage); + + new (&ShadowStackAllocatorStorage) ShadowStackAllocatorType(Max); + ShadowStackAllocator = reinterpret_cast( + &ShadowStackAllocatorStorage); + + new (&NodeIdPairAllocatorStorage) NodeIdPairAllocatorType(Max); + NodeIdPairAllocator = reinterpret_cast( + &NodeIdPairAllocatorStorage); + } + + Allocators(Allocators &&O) XRAY_NEVER_INSTRUMENT { + // Here we rely on the safety of memcpy'ing contents of the storage + // members, and then pointing the source pointers to nullptr. 
+ internal_memcpy(&NodeAllocatorStorage, &O.NodeAllocatorStorage, + sizeof(NodeAllocatorType)); + internal_memcpy(&RootAllocatorStorage, &O.RootAllocatorStorage, + sizeof(RootAllocatorType)); + internal_memcpy(&ShadowStackAllocatorStorage, + &O.ShadowStackAllocatorStorage, + sizeof(ShadowStackAllocatorType)); + internal_memcpy(&NodeIdPairAllocatorStorage, + &O.NodeIdPairAllocatorStorage, + sizeof(NodeIdPairAllocatorType)); + + NodeAllocator = + reinterpret_cast(&NodeAllocatorStorage); + RootAllocator = + reinterpret_cast(&RootAllocatorStorage); + ShadowStackAllocator = reinterpret_cast( + &ShadowStackAllocatorStorage); + NodeIdPairAllocator = reinterpret_cast( + &NodeIdPairAllocatorStorage); + O.NodeAllocator = nullptr; O.RootAllocator = nullptr; O.ShadowStackAllocator = nullptr; @@ -177,79 +209,77 @@ } Allocators &operator=(Allocators &&O) XRAY_NEVER_INSTRUMENT { - { - auto Tmp = O.NodeAllocator; - O.NodeAllocator = this->NodeAllocator; - this->NodeAllocator = Tmp; - } - { - auto Tmp = O.RootAllocator; - O.RootAllocator = this->RootAllocator; - this->RootAllocator = Tmp; - } - { - auto Tmp = O.ShadowStackAllocator; - O.ShadowStackAllocator = this->ShadowStackAllocator; - this->ShadowStackAllocator = Tmp; - } - { - auto Tmp = O.NodeIdPairAllocator; - O.NodeIdPairAllocator = this->NodeIdPairAllocator; - this->NodeIdPairAllocator = Tmp; - } - return *this; - } - - ~Allocators() XRAY_NEVER_INSTRUMENT { - // Note that we cannot use delete on these pointers, as they need to be - // returned to the sanitizer_common library's internal memory tracking - // system. - if (NodeAllocator != nullptr) { + // When moving into an existing instance, we ensure that we clean up the + // current allocators. + if (NodeAllocator) NodeAllocator->~NodeAllocatorType(); - deallocate(NodeAllocator); + if (O.NodeAllocator) { + new (&NodeAllocatorStorage) + NodeAllocatorType(std::move(*O.NodeAllocator)); + NodeAllocator = + reinterpret_cast(&NodeAllocatorStorage); + O.NodeAllocator = nullptr; + } else { NodeAllocator = nullptr; } - if (RootAllocator != nullptr) { + + if (RootAllocator) RootAllocator->~RootAllocatorType(); - deallocate(RootAllocator); + if (O.RootAllocator) { + new (&RootAllocatorStorage) + RootAllocatorType(std::move(*O.RootAllocator)); + RootAllocator = + reinterpret_cast(&RootAllocatorStorage); + O.RootAllocator = nullptr; + } else { RootAllocator = nullptr; } - if (ShadowStackAllocator != nullptr) { + + if (ShadowStackAllocator) ShadowStackAllocator->~ShadowStackAllocatorType(); - deallocate(ShadowStackAllocator); + if (O.ShadowStackAllocator) { + new (&ShadowStackAllocatorStorage) + ShadowStackAllocatorType(std::move(*O.ShadowStackAllocator)); + ShadowStackAllocator = reinterpret_cast( + &ShadowStackAllocatorStorage); + O.ShadowStackAllocator = nullptr; + } else { ShadowStackAllocator = nullptr; } - if (NodeIdPairAllocator != nullptr) { + + if (NodeIdPairAllocator) NodeIdPairAllocator->~NodeIdPairAllocatorType(); - deallocate(NodeIdPairAllocator); + if (O.NodeIdPairAllocator) { + new (&NodeIdPairAllocatorStorage) + NodeIdPairAllocatorType(std::move(*O.NodeIdPairAllocator)); + NodeIdPairAllocator = reinterpret_cast( + &NodeIdPairAllocatorStorage); + O.NodeIdPairAllocator = nullptr; + } else { NodeIdPairAllocator = nullptr; } + + return *this; + } + + ~Allocators() XRAY_NEVER_INSTRUMENT { + if (NodeAllocator != nullptr) + NodeAllocator->~NodeAllocatorType(); + if (RootAllocator != nullptr) + RootAllocator->~RootAllocatorType(); + if (ShadowStackAllocator != nullptr) + 
ShadowStackAllocator->~ShadowStackAllocatorType(); + if (NodeIdPairAllocator != nullptr) + NodeIdPairAllocator->~NodeIdPairAllocatorType(); } }; - // TODO: Support configuration of options through the arguments. static Allocators InitAllocators() XRAY_NEVER_INSTRUMENT { return InitAllocatorsCustom(profilingFlags()->per_thread_allocator_max); } static Allocators InitAllocatorsCustom(uptr Max) XRAY_NEVER_INSTRUMENT { - Allocators A; - auto NodeAllocator = allocate(); - new (NodeAllocator) Allocators::NodeAllocatorType(Max); - A.NodeAllocator = NodeAllocator; - - auto RootAllocator = allocate(); - new (RootAllocator) Allocators::RootAllocatorType(Max); - A.RootAllocator = RootAllocator; - - auto ShadowStackAllocator = - allocate(); - new (ShadowStackAllocator) Allocators::ShadowStackAllocatorType(Max); - A.ShadowStackAllocator = ShadowStackAllocator; - - auto NodeIdPairAllocator = allocate(); - new (NodeIdPairAllocator) NodeIdPairAllocatorType(Max); - A.NodeIdPairAllocator = NodeIdPairAllocator; + Allocators A(Max); return A; } @@ -257,14 +287,38 @@ NodeArray Nodes; RootArray Roots; ShadowStackArray ShadowStack; - NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr; + NodeIdPairAllocatorType *NodeIdPairAllocator; + uint32_t OverflowedFunctions; public: explicit FunctionCallTrie(const Allocators &A) XRAY_NEVER_INSTRUMENT : Nodes(*A.NodeAllocator), Roots(*A.RootAllocator), ShadowStack(*A.ShadowStackAllocator), - NodeIdPairAllocator(A.NodeIdPairAllocator) {} + NodeIdPairAllocator(A.NodeIdPairAllocator), + OverflowedFunctions(0) {} + + FunctionCallTrie() = delete; + FunctionCallTrie(const FunctionCallTrie &) = delete; + FunctionCallTrie &operator=(const FunctionCallTrie &) = delete; + + FunctionCallTrie(FunctionCallTrie &&O) XRAY_NEVER_INSTRUMENT + : Nodes(std::move(O.Nodes)), + Roots(std::move(O.Roots)), + ShadowStack(std::move(O.ShadowStack)), + NodeIdPairAllocator(O.NodeIdPairAllocator), + OverflowedFunctions(O.OverflowedFunctions) {} + + FunctionCallTrie &operator=(FunctionCallTrie &&O) XRAY_NEVER_INSTRUMENT { + Nodes = std::move(O.Nodes); + Roots = std::move(O.Roots); + ShadowStack = std::move(O.ShadowStack); + NodeIdPairAllocator = O.NodeIdPairAllocator; + OverflowedFunctions = O.OverflowedFunctions; + return *this; + } + + ~FunctionCallTrie() XRAY_NEVER_INSTRUMENT {} void enterFunction(const int32_t FId, uint64_t TSC, uint16_t CPU) XRAY_NEVER_INSTRUMENT { @@ -272,12 +326,17 @@ // This function primarily deals with ensuring that the ShadowStack is // consistent and ready for when an exit event is encountered. if (UNLIKELY(ShadowStack.empty())) { - auto NewRoot = - Nodes.AppendEmplace(nullptr, *NodeIdPairAllocator, 0u, 0u, FId); + auto NewRoot = Nodes.AppendEmplace( + nullptr, NodeIdPairArray{*NodeIdPairAllocator}, 0u, 0u, FId); if (UNLIKELY(NewRoot == nullptr)) return; - Roots.Append(NewRoot); - ShadowStack.AppendEmplace(TSC, NewRoot, CPU); + if (Roots.Append(NewRoot) == nullptr) + return; + if (ShadowStack.AppendEmplace(TSC, NewRoot, CPU) == nullptr) { + Roots.trim(1); + ++OverflowedFunctions; + return; + } return; } @@ -291,29 +350,39 @@ [FId](const NodeIdPair &NR) { return NR.FId == FId; }); if (Callee != nullptr) { CHECK_NE(Callee->NodePtr, nullptr); - ShadowStack.AppendEmplace(TSC, Callee->NodePtr, CPU); + if (ShadowStack.AppendEmplace(TSC, Callee->NodePtr, CPU) == nullptr) + ++OverflowedFunctions; return; } // This means we've never seen this stack before, create a new node here. 
- auto NewNode = - Nodes.AppendEmplace(TopNode, *NodeIdPairAllocator, 0u, 0u, FId); + auto NewNode = Nodes.AppendEmplace( + TopNode, NodeIdPairArray(*NodeIdPairAllocator), 0u, 0u, FId); if (UNLIKELY(NewNode == nullptr)) return; DCHECK_NE(NewNode, nullptr); TopNode->Callees.AppendEmplace(NewNode, FId); - ShadowStack.AppendEmplace(TSC, NewNode, CPU); + if (ShadowStack.AppendEmplace(TSC, NewNode, CPU) == nullptr) + ++OverflowedFunctions; DCHECK_NE(ShadowStack.back().NodePtr, nullptr); return; } void exitFunction(int32_t FId, uint64_t TSC, uint16_t CPU) XRAY_NEVER_INSTRUMENT { + // If we're exiting functions that have "overflowed" or don't fit into the + // stack due to allocator constraints, we then decrement that count first. + if (OverflowedFunctions) { + --OverflowedFunctions; + return; + } + // When we exit a function, we look up the ShadowStack to see whether we've // entered this function before. We do as little processing here as we can, // since most of the hard work would have already been done at function // entry. uint64_t CumulativeTreeTime = 0; + while (!ShadowStack.empty()) { const auto &Top = ShadowStack.back(); auto TopNode = Top.NodePtr; @@ -380,7 +449,7 @@ for (const auto Root : getRoots()) { // Add a node in O for this root. auto NewRoot = O.Nodes.AppendEmplace( - nullptr, *O.NodeIdPairAllocator, Root->CallCount, + nullptr, NodeIdPairArray(*O.NodeIdPairAllocator), Root->CallCount, Root->CumulativeLocalTime, Root->FId); // Because we cannot allocate more memory we should bail out right away. @@ -399,8 +468,9 @@ DFSStack.trim(1); for (const auto Callee : NP.Node->Callees) { auto NewNode = O.Nodes.AppendEmplace( - NP.NewNode, *O.NodeIdPairAllocator, Callee.NodePtr->CallCount, - Callee.NodePtr->CumulativeLocalTime, Callee.FId); + NP.NewNode, NodeIdPairArray(*O.NodeIdPairAllocator), + Callee.NodePtr->CallCount, Callee.NodePtr->CumulativeLocalTime, + Callee.FId); if (UNLIKELY(NewNode == nullptr)) return; NP.NewNode->Callees.AppendEmplace(NewNode, Callee.FId); @@ -433,8 +503,9 @@ auto R = O.Roots.find_element( [&](const Node *Node) { return Node->FId == Root->FId; }); if (R == nullptr) { - TargetRoot = O.Nodes.AppendEmplace(nullptr, *O.NodeIdPairAllocator, 0u, - 0u, Root->FId); + TargetRoot = O.Nodes.AppendEmplace( + nullptr, NodeIdPairArray(*O.NodeIdPairAllocator), 0u, 0u, + Root->FId); if (UNLIKELY(TargetRoot == nullptr)) return; @@ -459,7 +530,8 @@ }); if (TargetCallee == nullptr) { auto NewTargetNode = O.Nodes.AppendEmplace( - NT.TargetNode, *O.NodeIdPairAllocator, 0u, 0u, Callee.FId); + NT.TargetNode, NodeIdPairArray(*O.NodeIdPairAllocator), 0u, 0u, + Callee.FId); if (UNLIKELY(NewTargetNode == nullptr)) return; Index: lib/xray/xray_profile_collector.cc =================================================================== --- lib/xray/xray_profile_collector.cc +++ lib/xray/xray_profile_collector.cc @@ -86,7 +86,8 @@ void post(const FunctionCallTrie &T, tid_t TId) XRAY_NEVER_INSTRUMENT { static pthread_once_t Once = PTHREAD_ONCE_INIT; - pthread_once(&Once, +[] { reset(); }); + pthread_once( + &Once, +[]() XRAY_NEVER_INSTRUMENT { reset(); }); ThreadTrie *Item = nullptr; { @@ -95,13 +96,14 @@ return; Item = ThreadTries->Append({}); + if (Item == nullptr) + return; + Item->TId = TId; auto Trie = reinterpret_cast(&Item->TrieStorage); new (Trie) FunctionCallTrie(*GlobalAllocators); + T.deepCopyInto(*Trie); } - - auto Trie = reinterpret_cast(&Item->TrieStorage); - T.deepCopyInto(*Trie); } // A PathArray represents the function id's representing a stack trace. 
In this @@ -115,13 +117,7 @@ // The Path in this record is the function id's from the leaf to the root of // the function call stack as represented from a FunctionCallTrie. PathArray Path; - const FunctionCallTrie::Node *Node = nullptr; - - // Constructor for in-place construction. - ProfileRecord(PathAllocator &A, - const FunctionCallTrie::Node *N) XRAY_NEVER_INSTRUMENT - : Path(A), - Node(N) {} + const FunctionCallTrie::Node *Node; }; namespace { @@ -142,7 +138,7 @@ while (!DFSStack.empty()) { auto Node = DFSStack.back(); DFSStack.trim(1); - auto Record = PRs.AppendEmplace(PA, Node); + auto Record = PRs.AppendEmplace(PathArray{PA}, Node); if (Record == nullptr) return; DCHECK_NE(Record, nullptr); @@ -203,7 +199,7 @@ // Clear out the global ProfileBuffers, if it's not empty. for (auto &B : *ProfileBuffers) - deallocateBuffer(reinterpret_cast(B.Data), B.Size); + deallocateBuffer(reinterpret_cast(B.Data), B.Size); ProfileBuffers->trim(ProfileBuffers->size()); if (ThreadTries->empty()) @@ -278,8 +274,8 @@ GlobalAllocators = reinterpret_cast(&AllocatorStorage); - new (GlobalAllocators) FunctionCallTrie::Allocators(); - *GlobalAllocators = FunctionCallTrie::InitAllocators(); + new (GlobalAllocators) + FunctionCallTrie::Allocators(FunctionCallTrie::InitAllocators()); if (ThreadTriesAllocator != nullptr) ThreadTriesAllocator->~ThreadTriesArrayAllocator(); @@ -312,8 +308,10 @@ static pthread_once_t Once = PTHREAD_ONCE_INIT; static typename std::aligned_storage::type FileHeaderStorage; - pthread_once(&Once, - +[] { new (&FileHeaderStorage) XRayProfilingFileHeader{}; }); + pthread_once( + &Once, +[]() XRAY_NEVER_INSTRUMENT { + new (&FileHeaderStorage) XRayProfilingFileHeader{}; + }); if (UNLIKELY(B.Data == nullptr)) { // The first buffer should always contain the file header information. 
Index: lib/xray/xray_profiling.cc =================================================================== --- lib/xray/xray_profiling.cc +++ lib/xray/xray_profiling.cc @@ -31,67 +31,112 @@ namespace { -atomic_sint32_t ProfilerLogFlushStatus = { +static atomic_sint32_t ProfilerLogFlushStatus = { XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING}; -atomic_sint32_t ProfilerLogStatus = {XRayLogInitStatus::XRAY_LOG_UNINITIALIZED}; +static atomic_sint32_t ProfilerLogStatus = { + XRayLogInitStatus::XRAY_LOG_UNINITIALIZED}; -SpinMutex ProfilerOptionsMutex; +static SpinMutex ProfilerOptionsMutex; -struct alignas(64) ProfilingData { - FunctionCallTrie::Allocators *Allocators; - FunctionCallTrie *FCT; +struct ProfilingData { + atomic_uintptr_t Allocators; + atomic_uintptr_t FCT; }; static pthread_key_t ProfilingKey; -thread_local std::aligned_storage::type +thread_local std::aligned_storage::type AllocatorsStorage; -thread_local std::aligned_storage::type +thread_local std::aligned_storage::type FunctionCallTrieStorage; -thread_local std::aligned_storage::type ThreadStorage{}; +thread_local ProfilingData TLD{{0}, {0}}; +thread_local atomic_uint8_t ReentranceGuard{0}; -static ProfilingData &getThreadLocalData() XRAY_NEVER_INSTRUMENT { - thread_local auto ThreadOnce = [] { - new (&ThreadStorage) ProfilingData{}; - auto *Allocators = - reinterpret_cast(&AllocatorsStorage); - new (Allocators) FunctionCallTrie::Allocators(); - *Allocators = FunctionCallTrie::InitAllocators(); - auto *FCT = reinterpret_cast(&FunctionCallTrieStorage); - new (FCT) FunctionCallTrie(*Allocators); - auto &TLD = *reinterpret_cast(&ThreadStorage); - TLD.Allocators = Allocators; - TLD.FCT = FCT; - pthread_setspecific(ProfilingKey, &ThreadStorage); +// We use a separate guard for ensuring that for this thread, if we're already +// cleaning up, that any signal handlers don't attempt to cleanup nor +// initialise. +thread_local atomic_uint8_t TLDInitGuard{0}; + +// We also use a separate latch to signal that the thread is exiting, and +// non-essential work should be ignored (things like recording events, etc.). 
+thread_local atomic_uint8_t ThreadExitingLatch{0}; + +static ProfilingData *getThreadLocalData() XRAY_NEVER_INSTRUMENT { + thread_local auto ThreadOnce = []() XRAY_NEVER_INSTRUMENT { + pthread_setspecific(ProfilingKey, &TLD); return false; }(); (void)ThreadOnce; - auto &TLD = *reinterpret_cast(&ThreadStorage); - - if (UNLIKELY(TLD.Allocators == nullptr || TLD.FCT == nullptr)) { - auto *Allocators = - reinterpret_cast(&AllocatorsStorage); - new (Allocators) FunctionCallTrie::Allocators(); - *Allocators = FunctionCallTrie::InitAllocators(); - auto *FCT = reinterpret_cast(&FunctionCallTrieStorage); - new (FCT) FunctionCallTrie(*Allocators); - TLD.Allocators = Allocators; - TLD.FCT = FCT; + RecursionGuard TLDInit(TLDInitGuard); + if (!TLDInit) + return nullptr; + + if (atomic_load_relaxed(&ThreadExitingLatch)) + return nullptr; + + uintptr_t Allocators = 0; + if (atomic_compare_exchange_strong(&TLD.Allocators, &Allocators, 1, + memory_order_acq_rel)) { + new (&AllocatorsStorage) + FunctionCallTrie::Allocators(FunctionCallTrie::InitAllocators()); + Allocators = reinterpret_cast( + reinterpret_cast(&AllocatorsStorage)); + atomic_store(&TLD.Allocators, Allocators, memory_order_release); + } + + uintptr_t FCT = 0; + if (atomic_compare_exchange_strong(&TLD.FCT, &FCT, 1, memory_order_acq_rel)) { + new (&FunctionCallTrieStorage) FunctionCallTrie( + *reinterpret_cast(Allocators)); + FCT = reinterpret_cast( + reinterpret_cast(&FunctionCallTrieStorage)); + atomic_store(&TLD.FCT, FCT, memory_order_release); } - return *reinterpret_cast(&ThreadStorage); + if (FCT == 1) + return nullptr; + + return &TLD; } static void cleanupTLD() XRAY_NEVER_INSTRUMENT { - auto &TLD = *reinterpret_cast(&ThreadStorage); - if (TLD.Allocators != nullptr && TLD.FCT != nullptr) { - TLD.FCT->~FunctionCallTrie(); - TLD.Allocators->~Allocators(); - TLD.FCT = nullptr; - TLD.Allocators = nullptr; - } + RecursionGuard TLDInit(TLDInitGuard); + if (!TLDInit) + return; + + auto FCT = atomic_exchange(&TLD.FCT, 0, memory_order_acq_rel); + if (FCT == reinterpret_cast(reinterpret_cast( + &FunctionCallTrieStorage))) + reinterpret_cast(FCT)->~FunctionCallTrie(); + + auto Allocators = atomic_exchange(&TLD.Allocators, 0, memory_order_acq_rel); + if (Allocators == + reinterpret_cast( + reinterpret_cast(&AllocatorsStorage))) + reinterpret_cast(Allocators)->~Allocators(); +} + +static void postCurrentThreadFCT(ProfilingData &T) XRAY_NEVER_INSTRUMENT { + RecursionGuard TLDInit(TLDInitGuard); + if (!TLDInit) + return; + + uintptr_t P = atomic_load(&T.FCT, memory_order_acquire); + if (P != reinterpret_cast( + reinterpret_cast(&FunctionCallTrieStorage))) + return; + + auto FCT = reinterpret_cast(P); + DCHECK_NE(FCT, nullptr); + + if (!FCT->getRoots().empty()) + profileCollectorService::post(*FCT, GetTid()); + + cleanupTLD(); } } // namespace @@ -104,9 +149,6 @@ #endif } -atomic_sint32_t ProfileFlushStatus = { - XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING}; - XRayLogFlushStatus profilingFlush() XRAY_NEVER_INSTRUMENT { if (atomic_load(&ProfilerLogStatus, memory_order_acquire) != XRayLogInitStatus::XRAY_LOG_FINALIZED) { @@ -115,14 +157,27 @@ return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING; } - s32 Result = XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING; - if (!atomic_compare_exchange_strong(&ProfilerLogFlushStatus, &Result, - XRayLogFlushStatus::XRAY_LOG_FLUSHING, - memory_order_acq_rel)) { + RecursionGuard SignalGuard(ReentranceGuard); + if (!SignalGuard) { if (Verbosity()) - Report("Not flushing profiles, implementation still finalizing.\n"); + 
Report("Cannot finalize properly inside a signal handler!\n"); + atomic_store(&ProfilerLogFlushStatus, + XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING, + memory_order_release); + return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING; } + s32 Previous = atomic_exchange(&ProfilerLogFlushStatus, + XRayLogFlushStatus::XRAY_LOG_FLUSHING, + memory_order_acq_rel); + if (Previous == XRayLogFlushStatus::XRAY_LOG_FLUSHING) { + if (Verbosity()) + Report("Not flushing profiles, implementation still flushing.\n"); + return XRayLogFlushStatus::XRAY_LOG_FLUSHING; + } + + postCurrentThreadFCT(TLD); + // At this point, we'll create the file that will contain the profile, but // only if the options say so. if (!profilingFlags()->no_flush) { @@ -150,33 +205,19 @@ } } - profileCollectorService::reset(); - - // Flush the current thread's local data structures as well. + // Clean up the current thread's TLD information as well. cleanupTLD(); + profileCollectorService::reset(); + + atomic_store(&ProfilerLogFlushStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHED, + memory_order_release); atomic_store(&ProfilerLogStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHED, memory_order_release); return XRayLogFlushStatus::XRAY_LOG_FLUSHED; } -namespace { - -thread_local atomic_uint8_t ReentranceGuard{0}; - -static void postCurrentThreadFCT(ProfilingData &TLD) XRAY_NEVER_INSTRUMENT { - if (TLD.Allocators == nullptr || TLD.FCT == nullptr) - return; - - if (!TLD.FCT->getRoots().empty()) - profileCollectorService::post(*TLD.FCT, GetTid()); - - cleanupTLD(); -} - -} // namespace - void profilingHandleArg0(int32_t FuncId, XRayEntryType Entry) XRAY_NEVER_INSTRUMENT { unsigned char CPU; @@ -186,22 +227,29 @@ return; auto Status = atomic_load(&ProfilerLogStatus, memory_order_acquire); + if (UNLIKELY(Status == XRayLogInitStatus::XRAY_LOG_UNINITIALIZED || + Status == XRayLogInitStatus::XRAY_LOG_INITIALIZING)) + return; + if (UNLIKELY(Status == XRayLogInitStatus::XRAY_LOG_FINALIZED || Status == XRayLogInitStatus::XRAY_LOG_FINALIZING)) { - auto &TLD = getThreadLocalData(); postCurrentThreadFCT(TLD); return; } - auto &TLD = getThreadLocalData(); + auto T = getThreadLocalData(); + if (T == nullptr) + return; + + auto FCT = reinterpret_cast(atomic_load_relaxed(&T->FCT)); switch (Entry) { case XRayEntryType::ENTRY: case XRayEntryType::LOG_ARGS_ENTRY: - TLD.FCT->enterFunction(FuncId, TSC, CPU); + FCT->enterFunction(FuncId, TSC, CPU); break; case XRayEntryType::EXIT: case XRayEntryType::TAIL: - TLD.FCT->exitFunction(FuncId, TSC, CPU); + FCT->exitFunction(FuncId, TSC, CPU); break; default: // FIXME: Handle bugs. @@ -227,15 +275,14 @@ // Wait a grace period to allow threads to see that we're finalizing. SleepForMillis(profilingFlags()->grace_period_ms); - // We also want to make sure that the current thread's data is cleaned up, if - // we have any. We need to ensure that the call to postCurrentThreadFCT() is - // guarded by our recursion guard. - auto &TLD = getThreadLocalData(); - { - RecursionGuard G(ReentranceGuard); - if (G) - postCurrentThreadFCT(TLD); - } + // If we for some reason are entering this function from an instrumented + // handler, we bail out. + RecursionGuard G(ReentranceGuard); + if (!G) + return static_cast(CurrentStatus); + + // Post the current thread's data if we have any. + postCurrentThreadFCT(TLD); // Then we force serialize the log data. 
profileCollectorService::serialize(); @@ -248,6 +295,10 @@ XRayLogInitStatus profilingLoggingInit(UNUSED size_t BufferSize, UNUSED size_t BufferMax, void *Options, size_t OptionsSize) XRAY_NEVER_INSTRUMENT { + RecursionGuard G(ReentranceGuard); + if (!G) + return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; + s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; if (!atomic_compare_exchange_strong(&ProfilerLogStatus, &CurrentStatus, XRayLogInitStatus::XRAY_LOG_INITIALIZING, @@ -282,39 +333,51 @@ // We need to set up the exit handlers. static pthread_once_t Once = PTHREAD_ONCE_INIT; - pthread_once(&Once, +[] { - pthread_key_create(&ProfilingKey, +[](void *P) { - // This is the thread-exit handler. - auto &TLD = *reinterpret_cast(P); - if (TLD.Allocators == nullptr && TLD.FCT == nullptr) - return; - - { - // If we're somehow executing this while inside a non-reentrant-friendly - // context, we skip attempting to post the current thread's data. - RecursionGuard G(ReentranceGuard); - if (G) - postCurrentThreadFCT(TLD); - } - }); - - // We also need to set up an exit handler, so that we can get the profile - // information at exit time. We use the C API to do this, to not rely on C++ - // ABI functions for registering exit handlers. - Atexit(+[] { - // Finalize and flush. - if (profilingFinalize() != XRAY_LOG_FINALIZED) { - cleanupTLD(); - return; - } - if (profilingFlush() != XRAY_LOG_FLUSHED) { - cleanupTLD(); - return; - } - if (Verbosity()) - Report("XRay Profile flushed at exit."); - }); - }); + pthread_once( + &Once, +[] { + pthread_key_create( + &ProfilingKey, +[](void *P) XRAY_NEVER_INSTRUMENT { + if (atomic_exchange(&ThreadExitingLatch, 1, memory_order_acq_rel)) + return; + + if (P == nullptr) + return; + + auto T = reinterpret_cast(P); + if (atomic_load_relaxed(&T->Allocators) == 0) + return; + + { + // If we're somehow executing this while inside a + // non-reentrant-friendly context, we skip attempting to post + // the current thread's data. + RecursionGuard G(ReentranceGuard); + if (!G) + return; + + postCurrentThreadFCT(*T); + } + }); + + // We also need to set up an exit handler, so that we can get the + // profile information at exit time. We use the C API to do this, to not + // rely on C++ ABI functions for registering exit handlers. + Atexit(+[]() XRAY_NEVER_INSTRUMENT { + if (atomic_exchange(&ThreadExitingLatch, 1, memory_order_acq_rel)) + return; + + auto Cleanup = + at_scope_exit([]() XRAY_NEVER_INSTRUMENT { cleanupTLD(); }); + + // Finalize and flush. + if (profilingFinalize() != XRAY_LOG_FINALIZED || + profilingFlush() != XRAY_LOG_FLUSHED) + return; + + if (Verbosity()) + Report("XRay Profile flushed at exit."); + }); + }); __xray_log_set_buffer_iterator(profileCollectorService::nextBuffer); __xray_set_handler(profilingHandleArg0); Index: lib/xray/xray_segmented_array.h =================================================================== --- lib/xray/xray_segmented_array.h +++ lib/xray/xray_segmented_array.h @@ -32,14 +32,9 @@ /// is destroyed. When an Array is destroyed, it will destroy elements in the /// backing store but will not free the memory. template class Array { - struct SegmentBase { - SegmentBase *Prev; - SegmentBase *Next; - }; - - // We want each segment of the array to be cache-line aligned, and elements of - // the array be offset from the beginning of the segment. 
- struct Segment : SegmentBase { + struct Segment { + Segment *Prev; + Segment *Next; char Data[1]; }; @@ -62,91 +57,35 @@ // kCacheLineSize-multiple segments, minus the size of two pointers. // // - Request cacheline-multiple sized elements from the allocator. - static constexpr size_t AlignedElementStorageSize = + static constexpr uint64_t AlignedElementStorageSize = sizeof(typename std::aligned_storage::type); - static constexpr size_t SegmentSize = - nearest_boundary(sizeof(Segment) + next_pow2(sizeof(T)), kCacheLineSize); + static constexpr uint64_t SegmentControlBlockSize = sizeof(Segment *) * 2; + + static constexpr uint64_t SegmentSize = nearest_boundary( + SegmentControlBlockSize + next_pow2(sizeof(T)), kCacheLineSize); using AllocatorType = Allocator; - static constexpr size_t ElementsPerSegment = - (SegmentSize - sizeof(Segment)) / next_pow2(sizeof(T)); + static constexpr uint64_t ElementsPerSegment = + (SegmentSize - SegmentControlBlockSize) / next_pow2(sizeof(T)); static_assert(ElementsPerSegment > 0, "Must have at least 1 element per segment."); - static SegmentBase SentinelSegment; + static Segment SentinelSegment; - using size_type = size_t; + using size_type = uint64_t; private: - AllocatorType *Alloc; - SegmentBase *Head = &SentinelSegment; - SegmentBase *Tail = &SentinelSegment; - size_t Size = 0; - - // Here we keep track of segments in the freelist, to allow us to re-use - // segments when elements are trimmed off the end. - SegmentBase *Freelist = &SentinelSegment; - - Segment *NewSegment() XRAY_NEVER_INSTRUMENT { - // We need to handle the case in which enough elements have been trimmed to - // allow us to re-use segments we've allocated before. For this we look into - // the Freelist, to see whether we need to actually allocate new blocks or - // just re-use blocks we've already seen before. - if (Freelist != &SentinelSegment) { - auto *FreeSegment = Freelist; - Freelist = FreeSegment->Next; - FreeSegment->Next = &SentinelSegment; - Freelist->Prev = &SentinelSegment; - return static_cast(FreeSegment); - } - - auto SegmentBlock = Alloc->Allocate(); - if (SegmentBlock.Data == nullptr) - return nullptr; - - // Placement-new the Segment element at the beginning of the SegmentBlock. - auto S = reinterpret_cast(SegmentBlock.Data); - new (S) SegmentBase{&SentinelSegment, &SentinelSegment}; - return S; - } - - Segment *InitHeadAndTail() XRAY_NEVER_INSTRUMENT { - DCHECK_EQ(Head, &SentinelSegment); - DCHECK_EQ(Tail, &SentinelSegment); - auto Segment = NewSegment(); - if (Segment == nullptr) - return nullptr; - DCHECK_EQ(Segment->Next, &SentinelSegment); - DCHECK_EQ(Segment->Prev, &SentinelSegment); - Head = Tail = static_cast(Segment); - return Segment; - } - - Segment *AppendNewSegment() XRAY_NEVER_INSTRUMENT { - auto S = NewSegment(); - if (S == nullptr) - return nullptr; - DCHECK_NE(Tail, &SentinelSegment); - DCHECK_EQ(Tail->Next, &SentinelSegment); - DCHECK_EQ(S->Prev, &SentinelSegment); - DCHECK_EQ(S->Next, &SentinelSegment); - Tail->Next = S; - S->Prev = Tail; - Tail = S; - return static_cast(Tail); - } - // This Iterator models a BidirectionalIterator. 
template class Iterator { - SegmentBase *S = &SentinelSegment; - size_t Offset = 0; - size_t Size = 0; + Segment *S = &SentinelSegment; + uint64_t Offset = 0; + uint64_t Size = 0; public: - Iterator(SegmentBase *IS, size_t Off, size_t S) XRAY_NEVER_INSTRUMENT + Iterator(Segment *IS, uint64_t Off, uint64_t S) XRAY_NEVER_INSTRUMENT : S(IS), Offset(Off), Size(S) {} @@ -215,7 +154,7 @@ // We need to compute the character-aligned pointer, offset from the // segment's Data location to get the element in the position of Offset. - auto Base = static_cast(S)->Data; + auto Base = &S->Data; auto AlignedOffset = Base + (RelOff * AlignedElementStorageSize); return *reinterpret_cast(AlignedOffset); } @@ -223,17 +162,183 @@ U *operator->() const XRAY_NEVER_INSTRUMENT { return &(**this); } }; + AllocatorType *Alloc; + Segment *Head; + Segment *Tail; + + // Here we keep track of segments in the freelist, to allow us to re-use + // segments when elements are trimmed off the end. + Segment *Freelist; + uint64_t Size; + + // =============================== + // In the following implementation, we work through the algorithms and the + // list operations using the following notation: + // + // - pred(s) is the predecessor (previous node accessor) and succ(s) is + // the successor (next node accessor). + // + // - S is a sentinel segment, which has the following property: + // + // pred(S) == succ(S) == S + // + // - @ is a loop operator, which can imply pred(s) == s if it appears on + // the left of s, or succ(s) == S if it appears on the right of s. + // + // - sL <-> sR : means a bidirectional relation between sL and sR, which + // means: + // + // succ(sL) == sR && pred(SR) == sL + // + // - sL -> sR : implies a unidirectional relation between sL and SR, + // with the following properties: + // + // succ(sL) == sR + // + // sL <- sR : implies a unidirectional relation between sR and sL, + // with the following properties: + // + // pred(sR) == sL + // + // =============================== + + Segment *NewSegment() XRAY_NEVER_INSTRUMENT { + // We need to handle the case in which enough elements have been trimmed to + // allow us to re-use segments we've allocated before. For this we look into + // the Freelist, to see whether we need to actually allocate new blocks or + // just re-use blocks we've already seen before. + if (Freelist != &SentinelSegment) { + // The current state of lists resemble something like this at this point: + // + // Freelist: @S@<-f0->...<->fN->@S@ + // ^ Freelist + // + // We want to perform a splice of `f0` from Freelist to a temporary list, + // which looks like: + // + // Templist: @S@<-f0->@S@ + // ^ FreeSegment + // + // Our algorithm preconditions are: + DCHECK_EQ(Freelist->Prev, &SentinelSegment); + + // Then the algorithm we implement is: + // + // SFS = Freelist + // Freelist = succ(Freelist) + // if (Freelist != S) + // pred(Freelist) = S + // succ(SFS) = S + // pred(SFS) = S + // + auto *FreeSegment = Freelist; + Freelist = Freelist->Next; + + // Note that we need to handle the case where Freelist is now pointing to + // S, which we don't want to be overwriting. + // TODO: Determine whether the cost of the branch is higher than the cost + // of the blind assignment. 
+ if (Freelist != &SentinelSegment) + Freelist->Prev = &SentinelSegment; + + FreeSegment->Next = &SentinelSegment; + FreeSegment->Prev = &SentinelSegment; + + // Our postconditions are: + DCHECK_EQ(Freelist->Prev, &SentinelSegment); + DCHECK_NE(FreeSegment, &SentinelSegment); + return FreeSegment; + } + + auto SegmentBlock = Alloc->Allocate(); + if (SegmentBlock.Data == nullptr) + return nullptr; + + // Placement-new the Segment element at the beginning of the SegmentBlock. + new (SegmentBlock.Data) Segment{&SentinelSegment, &SentinelSegment, {0}}; + auto SB = reinterpret_cast(SegmentBlock.Data); + return SB; + } + + Segment *InitHeadAndTail() XRAY_NEVER_INSTRUMENT { + DCHECK_EQ(Head, &SentinelSegment); + DCHECK_EQ(Tail, &SentinelSegment); + auto S = NewSegment(); + if (S == nullptr) + return nullptr; + DCHECK_EQ(S->Next, &SentinelSegment); + DCHECK_EQ(S->Prev, &SentinelSegment); + DCHECK_NE(S, &SentinelSegment); + Head = S; + Tail = S; + DCHECK_EQ(Head, Tail); + DCHECK_EQ(Tail->Next, &SentinelSegment); + DCHECK_EQ(Tail->Prev, &SentinelSegment); + return S; + } + + Segment *AppendNewSegment() XRAY_NEVER_INSTRUMENT { + auto S = NewSegment(); + if (S == nullptr) + return nullptr; + DCHECK_NE(Tail, &SentinelSegment); + DCHECK_EQ(Tail->Next, &SentinelSegment); + DCHECK_EQ(S->Prev, &SentinelSegment); + DCHECK_EQ(S->Next, &SentinelSegment); + S->Prev = Tail; + Tail->Next = S; + Tail = S; + DCHECK_EQ(S, S->Prev->Next); + DCHECK_EQ(Tail->Next, &SentinelSegment); + return S; + } + public: - explicit Array(AllocatorType &A) XRAY_NEVER_INSTRUMENT : Alloc(&A) {} + explicit Array(AllocatorType &A) XRAY_NEVER_INSTRUMENT + : Alloc(&A), + Head(&SentinelSegment), + Tail(&SentinelSegment), + Freelist(&SentinelSegment), + Size(0) {} + + Array() XRAY_NEVER_INSTRUMENT : Alloc(nullptr), + Head(&SentinelSegment), + Tail(&SentinelSegment), + Freelist(&SentinelSegment), + Size(0) {} Array(const Array &) = delete; - Array(Array &&O) NOEXCEPT : Alloc(O.Alloc), - Head(O.Head), - Tail(O.Tail), - Size(O.Size) { + Array &operator=(const Array &) = delete; + + Array(Array &&O) XRAY_NEVER_INSTRUMENT : Alloc(O.Alloc), + Head(O.Head), + Tail(O.Tail), + Freelist(O.Freelist), + Size(O.Size) { + O.Alloc = nullptr; O.Head = &SentinelSegment; O.Tail = &SentinelSegment; O.Size = 0; + O.Freelist = &SentinelSegment; + } + + Array &operator=(Array &&O) XRAY_NEVER_INSTRUMENT { + Alloc = O.Alloc; + O.Alloc = nullptr; + Head = O.Head; + O.Head = &SentinelSegment; + Tail = O.Tail; + O.Tail = &SentinelSegment; + Freelist = O.Freelist; + O.Freelist = &SentinelSegment; + Size = O.Size; + O.Size = 0; + return *this; + } + + ~Array() XRAY_NEVER_INSTRUMENT { + for (auto &E : *this) + (&E)->~T(); } bool empty() const XRAY_NEVER_INSTRUMENT { return Size == 0; } @@ -243,52 +348,41 @@ return *Alloc; } - size_t size() const XRAY_NEVER_INSTRUMENT { return Size; } - - T *Append(const T &E) XRAY_NEVER_INSTRUMENT { - if (UNLIKELY(Head == &SentinelSegment)) - if (InitHeadAndTail() == nullptr) - return nullptr; - - auto Offset = Size % ElementsPerSegment; - if (UNLIKELY(Size != 0 && Offset == 0)) - if (AppendNewSegment() == nullptr) - return nullptr; - - auto Base = static_cast(Tail)->Data; - auto AlignedOffset = Base + (Offset * AlignedElementStorageSize); - auto Position = reinterpret_cast(AlignedOffset); - *Position = E; - ++Size; - return Position; - } + uint64_t size() const XRAY_NEVER_INSTRUMENT { return Size; } template T *AppendEmplace(Args &&... 
args) XRAY_NEVER_INSTRUMENT { - if (UNLIKELY(Head == &SentinelSegment)) - if (InitHeadAndTail() == nullptr) + DCHECK((Size == 0 && Head == &SentinelSegment && Head == Tail) || + (Size != 0 && Head != &SentinelSegment && Tail != &SentinelSegment)); + if (UNLIKELY(Head == &SentinelSegment)) { + auto R = InitHeadAndTail(); + if (R == nullptr) return nullptr; + } + + DCHECK_NE(Head, &SentinelSegment); + DCHECK_NE(Tail, &SentinelSegment); auto Offset = Size % ElementsPerSegment; - auto *LatestSegment = Tail; - if (UNLIKELY(Size != 0 && Offset == 0)) { - LatestSegment = AppendNewSegment(); - if (LatestSegment == nullptr) + if (UNLIKELY(Size != 0 && Offset == 0)) + if (AppendNewSegment() == nullptr) return nullptr; - } DCHECK_NE(Tail, &SentinelSegment); - auto Base = static_cast(LatestSegment)->Data; + auto Base = &Tail->Data; auto AlignedOffset = Base + (Offset * AlignedElementStorageSize); - auto Position = reinterpret_cast(AlignedOffset); + DCHECK_LE(AlignedOffset + sizeof(T), + reinterpret_cast(Tail) + SegmentSize); // In-place construct at Position. - new (Position) T{std::forward(args)...}; + new (AlignedOffset) T{std::forward(args)...}; ++Size; - return reinterpret_cast(Position); + return reinterpret_cast(AlignedOffset); } - T &operator[](size_t Offset) const XRAY_NEVER_INSTRUMENT { + T *Append(const T &E) XRAY_NEVER_INSTRUMENT { return AppendEmplace(E); } + + T &operator[](uint64_t Offset) const XRAY_NEVER_INSTRUMENT { DCHECK_LE(Offset, Size); // We need to traverse the array enough times to find the element at Offset. auto S = Head; @@ -297,7 +391,7 @@ Offset -= ElementsPerSegment; DCHECK_NE(S, &SentinelSegment); } - auto Base = static_cast(S)->Data; + auto Base = &S->Data; auto AlignedOffset = Base + (Offset * AlignedElementStorageSize); auto Position = reinterpret_cast(AlignedOffset); return *reinterpret_cast(Position); @@ -332,41 +426,172 @@ /// Remove N Elements from the end. This leaves the blocks behind, and not /// require allocation of new blocks for new elements added after trimming. - void trim(size_t Elements) XRAY_NEVER_INSTRUMENT { - if (Elements == 0) - return; - + void trim(uint64_t Elements) XRAY_NEVER_INSTRUMENT { auto OldSize = Size; - Elements = Elements >= Size ? Size : Elements; + Elements = Elements > Size ? Size : Elements; Size -= Elements; - DCHECK_NE(Head, &SentinelSegment); - DCHECK_NE(Tail, &SentinelSegment); - - for (auto SegmentsToTrim = (nearest_boundary(OldSize, ElementsPerSegment) - - nearest_boundary(Size, ElementsPerSegment)) / - ElementsPerSegment; - SegmentsToTrim > 0; --SegmentsToTrim) { - - // We want to short-circuit if the trace is already empty. - if (Head == &SentinelSegment && Head == Tail) - return; - - // Put the tail into the Freelist. - auto *FreeSegment = Tail; - Tail = Tail->Prev; - if (Tail == &SentinelSegment) - Head = Tail; - else - Tail->Next = &SentinelSegment; - + // We compute the number of segments we're going to return from the tail by + // counting how many elements have been trimmed. Given the following: + // + // - Each segment has N valid positions, where N > 0 + // - The previous size > current size + // + // To compute the number of segments to return, we need to perform the + // following calculations for the number of segments required given 'x' + // elements: + // + // f(x) = { + // x == 0 : 0 + // , 0 < x <= N : 1 + // , N < x <= max : x / N + (x % N ? 1 : 0) + // } + // + // We can simplify this down to: + // + // f(x) = { + // x == 0 : 0, + // , 0 < x <= max : x / N + (x < N || x % N ? 
1 : 0) + // } + // + // And further down to: + // + // f(x) = x ? x / N + (x < N || x % N ? 1 : 0) : 0 + // + // We can then perform the following calculation `s` which counts the number + // of segments we need to remove from the end of the data structure: + // + // s(p, c) = f(p) - f(c) + // + // If we treat p = previous size, and c = current size, and given the + // properties above, the possible range for s(...) is [0..max(typeof(p))/N] + // given that typeof(p) == typeof(c). + auto F = [](uint64_t X) { + return X ? (X / ElementsPerSegment) + + (X < ElementsPerSegment || X % ElementsPerSegment ? 1 : 0) + : 0; + }; + auto PS = F(OldSize); + auto CS = F(Size); + DCHECK_GE(PS, CS); + auto SegmentsToTrim = PS - CS; + for (auto I = 0uL; I < SegmentsToTrim; ++I) { + // Here we place the current tail segment to the freelist. To do this + // appropriately, we need to perform a splice operation on two + // bidirectional linked-lists. In particular, we have the current state of + // the doubly-linked list of segments: + // + // @S@ <- s0 <-> s1 <-> ... <-> sT -> @S@ + // + DCHECK_NE(Head, &SentinelSegment); + DCHECK_NE(Tail, &SentinelSegment); DCHECK_EQ(Tail->Next, &SentinelSegment); - FreeSegment->Next = Freelist; - FreeSegment->Prev = &SentinelSegment; - if (Freelist != &SentinelSegment) - Freelist->Prev = FreeSegment; - Freelist = FreeSegment; + + if (Freelist == &SentinelSegment) { + // Our two lists at this point are in this configuration: + // + // Freelist: (potentially) @S@ + // Mainlist: @S@<-s0<->s1<->...<->sPT<->sT->@S@ + // ^ Head ^ Tail + // + // The end state for us will be this configuration: + // + // Freelist: @S@<-sT->@S@ + // Mainlist: @S@<-s0<->s1<->...<->sPT->@S@ + // ^ Head ^ Tail + // + // The first step for us is to hold a reference to the tail of Mainlist, + // which in our notation is represented by sT. We call this our "free + // segment" which is the segment we are placing on the Freelist. + // + // sF = sT + // + // Then, we also hold a reference to the "pre-tail" element, which we + // call sPT: + // + // sPT = pred(sT) + // + // We want to splice sT into the beginning of the Freelist, which in + // an empty Freelist means placing a segment whose predecessor and + // successor is the sentinel segment. 
+ // + // The splice operation then can be performed in the following + // algorithm: + // + // succ(sPT) = S + // pred(sT) = S + // succ(sT) = Freelist + // Freelist = sT + // Tail = sPT + // + auto SPT = Tail->Prev; + SPT->Next = &SentinelSegment; + Tail->Prev = &SentinelSegment; + Tail->Next = Freelist; + Freelist = Tail; + Tail = SPT; + + // Our post-conditions here are: + DCHECK_EQ(Tail->Next, &SentinelSegment); + DCHECK_EQ(Freelist->Prev, &SentinelSegment); + } else { + // In the other case, where the Freelist is not empty, we perform the + // following transformation instead: + // + // This transforms the current state: + // + // Freelist: @S@<-f0->@S@ + // ^ Freelist + // Mainlist: @S@<-s0<->s1<->...<->sPT<->sT->@S@ + // ^ Head ^ Tail + // + // Into the following: + // + // Freelist: @S@<-sT<->f0->@S@ + // ^ Freelist + // Mainlist: @S@<-s0<->s1<->...<->sPT->@S@ + // ^ Head ^ Tail + // + // The algorithm is: + // + // sFH = Freelist + // sPT = pred(sT) + // pred(SFH) = sT + // succ(sT) = Freelist + // pred(sT) = S + // succ(sPT) = S + // Tail = sPT + // Freelist = sT + // + auto SFH = Freelist; + auto SPT = Tail->Prev; + auto ST = Tail; + SFH->Prev = ST; + ST->Next = Freelist; + ST->Prev = &SentinelSegment; + SPT->Next = &SentinelSegment; + Tail = SPT; + Freelist = ST; + + // Our post-conditions here are: + DCHECK_EQ(Tail->Next, &SentinelSegment); + DCHECK_EQ(Freelist->Prev, &SentinelSegment); + DCHECK_EQ(Freelist->Next->Prev, Freelist); + } } + + // Now in case we've spliced all the segments in the end, we ensure that the + // main list is "empty", or both the head and tail pointing to the sentinel + // segment. + if (Tail == &SentinelSegment) + Head = Tail; + + DCHECK( + (Size == 0 && Head == &SentinelSegment && Tail == &SentinelSegment) || + (Size != 0 && Head != &SentinelSegment && Tail != &SentinelSegment)); + DCHECK( + (Freelist != &SentinelSegment && Freelist->Prev == &SentinelSegment) || + (Freelist == &SentinelSegment && Tail->Next == &SentinelSegment)); } // Provide iterators. @@ -388,8 +613,8 @@ // ensure that storage for the SentinelSegment is defined and has a single // address. template -typename Array::SegmentBase Array::SentinelSegment{ - &Array::SentinelSegment, &Array::SentinelSegment}; +typename Array::Segment Array::SentinelSegment{ + &Array::SentinelSegment, &Array::SentinelSegment, {'\0'}}; } // namespace __xray