Index: compiler-rt/trunk/lib/xray/tests/CMakeLists.txt
===================================================================
--- compiler-rt/trunk/lib/xray/tests/CMakeLists.txt
+++ compiler-rt/trunk/lib/xray/tests/CMakeLists.txt
@@ -68,7 +68,11 @@
 endfunction()
 
 set(XRAY_TEST_ARCH ${XRAY_SUPPORTED_ARCH})
-set(XRAY_UNITTEST_LINK_FLAGS ${CMAKE_THREAD_LIBS_INIT})
+set(XRAY_UNITTEST_LINK_FLAGS
+  ${CMAKE_THREAD_LIBS_INIT}
+  -fxray-instrument
+  -lstdc++  # TODO: Detect the correct standard library used!
+  )
 if (NOT APPLE)
   append_list_if(COMPILER_RT_HAS_LIBM -lm XRAY_UNITTEST_LINK_FLAGS)
   append_list_if(COMPILER_RT_HAS_LIBRT -lrt XRAY_UNITTEST_LINK_FLAGS)
@@ -94,7 +98,7 @@
       RUNTIME "${XRAY_RUNTIME_LIBS}"
       DEPS gtest xray
       CFLAGS ${XRAY_UNITTEST_CFLAGS}
-      LINK_FLAGS ${TARGET_LINK_FLAGS} ${XRAY_UNITTEST_LINK_FLAGS} -lstdc++)
+      LINK_FLAGS ${TARGET_LINK_FLAGS} ${XRAY_UNITTEST_LINK_FLAGS})
     set_target_properties(XRayUnitTests
       PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   endforeach()
Index: compiler-rt/trunk/lib/xray/tests/unit/function_call_trie_test.cc
===================================================================
--- compiler-rt/trunk/lib/xray/tests/unit/function_call_trie_test.cc
+++ compiler-rt/trunk/lib/xray/tests/unit/function_call_trie_test.cc
@@ -18,12 +18,6 @@
 
 namespace {
 
-TEST(FunctionCallTrieTest, Construction) {
-  // We want to make sure that we can create one of these without the set of
-  // allocators we need. This will by default use the global allocators.
-  FunctionCallTrie Trie;
-}
-
 TEST(FunctionCallTrieTest, ConstructWithTLSAllocators) {
   // FIXME: Support passing in configuration for allocators in the allocator
   // constructors.
@@ -61,6 +55,17 @@
   ASSERT_TRUE(R.empty());
 }
 
+TEST(FunctionCallTrieTest, NoMatchingEntersForExit) {
+  auto A = FunctionCallTrie::InitAllocators();
+  FunctionCallTrie Trie(A);
+  Trie.enterFunction(2, 1);
+  Trie.enterFunction(3, 3);
+  Trie.exitFunction(1, 5);
+  const auto &R = Trie.getRoots();
+
+  ASSERT_TRUE(R.empty());
+}
+
 TEST(FunctionCallTrieTest, MissingFunctionExit) {
   auto A = FunctionCallTrie::InitAllocators();
   FunctionCallTrie Trie(A);
@@ -153,6 +158,31 @@
   EXPECT_EQ(F1.CumulativeLocalTime, 100);
 }
 
+TEST(FunctionCallTrieTest, DeepCallStack) {
+  // Simulate a relatively deep call stack (32 levels) and ensure that we can
+  // properly pop all the way up the stack.
+  profilingFlags()->setDefaults();
+  auto A = FunctionCallTrie::InitAllocators();
+  FunctionCallTrie Trie(A);
+  for (int i = 0; i < 32; ++i)
+    Trie.enterFunction(i + 1, i);
+  Trie.exitFunction(1, 33);
+
+  // Here, validate that we have a 32-level deep function call path from the
+  // root (1) down to the leaf (33).
+  const auto &R = Trie.getRoots();
+  ASSERT_EQ(R.size(), 1u);
+  auto F = R[0];
+  for (int i = 0; i < 32; ++i) {
+    EXPECT_EQ(F->FId, i + 1);
+    EXPECT_EQ(F->CallCount, 1);
+    if (F->Callees.empty() && i != 31)
+      FAIL() << "Empty callees for FId " << F->FId;
+    if (i != 31)
+      F = F->Callees[0].NodePtr;
+  }
+}
+
 // TODO: Test that we can handle cross-CPU migrations, where TSCs are not
 // guaranteed to be synchronised.
 TEST(FunctionCallTrieTest, DeepCopy) {
Index: compiler-rt/trunk/lib/xray/tests/unit/segmented_array_test.cc
===================================================================
--- compiler-rt/trunk/lib/xray/tests/unit/segmented_array_test.cc
+++ compiler-rt/trunk/lib/xray/tests/unit/segmented_array_test.cc
@@ -107,32 +107,84 @@
 }
 
 TEST(SegmentedArrayTest, IteratorTrimBehaviour) {
-  Array<TestData> data;
-  ASSERT_TRUE(data.empty());
-  auto I0Begin = data.begin(), I0End = data.end();
-  // Add enough elements in data to have more than one chunk.
-  constexpr auto ChunkX2 = Array<TestData>::ChunkSize * 2;
+  using AllocatorType = typename Array<TestData>::AllocatorType;
+  AllocatorType A(1 << 10, 0);
+  Array<TestData> Data(A);
+  ASSERT_TRUE(Data.empty());
+  auto I0Begin = Data.begin(), I0End = Data.end();
+  // Add enough elements in Data to have more than one chunk.
+  constexpr auto Chunk = Array<TestData>::ChunkSize;
+  constexpr auto ChunkX2 = Chunk * 2;
   for (auto i = ChunkX2; i > 0u; --i) {
-    data.Append({static_cast<s64>(i), static_cast<s64>(i)});
+    Data.AppendEmplace(static_cast<s64>(i), static_cast<s64>(i));
+  }
+  ASSERT_EQ(Data.size(), ChunkX2);
+  {
+    auto &Back = Data.back();
+    ASSERT_EQ(Back.First, 1);
+    ASSERT_EQ(Back.Second, 1);
   }
-  ASSERT_EQ(data.size(), ChunkX2);
+
   // Trim one chunk's elements worth.
-  data.trim(ChunkX2 / 2);
-  ASSERT_EQ(data.size(), ChunkX2 / 2);
+  Data.trim(Chunk);
+  ASSERT_EQ(Data.size(), Chunk);
+
+  // Check that we are still able to access 'back' properly.
+  {
+    auto &Back = Data.back();
+    ASSERT_EQ(Back.First, static_cast<s64>(Chunk + 1));
+    ASSERT_EQ(Back.Second, static_cast<s64>(Chunk + 1));
+  }
+
   // Then trim until it's empty.
-  data.trim(ChunkX2 / 2);
-  ASSERT_TRUE(data.empty());
+  Data.trim(Chunk);
+  ASSERT_TRUE(Data.empty());
 
   // Here our iterators should be the same.
-  auto I1Begin = data.begin(), I1End = data.end();
+  auto I1Begin = Data.begin(), I1End = Data.end();
   EXPECT_EQ(I0Begin, I1Begin);
   EXPECT_EQ(I0End, I1End);
 
   // Then we ensure that adding elements back works just fine.
   for (auto i = ChunkX2; i > 0u; --i) {
-    data.Append({static_cast<s64>(i), static_cast<s64>(i)});
+    Data.AppendEmplace(static_cast<s64>(i), static_cast<s64>(i));
+  }
+  EXPECT_EQ(Data.size(), ChunkX2);
+}
+
+struct ShadowStackEntry {
+  uint64_t EntryTSC = 0;
+  uint64_t *NodePtr = nullptr;
+  ShadowStackEntry(uint64_t T, uint64_t *N) : EntryTSC(T), NodePtr(N) {}
+};
+
+TEST(SegmentedArrayTest, SimulateStackBehaviour) {
+  using AllocatorType = typename Array<ShadowStackEntry>::AllocatorType;
+  AllocatorType A(1 << 10, 0);
+  Array<ShadowStackEntry> Data(A);
+  static uint64_t Dummy = 0;
+  constexpr uint64_t Max = 9;
+
+  for (uint64_t i = 0; i < Max; ++i) {
+    auto P = Data.Append({i, &Dummy});
+    ASSERT_NE(P, nullptr);
+    ASSERT_EQ(P->NodePtr, &Dummy);
+    auto &Back = Data.back();
+    ASSERT_EQ(Back.NodePtr, &Dummy);
+    ASSERT_EQ(Back.EntryTSC, i);
+  }
+
+  // Simulate a stack by checking the data from the end as we're trimming.
+  auto Counter = Max;
+  ASSERT_EQ(Data.size(), size_t(Max));
+  while (!Data.empty()) {
+    const auto &Top = Data.back();
+    uint64_t *TopNode = Top.NodePtr;
+    EXPECT_EQ(TopNode, &Dummy) << "Counter = " << Counter;
+    Data.trim(1);
+    --Counter;
+    ASSERT_EQ(Data.size(), size_t(Counter));
   }
-  EXPECT_EQ(data.size(), ChunkX2);
 }
 
 } // namespace
Index: compiler-rt/trunk/lib/xray/xray_allocator.h
===================================================================
--- compiler-rt/trunk/lib/xray/xray_allocator.h
+++ compiler-rt/trunk/lib/xray/xray_allocator.h
@@ -18,12 +18,12 @@
 
 #include "sanitizer_common/sanitizer_allocator_internal.h"
 #include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_internal_defs.h"
 #include "sanitizer_common/sanitizer_mutex.h"
+#include "xray_utils.h"
 #include <cstddef>
 #include <cstdint>
 
-#include "sanitizer_common/sanitizer_internal_defs.h"
-
 namespace __xray {
 
 /// The Allocator type hands out fixed-sized chunks of memory that are
@@ -40,8 +40,7 @@
   // The Allocator returns memory as Block instances.
   struct Block {
     /// Compute the minimum cache-line size multiple that is >= N.
-    static constexpr auto Size =
-        kCacheLineSize * ((N / kCacheLineSize) + (N % kCacheLineSize ? 1 : 0));
+    static constexpr auto Size = nearest_boundary(N, kCacheLineSize);
     void *Data = nullptr;
   };
 
@@ -61,15 +60,16 @@
 
    // We compute the number of pointers to areas in memory where we consider as
    // individual blocks we've allocated. To ensure that instances of the
-    // BlockLink object are cache-line sized, we deduct one additional
-    // pointers worth representing the pointer to the previous link.
+    // BlockLink object are cache-line sized, we deduct two pointers worth
+    // representing the pointer to the previous link and the backing store for
+    // the whole block.
    //
    // This structure corresponds to the following layout:
    //
-    //   Blocks [ 0, 1, 2, .., BlockPtrCount - 1]
+    //   Blocks [ 0, 1, 2, .., BlockPtrCount - 2]
    //
    static constexpr auto BlockPtrCount =
-        (kCacheLineSize / sizeof(Block *)) - 1;
+        (kCacheLineSize / sizeof(Block *)) - 2;
 
    BlockLink() {
      // Zero out Blocks.
@@ -81,6 +81,7 @@
    // FIXME: Align this to cache-line address boundaries?
    Block Blocks[BlockPtrCount];
    BlockLink *Prev = nullptr;
+    void *BackingStore = nullptr;
  };
 
  static_assert(sizeof(BlockLink) == kCacheLineSize,
@@ -96,17 +97,26 @@
  BlockLink *Tail = &NullLink;
  size_t Counter = 0;
 
-  BlockLink *NewChainLink() {
+  BlockLink *NewChainLink(uint64_t Alignment) {
    auto NewChain = reinterpret_cast<BlockLink *>(
        InternalAlloc(sizeof(BlockLink), nullptr, kCacheLineSize));
    auto BackingStore = reinterpret_cast<char *>(InternalAlloc(
-        BlockLink::BlockPtrCount * Block::Size, nullptr, kCacheLineSize));
+        (BlockLink::BlockPtrCount + 1) * Block::Size, nullptr, Alignment));
    size_t Offset = 0;
    DCHECK_NE(NewChain, nullptr);
    DCHECK_NE(BackingStore, nullptr);
+    NewChain->BackingStore = BackingStore;
+
+    // Here we ensure that the alignment of the pointers we're handing out
+    // adhere to the alignment requirements of the call to Allocate().
    for (auto &B : NewChain->Blocks) {
-      B.Data = BackingStore + Offset;
-      Offset += Block::Size;
+      auto AlignmentAdjustment =
+          nearest_boundary(reinterpret_cast<uintptr_t>(BackingStore + Offset),
+                           Alignment) -
+          reinterpret_cast<uintptr_t>(BackingStore + Offset);
+      B.Data = BackingStore + AlignmentAdjustment + Offset;
+      DCHECK_EQ(reinterpret_cast<uintptr_t>(B.Data) % Alignment, 0);
+      Offset += AlignmentAdjustment + Block::Size;
    }
    NewChain->Prev = Tail;
    return NewChain;
@@ -117,7 +127,7 @@
    // FIXME: Implement PreAllocate support!
  }
 
-  Block Allocate() {
+  Block Allocate(uint64_t Alignment = 8) {
    SpinMutexLock Lock(&Mutex);
    // Check whether we're over quota.
    if (Counter * Block::Size >= MaxMemory)
@@ -128,7 +138,7 @@
    Block B{};
    BlockLink *Link = Tail;
    if (UNLIKELY(Counter == 0 || ChainOffset == 0))
-      Tail = Link = NewChainLink();
+      Tail = Link = NewChainLink(Alignment);
 
    B = Link->Blocks[ChainOffset];
    ++Counter;
@@ -140,7 +150,7 @@
    for (auto *C = Tail; C != &NullLink;) {
      // We know that the data block is a large contiguous page, we deallocate
      // that at once.
-      InternalFree(C->Blocks[0].Data);
+      InternalFree(C->BackingStore);
      auto Prev = C->Prev;
      InternalFree(C);
      C = Prev;
Index: compiler-rt/trunk/lib/xray/xray_function_call_trie.h
===================================================================
--- compiler-rt/trunk/lib/xray/xray_function_call_trie.h
+++ compiler-rt/trunk/lib/xray/xray_function_call_trie.h
@@ -17,8 +17,8 @@
 
 #include "xray_profiling_flags.h"
 #include "xray_segmented_array.h"
+#include <memory> // For placement new.
 #include <utility>
-#include <memory> // For placement new.
 
 namespace __xray {
 
@@ -101,6 +101,8 @@
    NodeIdPair(Node *N, int32_t F) : NodePtr(N), FId(F) {}
  };
 
+  static_assert(sizeof(NodeIdPair) == 16, "Wrong size for NodeIDPair.");
+
  using NodeIdPairArray = Array<NodeIdPair>;
  using NodeIdPairAllocatorType = NodeIdPairArray::AllocatorType;
 
@@ -128,15 +130,12 @@
 
 private:
  struct ShadowStackEntry {
-    int32_t FId; // We're copying the function ID into the stack to avoid having
-                 // to reach into the node just to get the function ID.
    uint64_t EntryTSC;
    Node *NodePtr;
 
    // We add a constructor here to allow us to inplace-construct through
    // Array<...>'s AppendEmplace.
-    ShadowStackEntry(int32_t F, uint64_t T, Node *N)
-        : FId(F), EntryTSC(T), NodePtr(N) {}
+    ShadowStackEntry(uint64_t T, Node *N) : EntryTSC{T}, NodePtr{N} {}
  };
 
  using NodeArray = Array<Node>;
@@ -219,30 +218,30 @@
 
  // TODO: Support configuration of options through the arguments.
  static Allocators InitAllocators() {
+    return InitAllocatorsCustom(profilingFlags()->per_thread_allocator_max);
+  }
+
+  static Allocators InitAllocatorsCustom(uptr Max) {
    Allocators A;
    auto NodeAllocator = reinterpret_cast<Allocators::NodeAllocatorType *>(
        InternalAlloc(sizeof(Allocators::NodeAllocatorType)));
-    new (NodeAllocator) Allocators::NodeAllocatorType(
-        profilingFlags()->per_thread_allocator_max, 0);
+    new (NodeAllocator) Allocators::NodeAllocatorType(Max, 0);
    A.NodeAllocator = NodeAllocator;
 
    auto RootAllocator = reinterpret_cast<Allocators::RootAllocatorType *>(
        InternalAlloc(sizeof(Allocators::RootAllocatorType)));
-    new (RootAllocator) Allocators::RootAllocatorType(
-        profilingFlags()->per_thread_allocator_max, 0);
+    new (RootAllocator) Allocators::RootAllocatorType(Max, 0);
    A.RootAllocator = RootAllocator;
 
    auto ShadowStackAllocator =
        reinterpret_cast<Allocators::ShadowStackAllocatorType *>(
            InternalAlloc(sizeof(Allocators::ShadowStackAllocatorType)));
-    new (ShadowStackAllocator) Allocators::ShadowStackAllocatorType(
-        profilingFlags()->per_thread_allocator_max, 0);
+    new (ShadowStackAllocator) Allocators::ShadowStackAllocatorType(Max, 0);
    A.ShadowStackAllocator = ShadowStackAllocator;
 
    auto NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>(
        InternalAlloc(sizeof(NodeIdPairAllocatorType)));
-    new (NodeIdPairAllocator)
-        NodeIdPairAllocatorType(profilingFlags()->per_thread_allocator_max, 0);
+    new (NodeIdPairAllocator) NodeIdPairAllocatorType(Max, 0);
    A.NodeIdPairAllocator = NodeIdPairAllocator;
    return A;
  }
 
@@ -253,20 +252,14 @@
  ShadowStackArray ShadowStack;
  NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr;
 
-  const Allocators &GetGlobalAllocators() {
-    static const Allocators A = [] { return InitAllocators(); }();
-    return A;
-  }
-
 public:
  explicit FunctionCallTrie(const Allocators &A)
      : Nodes(*A.NodeAllocator), Roots(*A.RootAllocator),
        ShadowStack(*A.ShadowStackAllocator),
        NodeIdPairAllocator(A.NodeIdPairAllocator) {}
 
-  FunctionCallTrie() : FunctionCallTrie(GetGlobalAllocators()) {}
-
-  void enterFunction(int32_t FId, uint64_t TSC) {
+  void enterFunction(const int32_t FId, uint64_t TSC) {
+    DCHECK_NE(FId, 0);
    // This function primarily deals with ensuring that the ShadowStack is
    // consistent and ready for when an exit event is encountered.
    if (UNLIKELY(ShadowStack.empty())) {
@@ -275,12 +268,13 @@
      if (UNLIKELY(NewRoot == nullptr))
        return;
      Roots.Append(NewRoot);
-      ShadowStack.AppendEmplace(FId, TSC, NewRoot);
+      ShadowStack.AppendEmplace(TSC, NewRoot);
      return;
    }
 
    auto &Top = ShadowStack.back();
    auto TopNode = Top.NodePtr;
+    DCHECK_NE(TopNode, nullptr);
 
    // If we've seen this callee before, then we just access that node and place
    // that on the top of the stack.
@@ -288,7 +282,7 @@
        [FId](const NodeIdPair &NR) { return NR.FId == FId; });
    if (Callee != nullptr) {
      CHECK_NE(Callee->NodePtr, nullptr);
-      ShadowStack.AppendEmplace(FId, TSC, Callee->NodePtr);
+      ShadowStack.AppendEmplace(TSC, Callee->NodePtr);
      return;
    }
 
@@ -297,8 +291,10 @@
        Nodes.AppendEmplace(TopNode, *NodeIdPairAllocator, 0, 0, FId);
    if (UNLIKELY(NewNode == nullptr))
      return;
+    DCHECK_NE(NewNode, nullptr);
    TopNode->Callees.AppendEmplace(NewNode, FId);
-    ShadowStack.AppendEmplace(FId, TSC, NewNode);
+    ShadowStack.AppendEmplace(TSC, NewNode);
+    DCHECK_NE(ShadowStack.back().NodePtr, nullptr);
    return;
  }
 
@@ -307,14 +303,11 @@
    // entered this function before. We do as little processing here as we can,
    // since most of the hard work would have already been done at function
    // entry.
-    if (UNLIKELY(ShadowStack.empty()))
-      return;
-
    uint64_t CumulativeTreeTime = 0;
    while (!ShadowStack.empty()) {
-      auto &Top = ShadowStack.back();
+      const auto &Top = ShadowStack.back();
      auto TopNode = Top.NodePtr;
-      auto TopFId = TopNode->FId;
+      DCHECK_NE(TopNode, nullptr);
      auto LocalTime = TSC - Top.EntryTSC;
      TopNode->CallCount++;
      TopNode->CumulativeLocalTime += LocalTime - CumulativeTreeTime;
@@ -322,7 +315,7 @@
      ShadowStack.trim(1);
 
      // TODO: Update the histogram for the node.
-      if (TopFId == FId)
+      if (TopNode->FId == FId)
        break;
    }
  }
@@ -344,28 +337,34 @@
  // This function must *not* be called with a non-empty FunctionCallTrie |O|.
  void deepCopyInto(FunctionCallTrie &O) const {
    DCHECK(O.getRoots().empty());
+
+    // We then push the root into a stack, to use as the parent marker for new
+    // nodes we push in as we're traversing depth-first down the call tree.
+    struct NodeAndParent {
+      FunctionCallTrie::Node *Node;
+      FunctionCallTrie::Node *NewNode;
+    };
+    using Stack = Array<NodeAndParent>;
+
+    typename Stack::AllocatorType StackAllocator(
+        profilingFlags()->stack_allocator_max, 0);
+    Stack DFSStack(StackAllocator);
+
    for (const auto Root : getRoots()) {
      // Add a node in O for this root.
      auto NewRoot = O.Nodes.AppendEmplace(
          nullptr, *O.NodeIdPairAllocator, Root->CallCount,
          Root->CumulativeLocalTime, Root->FId);
-      O.Roots.Append(NewRoot);
-      // We then push the root into a stack, to use as the parent marker for new
-      // nodes we push in as we're traversing depth-first down the call tree.
-      struct NodeAndParent {
-        FunctionCallTrie::Node *Node;
-        FunctionCallTrie::Node *NewNode;
-      };
-      using Stack = Array<NodeAndParent>;
-
-      typename Stack::AllocatorType StackAllocator(
-          profilingFlags()->stack_allocator_max, 0);
-      Stack DFSStack(StackAllocator);
+      // Because we cannot allocate more memory we should bail out right away.
+      if (UNLIKELY(NewRoot == nullptr))
+        return;
+
+      O.Roots.Append(NewRoot);
 
      // TODO: Figure out what to do if we fail to allocate any more stack
      // space. Maybe warn or report once?
-      DFSStack.Append(NodeAndParent{Root, NewRoot});
+      DFSStack.AppendEmplace(Root, NewRoot);
      while (!DFSStack.empty()) {
        NodeAndParent NP = DFSStack.back();
        DCHECK_NE(NP.Node, nullptr);
@@ -375,9 +374,10 @@
          auto NewNode = O.Nodes.AppendEmplace(
              NP.NewNode, *O.NodeIdPairAllocator, Callee.NodePtr->CallCount,
              Callee.NodePtr->CumulativeLocalTime, Callee.FId);
-          DCHECK_NE(NewNode, nullptr);
+          if (UNLIKELY(NewNode == nullptr))
+            return;
          NP.NewNode->Callees.AppendEmplace(NewNode, Callee.FId);
-          DFSStack.Append(NodeAndParent{Callee.NodePtr, NewNode});
+          DFSStack.AppendEmplace(Callee.NodePtr, NewNode);
        }
      }
    }
@@ -408,6 +408,9 @@
      if (R == nullptr) {
        TargetRoot = O.Nodes.AppendEmplace(nullptr, *O.NodeIdPairAllocator, 0,
                                           0, Root->FId);
+        if (UNLIKELY(TargetRoot == nullptr))
+          return;
+
        O.Roots.Append(TargetRoot);
      } else {
        TargetRoot = *R;
@@ -430,10 +433,14 @@
        if (TargetCallee == nullptr) {
          auto NewTargetNode = O.Nodes.AppendEmplace(
              NT.TargetNode, *O.NodeIdPairAllocator, 0, 0, Callee.FId);
+
+          if (UNLIKELY(NewTargetNode == nullptr))
+            return;
+
          TargetCallee =
              NT.TargetNode->Callees.AppendEmplace(NewTargetNode, Callee.FId);
        }
-        DFSStack.Append(NodeAndTarget{Callee.NodePtr, TargetCallee->NodePtr});
+        DFSStack.AppendEmplace(Callee.NodePtr, TargetCallee->NodePtr);
      }
    }
  }
Index: compiler-rt/trunk/lib/xray/xray_profile_collector.cc
===================================================================
--- compiler-rt/trunk/lib/xray/xray_profile_collector.cc
+++ compiler-rt/trunk/lib/xray/xray_profile_collector.cc
@@ -16,8 +16,8 @@
 #include "sanitizer_common/sanitizer_common.h"
 #include "sanitizer_common/sanitizer_vector.h"
 #include "xray_profiling_flags.h"
-#include <pthread.h>
 #include <memory>
+#include <pthread.h>
 #include <utility>
 
 namespace __xray {
@@ -55,7 +55,8 @@
    GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>(
        InternalAlloc(sizeof(FunctionCallTrie::Allocators)));
    new (GlobalAllocators) FunctionCallTrie::Allocators();
-    *GlobalAllocators = FunctionCallTrie::InitAllocators();
+    *GlobalAllocators = FunctionCallTrie::InitAllocatorsCustom(
+        profilingFlags()->global_allocator_max);
  });
  DCHECK_NE(GlobalAllocators, nullptr);
 
@@ -83,12 +84,11 @@
    // and is decoupled from the lifetime management required by the managed
    // allocator we have in XRay.
    //
-    Item->Trie = reinterpret_cast<FunctionCallTrie *>(
-        InternalAlloc(sizeof(FunctionCallTrie)));
+    Item->Trie = reinterpret_cast<FunctionCallTrie *>(InternalAlloc(
+        sizeof(FunctionCallTrie), nullptr, alignof(FunctionCallTrie)));
    DCHECK_NE(Item->Trie, nullptr);
    new (Item->Trie) FunctionCallTrie(*GlobalAllocators);
  }
-  DCHECK_NE(Item, nullptr);
 
  T.deepCopyInto(*Item->Trie);
 }
Index: compiler-rt/trunk/lib/xray/xray_profiling.cc
===================================================================
--- compiler-rt/trunk/lib/xray/xray_profiling.cc
+++ compiler-rt/trunk/lib/xray/xray_profiling.cc
@@ -258,9 +258,9 @@
  {
    SpinMutexLock Lock(&ProfilerOptionsMutex);
    FlagParser ConfigParser;
-    auto *F = profilingFlags();
-    F->setDefaults();
-    registerProfilerFlags(&ConfigParser, F);
+    ProfilerFlags Flags;
+    Flags.setDefaults();
+    registerProfilerFlags(&ConfigParser, &Flags);
    ConfigParser.ParseString(profilingCompilerDefinedFlags());
    const char *Env = GetEnv("XRAY_PROFILING_OPTIONS");
    if (Env == nullptr)
@@ -271,6 +271,7 @@
    ConfigParser.ParseString(static_cast<const char *>(Options));
    if (Verbosity())
      ReportUnrecognizedFlags();
+    *profilingFlags() = Flags;
  }
 
  // We need to reset the profile data collection implementation now.
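
For orientation, a minimal sketch (not part of the patch itself) of how the updated allocator plumbing is meant to be driven, mirroring the unit tests earlier in this change; the 1 << 20 byte budget, the helper name, and the literal TSC values are illustrative assumptions only:

  // Illustrative sketch: mirrors the usage exercised by the unit tests above.
  #include "xray_function_call_trie.h"

  void profileOneThread() {  // hypothetical helper, not part of the patch
    using namespace __xray;
    // Size the per-trie allocators explicitly instead of reading the flag.
    auto A = FunctionCallTrie::InitAllocatorsCustom(1 << 20);
    FunctionCallTrie Trie(A);

    // Entries and exits are keyed by function id and TSC; exitFunction pops
    // the shadow stack until it finds the matching function id.
    Trie.enterFunction(1, 100);
    Trie.enterFunction(2, 110);
    Trie.exitFunction(1, 200); // pops FId 2, then FId 1

    const auto &Roots = Trie.getRoots(); // a single root with FId == 1
    (void)Roots;
  }
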
Index: compiler-rt/trunk/lib/xray/xray_segmented_array.h
===================================================================
--- compiler-rt/trunk/lib/xray/xray_segmented_array.h
+++ compiler-rt/trunk/lib/xray/xray_segmented_array.h
@@ -18,21 +18,13 @@
 
 #include "sanitizer_common/sanitizer_allocator.h"
 #include "xray_allocator.h"
+#include "xray_utils.h"
+#include <cassert>
 #include <type_traits>
 #include <utility>
 
 namespace __xray {
 
-namespace {
-
-constexpr size_t gcd(size_t a, size_t b) {
-  return (b == 0) ? a : gcd(b, a % b);
-}
-
-constexpr size_t lcm(size_t a, size_t b) { return a * b / gcd(a, b); }
-
-} // namespace
-
 /// The Array type provides an interface similar to std::vector<...> but does
 /// not shrink in size. Once constructed, elements can be appended but cannot be
 /// removed. The implementation is heavily dependent on the contract provided by
@@ -45,10 +37,12 @@
 /// size, to allow us to maximise the number of T objects we can place in
 /// cache-line multiple sized blocks. To get back the number of T's, we divide
 /// this least common multiple by the size of T.
-template <class T, size_t N = lcm(sizeof(T), kCacheLineSize) / sizeof(T)>
+template <class T,
+          size_t N = lcm(next_pow2(sizeof(T)), kCacheLineSize) / next_pow2(sizeof(T))>
 struct Array {
  static constexpr size_t ChunkSize = N;
-  static constexpr size_t AllocatorChunkSize = sizeof(T) * ChunkSize;
+  static constexpr size_t ElementStorageSize = next_pow2(sizeof(T));
+  static constexpr size_t AllocatorChunkSize = ElementStorageSize * ChunkSize;
  using AllocatorType = Allocator<AllocatorChunkSize>;
  static_assert(std::is_trivially_destructible<T>::value,
                "T must be trivially destructible.");
@@ -60,8 +54,8 @@
  struct Chunk {
    typename AllocatorType::Block Block;
    static constexpr size_t Size = N;
-    Chunk *Prev = nullptr;
-    Chunk *Next = nullptr;
+    Chunk *Prev = this;
+    Chunk *Next = this;
  };
 
  static Chunk SentinelChunk;
@@ -70,7 +64,6 @@
  Chunk *Head = &SentinelChunk;
  Chunk *Tail = &SentinelChunk;
  size_t Size = 0;
-  size_t FreeElements = 0;
 
  // Here we keep track of chunks in the freelist, to allow us to re-use chunks
  // when elements are trimmed off the end.
@@ -85,17 +78,22 @@
      auto *FreeChunk = Freelist;
      Freelist = FreeChunk->Next;
      FreeChunk->Next = &SentinelChunk;
+      Freelist->Prev = &SentinelChunk;
      return FreeChunk;
    }
 
-    auto Block = Alloc->Allocate();
+    auto Block = Alloc->Allocate(ElementStorageSize);
    if (Block.Data == nullptr)
      return nullptr;
+
    // TODO: Maybe use a separate managed allocator for Chunk instances?
-    auto C = reinterpret_cast<Chunk *>(InternalAlloc(sizeof(Chunk)));
+    auto C = reinterpret_cast<Chunk *>(
+        InternalAlloc(sizeof(Chunk), nullptr, kCacheLineSize));
    if (C == nullptr)
      return nullptr;
    C->Block = Block;
+    C->Prev = &SentinelChunk;
+    C->Next = &SentinelChunk;
    return C;
  }
 
@@ -116,42 +114,52 @@
    auto Chunk = NewChunk();
    if (Chunk == nullptr)
      return nullptr;
-    Chunk->Prev = &SentinelChunk;
-    Chunk->Next = &SentinelChunk;
-    Head = Chunk;
-    Tail = Chunk;
-    return Chunk;
+    DCHECK_EQ(Chunk->Next, &SentinelChunk);
+    DCHECK_EQ(Chunk->Prev, &SentinelChunk);
+    return Head = Tail = Chunk;
  }
 
  Chunk *AppendNewChunk() {
    auto Chunk = NewChunk();
    if (Chunk == nullptr)
      return nullptr;
+    DCHECK_NE(Tail, &SentinelChunk);
+    DCHECK_EQ(Tail->Next, &SentinelChunk);
+    DCHECK_EQ(Chunk->Prev, &SentinelChunk);
+    DCHECK_EQ(Chunk->Next, &SentinelChunk);
    Tail->Next = Chunk;
    Chunk->Prev = Tail;
-    Chunk->Next = &SentinelChunk;
    Tail = Chunk;
-    return Chunk;
+    return Tail;
  }
 
  // This Iterator models a BidirectionalIterator.
  template <class U> class Iterator {
-    Chunk *C = nullptr;
+    Chunk *C = &SentinelChunk;
    size_t Offset = 0;
+    size_t Size = 0;
 
  public:
-    Iterator(Chunk *IC, size_t Off) : C(IC), Offset(Off) {}
+    Iterator(Chunk *IC, size_t Off, size_t S) : C(IC), Offset(Off), Size(S) {}
+    Iterator(const Iterator &) noexcept = default;
+    Iterator() noexcept = default;
+    Iterator(Iterator &&) noexcept = default;
+    Iterator &operator=(const Iterator &) = default;
+    Iterator &operator=(Iterator &&) = default;
+    ~Iterator() = default;
 
    Iterator &operator++() {
-      if (++Offset % N)
+      if (++Offset % N || Offset == Size)
        return *this;
 
-      DCHECK_NE(C, &SentinelChunk);
-
      // At this point, we know that Offset % N == 0, so we must advance the
      // chunk pointer.
      DCHECK_EQ(Offset % N, 0);
+      DCHECK_NE(Offset, Size);
+      DCHECK_NE(C, &SentinelChunk);
+      DCHECK_NE(C->Next, &SentinelChunk);
      C = C->Next;
+      DCHECK_NE(C, &SentinelChunk);
      return *this;
    }
 
@@ -159,10 +167,12 @@
      DCHECK_NE(C, &SentinelChunk);
      DCHECK_GT(Offset, 0);
 
-      // We check whether the offset was on a boundary before decrement, to see
-      // whether we need to retreat to the previous chunk.
-      if ((Offset-- % N) == 0)
+      auto PreviousOffset = Offset--;
+      if (PreviousOffset != Size && PreviousOffset % N == 0) {
+        DCHECK_NE(C->Prev, &SentinelChunk);
        C = C->Prev;
+      }
+
      return *this;
    }
 
@@ -190,12 +200,16 @@
 
    U &operator*() const {
      DCHECK_NE(C, &SentinelChunk);
-      return reinterpret_cast<U *>(C->Block.Data)[Offset % N];
+      auto RelOff = Offset % N;
+      return *reinterpret_cast<U *>(reinterpret_cast<char *>(C->Block.Data) +
+                                    (RelOff * ElementStorageSize));
    }
 
    U *operator->() const {
      DCHECK_NE(C, &SentinelChunk);
-      return reinterpret_cast<U *>(C->Block.Data) + (Offset % N);
+      auto RelOff = Offset % N;
+      return reinterpret_cast<U *>(reinterpret_cast<char *>(C->Block.Data) +
+                                   (RelOff * ElementStorageSize));
    }
  };
 
@@ -232,10 +246,11 @@
      if (AppendNewChunk() == nullptr)
        return nullptr;
 
-    auto Position = reinterpret_cast<T *>(Tail->Block.Data) + Offset;
+    auto Position =
+        reinterpret_cast<T *>(reinterpret_cast<char *>(Tail->Block.Data) +
+                              (Offset * ElementStorageSize));
    *Position = E;
    ++Size;
-    FreeElements -= FreeElements ? 1 : 0;
    return Position;
  }
 
@@ -245,16 +260,21 @@
      return nullptr;
 
    auto Offset = Size % N;
-    if (UNLIKELY(Size != 0 && Offset == 0))
-      if (AppendNewChunk() == nullptr)
+    auto *LatestChunk = Tail;
+    if (UNLIKELY(Size != 0 && Offset == 0)) {
+      LatestChunk = AppendNewChunk();
+      if (LatestChunk == nullptr)
        return nullptr;
+    }
 
-    auto Position = reinterpret_cast<T *>(Tail->Block.Data) + Offset;
+    DCHECK_NE(Tail, &SentinelChunk);
+    auto Position = reinterpret_cast<char *>(LatestChunk->Block.Data) +
+                    (Offset * ElementStorageSize);
+    DCHECK_EQ(reinterpret_cast<uintptr_t>(Position) % ElementStorageSize, 0);
    // In-place construct at Position.
-    new (Position) T(std::forward<Args>(args)...);
+    new (Position) T{std::forward<Args>(args)...};
    ++Size;
-    FreeElements -= FreeElements ? 1 : 0;
-    return Position;
+    return reinterpret_cast<T *>(Position);
  }
 
  T &operator[](size_t Offset) const {
@@ -266,20 +286,22 @@
      Offset -= N;
      DCHECK_NE(C, &SentinelChunk);
    }
-    auto Position = reinterpret_cast<T *>(C->Block.Data) + Offset;
-    return *Position;
+    return *reinterpret_cast<T *>(reinterpret_cast<char *>(C->Block.Data) +
+                                  (Offset * ElementStorageSize));
  }
 
  T &front() const {
    DCHECK_NE(Head, &SentinelChunk);
    DCHECK_NE(Size, 0u);
-    return *reinterpret_cast<T *>(Head->Block.Data);
+    return *begin();
  }
 
  T &back() const {
    DCHECK_NE(Tail, &SentinelChunk);
-    auto Offset = (Size - 1) % N;
-    return *(reinterpret_cast<T *>(Tail->Block.Data) + Offset);
+    DCHECK_NE(Size, 0u);
+    auto It = end();
+    --It;
+    return *It;
  }
 
  template <class Predicate> T *find_element(Predicate P) const {
@@ -298,15 +320,18 @@
  /// require allocation of new blocks for new elements added after trimming.
  void trim(size_t Elements) {
    DCHECK_LE(Elements, Size);
+    DCHECK_GT(Size, 0);
+    auto OldSize = Size;
    Size -= Elements;
-    FreeElements += Elements;
 
-    // Here we need to check whether we've cleared enough elements to warrant
-    // putting blocks on to the freelist. We determine whether we need to
-    // right-size the internal list, by keeping track of the number of "free"
-    // elements still in the array.
-    auto ChunksToTrim = FreeElements / N;
-    for (size_t i = 0; i < ChunksToTrim; ++i, FreeElements -= N) {
+    DCHECK_NE(Head, &SentinelChunk);
+    DCHECK_NE(Tail, &SentinelChunk);
+
+    for (auto ChunksToTrim =
+             (nearest_boundary(OldSize, N) - nearest_boundary(Size, N)) / N;
+         ChunksToTrim > 0; --ChunksToTrim) {
+      DCHECK_NE(Head, &SentinelChunk);
+      DCHECK_NE(Tail, &SentinelChunk);
      // Put the tail into the Freelist.
      auto *FreeChunk = Tail;
      Tail = Tail->Prev;
@@ -314,17 +339,21 @@
        Head = Tail;
      else
        Tail->Next = &SentinelChunk;
+
+      DCHECK_EQ(Tail->Next, &SentinelChunk);
      FreeChunk->Next = Freelist;
-      FreeChunk->Prev = Freelist->Prev;
+      FreeChunk->Prev = &SentinelChunk;
+      if (Freelist != &SentinelChunk)
+        Freelist->Prev = FreeChunk;
      Freelist = FreeChunk;
    }
  }
 
  // Provide iterators.
-  Iterator<T> begin() const { return Iterator<T>(Head, 0); }
-  Iterator<T> end() const { return Iterator<T>(Tail, Size); }
-  Iterator<const T> cbegin() const { return Iterator<const T>(Head, 0); }
-  Iterator<const T> cend() const { return Iterator<const T>(Tail, Size); }
+  Iterator<T> begin() const { return Iterator<T>(Head, 0, Size); }
+  Iterator<T> end() const { return Iterator<T>(Tail, Size, Size); }
+  Iterator<const T> cbegin() const { return Iterator<const T>(Head, 0, Size); }
+  Iterator<const T> cend() const { return Iterator<const T>(Tail, Size, Size); }
 };
 
 // We need to have this storage definition out-of-line so that the compiler can
Index: compiler-rt/trunk/lib/xray/xray_utils.h
===================================================================
--- compiler-rt/trunk/lib/xray/xray_utils.h
+++ compiler-rt/trunk/lib/xray/xray_utils.h
@@ -36,6 +36,24 @@
 // file.
 int getLogFD();
 
+constexpr size_t gcd(size_t a, size_t b) {
+  return (b == 0) ? a : gcd(b, a % b);
+}
+
+constexpr size_t lcm(size_t a, size_t b) { return a * b / gcd(a, b); }
+
+template <class T> constexpr T nearest_boundary(T number, T multiple) {
+  return multiple * ((number / multiple) + (number % multiple ? 1 : 0));
+}
+
+constexpr size_t next_pow2_helper(size_t num, size_t acc) {
+  return (1u << acc) >= num ? (1u << acc) : next_pow2_helper(num, acc + 1);
+}
+
+constexpr size_t next_pow2(size_t number) {
+  return next_pow2_helper(number, 1);
+}
+
 } // namespace __xray
 
 #endif // XRAY_UTILS_H
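
As a closing aid, a small self-contained sketch (not part of the patch) that spot-checks the constexpr helpers introduced in xray_utils.h above; the asserted values follow directly from the definitions in the hunk, and the surrounding scaffolding is illustrative only:

  // Illustrative sketch: spot-checks the constexpr helpers added above.
  #include "xray_utils.h"
  #include <cstddef> // for size_t

  namespace __xray {
  // nearest_boundary rounds up to the next multiple; Block::Size and trim() rely on it.
  static_assert(nearest_boundary(size_t{10}, size_t{8}) == 16, "");
  static_assert(nearest_boundary(size_t{16}, size_t{8}) == 16, "");
  // next_pow2 rounds a size up to a power of two; ElementStorageSize uses it.
  // Note the accumulator starts at 1 << 1, so the smallest result is 2.
  static_assert(next_pow2(1) == 2, "");
  static_assert(next_pow2(16) == 16, "");
  static_assert(next_pow2(24) == 32, "");
  // lcm/gcd back the Array<T> default chunk-size computation.
  static_assert(lcm(4, 6) == 12, "");
  } // namespace __xray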