Index: compiler-rt/lib/xray/tests/CMakeLists.txt =================================================================== --- compiler-rt/lib/xray/tests/CMakeLists.txt +++ compiler-rt/lib/xray/tests/CMakeLists.txt @@ -68,7 +68,11 @@ endfunction() set(XRAY_TEST_ARCH ${XRAY_SUPPORTED_ARCH}) -set(XRAY_UNITTEST_LINK_FLAGS ${CMAKE_THREAD_LIBS_INIT}) +set(XRAY_UNITTEST_LINK_FLAGS + ${CMAKE_THREAD_LIBS_INIT} + -fxray-instrument + -lstdc++ # TODO: Detect the correct standard library used! + ) if (NOT APPLE) append_list_if(COMPILER_RT_HAS_LIBM -lm XRAY_UNITTEST_LINK_FLAGS) append_list_if(COMPILER_RT_HAS_LIBRT -lrt XRAY_UNITTEST_LINK_FLAGS) @@ -94,7 +98,7 @@ RUNTIME "${XRAY_RUNTIME_LIBS}" DEPS gtest xray CFLAGS ${XRAY_UNITTEST_CFLAGS} - LINK_FLAGS ${TARGET_LINK_FLAGS} ${XRAY_UNITTEST_LINK_FLAGS} -lstdc++) + LINK_FLAGS ${TARGET_LINK_FLAGS} ${XRAY_UNITTEST_LINK_FLAGS}) set_target_properties(XRayUnitTests PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endforeach() Index: compiler-rt/lib/xray/tests/unit/function_call_trie_test.cc =================================================================== --- compiler-rt/lib/xray/tests/unit/function_call_trie_test.cc +++ compiler-rt/lib/xray/tests/unit/function_call_trie_test.cc @@ -18,12 +18,6 @@ namespace { -TEST(FunctionCallTrieTest, Construction) { - // We want to make sure that we can create one of these without the set of - // allocators we need. This will by default use the global allocators. - FunctionCallTrie Trie; -} - TEST(FunctionCallTrieTest, ConstructWithTLSAllocators) { // FIXME: Support passing in configuration for allocators in the allocator // constructors. @@ -61,6 +55,17 @@ ASSERT_TRUE(R.empty()); } +TEST(FunctionCallTrieTest, NoMatchingEntersForExit) { + auto A = FunctionCallTrie::InitAllocators(); + FunctionCallTrie Trie(A); + Trie.enterFunction(2, 1); + Trie.enterFunction(3, 3); + Trie.exitFunction(1, 5); + const auto &R = Trie.getRoots(); + + ASSERT_TRUE(R.empty()); +} + TEST(FunctionCallTrieTest, MissingFunctionExit) { auto A = FunctionCallTrie::InitAllocators(); FunctionCallTrie Trie(A); @@ -153,6 +158,30 @@ EXPECT_EQ(F1.CumulativeLocalTime, 100); } +TEST(FunctionCallTrieTest, DeepCallStack) { + // Simulate a relatively deep call stack (32 levels) and ensure that we can + // properly pop all the way up the stack. + profilingFlags()->setDefaults(); + auto A = FunctionCallTrie::InitAllocators(); + FunctionCallTrie Trie(A); + for (int i = 0; i < 32; ++i) + Trie.enterFunction(i + 1, i); + Trie.exitFunction(1, 33); + + // Here, validate that we have a 32-level deep function call path from the + // root (1) down to the leaf (33). + const auto &R = Trie.getRoots(); + ASSERT_EQ(R.size(), 1u); + auto F = R[0]; + for (int i = 0; i < 32; ++i) { + EXPECT_EQ(F->FId, i + 1); + if (F->Callees.empty() && i != 31) + FAIL() << "Empty callees for FId " << F->FId; + if (i != 31) + F = F->Callees[0].NodePtr; + } +} + // TODO: Test that we can handle cross-CPU migrations, where TSCs are not // guaranteed to be synchronised. TEST(FunctionCallTrieTest, DeepCopy) { Index: compiler-rt/lib/xray/tests/unit/segmented_array_test.cc =================================================================== --- compiler-rt/lib/xray/tests/unit/segmented_array_test.cc +++ compiler-rt/lib/xray/tests/unit/segmented_array_test.cc @@ -107,32 +107,84 @@ } TEST(SegmentedArrayTest, IteratorTrimBehaviour) { - Array data; - ASSERT_TRUE(data.empty()); - auto I0Begin = data.begin(), I0End = data.end(); - // Add enough elements in data to have more than one chunk. 
- constexpr auto ChunkX2 = Array::ChunkSize * 2; + using AllocatorType = typename Array::AllocatorType; + AllocatorType A(1 << 10, 0); + Array Data(A); + ASSERT_TRUE(Data.empty()); + auto I0Begin = Data.begin(), I0End = Data.end(); + // Add enough elements in Data to have more than one chunk. + constexpr auto Chunk = Array::ChunkSize; + constexpr auto ChunkX2 = Chunk * 2; for (auto i = ChunkX2; i > 0u; --i) { - data.Append({static_cast(i), static_cast(i)}); + Data.AppendEmplace(static_cast(i), static_cast(i)); } - ASSERT_EQ(data.size(), ChunkX2); + ASSERT_EQ(Data.size(), ChunkX2); + { + auto &Back = Data.back(); + ASSERT_EQ(Back.First, 1); + ASSERT_EQ(Back.Second, 1); + } + // Trim one chunk's elements worth. - data.trim(ChunkX2 / 2); - ASSERT_EQ(data.size(), ChunkX2 / 2); + Data.trim(ChunkX2 / 2); + ASSERT_EQ(Data.size(), ChunkX2 / 2); + + // Check that we are still able to access 'back' properly. + { + auto &Back = Data.back(); + ASSERT_EQ(Back.First, static_cast(Chunk + 1)); + ASSERT_EQ(Back.Second, static_cast(Chunk + 1)); + } + // Then trim until it's empty. - data.trim(ChunkX2 / 2); - ASSERT_TRUE(data.empty()); + Data.trim(ChunkX2 / 2); + ASSERT_TRUE(Data.empty()); // Here our iterators should be the same. - auto I1Begin = data.begin(), I1End = data.end(); + auto I1Begin = Data.begin(), I1End = Data.end(); EXPECT_EQ(I0Begin, I1Begin); EXPECT_EQ(I0End, I1End); // Then we ensure that adding elements back works just fine. for (auto i = ChunkX2; i > 0u; --i) { - data.Append({static_cast(i), static_cast(i)}); + Data.Append({static_cast(i), static_cast(i)}); + } + EXPECT_EQ(Data.size(), ChunkX2); +} + +struct ShadowStackEntry { + uint64_t EntryTSC = 0; + uint64_t *NodePtr = nullptr; + ShadowStackEntry(uint64_t T, uint64_t *N) : EntryTSC(T), NodePtr(N) {} +}; + +TEST(SegmentedArrayTest, SimulateStackBehaviour) { + using AllocatorType = typename Array::AllocatorType; + AllocatorType A(1 << 10, 0); + Array Data(A); + static uint64_t Dummy = 0; + constexpr uint64_t Max = 9; + + for (uint64_t i = 0; i < Max; ++i) { + auto P = Data.Append({i, &Dummy}); + ASSERT_NE(P, nullptr); + ASSERT_EQ(P->NodePtr, &Dummy); + auto &Back = Data.back(); + ASSERT_EQ(Back.NodePtr, &Dummy); + ASSERT_EQ(Back.EntryTSC, i); + } + + // Simulate a stack by checking the data from the end as we're trimming. + auto Counter = Max; + ASSERT_EQ(Data.size(), size_t(Max)); + while (!Data.empty()) { + const auto &Top = Data.back(); + uint64_t *TopNode = Top.NodePtr; + EXPECT_EQ(TopNode, &Dummy) << "Counter = " << Counter; + Data.trim(1); + --Counter; + ASSERT_EQ(Data.size(), size_t(Counter)); } - EXPECT_EQ(data.size(), ChunkX2); } } // namespace Index: compiler-rt/lib/xray/xray_allocator.h =================================================================== --- compiler-rt/lib/xray/xray_allocator.h +++ compiler-rt/lib/xray/xray_allocator.h @@ -18,12 +18,12 @@ #include "sanitizer_common/sanitizer_allocator_internal.h" #include "sanitizer_common/sanitizer_common.h" +#include "sanitizer_common/sanitizer_internal_defs.h" #include "sanitizer_common/sanitizer_mutex.h" +#include "xray_utils.h" #include #include -#include "sanitizer_common/sanitizer_internal_defs.h" - namespace __xray { /// The Allocator type hands out fixed-sized chunks of memory that are @@ -40,8 +40,7 @@ // The Allocator returns memory as Block instances. struct Block { /// Compute the minimum cache-line size multiple that is >= N. - static constexpr auto Size = - kCacheLineSize * ((N / kCacheLineSize) + (N % kCacheLineSize ? 
1 : 0)); + static constexpr auto Size = nearest_boundary(N, kCacheLineSize); void *Data = nullptr; }; @@ -61,15 +60,16 @@ // We compute the number of pointers to areas in memory where we consider as // individual blocks we've allocated. To ensure that instances of the - // BlockLink object are cache-line sized, we deduct one additional - // pointers worth representing the pointer to the previous link. + // BlockLink object are cache-line sized, we deduct two pointers worth + // representing the pointer to the previous link and the backing store for + // the whole block. // // This structure corresponds to the following layout: // - // Blocks [ 0, 1, 2, .., BlockPtrCount - 1] + // Blocks [ 0, 1, 2, .., BlockPtrCount - 2] // static constexpr auto BlockPtrCount = - (kCacheLineSize / sizeof(Block *)) - 1; + (kCacheLineSize / sizeof(Block *)) - 2; BlockLink() { // Zero out Blocks. @@ -81,6 +81,7 @@ // FIXME: Align this to cache-line address boundaries? Block Blocks[BlockPtrCount]; BlockLink *Prev = nullptr; + void *BackingStore = nullptr; }; static_assert(sizeof(BlockLink) == kCacheLineSize, @@ -96,17 +97,26 @@ BlockLink *Tail = &NullLink; size_t Counter = 0; - BlockLink *NewChainLink() { + BlockLink *NewChainLink(uint64_t Alignment) { auto NewChain = reinterpret_cast( InternalAlloc(sizeof(BlockLink), nullptr, kCacheLineSize)); auto BackingStore = reinterpret_cast(InternalAlloc( - BlockLink::BlockPtrCount * Block::Size, nullptr, kCacheLineSize)); + (BlockLink::BlockPtrCount + 1) * Block::Size, nullptr, Alignment)); size_t Offset = 0; DCHECK_NE(NewChain, nullptr); DCHECK_NE(BackingStore, nullptr); + NewChain->BackingStore = BackingStore; + + // Here we ensure that the alignment of the pointers we're handing out + // adhere to the alignment requirements of the call to Allocate(). for (auto &B : NewChain->Blocks) { - B.Data = BackingStore + Offset; - Offset += Block::Size; + auto AlignmentAdjustment = + nearest_boundary(reinterpret_cast(BackingStore + Offset), + Alignment) - + reinterpret_cast(BackingStore + Offset); + B.Data = BackingStore + AlignmentAdjustment + Offset; + DCHECK_EQ(reinterpret_cast(B.Data) % Alignment, 0); + Offset += AlignmentAdjustment + Block::Size; } NewChain->Prev = Tail; return NewChain; @@ -117,7 +127,7 @@ // FIXME: Implement PreAllocate support! } - Block Allocate() { + Block Allocate(uint64_t Alignment = 8) { SpinMutexLock Lock(&Mutex); // Check whether we're over quota. if (Counter * Block::Size >= MaxMemory) @@ -128,7 +138,7 @@ Block B{}; BlockLink *Link = Tail; if (UNLIKELY(Counter == 0 || ChainOffset == 0)) - Tail = Link = NewChainLink(); + Tail = Link = NewChainLink(Alignment); B = Link->Blocks[ChainOffset]; ++Counter; @@ -140,7 +150,7 @@ for (auto *C = Tail; C != &NullLink;) { // We know that the data block is a large contiguous page, we deallocate // that at once. - InternalFree(C->Blocks[0].Data); + InternalFree(C->BackingStore); auto Prev = C->Prev; InternalFree(C); C = Prev; Index: compiler-rt/lib/xray/xray_function_call_trie.h =================================================================== --- compiler-rt/lib/xray/xray_function_call_trie.h +++ compiler-rt/lib/xray/xray_function_call_trie.h @@ -17,8 +17,8 @@ #include "xray_profiling_flags.h" #include "xray_segmented_array.h" +#include // For placement new. #include -#include // For placement new. 
namespace __xray { @@ -101,6 +101,8 @@ NodeIdPair(Node *N, int32_t F) : NodePtr(N), FId(F) {} }; + static_assert(sizeof(NodeIdPair) == 16, ""); + using NodeIdPairArray = Array; using NodeIdPairAllocatorType = NodeIdPairArray::AllocatorType; @@ -128,15 +130,12 @@ private: struct ShadowStackEntry { - int32_t FId; // We're copying the function ID into the stack to avoid having - // to reach into the node just to get the function ID. uint64_t EntryTSC; Node *NodePtr; // We add a constructor here to allow us to inplace-construct through // Array<...>'s AppendEmplace. - ShadowStackEntry(int32_t F, uint64_t T, Node *N) - : FId(F), EntryTSC(T), NodePtr(N) {} + ShadowStackEntry(uint64_t T, Node *N) : EntryTSC{T}, NodePtr{N} {} }; using NodeArray = Array; @@ -219,30 +218,30 @@ // TODO: Support configuration of options through the arguments. static Allocators InitAllocators() { + return InitAllocatorsCustom(profilingFlags()->per_thread_allocator_max); + } + + static Allocators InitAllocatorsCustom(uptr Max) { Allocators A; auto NodeAllocator = reinterpret_cast( InternalAlloc(sizeof(Allocators::NodeAllocatorType))); - new (NodeAllocator) Allocators::NodeAllocatorType( - profilingFlags()->per_thread_allocator_max, 0); + new (NodeAllocator) Allocators::NodeAllocatorType(Max, 0); A.NodeAllocator = NodeAllocator; auto RootAllocator = reinterpret_cast( InternalAlloc(sizeof(Allocators::RootAllocatorType))); - new (RootAllocator) Allocators::RootAllocatorType( - profilingFlags()->per_thread_allocator_max, 0); + new (RootAllocator) Allocators::RootAllocatorType(Max, 0); A.RootAllocator = RootAllocator; auto ShadowStackAllocator = reinterpret_cast( InternalAlloc(sizeof(Allocators::ShadowStackAllocatorType))); - new (ShadowStackAllocator) Allocators::ShadowStackAllocatorType( - profilingFlags()->per_thread_allocator_max, 0); + new (ShadowStackAllocator) Allocators::ShadowStackAllocatorType(Max, 0); A.ShadowStackAllocator = ShadowStackAllocator; auto NodeIdPairAllocator = reinterpret_cast( InternalAlloc(sizeof(NodeIdPairAllocatorType))); - new (NodeIdPairAllocator) - NodeIdPairAllocatorType(profilingFlags()->per_thread_allocator_max, 0); + new (NodeIdPairAllocator) NodeIdPairAllocatorType(Max, 0); A.NodeIdPairAllocator = NodeIdPairAllocator; return A; } @@ -253,20 +252,14 @@ ShadowStackArray ShadowStack; NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr; - const Allocators &GetGlobalAllocators() { - static const Allocators A = [] { return InitAllocators(); }(); - return A; - } - public: explicit FunctionCallTrie(const Allocators &A) : Nodes(*A.NodeAllocator), Roots(*A.RootAllocator), ShadowStack(*A.ShadowStackAllocator), NodeIdPairAllocator(A.NodeIdPairAllocator) {} - FunctionCallTrie() : FunctionCallTrie(GetGlobalAllocators()) {} - - void enterFunction(int32_t FId, uint64_t TSC) { + void enterFunction(const int32_t FId, uint64_t TSC) { + DCHECK_NE(FId, 0); // This function primarily deals with ensuring that the ShadowStack is // consistent and ready for when an exit event is encountered. if (UNLIKELY(ShadowStack.empty())) { @@ -275,12 +268,13 @@ if (UNLIKELY(NewRoot == nullptr)) return; Roots.Append(NewRoot); - ShadowStack.AppendEmplace(FId, TSC, NewRoot); + ShadowStack.AppendEmplace(TSC, NewRoot); return; } auto &Top = ShadowStack.back(); auto TopNode = Top.NodePtr; + DCHECK_NE(TopNode, nullptr); // If we've seen this callee before, then we just access that node and place // that on the top of the stack. 
@@ -288,7 +282,7 @@ [FId](const NodeIdPair &NR) { return NR.FId == FId; }); if (Callee != nullptr) { CHECK_NE(Callee->NodePtr, nullptr); - ShadowStack.AppendEmplace(FId, TSC, Callee->NodePtr); + ShadowStack.AppendEmplace(TSC, Callee->NodePtr); return; } @@ -297,8 +291,10 @@ Nodes.AppendEmplace(TopNode, *NodeIdPairAllocator, 0, 0, FId); if (UNLIKELY(NewNode == nullptr)) return; + DCHECK_NE(NewNode, nullptr); TopNode->Callees.AppendEmplace(NewNode, FId); - ShadowStack.AppendEmplace(FId, TSC, NewNode); + ShadowStack.AppendEmplace(TSC, NewNode); + DCHECK_NE(ShadowStack.back().NodePtr, nullptr); return; } @@ -307,14 +303,11 @@ // entered this function before. We do as little processing here as we can, // since most of the hard work would have already been done at function // entry. - if (UNLIKELY(ShadowStack.empty())) - return; - uint64_t CumulativeTreeTime = 0; while (!ShadowStack.empty()) { - auto &Top = ShadowStack.back(); + const auto &Top = ShadowStack.back(); auto TopNode = Top.NodePtr; - auto TopFId = TopNode->FId; + DCHECK_NE(TopNode, nullptr); auto LocalTime = TSC - Top.EntryTSC; TopNode->CallCount++; TopNode->CumulativeLocalTime += LocalTime - CumulativeTreeTime; @@ -322,7 +315,7 @@ ShadowStack.trim(1); // TODO: Update the histogram for the node. - if (TopFId == FId) + if (TopNode->FId == FId) break; } } @@ -344,28 +337,34 @@ // This function must *not* be called with a non-empty FunctionCallTrie |O|. void deepCopyInto(FunctionCallTrie &O) const { DCHECK(O.getRoots().empty()); + + // We then push the root into a stack, to use as the parent marker for new + // nodes we push in as we're traversing depth-first down the call tree. + struct NodeAndParent { + FunctionCallTrie::Node *Node; + FunctionCallTrie::Node *NewNode; + }; + using Stack = Array; + + typename Stack::AllocatorType StackAllocator( + profilingFlags()->stack_allocator_max, 0); + Stack DFSStack(StackAllocator); + for (const auto Root : getRoots()) { // Add a node in O for this root. auto NewRoot = O.Nodes.AppendEmplace( nullptr, *O.NodeIdPairAllocator, Root->CallCount, Root->CumulativeLocalTime, Root->FId); - O.Roots.Append(NewRoot); - // We then push the root into a stack, to use as the parent marker for new - // nodes we push in as we're traversing depth-first down the call tree. - struct NodeAndParent { - FunctionCallTrie::Node *Node; - FunctionCallTrie::Node *NewNode; - }; - using Stack = Array; + // Because we cannot allocate more memory we should bail out right away. + if (UNLIKELY(NewRoot == nullptr)) + return; - typename Stack::AllocatorType StackAllocator( - profilingFlags()->stack_allocator_max, 0); - Stack DFSStack(StackAllocator); + O.Roots.Append(NewRoot); // TODO: Figure out what to do if we fail to allocate any more stack // space. Maybe warn or report once? 
- DFSStack.Append(NodeAndParent{Root, NewRoot}); + DFSStack.AppendEmplace(Root, NewRoot); while (!DFSStack.empty()) { NodeAndParent NP = DFSStack.back(); DCHECK_NE(NP.Node, nullptr); @@ -375,9 +374,10 @@ auto NewNode = O.Nodes.AppendEmplace( NP.NewNode, *O.NodeIdPairAllocator, Callee.NodePtr->CallCount, Callee.NodePtr->CumulativeLocalTime, Callee.FId); - DCHECK_NE(NewNode, nullptr); + if (UNLIKELY(NewNode == nullptr)) + return; NP.NewNode->Callees.AppendEmplace(NewNode, Callee.FId); - DFSStack.Append(NodeAndParent{Callee.NodePtr, NewNode}); + DFSStack.AppendEmplace(Callee.NodePtr, NewNode); } } } @@ -408,6 +408,9 @@ if (R == nullptr) { TargetRoot = O.Nodes.AppendEmplace(nullptr, *O.NodeIdPairAllocator, 0, 0, Root->FId); + if (UNLIKELY(TargetRoot == nullptr)) + return; + O.Roots.Append(TargetRoot); } else { TargetRoot = *R; @@ -430,10 +433,14 @@ if (TargetCallee == nullptr) { auto NewTargetNode = O.Nodes.AppendEmplace( NT.TargetNode, *O.NodeIdPairAllocator, 0, 0, Callee.FId); + + if (UNLIKELY(NewTargetNode == nullptr)) + return; + TargetCallee = NT.TargetNode->Callees.AppendEmplace(NewTargetNode, Callee.FId); } - DFSStack.Append(NodeAndTarget{Callee.NodePtr, TargetCallee->NodePtr}); + DFSStack.AppendEmplace(Callee.NodePtr, TargetCallee->NodePtr); } } } Index: compiler-rt/lib/xray/xray_profile_collector.cc =================================================================== --- compiler-rt/lib/xray/xray_profile_collector.cc +++ compiler-rt/lib/xray/xray_profile_collector.cc @@ -16,8 +16,8 @@ #include "sanitizer_common/sanitizer_common.h" #include "sanitizer_common/sanitizer_vector.h" #include "xray_profiling_flags.h" -#include #include +#include #include namespace __xray { @@ -55,7 +55,8 @@ GlobalAllocators = reinterpret_cast( InternalAlloc(sizeof(FunctionCallTrie::Allocators))); new (GlobalAllocators) FunctionCallTrie::Allocators(); - *GlobalAllocators = FunctionCallTrie::InitAllocators(); + *GlobalAllocators = FunctionCallTrie::InitAllocatorsCustom( + profilingFlags()->global_allocator_max); }); DCHECK_NE(GlobalAllocators, nullptr); @@ -83,12 +84,11 @@ // and is decoupled from the lifetime management required by the managed // allocator we have in XRay. // - Item->Trie = reinterpret_cast( - InternalAlloc(sizeof(FunctionCallTrie))); + Item->Trie = reinterpret_cast(InternalAlloc( + sizeof(FunctionCallTrie), nullptr, alignof(FunctionCallTrie))); DCHECK_NE(Item->Trie, nullptr); new (Item->Trie) FunctionCallTrie(*GlobalAllocators); } - DCHECK_NE(Item, nullptr); T.deepCopyInto(*Item->Trie); } Index: compiler-rt/lib/xray/xray_profiling.cc =================================================================== --- compiler-rt/lib/xray/xray_profiling.cc +++ compiler-rt/lib/xray/xray_profiling.cc @@ -258,9 +258,9 @@ { SpinMutexLock Lock(&ProfilerOptionsMutex); FlagParser ConfigParser; - auto *F = profilingFlags(); - F->setDefaults(); - registerProfilerFlags(&ConfigParser, F); + ProfilerFlags Flags; + Flags.setDefaults(); + registerProfilerFlags(&ConfigParser, &Flags); ConfigParser.ParseString(profilingCompilerDefinedFlags()); const char *Env = GetEnv("XRAY_PROFILING_OPTIONS"); if (Env == nullptr) @@ -271,6 +271,7 @@ ConfigParser.ParseString(static_cast(Options)); if (Verbosity()) ReportUnrecognizedFlags(); + *profilingFlags() = Flags; } // We need to reset the profile data collection implementation now. 
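
Note (illustrative only, not part of the patch): a minimal sketch of how the revised FunctionCallTrie is meant to be used after the changes above, mirroring the unit tests. Allocators are now always constructed explicitly and passed in (the default-constructed trie backed by global allocators has been removed), and an exit pops the shadow stack until the matching function ID is reached. The free function `exampleUsage` below is hypothetical.

// Hypothetical usage sketch for the post-patch FunctionCallTrie API.
#include "xray_function_call_trie.h"
#include "xray_profiling_flags.h"

namespace __xray {
void exampleUsage() {
  profilingFlags()->setDefaults();
  // Allocators must be created explicitly and outlive the trie.
  auto A = FunctionCallTrie::InitAllocators();
  FunctionCallTrie Trie(A);

  Trie.enterFunction(/*FId=*/1, /*TSC=*/1);
  Trie.enterFunction(/*FId=*/2, /*TSC=*/2);
  Trie.exitFunction(/*FId=*/1, /*TSC=*/100); // Pops FId 2, then FId 1.

  // Roots now hold a single node for FId 1 with one callee for FId 2.
  const auto &Roots = Trie.getRoots();
  (void)Roots;
}
} // namespace __xray
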
Index: compiler-rt/lib/xray/xray_segmented_array.h =================================================================== --- compiler-rt/lib/xray/xray_segmented_array.h +++ compiler-rt/lib/xray/xray_segmented_array.h @@ -18,21 +18,13 @@ #include "sanitizer_common/sanitizer_allocator.h" #include "xray_allocator.h" +#include "xray_utils.h" +#include #include #include namespace __xray { -namespace { - -constexpr size_t gcd(size_t a, size_t b) { - return (b == 0) ? a : gcd(b, a % b); -} - -constexpr size_t lcm(size_t a, size_t b) { return a * b / gcd(a, b); } - -} // namespace - /// The Array type provides an interface similar to std::vector<...> but does /// not shrink in size. Once constructed, elements can be appended but cannot be /// removed. The implementation is heavily dependent on the contract provided by @@ -45,10 +37,12 @@ /// size, to allow us to maximise the number of T objects we can place in /// cache-line multiple sized blocks. To get back the number of T's, we divide /// this least common multiple by the size of T. -template +template struct Array { static constexpr size_t ChunkSize = N; - static constexpr size_t AllocatorChunkSize = sizeof(T) * ChunkSize; + static constexpr size_t ElementStorageSize = next_pow2(sizeof(T)); + static constexpr size_t AllocatorChunkSize = ElementStorageSize * ChunkSize; using AllocatorType = Allocator; static_assert(std::is_trivially_destructible::value, "T must be trivially destructible."); @@ -60,8 +54,8 @@ struct Chunk { typename AllocatorType::Block Block; static constexpr size_t Size = N; - Chunk *Prev = nullptr; - Chunk *Next = nullptr; + Chunk *Prev = this; + Chunk *Next = this; }; static Chunk SentinelChunk; @@ -70,7 +64,6 @@ Chunk *Head = &SentinelChunk; Chunk *Tail = &SentinelChunk; size_t Size = 0; - size_t FreeElements = 0; // Here we keep track of chunks in the freelist, to allow us to re-use chunks // when elements are trimmed off the end. @@ -85,17 +78,22 @@ auto *FreeChunk = Freelist; Freelist = FreeChunk->Next; FreeChunk->Next = &SentinelChunk; + Freelist->Prev = &SentinelChunk; return FreeChunk; } - auto Block = Alloc->Allocate(); + auto Block = Alloc->Allocate(ElementStorageSize); if (Block.Data == nullptr) return nullptr; + // TODO: Maybe use a separate managed allocator for Chunk instances? - auto C = reinterpret_cast(InternalAlloc(sizeof(Chunk))); + auto C = reinterpret_cast( + InternalAlloc(sizeof(Chunk), nullptr, kCacheLineSize)); if (C == nullptr) return nullptr; C->Block = Block; + C->Prev = &SentinelChunk; + C->Next = &SentinelChunk; return C; } @@ -116,42 +114,52 @@ auto Chunk = NewChunk(); if (Chunk == nullptr) return nullptr; - Chunk->Prev = &SentinelChunk; - Chunk->Next = &SentinelChunk; - Head = Chunk; - Tail = Chunk; - return Chunk; + DCHECK_EQ(Chunk->Next, &SentinelChunk); + DCHECK_EQ(Chunk->Prev, &SentinelChunk); + return Head = Tail = Chunk; } Chunk *AppendNewChunk() { auto Chunk = NewChunk(); if (Chunk == nullptr) return nullptr; + DCHECK_NE(Tail, &SentinelChunk); + DCHECK_EQ(Tail->Next, &SentinelChunk); + DCHECK_EQ(Chunk->Prev, &SentinelChunk); + DCHECK_EQ(Chunk->Next, &SentinelChunk); Tail->Next = Chunk; Chunk->Prev = Tail; - Chunk->Next = &SentinelChunk; Tail = Chunk; - return Chunk; + return Tail; } // This Iterator models a BidirectionalIterator. 
template class Iterator { - Chunk *C = nullptr; + Chunk *C = &SentinelChunk; size_t Offset = 0; + size_t Size = 0; public: - Iterator(Chunk *IC, size_t Off) : C(IC), Offset(Off) {} + Iterator(Chunk *IC, size_t Off, size_t S) : C(IC), Offset(Off), Size(S) {} + Iterator(const Iterator &) noexcept = default; + Iterator() noexcept = default; + Iterator(Iterator &&) noexcept = default; + Iterator &operator=(const Iterator &) = default; + Iterator &operator=(Iterator &&) = default; + ~Iterator() = default; Iterator &operator++() { - if (++Offset % N) + if (++Offset % N || Offset == Size) return *this; - DCHECK_NE(C, &SentinelChunk); - // At this point, we know that Offset % N == 0, so we must advance the // chunk pointer. DCHECK_EQ(Offset % N, 0); + DCHECK_NE(Offset, Size); + DCHECK_NE(C, &SentinelChunk); + DCHECK_NE(C->Next, &SentinelChunk); C = C->Next; + DCHECK_NE(C, &SentinelChunk); return *this; } @@ -159,10 +167,12 @@ DCHECK_NE(C, &SentinelChunk); DCHECK_GT(Offset, 0); - // We check whether the offset was on a boundary before decrement, to see - // whether we need to retreat to the previous chunk. - if ((Offset-- % N) == 0) + auto PreviousOffset = Offset--; + if (PreviousOffset != Size && PreviousOffset % N == 0) { + DCHECK_NE(C->Prev, &SentinelChunk); C = C->Prev; + } + return *this; } @@ -190,12 +200,16 @@ U &operator*() const { DCHECK_NE(C, &SentinelChunk); - return reinterpret_cast(C->Block.Data)[Offset % N]; + auto RelOff = Offset % N; + return *reinterpret_cast(reinterpret_cast(C->Block.Data) + + (RelOff * ElementStorageSize)); } U *operator->() const { DCHECK_NE(C, &SentinelChunk); - return reinterpret_cast(C->Block.Data) + (Offset % N); + auto RelOff = Offset % N; + return reinterpret_cast(reinterpret_cast(C->Block.Data) + + (RelOff * ElementStorageSize)); } }; @@ -232,10 +246,11 @@ if (AppendNewChunk() == nullptr) return nullptr; - auto Position = reinterpret_cast(Tail->Block.Data) + Offset; + auto Position = + reinterpret_cast(reinterpret_cast(Tail->Block.Data) + + (Offset * ElementStorageSize)); *Position = E; ++Size; - FreeElements -= FreeElements ? 1 : 0; return Position; } @@ -245,16 +260,21 @@ return nullptr; auto Offset = Size % N; - if (UNLIKELY(Size != 0 && Offset == 0)) - if (AppendNewChunk() == nullptr) + auto *LatestChunk = Tail; + if (UNLIKELY(Size != 0 && Offset == 0)) { + LatestChunk = AppendNewChunk(); + if (LatestChunk == nullptr) return nullptr; + } - auto Position = reinterpret_cast(Tail->Block.Data) + Offset; + DCHECK_NE(Tail, &SentinelChunk); + auto Position = reinterpret_cast(LatestChunk->Block.Data) + + (Offset * ElementStorageSize); + DCHECK_EQ(reinterpret_cast(Position) % ElementStorageSize, 0); // In-place construct at Position. - new (Position) T(std::forward(args)...); + new (Position) T{std::forward(args)...}; ++Size; - FreeElements -= FreeElements ? 
1 : 0; - return Position; + return reinterpret_cast(Position); } T &operator[](size_t Offset) const { @@ -266,20 +286,22 @@ Offset -= N; DCHECK_NE(C, &SentinelChunk); } - auto Position = reinterpret_cast(C->Block.Data) + Offset; - return *Position; + return *reinterpret_cast(reinterpret_cast(C->Block.Data) + + (Offset * ElementStorageSize)); } T &front() const { DCHECK_NE(Head, &SentinelChunk); DCHECK_NE(Size, 0u); - return *reinterpret_cast(Head->Block.Data); + return *begin(); } T &back() const { DCHECK_NE(Tail, &SentinelChunk); - auto Offset = (Size - 1) % N; - return *(reinterpret_cast(Tail->Block.Data) + Offset); + DCHECK_NE(Size, 0u); + auto It = end(); + --It; + return *It; } template T *find_element(Predicate P) const { @@ -298,15 +320,18 @@ /// require allocation of new blocks for new elements added after trimming. void trim(size_t Elements) { DCHECK_LE(Elements, Size); + DCHECK_GT(Size, 0); + auto OldSize = Size; Size -= Elements; - FreeElements += Elements; - - // Here we need to check whether we've cleared enough elements to warrant - // putting blocks on to the freelist. We determine whether we need to - // right-size the internal list, by keeping track of the number of "free" - // elements still in the array. - auto ChunksToTrim = FreeElements / N; - for (size_t i = 0; i < ChunksToTrim; ++i, FreeElements -= N) { + + DCHECK_NE(Head, &SentinelChunk); + DCHECK_NE(Tail, &SentinelChunk); + + for (auto ChunksToTrim = + nearest_boundary(OldSize - nearest_boundary(Size, N), N) / N; + ChunksToTrim > 0; --ChunksToTrim) { + DCHECK_NE(Head, &SentinelChunk); + DCHECK_NE(Tail, &SentinelChunk); // Put the tail into the Freelist. auto *FreeChunk = Tail; Tail = Tail->Prev; @@ -314,17 +339,21 @@ Head = Tail; else Tail->Next = &SentinelChunk; + + DCHECK_EQ(Tail->Next, &SentinelChunk); FreeChunk->Next = Freelist; - FreeChunk->Prev = Freelist->Prev; + FreeChunk->Prev = &SentinelChunk; + if (Freelist != &SentinelChunk) + Freelist->Prev = FreeChunk; Freelist = FreeChunk; } } // Provide iterators. - Iterator begin() const { return Iterator(Head, 0); } - Iterator end() const { return Iterator(Tail, Size); } - Iterator cbegin() const { return Iterator(Head, 0); } - Iterator cend() const { return Iterator(Tail, Size); } + Iterator begin() const { return Iterator(Head, 0, Size); } + Iterator end() const { return Iterator(Tail, Size, Size); } + Iterator cbegin() const { return Iterator(Head, 0, Size); } + Iterator cend() const { return Iterator(Tail, Size, Size); } }; // We need to have this storage definition out-of-line so that the compiler can Index: compiler-rt/lib/xray/xray_utils.h =================================================================== --- compiler-rt/lib/xray/xray_utils.h +++ compiler-rt/lib/xray/xray_utils.h @@ -36,6 +36,24 @@ // file. int getLogFD(); +constexpr size_t gcd(size_t a, size_t b) { + return (b == 0) ? a : gcd(b, a % b); +} + +constexpr size_t lcm(size_t a, size_t b) { return a * b / gcd(a, b); } + +template constexpr T nearest_boundary(T number, T multiple) { + return multiple * ((number / multiple) + (number % multiple ? 1 : 0)); +} + +constexpr size_t next_pow2_helper(size_t num, size_t acc) { + return (1u << acc) >= num ? 
(1u << acc) : next_pow2_helper(num, acc + 1);
+}
+
+constexpr size_t next_pow2(size_t number) {
+  return next_pow2_helper(number, 1);
+}
+
 } // namespace __xray
 
 #endif // XRAY_UTILS_H
Index: llvm/include/llvm/XRay/Profile.h
===================================================================
--- /dev/null
+++ llvm/include/llvm/XRay/Profile.h
@@ -0,0 +1,109 @@
+//===- Profile.h - XRay Profile Abstraction -------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines the XRay Profile class representing the latency profile generated by
+// XRay's profiling mode.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_XRAY_PROFILE_H
+#define LLVM_XRAY_PROFILE_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Error.h"
+#include <list>
+#include <utility>
+#include <vector>
+
+namespace llvm {
+namespace xray {
+
+class Profile;
+
+/// This function will attempt to load an XRay Profiling Mode profile from the
+/// provided |Filename|.
+Expected<Profile> loadProfile(StringRef Filename);
+
+/// This algorithm will merge two Profile instances into a single Profile
+/// instance, aggregating blocks by Thread ID.
+Profile mergeProfilesByThread(const Profile &L, const Profile &R);
+
+/// Profile instances are thread-compatible.
+class Profile {
+public:
+  using ThreadID = uint64_t;
+  using PathID = unsigned;
+  using FuncID = int32_t;
+
+  struct Data {
+    uint64_t CallCount;
+    uint64_t CumulativeLocalTime;
+  };
+
+  struct Block {
+    ThreadID Thread;
+    std::vector<std::pair<PathID, Data>> PathData;
+  };
+
+  /// Provides a sequence of function IDs from a previously interned PathID.
+  Expected<std::vector<FuncID>> path(PathID P) const;
+
+  /// The stack represented in |P| must be in stack order (leaf to root).
+  PathID internPath(ArrayRef<FuncID> P);
+
+  /// Appends a fully-formed Block instance into the Profile.
+  Error addBlock(Block &&B);
+
+  // No copies.
+  Profile(const Profile &) = delete;
+  Profile &operator=(const Profile &) = delete;
+
+  // Move only.
+  Profile(Profile &&) noexcept = default;
+  Profile &operator=(Profile &&) noexcept = default;
+
+  Profile() = default;
+  ~Profile() = default;
+
+private:
+  using BlockList = std::list<Block>;
+
+  struct TrieNode {
+    FuncID Func = 0;
+    std::vector<TrieNode *> Callees{};
+    TrieNode *Caller = nullptr;
+    PathID ID = 0;
+  };
+
+  // List of blocks associated with a Profile.
+  BlockList Blocks;
+
+  // List of TrieNode elements we've seen.
+  std::list<TrieNode> NodeStorage;
+
+  // List of call stack roots.
+  SmallVector<TrieNode *, 4> Roots;
+
+  // Reverse mapping between a PathID to a TrieNode*.
+  DenseMap<PathID, TrieNode *> PathIDMap;
+
+  // Used to increment the next available PathID.
+  PathID NextID = 1;
+
+public:
+  using const_iterator = BlockList::const_iterator;
+  const_iterator begin() const { return Blocks.begin(); }
+  const_iterator end() const { return Blocks.end(); }
+};
+
+} // namespace xray
+} // namespace llvm
+
+#endif
Index: llvm/include/llvm/XRay/Trace.h
===================================================================
--- llvm/include/llvm/XRay/Trace.h
+++ llvm/include/llvm/XRay/Trace.h
@@ -18,7 +18,6 @@
 
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Error.h"
-#include "llvm/Support/FileSystem.h"
 #include "llvm/XRay/XRayRecord.h"
 
 namespace llvm {
@@ -62,7 +61,9 @@
 };
 
 /// This function will attempt to load XRay trace records from the provided
-/// |Filename|.
+/// |Filename|. If the file provided contains an XRay Profile, this will error
+/// out and suggest to the user that the llvm/XRay/Profile.h header and
+/// `loadProfile(...)` function be called instead.
 Expected<Trace> loadTraceFile(StringRef Filename, bool Sort = false);
 
 } // namespace xray
Index: llvm/lib/XRay/CMakeLists.txt
===================================================================
--- llvm/lib/XRay/CMakeLists.txt
+++ llvm/lib/XRay/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_llvm_library(LLVMXRay
   InstrumentationMap.cpp
+  Profile.cpp
   Trace.cpp
 
   ADDITIONAL_HEADER_DIRS
Index: llvm/lib/XRay/Profile.cpp
===================================================================
--- /dev/null
+++ llvm/lib/XRay/Profile.cpp
@@ -0,0 +1,280 @@
+//===- Profile.cpp - XRay Profile Abstraction -----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines the XRay Profile class representing the latency profile generated by
+// XRay's profiling mode.
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/XRay/Profile.h"
+
+#include "llvm/Support/DataExtractor.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FileSystem.h"
+#include <memory>
+#include <utility>
+
+namespace llvm {
+namespace xray {
+
+namespace {
+
+struct BlockHeader {
+  uint32_t Size;
+  uint32_t Number;
+  uint64_t Thread;
+};
+
+static Expected<BlockHeader> readBlockHeader(DataExtractor &Extractor,
+                                             uint32_t &Offset) {
+  BlockHeader H;
+  uint32_t CurrentOffset = Offset;
+  H.Size = Extractor.getU32(&Offset);
+  if (Offset == CurrentOffset)
+    return make_error<StringError>(
+        Twine("Error parsing block header size at offset '") +
+            Twine(CurrentOffset) + "'",
+        std::make_error_code(std::errc::invalid_argument));
+  CurrentOffset = Offset;
+  H.Number = Extractor.getU32(&Offset);
+  if (Offset == CurrentOffset)
+    return make_error<StringError>(
+        Twine("Error parsing block header number at offset '") +
+            Twine(CurrentOffset) + "'",
+        std::make_error_code(std::errc::invalid_argument));
+  CurrentOffset = Offset;
+  H.Thread = Extractor.getU64(&Offset);
+  if (Offset == CurrentOffset)
+    return make_error<StringError>(
+        Twine("Error parsing block header thread id at offset '") +
+            Twine(CurrentOffset) + "'",
+        std::make_error_code(std::errc::invalid_argument));
+  return H;
+}
+
+static Expected<std::vector<Profile::FuncID>>
+readPath(DataExtractor &Extractor, uint32_t &Offset) {
+  // We're reading a sequence of int32_t's until we find a 0.
+  std::vector<Profile::FuncID> Path;
+  auto CurrentOffset = Offset;
+  int32_t FuncId;
+  do {
+    FuncId = Extractor.getSigned(&Offset, 4);
+    if (CurrentOffset == Offset)
+      return make_error<StringError>(
+          Twine("Error parsing path at offset '") + Twine(CurrentOffset) + "'",
+          std::make_error_code(std::errc::invalid_argument));
+    CurrentOffset = Offset;
+    Path.push_back(FuncId);
+  } while (FuncId != 0);
+  return std::move(Path);
+}
+
+static Expected<Profile::Data> readData(DataExtractor &Extractor,
+                                        uint32_t &Offset) {
+  // We expect a certain number of elements for Data:
+  //   - A 64-bit CallCount
+  //   - A 64-bit CumulativeLocalTime counter
+  Profile::Data D;
+  auto CurrentOffset = Offset;
+  D.CallCount = Extractor.getU64(&Offset);
+  if (CurrentOffset == Offset)
+    return make_error<StringError>(
+        Twine("Error parsing call counts at offset '") + Twine(CurrentOffset) +
+            "'",
+        std::make_error_code(std::errc::invalid_argument));
+  CurrentOffset = Offset;
+  D.CumulativeLocalTime = Extractor.getU64(&Offset);
+  if (CurrentOffset == Offset)
+    return make_error<StringError>(
+        Twine("Error parsing cumulative local time at offset '") +
+            Twine(CurrentOffset) + "'",
+        std::make_error_code(std::errc::invalid_argument));
+  return D;
+}
+
+} // namespace
+
+Profile::PathID Profile::internPath(ArrayRef<FuncID> Path) {
+  if (Path.empty())
+    return 0;
+
+  auto RevPath = reverse(Path);
+
+  // Find the root.
+  auto It = RevPath.begin();
+  auto PathRoot = *It++;
+  auto RootIt =
+      find_if(Roots, [PathRoot](TrieNode *N) { return N->Func == PathRoot; });
+
+  // If we've not seen this root before, remember it.
+  TrieNode *Node = nullptr;
+  if (RootIt == Roots.end()) {
+    NodeStorage.emplace_back();
+    Node = &NodeStorage.back();
+    Node->Func = PathRoot;
+    Roots.push_back(Node);
+  } else {
+    Node = *RootIt;
+  }
+
+  // Now traverse the path, re-creating if necessary.
+  while (It != RevPath.end()) {
+    auto NodeFuncID = *It++;
+    auto CalleeIt = find_if(Node->Callees, [NodeFuncID](TrieNode *N) {
+      return N->Func == NodeFuncID;
+    });
+    if (CalleeIt == Node->Callees.end()) {
+      NodeStorage.emplace_back();
+      auto NewNode = &NodeStorage.back();
+      NewNode->Func = NodeFuncID;
+      NewNode->Caller = Node;
+      Node->Callees.push_back(NewNode);
+      Node = NewNode;
+    } else {
+      Node = *CalleeIt;
+    }
+  }
+
+  // At this point, Node *must* be pointing at the leaf.
+  assert(Node->Func == Path.front());
+  if (Node->ID == 0) {
+    Node->ID = NextID++;
+    PathIDMap.insert({Node->ID, Node});
+  }
+  return Node->ID;
+}
+
+Error Profile::addBlock(Block &&B) {
+  if (B.Thread == 0)
+    return make_error<StringError>(
+        "Block may not have Thread ID 0.",
+        std::make_error_code(std::errc::invalid_argument));
+  if (B.PathData.empty())
+    return make_error<StringError>(
+        "Block may not have empty path data.",
+        std::make_error_code(std::errc::invalid_argument));
+
+  Blocks.emplace_back(std::move(B));
+  return Error::success();
+}
+
+Expected<std::vector<Profile::FuncID>> Profile::path(Profile::PathID P) const {
+  auto It = PathIDMap.find(P);
+  if (It == PathIDMap.end())
+    return make_error<StringError>(
+        Twine("PathID not found: ") + Twine(P),
+        std::make_error_code(std::errc::invalid_argument));
+  std::vector<FuncID> Path;
+  for (auto Node = It->second; Node; Node = Node->Caller)
+    Path.push_back(Node->Func);
+  return std::move(Path);
+}
+
+Profile mergeProfilesByThread(const Profile &L, const Profile &R) {
+  Profile Merged;
+  using PathDataMap = DenseMap<Profile::PathID, Profile::Data>;
+  using PathDataMapPtr = std::unique_ptr<PathDataMap>;
+  using PathDataVector = decltype(Profile::Block::PathData);
+  DenseMap<Profile::ThreadID, PathDataMapPtr> ThreadProfileIndex;
+  for (const auto &Block : L) {
+    auto Res = ThreadProfileIndex.insert(
+        {Block.Thread, PathDataMapPtr{new PathDataMap()}});
+    auto &It = Res.first;
+    for (const auto &PathAndData : Block.PathData) {
+      auto &PathID = PathAndData.first;
+      auto &Data = PathAndData.second;
+      auto NewPathID = Merged.internPath(cantFail(L.path(PathID)));
+      It->second->insert({NewPathID, Data});
+    }
+  }
+  for (const auto &Block : R) {
+    auto Res = ThreadProfileIndex.insert(
+        {Block.Thread, PathDataMapPtr{new PathDataMap()}});
+    auto &It = Res.first;
+    for (const auto &PathAndData : Block.PathData) {
+      auto &PathID = PathAndData.first;
+      auto &Data = PathAndData.second;
+      auto NewPathID = Merged.internPath(cantFail(R.path(PathID)));
+      PathDataMap::iterator PathDataIt;
+      bool Inserted;
+      std::tie(PathDataIt, Inserted) = It->second->insert({NewPathID, Data});
+      if (!Inserted) {
+        auto &ExistingData = PathDataIt->second;
+        ExistingData.CallCount += Data.CallCount;
+        ExistingData.CumulativeLocalTime += Data.CumulativeLocalTime;
+      }
+    }
+  }
+
+  for (const auto &IndexedThreadBlock : ThreadProfileIndex) {
+    PathDataVector PathAndData;
+    PathAndData.reserve(IndexedThreadBlock.second->size());
+    copy(*IndexedThreadBlock.second, std::back_inserter(PathAndData));
+    cantFail(
+        Merged.addBlock({IndexedThreadBlock.first, std::move(PathAndData)}));
+  }
+  return Merged;
+}
+
+Expected<Profile> loadProfile(StringRef Filename) {
+  int Fd;
+  if (auto EC = sys::fs::openFileForRead(Filename, Fd))
+    return make_error<StringError>(
+        Twine("Cannot read profile from '") + Filename + "'", EC);
+
+  uint64_t FileSize;
+  if (auto EC = sys::fs::file_size(Filename, FileSize))
+    return make_error<StringError>(
+        Twine("Cannot get filesize of '") + Filename + "'", EC);
+
+  std::error_code EC;
+  sys::fs::mapped_file_region MappedFile(
+      Fd, sys::fs::mapped_file_region::mapmode::readonly, FileSize, 0, EC);
+  if (EC)
+    return make_error<StringError>(
+        Twine("Cannot mmap profile '") + Filename + "'", EC);
+  StringRef Data(MappedFile.data(), MappedFile.size());
+
+  Profile P;
+  uint32_t Offset = 0;
+  DataExtractor Extractor(Data, true, 8);
+
+  // For each block we get from the file:
+  while (Offset != MappedFile.size()) {
+    auto HeaderOrError = readBlockHeader(Extractor, Offset);
+    if (!HeaderOrError)
+      return HeaderOrError.takeError();
+
+    // TODO: Maybe store this header information for each block, even just for
+    // debugging?
+    const auto &Header = HeaderOrError.get();
+
+    // Read in the path data.
+    auto PathOrError = readPath(Extractor, Offset);
+    if (!PathOrError)
+      return PathOrError.takeError();
+    const auto &Path = PathOrError.get();
+
+    // For each path we encounter, we should intern it to get a PathID.
+    auto DataOrError = readData(Extractor, Offset);
+    if (!DataOrError)
+      return DataOrError.takeError();
+    auto &Data = DataOrError.get();
+
+    if (auto E =
+            P.addBlock(Profile::Block{Profile::ThreadID{Header.Thread},
+                                      {{P.internPath(Path), std::move(Data)}}}))
+      return std::move(E);
+  }
+
+  return P;
+}
+
+} // namespace xray
+} // namespace llvm
Index: llvm/lib/XRay/Trace.cpp
===================================================================
--- llvm/lib/XRay/Trace.cpp
+++ llvm/lib/XRay/Trace.cpp
@@ -30,16 +30,34 @@
 
   // FIXME: Maybe deduce whether the data is little or big-endian using some
   // magic bytes in the beginning of the file?
 
-  // First 32 bytes of the file will always be the header. We assume a certain
-  // format here:
+  // First 32 bytes of the file will always be the header. For FDR and Basic
+  // mode, we assume a specific format:
   //
   //   (2)   uint16 : version
   //   (2)   uint16 : type
   //   (4)   uint32 : bitfield
   //   (8)   uint64 : cycle frequency
   //   (16)  -      : padding
+  //
+  // We also look for the Profiling Mode output, which uses an alternate data
+  // format using a set of magic bytes at the beginning of the file. If we see
+  // the first 64 bits (8 bytes) to be that set of magic bytes, then we bail
+  // out advising the caller to use a separate loader. We're doing this because
+  // the data format for the profiling mode doesn't provide access to specific
+  // events compared to FDR and Basic mode.
   DataExtractor HeaderExtractor(Data, true, 8);
+
+  {
+    uint32_t OffsetPtr = 0;
+    uint64_t MagicBytes = HeaderExtractor.getU64(&OffsetPtr);
+    if (MagicBytes == 0x7872617970726f66)
+      return make_error<StringError>(
+          Twine("Detected XRay profiling mode file; use the `Profile` loader "
+                "instead."),
+          std::make_error_code(std::errc::invalid_argument));
+  }
+
   uint32_t OffsetPtr = 0;
   FileHeader.Version = HeaderExtractor.getU16(&OffsetPtr);
   FileHeader.Type = HeaderExtractor.getU16(&OffsetPtr);
@@ -122,8 +140,8 @@
         return make_error<StringError>(
             Twine("Corrupted log, found arg payload following non-matching "
                   "function + thread record. Record for function ") +
-                Twine(Record.FuncId) + " != " + Twine(FuncId) + "; offset: " +
-                Twine(S.data() - Data.data()),
+                Twine(Record.FuncId) + " != " + Twine(FuncId) +
+                "; offset: " + Twine(S.data() - Data.data()),
             std::make_error_code(std::errc::executable_format_error));
 
       // Advance another four bytes to avoid padding.
OffsetPtr += 4; @@ -708,9 +726,9 @@ if (Sort) std::stable_sort(T.Records.begin(), T.Records.end(), - [&](const XRayRecord &L, const XRayRecord &R) { - return L.TSC < R.TSC; - }); + [&](const XRayRecord &L, const XRayRecord &R) { + return L.TSC < R.TSC; + }); return std::move(T); } Index: llvm/tools/llvm-xray/xray-stacks.cpp =================================================================== --- llvm/tools/llvm-xray/xray-stacks.cpp +++ llvm/tools/llvm-xray/xray-stacks.cpp @@ -29,6 +29,7 @@ #include "llvm/Support/FormatVariadic.h" #include "llvm/XRay/Graph.h" #include "llvm/XRay/InstrumentationMap.h" +#include "llvm/XRay/Profile.h" #include "llvm/XRay/Trace.h" using namespace llvm; @@ -723,9 +724,18 @@ symbolize::LLVMSymbolizer Symbolizer(Opts); FuncIdConversionHelper FuncIdHelper(StacksInstrMap, Symbolizer, Map.getFunctionAddresses()); + // TODO: Someday, support output to files instead of just directly to // standard output. for (const auto &Filename : StackInputs) { + // First, we try to load the file using the Profile loader. If we happen to + // get a Profile object, we can then translate those directly into the + // format that we need. + auto ProfileOrErr = loadProfile(Filename); + if (!ProfileOrErr) + logAllUnhandledErrors(ProfileOrErr.takeError(), errs(), ""); + + // TODO: Handle Profile objects. auto TraceOrErr = loadTraceFile(Filename); if (!TraceOrErr) { if (!StackKeepGoing) Index: llvm/unittests/XRay/CMakeLists.txt =================================================================== --- llvm/unittests/XRay/CMakeLists.txt +++ llvm/unittests/XRay/CMakeLists.txt @@ -1,9 +1,11 @@ set(LLVM_LINK_COMPONENTS Support + XRay ) add_llvm_unittest(XRayTests GraphTest.cpp + ProfileTest.cpp ) add_dependencies(XRayTests intrinsics_gen) Index: llvm/unittests/XRay/ProfileTest.cpp =================================================================== --- /dev/null +++ llvm/unittests/XRay/ProfileTest.cpp @@ -0,0 +1,70 @@ +//===- ProfileTest.cpp - XRay Profile unit tests ----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +#include "llvm/XRay/Profile.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +namespace llvm { +namespace xray { +namespace { + +using ::testing::AllOf; +using ::testing::ElementsAre; +using ::testing::Eq; +using ::testing::Field; +using ::testing::Not; +using ::testing::Pair; + +TEST(ProfileTest, CreateProfile) { Profile P; } + +TEST(ProfileTest, InternPath) { + Profile P; + auto Path0 = P.internPath({3, 2, 1}); + auto Path1 = P.internPath({3, 2, 1}); + auto Path2 = P.internPath({2, 1}); + EXPECT_THAT(Path0, Eq(Path1)); + EXPECT_THAT(Path0, Not(Eq(Path2))); +} + +TEST(ProfileTest, AddBlocks) { + Profile P; + EXPECT_TRUE(errorToBool(P.addBlock({}))); + EXPECT_TRUE(errorToBool(P.addBlock({1, {}}))); + EXPECT_FALSE(errorToBool(P.addBlock( + Profile::Block{Profile::ThreadID{1}, + { + {P.internPath({2, 1}), Profile::Data{1, 1000}}, + {P.internPath({3, 2, 1}), Profile::Data{10, 100}}, + }}))); +} + +TEST(ProfileTest, MergeProfilesByThread) { + Profile P0, P1; + EXPECT_FALSE(errorToBool(P0.addBlock( + Profile::Block{Profile::ThreadID{1}, + {{P0.internPath({2, 1}), Profile::Data{1, 1000}}}}))); + EXPECT_FALSE(errorToBool(P1.addBlock( + Profile::Block{Profile::ThreadID{1}, + {{P1.internPath({2, 1}), Profile::Data{1, 1000}}}}))); + + Profile Merged = mergeProfilesByThread(P0, P1); + EXPECT_THAT(Merged, + ElementsAre(AllOf( + Field(&Profile::Block::Thread, Eq(Profile::ThreadID{1})), + Field(&Profile::Block::PathData, + ElementsAre(Pair( + Merged.internPath({2, 1}), + AllOf(Field(&Profile::Data::CallCount, Eq(2u)), + Field(&Profile::Data::CumulativeLocalTime, + Eq(2000u))))))))); +} + +} // namespace +} // namespace xray +} // namespace llvm
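
Note (illustrative only, not part of the patch): a short sketch of how a tool such as llvm-xray might consume the new Profile API once the `// TODO: Handle Profile objects.` in xray-stacks.cpp is addressed. It relies only on the declarations introduced in llvm/XRay/Profile.h above; the helper `dumpProfile` is hypothetical.

// Hypothetical consumer of llvm::xray::Profile, using only the API added above.
#include "llvm/Support/raw_ostream.h"
#include "llvm/XRay/Profile.h"

static llvm::Error dumpProfile(llvm::StringRef Filename) {
  using namespace llvm;
  using namespace llvm::xray;
  auto ProfileOrErr = loadProfile(Filename);
  if (!ProfileOrErr)
    return ProfileOrErr.takeError();
  auto &P = *ProfileOrErr;
  for (const auto &Block : P) {
    outs() << "Thread " << Block.Thread << ":\n";
    for (const auto &PathAndData : Block.PathData) {
      // Expand the interned PathID back into a leaf-to-root list of FuncIDs.
      auto PathOrErr = P.path(PathAndData.first);
      if (!PathOrErr)
        return PathOrErr.takeError();
      for (const auto FId : *PathOrErr)
        outs() << FId << ' ';
      outs() << "| count=" << PathAndData.second.CallCount
             << " time=" << PathAndData.second.CumulativeLocalTime << '\n';
    }
  }
  return Error::success();
}
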