Index: compiler-rt/trunk/lib/xray/tests/unit/function_call_trie_test.cc =================================================================== --- compiler-rt/trunk/lib/xray/tests/unit/function_call_trie_test.cc +++ compiler-rt/trunk/lib/xray/tests/unit/function_call_trie_test.cc @@ -10,9 +10,9 @@ // This file is a part of XRay, a function call tracing system. // //===----------------------------------------------------------------------===// -#include "gtest/gtest.h" - #include "xray_function_call_trie.h" +#include "gtest/gtest.h" +#include namespace __xray { @@ -29,26 +29,54 @@ auto A = FunctionCallTrie::InitAllocators(); FunctionCallTrie Trie(A); - Trie.enterFunction(1, 1); - Trie.exitFunction(1, 2); + uint64_t TSC = 1; + uint16_t CPU = 0; + Trie.enterFunction(1, TSC++, CPU++); + Trie.exitFunction(1, TSC++, CPU++); + const auto &R = Trie.getRoots(); - // We need a way to pull the data out. At this point, until we get a data - // collection service implemented, we're going to export the data as a list of - // roots, and manually walk through the structure ourselves. + ASSERT_EQ(R.size(), 1u); + ASSERT_EQ(R.front()->FId, 1); + ASSERT_EQ(R.front()->CallCount, 1u); + ASSERT_EQ(R.front()->CumulativeLocalTime, 1u); +} + +TEST(FunctionCallTrieTest, HandleTSCOverflow) { + profilingFlags()->setDefaults(); + auto A = FunctionCallTrie::InitAllocators(); + FunctionCallTrie Trie(A); + Trie.enterFunction(1, std::numeric_limits::max(), 0); + Trie.exitFunction(1, 1, 0); const auto &R = Trie.getRoots(); ASSERT_EQ(R.size(), 1u); ASSERT_EQ(R.front()->FId, 1); - ASSERT_EQ(R.front()->CallCount, 1); + ASSERT_EQ(R.front()->CallCount, 1u); ASSERT_EQ(R.front()->CumulativeLocalTime, 1u); } +TEST(FunctionCallTrieTest, MaximalCumulativeTime) { + profilingFlags()->setDefaults(); + auto A = FunctionCallTrie::InitAllocators(); + FunctionCallTrie Trie(A); + + Trie.enterFunction(1, 1, 0); + Trie.exitFunction(1, 0, 0); + const auto &R = Trie.getRoots(); + + ASSERT_EQ(R.size(), 1u); + ASSERT_EQ(R.front()->FId, 1); + ASSERT_EQ(R.front()->CallCount, 1u); + ASSERT_EQ(R.front()->CumulativeLocalTime, + std::numeric_limits::max() - 1); +} + TEST(FunctionCallTrieTest, MissingFunctionEntry) { profilingFlags()->setDefaults(); auto A = FunctionCallTrie::InitAllocators(); FunctionCallTrie Trie(A); - Trie.exitFunction(1, 1); + Trie.exitFunction(1, 1, 0); const auto &R = Trie.getRoots(); ASSERT_TRUE(R.empty()); @@ -58,9 +86,9 @@ profilingFlags()->setDefaults(); auto A = FunctionCallTrie::InitAllocators(); FunctionCallTrie Trie(A); - Trie.enterFunction(2, 1); - Trie.enterFunction(3, 3); - Trie.exitFunction(1, 5); + Trie.enterFunction(2, 1, 0); + Trie.enterFunction(3, 3, 0); + Trie.exitFunction(1, 5, 0); const auto &R = Trie.getRoots(); ASSERT_FALSE(R.empty()); @@ -71,7 +99,7 @@ profilingFlags()->setDefaults(); auto A = FunctionCallTrie::InitAllocators(); FunctionCallTrie Trie(A); - Trie.enterFunction(1, 1); + Trie.enterFunction(1, 1, 0); const auto &R = Trie.getRoots(); ASSERT_FALSE(R.empty()); @@ -84,12 +112,12 @@ FunctionCallTrie Trie(A); // Enter and exit FId = 1. - Trie.enterFunction(1, 1); - Trie.exitFunction(1, 2); + Trie.enterFunction(1, 1, 0); + Trie.exitFunction(1, 2, 0); // Enter and exit FId = 2. - Trie.enterFunction(2, 3); - Trie.exitFunction(2, 4); + Trie.enterFunction(2, 3, 0); + Trie.exitFunction(2, 4, 0); const auto &R = Trie.getRoots(); ASSERT_FALSE(R.empty()); @@ -126,11 +154,11 @@ auto A = FunctionCallTrie::InitAllocators(); FunctionCallTrie Trie(A); - Trie.enterFunction(1, 0); - Trie.enterFunction(2, 100); - Trie.enterFunction(3, 200); - Trie.exitFunction(3, 300); - Trie.exitFunction(1, 400); + Trie.enterFunction(1, 0, 0); + Trie.enterFunction(2, 100, 0); + Trie.enterFunction(3, 200, 0); + Trie.exitFunction(3, 300, 0); + Trie.exitFunction(1, 400, 0); // What we should see at this point is all the functions in the trie in a // specific order (1 -> 2 -> 3) with the appropriate count(s) and local @@ -153,12 +181,12 @@ // Now that we've established the preconditions, we check for specific aspects // of the nodes. - EXPECT_EQ(F3.CallCount, 1); - EXPECT_EQ(F2.CallCount, 1); - EXPECT_EQ(F1.CallCount, 1); - EXPECT_EQ(F3.CumulativeLocalTime, 100); - EXPECT_EQ(F2.CumulativeLocalTime, 300); - EXPECT_EQ(F1.CumulativeLocalTime, 100); + EXPECT_EQ(F3.CallCount, 1u); + EXPECT_EQ(F2.CallCount, 1u); + EXPECT_EQ(F1.CallCount, 1u); + EXPECT_EQ(F3.CumulativeLocalTime, 100u); + EXPECT_EQ(F2.CumulativeLocalTime, 300u); + EXPECT_EQ(F1.CumulativeLocalTime, 100u); } TEST(FunctionCallTrieTest, DeepCallStack) { @@ -168,8 +196,8 @@ auto A = FunctionCallTrie::InitAllocators(); FunctionCallTrie Trie(A); for (int i = 0; i < 32; ++i) - Trie.enterFunction(i + 1, i); - Trie.exitFunction(1, 33); + Trie.enterFunction(i + 1, i, 0); + Trie.exitFunction(1, 33, 0); // Here, validate that we have a 32-level deep function call path from the // root (1) down to the leaf (33). @@ -178,7 +206,7 @@ auto F = R[0]; for (int i = 0; i < 32; ++i) { EXPECT_EQ(F->FId, i + 1); - EXPECT_EQ(F->CallCount, 1); + EXPECT_EQ(F->CallCount, 1u); if (F->Callees.empty() && i != 31) FAIL() << "Empty callees for FId " << F->FId; if (i != 31) @@ -193,12 +221,12 @@ auto A = FunctionCallTrie::InitAllocators(); FunctionCallTrie Trie(A); - Trie.enterFunction(1, 0); - Trie.enterFunction(2, 1); - Trie.exitFunction(2, 2); - Trie.enterFunction(3, 3); - Trie.exitFunction(3, 4); - Trie.exitFunction(1, 5); + Trie.enterFunction(1, 0, 0); + Trie.enterFunction(2, 1, 0); + Trie.exitFunction(2, 2, 0); + Trie.enterFunction(3, 3, 0); + Trie.exitFunction(3, 4, 0); + Trie.exitFunction(1, 5, 0); // We want to make a deep copy and compare notes. auto B = FunctionCallTrie::InitAllocators(); @@ -236,20 +264,20 @@ FunctionCallTrie T1(A); // 1 -> 2 -> 3 - T0.enterFunction(1, 0); - T0.enterFunction(2, 1); - T0.enterFunction(3, 2); - T0.exitFunction(3, 3); - T0.exitFunction(2, 4); - T0.exitFunction(1, 5); + T0.enterFunction(1, 0, 0); + T0.enterFunction(2, 1, 0); + T0.enterFunction(3, 2, 0); + T0.exitFunction(3, 3, 0); + T0.exitFunction(2, 4, 0); + T0.exitFunction(1, 5, 0); // 1 -> 2 -> 3 - T1.enterFunction(1, 0); - T1.enterFunction(2, 1); - T1.enterFunction(3, 2); - T1.exitFunction(3, 3); - T1.exitFunction(2, 4); - T1.exitFunction(1, 5); + T1.enterFunction(1, 0, 0); + T1.enterFunction(2, 1, 0); + T1.enterFunction(3, 2, 0); + T1.exitFunction(3, 3, 0); + T1.exitFunction(2, 4, 0); + T1.exitFunction(1, 5, 0); // We use a different allocator here to make sure that we're able to transfer // data into a FunctionCallTrie which uses a different allocator. This @@ -264,20 +292,20 @@ ASSERT_EQ(Merged.getRoots().size(), 1u); const auto &R0 = *Merged.getRoots()[0]; EXPECT_EQ(R0.FId, 1); - EXPECT_EQ(R0.CallCount, 2); - EXPECT_EQ(R0.CumulativeLocalTime, 10); + EXPECT_EQ(R0.CallCount, 2u); + EXPECT_EQ(R0.CumulativeLocalTime, 10u); EXPECT_EQ(R0.Callees.size(), 1u); const auto &F1 = *R0.Callees[0].NodePtr; EXPECT_EQ(F1.FId, 2); - EXPECT_EQ(F1.CallCount, 2); - EXPECT_EQ(F1.CumulativeLocalTime, 6); + EXPECT_EQ(F1.CallCount, 2u); + EXPECT_EQ(F1.CumulativeLocalTime, 6u); EXPECT_EQ(F1.Callees.size(), 1u); const auto &F2 = *F1.Callees[0].NodePtr; EXPECT_EQ(F2.FId, 3); - EXPECT_EQ(F2.CallCount, 2); - EXPECT_EQ(F2.CumulativeLocalTime, 2); + EXPECT_EQ(F2.CallCount, 2u); + EXPECT_EQ(F2.CumulativeLocalTime, 2u); EXPECT_EQ(F2.Callees.size(), 0u); } Index: compiler-rt/trunk/lib/xray/tests/unit/profile_collector_test.cc =================================================================== --- compiler-rt/trunk/lib/xray/tests/unit/profile_collector_test.cc +++ compiler-rt/trunk/lib/xray/tests/unit/profile_collector_test.cc @@ -121,10 +121,10 @@ FunctionCallTrie T(Allocators); // Then, we populate the trie with some data. - T.enterFunction(1, 1); - T.enterFunction(2, 2); - T.exitFunction(2, 3); - T.exitFunction(1, 4); + T.enterFunction(1, 1, 0); + T.enterFunction(2, 2, 0); + T.exitFunction(2, 3, 0); + T.exitFunction(1, 4, 0); // Then we post the data to the global profile collector service. profileCollectorService::post(T, 1); @@ -177,10 +177,10 @@ thread_local auto Allocators = FunctionCallTrie::InitAllocators(); FunctionCallTrie T(Allocators); - T.enterFunction(1, 1); - T.enterFunction(2, 2); - T.exitFunction(2, 3); - T.exitFunction(1, 4); + T.enterFunction(1, 1, 0); + T.enterFunction(2, 2, 0); + T.exitFunction(2, 3, 0); + T.exitFunction(1, 4, 0); profileCollectorService::post(T, GetTid()); } Index: compiler-rt/trunk/lib/xray/xray_function_call_trie.h =================================================================== --- compiler-rt/trunk/lib/xray/xray_function_call_trie.h +++ compiler-rt/trunk/lib/xray/xray_function_call_trie.h @@ -18,6 +18,7 @@ #include "xray_defs.h" #include "xray_profiling_flags.h" #include "xray_segmented_array.h" +#include #include // For placement new. #include @@ -113,13 +114,13 @@ struct Node { Node *Parent; NodeIdPairArray Callees; - int64_t CallCount; - int64_t CumulativeLocalTime; // Typically in TSC deltas, not wall-time. + uint64_t CallCount; + uint64_t CumulativeLocalTime; // Typically in TSC deltas, not wall-time. int32_t FId; // We add a constructor here to allow us to inplace-construct through // Array<...>'s AppendEmplace. - Node(Node *P, NodeIdPairAllocatorType &A, int64_t CC, int64_t CLT, + Node(Node *P, NodeIdPairAllocatorType &A, uint64_t CC, uint64_t CLT, int32_t F) XRAY_NEVER_INSTRUMENT : Parent(P), Callees(A), CallCount(CC), @@ -133,11 +134,14 @@ struct ShadowStackEntry { uint64_t EntryTSC; Node *NodePtr; + uint16_t EntryCPU; // We add a constructor here to allow us to inplace-construct through // Array<...>'s AppendEmplace. - ShadowStackEntry(uint64_t T, Node *N) XRAY_NEVER_INSTRUMENT : EntryTSC{T}, - NodePtr{N} {} + ShadowStackEntry(uint64_t T, Node *N, uint16_t C) XRAY_NEVER_INSTRUMENT + : EntryTSC{T}, + NodePtr{N}, + EntryCPU{C} {} }; using NodeArray = Array; @@ -262,17 +266,18 @@ ShadowStack(*A.ShadowStackAllocator), NodeIdPairAllocator(A.NodeIdPairAllocator) {} - void enterFunction(const int32_t FId, uint64_t TSC) XRAY_NEVER_INSTRUMENT { + void enterFunction(const int32_t FId, uint64_t TSC, + uint16_t CPU) XRAY_NEVER_INSTRUMENT { DCHECK_NE(FId, 0); // This function primarily deals with ensuring that the ShadowStack is // consistent and ready for when an exit event is encountered. if (UNLIKELY(ShadowStack.empty())) { auto NewRoot = - Nodes.AppendEmplace(nullptr, *NodeIdPairAllocator, 0, 0, FId); + Nodes.AppendEmplace(nullptr, *NodeIdPairAllocator, 0u, 0u, FId); if (UNLIKELY(NewRoot == nullptr)) return; Roots.Append(NewRoot); - ShadowStack.AppendEmplace(TSC, NewRoot); + ShadowStack.AppendEmplace(TSC, NewRoot, CPU); return; } @@ -286,23 +291,24 @@ [FId](const NodeIdPair &NR) { return NR.FId == FId; }); if (Callee != nullptr) { CHECK_NE(Callee->NodePtr, nullptr); - ShadowStack.AppendEmplace(TSC, Callee->NodePtr); + ShadowStack.AppendEmplace(TSC, Callee->NodePtr, CPU); return; } // This means we've never seen this stack before, create a new node here. auto NewNode = - Nodes.AppendEmplace(TopNode, *NodeIdPairAllocator, 0, 0, FId); + Nodes.AppendEmplace(TopNode, *NodeIdPairAllocator, 0u, 0u, FId); if (UNLIKELY(NewNode == nullptr)) return; DCHECK_NE(NewNode, nullptr); TopNode->Callees.AppendEmplace(NewNode, FId); - ShadowStack.AppendEmplace(TSC, NewNode); + ShadowStack.AppendEmplace(TSC, NewNode, CPU); DCHECK_NE(ShadowStack.back().NodePtr, nullptr); return; } - void exitFunction(int32_t FId, uint64_t TSC) XRAY_NEVER_INSTRUMENT { + void exitFunction(int32_t FId, uint64_t TSC, + uint16_t CPU) XRAY_NEVER_INSTRUMENT { // When we exit a function, we look up the ShadowStack to see whether we've // entered this function before. We do as little processing here as we can, // since most of the hard work would have already been done at function @@ -312,7 +318,24 @@ const auto &Top = ShadowStack.back(); auto TopNode = Top.NodePtr; DCHECK_NE(TopNode, nullptr); - auto LocalTime = TSC - Top.EntryTSC; + + // We may encounter overflow on the TSC we're provided, which may end up + // being less than the TSC when we first entered the function. + // + // To get the accurate measurement of cycles, we need to check whether + // we've overflowed (TSC < Top.EntryTSC) and then account the difference + // between the entry TSC and the max for the TSC counter (max of uint64_t) + // then add the value of TSC. We can prove that the maximum delta we will + // get is at most the 64-bit unsigned value, since the difference between + // a TSC of 0 and a Top.EntryTSC of 1 is (numeric_limits::max() + // - 1) + 1. + // + // NOTE: This assumes that TSCs are synchronised across CPUs. + // TODO: Count the number of times we've seen CPU migrations. + uint64_t LocalTime = + Top.EntryTSC > TSC + ? (std::numeric_limits::max() - Top.EntryTSC) + TSC + : TSC - Top.EntryTSC; TopNode->CallCount++; TopNode->CumulativeLocalTime += LocalTime - CumulativeTreeTime; CumulativeTreeTime += LocalTime; @@ -410,8 +433,8 @@ auto R = O.Roots.find_element( [&](const Node *Node) { return Node->FId == Root->FId; }); if (R == nullptr) { - TargetRoot = O.Nodes.AppendEmplace(nullptr, *O.NodeIdPairAllocator, 0, - 0, Root->FId); + TargetRoot = O.Nodes.AppendEmplace(nullptr, *O.NodeIdPairAllocator, 0u, + 0u, Root->FId); if (UNLIKELY(TargetRoot == nullptr)) return; @@ -436,7 +459,7 @@ }); if (TargetCallee == nullptr) { auto NewTargetNode = O.Nodes.AppendEmplace( - NT.TargetNode, *O.NodeIdPairAllocator, 0, 0, Callee.FId); + NT.TargetNode, *O.NodeIdPairAllocator, 0u, 0u, Callee.FId); if (UNLIKELY(NewTargetNode == nullptr)) return; Index: compiler-rt/trunk/lib/xray/xray_profiling.cc =================================================================== --- compiler-rt/trunk/lib/xray/xray_profiling.cc +++ compiler-rt/trunk/lib/xray/xray_profiling.cc @@ -197,11 +197,11 @@ switch (Entry) { case XRayEntryType::ENTRY: case XRayEntryType::LOG_ARGS_ENTRY: - TLD.FCT->enterFunction(FuncId, TSC); + TLD.FCT->enterFunction(FuncId, TSC, CPU); break; case XRayEntryType::EXIT: case XRayEntryType::TAIL: - TLD.FCT->exitFunction(FuncId, TSC); + TLD.FCT->exitFunction(FuncId, TSC, CPU); break; default: // FIXME: Handle bugs.