Index: compiler-rt/lib/xray/CMakeLists.txt =================================================================== --- compiler-rt/lib/xray/CMakeLists.txt +++ compiler-rt/lib/xray/CMakeLists.txt @@ -2,33 +2,61 @@ # XRay runtime library implementation files. set(XRAY_SOURCES - xray_init.cc - xray_flags.cc - xray_interface.cc - xray_log_interface.cc - xray_utils.cc) + xray_init.cc + xray_flags.cc + xray_interface.cc + xray_log_interface.cc + xray_utils.cc) # Implementation files for all XRay modes. set(XRAY_FDR_MODE_SOURCES - xray_buffer_queue.cc - xray_fdr_logging.cc) + xray_buffer_queue.cc + xray_fdr_logging.cc) set(XRAY_BASIC_MODE_SOURCES - xray_inmemory_log.cc) + xray_inmemory_log.cc) + +set(XRAY_PROFILER_MODE_SOURCES + xray_profile_collector.cc + xray_profiler.cc + xray_profiler_flags.cc) # Implementation files for all XRay architectures. -set(aarch64_SOURCES xray_AArch64.cc xray_trampoline_AArch64.S) -set(arm_SOURCES xray_arm.cc xray_trampoline_arm.S) -set(armhf_SOURCES ${arm_SOURCES}) -set(mips_SOURCES xray_mips.cc xray_trampoline_mips.S) -set(mipsel_SOURCES xray_mips.cc xray_trampoline_mips.S) -set(mips64_SOURCES xray_mips64.cc xray_trampoline_mips64.S) -set(mips64el_SOURCES xray_mips64.cc xray_trampoline_mips64.S) +set(x86_64_SOURCES + xray_x86_64.cc + xray_trampoline_x86_64.S) + +set(arm_SOURCES + xray_arm.cc + xray_trampoline_arm.S) + +set(armhf_SOURCES + ${arm_SOURCES}) + +set(aarch64_SOURCES + xray_AArch64.cc + xray_trampoline_AArch64.S) + +set(mips_SOURCES + xray_mips.cc + xray_trampoline_mips.S) + +set(mipsel_SOURCES + xray_mips.cc + xray_trampoline_mips.S) + +set(mips64_SOURCES + xray_mips64.cc + xray_trampoline_mips64.S) + +set(mips64el_SOURCES + xray_mips64.cc + xray_trampoline_mips64.S) + set(powerpc64le_SOURCES xray_powerpc64.cc xray_trampoline_powerpc64.cc xray_trampoline_powerpc64_asm.S) -set(x86_64_SOURCES xray_x86_64.cc xray_trampoline_x86_64.S) # Now put it all together... include_directories(..) @@ -72,6 +100,12 @@ SOURCES ${XRAY_BASIC_MODE_SOURCES} CFLAGS ${XRAY_CFLAGS} DEFS ${XRAY_COMMON_DEFINITIONS}) + add_compiler_rt_object_libraries(RTXrayPROFILER + OS ${XRAY_SUPPORTED_OS} + ARCHS ${XRAY_SUPPORTED_ARCH} + SOURCES ${XRAY_PROFILER_MODE_SOURCES} + CFLAGS ${XRAY_CFLAGS} + DEFS ${XRAY_COMMON_DEFINITIONS}) # We only support running on osx for now. add_compiler_rt_runtime(clang_rt.xray @@ -106,6 +140,16 @@ LINK_FLAGS ${SANITIZER_COMMON_LINK_FLAGS} ${WEAK_SYMBOL_LINK_FLAGS} LINK_LIBS ${XRAY_LINK_LIBS} PARENT_TARGET xray) + add_compiler_rt_runtime(clang_rt.xray-profiler + STATIC + OS ${XRAY_SUPPORTED_OS} + ARCHS ${XRAY_SUPPORTED_ARCH} + OBJECT_LIBS RTXrayPROFILER + CFLAGS ${XRAY_CFLAGS} + DEFS ${XRAY_COMMON_DEFINITIONS} + LINK_FLAGS ${SANITIZER_COMMON_LINK_FLAGS} ${WEAK_SYMBOL_LINK_FLAGS} + LINK_LIBS ${XRAY_LINK_LIBS} + PARENT_TARGET xray) else() # not Apple foreach(arch ${XRAY_SUPPORTED_ARCH}) if(NOT CAN_TARGET_${arch}) @@ -123,6 +167,10 @@ ARCHS ${arch} SOURCES ${XRAY_BASIC_MODE_SOURCES} CFLAGS ${XRAY_CFLAGS} DEFS ${XRAY_COMMON_DEFINITIONS}) + add_compiler_rt_object_libraries(RTXrayPROFILER + ARCHS ${arch} + SOURCES ${XRAY_PROFILER_MODE_SOURCES} CFLAGS ${XRAY_CFLAGS} + DEFS ${XRAY_COMMON_DEFINITIONS}) # Common XRay archive for instrumented binaries. 
add_compiler_rt_runtime(clang_rt.xray @@ -148,6 +196,14 @@ DEFS ${XRAY_COMMON_DEFINITIONS} OBJECT_LIBS RTXrayBASIC PARENT_TARGET xray) + # Profiler Mode runtime + add_compiler_rt_runtime(clang_rt.xray-profiler + STATIC + ARCHS ${arch} + CFLAGS ${XRAY_CFLAGS} + DEFS ${XRAY_COMMON_DEFINITIONS} + OBJECT_LIBS RTXrayPROFILER + PARENT_TARGET xray) endforeach() endif() # not Apple Index: compiler-rt/lib/xray/tests/CMakeLists.txt =================================================================== --- compiler-rt/lib/xray/tests/CMakeLists.txt +++ compiler-rt/lib/xray/tests/CMakeLists.txt @@ -72,6 +72,7 @@ add_xray_lib("RTXRay.test.osx" $ $ + $ $ $) else() @@ -79,6 +80,7 @@ add_xray_lib("RTXRay.test.${arch}" $ $ + $ $ $) endforeach() Index: compiler-rt/lib/xray/tests/unit/CMakeLists.txt =================================================================== --- compiler-rt/lib/xray/tests/unit/CMakeLists.txt +++ compiler-rt/lib/xray/tests/unit/CMakeLists.txt @@ -2,3 +2,11 @@ buffer_queue_test.cc xray_unit_test_main.cc) add_xray_unittest(XRayFDRLoggingTest SOURCES fdr_logging_test.cc xray_unit_test_main.cc) +add_xray_unittest(XRayAllocatorTest SOURCES + allocator_test.cc xray_unit_test_main.cc) +add_xray_unittest(XRaySegmentedArrayTest SOURCES + segmented_array_test.cc xray_unit_test_main.cc) +add_xray_unittest(XRayFunctionCallTrieTest SOURCES + function_call_trie_test.cc xray_unit_test_main.cc) +add_xray_unittest(XRayProfileCollectorTest SOURCES + profile_collector_test.cc xray_unit_test_main.cc) Index: compiler-rt/lib/xray/tests/unit/allocator_test.cc =================================================================== --- /dev/null +++ compiler-rt/lib/xray/tests/unit/allocator_test.cc @@ -0,0 +1,42 @@ +//===-- allocator_test.cc -------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a function call tracing system. +// +//===----------------------------------------------------------------------===// + +#include "xray_allocator.h" +#include "gtest/gtest.h" + +namespace __xray { +namespace { + +struct TestData { + s64 First; + s64 Second; +}; + +TEST(AllocatorTest, Construction) { Allocator A(2 << 11, 0); } + +TEST(AllocatorTest, Allocate) { + Allocator A(2 << 11, 0); + auto B = A.Allocate(); + (void)B; +} + +TEST(AllocatorTest, OverAllocate) { + Allocator A(sizeof(TestData), 0); + auto B1 = A.Allocate(); + (void)B1; + auto B2 = A.Allocate(); + ASSERT_EQ(B2.Data, nullptr); +} + +} // namespace +} // namespace __xray Index: compiler-rt/lib/xray/tests/unit/function_call_trie_test.cc =================================================================== --- /dev/null +++ compiler-rt/lib/xray/tests/unit/function_call_trie_test.cc @@ -0,0 +1,225 @@ +//===-- function_call_trie_test.cc ----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a function call tracing system. 
+// +//===----------------------------------------------------------------------===// + +#include "gtest/gtest.h" +#include "xray_function_call_trie.h" + +namespace __xray { + +namespace { + +TEST(FunctionCallTrieTest, Construction) { + // We want to make sure that we can create one of these without the set of + // allocators we need. This will by default use the global allocators. + FunctionCallTrie Trie; +} + +TEST(FunctionCallTrieTest, ConstructWithTLSAllocators) { + // FIXME: Support passing in configuration for allocators in the allocator + // constructors. + profilerFlags()->setDefaults(); + thread_local FunctionCallTrie::Allocators Allocators = + FunctionCallTrie::InitAllocators(); + FunctionCallTrie Trie(Allocators); +} + +TEST(FunctionCallTrieTest, EnterAndExitFunction) { + profilerFlags()->setDefaults(); + thread_local auto A = FunctionCallTrie::InitAllocators(); + FunctionCallTrie Trie(A); + + Trie.enterFunction(1, 1); + Trie.exitFunction(1, 2); + + // We need a way to pull the data out. At this point, until we get a data + // collection service implemented, we're going to export the data as a list of + // roots, and manually walk through the structure ourselves. + + const auto &R = Trie.getRoots(); + + ASSERT_EQ(R.size(), 1u); + ASSERT_EQ(R.front()->FId, 1); + ASSERT_EQ(R.front()->CallCount, 1); + ASSERT_EQ(R.front()->CumulativeLocalTime, 1u); +} + +TEST(FunctionCallTrieTest, MissingFunctionEntry) { + thread_local auto A = FunctionCallTrie::InitAllocators(); + FunctionCallTrie Trie(A); + Trie.exitFunction(1, 1); + const auto &R = Trie.getRoots(); + + ASSERT_TRUE(R.empty()); +} + +TEST(FunctionCallTrieTest, MultipleRoots) { + profilerFlags()->setDefaults(); + thread_local auto A = FunctionCallTrie::InitAllocators(); + FunctionCallTrie Trie(A); + + // Enter and exit FId = 1. + Trie.enterFunction(1, 1); + Trie.exitFunction(1, 2); + + // Enter and exit FId = 2. + Trie.enterFunction(2, 3); + Trie.exitFunction(2, 4); + + const auto &R = Trie.getRoots(); + ASSERT_FALSE(R.empty()); + ASSERT_EQ(R.size(), 2u); + + // Inspect the roots that they have the right data. + const auto R0 = R[0]; + ASSERT_NE(R0, nullptr); + EXPECT_EQ(R0->CallCount, 1u); + EXPECT_EQ(R0->CumulativeLocalTime, 1u); + + const auto R1 = R[1]; + ASSERT_NE(R1, nullptr); + EXPECT_EQ(R1->CallCount, 1u); + EXPECT_EQ(R1->CumulativeLocalTime, 1u); +} + +// While missing an intermediary entry may be rare in practice, we still enforce +// that we can handle the case where we've missed the entry event somehow, in +// between call entry/exits. To illustrate, imagine the following shadow call +// stack: +// +// f0@t0 -> f1@t1 -> f2@t2 +// +// If for whatever reason we see an exit for `f2` @ t3, followed by an exit for +// `f0` @ t4 (i.e. no `f1` exit in between) then we need to handle the case of +// accounting local time to `f2` from d = (t3 - t2), then local time to `f1` +// as d' = (t3 - t1) - d, and then local time to `f0` as d'' = (t3 - t0) - d'. +TEST(FunctionCallTrieTest, MissingIntermediaryExit) { + profilerFlags()->setDefaults(); + thread_local auto A = FunctionCallTrie::InitAllocators(); + FunctionCallTrie Trie(A); + + Trie.enterFunction(1, 0); + Trie.enterFunction(2, 100); + Trie.enterFunction(3, 200); + Trie.exitFunction(3, 300); + Trie.exitFunction(1, 400); + + // What we should see at this point is all the functions in the trie in a + // specific order (1 -> 2 -> 3) with the appropriate count(s) and local + // latencies. 
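Working through the timestamps used above (an illustration of the accounting rule described in the comment, not part of the patch): f3 is charged 300 - 200 = 100 on its explicit exit. When the lone exit for f1 arrives at TSC 400, the implicit exit of f2 is charged (400 - 100) - 0 = 300, because no tree time has yet been accumulated within that exitFunction call, and f1 then gets (400 - 0) - 300 = 100. These are exactly the values asserted below.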
+ const auto &R = Trie.getRoots(); + ASSERT_FALSE(R.empty()); + ASSERT_EQ(R.size(), 1u); + + const auto &F1 = *R[0]; + ASSERT_EQ(F1.FId, 1); + ASSERT_FALSE(F1.Callees.empty()); + + const auto &F2 = *F1.Callees[0].NodePtr; + ASSERT_EQ(F2.FId, 2); + ASSERT_FALSE(F2.Callees.empty()); + + const auto &F3 = *F2.Callees[0].NodePtr; + ASSERT_EQ(F3.FId, 3); + ASSERT_TRUE(F3.Callees.empty()); + + // Now that we've established the preconditions, we check for specific aspects + // of the nodes. + EXPECT_EQ(F3.CallCount, 1); + EXPECT_EQ(F2.CallCount, 1); + EXPECT_EQ(F1.CallCount, 1); + EXPECT_EQ(F3.CumulativeLocalTime, 100); + EXPECT_EQ(F2.CumulativeLocalTime, 300); + EXPECT_EQ(F1.CumulativeLocalTime, 100); +} + +// TODO: Test that we can handle cross-CPU migrations, where TSCs are not +// guaranteed to be synchronised. +TEST(FunctionCallTrieTest, DeepCopy) { + profilerFlags()->setDefaults(); + auto A = FunctionCallTrie::InitAllocators(); + FunctionCallTrie Trie(A); + + Trie.enterFunction(1, 0); + Trie.enterFunction(2, 1); + Trie.exitFunction(2, 2); + Trie.enterFunction(3, 3); + Trie.exitFunction(3, 4); + Trie.exitFunction(1, 5); + + // We want to make a deep copy and compare notes. + auto B = FunctionCallTrie::InitAllocators(); + FunctionCallTrie Copy(B); + Trie.deepCopyInto(Copy); + + ASSERT_NE(Trie.getRoots().size(), 0u); + ASSERT_EQ(Trie.getRoots().size(), Copy.getRoots().size()); + const auto &R0Orig = *Trie.getRoots()[0]; + const auto &R0Copy = *Copy.getRoots()[0]; + EXPECT_EQ(R0Orig.FId, 1); + EXPECT_EQ(R0Orig.FId, R0Copy.FId); + + ASSERT_EQ(R0Orig.Callees.size(), 2u); + ASSERT_EQ(R0Copy.Callees.size(), 2u); + + const auto &F1Orig = + *R0Orig.Callees + .find_element( + [](const FunctionCallTrie::NodeRef &R) { return R.FId == 2; }) + ->NodePtr; + const auto &F1Copy = + *R0Copy.Callees + .find_element( + [](const FunctionCallTrie::NodeRef &R) { return R.FId == 2; }) + ->NodePtr; + EXPECT_EQ(&R0Orig, F1Orig.Parent); + EXPECT_EQ(&R0Copy, F1Copy.Parent); +} + +TEST(FunctionCallTrieTest, MergeInto) { + profilerFlags()->setDefaults(); + auto A = FunctionCallTrie::InitAllocators(); + FunctionCallTrie T0(A); + FunctionCallTrie T1(A); + + // 1 -> 2 -> 3 + T0.enterFunction(1, 0); + T0.enterFunction(2, 1); + T0.enterFunction(3, 2); + T0.exitFunction(3, 3); + T0.exitFunction(2, 4); + T0.exitFunction(1, 5); + + // 1 -> 2 -> 3 + T1.enterFunction(1, 0); + T1.enterFunction(2, 1); + T1.enterFunction(3, 2); + T1.exitFunction(3, 3); + T1.exitFunction(2, 4); + T1.exitFunction(1, 5); + + auto B = FunctionCallTrie::InitAllocators(); + FunctionCallTrie Merged(B); + + T0.mergeInto(Merged); + T1.mergeInto(Merged); + + ASSERT_EQ(Merged.getRoots().size(), 1u); + const auto &R0 = *Merged.getRoots()[0]; + EXPECT_EQ(R0.FId, 1); + EXPECT_EQ(R0.CallCount, 2); + EXPECT_EQ(R0.CumulativeLocalTime, 10); +} + +} // namespace + +} // namespace __xray Index: compiler-rt/lib/xray/tests/unit/profile_collector_test.cc =================================================================== --- /dev/null +++ compiler-rt/lib/xray/tests/unit/profile_collector_test.cc @@ -0,0 +1,111 @@ +//===-- function_call_trie_test.cc ----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a function call tracing system. 
+// +//===----------------------------------------------------------------------===// +#include "gtest/gtest.h" + +#include "xray_profile_collector.h" +#include "xray_profiler_flags.h" +#include + +namespace __xray { +namespace { + +static constexpr auto kHeaderSize = 16u; + +void ValidateBlock(XRayBuffer B) { + profilerFlags()->setDefaults(); + ASSERT_NE(static_cast(B.Data), nullptr); + ASSERT_NE(B.Size, 0u); + ASSERT_GE(B.Size, kHeaderSize); + // We look at the block size, the block number, and the thread ID to ensure + // that none of them are zero (or that the header data is laid out as we + // expect). + char LocalBuffer[kHeaderSize] = {}; + memcpy(LocalBuffer, B.Data, kHeaderSize); + u32 BlockSize = 0; + u32 BlockNumber = 0; + u64 ThreadId = 0; + memcpy(&BlockSize, LocalBuffer, sizeof(u32)); + memcpy(&BlockNumber, LocalBuffer + sizeof(u32), sizeof(u32)); + memcpy(&ThreadId, LocalBuffer + (2 * sizeof(u32)), sizeof(u64)); + EXPECT_NE(BlockSize, 0u); + EXPECT_GE(BlockNumber, 0u); + EXPECT_NE(ThreadId, 0u); +} + +TEST(profileCollectorServiceTest, PostSerializeCollect) { + profilerFlags()->setDefaults(); + // The most basic use-case (the one we actually only care about) is the one + // where we ensure that we can post FunctionCallTrie instances, which are then + // destroyed but serialized properly. + // + // First, we initialise a set of allocators in the local scope. This ensures + // that we're able to copy the contents of the FunctionCallTrie that uses + // the local allocators. + auto Allocators = FunctionCallTrie::InitAllocators(); + FunctionCallTrie Trie(Allocators); + + // Then, we populate the trie with some data. + Trie.enterFunction(1, 1); + Trie.enterFunction(2, 2); + Trie.exitFunction(2, 3); + Trie.exitFunction(1, 4); + + // Then we post the data to the global profile collector service. + profileCollectorService::post(Trie, 1); + + // Then we serialize the data. + profileCollectorService::serialize(); + + // Then we go through a single buffer to see whether we're getting the data we + // expect. + auto B = profileCollectorService::nextBuffer({nullptr, 0}); + ValidateBlock(B); +} + +// We break out a function that will be run in multiple threads, one that will +// use a thread local allocator, and will post the FunctionCallTrie to the +// profileCollectorService. This simulates what the threads being profiled would +// be doing anyway, but through the XRay logging implementation. +void threadProcessing() { + thread_local auto Allocators = FunctionCallTrie::InitAllocators(); + FunctionCallTrie Trie(Allocators); + + Trie.enterFunction(1, 1); + Trie.enterFunction(2, 2); + Trie.exitFunction(2, 3); + Trie.exitFunction(1, 4); + + profileCollectorService::post(Trie, __sanitizer::GetTid()); +} + +TEST(profileCollectorServiceTest, PostSerializeCollectMultipleThread) { + profilerFlags()->setDefaults(); + std::thread t1(threadProcessing); + std::thread t2(threadProcessing); + + t1.join(); + t2.join(); + + // At this point, t1 and t2 are already done with what they were doing. + profileCollectorService::serialize(); + + // Ensure that we see two buffers. 
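Outside the unit test, a consumer would typically drain every serialized block by iterating until nextBuffer returns an empty XRayBuffer. A minimal sketch using only the API declared in this patch (consumeBlock is a hypothetical callback, not part of the patch):

// Drain all serialized profile blocks; assumes serialize() has already run.
for (auto Buf = profileCollectorService::nextBuffer({nullptr, 0});
     Buf.Data != nullptr; Buf = profileCollectorService::nextBuffer(Buf)) {
  // Each block starts with the 16-byte header (size, number, thread id)
  // described in xray_profile_collector.h, followed by the records.
  consumeBlock(Buf.Data, Buf.Size);
}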
+ auto B = profileCollectorService::nextBuffer({nullptr, 0}); + ValidateBlock(B); + + B = profileCollectorService::nextBuffer(B); + ValidateBlock(B); +} + +} // namespace +} // namespace __xray Index: compiler-rt/lib/xray/tests/unit/segmented_array_test.cc =================================================================== --- /dev/null +++ compiler-rt/lib/xray/tests/unit/segmented_array_test.cc @@ -0,0 +1,91 @@ +#include "xray_segmented_array.h" +#include "gtest/gtest.h" + +namespace __xray { +namespace { + +struct TestData { + s64 First; + s64 Second; + + // Need a constructor for emplace operations. + TestData(s64 F, s64 S) : First(F), Second(S) {} +}; + +TEST(SegmentedArrayTest, Construction) { + Array Data; + (void)Data; +} + +TEST(SegmentedArrayTest, ConstructWithAllocator) { + using AllocatorType = typename Array::AllocatorType; + AllocatorType A(1 << 4, 0); + Array Data(A); + (void)Data; +} + +TEST(SegmentedArrayTest, ConstructAndPopulate) { + Array data; + ASSERT_NE(data.Append(TestData{0, 0}), nullptr); + ASSERT_NE(data.Append(TestData{1, 1}), nullptr); + ASSERT_EQ(data.size(), 2u); +} + +TEST(SegmentedArrayTest, ConstructPopulateAndLookup) { + Array data; + ASSERT_NE(data.Append(TestData{0, 1}), nullptr); + ASSERT_EQ(data.size(), 1u); + ASSERT_EQ(data[0].First, 0); + ASSERT_EQ(data[0].Second, 1); +} + +TEST(SegmentedArrayTest, PopulateWithMoreElements) { + Array data; + static const auto kMaxElements = 100u; + for (auto I = 0u; I < kMaxElements; ++I) { + ASSERT_NE(data.Append(TestData{I, I + 1}), nullptr); + } + ASSERT_EQ(data.size(), kMaxElements); + for (auto I = 0u; I < kMaxElements; ++I) { + ASSERT_EQ(data[I].First, I); + ASSERT_EQ(data[I].Second, I + 1); + } +} + +TEST(SegmentedArrayTest, AppendEmplace) { + Array data; + ASSERT_NE(data.AppendEmplace(1, 1), nullptr); + ASSERT_EQ(data[0].First, 1); + ASSERT_EQ(data[0].Second, 1); +} + +TEST(SegmentedArrayTest, AppendAndTrim) { + Array data; + ASSERT_NE(data.AppendEmplace(1, 1), nullptr); + ASSERT_EQ(data.size(), 1u); + data.trim(1); + ASSERT_EQ(data.size(), 0u); + ASSERT_TRUE(data.empty()); +} + +TEST(SegmentedArrayTest, Iterators) { + Array data; + ASSERT_TRUE(data.empty()); + ASSERT_EQ(data.begin(), data.end()); + auto I0 = data.begin(); + ASSERT_EQ(I0++, data.begin()); + ASSERT_NE(I0, data.begin()); + for (const auto &D : data) { + (void)D; + FAIL(); + } + ASSERT_NE(data.AppendEmplace(1, 1), nullptr); + ASSERT_EQ(data.size(), 1u); + ASSERT_NE(data.begin(), data.end()); + auto &D0 = *data.begin(); + ASSERT_EQ(D0.First, 1); + ASSERT_EQ(D0.Second, 1); +} + +} // namespace +} // namespace __xray Index: compiler-rt/lib/xray/xray_allocator.h =================================================================== --- /dev/null +++ compiler-rt/lib/xray/xray_allocator.h @@ -0,0 +1,175 @@ +//===-- xray_allocator.h ---------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// Defines the allocator interface for an arena allocator, used primarily for +// the profiling runtime. 
+// +//===----------------------------------------------------------------------===// +#ifndef XRAY_ALLOCATOR_H +#define XRAY_ALLOCATOR_H + +#include "sanitizer_common/sanitizer_allocator_internal.h" +#include "sanitizer_common/sanitizer_common.h" +#include "sanitizer_common/sanitizer_mutex.h" +#include +#include + +#include "sanitizer_common/sanitizer_internal_defs.h" + +namespace __xray { + +/// The Allocator type hands out fixed-sized chunks of memory that are +/// cache-line aligned and sized. This is useful for placement of +/// performance-sensitive data in memory that's frequently accessed. The +/// allocator also self-limits the peak memory usage to a dynamically defined +/// maximum. +/// +template struct Allocator { + // Each Block will have a fixed size, dependent on N. The allocator will hand + // out individual blocks identified by the Data pointer, which is associated + // with a block. + struct Block { + /// Compute the minimum cache-line size multiple that is >= N. + static constexpr auto Size = + kCacheLineSize * ((N / kCacheLineSize) + (N % kCacheLineSize ? 1 : 0)); + void *Data = nullptr; + }; + +private: + // A BlockChain will contain a fixed number of blocks, each with an identifier + // to specify whether it's been handed out or not. We keep track of BlockChain + // iterators, which are basically a pointer to the chain and an offset into + // the fixed set of blocks associated with a link. The iterators are + // bidirectional. + struct BlockChain { + static_assert(kCacheLineSize % sizeof(void *) == 0, + "Cache line size is not divisible by size of void*; none of " + "the assumptions of the BlockChain will hold."); + + // This size represents the most number of pointers we can fit in a cache + // line, along with the next and previous pointers, and the bitmap to + // indicate which blocks have been handed out, and which ones are available. + // This structure corresponds to the following layout: + // + // Blocks [ 0, 1, 2, .., Size - 1] + // + // Using a 16-bit bitmap will allow us to cover a really big cache line that + // might store up to 16 pointers. + static constexpr auto Size = (kCacheLineSize / sizeof(Block)) - 3; + static_assert(Size <= 16, "Need to support larger than 16 "); + + // FIXME: Align this to cache-line address boundaries + Block Blocks[Size]{}; + BlockChain *Prev = nullptr; + BlockChain *Next = nullptr; + u16 UsedBits = 0; + }; + + static_assert(sizeof(BlockChain) == kCacheLineSize, + "BlockChain instances must be cache-line-sized."); + + static BlockChain NullLink; + + // FIXME: Implement a freelist, in case we actually do intend to return memory + // to the allocator, as opposed to just de-allocating everything in one go. + + size_t MaxMemory; + SpinMutex Mutex{}; + BlockChain *Chain = &NullLink; + size_t Counter = 0; + + BlockChain *NewChainLink() { + auto NewChain = reinterpret_cast( + InternalAlloc(sizeof(BlockChain), nullptr, kCacheLineSize)); + auto BackingStore = reinterpret_cast( + InternalAlloc(BlockChain::Size * Block::Size, nullptr, kCacheLineSize)); + size_t Offset = 0; + DCHECK_NE(NewChain, nullptr); + DCHECK_NE(BackingStore, nullptr); + for (auto &B : NewChain->Blocks) { + B.Data = BackingStore + Offset; + Offset += Block::Size; + } + NewChain->Prev = Chain; + NewChain->Next = &NullLink; + return NewChain; + } + +public: + Allocator(size_t M, size_t PreAllocate) : MaxMemory(M) { + // FIXME: Implement PreAllocate support! + } + + Block Allocate() { + SpinMutexLock Lock(&Mutex); + // Check whether we're over quota. 
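To make the sizing concrete (a worked example assuming kCacheLineSize is 64, the common case in sanitizer_common): for N = sizeof(TestData) = 16, the Block::Size computation above rounds up to 64, so an Allocator constructed with MaxMemory = 2 << 11 = 4096, as in the unit tests, can hand out at most 4096 / 64 = 64 blocks before the quota check below starts returning empty blocks.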
+ if (Counter * Block::Size >= MaxMemory) + return {}; + + size_t ChainOffset = Counter % BlockChain::Size; + u16 BitMask = 1 << ChainOffset; + + if (UNLIKELY(Chain == &NullLink)) { + // FIXME: Maybe pre-allocate some BlockChain instances in the same page + // as the Block objects. + auto NewChain = NewChainLink(); + + // Mark the first block in the newly created link as "used". + NewChain->UsedBits |= BitMask; + + // Prepare to return the first block. + auto B = NewChain->Blocks[ChainOffset]; + Chain->Next = NewChain; + Chain = NewChain; + ++Counter; + return B; + } + + // We need to see whether one of the blocks already in the current chain + // link can be used. + if ((Chain->UsedBits & BitMask) == 0) { + auto B = Chain->Blocks[ChainOffset]; + Chain->UsedBits |= BitMask; + ++Counter; + return B; + } + + // At this point we know that we've wrapped around, and the current chain no + // longer has free blocks. We then allocate a new chain and make that the + // current chain. + auto NewChain = NewChainLink(); + NewChain->UsedBits |= BitMask; + auto B = NewChain->Blocks[ChainOffset]; + Chain->Next = NewChain; + Chain = NewChain; + ++Counter; + return B; + } + + ~Allocator() NOEXCEPT { + // We need to deallocate all the blocks, including the chain links. + for (auto C = Chain; C != &NullLink;) { + // We know that the data block is one large contiguous allocation, so we + // deallocate it all at once. + InternalFree(C->Blocks[0].Data); + auto Prev = C->Prev; + InternalFree(C); + C = Prev; + } + } +}; + +// Storage for the NullLink sentinel. +template typename Allocator::BlockChain Allocator::NullLink; + +} // namespace __xray + +#endif // XRAY_ALLOCATOR_H Index: compiler-rt/lib/xray/xray_function_call_trie.h =================================================================== --- /dev/null +++ compiler-rt/lib/xray/xray_function_call_trie.h @@ -0,0 +1,431 @@ +//===-- xray_function_call_trie.h ------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// This file defines the interface for a function call trie. +// +//===----------------------------------------------------------------------===// +#ifndef XRAY_FUNCTION_CALL_TRIE_H +#define XRAY_FUNCTION_CALL_TRIE_H + +#include "xray_profiler_flags.h" +#include "xray_segmented_array.h" +#include + +namespace __xray { + +/// A FunctionCallTrie represents the stack traces of XRay instrumented +/// functions that we've encountered, where a node corresponds to a function and +/// the path from the root to the node represents its stack trace. Each node in +/// the trie will contain some useful values, including: +/// +/// * The cumulative amount of time spent in this particular node/stack. +/// * The number of times this stack has appeared. +/// * A histogram of latencies for that particular node. +/// +/// Each node in the trie will also contain a list of callees, represented using +/// an Array of NodeRef entries -- each NodeRef instance will contain the +/// function ID of the callee, and a pointer to the node.
+/// +/// If we visualise this data structure, we'll find the following potential +/// representation: +/// +/// [function id node] -> [callees] [cumulative time] +/// [call counter] [latency histogram] +/// +/// As an example, when we have a function in this pseudocode: +/// +/// func f(N) { +/// g() +/// h() +/// for i := 1..N { j() } +/// } +/// +/// We may end up with a trie of the following form: +/// +/// f -> [ g, h, j ] [...] [1] [...] +/// g -> [ ... ] [...] [1] [...] +/// h -> [ ... ] [...] [1] [...] +/// j -> [ ... ] [...] [N] [...] +/// +/// If, for instance, the function g() called j() like so: +/// +/// func g() { +/// for i := 1..10 { j() } +/// } +/// +/// We'll find the following updated trie: +/// +/// f -> [ g, h, j ] [...] [1] [...] +/// g -> [ j' ] [...] [1] [...] +/// h -> [ ... ] [...] [1] [...] +/// j -> [ ... ] [...] [N] [...] +/// j' -> [ ... ] [...] [10] [...] +/// +/// Note that we'll have a new node representing the path `f -> g -> j'` with +/// isolated data. This isolation gives us a means of representing the stack +/// traces as a path, as opposed to a key in a table. The alternative +/// implementation here would be to use a separate table for the path, and use +/// hashes of the path as an identifier to accumulate the information. We've +/// moved away from this approach as it takes a lot of time to compute the hash +/// every time we need to update a function's call information while we're +/// handling the entry and exit events. +/// +/// This approach allows us to maintain a shadow stack, which represents the +/// currently executing path, and on function exits quickly compute the amount +/// of time elapsed from the entry, then update the counters for the node +/// already represented in the trie. This necessitates an efficient +/// representation of the various data structures (the list of callees must be +/// cache-aware and efficient to look up, and the histogram must be compact and +/// quick to update) to enable us to keep the overheads of this implementation +/// to a minimum. +class FunctionCallTrie { +public: + struct Node; + + // We use a NodeRef type instead of a std::pair<...> so as not to rely on + // standard library types in this header. + struct NodeRef { + Node *NodePtr; + int32_t FId; + + // Constructor for in-place construction. + NodeRef(Node *N, int32_t F) : NodePtr(N), FId(F) {} + }; + + using NodeRefArray = Array; + using NodeRefAllocatorType = NodeRefArray::AllocatorType; + + // A Node in the FunctionCallTrie gives us a list of callees, the cumulative + // number of times this node actually appeared, the cumulative amount of time + // for this particular node including its children call times, and just the + // local time spent on this node. Each Node will have the ID of the XRay + // instrumented function that it is associated with. + struct Node { + Node *Parent; + NodeRefArray Callees; + int64_t CallCount; + int64_t CumulativeLocalTime; // Typically in TSC deltas, not wall-time. + int32_t FId; + + // We add a constructor here to allow us to construct Nodes in place through + // Array<...>'s AppendEmplace. + Node(Node *P, NodeRefAllocatorType &A, int64_t CC, int64_t CLT, int32_t F) + : Parent(P), Callees(A), CallCount(CC), CumulativeLocalTime(CLT), + FId(F) {} + + // TODO: Include the compact histogram. + }; + +private: + struct ShadowStackEntry { + int32_t FId; // We're copying the function ID into the stack to avoid having + // to reach into the node just to get the function ID.
+ uint64_t EntryTSC; + Node *NodePtr; + + // We add a constructor here to allow us to inplace-construct through + // Array<...>'s AppendEmplace. + ShadowStackEntry(int32_t F, uint64_t T, Node *N) + : FId(F), EntryTSC(T), NodePtr(N) {} + }; + + using NodeArray = Array; + using RootArray = Array; + using ShadowStackArray = Array; + +public: + // We collate the allocators we need into a single struct, as a convenience to + // allow us to initialize these as a group. + struct Allocators { + using NodeAllocatorType = NodeArray::AllocatorType; + using RootAllocatorType = RootArray::AllocatorType; + using ShadowStackAllocatorType = ShadowStackArray::AllocatorType; + using NodeRefAllocatorType = NodeRefAllocatorType; + + NodeAllocatorType *NodeAllocator = nullptr; + RootAllocatorType *RootAllocator = nullptr; + ShadowStackAllocatorType *ShadowStackAllocator = nullptr; + NodeRefAllocatorType *NodeRefAllocator = nullptr; + + Allocators() {} + Allocators(const Allocators &) = delete; + Allocators &operator=(const Allocators &) = delete; + + Allocators(Allocators &&O) + : NodeAllocator(O.NodeAllocator), RootAllocator(O.RootAllocator), + ShadowStackAllocator(O.ShadowStackAllocator), + NodeRefAllocator(O.NodeRefAllocator) { + O.NodeAllocator = nullptr; + O.RootAllocator = nullptr; + O.ShadowStackAllocator = nullptr; + O.NodeRefAllocator = nullptr; + } + + Allocators &operator=(Allocators &&O) { + { + auto Tmp = O.NodeAllocator; + O.NodeAllocator = this->NodeAllocator; + this->NodeAllocator = Tmp; + } + { + auto Tmp = O.RootAllocator; + O.RootAllocator = this->RootAllocator; + this->RootAllocator = Tmp; + } + { + auto Tmp = O.ShadowStackAllocator; + O.ShadowStackAllocator = this->ShadowStackAllocator; + this->ShadowStackAllocator = Tmp; + } + { + auto Tmp = O.NodeRefAllocator; + O.NodeRefAllocator = this->NodeRefAllocator; + this->NodeRefAllocator = Tmp; + } + return *this; + } + + ~Allocators() { + // Note that we cannot use delete on these pointers, as they need to be + // returned to the sanitizer_common library's internal memory tracking + // system. + if (NodeAllocator != nullptr) { + NodeAllocator->~NodeAllocatorType(); + InternalFree(NodeAllocator); + } + if (RootAllocator != nullptr) { + RootAllocator->~RootAllocatorType(); + InternalFree(RootAllocator); + } + if (ShadowStackAllocator != nullptr) { + ShadowStackAllocator->~ShadowStackAllocatorType(); + InternalFree(ShadowStackAllocator); + } + if (NodeRefAllocator != nullptr) { + NodeRefAllocator->~NodeRefAllocatorType(); + InternalFree(NodeRefAllocator); + } + } + }; // namespace __xray + + // TODO: Support configuration of options through the arguments. 
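For reference, the intended use of this grouped-allocator type, mirroring the unit tests in this patch (the TLD* names below are illustrative only), is one set of allocators and one trie per profiled thread:

// One set of allocators and one FunctionCallTrie per profiled thread.
thread_local FunctionCallTrie::Allocators TLDAllocators =
    FunctionCallTrie::InitAllocators();
thread_local FunctionCallTrie TLDTrie(TLDAllocators);

// The instrumentation handler then records events using TSC values obtained
// from readTSC(), e.g. TLDTrie.enterFunction(FuncId, TSC) on entry and
// TLDTrie.exitFunction(FuncId, TSC) on exit.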
+ static Allocators InitAllocators() { + Allocators A; + auto NodeAllocator = reinterpret_cast( + InternalAlloc(sizeof(Allocators::NodeAllocatorType))); + new (NodeAllocator) Allocators::NodeAllocatorType( + profilerFlags()->xray_profiling_per_thread_allocator_max, 0); + A.NodeAllocator = NodeAllocator; + + auto RootAllocator = reinterpret_cast( + InternalAlloc(sizeof(Allocators::RootAllocatorType))); + new (RootAllocator) Allocators::RootAllocatorType( + profilerFlags()->xray_profiling_per_thread_allocator_max, 0); + A.RootAllocator = RootAllocator; + + auto ShadowStackAllocator = + reinterpret_cast( + InternalAlloc(sizeof(Allocators::ShadowStackAllocatorType))); + new (ShadowStackAllocator) Allocators::ShadowStackAllocatorType( + profilerFlags()->xray_profiling_per_thread_allocator_max, 0); + A.ShadowStackAllocator = ShadowStackAllocator; + + auto NodeRefAllocator = + reinterpret_cast( + InternalAlloc(sizeof(Allocators::NodeRefAllocatorType))); + new (NodeRefAllocator) Allocators::NodeRefAllocatorType( + profilerFlags()->xray_profiling_per_thread_allocator_max, 0); + A.NodeRefAllocator = NodeRefAllocator; + return A; + } + +private: + NodeArray Nodes; + RootArray Roots; + ShadowStackArray ShadowStack; + NodeRefAllocatorType *NodeRefAllocator = nullptr; + + const Allocators &GetGlobalAllocators() { + static const Allocators A = [] { return InitAllocators(); }(); + return A; + } + +public: + explicit FunctionCallTrie(const Allocators &A) + : Nodes(*A.NodeAllocator), Roots(*A.RootAllocator), + ShadowStack(*A.ShadowStackAllocator), + NodeRefAllocator(A.NodeRefAllocator) {} + + FunctionCallTrie() : FunctionCallTrie(GetGlobalAllocators()) {} + + void enterFunction(int32_t FId, uint64_t TSC) { + // This function primarily deals with ensuring that the ShadowStack is + // consistent and ready for when an exit event is encountered. + if (UNLIKELY(ShadowStack.empty())) { + auto NewRoot = Nodes.AppendEmplace(nullptr, *NodeRefAllocator, 0, 0, FId); + if (UNLIKELY(NewRoot == nullptr)) + return; + Roots.Append(NewRoot); + ShadowStack.AppendEmplace(FId, TSC, NewRoot); + return; + } + + auto &Top = ShadowStack.back(); + auto TopNode = Top.NodePtr; + + // If we've seen this callee before, then we just access that node and place + // that on the top of the stack. + auto Callee = TopNode->Callees.find_element( + [FId](const NodeRef &NR) { return NR.FId == FId; }); + if (Callee != nullptr) { + CHECK_NE(Callee->NodePtr, nullptr); + ShadowStack.AppendEmplace(FId, TSC, Callee->NodePtr); + return; + } + + // This means we've never seen this stack before, create a new node here. + auto NewNode = Nodes.AppendEmplace(TopNode, *NodeRefAllocator, 0, 0, FId); + if (UNLIKELY(NewNode == nullptr)) + return; + TopNode->Callees.AppendEmplace(NewNode, FId); + ShadowStack.AppendEmplace(FId, TSC, NewNode); + return; + } + + void exitFunction(int32_t FId, uint64_t TSC) { + // When we exit a function, we look up the ShadowStack to see whether we've + // entered this function before. We do as little processing here as we can, + // since most of the hard work would have already been done at function + // entry. 
+ if (UNLIKELY(ShadowStack.empty())) + return; + + uint64_t CumulativeTreeTime = 0; + while (!ShadowStack.empty()) { + auto &Top = ShadowStack.back(); + auto TopNode = Top.NodePtr; + auto TopFId = TopNode->FId; + auto LocalTime = TSC - Top.EntryTSC; + TopNode->CallCount++; + TopNode->CumulativeLocalTime += LocalTime - CumulativeTreeTime; + CumulativeTreeTime += LocalTime; + ShadowStack.trim(1); + + // TODO: Update the histogram for the node. + if (TopFId == FId) + break; + } + } + + const RootArray &getRoots() const { return Roots; } + + // The deepCopyInto operation will update the provided FunctionCallTrie by + // re-creating the contents of this particular FunctionCallTrie in the other + // FunctionCallTrie. It will do this using a Depth First Traversal from the + // roots, and while doing so recreating the traversal in the provided + // FunctionCallTrie. + // + // This operation will *not* destroy the state in `O`, and thus may cause some + // duplicate entries in `O` if it is not empty. + void deepCopyInto(FunctionCallTrie &O) const { + for (const auto Root : getRoots()) { + // Add a node in O for this root. + auto NewRoot = + O.Nodes.AppendEmplace(nullptr, *O.NodeRefAllocator, Root->CallCount, + Root->CumulativeLocalTime, Root->FId); + O.Roots.Append(NewRoot); + + // We then push the root into a stack, to use as the parent marker for new + // nodes we push in as we're traversing depth-first down the call tree. + struct NodeAndParent { + FunctionCallTrie::Node *Node; + FunctionCallTrie::Node *NewNode; + }; + using Stack = Array; + + typename Stack::AllocatorType StackAllocator( + profilerFlags()->xray_profiling_stack_allocator_max, 0); + Stack DFSStack(StackAllocator); + DFSStack.Append(NodeAndParent{Root, NewRoot}); + while (!DFSStack.empty()) { + NodeAndParent NP = DFSStack.back(); + DCHECK_NE(NP.Node, nullptr); + DCHECK_NE(NP.NewNode, nullptr); + DFSStack.trim(1); + for (const auto Callee : NP.Node->Callees) { + auto NewNode = O.Nodes.AppendEmplace( + NP.NewNode, *O.NodeRefAllocator, Callee.NodePtr->CallCount, + Callee.NodePtr->CumulativeLocalTime, Callee.FId); + DCHECK_NE(NewNode, nullptr); + NP.NewNode->Callees.AppendEmplace(NewNode, Callee.FId); + DFSStack.Append(NodeAndParent{Callee.NodePtr, NewNode}); + } + } + } + } + + // The mergeInto operation will update the provided FunctionCallTrie by + // traversing the current trie's roots and updating (i.e. merging) the data in + // the nodes with the data in the target's nodes. If the node doesn't exist in + // the provided trie, we add a new one in the right position, and inherit the + // data from the original (current) trie, along with all its callees. 
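As exercised by the MergeInto unit test in this patch, the expected pattern is to fold several per-thread tries into one aggregate; a sketch (T0 and T1 stand for already-populated per-thread tries):

// Fold two per-thread tries into a single aggregate trie.
auto MergedAllocators = FunctionCallTrie::InitAllocators();
FunctionCallTrie Merged(MergedAllocators);
T0.mergeInto(Merged);
T1.mergeInto(Merged);
// Nodes for identical paths now carry summed CallCount and
// CumulativeLocalTime values.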
+ void mergeInto(FunctionCallTrie &O) const { + for (const auto Root : getRoots()) { + Node *TargetRoot = nullptr; + auto R = O.Roots.find_element( + [&](const Node *Node) { return Node->FId == Root->FId; }); + if (R == nullptr) { + TargetRoot = O.Nodes.AppendEmplace(nullptr, *O.NodeRefAllocator, 0, 0, + Root->FId); + O.Roots.Append(TargetRoot); + } else { + TargetRoot = *R; + } + + struct NodeAndTarget { + FunctionCallTrie::Node *OrigNode; + FunctionCallTrie::Node *TargetNode; + }; + using Stack = Array; + + typename Stack::AllocatorType StackAllocator( + profilerFlags()->xray_profiling_stack_allocator_max, 0); + Stack DFSStack(StackAllocator); + DFSStack.Append(NodeAndTarget{Root, TargetRoot}); + while (!DFSStack.empty()) { + NodeAndTarget NT = DFSStack.back(); + DCHECK_NE(NT.OrigNode, nullptr); + DCHECK_NE(NT.TargetNode, nullptr); + DFSStack.trim(1); + NT.TargetNode->CallCount += NT.OrigNode->CallCount; + NT.TargetNode->CumulativeLocalTime += NT.OrigNode->CumulativeLocalTime; + for (const auto Callee : NT.OrigNode->Callees) { + auto TargetCallee = NT.TargetNode->Callees.find_element( + [&](const FunctionCallTrie::NodeRef &C) { + return C.FId == Callee.FId; + }); + if (TargetCallee == nullptr) { + auto NewTargetNode = O.Nodes.AppendEmplace( + NT.TargetNode, *O.NodeRefAllocator, 0, 0, Callee.FId); + TargetCallee = + NT.TargetNode->Callees.AppendEmplace(NewTargetNode, Callee.FId); + } + DFSStack.Append(NodeAndTarget{Callee.NodePtr, TargetCallee->NodePtr}); + } + } + } + } + +}; // namespace __xray + +} // namespace __xray + +#endif // XRAY_FUNCTION_CALL_TRIE_H Index: compiler-rt/lib/xray/xray_profile_collector.h =================================================================== --- /dev/null +++ compiler-rt/lib/xray/xray_profile_collector.h @@ -0,0 +1,88 @@ +//===-- xray_profile_collector.h -------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// This file defines the interface for a data collection service, for XRay +// profiling. What we implement here is an in-process service where +// FunctionCallTrie instances can be handed off by threads, to be +// consolidated/collected. +// +//===----------------------------------------------------------------------===// +#ifndef XRAY_XRAY_PROFILE_COLLECTOR_H +#define XRAY_XRAY_PROFILE_COLLECTOR_H + +#include "xray_function_call_trie.h" + +#include "xray/xray_log_interface.h" + +namespace __xray { + +/// The ProfileCollectorService implements a centralised mechanism for +/// collecting FunctionCallTrie instances, indexed by thread ID. On demand, the +/// ProfileCollectorService can be queried for the most recent state of the +/// data, in a form that allows traversal. +namespace profileCollectorService { + +/// Posts the FunctionCallTrie assocaited with a specific Thread ID. This +/// will: +/// +/// - Make a copy of the FunctionCallTrie and store that against the Thread +/// ID. This will use the global allocator for the service-managed +/// FunctionCallTrie instances. +/// - Queue up a pointer to the FunctionCallTrie. +/// - If the queue is long enough (longer than some arbitrary threshold) we +/// then pre-calculate a single FunctionCallTrie for the whole process. 
+/// +/// +/// We are making a copy of the FunctionCallTrie because the intent is to have +/// this function be called at thread exit, or soon after the profiling +/// handler is finalized through the XRay APIs. By letting threads each +/// process their own thread-local FunctionCallTrie instances, we're removing +/// the need for synchronisation across threads while we're profiling. +/// However, once we're done profiling, we can then collect copies of these +/// FunctionCallTrie instances and pay the cost of the copy. +/// +/// NOTE: In the future, if this turns out to be more costly than "moving" the +/// FunctionCallTrie instances from the owning thread to the collector +/// service, then we can change the implementation to do it this way (moving) +/// instead. +void post(const FunctionCallTrie &T, tid_t TId); + +/// The serialize will process all FunctionCallTrie instances in memory, and +/// turn those into specifically formatted blocks, each describing the +/// function call trie's contents in a compact form. In memory, this looks +/// like the following layout: +/// +/// - block size (32 bits) +/// - block number (32 bits) +/// - thread id (64 bits) +/// - list of records: +/// - function ids in reverse call order, from leaf to root, terminated by +/// 0 (32 bits per function id) +/// - call count (64 bit) +/// - cumulative local time (64 bit) +/// - record delimiter (64 bit, 0x0) +/// +void serialize(); + +/// The reset function will clear out any internal memory held by the +/// service. The intent is to have the resetting be done in calls to the +/// initialization routine, or explicitly through the flush log API. +void reset(); + +/// This nextBuffer function is meant to implement the iterator functionality, +/// provided in the XRay API. +XRayBuffer nextBuffer(XRayBuffer B); + +}; // namespace profileCollectorService + +} // namespace __xray + +#endif // XRAY_XRAY_PROFILE_COLLECTOR_H Index: compiler-rt/lib/xray/xray_profile_collector.cc =================================================================== --- /dev/null +++ compiler-rt/lib/xray/xray_profile_collector.cc @@ -0,0 +1,275 @@ +//===-- xray_profile_collector.cc ------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// This implements the interface for the profileCollectorService. 
+// +//===----------------------------------------------------------------------===// +#include + +#include "xray_profile_collector.h" +#include "sanitizer_common/sanitizer_common.h" +#include "sanitizer_common/sanitizer_vector.h" +#include "xray_profiler_flags.h" +#include + +namespace __xray { +namespace profileCollectorService { + +namespace { + +SpinMutex GlobalMutex; +struct ThreadTrie { + tid_t TId; + FunctionCallTrie *Trie; +}; +Vector ThreadTries; + +struct ProfileBuffer { + void *Data; + size_t Size; +}; +Vector ProfileBuffers; + +struct BlockHeader { + u32 BlockSize; + u32 BlockNum; + u64 ThreadId; +}; + +FunctionCallTrie::Allocators *GlobalAllocators = nullptr; + +} // namespace + +void post(const FunctionCallTrie &T, tid_t TId) { + static const bool UNUSED Once = [] { + SpinMutexLock Lock(&GlobalMutex); + GlobalAllocators = reinterpret_cast( + InternalAlloc(sizeof(FunctionCallTrie::Allocators))); + new (GlobalAllocators) FunctionCallTrie::Allocators(); + *GlobalAllocators = FunctionCallTrie::InitAllocators(); + return false; + }(); + DCHECK_NE(GlobalAllocators, nullptr); + + ThreadTrie *Item = nullptr; + { + SpinMutexLock Lock(&GlobalMutex); + if (GlobalAllocators == nullptr) + return; + + Item = ThreadTries.PushBack(); + Item->TId = TId; + Item->Trie = reinterpret_cast( + InternalAlloc(sizeof(FunctionCallTrie))); + new (Item->Trie) FunctionCallTrie(*GlobalAllocators); + DCHECK_NE(Item->Trie, nullptr); + } + DCHECK_NE(Item, nullptr); + + T.deepCopyInto(*Item->Trie); +} + +using PathArray = Array; + +struct ProfileRecord { + using PathAllocator = typename PathArray::AllocatorType; + + // The Path in this record is in reverse order. + PathArray *Path = nullptr; + const FunctionCallTrie::Node *Node = nullptr; + + // Constructor for in-place construction. + ProfileRecord(PathAllocator &A, const FunctionCallTrie::Node *N) + : Path([&] { + auto P = + reinterpret_cast(InternalAlloc(sizeof(PathArray))); + new (P) PathArray(A); + return P; + }()), + Node(N) {} +}; + +namespace { + +using ProfileRecordArray = Array; + +// We go through a FunctionCallTrie and traverse from the root, in DFS fashion, +// to generate the path(s) and output the data. +void populateRecords(ProfileRecordArray &PRs, ProfileRecord::PathAllocator &PA, + const FunctionCallTrie &Trie) { + for (const auto R : Trie.getRoots()) { + using StackArray = Array; + using StackAllocator = typename StackArray::AllocatorType; + StackAllocator StackAlloc( + profilerFlags()->xray_profiling_stack_allocator_max, 0); + StackArray DFSStack(StackAlloc); + DFSStack.Append(R); + while (!DFSStack.empty()) { + auto Node = DFSStack.back(); + DFSStack.trim(1); + auto Record = PRs.AppendEmplace(PA, Node); + DCHECK_NE(Record, nullptr); + + // Traverse the Node's parents and as we're doing so, get the FIds in + // the order they appear. + for (auto N = Node; N != nullptr; N = N->Parent) + Record->Path->Append(N->FId); + DCHECK(!Record->Path->empty()); + + for (const auto C : Node->Callees) + DFSStack.Append(C.NodePtr); + } + } +} + +void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header, + const ProfileRecordArray &ProfileRecords) { + auto NextPtr = static_cast( + internal_memcpy(Buffer->Data, &Header, sizeof(Header))) + + sizeof(Header); + for (const auto &Record : ProfileRecords) { + // List of IDs follow: + for (const auto FId : *Record.Path) + NextPtr = + static_cast(internal_memcpy(NextPtr, &FId, sizeof(FId))) + + sizeof(FId); + + // Add the sentinel here. 
+ constexpr int32_t SentinelFId = 0; + NextPtr = static_cast( + internal_memset(NextPtr, SentinelFId, sizeof(SentinelFId))) + + sizeof(SentinelFId); + + // Add the node data here. + NextPtr = + static_cast(internal_memcpy(NextPtr, &Record.Node->CallCount, + sizeof(Record.Node->CallCount))) + + sizeof(Record.Node->CallCount); + NextPtr = static_cast( + internal_memcpy(NextPtr, &Record.Node->CumulativeLocalTime, + sizeof(Record.Node->CumulativeLocalTime))) + + sizeof(Record.Node->CumulativeLocalTime); + + // Add an end of record sentinel here. + constexpr uint64_t EndOfRecord = 0x0; + NextPtr = static_cast( + internal_memset(NextPtr, EndOfRecord, sizeof(EndOfRecord))) + + sizeof(EndOfRecord); + } + + DCHECK_EQ(NextPtr - static_cast(Buffer->Data), Buffer->Size); +} + +} // namespace + +void serialize() { + SpinMutexLock Lock(&GlobalMutex); + + // Clear out the global ProfileBuffers. + for (uptr I = 0; I < ProfileBuffers.Size(); ++I) + InternalFree(ProfileBuffers[I].Data); + ProfileBuffers.Reset(); + + if (ThreadTries.Size() == 0) + return; + + // Then repopulate the global ProfileBuffers. + for (u32 I = 0; I < ThreadTries.Size(); ++I) { + using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType; + ProfileRecordAllocator PRAlloc( + profilerFlags()->xray_profiling_global_allocator_max, 0); + ProfileRecord::PathAllocator PathAlloc( + profilerFlags()->xray_profiling_global_allocator_max, 0); + ProfileRecordArray ProfileRecords(PRAlloc); + + // First, we want to compute the amount of space we're going to need. We'll + // use a local allocator and an __xray::Array<...> to store the intermediary + // data, then compute the size as we're going along. Then we'll allocate the + // contiguous space to contain the thread buffer data. + const auto &Trie = *ThreadTries[I].Trie; + if (Trie.getRoots().empty()) + continue; + populateRecords(ProfileRecords, PathAlloc, Trie); + DCHECK(!Trie.getRoots().empty()); + DCHECK(!ProfileRecords.empty()); + + // Go through each record, to compute the sizes. + // + // header size = block size (4 bytes) + // + block number (4 bytes) + // + thread id (8 bytes) + // record size = path ids (4 bytes * number of ids + sentinel 4 bytes) + // + call count (8 bytes) + // + local time (8 bytes) + // + end of record (8 bytes) + u32 CumulativeSizes = 0; + for (const auto &Record : ProfileRecords) + CumulativeSizes += 28 + (4 * Record.Path->size()); + + BlockHeader Header{16 + CumulativeSizes, I, ThreadTries[I].TId}; + auto Buffer = ProfileBuffers.PushBack(); + Buffer->Size = sizeof(Header) + CumulativeSizes; + Buffer->Data = InternalAlloc(Buffer->Size, nullptr, 64); + DCHECK_NE(Buffer->Data, nullptr); + serializeRecords(Buffer, Header, ProfileRecords); + + // Now clean up the ProfileRecords array, one at a time. + for (auto &Record : ProfileRecords) { + Record.Path->~PathArray(); + InternalFree(Record.Path); + } + } +} + +void reset() { + SpinMutexLock Lock(&GlobalMutex); + // Clear out the profile buffers that have been serialized. + for (uptr I = 0; I < ProfileBuffers.Size(); ++I) + InternalFree(ProfileBuffers[I].Data); + ProfileBuffers.Reset(); + + // Clear out the function call tries per thread. + for (uptr I = 0; I < ThreadTries.Size(); ++I) { + auto &T = ThreadTries[I]; + T.Trie->~FunctionCallTrie(); + InternalFree(T.Trie); + } + ThreadTries.Reset(); + + // Reset the global allocators. 
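To sanity-check the size computation in serialize() above, a worked example based on the two-node trie built in the PostSerializeCollect unit test: the record paths are [1] and [2, 1], so CumulativeSizes = (28 + 4 * 1) + (28 + 4 * 2) = 68, the header adds 16 bytes, and both BlockHeader::BlockSize and Buffer->Size come out to 84 bytes.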
+ if (GlobalAllocators != nullptr) { + GlobalAllocators->~Allocators(); + InternalFree(GlobalAllocators); + GlobalAllocators = nullptr; + } + GlobalAllocators = reinterpret_cast( + InternalAlloc(sizeof(FunctionCallTrie::Allocators))); + new (GlobalAllocators) FunctionCallTrie::Allocators(); + *GlobalAllocators = FunctionCallTrie::InitAllocators(); +} + +XRayBuffer nextBuffer(XRayBuffer B) { + // FIXME: Find a way to identify the buffers, somehow, from within this + // function. + SpinMutexLock Lock(&GlobalMutex); + if (B.Data == nullptr && ProfileBuffers.Size()) + return {ProfileBuffers[0].Data, ProfileBuffers[0].Size}; + + BlockHeader Header; + internal_memcpy(&Header, B.Data, sizeof(BlockHeader)); + auto NextBlock = Header.BlockNum + 1; + if (NextBlock < ProfileBuffers.Size()) + return {ProfileBuffers[NextBlock].Data, ProfileBuffers[NextBlock].Size}; + return {nullptr, 0}; +} + +} // namespace profileCollectorService +} // namespace __xray Index: compiler-rt/lib/xray/xray_profiler.cc =================================================================== --- /dev/null +++ compiler-rt/lib/xray/xray_profiler.cc @@ -0,0 +1,296 @@ +//===-- xray_profiling.cc --------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// This is the implementation of a profiling handler. +// +//===----------------------------------------------------------------------===// +#include + +#include "sanitizer_common/sanitizer_atomic.h" +#include "sanitizer_common/sanitizer_flags.h" +#include "xray/xray_interface.h" +#include "xray/xray_log_interface.h" + +#include "xray_flags.h" +#include "xray_profile_collector.h" +#include "xray_profiler_flags.h" +#include "xray_tsc.h" +#include "xray_utils.h" +#include + +namespace __xray { + +namespace { + +__sanitizer::atomic_sint32_t ProfilerLogFlushStatus = { + XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING}; + +__sanitizer::atomic_sint32_t ProfilerLogStatus = { + XRayLogInitStatus::XRAY_LOG_UNINITIALIZED}; + +__sanitizer::SpinMutex ProfilerOptionsMutex; + +struct alignas(64) ProfilingTLD { + FunctionCallTrie::Allocators *Allocators = nullptr; + FunctionCallTrie *FCT = nullptr; +}; + +static pthread_key_t ProfilingKey; + +ProfilingTLD &getThreadLocalData() XRAY_NEVER_INSTRUMENT { + thread_local ProfilingTLD TLD; + thread_local bool UNUSED Once = [] { + pthread_setspecific(ProfilingKey, &TLD); + return false; + }(); + + // We need to check whether the global flag to finalizing/finalized has been + // switched. If it is, then we ought to not actually initialise the data. + auto Status = __sanitizer::atomic_load(&ProfilerLogStatus, + __sanitizer::memory_order_acquire); + if (Status == XRayLogInitStatus::XRAY_LOG_FINALIZING || + Status == XRayLogInitStatus::XRAY_LOG_FINALIZED) + return TLD; + + // If we're live, then we re-initialize TLD if the pointers are not null. 
+ if (UNLIKELY(TLD.Allocators == nullptr && TLD.FCT == nullptr)) { + TLD.Allocators = reinterpret_cast( + InternalAlloc(sizeof(FunctionCallTrie::Allocators))); + new (TLD.Allocators) FunctionCallTrie::Allocators(); + *TLD.Allocators = FunctionCallTrie::InitAllocators(); + TLD.FCT = reinterpret_cast( + InternalAlloc(sizeof(FunctionCallTrie))); + new (TLD.FCT) FunctionCallTrie(*TLD.Allocators); + } + + return TLD; +} + +} // namespace + +const char *profilerCompilerDefinedFlags() XRAY_NEVER_INSTRUMENT { +#ifdef XRAY_PROFILER_DEFAULT_OPTIONS + return SANITIZER_STRINGIFY(XRAY_PROFILER_DEFAULT_OPTIONS); +#else + return ""; +#endif +} + +__sanitizer::atomic_sint32_t ProfileFlushStatus = { + XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING}; + +XRayLogFlushStatus profilingFlush() XRAY_NEVER_INSTRUMENT { + // When flushing, all we really do is reset the global state, and only when + // the log has already been finalized. + if (__sanitizer::atomic_load(&ProfilerLogStatus, + __sanitizer::memory_order_acquire) != + XRayLogInitStatus::XRAY_LOG_FINALIZED) { + if (__sanitizer::Verbosity()) + Report("Not flushing profiles, profiler not been finalized.\n"); + return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING; + } + + s32 Result = XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING; + if (!__sanitizer::atomic_compare_exchange_strong( + &ProfilerLogFlushStatus, &Result, + XRayLogFlushStatus::XRAY_LOG_FLUSHING, + __sanitizer::memory_order_release)) { + if (__sanitizer::Verbosity()) + Report("Not flushing profiles, implementation still finalizing.\n"); + } + + profileCollectorService::reset(); + + __sanitizer::atomic_store(&ProfilerLogStatus, + XRayLogFlushStatus::XRAY_LOG_FLUSHED, + __sanitizer::memory_order_release); + + return XRayLogFlushStatus::XRAY_LOG_FLUSHED; +} + +namespace { + +thread_local volatile bool ReentranceGuard = false; + +void postCurrentThreadFCT(ProfilingTLD &TLD) { + if (TLD.Allocators == nullptr || TLD.FCT == nullptr) + return; + + profileCollectorService::post(*TLD.FCT, GetTid()); + TLD.FCT->~FunctionCallTrie(); + TLD.Allocators->~Allocators(); + InternalFree(TLD.FCT); + InternalFree(TLD.Allocators); + TLD.FCT = nullptr; + TLD.Allocators = nullptr; +} + +} // namespace + +void profilingHandleArg0(int32_t FuncId, + XRayEntryType Entry) XRAY_NEVER_INSTRUMENT { + unsigned char CPU; + auto TSC = readTSC(CPU); + if (ReentranceGuard) + return; + ReentranceGuard = true; + + auto Status = __sanitizer::atomic_load(&ProfilerLogStatus, + __sanitizer::memory_order_acquire); + auto &TLD = getThreadLocalData(); + if (UNLIKELY(Status == XRayLogInitStatus::XRAY_LOG_FINALIZED || + Status == XRayLogInitStatus::XRAY_LOG_FINALIZING)) { + postCurrentThreadFCT(TLD); + ReentranceGuard = false; + return; + } + + switch (Entry) { + case XRayEntryType::ENTRY: + case XRayEntryType::LOG_ARGS_ENTRY: + TLD.FCT->enterFunction(FuncId, TSC); + break; + case XRayEntryType::EXIT: + case XRayEntryType::TAIL: + TLD.FCT->exitFunction(FuncId, TSC); + break; + default: + // FIXME: Handle bugs. 
+    break;
+  }
+
+  ReentranceGuard = false;
+}
+
+void profilingHandleArg1(int32_t FuncId, XRayEntryType Entry,
+                         uint64_t) XRAY_NEVER_INSTRUMENT {
+  return profilingHandleArg0(FuncId, Entry);
+}
+
+XRayLogInitStatus profilingFinalize() XRAY_NEVER_INSTRUMENT {
+  s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_INITIALIZED;
+  if (!__sanitizer::atomic_compare_exchange_strong(
+          &ProfilerLogStatus, &CurrentStatus,
+          XRayLogInitStatus::XRAY_LOG_FINALIZING,
+          __sanitizer::memory_order_release)) {
+    if (__sanitizer::Verbosity())
+      Report("Cannot finalize profile, the profiler is not initialized.\n");
+    return static_cast<XRayLogInitStatus>(CurrentStatus);
+  }
+
+  // Wait a grace period to allow threads to see that we're finalizing.
+  __sanitizer::SleepForMillis(profilerFlags()->xray_profiling_grace_period_ms);
+
+  // We also want to make sure that the current thread's data is cleaned up, if
+  // we have any.
+  auto &TLD = getThreadLocalData();
+  postCurrentThreadFCT(TLD);
+
+  // Then we force serialize the log data.
+  profileCollectorService::serialize();
+
+  __sanitizer::atomic_store(&ProfilerLogStatus,
+                            XRayLogInitStatus::XRAY_LOG_FINALIZED,
+                            __sanitizer::memory_order_release);
+  return XRayLogInitStatus::XRAY_LOG_FINALIZED;
+}
+
+XRayLogInitStatus
+profilingLoggingInit(size_t BufferSize, size_t BufferMax, void *Options,
+                     size_t OptionsSize) XRAY_NEVER_INSTRUMENT {
+  s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+  if (!__sanitizer::atomic_compare_exchange_strong(
+          &ProfilerLogStatus, &CurrentStatus,
+          XRayLogInitStatus::XRAY_LOG_INITIALIZING,
+          __sanitizer::memory_order_release)) {
+    if (__sanitizer::Verbosity())
+      Report(
+          "Cannot initialize an already initialized profiling implementation.\n");
+    return static_cast<XRayLogInitStatus>(CurrentStatus);
+  }
+
+  // Here we use the sanitizer flag parsing mechanism. When PR36790 is fixed,
+  // migrate to using a different API for configuration.
+  {
+    SpinMutexLock Lock(&ProfilerOptionsMutex);
+    FlagParser ConfigParser;
+    auto *F = profilerFlags();
+    F->setDefaults();
+    registerProfilerFlags(&ConfigParser, F);
+    const char *ProfilerCompileFlags = profilerCompilerDefinedFlags();
+    ConfigParser.ParseString(ProfilerCompileFlags);
+    ConfigParser.ParseString(static_cast<const char *>(Options));
+    if (Verbosity())
+      ReportUnrecognizedFlags();
+  }
+
+  // We need to reset the profile data collection implementation now.
+  profileCollectorService::reset();
+
+  // We need to set up the at-thread-exit handler.
+  static bool UNUSED Once = [] {
+    pthread_key_create(&ProfilingKey, +[](void *) {
+      // This is the thread-exit handler.
+      auto &TLD = getThreadLocalData();
+      if (TLD.Allocators == nullptr && TLD.FCT == nullptr)
+        return;
+
+      postCurrentThreadFCT(TLD);
+    });
+    return false;
+  }();
+
+  __xray_log_set_buffer_iterator(profileCollectorService::nextBuffer);
+  __xray_set_handler(profilingHandleArg0);
+  __xray_set_handler_arg1(profilingHandleArg1);
+
+  __sanitizer::atomic_store(&ProfilerLogStatus,
+                            XRayLogInitStatus::XRAY_LOG_INITIALIZED,
+                            __sanitizer::memory_order_release);
+  if (__sanitizer::Verbosity())
+    Report("XRay Profiling init successful.\n");
+
+  return XRayLogInitStatus::XRAY_LOG_INITIALIZED;
+}
+
+bool profilingDynamicInitializer() XRAY_NEVER_INSTRUMENT {
+  // Set up the flag defaults from the static defaults and the
+  // compiler-provided defaults.
+ { + SpinMutexLock Lock(&ProfilerOptionsMutex); + auto *F = profilerFlags(); + F->setDefaults(); + FlagParser ProfilingParser; + registerProfilerFlags(&ProfilingParser, F); + const char *ProfilerCompileFlags = profilerCompilerDefinedFlags(); + ProfilingParser.ParseString(ProfilerCompileFlags); + } + + XRayLogImpl Impl{ + profilingLoggingInit, + profilingFinalize, + profilingHandleArg0, + profilingFlush, + }; + auto RegistrationResult = __xray_log_register_mode("xray-profiling", Impl); + if (RegistrationResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK && + __sanitizer::Verbosity()) + Report( + "Cannot register XRay Profiling mode to 'xray-profiling'; error = %d\n", + RegistrationResult); + if (!__sanitizer::internal_strcmp(flags()->xray_mode, "xray-profiling")) + __xray_set_log_impl(Impl); + return true; +} + +} // namespace __xray + +static auto UNUSED Unused = __xray::profilingDynamicInitializer(); Index: compiler-rt/lib/xray/xray_profiler_flags.h =================================================================== --- /dev/null +++ compiler-rt/lib/xray/xray_profiler_flags.h @@ -0,0 +1,39 @@ +//===-- xray_profiler_flags.h ----------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// XRay profiler runtime flags. +//===----------------------------------------------------------------------===// + +#ifndef XRAY_PROFILER_FLAGS_H +#define XRAY_PROFILER_FLAGS_H + +#include "sanitizer_common/sanitizer_flag_parser.h" +#include "sanitizer_common/sanitizer_internal_defs.h" + +namespace __xray { + +struct ProfilerFlags { +#define XRAY_FLAG(Type, Name, DefaultValue, Description) Type Name; +#include "xray_profiler_flags.inc" +#undef XRAY_FLAG + + void setDefaults(); +}; + +extern ProfilerFlags xray_profiler_flags_dont_use_directly; +inline ProfilerFlags *profilerFlags() { + return &xray_profiler_flags_dont_use_directly; +} +void registerProfilerFlags(FlagParser *P, ProfilerFlags *F); + +} // namespace __xray + +#endif // XRAY_PROFILER_FLAGS_H Index: compiler-rt/lib/xray/xray_profiler_flags.cc =================================================================== --- /dev/null +++ compiler-rt/lib/xray/xray_profiler_flags.cc @@ -0,0 +1,40 @@ +//===-- xray_flags.h -------------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// XRay runtime flags. +//===----------------------------------------------------------------------===// + +#include "xray_profiler_flags.h" +#include "sanitizer_common/sanitizer_common.h" +#include "sanitizer_common/sanitizer_flag_parser.h" +#include "sanitizer_common/sanitizer_libc.h" +#include "xray_defs.h" + +namespace __xray { + +// Storage for the profiler flags. 
+ProfilerFlags xray_profiler_flags_dont_use_directly; + +void ProfilerFlags::setDefaults() XRAY_NEVER_INSTRUMENT { +#define XRAY_FLAG(Type, Name, DefaultValue, Description) Name = DefaultValue; +#include "xray_profiler_flags.inc" +#undef XRAY_FLAG +} + +void registerProfilerFlags(FlagParser *P, + ProfilerFlags *F) XRAY_NEVER_INSTRUMENT { +#define XRAY_FLAG(Type, Name, DefaultValue, Description) \ + RegisterFlag(P, #Name, Description, &F->Name); +#include "xray_profiler_flags.inc" +#undef XRAY_FLAG +} + +} // namespace __xray Index: compiler-rt/lib/xray/xray_profiler_flags.inc =================================================================== --- /dev/null +++ compiler-rt/lib/xray/xray_profiler_flags.inc @@ -0,0 +1,26 @@ +//===-- xray_flags.inc ------------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// XRay profiling runtime flags. +// +//===----------------------------------------------------------------------===// +#ifndef XRAY_FLAG +#error "Define XRAY_FLAG prior to including this file!" +#endif + +XRAY_FLAG(uptr, xray_profiling_per_thread_allocator_max, 2 << 20, + "Maximum size of any single per-thread allocator.") +XRAY_FLAG(uptr, xray_profiling_global_allocator_max, 2 << 24, + "Maximum size of the global allocator for profile storage.") +XRAY_FLAG(uptr, xray_profiling_stack_allocator_max, 2 << 24, + "Maximum size of the traversal stack allocator.") +XRAY_FLAG(int, xray_profiling_grace_period_ms, 100, + "Profile collection will wait this much time in milliseconds before " + "resetting the global state. This gives a chance to threads to " + "notice that the profiler has been finalized and clean up.") Index: compiler-rt/lib/xray/xray_segmented_array.h =================================================================== --- /dev/null +++ compiler-rt/lib/xray/xray_segmented_array.h @@ -0,0 +1,292 @@ +//===-- xray_segmented_array.h ---------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// Defines the implementation of a segmented array, with fixed-size chunks +// backing the segments. +// +//===----------------------------------------------------------------------===// +#ifndef XRAY_SEGMENTED_ARRAY_H +#define XRAY_SEGMENTED_ARRAY_H + +#include "sanitizer_common/sanitizer_allocator.h" +#include "xray_allocator.h" +#include +#include + +namespace __xray { + +/// The Array type provides an interface similar to std::vector<...> but does +/// not shrink in size. Once constructed, elements can be appended but cannot be +/// removed. The implementation is heavily dependent on the contract provided by +/// the Allocator type, in that all memory will be released when the Allocator +/// is destroyed. When an Array is destroyed, it will destroy elements in the +/// backing store but will not free the memory. The parameter N defines how many +/// elements of T there should be in a single block. By default we compute this +/// number from packing as many T's as possible in a single cache line. 
+template +struct Array { + static constexpr size_t AllocatorChunkSize = sizeof(T) * N; + using AllocatorType = Allocator; + static_assert(std::is_trivially_destructible::value, + "T must be trivially destructible."); + +private: + struct Chunk { + typename AllocatorType::Block Block; + static constexpr size_t Size = N; + Chunk *Prev = nullptr; + Chunk *Next = nullptr; + }; + + static Chunk SentinelChunk; + + AllocatorType *Allocator; + Chunk *Head = &SentinelChunk; + Chunk *Tail = &SentinelChunk; + size_t Size = 0; + + Chunk *NewChunk() { + auto Block = Allocator->Allocate(); + if (Block.Data == nullptr) + return nullptr; + // TODO: Maybe use a separate managed allocator for Chunk instances? + auto C = reinterpret_cast(InternalAlloc(sizeof(Chunk))); + if (C == nullptr) + return nullptr; + C->Block = Block; + return C; + } + + static AllocatorType &GetGlobalAllocator() { + static AllocatorType *const GlobalAllocator = [] { + AllocatorType *A = reinterpret_cast( + InternalAlloc(sizeof(AllocatorType))); + new (A) AllocatorType(2 << 10, 0); + return A; + }(); + + return *GlobalAllocator; + } + + Chunk *InitHeadAndTail() { + DCHECK_EQ(Head, &SentinelChunk); + DCHECK_EQ(Tail, &SentinelChunk); + auto Chunk = NewChunk(); + if (Chunk == nullptr) + return nullptr; + Chunk->Prev = &SentinelChunk; + Chunk->Next = &SentinelChunk; + Head = Chunk; + Tail = Chunk; + return Chunk; + } + + Chunk *AppendNewChunk() { + auto Chunk = NewChunk(); + if (Chunk == nullptr) + return nullptr; + Tail->Next = Chunk; + Chunk->Prev = Tail; + Chunk->Next = &SentinelChunk; + Tail = Chunk; + return Chunk; + } + + class Iterator { + Chunk *C = nullptr; + size_t Offset = 0; + + public: + Iterator(Chunk *IC, size_t O) : C(IC), Offset(O) {} + + Iterator &operator++() { + DCHECK_NE(C, nullptr); + if (++Offset % N) + return *this; + + // At this point, we know that Offset % N == 0, so we must advance the + // chunk pointer. + DCHECK_EQ(Offset % N, 0); + C = C->Next; + return *this; + } + + Iterator &operator--() { + DCHECK_NE(C, nullptr); + if (--Offset % N) { + return *this; + } + + // At this point, we know that Offset % N == 0, so we must retreat the + // chunk pointer. 
+ DCHECK_EQ(Offset % N, 0); + C = C->Prev; + return *this; + } + + Iterator operator++(int) { + Iterator Copy(*this); + if (++Offset % N == 0) + C = C->Next; + DCHECK_NE(Copy.Offset, this->Offset); + return Copy; + } + + Iterator operator--(int) { + Iterator Copy(*this); + if (--Offset % N == 0) + C = C->Prev; + DCHECK_NE(Copy.Offset, this->Offset); + return Copy; + } + + friend bool operator==(const Iterator &L, const Iterator &R) { + return L.C == R.C && L.Offset == R.Offset; + } + + friend bool operator!=(const Iterator &L, const Iterator &R) { + return !(L == R); + } + + const T &operator*() const { + DCHECK_NE(C, &SentinelChunk); + return reinterpret_cast(C->Block.Data)[Offset % N]; + } + + T &operator*() { + DCHECK_NE(C, &SentinelChunk); + return reinterpret_cast(C->Block.Data)[Offset % N]; + } + }; + +public: + explicit Array(AllocatorType &A) : Allocator(&A) {} + Array() : Array(GetGlobalAllocator()) {} + + Array(const Array &) = delete; + Array(Array &&O) NOEXCEPT : Allocator(O.Allocator), + Head(O.Head), + Tail(O.Tail), + Size(O.Size) { + O.Head = &SentinelChunk; + O.Tail = &SentinelChunk; + O.Size = 0; + } + + bool empty() const { return Size == 0; } + + AllocatorType &allocator() const { + DCHECK_NE(Allocator, nullptr); + return *Allocator; + } + + size_t size() const { return Size; } + + T *Append(const T &E) { + if (UNLIKELY(Head == &SentinelChunk)) + if (InitHeadAndTail() == nullptr) + return nullptr; + + auto Offset = Size % N; + if (UNLIKELY(Size != 0 && Offset == 0)) + if (AppendNewChunk() == nullptr) + return nullptr; + + auto Position = reinterpret_cast(Tail->Block.Data) + Offset; + *Position = E; + ++Size; + return Position; + } + + template T *AppendEmplace(Args &&... args) { + if (UNLIKELY(Head == &SentinelChunk)) + if (InitHeadAndTail() == nullptr) + return nullptr; + + auto Offset = Size % N; + if (UNLIKELY(Size != 0 && Offset == 0)) + if (AppendNewChunk() == nullptr) + return nullptr; + + auto Position = reinterpret_cast(Tail->Block.Data) + Offset; + // In-place construct at Position. + new (Position) T(std::forward(args)...); + ++Size; + return Position; + } + + T &operator[](size_t Offset) const { + DCHECK_LE(Offset, Size); + // We need to traverse the array enough times to find the element at Offset. + auto C = Head; + while (Offset >= N) { + C = C->Next; + Offset -= N; + DCHECK_NE(C, &SentinelChunk); + } + auto Position = reinterpret_cast(C->Block.Data) + Offset; + return *Position; + } + + T &front() const { + DCHECK_NE(Head, &SentinelChunk); + DCHECK_NE(Size, 0u); + return *reinterpret_cast(Head->Block.Data); + } + + T &back() const { + DCHECK_NE(Tail, &SentinelChunk); + auto Offset = (Size - 1) % N; + return *(reinterpret_cast(Tail->Block.Data) + Offset); + } + + template T *find_element(Predicate P) const { + if (empty()) + return nullptr; + + auto C = Head; + auto E = Size; + while (E != 0) { + DCHECK(C != &SentinelChunk); + auto LE = N < E ? N : E; + auto end = reinterpret_cast(C->Block.Data) + LE; + for (auto It = reinterpret_cast(C->Block.Data); It != end; ++It) + if (P(*It)) + return It; + C = C->Next; + E -= LE; + } + return nullptr; + } + + /// Remove N Elements from the end. + void trim(size_t Elements) { + // TODO: Figure out whether we need to actually destroy the objects that are + // going to be trimmed -- determine whether assuming that trivially + // destructible objects are OK to discard, from the XRay implementation. + DCHECK_LE(Elements, Size); + Size -= Elements; + } + + // Provide iterators. 
+  Iterator begin() const { return Iterator(Head, 0); }
+  Iterator end() const { return Iterator(Tail, Size); }
+  Iterator cbegin() const { return Iterator(Head, 0); }
+  Iterator cend() const { return Iterator(Tail, Size); }
+};
+
+template <class T, size_t N>
+typename Array<T, N>::Chunk Array<T, N>::SentinelChunk;
+
+} // namespace __xray
+
+#endif // XRAY_SEGMENTED_ARRAY_H
Index: compiler-rt/test/xray/TestCases/Posix/profiling-multi-threaded.cc
===================================================================
--- /dev/null
+++ compiler-rt/test/xray/TestCases/Posix/profiling-multi-threaded.cc
@@ -0,0 +1,53 @@
+// Check that we can get a profile from a multi-threaded application, on
+// demand through the XRay logging implementation API.
+//
+// FIXME: Make -fxray-modes=xray-profiling part of the default?
+// RUN: %clangxx_xray -std=c++11 %s -o %t -fxray-modes=xray-profiler
+// RUN: %run %t
+//
+// UNSUPPORTED: target-is-mips64,target-is-mips64el
+
+#include "xray/xray_interface.h"
+#include "xray/xray_log_interface.h"
+#include <cassert>
+#include <cstdio>
+#include <string>
+#include <thread>
+
+#define XRAY_ALWAYS_INSTRUMENT [[clang::xray_always_instrument]]
+#define XRAY_NEVER_INSTRUMENT [[clang::xray_never_instrument]]
+
+XRAY_ALWAYS_INSTRUMENT void f2() { return; }
+XRAY_ALWAYS_INSTRUMENT void f1() { f2(); }
+XRAY_ALWAYS_INSTRUMENT void f0() { f1(); }
+
+using namespace std;
+
+volatile int buffer_counter = 0;
+
+XRAY_NEVER_INSTRUMENT void process_buffer(const char *, XRayBuffer) {
+  // FIXME: Actually assert the contents of the buffer.
+  ++buffer_counter;
+}
+
+XRAY_ALWAYS_INSTRUMENT int main(int, char **) {
+  assert(__xray_log_select_mode("xray-profiling") ==
+         XRayLogRegisterStatus::XRAY_REGISTRATION_OK);
+  assert(__xray_log_get_current_mode() != nullptr);
+  std::string current_mode = __xray_log_get_current_mode();
+  assert(current_mode == "xray-profiling");
+  assert(__xray_patch() == XRayPatchingStatus::SUCCESS);
+  assert(__xray_log_init(0, 0, nullptr, 0) ==
+         XRayLogInitStatus::XRAY_LOG_INITIALIZED);
+  std::thread t0([] { f0(); });
+  std::thread t1([] { f0(); });
+  f0();
+  t0.join();
+  t1.join();
+  assert(__xray_log_finalize() == XRayLogInitStatus::XRAY_LOG_FINALIZED);
+  assert(__xray_log_process_buffers(process_buffer) ==
+         XRayLogFlushStatus::XRAY_LOG_FLUSHED);
+  // We're running three threads, so we expect three buffers.
+  assert(buffer_counter == 3);
+  assert(__xray_log_flushLog() == XRayLogFlushStatus::XRAY_LOG_FLUSHED);
+}
Index: compiler-rt/test/xray/TestCases/Posix/profiling-single-threaded.cc
===================================================================
--- /dev/null
+++ compiler-rt/test/xray/TestCases/Posix/profiling-single-threaded.cc
@@ -0,0 +1,48 @@
+// Check that we can get a profile from a single-threaded application, on
+// demand through the XRay logging implementation API.
+//
+// FIXME: Make -fxray-modes=xray-profiling part of the default?
+// RUN: %clangxx_xray -std=c++11 %s -o %t -fxray-modes=xray-profiler
+// RUN: %run %t
+//
+// UNSUPPORTED: target-is-mips64,target-is-mips64el
+
+#include "xray/xray_interface.h"
+#include "xray/xray_log_interface.h"
+#include <cassert>
+#include <cstdio>
+#include <string>
+
+#define ALWAYS_INSTRUMENT [[clang::xray_always_instrument]]
+#define NEVER_INSTRUMENT [[clang::xray_never_instrument]]
+
+ALWAYS_INSTRUMENT void f2() { return; }
+ALWAYS_INSTRUMENT void f1() { f2(); }
+ALWAYS_INSTRUMENT void f0() { f1(); }
+
+using namespace std;
+
+volatile int buffer_counter = 0;
+
+NEVER_INSTRUMENT void process_buffer(const char *, XRayBuffer) {
+  // FIXME: Actually assert the contents of the buffer.
+  ++buffer_counter;
+}
+
+ALWAYS_INSTRUMENT int main(int, char **) {
+  assert(__xray_log_select_mode("xray-profiling") ==
+         XRayLogRegisterStatus::XRAY_REGISTRATION_OK);
+  assert(__xray_log_get_current_mode() != nullptr);
+  std::string current_mode = __xray_log_get_current_mode();
+  assert(current_mode == "xray-profiling");
+  assert(__xray_patch() == XRayPatchingStatus::SUCCESS);
+  assert(__xray_log_init(0, 0, nullptr, 0) ==
+         XRayLogInitStatus::XRAY_LOG_INITIALIZED);
+  f0();
+  assert(__xray_log_finalize() == XRayLogInitStatus::XRAY_LOG_FINALIZED);
+  f0();
+  assert(__xray_log_process_buffers(process_buffer) ==
+         XRayLogFlushStatus::XRAY_LOG_FLUSHED);
+  assert(buffer_counter == 1);
+  assert(__xray_log_flushLog() == XRayLogFlushStatus::XRAY_LOG_FLUSHED);
+}
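
For reference, a minimal sketch (not part of the patch) of how a unit test in the style of segmented_array_test.cc might exercise the Array<T, N> container added in xray_segmented_array.h above. It assumes only the interface shown there (Append, AppendEmplace, size, front/back, begin/end, find_element, trim); the Element type, the chunk size of 4, and the test name are hypothetical.

// A minimal sketch, assuming the Array<T, N> interface declared in
// xray_segmented_array.h above; Element and the test name are illustrative.
#include "xray_segmented_array.h"
#include "gtest/gtest.h"
#include <cstdint>

namespace __xray {
namespace {

struct Element {
  int64_t Id;
  int64_t Value;
  Element(int64_t I, int64_t V) : Id(I), Value(V) {}
};

TEST(SegmentedArraySketch, AppendAndIterate) {
  // Explicit chunk size of 4 elements; backed by the global allocator.
  Array<Element, 4> A;
  ASSERT_NE(A.AppendEmplace(1, 10), nullptr); // In-place construction.
  Element E(2, 20);
  ASSERT_NE(A.Append(E), nullptr);            // Copy-append.
  ASSERT_EQ(A.size(), 2u);
  ASSERT_EQ(A.front().Id, 1);
  ASSERT_EQ(A.back().Value, 20);

  // Linear search across chunks via find_element.
  auto *Found = A.find_element([](const Element &X) { return X.Id == 2; });
  ASSERT_NE(Found, nullptr);
  ASSERT_EQ(Found->Value, 20);

  // Range-based iteration uses begin()/end() and the Iterator operators.
  int64_t Sum = 0;
  for (const auto &X : A)
    Sum += X.Value;
  ASSERT_EQ(Sum, 30);

  // trim drops elements from the end without freeing backing memory.
  A.trim(1);
  ASSERT_EQ(A.size(), 1u);
}

} // namespace
} // namespace __xray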