Index: compiler-rt/trunk/include/xray/xray_interface.h
===================================================================
--- compiler-rt/trunk/include/xray/xray_interface.h
+++ compiler-rt/trunk/include/xray/xray_interface.h
@@ -25,6 +25,7 @@
   EXIT = 1,
   TAIL = 2,
   LOG_ARGS_ENTRY = 3,
+  CUSTOM_EVENT = 4,
 };
 
 /// Provide a function to invoke for when instrumentation points are hit. This
@@ -64,6 +65,9 @@
 /// Returns 1 on success, 0 on error.
 extern int __xray_remove_handler_arg1();
 
+/// Provide a function to invoke when XRay encounters a custom event.
+extern int __xray_set_customevent_handler(void (*entry)(void*, std::size_t));
+
 enum XRayPatchingStatus {
   NOT_INITIALIZED = 0,
   SUCCESS = 1,
Index: compiler-rt/trunk/lib/xray/xray_AArch64.cc
===================================================================
--- compiler-rt/trunk/lib/xray/xray_AArch64.cc
+++ compiler-rt/trunk/lib/xray/xray_AArch64.cc
@@ -18,8 +18,7 @@
 #include <atomic>
 #include <cassert>
 
-
-extern "C" void __clear_cache(void* start, void* end);
+extern "C" void __clear_cache(void *start, void *end);
 
 namespace __xray {
 
@@ -86,8 +85,8 @@
         reinterpret_cast<std::atomic<uint32_t> *>(FirstAddress),
         uint32_t(PatchOpcodes::PO_B32), std::memory_order_release);
   }
-  __clear_cache(reinterpret_cast<char*>(FirstAddress),
-                reinterpret_cast<char*>(CurAddress));
+  __clear_cache(reinterpret_cast<char *>(FirstAddress),
+                reinterpret_cast<char *>(CurAddress));
   return true;
 }
 
@@ -107,6 +106,12 @@
   return patchSled(Enable, FuncId, Sled, __xray_FunctionTailExit);
 }
 
+bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
+                      const XRaySledEntry &Sled)
+    XRAY_NEVER_INSTRUMENT { // FIXME: Implement in aarch64?
+  return false;
+}
+
 // FIXME: Maybe implement this better?
 bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; }
 
Index: compiler-rt/trunk/lib/xray/xray_arm.cc
===================================================================
--- compiler-rt/trunk/lib/xray/xray_arm.cc
+++ compiler-rt/trunk/lib/xray/xray_arm.cc
@@ -18,7 +18,7 @@
 #include <atomic>
 #include <cassert>
 
-extern "C" void __clear_cache(void* start, void* end);
+extern "C" void __clear_cache(void *start, void *end);
 
 namespace __xray {
 
@@ -122,8 +122,8 @@
         reinterpret_cast<std::atomic<uint32_t> *>(FirstAddress),
         uint32_t(PatchOpcodes::PO_B20), std::memory_order_release);
   }
-  __clear_cache(reinterpret_cast<char*>(FirstAddress),
-                reinterpret_cast<char*>(CurAddress));
+  __clear_cache(reinterpret_cast<char *>(FirstAddress),
+                reinterpret_cast<char *>(CurAddress));
   return true;
 }
 
@@ -143,6 +143,12 @@
   return patchSled(Enable, FuncId, Sled, __xray_FunctionTailExit);
 }
 
+bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
+                      const XRaySledEntry &Sled)
+    XRAY_NEVER_INSTRUMENT { // FIXME: Implement in arm?
+  return false;
+}
+
 // FIXME: Maybe implement this better?
 bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; }
 
Index: compiler-rt/trunk/lib/xray/xray_fdr_log_records.h
===================================================================
--- compiler-rt/trunk/lib/xray/xray_fdr_log_records.h
+++ compiler-rt/trunk/lib/xray/xray_fdr_log_records.h
@@ -29,6 +29,7 @@
     NewCPUId,
     TSCWrap,
     WalltimeMarker,
+    CustomEventMarker,
   };
   // Use 7 bits to identify this record type.
   /* RecordKinds */ uint8_t RecordKind : 7;
Index: compiler-rt/trunk/lib/xray/xray_fdr_logging.cc
===================================================================
--- compiler-rt/trunk/lib/xray/xray_fdr_logging.cc
+++ compiler-rt/trunk/lib/xray/xray_fdr_logging.cc
@@ -41,45 +41,12 @@
 // Global BufferQueue.
 std::shared_ptr<BufferQueue> BQ;
 
-__sanitizer::atomic_sint32_t LoggingStatus = {
-    XRayLogInitStatus::XRAY_LOG_UNINITIALIZED};
-
 __sanitizer::atomic_sint32_t LogFlushStatus = {
     XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING};
 
-std::unique_ptr<FDRLoggingOptions> FDROptions;
+FDRLoggingOptions FDROptions;
 
-XRayLogInitStatus fdrLoggingInit(std::size_t BufferSize, std::size_t BufferMax,
-                                 void *Options,
-                                 size_t OptionsSize) XRAY_NEVER_INSTRUMENT {
-  if (OptionsSize != sizeof(FDRLoggingOptions))
-    return static_cast<XRayLogInitStatus>(__sanitizer::atomic_load(
-        &LoggingStatus, __sanitizer::memory_order_acquire));
-  s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
-  if (!__sanitizer::atomic_compare_exchange_strong(
-          &LoggingStatus, &CurrentStatus,
-          XRayLogInitStatus::XRAY_LOG_INITIALIZING,
-          __sanitizer::memory_order_release))
-    return static_cast<XRayLogInitStatus>(CurrentStatus);
-
-  FDROptions.reset(new FDRLoggingOptions());
-  memcpy(FDROptions.get(), Options, OptionsSize);
-  bool Success = false;
-  BQ = std::make_shared<BufferQueue>(BufferSize, BufferMax, Success);
-  if (!Success) {
-    Report("BufferQueue init failed.\n");
-    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
-  }
-
-  // Install the actual handleArg0 handler after initialising the buffers.
-  __xray_set_handler(fdrLoggingHandleArg0);
-
-  __sanitizer::atomic_store(&LoggingStatus,
-                            XRayLogInitStatus::XRAY_LOG_INITIALIZED,
-                            __sanitizer::memory_order_release);
-  Report("XRay FDR init successful.\n");
-  return XRayLogInitStatus::XRAY_LOG_INITIALIZED;
-}
+__sanitizer::SpinMutex FDROptionsMutex;
 
 // Must finalize before flushing.
 XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT {
@@ -108,7 +75,11 @@
   // (fixed-sized) and let the tools reading the buffers deal with the data
   // afterwards.
   //
-  int Fd = FDROptions->Fd;
+  int Fd = -1;
+  {
+    __sanitizer::SpinMutexLock Guard(&FDROptionsMutex);
+    Fd = FDROptions.Fd;
+  }
   if (Fd == -1)
     Fd = getLogFD();
   if (Fd == -1) {
@@ -120,8 +91,8 @@
 
   // Test for required CPU features and cache the cycle frequency
   static bool TSCSupported = probeRequiredCPUFeatures();
-  static uint64_t CycleFrequency = TSCSupported ? getTSCFrequency()
-                                                : __xray::NanosecondsPerSecond;
+  static uint64_t CycleFrequency =
+      TSCSupported ? getTSCFrequency() : __xray::NanosecondsPerSecond;
 
   XRayFileHeader Header;
   Header.Version = 1;
@@ -192,8 +163,8 @@
   return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
 }
 
-void fdrLoggingHandleArg0(int32_t FuncId,
-                          XRayEntryType Entry) XRAY_NEVER_INSTRUMENT {
+static std::tuple<uint64_t, unsigned char>
+getTimestamp() XRAY_NEVER_INSTRUMENT {
   // We want to get the TSC as early as possible, so that we can check whether
   // we've seen this CPU before. We also do it before we load anything else, to
   // allow for forward progress with the scheduling.
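For reference, a minimal standalone sketch of the TSC-or-wall-clock fallback that getTimestamp() centralises below. This is illustrative only and not part of the patch: __rdtscp and CLOCK_MONOTONIC stand in for __xray::readTSC() and the sanitizer-common wrappers used by the real implementation.

    #include <cstdint>
    #include <ctime>
    #include <tuple>
    #if defined(__x86_64__)
    #include <x86intrin.h>
    #endif

    namespace {
    constexpr uint64_t NanosecondsPerSecond = 1000000000ULL;

    std::tuple<uint64_t, unsigned char> getTimestampSketch() {
      uint64_t TSC = 0;
      unsigned char CPU = 0;
    #if defined(__x86_64__)
      unsigned int Aux = 0;
      TSC = __rdtscp(&Aux); // rdtscp also reports which CPU the read ran on.
      CPU = static_cast<unsigned char>(Aux);
    #else
      // No usable TSC: synthesise "ticks" from a nanosecond wall clock so the
      // caller can keep treating the value as an ever-growing counter.
      timespec TS{0, 0};
      if (clock_gettime(CLOCK_MONOTONIC, &TS) != 0)
        TS = {0, 0};
      TSC = TS.tv_sec * NanosecondsPerSecond + TS.tv_nsec;
    #endif
      return std::make_tuple(TSC, CPU);
    }
    } // namespace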
@@ -203,7 +174,7 @@
 
   // Test once for required CPU features
   static bool TSCSupported = probeRequiredCPUFeatures();
 
-  if(TSCSupported) {
+  if (TSCSupported) {
     TSC = __xray::readTSC(CPU);
   } else {
     // FIXME: This code needs refactoring as it appears in multiple locations
@@ -216,9 +187,102 @@
     CPU = 0;
     TSC = TS.tv_sec * __xray::NanosecondsPerSecond + TS.tv_nsec;
   }
+  return std::make_tuple(TSC, CPU);
+}
+
+void fdrLoggingHandleArg0(int32_t FuncId,
+                          XRayEntryType Entry) XRAY_NEVER_INSTRUMENT {
+  auto TSC_CPU = getTimestamp();
+  __xray_fdr_internal::processFunctionHook(FuncId, Entry, std::get<0>(TSC_CPU),
+                                           std::get<1>(TSC_CPU), clock_gettime,
+                                           LoggingStatus, BQ);
+}
+
+void fdrLoggingHandleCustomEvent(void *Event,
+                                 std::size_t EventSize) XRAY_NEVER_INSTRUMENT {
+  using namespace __xray_fdr_internal;
+  auto TSC_CPU = getTimestamp();
+  auto &TSC = std::get<0>(TSC_CPU);
+  auto &CPU = std::get<1>(TSC_CPU);
+  thread_local bool Running = false;
+  RecursionGuard Guard{Running};
+  if (!Guard) {
+    assert(Running && "RecursionGuard is buggy!");
+    return;
+  }
+  if (EventSize > std::numeric_limits<int32_t>::max()) {
+    using Empty = struct {};
+    static Empty Once = [&] {
+      Report("Event size too large = %zu ; > max = %d\n", EventSize,
+             std::numeric_limits<int32_t>::max());
+      return Empty();
+    }();
+    (void)Once;
+  }
+  int32_t ReducedEventSize = static_cast<int32_t>(EventSize);
+  if (!isLogInitializedAndReady(LocalBQ, TSC, CPU, clock_gettime))
+    return;
+
+  // Here we need to prepare the log to handle:
+  //   - The metadata record we're going to write. (16 bytes)
+  //   - The additional data we're going to write. Currently, that's the size of
+  //     the event we're going to dump into the log as free-form bytes.
+  if (!prepareBuffer(clock_gettime, MetadataRecSize + EventSize)) {
+    LocalBQ = nullptr;
+    return;
+  }
-
-  __xray_fdr_internal::processFunctionHook(FuncId, Entry, TSC, CPU,
-                                           clock_gettime, LoggingStatus, BQ);
+
+  // Write the custom event metadata record, which consists of the following
+  // information:
+  //   - 8 bytes (64-bits) for the full TSC when the event started.
+  //   - 4 bytes (32-bits) for the length of the data.
+  MetadataRecord CustomEvent;
+  CustomEvent.Type = uint8_t(RecordType::Metadata);
+  CustomEvent.RecordKind =
+      uint8_t(MetadataRecord::RecordKinds::CustomEventMarker);
+  constexpr auto TSCSize = sizeof(std::get<0>(TSC_CPU));
+  std::memcpy(&CustomEvent.Data, &ReducedEventSize, sizeof(int32_t));
+  std::memcpy(&CustomEvent.Data[sizeof(int32_t)], &TSC, TSCSize);
+  std::memcpy(RecordPtr, &CustomEvent, sizeof(CustomEvent));
+  RecordPtr += sizeof(CustomEvent);
+  std::memcpy(RecordPtr, Event, ReducedEventSize);
+  endBufferIfFull();
+}
+
+XRayLogInitStatus fdrLoggingInit(std::size_t BufferSize, std::size_t BufferMax,
+                                 void *Options,
+                                 size_t OptionsSize) XRAY_NEVER_INSTRUMENT {
+  if (OptionsSize != sizeof(FDRLoggingOptions))
+    return static_cast<XRayLogInitStatus>(__sanitizer::atomic_load(
+        &LoggingStatus, __sanitizer::memory_order_acquire));
+  s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+  if (!__sanitizer::atomic_compare_exchange_strong(
+          &LoggingStatus, &CurrentStatus,
+          XRayLogInitStatus::XRAY_LOG_INITIALIZING,
+          __sanitizer::memory_order_release))
+    return static_cast<XRayLogInitStatus>(CurrentStatus);
+
+  {
+    __sanitizer::SpinMutexLock Guard(&FDROptionsMutex);
+    memcpy(&FDROptions, Options, OptionsSize);
+  }
+
+  bool Success = false;
+  BQ = std::make_shared<BufferQueue>(BufferSize, BufferMax, Success);
+  if (!Success) {
+    Report("BufferQueue init failed.\n");
+    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+  }
+
+  // Install the actual handleArg0 handler after initialising the buffers.
+  __xray_set_handler(fdrLoggingHandleArg0);
+  __xray_set_customevent_handler(fdrLoggingHandleCustomEvent);
+
+  __sanitizer::atomic_store(&LoggingStatus,
+                            XRayLogInitStatus::XRAY_LOG_INITIALIZED,
+                            __sanitizer::memory_order_release);
+  Report("XRay FDR init successful.\n");
+  return XRayLogInitStatus::XRAY_LOG_INITIALIZED;
 }
 
 } // namespace __xray
Index: compiler-rt/trunk/lib/xray/xray_fdr_logging_impl.h
===================================================================
--- compiler-rt/trunk/lib/xray/xray_fdr_logging_impl.h
+++ compiler-rt/trunk/lib/xray/xray_fdr_logging_impl.h
@@ -37,6 +37,9 @@
 
 namespace __xray {
 
+__sanitizer::atomic_sint32_t LoggingStatus = {
+    XRayLogInitStatus::XRAY_LOG_UNINITIALIZED};
+
 /// We expose some of the state transitions when FDR logging mode is operating
 /// such that we can simulate a series of log events that may occur without
 /// and test with determinism without worrying about the real CPU time.
@@ -123,12 +126,21 @@
 constexpr auto MetadataRecSize = sizeof(MetadataRecord);
 constexpr auto FunctionRecSize = sizeof(FunctionRecord);
 
+// We use a thread_local variable to keep track of which CPUs we've already
+// run, and the TSC times for these CPUs. This allows us to stop repeating the
+// CPU field in the function records.
+//
+// We assume that we'll support only 65536 CPUs for x86_64.
+thread_local uint16_t CurrentCPU = std::numeric_limits<uint16_t>::max();
+thread_local uint64_t LastTSC = 0;
+thread_local uint64_t LastFunctionEntryTSC = 0;
+
 class ThreadExitBufferCleanup {
-  std::weak_ptr<BufferQueue> Buffers;
+  std::shared_ptr<BufferQueue> &Buffers;
   BufferQueue::Buffer &Buffer;
 
 public:
-  explicit ThreadExitBufferCleanup(std::weak_ptr<BufferQueue> BQ,
+  explicit ThreadExitBufferCleanup(std::shared_ptr<BufferQueue> &BQ,
                                    BufferQueue::Buffer &Buffer)
       XRAY_NEVER_INSTRUMENT : Buffers(BQ),
                               Buffer(Buffer) {}
@@ -142,17 +154,24 @@
     // the queue.
assert((RecordPtr + MetadataRecSize) - static_cast(Buffer.Buffer) >= static_cast(MetadataRecSize)); - if (auto BQ = Buffers.lock()) { + if (Buffers) { writeEOBMetadata(); - auto EC = BQ->releaseBuffer(Buffer); + auto EC = Buffers->releaseBuffer(Buffer); if (EC != BufferQueue::ErrorCode::Ok) Report("Failed to release buffer at %p; error=%s\n", Buffer.Buffer, BufferQueue::getErrorString(EC)); + Buffers = nullptr; return; } } }; +// Make sure a thread that's ever called handleArg0 has a thread-local +// live reference to the buffer queue for this particular instance of +// FDRLogging, and that we're going to clean it up when the thread exits. +thread_local std::shared_ptr LocalBQ = nullptr; +thread_local ThreadExitBufferCleanup Cleanup(LocalBQ, Buffer); + class RecursionGuard { bool &Running; const bool Valid; @@ -176,7 +195,7 @@ } }; -static inline bool loggingInitialized( +inline bool loggingInitialized( const __sanitizer::atomic_sint32_t &LoggingStatus) XRAY_NEVER_INSTRUMENT { return __sanitizer::atomic_load(&LoggingStatus, __sanitizer::memory_order_acquire) == @@ -185,7 +204,7 @@ } // namespace -static inline void writeNewBufferPreamble(pid_t Tid, timespec TS, +inline void writeNewBufferPreamble(pid_t Tid, timespec TS, char *&MemPtr) XRAY_NEVER_INSTRUMENT { static constexpr int InitRecordsCount = 2; std::aligned_storage::type Records[InitRecordsCount]; @@ -222,9 +241,8 @@ NumTailCalls = 0; } -static inline void setupNewBuffer(int (*wall_clock_reader)(clockid_t, - struct timespec *)) - XRAY_NEVER_INSTRUMENT { +inline void setupNewBuffer(int (*wall_clock_reader)( + clockid_t, struct timespec *)) XRAY_NEVER_INSTRUMENT { RecordPtr = static_cast(Buffer.Buffer); pid_t Tid = syscall(SYS_gettid); timespec TS{0, 0}; @@ -235,7 +253,7 @@ NumTailCalls = 0; } -static inline void writeNewCPUIdMetadata(uint16_t CPU, uint64_t TSC, +inline void writeNewCPUIdMetadata(uint16_t CPU, uint64_t TSC, char *&MemPtr) XRAY_NEVER_INSTRUMENT { MetadataRecord NewCPUId; NewCPUId.Type = uint8_t(RecordType::Metadata); @@ -253,12 +271,12 @@ NumTailCalls = 0; } -static inline void writeNewCPUIdMetadata(uint16_t CPU, +inline void writeNewCPUIdMetadata(uint16_t CPU, uint64_t TSC) XRAY_NEVER_INSTRUMENT { writeNewCPUIdMetadata(CPU, TSC, RecordPtr); } -static inline void writeEOBMetadata(char *&MemPtr) XRAY_NEVER_INSTRUMENT { +inline void writeEOBMetadata(char *&MemPtr) XRAY_NEVER_INSTRUMENT { MetadataRecord EOBMeta; EOBMeta.Type = uint8_t(RecordType::Metadata); EOBMeta.RecordKind = uint8_t(MetadataRecord::RecordKinds::EndOfBuffer); @@ -269,11 +287,11 @@ NumTailCalls = 0; } -static inline void writeEOBMetadata() XRAY_NEVER_INSTRUMENT { +inline void writeEOBMetadata() XRAY_NEVER_INSTRUMENT { writeEOBMetadata(RecordPtr); } -static inline void writeTSCWrapMetadata(uint64_t TSC, +inline void writeTSCWrapMetadata(uint64_t TSC, char *&MemPtr) XRAY_NEVER_INSTRUMENT { MetadataRecord TSCWrap; TSCWrap.Type = uint8_t(RecordType::Metadata); @@ -289,11 +307,11 @@ NumTailCalls = 0; } -static inline void writeTSCWrapMetadata(uint64_t TSC) XRAY_NEVER_INSTRUMENT { +inline void writeTSCWrapMetadata(uint64_t TSC) XRAY_NEVER_INSTRUMENT { writeTSCWrapMetadata(TSC, RecordPtr); } -static inline void writeFunctionRecord(int FuncId, uint32_t TSCDelta, +inline void writeFunctionRecord(int FuncId, uint32_t TSCDelta, XRayEntryType EntryType, char *&MemPtr) XRAY_NEVER_INSTRUMENT { std::aligned_storage::type @@ -339,6 +357,17 @@ FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionTailExit); break; + case XRayEntryType::CUSTOM_EVENT: { + // 
This is a bug in patching, so we'll report it once and move on. + static bool Once = [&] { + Report("Internal error: patched an XRay custom event call as a function; " + "func id = %d\n", + FuncId); + return true; + }(); + (void)Once; + return; + } } std::memcpy(MemPtr, &AlignedFuncRecordBuffer, sizeof(FunctionRecord)); @@ -346,8 +375,9 @@ } static uint64_t thresholdTicks() { - static uint64_t TicksPerSec = probeRequiredCPUFeatures() ? getTSCFrequency() : - __xray::NanosecondsPerSecond; + static uint64_t TicksPerSec = probeRequiredCPUFeatures() + ? getTSCFrequency() + : __xray::NanosecondsPerSecond; static const uint64_t ThresholdTicks = TicksPerSec * flags()->xray_fdr_log_func_duration_threshold_us / 1000000; return ThresholdTicks; @@ -397,9 +427,8 @@ RewindingRecordPtr -= FunctionRecSize; RewindingTSC -= ExpectedTailExit.TSCDelta; AlignedFuncStorage FunctionEntryBuffer; - const auto &ExpectedFunctionEntry = - *reinterpret_cast(std::memcpy( - &FunctionEntryBuffer, RewindingRecordPtr, FunctionRecSize)); + const auto &ExpectedFunctionEntry = *reinterpret_cast( + std::memcpy(&FunctionEntryBuffer, RewindingRecordPtr, FunctionRecSize)); assert(ExpectedFunctionEntry.RecordKind == uint8_t(FunctionRecord::RecordKinds::FunctionEnter) && "Expected to find function entry when rewinding tail call."); @@ -422,7 +451,7 @@ } } -static inline bool releaseThreadLocalBuffer(BufferQueue *BQ) { +inline bool releaseThreadLocalBuffer(BufferQueue *BQ) { auto EC = BQ->releaseBuffer(Buffer); if (EC != BufferQueue::ErrorCode::Ok) { Report("Failed to release buffer at %p; error=%s\n", Buffer.Buffer, @@ -432,11 +461,29 @@ return true; } -static inline void processFunctionHook( - int32_t FuncId, XRayEntryType Entry, uint64_t TSC, unsigned char CPU, - int (*wall_clock_reader)(clockid_t, struct timespec *), - __sanitizer::atomic_sint32_t &LoggingStatus, - const std::shared_ptr &BQ) XRAY_NEVER_INSTRUMENT { +inline bool prepareBuffer(int (*wall_clock_reader)(clockid_t, + struct timespec *), + size_t MaxSize) XRAY_NEVER_INSTRUMENT { + char *BufferStart = static_cast(Buffer.Buffer); + if ((RecordPtr + MaxSize) > (BufferStart + Buffer.Size - MetadataRecSize)) { + writeEOBMetadata(); + if (!releaseThreadLocalBuffer(LocalBQ.get())) + return false; + auto EC = LocalBQ->getBuffer(Buffer); + if (EC != BufferQueue::ErrorCode::Ok) { + Report("Failed to acquire a buffer; error=%s\n", + BufferQueue::getErrorString(EC)); + return false; + } + setupNewBuffer(wall_clock_reader); + } + return true; +} + +inline bool isLogInitializedAndReady( + std::shared_ptr &LocalBQ, uint64_t TSC, unsigned char CPU, + int (*wall_clock_reader)(clockid_t, + struct timespec *)) XRAY_NEVER_INSTRUMENT { // Bail out right away if logging is not initialized yet. // We should take the opportunity to release the buffer though. auto Status = __sanitizer::atomic_load(&LoggingStatus, @@ -446,44 +493,19 @@ (Status == XRayLogInitStatus::XRAY_LOG_FINALIZING || Status == XRayLogInitStatus::XRAY_LOG_FINALIZED)) { writeEOBMetadata(); - if (!releaseThreadLocalBuffer(BQ.get())) - return; + if (!releaseThreadLocalBuffer(LocalBQ.get())) + return false; RecordPtr = nullptr; + LocalBQ = nullptr; + return false; } - return; - } - - // We use a thread_local variable to keep track of which CPUs we've already - // run, and the TSC times for these CPUs. This allows us to stop repeating the - // CPU field in the function records. - // - // We assume that we'll support only 65536 CPUs for x86_64. 
- thread_local uint16_t CurrentCPU = std::numeric_limits::max(); - thread_local uint64_t LastTSC = 0; - thread_local uint64_t LastFunctionEntryTSC = 0; - - // Make sure a thread that's ever called handleArg0 has a thread-local - // live reference to the buffer queue for this particular instance of - // FDRLogging, and that we're going to clean it up when the thread exits. - thread_local auto LocalBQ = BQ; - thread_local ThreadExitBufferCleanup Cleanup(LocalBQ, Buffer); - - // Prevent signal handler recursion, so in case we're already in a log writing - // mode and the signal handler comes in (and is also instrumented) then we - // don't want to be clobbering potentially partial writes already happening in - // the thread. We use a simple thread_local latch to only allow one on-going - // handleArg0 to happen at any given time. - thread_local bool Running = false; - RecursionGuard Guard{Running}; - if (!Guard) { - assert(Running == true && "RecursionGuard is buggy!"); - return; + return false; } if (!loggingInitialized(LoggingStatus) || LocalBQ->finalizing()) { writeEOBMetadata(); - if (!releaseThreadLocalBuffer(BQ.get())) - return; + if (!releaseThreadLocalBuffer(LocalBQ.get())) + return false; RecordPtr = nullptr; } @@ -496,19 +518,57 @@ LS != XRayLogInitStatus::XRAY_LOG_FINALIZED) Report("Failed to acquire a buffer; error=%s\n", BufferQueue::getErrorString(EC)); - return; + return false; } setupNewBuffer(wall_clock_reader); } if (CurrentCPU == std::numeric_limits::max()) { - // This means this is the first CPU this thread has ever run on. We set the - // current CPU and record this as the first TSC we've seen. + // This means this is the first CPU this thread has ever run on. We set + // the current CPU and record this as the first TSC we've seen. CurrentCPU = CPU; writeNewCPUIdMetadata(CPU, TSC); } + return true; +} // namespace __xray_fdr_internal + +inline void endBufferIfFull() XRAY_NEVER_INSTRUMENT { + auto BufferStart = static_cast(Buffer.Buffer); + if ((RecordPtr + MetadataRecSize) - BufferStart == MetadataRecSize) { + writeEOBMetadata(); + if (!releaseThreadLocalBuffer(LocalBQ.get())) + return; + RecordPtr = nullptr; + } +} + +inline void processFunctionHook( + int32_t FuncId, XRayEntryType Entry, uint64_t TSC, unsigned char CPU, + int (*wall_clock_reader)(clockid_t, struct timespec *), + __sanitizer::atomic_sint32_t &LoggingStatus, + const std::shared_ptr &BQ) XRAY_NEVER_INSTRUMENT { + // Prevent signal handler recursion, so in case we're already in a log writing + // mode and the signal handler comes in (and is also instrumented) then we + // don't want to be clobbering potentially partial writes already happening in + // the thread. We use a simple thread_local latch to only allow one on-going + // handleArg0 to happen at any given time. + thread_local bool Running = false; + RecursionGuard Guard{Running}; + if (!Guard) { + assert(Running == true && "RecursionGuard is buggy!"); + return; + } + + // In case the reference has been cleaned up before, we make sure we + // initialize it to the provided BufferQueue. + if (LocalBQ == nullptr) + LocalBQ = BQ; + + if (!isLogInitializedAndReady(LocalBQ, TSC, CPU, wall_clock_reader)) + return; + // Before we go setting up writing new function entries, we need to be really // careful about the pointer math we're doing. 
This means we need to ensure // that the record we are about to write is going to fit into the buffer, @@ -545,24 +605,14 @@ // bytes in the end of the buffer, we need to write out the EOB, get a new // Buffer, set it up properly before doing any further writing. // - char *BufferStart = static_cast(Buffer.Buffer); - if ((RecordPtr + (MetadataRecSize + FunctionRecSize)) - BufferStart < - static_cast(MetadataRecSize)) { - writeEOBMetadata(); - if (!releaseThreadLocalBuffer(LocalBQ.get())) - return; - auto EC = LocalBQ->getBuffer(Buffer); - if (EC != BufferQueue::ErrorCode::Ok) { - Report("Failed to acquire a buffer; error=%s\n", - BufferQueue::getErrorString(EC)); - return; - } - setupNewBuffer(wall_clock_reader); + if (!prepareBuffer(wall_clock_reader, FunctionRecSize + MetadataRecSize)) { + LocalBQ = nullptr; + return; } // By this point, we are now ready to write at most 24 bytes (one metadata // record and one function record). - BufferStart = static_cast(Buffer.Buffer); + auto BufferStart = static_cast(Buffer.Buffer); assert((RecordPtr + (MetadataRecSize + FunctionRecSize)) - BufferStart >= static_cast(MetadataRecSize) && "Misconfigured BufferQueue provided; Buffer size not large enough."); @@ -586,7 +636,6 @@ // FunctionRecord. In this case we write down just a FunctionRecord with // the correct TSC delta. // - uint32_t RecordTSCDelta = 0; if (CPU != CurrentCPU) { // We've moved to a new CPU. @@ -619,21 +668,27 @@ break; rewindRecentCall(TSC, LastTSC, LastFunctionEntryTSC, FuncId); return; // without writing log. + case XRayEntryType::CUSTOM_EVENT: { + // This is a bug in patching, so we'll report it once and move on. + static bool Once = [&] { + Report("Internal error: patched an XRay custom event call as a function; " + "func id = %d", + FuncId); + return true; + }(); + (void)Once; + return; + } } writeFunctionRecord(FuncId, RecordTSCDelta, Entry, RecordPtr); // If we've exhausted the buffer by this time, we then release the buffer to // make sure that other threads may start using this buffer. - if ((RecordPtr + MetadataRecSize) - BufferStart == MetadataRecSize) { - writeEOBMetadata(); - if (!releaseThreadLocalBuffer(LocalBQ.get())) - return; - RecordPtr = nullptr; - } + endBufferIfFull(); } } // namespace __xray_fdr_internal - } // namespace __xray + #endif // XRAY_XRAY_FDR_LOGGING_IMPL_H Index: compiler-rt/trunk/lib/xray/xray_interface.cc =================================================================== --- compiler-rt/trunk/lib/xray/xray_interface.cc +++ compiler-rt/trunk/lib/xray/xray_interface.cc @@ -50,6 +50,9 @@ // This is the function to call from the arg1-enabled sleds/trampolines. __sanitizer::atomic_uintptr_t XRayArgLogger{0}; +// This is the function to call when we encounter a custom event log call. +__sanitizer::atomic_uintptr_t XRayPatchedCustomEvent{0}; + // MProtectHelper is an RAII wrapper for calls to mprotect(...) that will undo // any successful mprotect(...) changes. 
This is used to make a page writeable // and executable, and upon destruction if it was successful in doing so returns @@ -97,7 +100,19 @@ __sanitizer::memory_order_acquire)) { __sanitizer::atomic_store(&__xray::XRayPatchedFunction, - reinterpret_cast(entry), + reinterpret_cast(entry), + __sanitizer::memory_order_release); + return 1; + } + return 0; +} + +int __xray_set_customevent_handler(void (*entry)(void *, size_t)) + XRAY_NEVER_INSTRUMENT { + if (__sanitizer::atomic_load(&XRayInitialized, + __sanitizer::memory_order_acquire)) { + __sanitizer::atomic_store(&__xray::XRayPatchedCustomEvent, + reinterpret_cast(entry), __sanitizer::memory_order_release); return 1; } @@ -161,6 +176,9 @@ case XRayEntryType::LOG_ARGS_ENTRY: Success = patchFunctionEntry(Enable, FuncId, Sled, __xray_ArgLoggerEntry); break; + case XRayEntryType::CUSTOM_EVENT: + Success = patchCustomEvent(Enable, FuncId, Sled); + break; default: Report("Unsupported sled kind '%d' @%04x\n", Sled.Address, int(Sled.Kind)); return false; @@ -301,6 +319,7 @@ __sanitizer::memory_order_release); return 1; } + int __xray_remove_handler_arg1() { return __xray_set_handler_arg1(nullptr); } uintptr_t __xray_function_address(int32_t FuncId) XRAY_NEVER_INSTRUMENT { Index: compiler-rt/trunk/lib/xray/xray_interface_internal.h =================================================================== --- compiler-rt/trunk/lib/xray/xray_interface_internal.h +++ compiler-rt/trunk/lib/xray/xray_interface_internal.h @@ -60,6 +60,7 @@ bool patchFunctionExit(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled); bool patchFunctionTailExit(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled); +bool patchCustomEvent(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled); } // namespace __xray @@ -70,6 +71,7 @@ extern void __xray_FunctionExit(); extern void __xray_FunctionTailExit(); extern void __xray_ArgLoggerEntry(); +extern void __xray_CustomEvent(); } #endif Index: compiler-rt/trunk/lib/xray/xray_mips.cc =================================================================== --- compiler-rt/trunk/lib/xray/xray_mips.cc +++ compiler-rt/trunk/lib/xray/xray_mips.cc @@ -95,7 +95,8 @@ // B #44 if (Enable) { - uint32_t LoTracingHookAddr = reinterpret_cast(TracingHook) & 0xffff; + uint32_t LoTracingHookAddr = + reinterpret_cast(TracingHook) & 0xffff; uint32_t HiTracingHookAddr = (reinterpret_cast(TracingHook) >> 16) & 0xffff; uint32_t LoFunctionID = FuncId & 0xffff; @@ -151,6 +152,12 @@ return patchSled(Enable, FuncId, Sled, __xray_FunctionExit); } +bool patchCustomEvent(const bool Enable, const uint32_t FuncId, + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + // FIXME: Implement in mips? 
+ return false; +} + } // namespace __xray extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT { Index: compiler-rt/trunk/lib/xray/xray_mips64.cc =================================================================== --- compiler-rt/trunk/lib/xray/xray_mips64.cc +++ compiler-rt/trunk/lib/xray/xray_mips64.cc @@ -93,7 +93,8 @@ if (Enable) { uint32_t LoTracingHookAddr = reinterpret_cast(TracingHook) & 0xffff; - uint32_t HiTracingHookAddr = (reinterpret_cast(TracingHook) >> 16) & 0xffff; + uint32_t HiTracingHookAddr = + (reinterpret_cast(TracingHook) >> 16) & 0xffff; uint32_t HigherTracingHookAddr = (reinterpret_cast(TracingHook) >> 32) & 0xffff; uint32_t HighestTracingHookAddr = @@ -160,6 +161,11 @@ return patchSled(Enable, FuncId, Sled, __xray_FunctionExit); } +bool patchCustomEvent(const bool Enable, const uint32_t FuncId, + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + // FIXME: Implement in mips64? + return false; +} } // namespace __xray extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT { Index: compiler-rt/trunk/lib/xray/xray_powerpc64.cc =================================================================== --- compiler-rt/trunk/lib/xray/xray_powerpc64.cc +++ compiler-rt/trunk/lib/xray/xray_powerpc64.cc @@ -93,6 +93,12 @@ // FIXME: Maybe implement this better? bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; } +bool patchCustomEvent(const bool Enable, const uint32_t FuncId, + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + // FIXME: Implement in powerpc64? + return false; +} + } // namespace __xray extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT { Index: compiler-rt/trunk/lib/xray/xray_trampoline_x86_64.S =================================================================== --- compiler-rt/trunk/lib/xray/xray_trampoline_x86_64.S +++ compiler-rt/trunk/lib/xray/xray_trampoline_x86_64.S @@ -176,9 +176,15 @@ je .Larg1entryFail .Larg1entryLog: - movq %rdi, %rdx // first argument will become the third - xorq %rsi, %rsi // XRayEntryType::ENTRY into the second - movl %r10d, %edi // 32-bit function ID becomes the first + + // First argument will become the third + movq %rdi, %rdx + + // XRayEntryType::ENTRY into the second + xorq %rsi, %rsi + + // 32-bit function ID becomes the first + movl %r10d, %edi callq *%rax .Larg1entryFail: @@ -189,4 +195,38 @@ .size __xray_ArgLoggerEntry, .Larg1entryEnd-__xray_ArgLoggerEntry .cfi_endproc +//===----------------------------------------------------------------------===// + + .global __xray_CustomEvent + .align 16, 0x90 + .type __xray_CustomEvent,@function +__xray_CustomEvent: + .cfi_startproc + subq $16, %rsp + .cfi_def_cfa_offset 24 + movq %rbp, 8(%rsp) + movq %rax, 0(%rsp) + + // We take two arguments to this trampoline, which should be in rdi and rsi + // already. We also make sure that we stash %rax because we use that register + // to call the logging handler. + movq _ZN6__xray22XRayPatchedCustomEventE(%rip), %rax + testq %rax,%rax + je .LcustomEventCleanup + + // At this point we know that rcx and rdx already has the data, so we just + // call the logging handler. 
+  callq *%rax
+
+.LcustomEventCleanup:
+  movq 0(%rsp), %rax
+  movq 8(%rsp), %rbp
+  addq $16, %rsp
+  .cfi_def_cfa_offset 8
+  retq
+
+.Ltmp8:
+  .size __xray_CustomEvent, .Ltmp8-__xray_CustomEvent
+  .cfi_endproc
+
 NO_EXEC_STACK_DIRECTIVE
Index: compiler-rt/trunk/lib/xray/xray_x86_64.cc
===================================================================
--- compiler-rt/trunk/lib/xray/xray_x86_64.cc
+++ compiler-rt/trunk/lib/xray/xray_x86_64.cc
@@ -75,8 +75,10 @@
 static constexpr uint8_t CallOpCode = 0xe8;
 static constexpr uint16_t MovR10Seq = 0xba41;
 static constexpr uint16_t Jmp9Seq = 0x09eb;
+static constexpr uint16_t Jmp20Seq = 0x14eb;
 static constexpr uint8_t JmpOpCode = 0xe9;
 static constexpr uint8_t RetOpCode = 0xc3;
+static constexpr uint16_t NopwSeq = 0x9066;
 
 static constexpr int64_t MinOffset{std::numeric_limits<int32_t>::min()};
 static constexpr int64_t MaxOffset{std::numeric_limits<int32_t>::max()};
@@ -201,6 +203,40 @@
   return true;
 }
 
+bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
+                      const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  // Here we do the dance of replacing the following sled:
+  //
+  // xray_sled_n:
+  //   jmp +20          // 2 bytes
+  //   ...
+  //
+  // With the following:
+  //
+  //   nopw             // 2 bytes*
+  //   ...
+  //
+  // We need to do this in the following order:
+  //
+  // 1. Overwrite the 5-byte nop with the call (relative), where (relative) is
+  //    the relative offset to the __xray_CustomEvent trampoline.
+  // 2. Do a two-byte atomic write over the 'jmp +20' to turn it into a 'nopw'.
+  //    This allows us to "enable" this code once the changes have committed.
+  //
+  // The "unpatch" should just turn the 'nopw' back to a 'jmp +20'.
+  //
+  if (Enable) {
+    std::atomic_store_explicit(
+        reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), NopwSeq,
+        std::memory_order_release);
+  } else {
+    std::atomic_store_explicit(
+        reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), Jmp20Seq,
+        std::memory_order_release);
+  }
+  return false;
+}
+
 // We determine whether the CPU we're running on has the correct features we
 // need. In x86_64 this will be rdtscp support.
 bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT {
Index: compiler-rt/trunk/test/xray/TestCases/Linux/custom-event-logging.cc
===================================================================
--- compiler-rt/trunk/test/xray/TestCases/Linux/custom-event-logging.cc
+++ compiler-rt/trunk/test/xray/TestCases/Linux/custom-event-logging.cc
@@ -0,0 +1,38 @@
+// Use the clang feature for custom xray event logging.
+//
+// RUN: %clangxx_xray -std=c++11 %s -o %t
+// RUN: XRAY_OPTIONS="patch_premain=false verbosity=1 xray_naive_log=false xray_logfile_base=custom-event-logging.xray-" %run %t 2>&1 | FileCheck %s
+//
+#include <cstdio>
+#include "xray/xray_interface.h"
+
+[[clang::xray_always_instrument]] void foo() {
+  static constexpr char CustomLogged[] = "hello custom logging!";
+  printf("before calling the custom logging...\n");
+  __xray_customevent(CustomLogged, sizeof(CustomLogged));
+  printf("after calling the custom logging...\n");
+}
+
+void myprinter(void* ptr, size_t size) {
+  printf("%.*s\n", static_cast<int>(size), static_cast<const char*>(ptr));
+}
+
+int main() {
+  foo();
+  // CHECK: before calling the custom logging...
+  // CHECK-NEXT: after calling the custom logging...
+  printf("setting up custom event handler...\n");
+  // CHECK-NEXT: setting up custom event handler...
+  __xray_set_customevent_handler(myprinter);
+  __xray_patch();
+  // CHECK-NEXT: before calling the custom logging...
+  foo();
+  // CHECK-NEXT: hello custom logging!
+  // CHECK-NEXT: after calling the custom logging...
+ printf("removing custom event handler...\n"); + // CHECK-NEXT: removing custom event handler... + __xray_set_customevent_handler(nullptr); + foo(); + // CHECK-NEXT: before calling the custom logging... + // CHECK-NEXT: after calling the custom logging... +}