Index: include/xray/xray_log_interface.h =================================================================== --- include/xray/xray_log_interface.h +++ include/xray/xray_log_interface.h @@ -159,6 +159,9 @@ /// always have a handler for function entry and exit events. In case the /// implementation wants to support arg1 (or other future extensions to XRay /// logging) those MUST be installed by the installed 'log_init' handler. + /// + /// Because we didn't want to change the ABI of this struct, the arg1 handler + /// may be silently overwritten during initialization as well. void (*handle_arg0)(int32_t, XRayEntryType); /// The log implementation provided routine for when __xray_log_flushLog() is Index: lib/xray/xray_fdr_log_records.h =================================================================== --- lib/xray/xray_fdr_log_records.h +++ lib/xray/xray_fdr_log_records.h @@ -30,6 +30,7 @@ TSCWrap, WalltimeMarker, CustomEventMarker, + CallArgument, }; // Use 7 bits to identify this record type. /* RecordKinds */ uint8_t RecordKind : 7; Index: lib/xray/xray_fdr_logging.h =================================================================== --- lib/xray/xray_fdr_logging.h +++ lib/xray/xray_fdr_logging.h @@ -30,6 +30,7 @@ void *Options, size_t OptionsSize); XRayLogInitStatus fdrLoggingFinalize(); void fdrLoggingHandleArg0(int32_t FuncId, XRayEntryType Entry); +void fdrLoggingHandleArg1(int32_t FuncId, XRayEntryType Entry, uint64_t Arg1); XRayLogFlushStatus fdrLoggingFlush(); XRayLogInitStatus fdrLoggingReset(); Index: lib/xray/xray_fdr_logging.cc =================================================================== --- lib/xray/xray_fdr_logging.cc +++ lib/xray/xray_fdr_logging.cc @@ -196,9 +196,17 @@ void fdrLoggingHandleArg0(int32_t FuncId, XRayEntryType Entry) XRAY_NEVER_INSTRUMENT { auto TSC_CPU = getTimestamp(); - __xray_fdr_internal::processFunctionHook(FuncId, Entry, std::get<0>(TSC_CPU), - std::get<1>(TSC_CPU), clock_gettime, - LoggingStatus, *BQ); + __xray_fdr_internal::logArg0(FuncId, Entry, std::get<0>(TSC_CPU), + std::get<1>(TSC_CPU), clock_gettime, + LoggingStatus, *BQ); +} + +void fdrLoggingHandleArg1(int32_t FuncId, XRayEntryType, + uint64_t Arg) XRAY_NEVER_INSTRUMENT { + auto TSC_CPU = getTimestamp(); + __xray_fdr_internal::logArg1(FuncId, Arg, std::get<0>(TSC_CPU), + std::get<1>(TSC_CPU), clock_gettime, + LoggingStatus, *BQ); } void fdrLoggingHandleCustomEvent(void *Event, @@ -282,6 +290,7 @@ // Install the actual handleArg0 handler after initialising the buffers. __xray_set_handler(fdrLoggingHandleArg0); + __xray_set_handler_arg1(fdrLoggingHandleArg1); __xray_set_customevent_handler(fdrLoggingHandleCustomEvent); __sanitizer::atomic_store(&LoggingStatus, Index: lib/xray/xray_fdr_logging_impl.h =================================================================== --- lib/xray/xray_fdr_logging_impl.h +++ lib/xray/xray_fdr_logging_impl.h @@ -97,12 +97,17 @@ /// walk backward through its buffer and erase trivial functions to avoid /// polluting the log and may use the buffer queue to obtain or release a /// buffer. -static void processFunctionHook(int32_t FuncId, XRayEntryType Entry, - uint64_t TSC, unsigned char CPU, - int (*wall_clock_reader)(clockid_t, - struct timespec *), - __sanitizer::atomic_sint32_t &LoggingStatus, - const std::shared_ptr &BQ); +static void logArg0(int32_t FuncId, XRayEntryType Entry, uint64_t TSC, + unsigned char CPU, + int (*wall_clock_reader)(clockid_t, struct timespec *), + __sanitizer::atomic_sint32_t &LoggingStatus, + const std::shared_ptr BQ); + +/// First argument logging happens here; comments from logArg0 apply too. +static void logArg1(int32_t FuncId, uint64_t Arg1, uint64_t TSC, uint8_t CPU, + int (*wall_clock_reader)(clockid_t, struct timespec *), + __sanitizer::atomic_sint32_t &LoggingStatus, + const std::shared_ptr BQ); // Group together thread-local-data in a struct, then hide it behind a function // call so that it can be initialized on first use instead of as a global. @@ -331,9 +336,22 @@ writeTSCWrapMetadata(TSC, getThreadLocalData().RecordPtr); } -inline void writeFunctionRecord(int FuncId, uint32_t TSCDelta, - XRayEntryType EntryType, - char *&MemPtr) XRAY_NEVER_INSTRUMENT { +// Call Argument metadata records store the arguments to a function in the +// order of their appearance; holes are not supported by the buffer format. +static inline void writeCallArgumentMetadata(uint64_t A) XRAY_NEVER_INSTRUMENT { + auto &TLD = getThreadLocalData(); + MetadataRecord CallArg; + CallArg.Type = uint8_t(RecordType::Metadata); + CallArg.RecordKind = uint8_t(MetadataRecord::RecordKinds::CallArgument); + + std::memcpy(CallArg.Data, &A, sizeof(A)); + std::memcpy(TLD.RecordPtr, &CallArg, sizeof(MetadataRecord)); + TLD.RecordPtr += sizeof(MetadataRecord); +} + +static inline void writeFunctionRecord(int FuncId, uint32_t TSCDelta, + XRayEntryType EntryType, + char *&MemPtr) XRAY_NEVER_INSTRUMENT { std::aligned_storage::type AlignedFuncRecordBuffer; auto &FuncRecord = @@ -484,6 +502,42 @@ return true; } +// Before we go setting up writing new function entries, we need to be really +// careful about the pointer math we're doing. This means we need to ensure +// that the record we are about to write is going to fit into the buffer, +// without overflowing the buffer. +// +// To do this properly, we use the following assumptions: +// +// - The least number of bytes we will ever write is 8 +// (sizeof(FunctionRecord)) only if the delta between the previous entry +// and this entry is within 32 bits. +// - The most number of bytes we will ever write is 8 + 16 + 16 = 40. +// This is computed by: +// +// sizeof(FunctionRecord) + 2 * sizeof(MetadataRecord) +// +// These arise in the following cases: +// +// 1. When the delta between the TSC we get and the previous TSC for the +// same CPU is outside of the uint32_t range, we end up having to +// write a MetadataRecord to indicate a "tsc wrap" before the actual +// FunctionRecord. +// 2. When we learn that we've moved CPUs, we need to write a +// MetadataRecord to indicate a "cpu change", and thus write out the +// current TSC for that CPU before writing out the actual +// FunctionRecord. +// 3. When we learn about a new CPU ID, we need to write down a "new cpu +// id" MetadataRecord before writing out the actual FunctionRecord. +// 4. The second MetadataRecord is the optional function call argument. +// +// - An End-of-Buffer (EOB) MetadataRecord is 16 bytes. +// +// So the math we need to do is to determine whether writing 24 bytes past the +// current pointer leaves us with enough bytes to write the EOB +// MetadataRecord. If we don't have enough space after writing as much as 24 +// bytes in the end of the buffer, we need to write out the EOB, get a new +// Buffer, set it up properly before doing any further writing. inline bool prepareBuffer(int (*wall_clock_reader)(clockid_t, struct timespec *), size_t MaxSize) XRAY_NEVER_INSTRUMENT { @@ -560,6 +614,42 @@ return true; } // namespace __xray_fdr_internal +// Compute the TSC difference between the time of measurement and the previous +// event. There are a few interesting situations we need to account for: +// +// - The thread has migrated to a different CPU. If this is the case, then +// we write down the following records: +// +// 1. A 'NewCPUId' Metadata record. +// 2. A FunctionRecord with a 0 for the TSCDelta field. +// +// - The TSC delta is greater than the 32 bits we can store in a +// FunctionRecord. In this case we write down the following records: +// +// 1. A 'TSCWrap' Metadata record. +// 2. A FunctionRecord with a 0 for the TSCDelta field. +// +// - The TSC delta is representable within the 32 bits we can store in a +// FunctionRecord. In this case we write down just a FunctionRecord with +// the correct TSC delta. +inline uint32_t writeCurrentCPUTSC(ThreadLocalData &TLD, uint64_t TSC, uint8_t CPU) { + uint32_t RecordTSCDelta = 0; + if (CPU != TLD.CurrentCPU) { + // We've moved to a new CPU. + writeNewCPUIdMetadata(CPU, TSC); + } else { + // If the delta is greater than the range for a uint32_t, then we write out + // the TSC wrap metadata entry with the full TSC, and the TSC for the + // function record be 0. + auto Delta = TSC - TLD.LastTSC; + if (Delta > std::numeric_limits::max()) + writeTSCWrapMetadata(TSC); + else + RecordTSCDelta = Delta; + } + return RecordTSCDelta; +} + inline void endBufferIfFull() XRAY_NEVER_INSTRUMENT { auto &TLD = getThreadLocalData(); auto BufferStart = static_cast(TLD.Buffer.Buffer); @@ -573,11 +663,11 @@ thread_local volatile bool Running = false; -inline void processFunctionHook( +inline void logArg0( int32_t FuncId, XRayEntryType Entry, uint64_t TSC, unsigned char CPU, int (*wall_clock_reader)(clockid_t, struct timespec *), __sanitizer::atomic_sint32_t &LoggingStatus, - const std::shared_ptr &BQ) XRAY_NEVER_INSTRUMENT { + const std::shared_ptr BQ) XRAY_NEVER_INSTRUMENT { // Prevent signal handler recursion, so in case we're already in a log writing // mode and the signal handler comes in (and is also instrumented) then we // don't want to be clobbering potentially partial writes already happening in @@ -599,42 +689,6 @@ if (!isLogInitializedAndReady(TLD.LocalBQ, TSC, CPU, wall_clock_reader)) return; - // Before we go setting up writing new function entries, we need to be really - // careful about the pointer math we're doing. This means we need to ensure - // that the record we are about to write is going to fit into the buffer, - // without overflowing the buffer. - // - // To do this properly, we use the following assumptions: - // - // - The least number of bytes we will ever write is 8 - // (sizeof(FunctionRecord)) only if the delta between the previous entry - // and this entry is within 32 bits. - // - The most number of bytes we will ever write is 8 + 16 = 24. This is - // computed by: - // - // sizeof(FunctionRecord) + sizeof(MetadataRecord) - // - // These arise in the following cases: - // - // 1. When the delta between the TSC we get and the previous TSC for the - // same CPU is outside of the uint32_t range, we end up having to - // write a MetadataRecord to indicate a "tsc wrap" before the actual - // FunctionRecord. - // 2. When we learn that we've moved CPUs, we need to write a - // MetadataRecord to indicate a "cpu change", and thus write out the - // current TSC for that CPU before writing out the actual - // FunctionRecord. - // 3. When we learn about a new CPU ID, we need to write down a "new cpu - // id" MetadataRecord before writing out the actual FunctionRecord. - // - // - An End-of-Buffer (EOB) MetadataRecord is 16 bytes. - // - // So the math we need to do is to determine whether writing 24 bytes past the - // current pointer leaves us with enough bytes to write the EOB - // MetadataRecord. If we don't have enough space after writing as much as 24 - // bytes in the end of the buffer, we need to write out the EOB, get a new - // Buffer, set it up properly before doing any further writing. - // if (!prepareBuffer(wall_clock_reader, FunctionRecSize + MetadataRecSize)) { TLD.LocalBQ = nullptr; return; @@ -647,45 +701,11 @@ static_cast(MetadataRecSize) && "Misconfigured BufferQueue provided; Buffer size not large enough."); - // Here we compute the TSC Delta. There are a few interesting situations we - // need to account for: - // - // - The thread has migrated to a different CPU. If this is the case, then - // we write down the following records: - // - // 1. A 'NewCPUId' Metadata record. - // 2. A FunctionRecord with a 0 for the TSCDelta field. - // - // - The TSC delta is greater than the 32 bits we can store in a - // FunctionRecord. In this case we write down the following records: - // - // 1. A 'TSCWrap' Metadata record. - // 2. A FunctionRecord with a 0 for the TSCDelta field. - // - // - The TSC delta is representable within the 32 bits we can store in a - // FunctionRecord. In this case we write down just a FunctionRecord with - // the correct TSC delta. - // - uint32_t RecordTSCDelta = 0; - if (CPU != TLD.CurrentCPU) { - // We've moved to a new CPU. - writeNewCPUIdMetadata(CPU, TSC); - } else { - // If the delta is greater than the range for a uint32_t, then we write out - // the TSC wrap metadata entry with the full TSC, and the TSC for the - // function record be 0. - auto Delta = TSC - TLD.LastTSC; - if (Delta > (1ULL << 32) - 1) - writeTSCWrapMetadata(TSC); - else - RecordTSCDelta = Delta; - } - + auto RecordTSCDelta = writeCurrentCPUTSC(TLD, TSC, CPU); TLD.LastTSC = TSC; TLD.CurrentCPU = CPU; switch (Entry) { case XRayEntryType::ENTRY: - case XRayEntryType::LOG_ARGS_ENTRY: // Update the thread local state for the next invocation. TLD.LastFunctionEntryTSC = TSC; break; @@ -698,6 +718,7 @@ break; rewindRecentCall(TSC, TLD.LastTSC, TLD.LastFunctionEntryTSC, FuncId); return; // without writing log. + case XRayEntryType::LOG_ARGS_ENTRY: case XRayEntryType::CUSTOM_EVENT: { // This is a bug in patching, so we'll report it once and move on. static bool Once = [&] { @@ -718,6 +739,48 @@ endBufferIfFull(); } +inline void logArg1( + int32_t FnId, uint64_t Arg1, uint64_t TSC, unsigned char CPU, + int (*wall_clock_reader)(clockid_t, struct timespec *), + __sanitizer::atomic_sint32_t &LoggingStatus, + const std::shared_ptr BQ) XRAY_NEVER_INSTRUMENT { + thread_local bool Running = false; + RecursionGuard Guard{Running}; + if (!Guard) { + assert(Running == true && "RecursionGuard is buggy!"); + return; + } + + auto &TLD = getThreadLocalData(); + + if (TLD.LocalBQ == nullptr) TLD.LocalBQ = BQ; + + if (!isLogInitializedAndReady(TLD.LocalBQ, TSC, CPU, wall_clock_reader)) + return; + + if (!prepareBuffer(wall_clock_reader, FunctionRecSize + MetadataRecSize)) { + TLD.LocalBQ = nullptr; + return; + } + + // By this point, we are now ready to write at most 40 bytes (two metadata + // records and one function record). + assert((TLD.RecordPtr + (2 * MetadataRecSize + FunctionRecSize)) - + static_cast(TLD.Buffer.Buffer) >= + static_cast(MetadataRecSize) && + "Misconfigured BufferQueue provided; Buffer size not large enough."); + auto TSCDelta = writeCurrentCPUTSC(TLD, TSC, CPU); + + TLD.LastTSC = TLD.LastFunctionEntryTSC = TSC; + TLD.CurrentCPU = CPU; + + writeFunctionRecord(FnId, TSCDelta, XRayEntryType::LOG_ARGS_ENTRY, + TLD.RecordPtr); + writeCallArgumentMetadata(Arg1); + + endBufferIfFull(); +} + } // namespace __xray_fdr_internal } // namespace __xray Index: test/xray/TestCases/Linux/fdr-mode.cc =================================================================== --- test/xray/TestCases/Linux/fdr-mode.cc +++ test/xray/TestCases/Linux/fdr-mode.cc @@ -29,6 +29,8 @@ [[clang::xray_always_instrument]] void __attribute__((noinline)) fB() { fC(); } [[clang::xray_always_instrument]] void __attribute__((noinline)) fA() { fB(); } +[[clang::xray_always_instrument, clang::xray_log_args(1)]] +void __attribute__((noinline)) fArg(int) { } int main(int argc, char *argv[]) { using namespace __xray; @@ -52,6 +54,7 @@ fC(); fB(); fA(); + fArg(1); }); other_thread.join(); std::cout << "Joined" << std::endl; @@ -85,8 +88,15 @@ // TRACE-DAG: - { type: 0, func-id: [[FIDB]], function: {{.*fB.*}}, cpu: {{.*}}, thread: [[THREAD2:[0-9]+]], kind: function-enter, tsc: {{[0-9]+}} } // TRACE: - { type: 0, func-id: [[FIDB]], function: {{.*fB.*}}, cpu: {{.*}}, thread: [[THREAD2]], kind: function-exit, tsc: {{[0-9]+}} } +// TRACE-DAG: - { type: 0, func-id: [[FIDARG:[0-9]+]], function: 'fArg(int)', args: [ 1 ], cpu: {{.*}}, thread: [[THREAD2]], kind: function-enter-arg, tsc: {{[0-9]+}} } +// TRACE-NEXT: - { type: 0, func-id: [[FIDARG]], function: 'fArg(int)', cpu: {{.*}}, thread: [[THREAD2]], kind: function-exit, tsc: {{[0-9]+}} } + // Assert that when unwriting is enabled with a high threshold time, all the function records are erased. A CPU switch could erroneously fail this test, but // is unlikely given the test program. -// UNWRITE: header +// Even with a high threshold, arg1 logging is never unwritten. +// UNWRITE: header: +// UNWRITE: records: +// UNWRITE-NEXT: - { type: 0, func-id: [[FIDARG:[0-9]+]], function: 'fArg(int)', args: [ 1 ], cpu: {{.*}}, thread: [[THREAD2:[0-9]+]], kind: function-enter-arg, tsc: {{[0-9]+}} } +// UNWRITE-NEXT: - { type: 0, func-id: [[FIDARG]], function: 'fArg(int)', cpu: {{.*}}, thread: [[THREAD2]], kind: function-exit, tsc: {{[0-9]+}} } // UNWRITE-NOT: function-enter // UNWRITE-NOT: function-exit