Index: lib/xray/xray_fdr_logging.cc =================================================================== --- lib/xray/xray_fdr_logging.cc +++ lib/xray/xray_fdr_logging.cc @@ -223,7 +223,8 @@ (void)Once; } int32_t ReducedEventSize = static_cast(EventSize); - if (!isLogInitializedAndReady(*LocalBQ, TSC, CPU, clock_gettime)) + auto &TLD = getThreadLocalData(); + if (!isLogInitializedAndReady(TLD.LocalBQ, TSC, CPU, clock_gettime)) return; // Here we need to prepare the log to handle: @@ -231,7 +232,7 @@ // - The additional data we're going to write. Currently, that's the size of // the event we're going to dump into the log as free-form bytes. if (!prepareBuffer(clock_gettime, MetadataRecSize + EventSize)) { - LocalBQ = nullptr; + TLD.LocalBQ = nullptr; return; } @@ -246,9 +247,9 @@ constexpr auto TSCSize = sizeof(std::get<0>(TSC_CPU)); std::memcpy(&CustomEvent.Data, &ReducedEventSize, sizeof(int32_t)); std::memcpy(&CustomEvent.Data[sizeof(int32_t)], &TSC, TSCSize); - std::memcpy(RecordPtr, &CustomEvent, sizeof(CustomEvent)); - RecordPtr += sizeof(CustomEvent); - std::memcpy(RecordPtr, Event, ReducedEventSize); + std::memcpy(TLD.RecordPtr, &CustomEvent, sizeof(CustomEvent)); + TLD.RecordPtr += sizeof(CustomEvent); + std::memcpy(TLD.RecordPtr, Event, ReducedEventSize); endBufferIfFull(); } Index: lib/xray/xray_fdr_logging_impl.h =================================================================== --- lib/xray/xray_fdr_logging_impl.h +++ lib/xray/xray_fdr_logging_impl.h @@ -104,36 +104,38 @@ __sanitizer::atomic_sint32_t &LoggingStatus, const std::shared_ptr &BQ); -//-----------------------------------------------------------------------------| -// The rest of the file is implementation. | -//-----------------------------------------------------------------------------| -// Functions are implemented in the header for inlining since we don't want | -// to grow the stack when we've hijacked the binary for logging. | -//-----------------------------------------------------------------------------| - -namespace { - -thread_local BufferQueue::Buffer Buffer; -thread_local char *RecordPtr = nullptr; - -// The number of FunctionEntry records immediately preceding RecordPtr. -thread_local uint8_t NumConsecutiveFnEnters = 0; +// Group together thread-local-data in a struct, then hide it behind a function +// call so that it can be initialized on first use instead of as a global. +struct ThreadLocalData { + BufferQueue::Buffer Buffer; + char *RecordPtr = nullptr; + // The number of FunctionEntry records immediately preceding RecordPtr. + uint8_t NumConsecutiveFnEnters = 0; + + // The number of adjacent, consecutive pairs of FunctionEntry, Tail Exit + // records preceding RecordPtr. + uint8_t NumTailCalls = 0; + + // We use a thread_local variable to keep track of which CPUs we've already + // run, and the TSC times for these CPUs. This allows us to stop repeating the + // CPU field in the function records. + // + // We assume that we'll support only 65536 CPUs for x86_64. + uint16_t CurrentCPU = std::numeric_limits::max(); + uint64_t LastTSC = 0; + uint64_t LastFunctionEntryTSC = 0; + + // Make sure a thread that's ever called handleArg0 has a thread-local + // live reference to the buffer queue for this particular instance of + // FDRLogging, and that we're going to clean it up when the thread exits. + std::shared_ptr LocalBQ = nullptr; +}; -// The number of adjacent, consecutive pairs of FunctionEntry, Tail Exit -// records preceding RecordPtr. -thread_local uint8_t NumTailCalls = 0; +// Forward-declare, defined later. +static ThreadLocalData &getThreadLocalData(); -constexpr auto MetadataRecSize = sizeof(MetadataRecord); -constexpr auto FunctionRecSize = sizeof(FunctionRecord); - -// We use a thread_local variable to keep track of which CPUs we've already -// run, and the TSC times for these CPUs. This allows us to stop repeating the -// CPU field in the function records. -// -// We assume that we'll support only 65536 CPUs for x86_64. -thread_local uint16_t CurrentCPU = std::numeric_limits::max(); -thread_local uint64_t LastTSC = 0; -thread_local uint64_t LastFunctionEntryTSC = 0; +static constexpr auto MetadataRecSize = sizeof(MetadataRecord); +static constexpr auto FunctionRecSize = sizeof(FunctionRecord); class ThreadExitBufferCleanup { std::shared_ptr &Buffers; @@ -146,6 +148,8 @@ Buffer(Buffer) {} ~ThreadExitBufferCleanup() noexcept XRAY_NEVER_INSTRUMENT { + auto &TLD = getThreadLocalData(); + auto &RecordPtr = TLD.RecordPtr; if (RecordPtr == nullptr) return; @@ -166,19 +170,27 @@ } }; -// Make sure a thread that's ever called handleArg0 has a thread-local -// live reference to the buffer queue for this particular instance of -// FDRLogging, and that we're going to clean it up when the thread exits. -thread_local std::shared_ptr* LocalBQ = - new std::shared_ptr(); -thread_local ThreadExitBufferCleanup Cleanup(*LocalBQ, Buffer); +static ThreadLocalData &getThreadLocalData() { + thread_local ThreadLocalData TLD; + thread_local ThreadExitBufferCleanup Cleanup(TLD.LocalBQ, TLD.Buffer); + return TLD; +} + +//-----------------------------------------------------------------------------| +// The rest of the file is implementation. | +//-----------------------------------------------------------------------------| +// Functions are implemented in the header for inlining since we don't want | +// to grow the stack when we've hijacked the binary for logging. | +//-----------------------------------------------------------------------------| + +namespace { class RecursionGuard { - bool &Running; + volatile bool &Running; const bool Valid; public: - explicit RecursionGuard(bool &R) : Running(R), Valid(!R) { + explicit RecursionGuard(volatile bool &R) : Running(R), Valid(!R) { if (Valid) Running = true; } @@ -238,24 +250,29 @@ } std::memcpy(MemPtr, Records, sizeof(MetadataRecord) * InitRecordsCount); MemPtr += sizeof(MetadataRecord) * InitRecordsCount; - NumConsecutiveFnEnters = 0; - NumTailCalls = 0; + auto &TLD = getThreadLocalData(); + TLD.NumConsecutiveFnEnters = 0; + TLD.NumTailCalls = 0; } inline void setupNewBuffer(int (*wall_clock_reader)( clockid_t, struct timespec *)) XRAY_NEVER_INSTRUMENT { + auto &TLD = getThreadLocalData(); + auto &Buffer = TLD.Buffer; + auto &RecordPtr = TLD.RecordPtr; RecordPtr = static_cast(Buffer.Buffer); pid_t Tid = syscall(SYS_gettid); timespec TS{0, 0}; // This is typically clock_gettime, but callers have injection ability. wall_clock_reader(CLOCK_MONOTONIC, &TS); writeNewBufferPreamble(Tid, TS, RecordPtr); - NumConsecutiveFnEnters = 0; - NumTailCalls = 0; + TLD.NumConsecutiveFnEnters = 0; + TLD.NumTailCalls = 0; } inline void writeNewCPUIdMetadata(uint16_t CPU, uint64_t TSC, char *&MemPtr) XRAY_NEVER_INSTRUMENT { + auto &TLD = getThreadLocalData(); MetadataRecord NewCPUId; NewCPUId.Type = uint8_t(RecordType::Metadata); NewCPUId.RecordKind = uint8_t(MetadataRecord::RecordKinds::NewCPUId); @@ -268,32 +285,34 @@ std::memcpy(&NewCPUId.Data[sizeof(CPU)], &TSC, sizeof(TSC)); std::memcpy(MemPtr, &NewCPUId, sizeof(MetadataRecord)); MemPtr += sizeof(MetadataRecord); - NumConsecutiveFnEnters = 0; - NumTailCalls = 0; + TLD.NumConsecutiveFnEnters = 0; + TLD.NumTailCalls = 0; } inline void writeNewCPUIdMetadata(uint16_t CPU, uint64_t TSC) XRAY_NEVER_INSTRUMENT { - writeNewCPUIdMetadata(CPU, TSC, RecordPtr); + writeNewCPUIdMetadata(CPU, TSC, getThreadLocalData().RecordPtr); } inline void writeEOBMetadata(char *&MemPtr) XRAY_NEVER_INSTRUMENT { + auto &TLD = getThreadLocalData(); MetadataRecord EOBMeta; EOBMeta.Type = uint8_t(RecordType::Metadata); EOBMeta.RecordKind = uint8_t(MetadataRecord::RecordKinds::EndOfBuffer); // For now we don't write any bytes into the Data field. std::memcpy(MemPtr, &EOBMeta, sizeof(MetadataRecord)); MemPtr += sizeof(MetadataRecord); - NumConsecutiveFnEnters = 0; - NumTailCalls = 0; + TLD.NumConsecutiveFnEnters = 0; + TLD.NumTailCalls = 0; } inline void writeEOBMetadata() XRAY_NEVER_INSTRUMENT { - writeEOBMetadata(RecordPtr); + writeEOBMetadata(getThreadLocalData().RecordPtr); } inline void writeTSCWrapMetadata(uint64_t TSC, char *&MemPtr) XRAY_NEVER_INSTRUMENT { + auto &TLD = getThreadLocalData(); MetadataRecord TSCWrap; TSCWrap.Type = uint8_t(RecordType::Metadata); TSCWrap.RecordKind = uint8_t(MetadataRecord::RecordKinds::TSCWrap); @@ -304,12 +323,12 @@ std::memcpy(&TSCWrap.Data, &TSC, sizeof(TSC)); std::memcpy(MemPtr, &TSCWrap, sizeof(MetadataRecord)); MemPtr += sizeof(MetadataRecord); - NumConsecutiveFnEnters = 0; - NumTailCalls = 0; + TLD.NumConsecutiveFnEnters = 0; + TLD.NumTailCalls = 0; } inline void writeTSCWrapMetadata(uint64_t TSC) XRAY_NEVER_INSTRUMENT { - writeTSCWrapMetadata(TSC, RecordPtr); + writeTSCWrapMetadata(TSC, getThreadLocalData().RecordPtr); } inline void writeFunctionRecord(int FuncId, uint32_t TSCDelta, @@ -324,36 +343,37 @@ FuncRecord.FuncId = FuncId & ~(0x0F << 28); FuncRecord.TSCDelta = TSCDelta; + auto &TLD = getThreadLocalData(); switch (EntryType) { case XRayEntryType::ENTRY: - ++NumConsecutiveFnEnters; + ++TLD.NumConsecutiveFnEnters; FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionEnter); break; case XRayEntryType::LOG_ARGS_ENTRY: // We should not rewind functions with logged args. - NumConsecutiveFnEnters = 0; - NumTailCalls = 0; + TLD.NumConsecutiveFnEnters = 0; + TLD.NumTailCalls = 0; FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionEnter); break; case XRayEntryType::EXIT: // If we've decided to log the function exit, we will never erase the log // before it. - NumConsecutiveFnEnters = 0; - NumTailCalls = 0; + TLD.NumConsecutiveFnEnters = 0; + TLD.NumTailCalls = 0; FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionExit); break; case XRayEntryType::TAIL: // If we just entered the function we're tail exiting from or erased every // invocation since then, this function entry tail pair is a candidate to // be erased when the child function exits. - if (NumConsecutiveFnEnters > 0) { - ++NumTailCalls; - NumConsecutiveFnEnters = 0; + if (TLD.NumConsecutiveFnEnters > 0) { + ++TLD.NumTailCalls; + TLD.NumConsecutiveFnEnters = 0; } else { // We will never be able to erase this tail call since we have logged // something in between the function entry and tail exit. - NumTailCalls = 0; - NumConsecutiveFnEnters = 0; + TLD.NumTailCalls = 0; + TLD.NumConsecutiveFnEnters = 0; } FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionTailExit); @@ -391,20 +411,21 @@ using AlignedFuncStorage = std::aligned_storage::type; - RecordPtr -= FunctionRecSize; + auto &TLD = getThreadLocalData(); + TLD.RecordPtr -= FunctionRecSize; AlignedFuncStorage AlignedFuncRecordBuffer; const auto &FuncRecord = *reinterpret_cast( - std::memcpy(&AlignedFuncRecordBuffer, RecordPtr, FunctionRecSize)); + std::memcpy(&AlignedFuncRecordBuffer, TLD.RecordPtr, FunctionRecSize)); assert(FuncRecord.RecordKind == uint8_t(FunctionRecord::RecordKinds::FunctionEnter) && "Expected to find function entry recording when rewinding."); assert(FuncRecord.FuncId == (FuncId & ~(0x0F << 28)) && "Expected matching function id when rewinding Exit"); - --NumConsecutiveFnEnters; + --TLD.NumConsecutiveFnEnters; LastTSC -= FuncRecord.TSCDelta; // We unwound one call. Update the state and return without writing a log. - if (NumConsecutiveFnEnters != 0) { + if (TLD.NumConsecutiveFnEnters != 0) { LastFunctionEntryTSC -= FuncRecord.TSCDelta; return; } @@ -414,8 +435,8 @@ // exited from via this exit. LastFunctionEntryTSC = 0; auto RewindingTSC = LastTSC; - auto RewindingRecordPtr = RecordPtr - FunctionRecSize; - while (NumTailCalls > 0) { + auto RewindingRecordPtr = TLD.RecordPtr - FunctionRecSize; + while (TLD.NumTailCalls > 0) { AlignedFuncStorage TailExitRecordBuffer; // Rewind the TSC back over the TAIL EXIT record. const auto &ExpectedTailExit = @@ -438,24 +459,25 @@ // This tail call exceeded the threshold duration. It will not be erased. if ((TSC - RewindingTSC) >= thresholdTicks()) { - NumTailCalls = 0; + TLD.NumTailCalls = 0; return; } // We can erase a tail exit pair that we're exiting through since // its duration is under threshold. - --NumTailCalls; + --TLD.NumTailCalls; RewindingRecordPtr -= FunctionRecSize; RewindingTSC -= ExpectedFunctionEntry.TSCDelta; - RecordPtr -= 2 * FunctionRecSize; + TLD.RecordPtr -= 2 * FunctionRecSize; LastTSC = RewindingTSC; } } inline bool releaseThreadLocalBuffer(BufferQueue &BQArg) { - auto EC = BQArg.releaseBuffer(Buffer); + auto &TLD = getThreadLocalData(); + auto EC = BQArg.releaseBuffer(TLD.Buffer); if (EC != BufferQueue::ErrorCode::Ok) { - Report("Failed to release buffer at %p; error=%s\n", Buffer.Buffer, + Report("Failed to release buffer at %p; error=%s\n", TLD.Buffer.Buffer, BufferQueue::getErrorString(EC)); return false; } @@ -465,12 +487,14 @@ inline bool prepareBuffer(int (*wall_clock_reader)(clockid_t, struct timespec *), size_t MaxSize) XRAY_NEVER_INSTRUMENT { - char *BufferStart = static_cast(Buffer.Buffer); - if ((RecordPtr + MaxSize) > (BufferStart + Buffer.Size - MetadataRecSize)) { + auto &TLD = getThreadLocalData(); + char *BufferStart = static_cast(TLD.Buffer.Buffer); + if ((TLD.RecordPtr + MaxSize) > + (BufferStart + TLD.Buffer.Size - MetadataRecSize)) { writeEOBMetadata(); - if (!releaseThreadLocalBuffer(**LocalBQ)) + if (!releaseThreadLocalBuffer(*TLD.LocalBQ)) return false; - auto EC = (*LocalBQ)->getBuffer(Buffer); + auto EC = TLD.LocalBQ->getBuffer(TLD.Buffer); if (EC != BufferQueue::ErrorCode::Ok) { Report("Failed to acquire a buffer; error=%s\n", BufferQueue::getErrorString(EC)); @@ -489,14 +513,15 @@ // We should take the opportunity to release the buffer though. auto Status = __sanitizer::atomic_load(&LoggingStatus, __sanitizer::memory_order_acquire); + auto &TLD = getThreadLocalData(); if (Status != XRayLogInitStatus::XRAY_LOG_INITIALIZED) { - if (RecordPtr != nullptr && + if (TLD.RecordPtr != nullptr && (Status == XRayLogInitStatus::XRAY_LOG_FINALIZING || Status == XRayLogInitStatus::XRAY_LOG_FINALIZED)) { writeEOBMetadata(); if (!releaseThreadLocalBuffer(*LBQ)) return false; - RecordPtr = nullptr; + TLD.RecordPtr = nullptr; LBQ = nullptr; return false; } @@ -507,11 +532,11 @@ writeEOBMetadata(); if (!releaseThreadLocalBuffer(*LBQ)) return false; - RecordPtr = nullptr; + TLD.RecordPtr = nullptr; } - if (Buffer.Buffer == nullptr) { - auto EC = LBQ->getBuffer(Buffer); + if (TLD.Buffer.Buffer == nullptr) { + auto EC = LBQ->getBuffer(TLD.Buffer); if (EC != BufferQueue::ErrorCode::Ok) { auto LS = __sanitizer::atomic_load(&LoggingStatus, __sanitizer::memory_order_acquire); @@ -525,10 +550,10 @@ setupNewBuffer(wall_clock_reader); } - if (CurrentCPU == std::numeric_limits::max()) { + if (TLD.CurrentCPU == std::numeric_limits::max()) { // This means this is the first CPU this thread has ever run on. We set // the current CPU and record this as the first TSC we've seen. - CurrentCPU = CPU; + TLD.CurrentCPU = CPU; writeNewCPUIdMetadata(CPU, TSC); } @@ -536,12 +561,13 @@ } // namespace __xray_fdr_internal inline void endBufferIfFull() XRAY_NEVER_INSTRUMENT { - auto BufferStart = static_cast(Buffer.Buffer); - if ((RecordPtr + MetadataRecSize) - BufferStart == MetadataRecSize) { + auto &TLD = getThreadLocalData(); + auto BufferStart = static_cast(TLD.Buffer.Buffer); + if ((TLD.RecordPtr + MetadataRecSize) - BufferStart == MetadataRecSize) { writeEOBMetadata(); - if (!releaseThreadLocalBuffer(**LocalBQ)) + if (!releaseThreadLocalBuffer(*TLD.LocalBQ)) return; - RecordPtr = nullptr; + TLD.RecordPtr = nullptr; } } @@ -555,19 +581,21 @@ // don't want to be clobbering potentially partial writes already happening in // the thread. We use a simple thread_local latch to only allow one on-going // handleArg0 to happen at any given time. - thread_local bool Running = false; + thread_local volatile bool Running = false; RecursionGuard Guard{Running}; if (!Guard) { assert(Running == true && "RecursionGuard is buggy!"); return; } + auto &TLD = getThreadLocalData(); + // In case the reference has been cleaned up before, we make sure we // initialize it to the provided BufferQueue. - if ((*LocalBQ) == nullptr) - *LocalBQ = BQ; + if (TLD.LocalBQ == nullptr) + TLD.LocalBQ = BQ; - if (!isLogInitializedAndReady(*LocalBQ, TSC, CPU, wall_clock_reader)) + if (!isLogInitializedAndReady(TLD.LocalBQ, TSC, CPU, wall_clock_reader)) return; // Before we go setting up writing new function entries, we need to be really @@ -607,14 +635,14 @@ // Buffer, set it up properly before doing any further writing. // if (!prepareBuffer(wall_clock_reader, FunctionRecSize + MetadataRecSize)) { - *LocalBQ = nullptr; + TLD.LocalBQ = nullptr; return; } // By this point, we are now ready to write at most 24 bytes (one metadata // record and one function record). - assert((RecordPtr + (MetadataRecSize + FunctionRecSize)) - - static_cast(Buffer.Buffer) >= + assert((TLD.RecordPtr + (MetadataRecSize + FunctionRecSize)) - + static_cast(TLD.Buffer.Buffer) >= static_cast(MetadataRecSize) && "Misconfigured BufferQueue provided; Buffer size not large enough."); @@ -638,36 +666,36 @@ // the correct TSC delta. // uint32_t RecordTSCDelta = 0; - if (CPU != CurrentCPU) { + if (CPU != TLD.CurrentCPU) { // We've moved to a new CPU. writeNewCPUIdMetadata(CPU, TSC); } else { // If the delta is greater than the range for a uint32_t, then we write out // the TSC wrap metadata entry with the full TSC, and the TSC for the // function record be 0. - auto Delta = TSC - LastTSC; + auto Delta = TSC - TLD.LastTSC; if (Delta > (1ULL << 32) - 1) writeTSCWrapMetadata(TSC); else RecordTSCDelta = Delta; } - LastTSC = TSC; - CurrentCPU = CPU; + TLD.LastTSC = TSC; + TLD.CurrentCPU = CPU; switch (Entry) { case XRayEntryType::ENTRY: case XRayEntryType::LOG_ARGS_ENTRY: // Update the thread local state for the next invocation. - LastFunctionEntryTSC = TSC; + TLD.LastFunctionEntryTSC = TSC; break; case XRayEntryType::TAIL: break; case XRayEntryType::EXIT: // Break out and write the exit record if we can't erase any functions. - if (NumConsecutiveFnEnters == 0 || - (TSC - LastFunctionEntryTSC) >= thresholdTicks()) + if (TLD.NumConsecutiveFnEnters == 0 || + (TSC - TLD.LastFunctionEntryTSC) >= thresholdTicks()) break; - rewindRecentCall(TSC, LastTSC, LastFunctionEntryTSC, FuncId); + rewindRecentCall(TSC, TLD.LastTSC, TLD.LastFunctionEntryTSC, FuncId); return; // without writing log. case XRayEntryType::CUSTOM_EVENT: { // This is a bug in patching, so we'll report it once and move on. @@ -682,7 +710,7 @@ } } - writeFunctionRecord(FuncId, RecordTSCDelta, Entry, RecordPtr); + writeFunctionRecord(FuncId, RecordTSCDelta, Entry, TLD.RecordPtr); // If we've exhausted the buffer by this time, we then release the buffer to // make sure that other threads may start using this buffer.