Index: lib/xray/xray_fdr_logging_impl.h =================================================================== --- lib/xray/xray_fdr_logging_impl.h +++ lib/xray/xray_fdr_logging_impl.h @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -137,43 +138,91 @@ static constexpr auto MetadataRecSize = sizeof(MetadataRecord); static constexpr auto FunctionRecSize = sizeof(FunctionRecord); -class ThreadExitBufferCleanup { - std::shared_ptr &Buffers; - BufferQueue::Buffer &Buffer; - -public: - explicit ThreadExitBufferCleanup(std::shared_ptr &BQ, - BufferQueue::Buffer &Buffer) - XRAY_NEVER_INSTRUMENT : Buffers(BQ), - Buffer(Buffer) {} - - ~ThreadExitBufferCleanup() noexcept XRAY_NEVER_INSTRUMENT { - auto &TLD = getThreadLocalData(); - auto &RecordPtr = TLD.RecordPtr; - if (RecordPtr == nullptr) - return; - - // We make sure that upon exit, a thread will write out the EOB - // MetadataRecord in the thread-local log, and also release the buffer to - // the queue. - assert((RecordPtr + MetadataRecSize) - static_cast(Buffer.Buffer) >= - static_cast(MetadataRecSize)); - if (Buffers) { - writeEOBMetadata(); - auto EC = Buffers->releaseBuffer(Buffer); - if (EC != BufferQueue::ErrorCode::Ok) - Report("Failed to release buffer at %p; error=%s\n", Buffer.Buffer, - BufferQueue::getErrorString(EC)); - Buffers = nullptr; - return; - } - } -}; - +// This function will initialize the thread-local data structure used by the FDR +// logging implementation and return a reference to it. The implementation +// details require a bit of care to maintain. +// +// First, some requirements on the implementation in general: +// +// - XRay handlers should not call any memory allocation routines that may +// delegate to an instrumented implementation. This means functions like +// malloc() and free() should not be called while instrumenting. +// +// - We would like to use some thread-local data initialized on first-use of +// the XRay instrumentation. These allow us to implement unsynchronized +// routines that access resources associated with the thread. +// +// The implementation here uses a few mechanisms that allow us to provide both +// the requirements listed above. We do this by: +// +// 1. Using a thread-local aligned storage buffer for representing the +// ThreadLocalData struct. This data will be uninitialized memory by +// design. +// +// 2. Using pthread_once(...) to initialize the thread-local data structures +// on first use, for every thread. We don't use std::call_once so we don't +// have a reliance on the C++ runtime library. +// +// 3. Registering a cleanup function that gets run at the end of a thread's +// lifetime through pthread_create_key(...). The cleanup function would +// allow us to release the thread-local resources in a manner that would +// let the rest of the XRay runtime implementation handle the records +// written for this thread's active buffer. +// +// We're doing this to avoid using a `thread_local` object that has a +// non-trivial destructor, because the C++ runtime might call std::malloc(...) +// to register calls to destructors. Deadlocks may arise when, for example, an +// externally provided malloc implementation is XRay instrumented, and +// initializing the thread-locals involves calling into malloc. A malloc +// implementation that does global synchronization might be holding a lock for a +// critical section, calling a function that might be XRay instrumented (and +// thus in turn calling into malloc by virtue of registration of the +// thread_local's destructor). +// +// With the approach taken where, we attempt to avoid the potential for +// deadlocks by relying instead on pthread's memory management routines. static ThreadLocalData &getThreadLocalData() { - thread_local ThreadLocalData TLD; - thread_local ThreadExitBufferCleanup Cleanup(TLD.LocalBQ, TLD.Buffer); - return TLD; + thread_local pthread_key_t key; + thread_local pthread_once_t key_once = PTHREAD_ONCE_INIT; + + // We need aligned, uninitialized storage for the TLS object which is + // trivially destructible. We're going to use this as raw storage and + // placement-new the ThreadLocalData object into it later. + thread_local std::aligned_union<1, ThreadLocalData>::type TLSBuffer; + + // Ensure that we only actually ever do the pthread initialization once. + thread_local bool unused = [] { + new (&TLSBuffer) ThreadLocalData(); + pthread_key_create(&key, +[](void *) { + auto &TLD = *reinterpret_cast(&TLSBuffer); + auto &RecordPtr = TLD.RecordPtr; + auto &Buffers = TLD.LocalBQ; + auto &Buffer = TLD.Buffer; + if (RecordPtr == nullptr) + return; + + // We make sure that upon exit, a thread will write out the EOB + // MetadataRecord in the thread-local log, and also release the buffer + // to the queue. + assert((RecordPtr + MetadataRecSize) - + static_cast(Buffer.Buffer) >= + static_cast(MetadataRecSize)); + if (Buffers) { + writeEOBMetadata(); + auto EC = Buffers->releaseBuffer(Buffer); + if (EC != BufferQueue::ErrorCode::Ok) + Report("Failed to release buffer at %p; error=%s\n", Buffer.Buffer, + BufferQueue::getErrorString(EC)); + Buffers = nullptr; + return; + } + }); + pthread_setspecific(key, &TLSBuffer); + return true; + }(); + (void)unused; + + return *reinterpret_cast(&TLSBuffer); } //-----------------------------------------------------------------------------|