Index: compiler-rt/trunk/lib/xray/xray_allocator.h
===================================================================
--- compiler-rt/trunk/lib/xray/xray_allocator.h
+++ compiler-rt/trunk/lib/xray/xray_allocator.h
@@ -50,23 +50,33 @@
   internal_munmap(B, sizeof(T));
 }
 
-inline void *allocateBuffer(size_t S) XRAY_NEVER_INSTRUMENT {
-  auto B = reinterpret_cast<void *>(internal_mmap(
-      NULL, S, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
+template <class T = char> T *allocateBuffer(size_t S) XRAY_NEVER_INSTRUMENT {
+  auto B = reinterpret_cast<void *>(
+      internal_mmap(NULL, S * sizeof(T), PROT_READ | PROT_WRITE,
+                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
   if (B == MAP_FAILED) {
     if (Verbosity())
      Report("XRay Profiling: Failed to allocate memory of size %d.\n", S);
     return nullptr;
   }
-  return B;
+  return reinterpret_cast<T *>(B);
 }
 
-inline void deallocateBuffer(void *B, size_t S) XRAY_NEVER_INSTRUMENT {
+template <class T> void deallocateBuffer(T *B, size_t S) XRAY_NEVER_INSTRUMENT {
   if (B == nullptr)
     return;
   internal_munmap(B, S);
 }
 
+template <class T, class... U>
+T *initArray(size_t N, U &&... Us) XRAY_NEVER_INSTRUMENT {
+  auto A = allocateBuffer<T>(N);
+  if (A != nullptr)
+    while (N > 0)
+      new (A + (--N)) T(std::forward<U>(Us)...);
+  return A;
+}
+
 /// The Allocator type hands out fixed-sized chunks of memory that are
 /// cache-line aligned and sized. This is useful for placement of
 /// performance-sensitive data in memory that's frequently accessed. The
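A note on the shape of the new API: allocateBuffer<T>(S) maps S * sizeof(T) bytes and returns a typed pointer, deallocateBuffer(B, S) returns what allocateBuffer handed out, and initArray<T>(N, Us...) placement-constructs N elements from forwarded arguments. A minimal usage sketch (not part of the patch; `Counter` is a hypothetical element type):

    // Sketch only: composing the typed, mmap-backed helpers above.
    struct Counter {
      int Value;
      explicit Counter(int V) XRAY_NEVER_INSTRUMENT : Value(V) {}
    };

    void allocatorExample() XRAY_NEVER_INSTRUMENT {
      // Maps pages for 8 Counters and constructs each one as Counter(42).
      Counter *Cs = initArray<Counter>(8, 42);
      if (Cs == nullptr)
        return; // mmap failed; allocateBuffer already reported if verbose.
      // ... use Cs[0] through Cs[7] ...
      // Counter is trivially destructible; pass the same count back, the
      // same convention ~BufferQueue() uses below with
      // deallocateBuffer(Buffers, BufferCount).
      deallocateBuffer(Cs, 8);
    }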
Index: compiler-rt/trunk/lib/xray/xray_buffer_queue.h
===================================================================
--- compiler-rt/trunk/lib/xray/xray_buffer_queue.h
+++ compiler-rt/trunk/lib/xray/xray_buffer_queue.h
@@ -18,6 +18,7 @@
 #include "sanitizer_common/sanitizer_atomic.h"
 #include "sanitizer_common/sanitizer_common.h"
 #include "sanitizer_common/sanitizer_mutex.h"
+#include "xray_defs.h"
 #include <cstddef>
 
 namespace __xray {
@@ -29,14 +30,10 @@
 /// trace collection.
 class BufferQueue {
 public:
-  struct alignas(64) BufferExtents {
-    atomic_uint64_t Size;
-  };
-
   struct Buffer {
+    atomic_uint64_t Extents{0};
     void *Data = nullptr;
     size_t Size = 0;
-    BufferExtents *Extents;
   };
 
   struct BufferRep {
@@ -76,8 +73,10 @@
 
     T *operator->() const { return &(Buffers[Offset].Buff); }
 
-    Iterator(BufferRep *Root, size_t O, size_t M)
-        : Buffers(Root), Offset(O), Max(M) {
+    Iterator(BufferRep *Root, size_t O, size_t M) XRAY_NEVER_INSTRUMENT
+        : Buffers(Root),
+          Offset(O),
+          Max(M) {
       // We want to advance to the first Offset where the 'Used' property is
       // true, or to the end of the list/queue.
       while (!Buffers[Offset].Used && Offset != Max) {
@@ -107,16 +106,18 @@
   // Size of each individual Buffer.
   size_t BufferSize;
 
-  BufferRep *Buffers;
-
   // Amount of pre-allocated buffers.
   size_t BufferCount;
 
   SpinMutex Mutex;
   atomic_uint8_t Finalizing;
 
-  // Pointers to buffers managed/owned by the BufferQueue.
-  void **OwnedBuffers;
+  // A pointer to a contiguous block of memory to serve as the backing store for
+  // all the individual buffers handed out.
+  void *BackingStore;
+
+  // A dynamically allocated array of BufferRep instances.
+  BufferRep *Buffers;
 
   // Pointer to the next buffer to be handed out.
   BufferRep *Next;
@@ -198,7 +199,7 @@
   /// Applies the provided function F to each Buffer in the queue, only if the
   /// Buffer is marked 'used' (i.e. has been the result of getBuffer(...) and a
   /// releaseBuffer(...) operation).
-  template <class F> void apply(F Fn) {
+  template <class F> void apply(F Fn) XRAY_NEVER_INSTRUMENT {
     SpinMutexLock G(&Mutex);
     for (auto I = begin(), E = end(); I != E; ++I)
       Fn(*I);
Index: compiler-rt/trunk/lib/xray/xray_buffer_queue.cc
===================================================================
--- compiler-rt/trunk/lib/xray/xray_buffer_queue.cc
+++ compiler-rt/trunk/lib/xray/xray_buffer_queue.cc
@@ -16,68 +16,42 @@
 #include "sanitizer_common/sanitizer_common.h"
 #include "sanitizer_common/sanitizer_libc.h"
 #include "sanitizer_common/sanitizer_posix.h"
+#include "xray_allocator.h"
+#include "xray_defs.h"
 #include <memory>
 #include <sys/mman.h>
 
 using namespace __xray;
 using namespace __sanitizer;
 
-template <class T> static T *allocRaw(size_t N) {
-  // TODO: Report errors?
-  void *A = reinterpret_cast<void *>(
-      internal_mmap(NULL, N * sizeof(T), PROT_WRITE | PROT_READ,
-                    MAP_ANONYMOUS | MAP_PRIVATE, -1, 0));
-  return (A == MAP_FAILED) ? nullptr : reinterpret_cast<T *>(A);
-}
-
-template <class T> static void deallocRaw(T *ptr, size_t N) {
-  // TODO: Report errors?
-  if (ptr != nullptr)
-    internal_munmap(ptr, N);
-}
-
-template <class T> static T *initArray(size_t N) {
-  auto A = allocRaw<T>(N);
-  if (A != nullptr)
-    while (N > 0)
-      new (A + (--N)) T();
-  return A;
-}
-
-BufferQueue::BufferQueue(size_t B, size_t N, bool &Success)
-    : BufferSize(B), Buffers(initArray<BufferQueue::BufferRep>(N)),
-      BufferCount(N), Finalizing{0}, OwnedBuffers(initArray<void *>(N)),
-      Next(Buffers), First(Buffers), LiveBuffers(0) {
-  if (Buffers == nullptr) {
+BufferQueue::BufferQueue(size_t B, size_t N,
+                         bool &Success) XRAY_NEVER_INSTRUMENT
+    : BufferSize(B),
+      BufferCount(N),
+      Mutex(),
+      Finalizing{0},
+      BackingStore(allocateBuffer(B * N)),
+      Buffers(initArray<BufferQueue::BufferRep>(N)),
+      Next(Buffers),
+      First(Buffers),
+      LiveBuffers(0) {
+  if (BackingStore == nullptr) {
     Success = false;
     return;
   }
-  if (OwnedBuffers == nullptr) {
-    // Clean up the buffers we've already allocated.
-    for (auto B = Buffers, E = Buffers + BufferCount; B != E; ++B)
-      B->~BufferRep();
-    deallocRaw(Buffers, N);
+  if (Buffers == nullptr) {
+    deallocateBuffer(BackingStore, BufferSize * BufferCount);
     Success = false;
     return;
-  };
+  }
 
   for (size_t i = 0; i < N; ++i) {
     auto &T = Buffers[i];
-    void *Tmp = allocRaw<char>(BufferSize);
-    if (Tmp == nullptr) {
-      Success = false;
-      return;
-    }
-    auto *Extents = allocRaw<BufferExtents>(1);
-    if (Extents == nullptr) {
-      Success = false;
-      return;
-    }
     auto &Buf = T.Buff;
-    Buf.Data = Tmp;
+    Buf.Data = reinterpret_cast<char *>(BackingStore) + (BufferSize * i);
     Buf.Size = B;
-    Buf.Extents = Extents;
-    OwnedBuffers[i] = Tmp;
+    atomic_store(&Buf.Extents, 0, memory_order_release);
+    T.Used = false;
   }
   Success = true;
 }
@@ -85,13 +59,17 @@
 BufferQueue::ErrorCode BufferQueue::getBuffer(Buffer &Buf) {
   if (atomic_load(&Finalizing, memory_order_acquire))
     return ErrorCode::QueueFinalizing;
 
+  SpinMutexLock Guard(&Mutex);
   if (LiveBuffers == BufferCount)
     return ErrorCode::NotEnoughMemory;
 
   auto &T = *Next;
   auto &B = T.Buff;
-  Buf = B;
+  auto Extents = atomic_load(&B.Extents, memory_order_acquire);
+  atomic_store(&Buf.Extents, Extents, memory_order_release);
+  Buf.Data = B.Data;
+  Buf.Size = B.Size;
   T.Used = true;
 
   ++LiveBuffers;
@@ -102,15 +80,11 @@
 }
 
 BufferQueue::ErrorCode BufferQueue::releaseBuffer(Buffer &Buf) {
-  // Blitz through the buffers array to find the buffer.
-  bool Found = false;
-  for (auto I = OwnedBuffers, E = OwnedBuffers + BufferCount; I != E; ++I) {
-    if (*I == Buf.Data) {
-      Found = true;
-      break;
-    }
-  }
-  if (!Found)
+  // Check whether the buffer being referred to is within the bounds of the
+  // backing store's range.
+  if (Buf.Data < BackingStore ||
+      Buf.Data >
+          reinterpret_cast<char *>(BackingStore) + (BufferCount * BufferSize))
     return ErrorCode::UnrecognizedBuffer;
 
   SpinMutexLock Guard(&Mutex);
@@ -121,10 +95,14 @@
     return ErrorCode::NotEnoughMemory;
 
   // Now that the buffer has been released, we mark it as "used".
-  First->Buff = Buf;
+  auto Extents = atomic_load(&Buf.Extents, memory_order_acquire);
+  atomic_store(&First->Buff.Extents, Extents, memory_order_release);
+  First->Buff.Data = Buf.Data;
+  First->Buff.Size = Buf.Size;
   First->Used = true;
   Buf.Data = nullptr;
   Buf.Size = 0;
+  atomic_store(&Buf.Extents, 0, memory_order_release);
   --LiveBuffers;
   if (++First == (Buffers + BufferCount))
     First = Buffers;
@@ -139,14 +117,8 @@
 }
 
 BufferQueue::~BufferQueue() {
-  for (auto I = Buffers, E = Buffers + BufferCount; I != E; ++I) {
-    auto &T = *I;
-    auto &Buf = T.Buff;
-    deallocRaw(Buf.Data, Buf.Size);
-    deallocRaw(Buf.Extents, 1);
-  }
   for (auto B = Buffers, E = Buffers + BufferCount; B != E; ++B)
     B->~BufferRep();
-  deallocRaw(Buffers, BufferCount);
-  deallocRaw(OwnedBuffers, BufferCount);
+  deallocateBuffer(Buffers, BufferCount);
+  deallocateBuffer(BackingStore, BufferSize * BufferCount);
 }
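With BufferExtents gone, the number of valid bytes in a buffer travels with the Buffer itself as an embedded atomic. A sketch of the intended round trip against the new interface (not part of the patch; error handling abbreviated, and `Q` is assumed to be a successfully constructed queue):

    // Sketch only: getBuffer/releaseBuffer with the embedded Extents.
    void roundTripExample(BufferQueue &Q) XRAY_NEVER_INSTRUMENT {
      BufferQueue::Buffer Buf;
      if (Q.getBuffer(Buf) != BufferQueue::ErrorCode::Ok)
        return;
      // Write into the slice of the backing store the queue handed out...
      internal_memcpy(Buf.Data, "XRAY", 4);
      // ...then publish how many bytes are valid; readers pair this with
      // atomic_load(..., memory_order_acquire).
      atomic_store(&Buf.Extents, 4, memory_order_release);
      // releaseBuffer copies Data/Size/Extents back into the owning
      // BufferRep and zeroes out the caller's handle.
      Q.releaseBuffer(Buf);
    }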
Index: compiler-rt/trunk/lib/xray/xray_fdr_logging.cc
===================================================================
--- compiler-rt/trunk/lib/xray/xray_fdr_logging.cc
+++ compiler-rt/trunk/lib/xray/xray_fdr_logging.cc
@@ -30,6 +30,7 @@
 #include "sanitizer_common/sanitizer_common.h"
 #include "xray/xray_interface.h"
 #include "xray/xray_records.h"
+#include "xray_allocator.h"
 #include "xray_buffer_queue.h"
 #include "xray_defs.h"
 #include "xray_fdr_flags.h"
@@ -47,7 +48,7 @@
 // force the alignment to 64-bytes for x86 cache line alignment, as this
 // structure is used in the hot path of implementation.
 struct alignas(64) ThreadLocalData {
-  BufferQueue::Buffer Buffer;
+  BufferQueue::Buffer Buffer{};
   char *RecordPtr = nullptr;
   // The number of FunctionEntry records immediately preceding RecordPtr.
   uint8_t NumConsecutiveFnEnters = 0;
@@ -81,6 +82,7 @@
 static pthread_key_t Key;
 
 // Global BufferQueue.
+static std::aligned_storage<sizeof(BufferQueue)>::type BufferQueueStorage;
 static BufferQueue *BQ = nullptr;
 
 static atomic_sint32_t LogFlushStatus = {
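BufferQueueStorage replaces the InternalAlloc'd queue: the global BQ is now placement-constructed into static aligned storage and torn down with an explicit destructor call, so no sanitizer-heap traffic is involved. The pattern in isolation (a sketch, not the patch's literal code; assumes <new> is available for placement new):

    // Sketch only: static storage + placement new, as used for BQ here.
    static std::aligned_storage<sizeof(BufferQueue)>::type Storage;

    BufferQueue *constructQueue(size_t Size, size_t Count,
                                bool &Success) XRAY_NEVER_INSTRUMENT {
      auto *Q = reinterpret_cast<BufferQueue *>(&Storage);
      new (Q) BufferQueue(Size, Count, Success); // no heap allocation
      return Q;
    }

    void destroyQueue(BufferQueue *Q) XRAY_NEVER_INSTRUMENT {
      Q->~BufferQueue(); // storage is static; there is nothing to free
    }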
@@ -184,11 +186,10 @@
   TLD.NumTailCalls = 0;
   if (TLD.BQ == nullptr || TLD.BQ->finalizing())
     return;
-  internal_memcpy(TLD.RecordPtr, Metadata, sizeof(Metadata));
-  TLD.RecordPtr += sizeof(Metadata);
-  // Since we write out the extents as the first metadata record of the
-  // buffer, we need to write out the extents including the extents record.
-  atomic_store(&TLD.Buffer.Extents->Size, sizeof(Metadata),
+  internal_memcpy(TLD.RecordPtr, Metadata,
+                  sizeof(MetadataRecord) * InitRecordsCount);
+  TLD.RecordPtr += sizeof(MetadataRecord) * InitRecordsCount;
+  atomic_store(&TLD.Buffer.Extents, sizeof(MetadataRecord) * InitRecordsCount,
                memory_order_release);
 }
 
@@ -209,12 +210,12 @@
 
 static void incrementExtents(size_t Add) {
   auto &TLD = getThreadLocalData();
-  atomic_fetch_add(&TLD.Buffer.Extents->Size, Add, memory_order_acq_rel);
+  atomic_fetch_add(&TLD.Buffer.Extents, Add, memory_order_acq_rel);
 }
 
 static void decrementExtents(size_t Subtract) {
   auto &TLD = getThreadLocalData();
-  atomic_fetch_sub(&TLD.Buffer.Extents->Size, Subtract, memory_order_acq_rel);
+  atomic_fetch_sub(&TLD.Buffer.Extents, Subtract, memory_order_acq_rel);
 }
 
 static void writeNewCPUIdMetadata(uint16_t CPU,
@@ -583,7 +584,7 @@
 
   auto &TLD = getThreadLocalData();
 
-  if (TLD.BQ == nullptr)
+  if (TLD.BQ == nullptr && BQ != nullptr)
     TLD.BQ = BQ;
 
   if (!isLogInitializedAndReady(TLD.BQ, TSC, CPU, wall_clock_reader))
@@ -758,6 +759,7 @@
   static BufferQueue::const_iterator It{};
   static BufferQueue::const_iterator End{};
   static void *CurrentBuffer{nullptr};
+  static size_t SerializedBufferSize = 0;
   if (B.Data == static_cast<void *>(&Header) && B.Size == sizeof(Header)) {
     // From this point on, we provide raw access to the raw buffer we're getting
     // from the BufferQueue. We're relying on the iterators from the current
@@ -767,7 +769,7 @@
   }
 
   if (CurrentBuffer != nullptr) {
-    InternalFree(CurrentBuffer);
+    deallocateBuffer(CurrentBuffer, SerializedBufferSize);
     CurrentBuffer = nullptr;
   }
 
@@ -778,9 +780,9 @@
   // out to disk. The difference here would be that we still write "empty"
   // buffers, or at least go through the iterators faithfully to let the
   // handlers see the empty buffers in the queue.
-  auto BufferSize = atomic_load(&It->Extents->Size, memory_order_acquire);
-  auto SerializedBufferSize = BufferSize + sizeof(MetadataRecord);
-  CurrentBuffer = InternalAlloc(SerializedBufferSize);
+  auto BufferSize = atomic_load(&It->Extents, memory_order_acquire);
+  SerializedBufferSize = BufferSize + sizeof(MetadataRecord);
+  CurrentBuffer = allocateBuffer(SerializedBufferSize);
   if (CurrentBuffer == nullptr)
     return {nullptr, 0};
 
@@ -848,7 +850,6 @@
       if (TLD.RecordPtr != nullptr && TLD.BQ != nullptr)
         releaseThreadLocalBuffer(*TLD.BQ);
       BQ->~BufferQueue();
-      InternalFree(BQ);
       BQ = nullptr;
     }
   });
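The hunks above route all extents accounting through the atomic embedded in the thread-local Buffer. Schematically, a record writer publishes its bytes like this (a sketch, not part of the patch; `Record` and `Size` are illustrative):

    // Sketch only: bracketing a record write with the extents counter.
    static void writeRecordExample(const char *Record,
                                   size_t Size) XRAY_NEVER_INSTRUMENT {
      auto &TLD = getThreadLocalData();
      internal_memcpy(TLD.RecordPtr, Record, Size);
      TLD.RecordPtr += Size;
      // Pairs with the acquire loads in the flush path and in
      // BufferQueue::getBuffer/releaseBuffer.
      atomic_fetch_add(&TLD.Buffer.Extents, Size, memory_order_acq_rel);
    }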
@@ -883,6 +884,13 @@
   retryingWriteAll(Fd, reinterpret_cast<char *>(&Header),
                    reinterpret_cast<char *>(&Header) + sizeof(Header));
 
+  // Release the current thread's buffer before we attempt to write out all the
+  // buffers. This ensures that in case we had only a single thread going, that
+  // we are able to capture the data nonetheless.
+  auto &TLD = getThreadLocalData();
+  if (TLD.RecordPtr != nullptr && TLD.BQ != nullptr)
+    releaseThreadLocalBuffer(*TLD.BQ);
+
   BQ->apply([&](const BufferQueue::Buffer &B) {
     // Starting at version 2 of the FDR logging implementation, we only write
     // the records identified by the extents of the buffer. We use the Extents
     // from the Buffer and write it out as a Metadata record, and let the disk
     // still use a Metadata record, but fill in the extents instead for the
     // data.
     MetadataRecord ExtentsRecord;
-    auto BufferExtents = atomic_load(&B.Extents->Size, memory_order_acquire);
+    auto BufferExtents = atomic_load(&B.Extents, memory_order_acquire);
     DCHECK(BufferExtents <= B.Size);
     ExtentsRecord.Type = uint8_t(RecordType::Metadata);
     ExtentsRecord.RecordKind =
@@ -922,7 +930,12 @@
 
   // Do special things to make the log finalize itself, and not allow any more
   // operations to be performed until re-initialized.
-  BQ->finalize();
+  if (BQ == nullptr) {
+    if (Verbosity())
+      Report("Attempting to finalize an uninitialized global buffer!\n");
+  } else {
+    BQ->finalize();
+  }
 
   atomic_store(&LoggingStatus, XRayLogInitStatus::XRAY_LOG_FINALIZED,
                memory_order_release);
@@ -1132,13 +1145,11 @@
 
   if (BQ != nullptr) {
     BQ->~BufferQueue();
-    InternalFree(BQ);
     BQ = nullptr;
   }
 
   if (BQ == nullptr) {
-    BQ = reinterpret_cast<BufferQueue *>(
-        InternalAlloc(sizeof(BufferQueue), nullptr, 64));
+    BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage);
     new (BQ) BufferQueue(BufferSize, BufferMax, Success);
   }
 
@@ -1146,7 +1157,6 @@
     Report("BufferQueue init failed.\n");
     if (BQ != nullptr) {
       BQ->~BufferQueue();
-      InternalFree(BQ);
       BQ = nullptr;
     }
     return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
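At flush time the consumer side of the same protocol reads the counter back with an acquire load before touching the data, as the apply(...) callback above does. Condensed into a standalone sketch (not part of the patch; `consume` is a hypothetical sink for one buffer's valid bytes):

    // Sketch only: how a reader consumes one buffer's published bytes.
    void flushOneBuffer(const BufferQueue::Buffer &B,
                        void (*consume)(const void *, size_t)) {
      // Only the bytes the writer published through Extents are meaningful.
      auto BufferExtents = atomic_load(&B.Extents, memory_order_acquire);
      DCHECK(BufferExtents <= B.Size);
      consume(B.Data, BufferExtents);
    }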