Index: lib/esan/esan_circular_buffer.h =================================================================== --- lib/esan/esan_circular_buffer.h +++ lib/esan/esan_circular_buffer.h @@ -28,9 +28,11 @@ explicit CircularBuffer() {} CircularBuffer(uptr BufferCapacity) { initialize(BufferCapacity); + WasConstructed = true; } ~CircularBuffer() { - free(); + if (WasConstructed) // Else caller will call free() explicitly. + free(); } void initialize(uptr BufferCapacity) { Capacity = BufferCapacity; @@ -38,6 +40,7 @@ Data = (T *)MmapOrDie(Capacity * sizeof(T), "CircularBuffer"); StartIdx = 0; Count = 0; + WasConstructed = false; } void free() { UnmapOrDie(Data, Capacity * sizeof(T)); @@ -83,6 +86,7 @@ CircularBuffer(const CircularBuffer&); void operator=(const CircularBuffer&); + bool WasConstructed; T *Data; uptr Capacity; uptr StartIdx; Index: lib/esan/esan_flags.inc =================================================================== --- lib/esan/esan_flags.inc +++ lib/esan/esan_flags.inc @@ -39,3 +39,9 @@ // To disable samples, turn off record_snapshots. ESAN_FLAG(int, sample_freq, 20, "Working set tool: sampling frequency in milliseconds.") + +// This controls the difference in frequency between each successive series +// of snapshots. There are 8 in total, with number 0 using sample_freq. +// Series N takes one sample for every (1 << snapshot_step) samples of N-1. +ESAN_FLAG(int, snapshot_step, 2, "Working set tool: the log of the sampling " + "performed for the next-higher-frequency snapshot series.") Index: lib/esan/working_set.cpp =================================================================== --- lib/esan/working_set.cpp +++ lib/esan/working_set.cpp @@ -14,6 +14,7 @@ #include "working_set.h" #include "esan.h" +#include "esan_circular_buffer.h" #include "esan_flags.h" #include "esan_shadow.h" #include "esan_sideline.h" @@ -24,9 +25,15 @@ // cache line has ever been accessed. 
// - The lowest bit of each shadow byte indicates whether the corresponding // cache line was accessed since the last sample. -// - The other bits can be used either for a single working set snapshot -// between two consecutive samples, or an aggregate working set snapshot -// over multiple sample periods (future work). +// - The other bits are used for working set snapshots at successively +// lower frequencies, each bit to the left from the lowest bit stepping +// down the frequency by 2 to the power of getFlags()->snapshot_step. +// Thus, with the default snapshot_step of 2, we have: +// Bit 0: Since last sample +// Bit 1: Since last 2^2 samples +// Bit 2: Since last 2^4 samples +// Bit 3: ... +// Bit 7: Ever accessed. // We live with races in accessing each shadow byte. typedef unsigned char byte; @@ -37,6 +44,10 @@ // See the shadow byte layout description above. static const u32 TotalWorkingSetBitIdx = 7; +// We accumulate to the left until we hit this bit. +// We don't need to accumulate to the final bit as it's set on each ref +// by the compiler instrumentation. +static const u32 MaxAccumBitIdx = 6; static const u32 CurWorkingSetBitIdx = 0; static const byte ShadowAccessedVal = (1 << TotalWorkingSetBitIdx) | (1 << CurWorkingSetBitIdx); @@ -47,6 +58,26 @@ // may want to consider a 64-bit int. static u32 SnapshotNum; +// We store the working set size for each of 8 different sampling frequencies. +static const u32 NumFreq = 8; // One for each bit of our shadow bytes. +// We cannot rely on the constructors and destructors of static objects, as +// the global destructor is called prior to our finalize routine. +// These are each circular buffers, sized up front. +CircularBuffer SizePerFreq[NumFreq]; +// We cannot rely on static initializers (they may run too late) but +// we record the size here for clarity: +u32 CircularBufferSizes[NumFreq] = { + // These are each mmap-ed so our minimum is one page. 
+ 32*1024, + 16*1024, + 8*1024, + 4*1024, + 4*1024, + 4*1024, + 4*1024, + 4*1024, +}; + void processRangeAccessWorkingSet(uptr PC, uptr Addr, SIZE_T Size, bool IsWrite) { if (Size == 0) @@ -95,13 +126,17 @@ ByteValue << 24; // Get word aligned start. ShadowStart = RoundDownTo(ShadowStart, sizeof(u32)); + bool Accum = getFlags()->record_snapshots && BitIdx < MaxAccumBitIdx; for (u32 *Ptr = (u32 *)ShadowStart; Ptr < (u32 *)ShadowEnd; ++Ptr) { if ((*Ptr & WordValue) != 0) { byte *BytePtr = (byte *)Ptr; for (u32 j = 0; j < sizeof(u32); ++j) { if (BytePtr[j] & ByteValue) { ++WorkingSetSize; - // TODO: Accumulate to the lower-frequency bit to the left. + if (Accum) { + // Accumulate to the lower-frequency bit to the left. + BytePtr[j] |= (ByteValue << 1); + } } } // Clear this bit from every shadow byte. @@ -134,19 +169,41 @@ // This is invoked from a signal handler but in a sideline thread doing nothing // else so it is a little less fragile than a typical signal handler. static void takeSample(void *Arg) { - // FIXME: record the size and report at process end. For now this simply - // serves as a test of the sideline thread functionality. - VReport(1, "%s: snapshot #%d: %u\n", SanitizerToolName, SnapshotNum, - computeWorkingSizeAndReset(CurWorkingSetBitIdx)); - ++SnapshotNum; + u32 BitIdx = CurWorkingSetBitIdx; + u32 Freq = 1; + ++SnapshotNum; // Simpler to skip 0 whose mod matches everything. 
+ while (BitIdx <= MaxAccumBitIdx && (SnapshotNum % Freq) == 0) { + u32 NumLines = computeWorkingSizeAndReset(BitIdx); + VReport(1, "%s: snapshot #%5d bit %d freq %4d: %8u\n", SanitizerToolName, + SnapshotNum, BitIdx, Freq, NumLines); + SizePerFreq[BitIdx].push_back(NumLines); + Freq = Freq << getFlags()->snapshot_step; + BitIdx++; + } } void initializeWorkingSet() { CHECK(getFlags()->cache_line_size == CacheLineSize); registerMemoryFaultHandler(); - if (getFlags()->record_snapshots) + if (getFlags()->record_snapshots) { + for (u32 i = 0; i < NumFreq; ++i) + SizePerFreq[i].initialize(CircularBufferSizes[i]); Thread.launchThread(takeSample, nullptr, getFlags()->sample_freq); + } +} + +static u32 getPeriodForPrinting(u32 MilliSec, const char *&Unit) { + if (MilliSec > 600000) { + Unit = "min"; + return MilliSec / 60000; + } else if (MilliSec > 10000) { + Unit = "sec"; + return MilliSec / 1000; + } else { + Unit = "ms"; + return MilliSec; + } } static u32 getSizeForPrinting(u32 NumOfCachelines, const char *&Unit) { @@ -167,12 +224,28 @@ } int finalizeWorkingSet() { - if (getFlags()->record_snapshots) + const char *Unit; + if (getFlags()->record_snapshots) { Thread.joinThread(); - + u32 Freq = 1; + Report(" Total number of samples: %u\n", SnapshotNum); + for (u32 i = 0; i < NumFreq; ++i) { + u32 Time = getPeriodForPrinting(getFlags()->sample_freq*Freq, Unit); + Report(" Samples array #%d at period %u %s\n", i, Time, Unit); + // FIXME: report whether we wrapped around and thus whether we + // have data on the whole run or just the last N samples. + for (u32 j = 0; j < SizePerFreq[i].size(); ++j) { + u32 Size = getSizeForPrinting(SizePerFreq[i][j], Unit); + Report("#%4d: %8u %s (%9u cache lines)\n", j, Size, Unit, + SizePerFreq[i][j]); + } + Freq = Freq << getFlags()->snapshot_step; + SizePerFreq[i].free(); + } + } + // Get the working set size for the entire execution. 
u32 NumOfCachelines = computeWorkingSizeAndReset(TotalWorkingSetBitIdx); - const char *Unit; u32 Size = getSizeForPrinting(NumOfCachelines, Unit); Report(" %s: the total working set size: %u %s (%u cache lines)\n", SanitizerToolName, Size, Unit, NumOfCachelines); Index: test/esan/TestCases/workingset-samples.cpp =================================================================== --- test/esan/TestCases/workingset-samples.cpp +++ test/esan/TestCases/workingset-samples.cpp @@ -1,5 +1,5 @@ // RUN: %clang_esan_wset -O0 %s -o %t 2>&1 -// RUN: %env_esan_opts=verbosity=1 %run %t 2>&1 | FileCheck %s +// RUN: %run %t 2>&1 | FileCheck %s #include #include @@ -19,8 +19,23 @@ for (int i = 0; i < size; ++i) buf[i] = i; munmap(buf, size); - // CHECK: {{.*}}EfficiencySanitizer: snapshot {{.*}} - // CHECK-NEXT: {{.*}}EfficiencySanitizer: snapshot {{.*}} + // We only check for a few samples here to reduce the chance of flakiness. + // CHECK: =={{[0-9]+}}== Total number of samples: {{[0-9]+}} + // CHECK-NEXT: =={{[0-9]+}}== Samples array #0 at period 20 ms + // CHECK-NEXT: =={{[0-9]+}}==# 0: {{[ 0-9]+}} KB ({{[ 0-9]+}} cache lines) + // CHECK-NEXT: =={{[0-9]+}}==# 1: {{[ 0-9]+}} KB ({{[ 0-9]+}} cache lines) + // CHECK-NEXT: =={{[0-9]+}}==# 2: {{[ 0-9]+}} KB ({{[ 0-9]+}} cache lines) + // CHECK-NEXT: =={{[0-9]+}}==# 3: {{[ 0-9]+}} KB ({{[ 0-9]+}} cache lines) + // CHECK: =={{[0-9]+}}== Samples array #1 at period 80 ms + // CHECK-NEXT: =={{[0-9]+}}==# 0: {{[ 0-9]+}} KB ({{[ 0-9]+}} cache lines) + // CHECK-NEXT: =={{[0-9]+}}==# 1: {{[ 0-9]+}} KB ({{[ 0-9]+}} cache lines) + // CHECK: =={{[0-9]+}}== Samples array #2 at period 320 ms + // CHECK-NEXT: =={{[0-9]+}}==# 0: {{[ 0-9]+}} MB ({{[ 0-9]+}} cache lines) + // CHECK: =={{[0-9]+}}== Samples array #3 at period 1280 ms + // CHECK: =={{[0-9]+}}== Samples array #4 at period 5120 ms + // CHECK: =={{[0-9]+}}== Samples array #5 at period 20 sec + // CHECK: =={{[0-9]+}}== Samples array #6 at period 81 sec + // CHECK: =={{[0-9]+}}== Samples 
array #7 at period 327 sec // CHECK: {{.*}} EfficiencySanitizer: the total working set size: 32 MB (5242{{[0-9][0-9]}} cache lines) return 0; }