Index: llvm/trunk/include/llvm/Transforms/Instrumentation.h
===================================================================
--- llvm/trunk/include/llvm/Transforms/Instrumentation.h
+++ llvm/trunk/include/llvm/Transforms/Instrumentation.h
@@ -123,6 +123,7 @@
   enum Type {
     ESAN_None = 0,
     ESAN_CacheFrag,
+    ESAN_WorkingSet, // Measures the working set size.
   } ToolType;
 };
 
Index: llvm/trunk/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
+++ llvm/trunk/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
@@ -42,6 +42,9 @@
 static cl::opt<bool>
     ClToolCacheFrag("esan-cache-frag", cl::init(false),
                     cl::desc("Detect data cache fragmentation"), cl::Hidden);
+static cl::opt<bool>
+    ClToolWorkingSet("esan-working-set", cl::init(false),
+                     cl::desc("Measure the working set size"), cl::Hidden);
 // Each new tool will get its own opt flag here.
 // These are converted to EfficiencySanitizerOptions for use
 // in the code.
@@ -65,12 +68,31 @@
 static const char *const EsanInitName = "__esan_init";
 static const char *const EsanExitName = "__esan_exit";
 
+// We must keep these Shadow* constants consistent with the esan runtime.
+// FIXME: Try to place these shadow constants, the names of the __esan_*
+// interface functions, and the ToolType enum into a header shared between
+// llvm and compiler-rt.
+static const uint64_t ShadowMask = 0x00000fffffffffffull; // Low 44 bits.
+static const uint64_t ShadowOffs[3] = { // Indexed by scale (0 through 2).
+  0x0000130000000000ull, // Scale 0.
+  0x0000220000000000ull, // Scale 1.
+  0x0000440000000000ull, // Scale 2.
+};
+// Shadow scale (right-shift amount) per tool, indexed by the ToolType enum.
+static const int ShadowScale[] = {
+  0, // ESAN_None.
+  2, // ESAN_CacheFrag: 4B:1B, so 4 to 1 == >>2.
+  6, // ESAN_WorkingSet: 64B:1B, so 64 to 1 == >>6.
+};
+
 namespace {
 
 static EfficiencySanitizerOptions
 OverrideOptionsFromCL(EfficiencySanitizerOptions Options) {
   if (ClToolCacheFrag)
     Options.ToolType = EfficiencySanitizerOptions::ESAN_CacheFrag;
+  else if (ClToolWorkingSet)
+    Options.ToolType = EfficiencySanitizerOptions::ESAN_WorkingSet;
 
   // Direct opt invocation with no params will have the default ESAN_None.
   // We run the default tool in that case.
@@ -100,11 +122,14 @@
   bool instrumentMemIntrinsic(MemIntrinsic *MI);
   bool shouldIgnoreMemoryAccess(Instruction *I);
   int getMemoryAccessFuncIndex(Value *Addr, const DataLayout &DL);
+  Value *appToShadow(Value *Shadow, IRBuilder<> &IRB);
   bool instrumentFastpath(Instruction *I, const DataLayout &DL, bool IsStore,
                           Value *Addr, unsigned Alignment);
   // Each tool has its own fastpath routine:
   bool instrumentFastpathCacheFrag(Instruction *I, const DataLayout &DL,
                                    Value *Addr, unsigned Alignment);
+  bool instrumentFastpathWorkingSet(Instruction *I, const DataLayout &DL,
+                                    Value *Addr, unsigned Alignment);
 
   EfficiencySanitizerOptions Options;
   LLVMContext *Ctx;
@@ -226,11 +251,30 @@
   return true;
 }
 
+Value *EfficiencySanitizer::appToShadow(Value *Shadow, IRBuilder<> &IRB) {
+  // Emits IR computing ((App & Mask) + Offs) >> Scale.  The incoming Shadow
+  // value is the application address as an integer; the result is returned.
+  const int Scale = ShadowScale[Options.ToolType];
+  // ShadowOffs only covers scales 0..2; beyond that, shift the scale-0 entry.
+  const uint64_t Offs =
+      Scale <= 2 ? ShadowOffs[Scale] : (ShadowOffs[0] << Scale);
+  Value *Masked = IRB.CreateAnd(Shadow, ConstantInt::get(IntptrTy, ShadowMask));
+  Value *Biased = IRB.CreateAdd(Masked, ConstantInt::get(IntptrTy, Offs));
+  // A non-positive scale needs no shift instruction.
+  if (Scale <= 0)
+    return Biased;
+  return IRB.CreateLShr(Biased, Scale);
+}
+
 bool EfficiencySanitizer::shouldIgnoreMemoryAccess(Instruction *I) {
   if (Options.ToolType == EfficiencySanitizerOptions::ESAN_CacheFrag) {
     // We'd like to know about cache fragmentation in vtable accesses and
     // constant data references, so we do not currently ignore anything.
     return false;
+  } else if (Options.ToolType == EfficiencySanitizerOptions::ESAN_WorkingSet) {
+    // TODO: the instrumentation disturbs the data layout on the stack, so we
+    // may want to add an option to ignore stack references (if we can
+    // distinguish them) to reduce overhead.
   }
   // TODO(bruening): future tools will be returning true for some cases.
   return false;
@@ -309,6 +353,11 @@
   Type *OrigTy = cast<PointerType>(Addr->getType())->getElementType();
   const uint32_t TypeSizeBytes = DL.getTypeStoreSizeInBits(OrigTy) / 8;
   Value *OnAccessFunc = nullptr;
+
+  // Convert 0 to the default alignment.
+  if (Alignment == 0)
+    Alignment = DL.getPrefTypeAlignment(OrigTy);
+
   if (IsStore)
     NumInstrumentedStores++;
   else
@@ -384,6 +433,8 @@
                                              Value *Addr, unsigned Alignment) {
   if (Options.ToolType == EfficiencySanitizerOptions::ESAN_CacheFrag) {
     return instrumentFastpathCacheFrag(I, DL, Addr, Alignment);
+  } else if (Options.ToolType == EfficiencySanitizerOptions::ESAN_WorkingSet) {
+    return instrumentFastpathWorkingSet(I, DL, Addr, Alignment);
   }
   return false;
 }
@@ -395,3 +446,56 @@
   // TODO(bruening): implement a fastpath for aligned accesses
   return false;
 }
+
+// Inlines the working-set fastpath before I: marks the shadow byte of the
+// cache line holding Addr.  Returns false (no IR emitted) when the access is
+// not known to stay within one cache line, leaving it to the slowpath.
+bool EfficiencySanitizer::instrumentFastpathWorkingSet(
+    Instruction *I, const DataLayout &DL, Value *Addr, unsigned Alignment) {
+  assert(ShadowScale[Options.ToolType] == 6); // The code below assumes this
+  IRBuilder<> IRB(I);
+  Type *OrigTy = cast<PointerType>(Addr->getType())->getElementType();
+  const uint32_t TypeSize = DL.getTypeStoreSizeInBits(OrigTy);
+  // Bail to the slowpath if the access might touch multiple cache lines.
+  // An access aligned to its size is guaranteed to be intra-cache-line.
+  // getMemoryAccessFuncIndex has already ruled out a size larger than 16
+  // bytes and thus larger than a cache line (our shadow memory setup
+  // assumes 64-byte cache lines).  TypeSize is in bits, so the bound is 128.
+  assert(TypeSize <= 128);
+  if (!(TypeSize == 8 || (Alignment % (TypeSize / 8)) == 0))
+    return false;
+
+  // We inline instrumentation to set the corresponding shadow bits for
+  // each cache line touched by the application.  The check above already
+  // ruled out an access spanning two cache lines, so exactly one shadow
+  // byte needs updating.  Our shadow memory model is fine with races when
+  // manipulating shadow values.  We generate the following code:
+  //
+  //   const char BitMask = 0x81;
+  //   char *ShadowAddr = appToShadow(AppAddr);
+  //   if ((*ShadowAddr & BitMask) != BitMask)
+  //     *ShadowAddr |= BitMask;
+  //
+  Value *AddrPtr = IRB.CreatePointerCast(Addr, IntptrTy);
+  Value *ShadowPtr = appToShadow(AddrPtr, IRB);
+  Type *ShadowTy = IntegerType::get(*Ctx, 8U);
+  Type *ShadowPtrTy = PointerType::get(ShadowTy, 0);
+  // The bottom bit is used for the current sampling period's working set.
+  // The top bit is used for the total working set.  We set both on each
+  // memory access, if they are not already set.
+  Value *ValueMask = ConstantInt::get(ShadowTy, 0x81); // 10000001B
+
+  Value *OldValue = IRB.CreateLoad(IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy));
+  // The AND and CMP will be turned into a TEST instruction by the compiler.
+  Value *Cmp = IRB.CreateICmpNE(IRB.CreateAnd(OldValue, ValueMask), ValueMask);
+  TerminatorInst *CmpTerm = SplitBlockAndInsertIfThen(Cmp, I, false);
+  // FIXME: do I need to call SetCurrentDebugLocation?
+  IRB.SetInsertPoint(CmpTerm);
+  // We use OR to set the shadow bits to avoid corrupting the middle 6 bits,
+  // which are used by the runtime library.
+  Value *NewVal = IRB.CreateOr(OldValue, ValueMask);
+  IRB.CreateStore(NewVal, IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy));
+  IRB.SetInsertPoint(I);
+
+  return true;
+}
Index: llvm/trunk/test/Instrumentation/EfficiencySanitizer/working_set_basic.ll
===================================================================
--- llvm/trunk/test/Instrumentation/EfficiencySanitizer/working_set_basic.ll
+++ llvm/trunk/test/Instrumentation/EfficiencySanitizer/working_set_basic.ll
@@ -0,0 +1,164 @@
+; Test basic EfficiencySanitizer working set instrumentation.
+;
+; RUN: opt < %s -esan -esan-working-set -S | FileCheck %s
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Intra-cache-line: size-aligned accesses get the inlined shadow update.
+
+define i8 @aligned1(i8* %a) {
+entry:
+  %tmp1 = load i8, i8* %a, align 1
+  ret i8 %tmp1
+; CHECK: @llvm.global_ctors = {{.*}}@esan.module_ctor
+; CHECK:        %0 = ptrtoint i8* %a to i64
+; CHECK-NEXT:   %1 = and i64 %0, 17592186044415
+; CHECK-NEXT:   %2 = add i64 %1, 1337006139375616
+; CHECK-NEXT:   %3 = lshr i64 %2, 6
+; CHECK-NEXT:   %4 = inttoptr i64 %3 to i8*
+; CHECK-NEXT:   %5 = load i8, i8* %4
+; CHECK-NEXT:   %6 = and i8 %5, -127
+; CHECK-NEXT:   %7 = icmp ne i8 %6, -127
+; CHECK-NEXT:   br i1 %7, label %8, label %11
+; CHECK:        %9 = or i8 %5, -127
+; CHECK-NEXT:   %10 = inttoptr i64 %3 to i8*
+; CHECK-NEXT:   store i8 %9, i8* %10
+; CHECK-NEXT:   br label %11
+; CHECK:        %tmp1 = load i8, i8* %a, align 1
+; CHECK-NEXT:   ret i8 %tmp1
+}
+
+define i16 @aligned2(i16* %a) {
+entry:
+  %tmp1 = load i16, i16* %a, align 2
+  ret i16 %tmp1
+; CHECK:        %0 = ptrtoint i16* %a to i64
+; CHECK-NEXT:   %1 = and i64 %0, 17592186044415
+; CHECK-NEXT:   %2 = add i64 %1, 1337006139375616
+; CHECK-NEXT:   %3 = lshr i64 %2, 6
+; CHECK-NEXT:   %4 = inttoptr i64 %3 to i8*
+; CHECK-NEXT:   %5 = load i8, i8* %4
+; CHECK-NEXT:   %6 = and i8 %5, -127
+; CHECK-NEXT:   %7 = icmp ne i8 %6, -127
+; CHECK-NEXT:   br i1 %7, label %8, label %11
+; CHECK:        %9 = or i8 %5, -127
+; CHECK-NEXT:   %10 = inttoptr i64 %3 to i8*
+; CHECK-NEXT:   store i8 %9, i8* %10
+; CHECK-NEXT:   br label %11
+; CHECK:        %tmp1 = load i16, i16* %a, align 2
+; CHECK-NEXT:   ret i16 %tmp1
+}
+
+define i32 @aligned4(i32* %a) {
+entry:
+  %tmp1 = load i32, i32* %a, align 4
+  ret i32 %tmp1
+; CHECK:        %0 = ptrtoint i32* %a to i64
+; CHECK-NEXT:   %1 = and i64 %0, 17592186044415
+; CHECK-NEXT:   %2 = add i64 %1, 1337006139375616
+; CHECK-NEXT:   %3 = lshr i64 %2, 6
+; CHECK-NEXT:   %4 = inttoptr i64 %3 to i8*
+; CHECK-NEXT:   %5 = load i8, i8* %4
+; CHECK-NEXT:   %6 = and i8 %5, -127
+; CHECK-NEXT:   %7 = icmp ne i8 %6, -127
+; CHECK-NEXT:   br i1 %7, label %8, label %11
+; CHECK:        %9 = or i8 %5, -127
+; CHECK-NEXT:   %10 = inttoptr i64 %3 to i8*
+; CHECK-NEXT:   store i8 %9, i8* %10
+; CHECK-NEXT:   br label %11
+; CHECK:        %tmp1 = load i32, i32* %a, align 4
+; CHECK-NEXT:   ret i32 %tmp1
+}
+
+define i64 @aligned8(i64* %a) {
+entry:
+  %tmp1 = load i64, i64* %a, align 8
+  ret i64 %tmp1
+; CHECK:        %0 = ptrtoint i64* %a to i64
+; CHECK-NEXT:   %1 = and i64 %0, 17592186044415
+; CHECK-NEXT:   %2 = add i64 %1, 1337006139375616
+; CHECK-NEXT:   %3 = lshr i64 %2, 6
+; CHECK-NEXT:   %4 = inttoptr i64 %3 to i8*
+; CHECK-NEXT:   %5 = load i8, i8* %4
+; CHECK-NEXT:   %6 = and i8 %5, -127
+; CHECK-NEXT:   %7 = icmp ne i8 %6, -127
+; CHECK-NEXT:   br i1 %7, label %8, label %11
+; CHECK:        %9 = or i8 %5, -127
+; CHECK-NEXT:   %10 = inttoptr i64 %3 to i8*
+; CHECK-NEXT:   store i8 %9, i8* %10
+; CHECK-NEXT:   br label %11
+; CHECK:        %tmp1 = load i64, i64* %a, align 8
+; CHECK-NEXT:   ret i64 %tmp1
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Not guaranteed to be intra-cache-line: expect the slowpath runtime call.
+
+define i16 @unaligned2(i16* %a) {
+entry:
+  %tmp1 = load i16, i16* %a, align 1
+  ret i16 %tmp1
+; CHECK:          %0 = bitcast i16* %a to i8*
+; CHECK-NEXT:   call void @__esan_unaligned_load2(i8* %0)
+; CHECK-NEXT:   %tmp1 = load i16, i16* %a, align 1
+; CHECK-NEXT:   ret i16 %tmp1
+}
+
+define i32 @unaligned4(i32* %a) {
+entry:
+  %tmp1 = load i32, i32* %a, align 2
+  ret i32 %tmp1
+; CHECK:          %0 = bitcast i32* %a to i8*
+; CHECK-NEXT:   call void @__esan_unaligned_load4(i8* %0)
+; CHECK-NEXT:   %tmp1 = load i32, i32* %a, align 2
+; CHECK-NEXT:   ret i32 %tmp1
+}
+
+define i64 @unaligned8(i64* %a) {
+entry:
+  %tmp1 = load i64, i64* %a, align 4
+  ret i64 %tmp1
+; CHECK:          %0 = bitcast i64* %a to i8*
+; CHECK-NEXT:   call void @__esan_unaligned_load8(i8* %0)
+; CHECK-NEXT:   %tmp1 = load i64, i64* %a, align 4
+; CHECK-NEXT:   ret i64 %tmp1
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Ensure that esan converts intrinsics to calls:
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1)
+declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1)
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
+
+define void @memCpyTest(i8* nocapture %x, i8* nocapture %y) {
+entry:
+    tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %x, i8* %y, i64 16, i32 4, i1 false)
+    ret void
+; CHECK: define void @memCpyTest
+; CHECK: call i8* @memcpy
+; CHECK: ret void
+}
+
+define void @memMoveTest(i8* nocapture %x, i8* nocapture %y) {
+entry:
+    tail call void @llvm.memmove.p0i8.p0i8.i64(i8* %x, i8* %y, i64 16, i32 4, i1 false)
+    ret void
+; CHECK: define void @memMoveTest
+; CHECK: call i8* @memmove
+; CHECK: ret void
+}
+
+define void @memSetTest(i8* nocapture %x) {
+entry:
+    tail call void @llvm.memset.p0i8.i64(i8* %x, i8 77, i64 16, i32 4, i1 false)
+    ret void
+; CHECK: define void @memSetTest
+; CHECK: call i8* @memset
+; CHECK: ret void
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Top-level:
+
+; CHECK: define internal void @esan.module_ctor()
+; CHECK: call void @__esan_init(i32 2, i64 ptrtoint (i64* @0 to i64))