Index: lib/scudo/CMakeLists.txt
===================================================================
--- lib/scudo/CMakeLists.txt
+++ lib/scudo/CMakeLists.txt
@@ -14,6 +14,8 @@
   scudo_interceptors.cpp
   scudo_new_delete.cpp
   scudo_termination.cpp
+  scudo_thread_android.cpp
+  scudo_thread_linux.cpp
   scudo_utils.cpp)
 
 # Enable the SSE 4.2 instruction set for scudo_crc32.cpp, if available.
Index: lib/scudo/scudo_allocator.h
===================================================================
--- lib/scudo/scudo_allocator.h
+++ lib/scudo/scudo_allocator.h
@@ -82,8 +82,57 @@
   void copyTo(Flags *f, CommonFlags *cf) const;
 };
 
+#if SANITIZER_CAN_USE_ALLOCATOR64
+const uptr AllocatorSpace = ~0ULL;
+# if defined(__aarch64__) && SANITIZER_ANDROID
+const uptr AllocatorSize = 0x4000000000ULL;  // 256G.
+typedef DefaultSizeClassMap SizeClassMap;
+# elif defined(__aarch64__)
+// AArch64/SANITIZER_CAN_USE_ALLOCATOR64 is only for 42-bit VMA,
+// so there is no need for different values for different VMAs.
+const uptr AllocatorSize = 0x10000000000ULL;  // 1T.
+typedef DefaultSizeClassMap SizeClassMap;
+# else
+const uptr AllocatorSize = 0x40000000000ULL;  // 4T.
+typedef DefaultSizeClassMap SizeClassMap;
+# endif
+struct AP {
+  static const uptr kSpaceBeg = AllocatorSpace;
+  static const uptr kSpaceSize = AllocatorSize;
+  static const uptr kMetadataSize = 0;
+  typedef __scudo::SizeClassMap SizeClassMap;
+  typedef NoOpMapUnmapCallback MapUnmapCallback;
+  static const uptr kFlags =
+      SizeClassAllocator64FlagMasks::kRandomShuffleChunks;
+};
+typedef SizeClassAllocator64<AP> PrimaryAllocator;
+#else
+// Currently, the 32-bit Sanitizer allocator has not yet benefited from all the
+// security improvements brought to the 64-bit one. This makes the 32-bit
+// version of Scudo slightly less toughened.
+static const uptr RegionSizeLog = 20;
+static const uptr NumRegions = SANITIZER_MMAP_RANGE_SIZE >> RegionSizeLog;
+# if SANITIZER_WORDSIZE == 32
+typedef FlatByteMap<NumRegions> ByteMap;
+# elif SANITIZER_WORDSIZE == 64
+typedef TwoLevelByteMap<(NumRegions >> 12), 1 << 12> ByteMap;
+# endif  // SANITIZER_WORDSIZE
+typedef DefaultSizeClassMap SizeClassMap;
+typedef SizeClassAllocator32<0, SANITIZER_MMAP_RANGE_SIZE, 0, SizeClassMap,
+    RegionSizeLog, ByteMap> PrimaryAllocator;
+#endif  // SANITIZER_CAN_USE_ALLOCATOR64
+
+#include "scudo_allocator_secondary.h"
+
+typedef SizeClassAllocatorLocalCache<PrimaryAllocator> AllocatorCache;
+typedef ScudoLargeMmapAllocator SecondaryAllocator;
+typedef CombinedAllocator<PrimaryAllocator, AllocatorCache, SecondaryAllocator>
+    ScudoBackendAllocator;
+
+ScudoBackendAllocator &getBackendAllocator();
+
+void initScudo();
 void initAllocator(const AllocatorOptions &options);
-void drainQuarantine();
 
 void *scudoMalloc(uptr Size, AllocType Type);
 void scudoFree(void *Ptr, AllocType Type);
@@ -97,8 +146,6 @@
 void *scudoAlignedAlloc(uptr Alignment, uptr Size);
 uptr scudoMallocUsableSize(void *Ptr);
 
-#include "scudo_allocator_secondary.h"
-
 }  // namespace __scudo
 
 #endif  // SCUDO_ALLOCATOR_H_
Index: lib/scudo/scudo_allocator.cpp
===================================================================
--- lib/scudo/scudo_allocator.cpp
+++ lib/scudo/scudo_allocator.cpp
@@ -15,56 +15,16 @@
 //===----------------------------------------------------------------------===//
 
 #include "scudo_allocator.h"
+#include "scudo_thread.h"
 #include "scudo_utils.h"
 
 #include "sanitizer_common/sanitizer_allocator_interface.h"
 #include "sanitizer_common/sanitizer_quarantine.h"
 
-#include
-#include
-
-#include
+#include
 
 namespace __scudo {
 
-#if SANITIZER_CAN_USE_ALLOCATOR64
-const uptr AllocatorSpace = ~0ULL;
-const uptr AllocatorSize = 0x40000000000ULL;
-typedef DefaultSizeClassMap SizeClassMap;
-struct AP {
-  static const uptr kSpaceBeg = AllocatorSpace;
-  static const uptr kSpaceSize = AllocatorSize;
-  static const uptr kMetadataSize = 0;
-  typedef __scudo::SizeClassMap SizeClassMap;
-  typedef NoOpMapUnmapCallback MapUnmapCallback;
-  static const uptr kFlags =
-      SizeClassAllocator64FlagMasks::kRandomShuffleChunks;
-};
-typedef SizeClassAllocator64<AP> PrimaryAllocator;
-#else
-// Currently, the 32-bit Sanitizer allocator has not yet benefited from all the
-// security improvements brought to the 64-bit one. This makes the 32-bit
-// version of Scudo slightly less toughened.
-static const uptr RegionSizeLog = 20;
-static const uptr NumRegions = SANITIZER_MMAP_RANGE_SIZE >> RegionSizeLog;
-# if SANITIZER_WORDSIZE == 32
-typedef FlatByteMap<NumRegions> ByteMap;
-# elif SANITIZER_WORDSIZE == 64
-typedef TwoLevelByteMap<(NumRegions >> 12), 1 << 12> ByteMap;
-# endif  // SANITIZER_WORDSIZE
-typedef DefaultSizeClassMap SizeClassMap;
-typedef SizeClassAllocator32<0, SANITIZER_MMAP_RANGE_SIZE, 0, SizeClassMap,
-    RegionSizeLog, ByteMap> PrimaryAllocator;
-#endif  // SANITIZER_CAN_USE_ALLOCATOR64
-
-typedef SizeClassAllocatorLocalCache<PrimaryAllocator> AllocatorCache;
-typedef ScudoLargeMmapAllocator SecondaryAllocator;
-typedef CombinedAllocator<PrimaryAllocator, AllocatorCache, SecondaryAllocator>
-    ScudoAllocator;
-
-static ScudoAllocator &getAllocator();
-
-static thread_local Xorshift128Plus Prng;
 // Global static cookie, initialized at start-up.
 static uptr Cookie;
 
@@ -101,9 +61,10 @@
   // Returns the usable size for a chunk, meaning the amount of bytes from the
   // beginning of the user data to the end of the backend allocated chunk.
uptr getUsableSize(UnpackedHeader *Header) { - uptr Size = getAllocator().GetActuallyAllocatedSize(getAllocBeg(Header)); + uptr Size = + getBackendAllocator().GetActuallyAllocatedSize(getAllocBeg(Header)); if (Size == 0) - return Size; + return 0; return Size - AlignedChunkHeaderSize - (Header->Offset << MinAlignmentLog); } @@ -120,7 +81,8 @@ return static_cast(Crc); } - // Checks the validity of a chunk by verifying its checksum. + // Checks the validity of a chunk by verifying its checksum. This check + // doesn't incur termination in case of an invalid chunk. bool isValid() { UnpackedHeader NewUnpackedHeader; const AtomicPackedHeader *AtomicHeader = @@ -130,13 +92,27 @@ return (NewUnpackedHeader.Checksum == computeChecksum(&NewUnpackedHeader)); } + // Nulls out a chunk header. When returning the chunk to the backend, there + // is no need to store a valid ChunkAvailable header, as this would be + // computationally expensive. Zero-ing out serves the same purpose by making + // the header invalid. In the extremely rare event where 0 would be a valid + // checksum for the chunk, the state of the chunk is ChunkAvailable anyway. + COMPILER_CHECK(ChunkAvailable == 0); + void eraseHeader() { + PackedHeader NullPackedHeader = 0; + AtomicPackedHeader *AtomicHeader = + reinterpret_cast(this); + atomic_store_relaxed(AtomicHeader, NullPackedHeader); + } + // Loads and unpacks the header, verifying the checksum in the process. void loadHeader(UnpackedHeader *NewUnpackedHeader) const { const AtomicPackedHeader *AtomicHeader = reinterpret_cast(this); PackedHeader NewPackedHeader = atomic_load_relaxed(AtomicHeader); *NewUnpackedHeader = bit_cast(NewPackedHeader); - if (NewUnpackedHeader->Checksum != computeChecksum(NewUnpackedHeader)) { + if (UNLIKELY(NewUnpackedHeader->Checksum + != computeChecksum(NewUnpackedHeader))) { dieWithMessage("ERROR: corrupted chunk header at address %p\n", this); } } @@ -160,10 +136,10 @@ PackedHeader OldPackedHeader = bit_cast(*OldUnpackedHeader); AtomicPackedHeader *AtomicHeader = reinterpret_cast(this); - if (!atomic_compare_exchange_strong(AtomicHeader, - &OldPackedHeader, - NewPackedHeader, - memory_order_relaxed)) { + if (UNLIKELY(!atomic_compare_exchange_strong(AtomicHeader, + &OldPackedHeader, + NewPackedHeader, + memory_order_relaxed))) { dieWithMessage("ERROR: race on chunk header at address %p\n", this); } } @@ -171,30 +147,7 @@ static bool ScudoInitIsRunning = false; -static pthread_once_t GlobalInited = PTHREAD_ONCE_INIT; -static pthread_key_t PThreadKey; - -static thread_local bool ThreadInited = false; -static thread_local bool ThreadTornDown = false; -static thread_local AllocatorCache Cache; - -static void teardownThread(void *p) { - uptr v = reinterpret_cast(p); - // The glibc POSIX thread-local-storage deallocation routine calls user - // provided destructors in a loop of PTHREAD_DESTRUCTOR_ITERATIONS. - // We want to be called last since other destructors might call free and the - // like, so we wait until PTHREAD_DESTRUCTOR_ITERATIONS before draining the - // quarantine and swallowing the cache. 
- if (v < PTHREAD_DESTRUCTOR_ITERATIONS) { - pthread_setspecific(PThreadKey, reinterpret_cast(v + 1)); - return; - } - drainQuarantine(); - getAllocator().DestroyCache(&Cache); - ThreadTornDown = true; -} - -static void initInternal() { +void initScudo() { SanitizerToolName = "Scudo"; CHECK(!ScudoInitIsRunning && "Scudo init calls itself!"); ScudoInitIsRunning = true; @@ -205,28 +158,13 @@ } initFlags(); - AllocatorOptions Options; Options.setFrom(getFlags(), common_flags()); initAllocator(Options); - MaybeStartBackgroudThread(); - ScudoInitIsRunning = false; } -static void initGlobal() { - pthread_key_create(&PThreadKey, teardownThread); - initInternal(); -} - -static void NOINLINE initThread() { - pthread_once(&GlobalInited, initGlobal); - pthread_setspecific(PThreadKey, reinterpret_cast(1)); - getAllocator().InitCache(&Cache); - ThreadInited = true; -} - struct QuarantineCallback { explicit QuarantineCallback(AllocatorCache *Cache) : Cache_(Cache) {} @@ -235,38 +173,45 @@ void Recycle(ScudoChunk *Chunk) { UnpackedHeader Header; Chunk->loadHeader(&Header); - if (Header.State != ChunkQuarantine) { + if (UNLIKELY(Header.State != ChunkQuarantine)) { dieWithMessage("ERROR: invalid chunk state when recycling address %p\n", Chunk); } + Chunk->eraseHeader(); void *Ptr = Chunk->getAllocBeg(&Header); - getAllocator().Deallocate(Cache_, Ptr); + getBackendAllocator().Deallocate(Cache_, Ptr); } /// Internal quarantine allocation and deallocation functions. void *Allocate(uptr Size) { - // The internal quarantine memory cannot be protected by us. But the only - // structures allocated are QuarantineBatch, that are 8KB for x64. So we - // will use mmap for those, and given that Deallocate doesn't pass a size - // in, we enforce the size of the allocation to be sizeof(QuarantineBatch). - // TODO(kostyak): switching to mmap impacts greatly performances, we have - // to find another solution - // CHECK_EQ(Size, sizeof(QuarantineBatch)); - // return MmapOrDie(Size, "QuarantineBatch"); - return getAllocator().Allocate(Cache_, Size, 1, false); + // TODO(kostyak): figure out the best way to protect the batches. 
+ return getBackendAllocator().Allocate(Cache_, Size, MinAlignment); } void Deallocate(void *Ptr) { - // UnmapOrDie(Ptr, sizeof(QuarantineBatch)); - getAllocator().Deallocate(Cache_, Ptr); + getBackendAllocator().Deallocate(Cache_, Ptr); } AllocatorCache *Cache_; }; typedef Quarantine ScudoQuarantine; -typedef ScudoQuarantine::Cache QuarantineCache; -static thread_local QuarantineCache ThreadQuarantineCache; +typedef ScudoQuarantine::Cache ScudoQuarantineCache; +COMPILER_CHECK(sizeof(ScudoQuarantineCache) + <= sizeof(ScudoThreadContext::QuarantineCachePlaceHolder)); + +AllocatorCache *getAllocatorCache(ScudoThreadContext *ThreadContext) { + return &ThreadContext->Cache; +} + +ScudoQuarantineCache *getQuarantineCache(ScudoThreadContext *ThreadContext) { + return reinterpret_cast< + ScudoQuarantineCache *>(ThreadContext->QuarantineCachePlaceHolder); +} + +Xorshift128Plus *getPrng(ScudoThreadContext *ThreadContext) { + return &ThreadContext->Prng; +} void AllocatorOptions::setFrom(const Flags *f, const CommonFlags *cf) { MayReturnNull = cf->allocator_may_return_null; @@ -288,11 +233,11 @@ f->ZeroContents = ZeroContents; } -struct Allocator { +struct ScudoAllocator { static const uptr MaxAllowedMallocSize = FIRST_32_SECOND_64(2UL << 30, 1ULL << 40); - ScudoAllocator BackendAllocator; + ScudoBackendAllocator BackendAllocator; ScudoQuarantine AllocatorQuarantine; // The fallback caches are used when the thread local caches have been @@ -300,13 +245,14 @@ // be accessed by different threads. StaticSpinMutex FallbackMutex; AllocatorCache FallbackAllocatorCache; - QuarantineCache FallbackQuarantineCache; + ScudoQuarantineCache FallbackQuarantineCache; + Xorshift128Plus GlobalPrng; bool DeallocationTypeMismatch; bool ZeroContents; bool DeleteSizeMismatch; - explicit Allocator(LinkerInitialized) + explicit ScudoAllocator(LinkerInitialized) : AllocatorQuarantine(LINKER_INITIALIZED), FallbackQuarantineCache(LINKER_INITIALIZED) {} @@ -348,38 +294,38 @@ AllocatorQuarantine.Init( static_cast(Options.QuarantineSizeMb) << 20, static_cast(Options.ThreadLocalQuarantineSizeKb) << 10); + FallbackMutex.Init(); BackendAllocator.InitCache(&FallbackAllocatorCache); - Cookie = Prng.Next(); + GlobalPrng.initFromURandom(); + Cookie = GlobalPrng.getNext(); } - // Helper function that checks for a valid Scudo chunk. + // Helper function that checks for a valid Scudo chunk. nullptr isn't. bool isValidPointer(const void *UserPtr) { - if (UNLIKELY(!ThreadInited)) - initThread(); - uptr ChunkBeg = reinterpret_cast(UserPtr); - if (!IsAligned(ChunkBeg, MinAlignment)) { + initThreadMaybe(); + if (!UserPtr) return false; - } - ScudoChunk *Chunk = - reinterpret_cast(ChunkBeg - AlignedChunkHeaderSize); - return Chunk->isValid(); + uptr UserBeg = reinterpret_cast(UserPtr); + if (!IsAligned(UserBeg, MinAlignment)) + return false; + return getScudoChunk(UserBeg)->isValid(); } // Allocates a chunk. 
- void *allocate(uptr Size, uptr Alignment, AllocType Type) { - if (UNLIKELY(!ThreadInited)) - initThread(); - if (!IsPowerOfTwo(Alignment)) { + void *allocate(uptr Size, uptr Alignment, AllocType Type, + bool ForceZeroContents = false) { + initThreadMaybe(); + if (UNLIKELY(!IsPowerOfTwo(Alignment))) { dieWithMessage("ERROR: alignment is not a power of 2\n"); } if (Alignment > MaxAlignment) return BackendAllocator.ReturnNullOrDieOnBadRequest(); if (Alignment < MinAlignment) Alignment = MinAlignment; - if (Size == 0) - Size = 1; if (Size >= MaxAllowedMallocSize) return BackendAllocator.ReturnNullOrDieOnBadRequest(); + if (Size == 0) + Size = 1; uptr NeededSize = RoundUpTo(Size, MinAlignment) + AlignedChunkHeaderSize; if (Alignment > MinAlignment) @@ -394,14 +340,23 @@ // combined allocator to accommodate the situation. bool FromPrimary = PrimaryAllocator::CanAllocate(NeededSize, MinAlignment); + uptr Salt; void *Ptr; - if (LIKELY(!ThreadTornDown)) { - Ptr = BackendAllocator.Allocate(&Cache, NeededSize, - FromPrimary ? MinAlignment : Alignment); + uptr AllocationAlignment = FromPrimary ? MinAlignment : Alignment; + ScudoThreadContext *ThreadContext = getCurrentThreadContext(); + // Based on what backs the ThreadContext, the locking mechanism differ (it + // might not be locked at all for thread_local variables). + if (ThreadContext) { + ThreadContext->Lock(); + Salt = getPrng(ThreadContext)->getNext(); + Ptr = BackendAllocator.Allocate(getAllocatorCache(ThreadContext), + NeededSize, AllocationAlignment); + ThreadContext->Unlock(); } else { - SpinMutexLock l(&FallbackMutex); - Ptr = BackendAllocator.Allocate(&FallbackAllocatorCache, NeededSize, - FromPrimary ? MinAlignment : Alignment); + SpinMutexLock Lock(&FallbackMutex); + Salt = GlobalPrng.getNext(); + Ptr = BackendAllocator.Allocate(&FallbackAllocatorCache, + NeededSize, AllocationAlignment); } if (!Ptr) return BackendAllocator.ReturnNullOrDieOnOOM(); @@ -419,109 +374,131 @@ uptr ActuallyAllocatedSize = BackendAllocator.GetActuallyAllocatedSize( reinterpret_cast(AllocBeg)); // If requested, we will zero out the entire contents of the returned chunk. - if (ZeroContents && FromPrimary) + if ((ForceZeroContents || ZeroContents) && FromPrimary) memset(Ptr, 0, ActuallyAllocatedSize); - uptr ChunkBeg = AllocBeg + AlignedChunkHeaderSize; - if (!IsAligned(ChunkBeg, Alignment)) - ChunkBeg = RoundUpTo(ChunkBeg, Alignment); - CHECK_LE(ChunkBeg + Size, AllocBeg + NeededSize); - ScudoChunk *Chunk = - reinterpret_cast(ChunkBeg - AlignedChunkHeaderSize); + uptr UserBeg = AllocBeg + AlignedChunkHeaderSize; + if (!IsAligned(UserBeg, Alignment)) + UserBeg = RoundUpTo(UserBeg, Alignment); + CHECK_LE(UserBeg + Size, AllocBeg + NeededSize); UnpackedHeader Header = {}; Header.State = ChunkAllocated; - uptr Offset = ChunkBeg - AlignedChunkHeaderSize - AllocBeg; + uptr Offset = UserBeg - AlignedChunkHeaderSize - AllocBeg; Header.Offset = Offset >> MinAlignmentLog; Header.AllocType = Type; Header.UnusedBytes = ActuallyAllocatedSize - Offset - AlignedChunkHeaderSize - Size; - Header.Salt = static_cast(Prng.Next()); - Chunk->storeHeader(&Header); - void *UserPtr = reinterpret_cast(ChunkBeg); - // TODO(kostyak): hooks sound like a terrible idea security wise but might - // be needed for things to work properly? + Header.Salt = static_cast(Salt); + getScudoChunk(UserBeg)->storeHeader(&Header); + void *UserPtr = reinterpret_cast(UserBeg); // if (&__sanitizer_malloc_hook) __sanitizer_malloc_hook(UserPtr, Size); return UserPtr; } + // Place a chunk in the quarantine. 
In the event of a zero-sized quarantine, + // we directly deallocate the chunk, otherwise the flow would lead to the + // chunk being checksummed twice, once before Put and once in Recycle, with + // no additional security value. + void quarantineOrDeallocateChunk(ScudoChunk *Chunk, UnpackedHeader *Header, + uptr Size) { + bool BypassQuarantine = (AllocatorQuarantine.GetCacheSize() == 0); + ScudoThreadContext *ThreadContext = getCurrentThreadContext(); + if (BypassQuarantine) { + Chunk->eraseHeader(); + void *Ptr = Chunk->getAllocBeg(Header); + if (ThreadContext) { + ThreadContext->Lock(); + getBackendAllocator().Deallocate(getAllocatorCache(ThreadContext), Ptr); + ThreadContext->Unlock(); + } else { + SpinMutexLock Lock(&FallbackMutex); + getBackendAllocator().Deallocate(&FallbackAllocatorCache, Ptr); + } + } else { + UnpackedHeader NewHeader = *Header; + NewHeader.State = ChunkQuarantine; + Chunk->compareExchangeHeader(&NewHeader, Header); + if (ThreadContext) { + ThreadContext->Lock(); + AllocatorQuarantine.Put(getQuarantineCache(ThreadContext), + QuarantineCallback( + getAllocatorCache(ThreadContext)), + Chunk, Size); + ThreadContext->Unlock(); + } else { + SpinMutexLock Lock(&FallbackMutex); + AllocatorQuarantine.Put(&FallbackQuarantineCache, + QuarantineCallback(&FallbackAllocatorCache), + Chunk, Size); + } + } + } + // Deallocates a Chunk, which means adding it to the delayed free list (or - // Quarantine). + // Quarantine) or returning it to the backend. void deallocate(void *UserPtr, uptr DeleteSize, AllocType Type) { - if (UNLIKELY(!ThreadInited)) - initThread(); - // TODO(kostyak): see hook comment above + initThreadMaybe(); // if (&__sanitizer_free_hook) __sanitizer_free_hook(UserPtr); if (!UserPtr) return; - uptr ChunkBeg = reinterpret_cast(UserPtr); - if (!IsAligned(ChunkBeg, MinAlignment)) { + uptr UserBeg = reinterpret_cast(UserPtr); + if (UNLIKELY(!IsAligned(UserBeg, MinAlignment))) { dieWithMessage("ERROR: attempted to deallocate a chunk not properly " "aligned at address %p\n", UserPtr); } - ScudoChunk *Chunk = - reinterpret_cast(ChunkBeg - AlignedChunkHeaderSize); + ScudoChunk *Chunk = getScudoChunk(UserBeg); UnpackedHeader OldHeader; Chunk->loadHeader(&OldHeader); - if (OldHeader.State != ChunkAllocated) { + if (UNLIKELY(OldHeader.State != ChunkAllocated)) { dieWithMessage("ERROR: invalid chunk state when deallocating address " "%p\n", UserPtr); } - uptr UsableSize = Chunk->getUsableSize(&OldHeader); - UnpackedHeader NewHeader = OldHeader; - NewHeader.State = ChunkQuarantine; - Chunk->compareExchangeHeader(&NewHeader, &OldHeader); if (DeallocationTypeMismatch) { // The deallocation type has to match the allocation one. - if (NewHeader.AllocType != Type) { + if (OldHeader.AllocType != Type) { // With the exception of memalign'd Chunks, that can be still be free'd. 
- if (NewHeader.AllocType != FromMemalign || Type != FromMalloc) { + if (OldHeader.AllocType != FromMemalign || Type != FromMalloc) { dieWithMessage("ERROR: allocation type mismatch on address %p\n", - Chunk); + UserPtr); } } } - uptr Size = UsableSize - OldHeader.UnusedBytes; + uptr UsableSize = Chunk->getUsableSize(&OldHeader); if (DeleteSizeMismatch) { + uptr Size = UsableSize - OldHeader.UnusedBytes; if (DeleteSize && DeleteSize != Size) { dieWithMessage("ERROR: invalid sized delete on chunk at address %p\n", - Chunk); + UserPtr); } } - - if (LIKELY(!ThreadTornDown)) { - AllocatorQuarantine.Put(&ThreadQuarantineCache, - QuarantineCallback(&Cache), Chunk, UsableSize); - } else { - SpinMutexLock l(&FallbackMutex); - AllocatorQuarantine.Put(&FallbackQuarantineCache, - QuarantineCallback(&FallbackAllocatorCache), - Chunk, UsableSize); - } + quarantineOrDeallocateChunk(Chunk, &OldHeader, UsableSize); } // Reallocates a chunk. We can save on a new allocation if the new requested // size still fits in the chunk. void *reallocate(void *OldPtr, uptr NewSize) { - if (UNLIKELY(!ThreadInited)) - initThread(); - uptr ChunkBeg = reinterpret_cast(OldPtr); - ScudoChunk *Chunk = - reinterpret_cast(ChunkBeg - AlignedChunkHeaderSize); + initThreadMaybe(); + uptr UserBeg = reinterpret_cast(OldPtr); + if (UNLIKELY(!IsAligned(UserBeg, MinAlignment))) { + dieWithMessage("ERROR: attempted to reallocate a chunk not properly " + "aligned at address %p\n", OldPtr); + } + ScudoChunk *Chunk = getScudoChunk(UserBeg); UnpackedHeader OldHeader; Chunk->loadHeader(&OldHeader); - if (OldHeader.State != ChunkAllocated) { + if (UNLIKELY(OldHeader.State != ChunkAllocated)) { dieWithMessage("ERROR: invalid chunk state when reallocating address " "%p\n", OldPtr); } - uptr Size = Chunk->getUsableSize(&OldHeader); - if (OldHeader.AllocType != FromMalloc) { + if (UNLIKELY(OldHeader.AllocType != FromMalloc)) { dieWithMessage("ERROR: invalid chunk type when reallocating address %p\n", - Chunk); + OldPtr); } - UnpackedHeader NewHeader = OldHeader; + uptr UsableSize = Chunk->getUsableSize(&OldHeader); // The new size still fits in the current chunk. - if (NewSize <= Size) { - NewHeader.UnusedBytes = Size - NewSize; + if (NewSize <= UsableSize) { + UnpackedHeader NewHeader = OldHeader; + NewHeader.UnusedBytes = UsableSize - NewSize; Chunk->compareExchangeHeader(&NewHeader, &OldHeader); return OldPtr; } @@ -529,36 +506,28 @@ // old one. void *NewPtr = allocate(NewSize, MinAlignment, FromMalloc); if (NewPtr) { - uptr OldSize = Size - OldHeader.UnusedBytes; + uptr OldSize = UsableSize - OldHeader.UnusedBytes; memcpy(NewPtr, OldPtr, Min(NewSize, OldSize)); - NewHeader.State = ChunkQuarantine; - Chunk->compareExchangeHeader(&NewHeader, &OldHeader); - if (LIKELY(!ThreadTornDown)) { - AllocatorQuarantine.Put(&ThreadQuarantineCache, - QuarantineCallback(&Cache), Chunk, Size); - } else { - SpinMutexLock l(&FallbackMutex); - AllocatorQuarantine.Put(&FallbackQuarantineCache, - QuarantineCallback(&FallbackAllocatorCache), - Chunk, Size); - } + quarantineOrDeallocateChunk(Chunk, &OldHeader, UsableSize); } return NewPtr; } + ScudoChunk *getScudoChunk(uptr UserBeg) { + return reinterpret_cast(UserBeg - AlignedChunkHeaderSize); + } + // Helper function that returns the actual usable size of a chunk. 
uptr getUsableSize(const void *Ptr) { - if (UNLIKELY(!ThreadInited)) - initThread(); + initThreadMaybe(); if (!Ptr) return 0; - uptr ChunkBeg = reinterpret_cast(Ptr); - ScudoChunk *Chunk = - reinterpret_cast(ChunkBeg - AlignedChunkHeaderSize); + uptr UserBeg = reinterpret_cast(Ptr); + ScudoChunk *Chunk = getScudoChunk(UserBeg); UnpackedHeader Header; Chunk->loadHeader(&Header); // Getting the usable size of a chunk only makes sense if it's allocated. - if (Header.State != ChunkAllocated) { + if (UNLIKELY(Header.State != ChunkAllocated)) { dieWithMessage("ERROR: invalid chunk state when sizing address %p\n", Ptr); } @@ -566,35 +535,31 @@ } void *calloc(uptr NMemB, uptr Size) { - if (UNLIKELY(!ThreadInited)) - initThread(); + initThreadMaybe(); uptr Total = NMemB * Size; - if (Size != 0 && Total / Size != NMemB) // Overflow check + if (Size != 0 && Total / Size != NMemB) // Overflow check return BackendAllocator.ReturnNullOrDieOnBadRequest(); - void *Ptr = allocate(Total, MinAlignment, FromMalloc); - // If ZeroContents, the content of the chunk has already been zero'd out. - if (!ZeroContents && Ptr && BackendAllocator.FromPrimary(Ptr)) - memset(Ptr, 0, getUsableSize(Ptr)); - return Ptr; + return allocate(Total, MinAlignment, FromMalloc, true); } - void drainQuarantine() { - AllocatorQuarantine.Drain(&ThreadQuarantineCache, - QuarantineCallback(&Cache)); + void commitBack(ScudoThreadContext *ThreadContext) { + AllocatorCache *Cache = getAllocatorCache(ThreadContext); + AllocatorQuarantine.Drain(getQuarantineCache(ThreadContext), + QuarantineCallback(Cache)); + BackendAllocator.DestroyCache(Cache); } uptr getStats(AllocatorStat StatType) { - if (UNLIKELY(!ThreadInited)) - initThread(); + initThreadMaybe(); uptr stats[AllocatorStatCount]; BackendAllocator.GetStats(stats); return stats[StatType]; } }; -static Allocator Instance(LINKER_INITIALIZED); +static ScudoAllocator Instance(LINKER_INITIALIZED); -static ScudoAllocator &getAllocator() { +ScudoBackendAllocator &getBackendAllocator() { return Instance.BackendAllocator; } @@ -602,8 +567,8 @@ Instance.init(Options); } -void drainQuarantine() { - Instance.drainQuarantine(); +void ScudoThreadContext::commitBack() { + Instance.commitBack(this); } void *scudoMalloc(uptr Size, AllocType Type) { Index: lib/scudo/scudo_allocator_secondary.h =================================================================== --- lib/scudo/scudo_allocator_secondary.h +++ lib/scudo/scudo_allocator_secondary.h @@ -88,8 +88,11 @@ // The primary adds the whole class size to the stats when allocating a // chunk, so we will do something similar here. But we will not account for // the guard pages. 
- Stats->Add(AllocatorStatAllocated, MapSize - 2 * PageSize); - Stats->Add(AllocatorStatMapped, MapSize - 2 * PageSize); + { + SpinMutexLock l(&StatsMutex); + Stats->Add(AllocatorStatAllocated, MapSize - 2 * PageSize); + Stats->Add(AllocatorStatMapped, MapSize - 2 * PageSize); + } return reinterpret_cast(UserBeg); } @@ -112,8 +115,11 @@ void Deallocate(AllocatorStats *Stats, void *Ptr) { SecondaryHeader *Header = getHeader(Ptr); - Stats->Sub(AllocatorStatAllocated, Header->MapSize - 2 * PageSize); - Stats->Sub(AllocatorStatMapped, Header->MapSize - 2 * PageSize); + { + SpinMutexLock l(&StatsMutex); + Stats->Sub(AllocatorStatAllocated, Header->MapSize - 2 * PageSize); + Stats->Sub(AllocatorStatMapped, Header->MapSize - 2 * PageSize); + } UnmapOrDie(reinterpret_cast(Header->MapBeg), Header->MapSize); } @@ -127,7 +133,7 @@ uptr GetActuallyAllocatedSize(void *Ptr) { SecondaryHeader *Header = getHeader(Ptr); - // Deduct PageSize as MapEnd includes the trailing guard page. + // Deduct PageSize as MapSize includes the trailing guard page. uptr MapEnd = Header->MapBeg + Header->MapSize - PageSize; return MapEnd - reinterpret_cast(Ptr); } @@ -182,6 +188,7 @@ const uptr SecondaryHeaderSize = sizeof(SecondaryHeader); const uptr HeadersSize = SecondaryHeaderSize + AlignedChunkHeaderSize; uptr PageSize; + SpinMutex StatsMutex; atomic_uint8_t MayReturnNull; }; Index: lib/scudo/scudo_thread.h =================================================================== --- /dev/null +++ lib/scudo/scudo_thread.h @@ -0,0 +1,46 @@ +//===-- scudo_thread.h ------------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// TODO(kostyak): description. +/// +//===----------------------------------------------------------------------===// + +#ifndef SCUDO_THREAD_H_ +#define SCUDO_THREAD_H_ + +#include "scudo_allocator.h" +#include "scudo_utils.h" + +#include "sanitizer_common/sanitizer_platform.h" + +namespace __scudo { + +struct ScudoThreadContext { + public: + StaticSpinMutex Mutex; + AllocatorCache Cache; + Xorshift128Plus Prng; + uptr QuarantineCachePlaceHolder[4]; + void init() { + Mutex.Init(); + getBackendAllocator().InitCache(&Cache); + Prng.initFromURandom(); + memset(QuarantineCachePlaceHolder, 0, sizeof(QuarantineCachePlaceHolder)); + } + void commitBack(); + void Lock(); + void Unlock(); +}; + +void initThreadMaybe(); +ScudoThreadContext *getCurrentThreadContext(); + +} // namespace __scudo + +#endif // SCUDO_THREAD_H_ Index: lib/scudo/scudo_thread_android.cpp =================================================================== --- /dev/null +++ lib/scudo/scudo_thread_android.cpp @@ -0,0 +1,73 @@ +//===-- scudo_thread_android.cpp --------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// TODO(kostyak): description. 
+/// +//===----------------------------------------------------------------------===// + +#include "sanitizer_common/sanitizer_platform.h" + +#if SANITIZER_LINUX && SANITIZER_ANDROID + +#include "scudo_thread.h" + +#include "sanitizer_common/sanitizer_atomic.h" + +#include + +namespace __scudo { + +static pthread_once_t GlobalInited = PTHREAD_ONCE_INIT; +static pthread_key_t PThreadKey; + +// For Android, and possibly other platforms with stronger memory constraints, +// we can't necessarily afford a context per thread. The alternative adopted +// here is a fixed set of contexts (the number being defined at compile time), +// where contexts are assigned to new threads in a round-robin fashion. + +#ifndef SCUDO_N_CONTEXTS +# define SCUDO_N_CONTEXTS 4 +#endif + +static atomic_uint32_t ThreadContextCurrentIndex; +static ScudoThreadContext *ThreadContexts; + +static void initOnce() { + CHECK_EQ(pthread_key_create(&PThreadKey, NULL), 0); + initScudo(); + ThreadContexts = reinterpret_cast( + MmapOrDie(sizeof(ScudoThreadContext) * SCUDO_N_CONTEXTS, __func__)); + for (int i = 0; i < SCUDO_N_CONTEXTS; i++) + ThreadContexts[i].init(); +} + +void initThreadMaybe() { + pthread_once(&GlobalInited, initOnce); + if (pthread_getspecific(PThreadKey) == NULL) { + u32 Index = atomic_fetch_add(&ThreadContextCurrentIndex, 1, + memory_order_relaxed); + ScudoThreadContext *ThreadContext = + &ThreadContexts[Index % SCUDO_N_CONTEXTS]; + pthread_setspecific(PThreadKey, ThreadContext); + } +} + +ScudoThreadContext *getCurrentThreadContext() { + ScudoThreadContext *ThreadContext = + reinterpret_cast(pthread_getspecific(PThreadKey)); + CHECK(ThreadContext); + return ThreadContext; +} + +void ScudoThreadContext::Lock() { Mutex.Lock(); } +void ScudoThreadContext::Unlock() { Mutex.Unlock(); } + +} // namespace __scudo + +#endif Index: lib/scudo/scudo_thread_linux.cpp =================================================================== --- /dev/null +++ lib/scudo/scudo_thread_linux.cpp @@ -0,0 +1,73 @@ +//===-- scudo_thread_linux.cpp ----------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// TODO(kostyak): description. +/// +//===----------------------------------------------------------------------===// + +#include "sanitizer_common/sanitizer_platform.h" + +#if SANITIZER_LINUX && !SANITIZER_ANDROID + +#include "scudo_thread.h" + +#include +#include + +namespace __scudo { + +static pthread_once_t GlobalInited = PTHREAD_ONCE_INIT; +static pthread_key_t PThreadKey; + +static thread_local bool ThreadInited = false; +static thread_local bool ThreadTornDown = false; +static thread_local ScudoThreadContext ThreadLocalContext; + +static void teardownThread(void *Ptr) { + uptr Iteration = reinterpret_cast(Ptr); + // The glibc POSIX thread-local-storage deallocation routine calls user + // provided destructors in a loop of PTHREAD_DESTRUCTOR_ITERATIONS. + // We want to be called last since other destructors might call free and the + // like, so we wait until PTHREAD_DESTRUCTOR_ITERATIONS before draining the + // quarantine and swallowing the cache. 
+ if (Iteration < PTHREAD_DESTRUCTOR_ITERATIONS) { + pthread_setspecific(PThreadKey, reinterpret_cast(Iteration + 1)); + return; + } + ThreadLocalContext.commitBack(); + ThreadTornDown = true; +} + + +static void initOnce() { + CHECK_EQ(pthread_key_create(&PThreadKey, teardownThread), 0); + initScudo(); +} + +void initThreadMaybe() { + if (LIKELY(ThreadInited)) + return; + pthread_once(&GlobalInited, initOnce); + pthread_setspecific(PThreadKey, reinterpret_cast(1)); + ThreadLocalContext.init(); + ThreadInited = true; +} + +ScudoThreadContext *getCurrentThreadContext() { + if (ThreadTornDown) + return nullptr; + return &ThreadLocalContext; +} + +void ScudoThreadContext::Lock() {} +void ScudoThreadContext::Unlock() {} + +} // namespace __scudo + +#endif // SANITIZER_LINUX && !SANITIZER_ANDROID Index: lib/scudo/scudo_utils.h =================================================================== --- lib/scudo/scudo_utils.h +++ lib/scudo/scudo_utils.h @@ -14,10 +14,10 @@ #ifndef SCUDO_UTILS_H_ #define SCUDO_UTILS_H_ -#include - #include "sanitizer_common/sanitizer_common.h" +#include + namespace __scudo { template @@ -40,8 +40,8 @@ // The state (128 bits) will be stored in thread local storage. struct Xorshift128Plus { public: - Xorshift128Plus(); - u64 Next() { + void initFromURandom(); + u64 getNext() { u64 x = State[0]; const u64 y = State[1]; State[0] = y; Index: lib/scudo/scudo_utils.cpp =================================================================== --- lib/scudo/scudo_utils.cpp +++ lib/scudo/scudo_utils.cpp @@ -153,9 +153,8 @@ } } -// Default constructor for Xorshift128Plus seeds the state with /dev/urandom. // TODO(kostyak): investigate using getrandom() if available. -Xorshift128Plus::Xorshift128Plus() { +void Xorshift128Plus::initFromURandom() { fillRandom(reinterpret_cast(State), sizeof(State)); }
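Note on the shared contexts in scudo_thread_android.cpp: because SCUDO_N_CONTEXTS is
fixed at compile time, several threads can be assigned the same ScudoThreadContext,
which is why every access on Android goes through ScudoThreadContext::Lock()/Unlock().
A minimal standalone sketch of the same round-robin assignment (the Context struct and
function names below are illustrative, not part of the patch):

    #include <atomic>
    #include <cstdint>

    struct Context { /* per-thread allocator cache, PRNG, mutex... */ };

    static constexpr uint32_t NumContexts = 4;   // plays the role of SCUDO_N_CONTEXTS
    static Context Contexts[NumContexts];
    static std::atomic<uint32_t> CurrentIndex{0};

    // Each new thread takes the next slot; the counter only grows and the modulo
    // wraps it, so once all slots are handed out, threads start sharing contexts
    // and must synchronize on the per-context mutex.
    Context *assignContext() {
      uint32_t Index = CurrentIndex.fetch_add(1, std::memory_order_relaxed);
      return &Contexts[Index % NumContexts];
    }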
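Note on Xorshift128Plus: the scudo_utils.h hunk above only shows the beginning of
getNext(). For reference, a self-contained sketch of the standard xorshift128+ step the
class is named after (using one common published set of shift constants, 23/17/26; this
is an illustration, not a copy of the Scudo source):

    #include <cstdint>

    struct Xorshift128Plus {
      uint64_t State[2];  // must be seeded with real entropy, e.g. from /dev/urandom

      uint64_t getNext() {
        uint64_t x = State[0];
        const uint64_t y = State[1];
        State[0] = y;                              // shift the state window
        x ^= x << 23;                              // xorshift on the older word
        State[1] = x ^ y ^ (x >> 17) ^ (y >> 26);  // mix both words into the new state
        return State[1] + y;                       // output is the sum of the two words
      }
    };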
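Note on the Offset field used in allocate() and getAllocBeg(): the header stores the
distance between the backend allocation start and the (possibly realigned) user pointer,
in MinAlignment units. A small worked example as comments, assuming hypothetical values
MinAlignment = 16, MinAlignmentLog = 4 and AlignedChunkHeaderSize = 16:

    // Backend returns AllocBeg = 0x1000 for a 64-byte-aligned request.
    //   UserBeg = RoundUpTo(0x1000 + 16, 64)   -> 0x1040
    //   Offset  = (0x1040 - 16 - 0x1000) >> 4  -> 0x30 >> 4 = 3
    // Header.Offset = 3 is enough to recover the allocation start later:
    //   AllocBeg = UserBeg - AlignedChunkHeaderSize - (Offset << MinAlignmentLog)
    //            = 0x1040 - 16 - (3 << 4) = 0x1000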