diff --git a/compiler-rt/lib/scudo/standalone/CMakeLists.txt b/compiler-rt/lib/scudo/standalone/CMakeLists.txt --- a/compiler-rt/lib/scudo/standalone/CMakeLists.txt +++ b/compiler-rt/lib/scudo/standalone/CMakeLists.txt @@ -30,6 +30,10 @@ list(APPEND SCUDO_CFLAGS -O3) endif() +if (CMAKE_CXX_COMPILER_ID MATCHES Clang) + list(APPEND SCUDO_CFLAGS -Werror=thread-safety) +endif() + set(SCUDO_LINK_FLAGS) list(APPEND SCUDO_LINK_FLAGS -Wl,-z,defs,-z,now,-z,relro) diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h --- a/compiler-rt/lib/scudo/standalone/combined.h +++ b/compiler-rt/lib/scudo/standalone/combined.h @@ -245,9 +245,9 @@ // - unlinking the local stats from the global ones (destroying the cache does // the last two items). void commitBack(TSD *TSD) { - Quarantine.drain(&TSD->QuarantineCache, - QuarantineCallback(*this, TSD->Cache)); - TSD->Cache.destroy(&Stats); + Quarantine.drain(&TSD->getQuarantineCache(), + QuarantineCallback(*this, TSD->getCache())); + TSD->getCache().destroy(&Stats); } ALWAYS_INLINE void *getHeaderTaggedPointer(void *Ptr) { @@ -301,7 +301,7 @@ NOINLINE void *allocate(uptr Size, Chunk::Origin Origin, uptr Alignment = MinAlignment, - bool ZeroContents = false) { + bool ZeroContents = false) NO_THREAD_SAFETY_ANALYSIS { initThreadMaybe(); const Options Options = Primary.Options.load(); @@ -371,23 +371,24 @@ DCHECK_NE(ClassId, 0U); bool UnlockRequired; auto *TSD = TSDRegistry.getTSDAndLock(&UnlockRequired); - Block = TSD->Cache.allocate(ClassId); + Block = TSD->getCache().allocate(ClassId); // If the allocation failed, the most likely reason with a 32-bit primary // is the region being full. In that event, retry in each successively // larger class until it fits. If it fails to fit in the largest class, // fallback to the Secondary. if (UNLIKELY(!Block)) { while (ClassId < SizeClassMap::LargestClassId && !Block) - Block = TSD->Cache.allocate(++ClassId); + Block = TSD->getCache().allocate(++ClassId); if (!Block) ClassId = 0; } if (UnlockRequired) TSD->unlock(); } - if (UNLIKELY(ClassId == 0)) + if (UNLIKELY(ClassId == 0)) { Block = Secondary.allocate(Options, Size, Alignment, &SecondaryBlockEnd, FillContents); + } if (UNLIKELY(!Block)) { if (Options.get(OptionBit::MayReturnNull)) @@ -691,7 +692,7 @@ // TODO(kostyak): disable() is currently best-effort. There are some small // windows of time when an allocation could still succeed after // this function finishes. We will revisit that later. - void disable() { + void disable() NO_THREAD_SAFETY_ANALYSIS { initThreadMaybe(); #ifdef GWP_ASAN_HOOKS GuardedAlloc.disable(); @@ -703,7 +704,7 @@ Secondary.disable(); } - void enable() { + void enable() NO_THREAD_SAFETY_ANALYSIS { initThreadMaybe(); Secondary.enable(); Primary.enable(); @@ -722,9 +723,7 @@ // sizing purposes. 
uptr getStats(char *Buffer, uptr Size) { ScopedString Str; - disable(); const uptr Length = getStats(&Str) + 1; - enable(); if (Length < Size) Size = Length; if (Buffer && Size) { @@ -736,9 +735,7 @@ void printStats() { ScopedString Str; - disable(); getStats(&Str); - enable(); Str.output(); } @@ -1101,7 +1098,8 @@ } void quarantineOrDeallocateChunk(Options Options, void *TaggedPtr, - Chunk::UnpackedHeader *Header, uptr Size) { + Chunk::UnpackedHeader *Header, + uptr Size) NO_THREAD_SAFETY_ANALYSIS { void *Ptr = getHeaderTaggedPointer(TaggedPtr); Chunk::UnpackedHeader NewHeader = *Header; // If the quarantine is disabled, the actual size of a chunk is 0 or larger @@ -1143,7 +1141,7 @@ if (LIKELY(ClassId)) { bool UnlockRequired; auto *TSD = TSDRegistry.getTSDAndLock(&UnlockRequired); - TSD->Cache.deallocate(ClassId, BlockBegin); + TSD->getCache().deallocate(ClassId, BlockBegin); if (UnlockRequired) TSD->unlock(); } else { @@ -1155,8 +1153,8 @@ } else { bool UnlockRequired; auto *TSD = TSDRegistry.getTSDAndLock(&UnlockRequired); - Quarantine.put(&TSD->QuarantineCache, - QuarantineCallback(*this, TSD->Cache), Ptr, Size); + Quarantine.put(&TSD->getQuarantineCache(), + QuarantineCallback(*this, TSD->getCache()), Ptr, Size); if (UnlockRequired) TSD->unlock(); } diff --git a/compiler-rt/lib/scudo/standalone/fuchsia.cpp b/compiler-rt/lib/scudo/standalone/fuchsia.cpp --- a/compiler-rt/lib/scudo/standalone/fuchsia.cpp +++ b/compiler-rt/lib/scudo/standalone/fuchsia.cpp @@ -195,6 +195,8 @@ sync_mutex_unlock(&M); } +void HybridMutex::assertHeldImpl() __TA_NO_THREAD_SAFETY_ANALYSIS {} + u64 getMonotonicTime() { return _zx_clock_get_monotonic(); } u32 getNumberOfCPUs() { return _zx_system_get_num_cpus(); } diff --git a/compiler-rt/lib/scudo/standalone/linux.cpp b/compiler-rt/lib/scudo/standalone/linux.cpp --- a/compiler-rt/lib/scudo/standalone/linux.cpp +++ b/compiler-rt/lib/scudo/standalone/linux.cpp @@ -11,6 +11,7 @@ #if SCUDO_LINUX #include "common.h" +#include "internal_defs.h" #include "linux.h" #include "mutex.h" #include "string_utils.h" @@ -128,6 +129,10 @@ } } +void HybridMutex::assertHeldImpl() { + CHECK(atomic_load(&M, memory_order_acquire) != Unlocked); +} + u64 getMonotonicTime() { timespec TS; clock_gettime(CLOCK_MONOTONIC, &TS); diff --git a/compiler-rt/lib/scudo/standalone/mutex.h b/compiler-rt/lib/scudo/standalone/mutex.h --- a/compiler-rt/lib/scudo/standalone/mutex.h +++ b/compiler-rt/lib/scudo/standalone/mutex.h @@ -11,6 +11,7 @@ #include "atomic_helpers.h" #include "common.h" +#include "thread_annotations.h" #include @@ -20,10 +21,10 @@ namespace scudo { -class HybridMutex { +class CAPABILITY("mutex") HybridMutex { public: - bool tryLock(); - NOINLINE void lock() { + bool tryLock() TRY_ACQUIRE(true); + NOINLINE void lock() ACQUIRE() { if (LIKELY(tryLock())) return; // The compiler may try to fully unroll the loop, ending up in a @@ -40,9 +41,20 @@ } lockSlow(); } - void unlock(); + void unlock() RELEASE(); + + // TODO(chiahungduan): In general, we may want to assert the owner of lock as + // well. Given the current uses of HybridMutex, it's acceptable without + // asserting the owner. Re-evaluate this when we have certain scenarios which + // requires a more fine-grained lock granularity. 
+ ALWAYS_INLINE void assertHeld() ASSERT_CAPABILITY(this) { + if (SCUDO_DEBUG) + assertHeldImpl(); + } private: + void assertHeldImpl(); + static constexpr u8 NumberOfTries = 8U; static constexpr u8 NumberOfYields = 8U; @@ -52,13 +64,13 @@ sync_mutex_t M = {}; #endif - void lockSlow(); + void lockSlow() ACQUIRE(); }; -class ScopedLock { +class SCOPED_CAPABILITY ScopedLock { public: - explicit ScopedLock(HybridMutex &M) : Mutex(M) { Mutex.lock(); } - ~ScopedLock() { Mutex.unlock(); } + explicit ScopedLock(HybridMutex &M) ACQUIRE(M) : Mutex(M) { Mutex.lock(); } + ~ScopedLock() RELEASE() { Mutex.unlock(); } private: HybridMutex &Mutex; diff --git a/compiler-rt/lib/scudo/standalone/primary32.h b/compiler-rt/lib/scudo/standalone/primary32.h --- a/compiler-rt/lib/scudo/standalone/primary32.h +++ b/compiler-rt/lib/scudo/standalone/primary32.h @@ -18,6 +18,7 @@ #include "report.h" #include "stats.h" #include "string_utils.h" +#include "thread_annotations.h" namespace scudo { @@ -62,7 +63,7 @@ static bool canAllocate(uptr Size) { return Size <= SizeClassMap::MaxSize; } - void init(s32 ReleaseToOsInterval) { + void init(s32 ReleaseToOsInterval) NO_THREAD_SAFETY_ANALYSIS { if (SCUDO_FUCHSIA) reportError("SizeClassAllocator32 is not supported on Fuchsia"); @@ -86,7 +87,7 @@ setOption(Option::ReleaseInterval, static_cast(ReleaseToOsInterval)); } - void unmapTestOnly() { + void unmapTestOnly() NO_THREAD_SAFETY_ANALYSIS { while (NumberOfStashedRegions > 0) unmap(reinterpret_cast(RegionsStash[--NumberOfStashedRegions]), RegionSize); @@ -121,11 +122,11 @@ DCHECK_LT(ClassId, NumClasses); SizeClassInfo *Sci = getSizeClassInfo(ClassId); ScopedLock L(Sci->Mutex); - TransferBatch *B = popBatchImpl(C, ClassId); + TransferBatch *B = popBatchImpl(C, ClassId, Sci); if (UNLIKELY(!B)) { if (UNLIKELY(!populateFreeList(C, ClassId, Sci))) return nullptr; - B = popBatchImpl(C, ClassId); + B = popBatchImpl(C, ClassId, Sci); // if `populateFreeList` succeeded, we are supposed to get free blocks. DCHECK_NE(B, nullptr); } @@ -149,7 +150,7 @@ // the blocks. if (Size == 1 && !populateFreeList(C, ClassId, Sci)) return; - pushBlocksImpl(C, ClassId, Array, Size); + pushBlocksImpl(C, ClassId, Sci, Array, Size); Sci->Stats.PushedBlocks += Size; return; } @@ -173,14 +174,14 @@ } ScopedLock L(Sci->Mutex); - pushBlocksImpl(C, ClassId, Array, Size, SameGroup); + pushBlocksImpl(C, ClassId, Sci, Array, Size, SameGroup); Sci->Stats.PushedBlocks += Size; if (ClassId != SizeClassMap::BatchClassId) releaseToOSMaybe(Sci, ClassId); } - void disable() { + void disable() NO_THREAD_SAFETY_ANALYSIS { // The BatchClassId must be locked last since other classes can use it. for (sptr I = static_cast(NumClasses) - 1; I >= 0; I--) { if (static_cast(I) == SizeClassMap::BatchClassId) @@ -192,7 +193,7 @@ PossibleRegions.disable(); } - void enable() { + void enable() NO_THREAD_SAFETY_ANALYSIS { PossibleRegions.enable(); RegionsStashMutex.unlock(); getSizeClassInfo(SizeClassMap::BatchClassId)->Mutex.unlock(); @@ -207,6 +208,10 @@ uptr MinRegionIndex = NumRegions, MaxRegionIndex = 0; for (uptr I = 0; I < NumClasses; I++) { SizeClassInfo *Sci = getSizeClassInfo(I); + // TODO: The call of `iterateOverBlocks` requires to disabling the + // SizeClassAllocator32. We may consider to locking each region on demand + // only. 
+ Sci->Mutex.assertHeld(); if (Sci->MinRegionIndex < MinRegionIndex) MinRegionIndex = Sci->MinRegionIndex; if (Sci->MaxRegionIndex > MaxRegionIndex) @@ -230,6 +235,7 @@ uptr PushedBlocks = 0; for (uptr I = 0; I < NumClasses; I++) { SizeClassInfo *Sci = getSizeClassInfo(I); + ScopedLock L(Sci->Mutex); TotalMapped += Sci->AllocatedUser; PoppedBlocks += Sci->Stats.PoppedBlocks; PushedBlocks += Sci->Stats.PushedBlocks; @@ -237,8 +243,11 @@ Str->append("Stats: SizeClassAllocator32: %zuM mapped in %zu allocations; " "remains %zu\n", TotalMapped >> 20, PoppedBlocks, PoppedBlocks - PushedBlocks); - for (uptr I = 0; I < NumClasses; I++) - getStats(Str, I, 0); + for (uptr I = 0; I < NumClasses; I++) { + SizeClassInfo *Sci = getSizeClassInfo(I); + ScopedLock L(Sci->Mutex); + getStats(Str, I, Sci, 0); + } } bool setOption(Option O, sptr Value) { @@ -297,17 +306,17 @@ struct alignas(SCUDO_CACHE_LINE_SIZE) SizeClassInfo { HybridMutex Mutex; - SinglyLinkedList FreeList; - uptr CurrentRegion; - uptr CurrentRegionAllocated; - SizeClassStats Stats; + SinglyLinkedList FreeList GUARDED_BY(Mutex); + uptr CurrentRegion GUARDED_BY(Mutex); + uptr CurrentRegionAllocated GUARDED_BY(Mutex); + SizeClassStats Stats GUARDED_BY(Mutex); u32 RandState; - uptr AllocatedUser; + uptr AllocatedUser GUARDED_BY(Mutex); // Lowest & highest region index allocated for this size class, to avoid // looping through the whole NumRegions. - uptr MinRegionIndex; - uptr MaxRegionIndex; - ReleaseToOsInfo ReleaseInfo; + uptr MinRegionIndex GUARDED_BY(Mutex); + uptr MaxRegionIndex GUARDED_BY(Mutex); + ReleaseToOsInfo ReleaseInfo GUARDED_BY(Mutex); }; static_assert(sizeof(SizeClassInfo) % SCUDO_CACHE_LINE_SIZE == 0, ""); @@ -342,7 +351,7 @@ return Region; } - uptr allocateRegion(SizeClassInfo *Sci, uptr ClassId) { + uptr allocateRegion(SizeClassInfo *Sci, uptr ClassId) REQUIRES(Sci->Mutex) { DCHECK_LT(ClassId, NumClasses); uptr Region = 0; { @@ -395,10 +404,10 @@ // `SameGroup=true` instead. // // The region mutex needs to be held while calling this method. - void pushBlocksImpl(CacheT *C, uptr ClassId, CompactPtrT *Array, u32 Size, - bool SameGroup = false) { + void pushBlocksImpl(CacheT *C, uptr ClassId, SizeClassInfo *Sci, + CompactPtrT *Array, u32 Size, bool SameGroup = false) + REQUIRES(Sci->Mutex) { DCHECK_GT(Size, 0U); - SizeClassInfo *Sci = getSizeClassInfo(ClassId); auto CreateGroup = [&](uptr GroupId) { BatchGroup *BG = nullptr; @@ -524,8 +533,8 @@ // group id will be considered first. // // The region mutex needs to be held while calling this method. - TransferBatch *popBatchImpl(CacheT *C, uptr ClassId) { - SizeClassInfo *Sci = getSizeClassInfo(ClassId); + TransferBatch *popBatchImpl(CacheT *C, uptr ClassId, SizeClassInfo *Sci) + REQUIRES(Sci->Mutex) { if (Sci->FreeList.empty()) return nullptr; @@ -553,7 +562,8 @@ return B; } - NOINLINE bool populateFreeList(CacheT *C, uptr ClassId, SizeClassInfo *Sci) { + NOINLINE bool populateFreeList(CacheT *C, uptr ClassId, SizeClassInfo *Sci) + REQUIRES(Sci->Mutex) { uptr Region; uptr Offset; // If the size-class currently has a region associated to it, use it. The @@ -608,7 +618,7 @@ // it only happens when it crosses the group size boundary. Instead of // sorting them, treat them as same group here to avoid sorting the // almost-sorted blocks. 
- pushBlocksImpl(C, ClassId, &ShuffleArray[I], N, /*SameGroup=*/true); + pushBlocksImpl(C, ClassId, Sci, &ShuffleArray[I], N, /*SameGroup=*/true); I += N; } @@ -629,8 +639,8 @@ return true; } - void getStats(ScopedString *Str, uptr ClassId, uptr Rss) { - SizeClassInfo *Sci = getSizeClassInfo(ClassId); + void getStats(ScopedString *Str, uptr ClassId, SizeClassInfo *Sci, uptr Rss) + REQUIRES(Sci->Mutex) { if (Sci->AllocatedUser == 0) return; const uptr InUse = Sci->Stats.PoppedBlocks - Sci->Stats.PushedBlocks; @@ -643,7 +653,7 @@ } NOINLINE uptr releaseToOSMaybe(SizeClassInfo *Sci, uptr ClassId, - bool Force = false) { + bool Force = false) REQUIRES(Sci->Mutex) { const uptr BlockSize = getSizeByClassId(ClassId); const uptr PageSize = getPageSizeCached(); @@ -749,14 +759,15 @@ SizeClassInfo SizeClassInfoArray[NumClasses] = {}; // Track the regions in use, 0 is unused, otherwise store ClassId + 1. + // FIXME: There is no dedicated lock for `PossibleRegions`. ByteMap PossibleRegions = {}; atomic_s32 ReleaseToOsIntervalMs = {}; // Unless several threads request regions simultaneously from different size // classes, the stash rarely contains more than 1 entry. static constexpr uptr MaxStashedRegions = 4; HybridMutex RegionsStashMutex; - uptr NumberOfStashedRegions = 0; - uptr RegionsStash[MaxStashedRegions] = {}; + uptr NumberOfStashedRegions GUARDED_BY(RegionsStashMutex) = 0; + uptr RegionsStash[MaxStashedRegions] GUARDED_BY(RegionsStashMutex) = {}; }; } // namespace scudo diff --git a/compiler-rt/lib/scudo/standalone/primary64.h b/compiler-rt/lib/scudo/standalone/primary64.h --- a/compiler-rt/lib/scudo/standalone/primary64.h +++ b/compiler-rt/lib/scudo/standalone/primary64.h @@ -18,6 +18,7 @@ #include "release.h" #include "stats.h" #include "string_utils.h" +#include "thread_annotations.h" namespace scudo { @@ -60,7 +61,7 @@ static bool canAllocate(uptr Size) { return Size <= SizeClassMap::MaxSize; } - void init(s32 ReleaseToOsInterval) { + void init(s32 ReleaseToOsInterval) NO_THREAD_SAFETY_ANALYSIS { DCHECK(isAligned(reinterpret_cast(this), alignof(ThisT))); DCHECK_EQ(PrimaryBase, 0U); // Reserve the space required for the Primary. @@ -86,7 +87,7 @@ setOption(Option::ReleaseInterval, static_cast(ReleaseToOsInterval)); } - void unmapTestOnly() { + void unmapTestOnly() NO_THREAD_SAFETY_ANALYSIS { for (uptr I = 0; I < NumClasses; I++) { RegionInfo *Region = getRegionInfo(I); *Region = {}; @@ -101,11 +102,11 @@ DCHECK_LT(ClassId, NumClasses); RegionInfo *Region = getRegionInfo(ClassId); ScopedLock L(Region->Mutex); - TransferBatch *B = popBatchImpl(C, ClassId); + TransferBatch *B = popBatchImpl(C, ClassId, Region); if (UNLIKELY(!B)) { if (UNLIKELY(!populateFreeList(C, ClassId, Region))) return nullptr; - B = popBatchImpl(C, ClassId); + B = popBatchImpl(C, ClassId, Region); // if `populateFreeList` succeeded, we are supposed to get free blocks. DCHECK_NE(B, nullptr); } @@ -129,7 +130,7 @@ // the blocks. 
if (Size == 1 && UNLIKELY(!populateFreeList(C, ClassId, Region))) return; - pushBlocksImpl(C, ClassId, Array, Size); + pushBlocksImpl(C, ClassId, Region, Array, Size); Region->Stats.PushedBlocks += Size; return; } @@ -153,14 +154,14 @@ } ScopedLock L(Region->Mutex); - pushBlocksImpl(C, ClassId, Array, Size, SameGroup); + pushBlocksImpl(C, ClassId, Region, Array, Size, SameGroup); Region->Stats.PushedBlocks += Size; if (ClassId != SizeClassMap::BatchClassId) releaseToOSMaybe(Region, ClassId); } - void disable() { + void disable() NO_THREAD_SAFETY_ANALYSIS { // The BatchClassId must be locked last since other classes can use it. for (sptr I = static_cast(NumClasses) - 1; I >= 0; I--) { if (static_cast(I) == SizeClassMap::BatchClassId) @@ -170,7 +171,7 @@ getRegionInfo(SizeClassMap::BatchClassId)->Mutex.lock(); } - void enable() { + void enable() NO_THREAD_SAFETY_ANALYSIS { getRegionInfo(SizeClassMap::BatchClassId)->Mutex.unlock(); for (uptr I = 0; I < NumClasses; I++) { if (I == SizeClassMap::BatchClassId) @@ -183,7 +184,11 @@ for (uptr I = 0; I < NumClasses; I++) { if (I == SizeClassMap::BatchClassId) continue; - const RegionInfo *Region = getRegionInfo(I); + RegionInfo *Region = getRegionInfo(I); + // TODO: The call of `iterateOverBlocks` requires to disabling the + // SizeClassAllocator64. We may consider to locking each region on demand + // only. + Region->Mutex.assertHeld(); const uptr BlockSize = getSizeByClassId(I); const uptr From = Region->RegionBeg; const uptr To = From + Region->AllocatedUser; @@ -199,6 +204,7 @@ uptr PushedBlocks = 0; for (uptr I = 0; I < NumClasses; I++) { RegionInfo *Region = getRegionInfo(I); + ScopedLock L(Region->Mutex); if (Region->MappedUser) TotalMapped += Region->MappedUser; PoppedBlocks += Region->Stats.PoppedBlocks; @@ -209,8 +215,11 @@ TotalMapped >> 20, 0U, PoppedBlocks, PoppedBlocks - PushedBlocks); - for (uptr I = 0; I < NumClasses; I++) - getStats(Str, I, 0); + for (uptr I = 0; I < NumClasses; I++) { + RegionInfo *Region = getRegionInfo(I); + ScopedLock L(Region->Mutex); + getStats(Str, I, Region, 0); + } } bool setOption(Option O, sptr Value) { @@ -261,15 +270,23 @@ decompactPtrInternal(getCompactPtrBaseByClassId(ClassId), CompactPtr)); } - static BlockInfo findNearestBlock(const char *RegionInfoData, uptr Ptr) { + static BlockInfo findNearestBlock(const char *RegionInfoData, + uptr Ptr) NO_THREAD_SAFETY_ANALYSIS { const RegionInfo *RegionInfoArray = reinterpret_cast(RegionInfoData); + uptr ClassId; uptr MinDistance = -1UL; for (uptr I = 0; I != NumClasses; ++I) { if (I == SizeClassMap::BatchClassId) continue; uptr Begin = RegionInfoArray[I].RegionBeg; + // TODO(chiahungduan): In fact, We need to lock the RegionInfo::Mutex. + // However, the RegionInfoData is passed with const qualifier and lock the + // mutex requires modifying RegionInfoData, which means we need to remove + // the const qualifier. This may lead to another undefined behavior (The + // first one is accessing `AllocatedUser` without locking. It's better to + // pass `RegionInfoData` as `void *` then we can lock the mutex properly. uptr End = Begin + RegionInfoArray[I].AllocatedUser; if (Begin > End || End - Begin < SizeClassMap::getSizeByClassId(I)) continue; @@ -330,15 +347,17 @@ struct UnpaddedRegionInfo { HybridMutex Mutex; - SinglyLinkedList FreeList; + SinglyLinkedList FreeList GUARDED_BY(Mutex); + // This is initialized before thread creation. uptr RegionBeg = 0; - RegionStats Stats = {}; - u32 RandState = 0; - uptr MappedUser = 0; // Bytes mapped for user memory. 
- uptr AllocatedUser = 0; // Bytes allocated for user memory. - MapPlatformData Data = {}; - ReleaseToOsInfo ReleaseInfo = {}; - bool Exhausted = false; + RegionStats Stats GUARDED_BY(Mutex) = {}; + u32 RandState GUARDED_BY(Mutex) = 0; + uptr MappedUser GUARDED_BY(Mutex) = 0; // Bytes mapped for user memory. + uptr + AllocatedUser GUARDED_BY(Mutex) = 0; // Bytes allocated for user memory. + MapPlatformData Data GUARDED_BY(Mutex) = {}; + ReleaseToOsInfo ReleaseInfo GUARDED_BY(Mutex) = {}; + bool Exhausted GUARDED_BY(Mutex) = false; }; struct RegionInfo : UnpaddedRegionInfo { char Padding[SCUDO_CACHE_LINE_SIZE - @@ -401,10 +420,10 @@ // `SameGroup=true` instead. // // The region mutex needs to be held while calling this method. - void pushBlocksImpl(CacheT *C, uptr ClassId, CompactPtrT *Array, u32 Size, - bool SameGroup = false) { + void pushBlocksImpl(CacheT *C, uptr ClassId, RegionInfo *Region, + CompactPtrT *Array, u32 Size, bool SameGroup = false) + REQUIRES(Region->Mutex) { DCHECK_GT(Size, 0U); - RegionInfo *Region = getRegionInfo(ClassId); auto CreateGroup = [&](uptr GroupId) { BatchGroup *BG = nullptr; @@ -530,8 +549,8 @@ // group id will be considered first. // // The region mutex needs to be held while calling this method. - TransferBatch *popBatchImpl(CacheT *C, uptr ClassId) { - RegionInfo *Region = getRegionInfo(ClassId); + TransferBatch *popBatchImpl(CacheT *C, uptr ClassId, RegionInfo *Region) + REQUIRES(Region->Mutex) { if (Region->FreeList.empty()) return nullptr; @@ -560,7 +579,8 @@ return B; } - NOINLINE bool populateFreeList(CacheT *C, uptr ClassId, RegionInfo *Region) { + NOINLINE bool populateFreeList(CacheT *C, uptr ClassId, RegionInfo *Region) + REQUIRES(Region->Mutex) { const uptr Size = getSizeByClassId(ClassId); const u16 MaxCount = TransferBatch::getMaxCached(Size); @@ -577,7 +597,11 @@ if (!Region->Exhausted) { Region->Exhausted = true; ScopedString Str; - getStats(&Str); + // FIXME: getStats() needs to go over all the regions and will take + // the locks of them. Which means we will try to recursively acquire + // the `Region->Mutex` which is not supported. It will be better to + // log this somewhere else. + // getStats(&Str); Str.append( "Scudo OOM: The process has exhausted %zuM for size class %zu.\n", RegionSize >> 20, Size); @@ -623,7 +647,8 @@ // it only happens when it crosses the group size boundary. Instead of // sorting them, treat them as same group here to avoid sorting the // almost-sorted blocks. 
- pushBlocksImpl(C, ClassId, &ShuffleArray[I], N, /*SameGroup=*/true); + pushBlocksImpl(C, ClassId, Region, &ShuffleArray[I], N, + /*SameGroup=*/true); I += N; } @@ -634,8 +659,8 @@ return true; } - void getStats(ScopedString *Str, uptr ClassId, uptr Rss) { - RegionInfo *Region = getRegionInfo(ClassId); + void getStats(ScopedString *Str, uptr ClassId, RegionInfo *Region, uptr Rss) + REQUIRES(Region->Mutex) { if (Region->MappedUser == 0) return; const uptr InUse = Region->Stats.PoppedBlocks - Region->Stats.PushedBlocks; @@ -652,7 +677,7 @@ } NOINLINE uptr releaseToOSMaybe(RegionInfo *Region, uptr ClassId, - bool Force = false) { + bool Force = false) REQUIRES(Region->Mutex) { const uptr BlockSize = getSizeByClassId(ClassId); const uptr PageSize = getPageSizeCached(); diff --git a/compiler-rt/lib/scudo/standalone/quarantine.h b/compiler-rt/lib/scudo/standalone/quarantine.h --- a/compiler-rt/lib/scudo/standalone/quarantine.h +++ b/compiler-rt/lib/scudo/standalone/quarantine.h @@ -12,6 +12,7 @@ #include "list.h" #include "mutex.h" #include "string_utils.h" +#include "thread_annotations.h" namespace scudo { @@ -172,7 +173,7 @@ typedef QuarantineCache CacheT; using ThisT = GlobalQuarantine; - void init(uptr Size, uptr CacheSize) { + void init(uptr Size, uptr CacheSize) NO_THREAD_SAFETY_ANALYSIS { DCHECK(isAligned(reinterpret_cast(this), alignof(ThisT))); DCHECK_EQ(atomic_load_relaxed(&MaxSize), 0U); DCHECK_EQ(atomic_load_relaxed(&MinSize), 0U); @@ -197,16 +198,19 @@ drain(C, Cb); } - void NOINLINE drain(CacheT *C, Callback Cb) { + void NOINLINE drain(CacheT *C, Callback Cb) EXCLUDES(CacheMutex) { + bool needRecycle = false; { ScopedLock L(CacheMutex); Cache.transfer(C); + needRecycle = Cache.getSize() > getMaxSize(); } - if (Cache.getSize() > getMaxSize() && RecycleMutex.tryLock()) + + if (needRecycle && RecycleMutex.tryLock()) recycle(atomic_load_relaxed(&MinSize), Cb); } - void NOINLINE drainAndRecycle(CacheT *C, Callback Cb) { + void NOINLINE drainAndRecycle(CacheT *C, Callback Cb) EXCLUDES(CacheMutex) { { ScopedLock L(CacheMutex); Cache.transfer(C); @@ -215,20 +219,21 @@ recycle(0, Cb); } - void getStats(ScopedString *Str) const { + void getStats(ScopedString *Str) EXCLUDES(CacheMutex) { + ScopedLock L(CacheMutex); // It assumes that the world is stopped, just as the allocator's printStats. Cache.getStats(Str); Str->append("Quarantine limits: global: %zuK; thread local: %zuK\n", getMaxSize() >> 10, getCacheSize() >> 10); } - void disable() { + void disable() NO_THREAD_SAFETY_ANALYSIS { // RecycleMutex must be locked 1st since we grab CacheMutex within recycle. RecycleMutex.lock(); CacheMutex.lock(); } - void enable() { + void enable() NO_THREAD_SAFETY_ANALYSIS { CacheMutex.unlock(); RecycleMutex.unlock(); } @@ -236,13 +241,14 @@ private: // Read-only data. 
alignas(SCUDO_CACHE_LINE_SIZE) HybridMutex CacheMutex; - CacheT Cache; + CacheT Cache GUARDED_BY(CacheMutex); alignas(SCUDO_CACHE_LINE_SIZE) HybridMutex RecycleMutex; atomic_uptr MinSize = {}; atomic_uptr MaxSize = {}; alignas(SCUDO_CACHE_LINE_SIZE) atomic_uptr MaxCacheSize = {}; - void NOINLINE recycle(uptr MinSize, Callback Cb) { + void NOINLINE recycle(uptr MinSize, Callback Cb) RELEASE(RecycleMutex) + EXCLUDES(CacheMutex) { CacheT Tmp; Tmp.init(); { diff --git a/compiler-rt/lib/scudo/standalone/release.h b/compiler-rt/lib/scudo/standalone/release.h --- a/compiler-rt/lib/scudo/standalone/release.h +++ b/compiler-rt/lib/scudo/standalone/release.h @@ -12,6 +12,7 @@ #include "common.h" #include "list.h" #include "mutex.h" +#include "thread_annotations.h" namespace scudo { @@ -76,7 +77,12 @@ Buffer = nullptr; } - void reset(uptr NumberOfRegion, uptr CountersPerRegion, uptr MaxValue) { + // Lock of `StaticBuffer` is acquired conditionally and there's no easy way to + // specify the thread-safety attribute properly in current code structure. + // Besides, it's the only place we may want to check thread safety. Therefore, + // it's fine to bypass the thread-safety analysis now. + void reset(uptr NumberOfRegion, uptr CountersPerRegion, + uptr MaxValue) NO_THREAD_SAFETY_ANALYSIS { DCHECK_GT(NumberOfRegion, 0); DCHECK_GT(CountersPerRegion, 0); DCHECK_GT(MaxValue, 0); @@ -181,7 +187,7 @@ [[no_unique_address]] MapPlatformData MapData = {}; static HybridMutex Mutex; - static uptr StaticBuffer[StaticBufferCount]; + static uptr StaticBuffer[StaticBufferCount] GUARDED_BY(Mutex); }; template class FreePagesRangeTracker { diff --git a/compiler-rt/lib/scudo/standalone/secondary.h b/compiler-rt/lib/scudo/standalone/secondary.h --- a/compiler-rt/lib/scudo/standalone/secondary.h +++ b/compiler-rt/lib/scudo/standalone/secondary.h @@ -17,6 +17,7 @@ #include "options.h" #include "stats.h" #include "string_utils.h" +#include "thread_annotations.h" namespace scudo { @@ -133,7 +134,7 @@ Config::SecondaryCacheEntriesArraySize, ""); - void init(s32 ReleaseToOsInterval) { + void init(s32 ReleaseToOsInterval) NO_THREAD_SAFETY_ANALYSIS { DCHECK_EQ(EntriesCount, 0U); setOption(Option::MaxCacheEntriesCount, static_cast(Config::SecondaryCacheDefaultMaxEntriesCount)); @@ -142,7 +143,7 @@ setOption(Option::ReleaseInterval, static_cast(ReleaseToOsInterval)); } - void store(Options Options, LargeBlock::Header *H) { + void store(Options Options, LargeBlock::Header *H) EXCLUDES(Mutex) { if (!canCache(H->CommitSize)) return unmap(H); @@ -227,7 +228,7 @@ } bool retrieve(Options Options, uptr Size, uptr Alignment, - LargeBlock::Header **H, bool *Zeroed) { + LargeBlock::Header **H, bool *Zeroed) EXCLUDES(Mutex) { const uptr PageSize = getPageSizeCached(); const u32 MaxCount = atomic_load_relaxed(&MaxEntriesCount); bool Found = false; @@ -249,8 +250,9 @@ if (HeaderPos > CommitBase + CommitSize) continue; if (HeaderPos < CommitBase || - AllocPos > CommitBase + PageSize * MaxUnusedCachePages) + AllocPos > CommitBase + PageSize * MaxUnusedCachePages) { continue; + } Found = true; Entry = Entries[I]; Entries[I].CommitBase = 0; @@ -279,6 +281,8 @@ (*H)->MapBase = Entry.MapBase; (*H)->MapSize = Entry.MapSize; (*H)->Data = Entry.Data; + + ScopedLock L(Mutex); EntriesCount--; } return Found; @@ -315,7 +319,7 @@ void releaseToOS() { releaseOlderThan(UINT64_MAX); } - void disableMemoryTagging() { + void disableMemoryTagging() EXCLUDES(Mutex) { ScopedLock L(Mutex); for (u32 I = 0; I != Config::SecondaryCacheQuarantineSize; ++I) { if 
(Quarantine[I].CommitBase) { @@ -332,9 +336,9 @@ QuarantinePos = -1U; } - void disable() { Mutex.lock(); } + void disable() NO_THREAD_SAFETY_ANALYSIS { Mutex.lock(); } - void enable() { Mutex.unlock(); } + void enable() NO_THREAD_SAFETY_ANALYSIS { Mutex.unlock(); } void unmapTestOnly() { empty(); } @@ -375,7 +379,7 @@ u64 Time; }; - void releaseIfOlderThan(CachedBlock &Entry, u64 Time) { + void releaseIfOlderThan(CachedBlock &Entry, u64 Time) REQUIRES(Mutex) { if (!Entry.CommitBase || !Entry.Time) return; if (Entry.Time > Time) { @@ -387,7 +391,7 @@ Entry.Time = 0; } - void releaseOlderThan(u64 Time) { + void releaseOlderThan(u64 Time) EXCLUDES(Mutex) { ScopedLock L(Mutex); if (!EntriesCount || OldestTime == 0 || OldestTime > Time) return; @@ -399,22 +403,24 @@ } HybridMutex Mutex; - u32 EntriesCount = 0; - u32 QuarantinePos = 0; + u32 EntriesCount GUARDED_BY(Mutex) = 0; + u32 QuarantinePos GUARDED_BY(Mutex) = 0; atomic_u32 MaxEntriesCount = {}; atomic_uptr MaxEntrySize = {}; - u64 OldestTime = 0; - u32 IsFullEvents = 0; + u64 OldestTime GUARDED_BY(Mutex) = 0; + u32 IsFullEvents GUARDED_BY(Mutex) = 0; atomic_s32 ReleaseToOsIntervalMs = {}; - CachedBlock Entries[Config::SecondaryCacheEntriesArraySize] = {}; + CachedBlock + Entries[Config::SecondaryCacheEntriesArraySize] GUARDED_BY(Mutex) = {}; NonZeroLengthArray - Quarantine = {}; + Quarantine GUARDED_BY(Mutex) = {}; }; template class MapAllocator { public: - void init(GlobalStats *S, s32 ReleaseToOsInterval = -1) { + void init(GlobalStats *S, + s32 ReleaseToOsInterval = -1) NO_THREAD_SAFETY_ANALYSIS { DCHECK_EQ(AllocatedBytes, 0U); DCHECK_EQ(FreedBytes, 0U); Cache.init(ReleaseToOsInterval); @@ -438,19 +444,21 @@ return getBlockEnd(Ptr) - reinterpret_cast(Ptr); } - void getStats(ScopedString *Str) const; + void getStats(ScopedString *Str); - void disable() { + void disable() NO_THREAD_SAFETY_ANALYSIS { Mutex.lock(); Cache.disable(); } - void enable() { + void enable() NO_THREAD_SAFETY_ANALYSIS { Cache.enable(); Mutex.unlock(); } template void iterateOverBlocks(F Callback) const { + Mutex.assertHeld(); + for (const auto &H : InUseBlocks) { uptr Ptr = reinterpret_cast(&H) + LargeBlock::getHeaderSize(); if (allocatorSupportsMemoryTagging()) @@ -472,14 +480,14 @@ private: typename Config::SecondaryCache Cache; - HybridMutex Mutex; - DoublyLinkedList InUseBlocks; - uptr AllocatedBytes = 0; - uptr FreedBytes = 0; - uptr LargestSize = 0; - u32 NumberOfAllocs = 0; - u32 NumberOfFrees = 0; - LocalStats Stats; + mutable HybridMutex Mutex; + DoublyLinkedList InUseBlocks GUARDED_BY(Mutex); + uptr AllocatedBytes GUARDED_BY(Mutex) = 0; + uptr FreedBytes GUARDED_BY(Mutex) = 0; + uptr LargestSize GUARDED_BY(Mutex) = 0; + u32 NumberOfAllocs GUARDED_BY(Mutex) = 0; + u32 NumberOfFrees GUARDED_BY(Mutex) = 0; + LocalStats Stats GUARDED_BY(Mutex); }; // As with the Primary, the size passed to this function includes any desired @@ -600,7 +608,8 @@ } template -void MapAllocator::deallocate(Options Options, void *Ptr) { +void MapAllocator::deallocate(Options Options, void *Ptr) + EXCLUDES(Mutex) { LargeBlock::Header *H = LargeBlock::getHeader(Ptr); const uptr CommitSize = H->CommitSize; { @@ -615,7 +624,8 @@ } template -void MapAllocator::getStats(ScopedString *Str) const { +void MapAllocator::getStats(ScopedString *Str) EXCLUDES(Mutex) { + ScopedLock L(Mutex); Str->append("Stats: MapAllocator: allocated %u times (%zuK), freed %u times " "(%zuK), remains %u (%zuK) max %zuM\n", NumberOfAllocs, AllocatedBytes >> 10, NumberOfFrees, diff --git 
a/compiler-rt/lib/scudo/standalone/stats.h b/compiler-rt/lib/scudo/standalone/stats.h --- a/compiler-rt/lib/scudo/standalone/stats.h +++ b/compiler-rt/lib/scudo/standalone/stats.h @@ -12,6 +12,7 @@ #include "atomic_helpers.h" #include "list.h" #include "mutex.h" +#include "thread_annotations.h" #include @@ -60,19 +61,19 @@ public: void init() { LocalStats::init(); } - void link(LocalStats *S) { + void link(LocalStats *S) EXCLUDES(Mutex) { ScopedLock L(Mutex); StatsList.push_back(S); } - void unlink(LocalStats *S) { + void unlink(LocalStats *S) EXCLUDES(Mutex) { ScopedLock L(Mutex); StatsList.remove(S); for (uptr I = 0; I < StatCount; I++) add(static_cast(I), S->get(static_cast(I))); } - void get(uptr *S) const { + void get(uptr *S) const EXCLUDES(Mutex) { ScopedLock L(Mutex); for (uptr I = 0; I < StatCount; I++) S[I] = LocalStats::get(static_cast(I)); @@ -85,15 +86,15 @@ S[I] = static_cast(S[I]) >= 0 ? S[I] : 0; } - void lock() { Mutex.lock(); } - void unlock() { Mutex.unlock(); } + void lock() ACQUIRE(Mutex) { Mutex.lock(); } + void unlock() RELEASE(Mutex) { Mutex.unlock(); } - void disable() { lock(); } - void enable() { unlock(); } + void disable() ACQUIRE(Mutex) { lock(); } + void enable() RELEASE(Mutex) { unlock(); } private: mutable HybridMutex Mutex; - DoublyLinkedList StatsList; + DoublyLinkedList StatsList GUARDED_BY(Mutex); }; } // namespace scudo diff --git a/compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt b/compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt --- a/compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt +++ b/compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt @@ -31,6 +31,11 @@ -mno-omit-leaf-frame-pointer) endif() +if (COMPILER_RT_TEST_COMPILER_ID MATCHES Clang) + list(APPEND SCUDO_UNITTEST_CFLAGS -Werror=thread-safety) +endif() + + set(SCUDO_TEST_ARCH ${SCUDO_STANDALONE_SUPPORTED_ARCH}) # gtests requires c++ diff --git a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp --- a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp @@ -435,7 +435,7 @@ EXPECT_NE(Stats.find("Stats: Quarantine"), std::string::npos); } -SCUDO_TYPED_TEST(ScudoCombinedTest, CacheDrain) { +SCUDO_TYPED_TEST(ScudoCombinedTest, CacheDrain) NO_THREAD_SAFETY_ANALYSIS { auto *Allocator = this->Allocator.get(); std::vector V; @@ -447,9 +447,9 @@ bool UnlockRequired; auto *TSD = Allocator->getTSDRegistry()->getTSDAndLock(&UnlockRequired); - EXPECT_TRUE(!TSD->Cache.isEmpty()); - TSD->Cache.drain(); - EXPECT_TRUE(TSD->Cache.isEmpty()); + EXPECT_TRUE(!TSD->getCache().isEmpty()); + TSD->getCache().drain(); + EXPECT_TRUE(TSD->getCache().isEmpty()); if (UnlockRequired) TSD->unlock(); } @@ -724,7 +724,7 @@ bool UnlockRequired; auto *TSD = Allocator->getTSDRegistry()->getTSDAndLock(&UnlockRequired); - TSD->Cache.drain(); + TSD->getCache().drain(); Allocator->releaseToOS(); } diff --git a/compiler-rt/lib/scudo/standalone/tests/mutex_test.cpp b/compiler-rt/lib/scudo/standalone/tests/mutex_test.cpp --- a/compiler-rt/lib/scudo/standalone/tests/mutex_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/mutex_test.cpp @@ -99,3 +99,10 @@ for (scudo::u32 I = 0; I < NumberOfThreads; I++) pthread_join(Threads[I], 0); } + +TEST(ScudoMutexTest, MutexAssertHeld) { + scudo::HybridMutex M; + M.lock(); + M.assertHeld(); + M.unlock(); +} diff --git a/compiler-rt/lib/scudo/standalone/tests/tsd_test.cpp b/compiler-rt/lib/scudo/standalone/tests/tsd_test.cpp --- 
a/compiler-rt/lib/scudo/standalone/tests/tsd_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/tsd_test.cpp @@ -82,7 +82,7 @@ EXPECT_FALSE(Allocator->isInitialized()); auto Registry = Allocator->getTSDRegistry(); - Registry->init(Allocator.get()); + Registry->initOnceMaybe(Allocator.get()); EXPECT_TRUE(Allocator->isInitialized()); } @@ -102,15 +102,15 @@ bool UnlockRequired; auto TSD = Registry->getTSDAndLock(&UnlockRequired); EXPECT_NE(TSD, nullptr); - EXPECT_EQ(TSD->Cache.Canary, 0U); + EXPECT_EQ(TSD->getCache().Canary, 0U); if (UnlockRequired) TSD->unlock(); Registry->initThreadMaybe(Allocator.get(), /*MinimalInit=*/false); TSD = Registry->getTSDAndLock(&UnlockRequired); EXPECT_NE(TSD, nullptr); - EXPECT_EQ(TSD->Cache.Canary, 0U); - memset(&TSD->Cache, 0x42, sizeof(TSD->Cache)); + EXPECT_EQ(TSD->getCache().Canary, 0U); + memset(&TSD->getCache(), 0x42, sizeof(TSD->getCache())); if (UnlockRequired) TSD->unlock(); } @@ -141,14 +141,14 @@ // For an exclusive TSD, the cache should be empty. We cannot guarantee the // same for a shared TSD. if (!UnlockRequired) - EXPECT_EQ(TSD->Cache.Canary, 0U); + EXPECT_EQ(TSD->getCache().Canary, 0U); // Transform the thread id to a uptr to use it as canary. const scudo::uptr Canary = static_cast( std::hash{}(std::this_thread::get_id())); - TSD->Cache.Canary = Canary; + TSD->getCache().Canary = Canary; // Loop a few times to make sure that a concurrent thread isn't modifying it. for (scudo::uptr I = 0; I < 4096U; I++) - EXPECT_EQ(TSD->Cache.Canary, Canary); + EXPECT_EQ(TSD->getCache().Canary, Canary); if (UnlockRequired) TSD->unlock(); } diff --git a/compiler-rt/lib/scudo/standalone/thread_annotations.h b/compiler-rt/lib/scudo/standalone/thread_annotations.h new file mode 100644 --- /dev/null +++ b/compiler-rt/lib/scudo/standalone/thread_annotations.h @@ -0,0 +1,70 @@ +//===-- thread_annotations.h ------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef SCUDO_THREAD_ANNOTATIONS_ +#define SCUDO_THREAD_ANNOTATIONS_ + +// Enable thread safety attributes only with clang. +// The attributes can be safely ignored when compiling with other compilers. +#if defined(__clang__) +#define THREAD_ANNOTATION_ATTRIBUTE_(x) __attribute__((x)) +#else +#define THREAD_ANNOTATION_ATTRIBUTE_(x) // no-op +#endif + +#define CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE_(capability(x)) + +#define SCOPED_CAPABILITY THREAD_ANNOTATION_ATTRIBUTE_(scoped_lockable) + +#define GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE_(guarded_by(x)) + +#define PT_GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE_(pt_guarded_by(x)) + +#define ACQUIRED_BEFORE(...) \ + THREAD_ANNOTATION_ATTRIBUTE_(acquired_before(__VA_ARGS__)) + +#define ACQUIRED_AFTER(...) \ + THREAD_ANNOTATION_ATTRIBUTE_(acquired_after(__VA_ARGS__)) + +#define REQUIRES(...) \ + THREAD_ANNOTATION_ATTRIBUTE_(requires_capability(__VA_ARGS__)) + +#define REQUIRES_SHARED(...) \ + THREAD_ANNOTATION_ATTRIBUTE_(requires_shared_capability(__VA_ARGS__)) + +#define ACQUIRE(...) \ + THREAD_ANNOTATION_ATTRIBUTE_(acquire_capability(__VA_ARGS__)) + +#define ACQUIRE_SHARED(...) \ + THREAD_ANNOTATION_ATTRIBUTE_(acquire_shared_capability(__VA_ARGS__)) + +#define RELEASE(...) \ + THREAD_ANNOTATION_ATTRIBUTE_(release_capability(__VA_ARGS__)) + +#define RELEASE_SHARED(...) 
\ + THREAD_ANNOTATION_ATTRIBUTE_(release_shared_capability(__VA_ARGS__)) + +#define TRY_ACQUIRE(...) \ + THREAD_ANNOTATION_ATTRIBUTE_(try_acquire_capability(__VA_ARGS__)) + +#define TRY_ACQUIRE_SHARED(...) \ + THREAD_ANNOTATION_ATTRIBUTE_(try_acquire_shared_capability(__VA_ARGS__)) + +#define EXCLUDES(...) THREAD_ANNOTATION_ATTRIBUTE_(locks_excluded(__VA_ARGS__)) + +#define ASSERT_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE_(assert_capability(x)) + +#define ASSERT_SHARED_CAPABILITY(x) \ + THREAD_ANNOTATION_ATTRIBUTE_(assert_shared_capability(x)) + +#define RETURN_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE_(lock_returned(x)) + +#define NO_THREAD_SAFETY_ANALYSIS \ + THREAD_ANNOTATION_ATTRIBUTE_(no_thread_safety_analysis) + +#endif // SCUDO_THREAD_ANNOTATIONS_ diff --git a/compiler-rt/lib/scudo/standalone/trusty.cpp b/compiler-rt/lib/scudo/standalone/trusty.cpp --- a/compiler-rt/lib/scudo/standalone/trusty.cpp +++ b/compiler-rt/lib/scudo/standalone/trusty.cpp @@ -76,6 +76,8 @@ void HybridMutex::unlock() {} +void HybridMutex::assertHeldImpl() {} + u64 getMonotonicTime() { timespec TS; clock_gettime(CLOCK_MONOTONIC, &TS); diff --git a/compiler-rt/lib/scudo/standalone/tsd.h b/compiler-rt/lib/scudo/standalone/tsd.h --- a/compiler-rt/lib/scudo/standalone/tsd.h +++ b/compiler-rt/lib/scudo/standalone/tsd.h @@ -12,6 +12,7 @@ #include "atomic_helpers.h" #include "common.h" #include "mutex.h" +#include "thread_annotations.h" #include // for PTHREAD_DESTRUCTOR_ITERATIONS #include @@ -24,21 +25,17 @@ namespace scudo { template struct alignas(SCUDO_CACHE_LINE_SIZE) TSD { - typename Allocator::CacheT Cache; - typename Allocator::QuarantineCacheT QuarantineCache; using ThisT = TSD; u8 DestructorIterations = 0; - void init(Allocator *Instance) { + void init(Allocator *Instance) NO_THREAD_SAFETY_ANALYSIS { DCHECK_EQ(DestructorIterations, 0U); DCHECK(isAligned(reinterpret_cast(this), alignof(ThisT))); Instance->initCache(&Cache); DestructorIterations = PTHREAD_DESTRUCTOR_ITERATIONS; } - void commitBack(Allocator *Instance) { Instance->commitBack(this); } - - inline bool tryLock() { + inline bool tryLock() NO_THREAD_SAFETY_ANALYSIS { if (Mutex.tryLock()) { atomic_store_relaxed(&Precedence, 0); return true; @@ -49,16 +46,36 @@ static_cast(getMonotonicTime() >> FIRST_32_SECOND_64(16, 0))); return false; } - inline void lock() { + inline void lock() NO_THREAD_SAFETY_ANALYSIS { atomic_store_relaxed(&Precedence, 0); Mutex.lock(); } - inline void unlock() { Mutex.unlock(); } + inline void unlock() NO_THREAD_SAFETY_ANALYSIS { Mutex.unlock(); } inline uptr getPrecedence() { return atomic_load_relaxed(&Precedence); } + void commitBack(Allocator *Instance) ASSERT_CAPABILITY(Mutex) { + Instance->commitBack(this); + } + + // Ideally, we may want to assert that all the operations on + // Cache/QuarantineCache always have the `Mutex` acquired. However, the + // current architecture of accessing TSD is not easy to cooperate with the + // thread-safety analysis because of pointer aliasing. So now we just add the + // assertion on the getters of Cache/QuarantineCache. 
+ typename Allocator::CacheT &getCache() ASSERT_CAPABILITY(Mutex) { + return Cache; + } + typename Allocator::QuarantineCacheT &getQuarantineCache() + ASSERT_CAPABILITY(Mutex) { + return QuarantineCache; + } + private: HybridMutex Mutex; atomic_uptr Precedence = {}; + + typename Allocator::CacheT Cache GUARDED_BY(Mutex); + typename Allocator::QuarantineCacheT QuarantineCache GUARDED_BY(Mutex); }; } // namespace scudo diff --git a/compiler-rt/lib/scudo/standalone/tsd_exclusive.h b/compiler-rt/lib/scudo/standalone/tsd_exclusive.h --- a/compiler-rt/lib/scudo/standalone/tsd_exclusive.h +++ b/compiler-rt/lib/scudo/standalone/tsd_exclusive.h @@ -25,7 +25,7 @@ template void teardownThread(void *Ptr); template struct TSDRegistryExT { - void init(Allocator *Instance) { + void init(Allocator *Instance) REQUIRES(Mutex) { DCHECK(!Initialized); Instance->init(); CHECK_EQ(pthread_key_create(&PThreadKey, teardownThread), 0); @@ -33,14 +33,14 @@ Initialized = true; } - void initOnceMaybe(Allocator *Instance) { + void initOnceMaybe(Allocator *Instance) EXCLUDES(Mutex) { ScopedLock L(Mutex); if (LIKELY(Initialized)) return; init(Instance); // Sets Initialized. } - void unmapTestOnly(Allocator *Instance) { + void unmapTestOnly(Allocator *Instance) EXCLUDES(Mutex) { DCHECK(Instance); if (reinterpret_cast(pthread_getspecific(PThreadKey))) { DCHECK_EQ(reinterpret_cast(pthread_getspecific(PThreadKey)), @@ -53,6 +53,7 @@ FallbackTSD.commitBack(Instance); FallbackTSD = {}; State = {}; + ScopedLock L(Mutex); Initialized = false; } @@ -62,7 +63,13 @@ initThread(Instance, MinimalInit); } - ALWAYS_INLINE TSD *getTSDAndLock(bool *UnlockRequired) { + // TODO(chiahungduan): Consider removing the argument `UnlockRequired` by + // embedding the logic into TSD or always locking the TSD. It will enable us + // to properly mark thread annotation here and adding proper runtime + // assertions in the member functions of TSD. For example, assert the lock is + // acquired before calling TSD::commitBack(). + ALWAYS_INLINE TSD * + getTSDAndLock(bool *UnlockRequired) NO_THREAD_SAFETY_ANALYSIS { if (LIKELY(State.InitState == ThreadState::Initialized && !atomic_load(&Disabled, memory_order_acquire))) { *UnlockRequired = false; @@ -75,13 +82,13 @@ // To disable the exclusive TSD registry, we effectively lock the fallback TSD // and force all threads to attempt to use it instead of their local one. 
- void disable() { + void disable() NO_THREAD_SAFETY_ANALYSIS { Mutex.lock(); FallbackTSD.lock(); atomic_store(&Disabled, 1U, memory_order_release); } - void enable() { + void enable() NO_THREAD_SAFETY_ANALYSIS { atomic_store(&Disabled, 0U, memory_order_release); FallbackTSD.unlock(); Mutex.unlock(); @@ -113,7 +120,7 @@ } pthread_key_t PThreadKey = {}; - bool Initialized = false; + bool Initialized GUARDED_BY(Mutex) = false; atomic_u8 Disabled = {}; TSD FallbackTSD; HybridMutex Mutex; @@ -128,7 +135,8 @@ template thread_local ThreadState TSDRegistryExT::State; -template void teardownThread(void *Ptr) { +template +void teardownThread(void *Ptr) NO_THREAD_SAFETY_ANALYSIS { typedef TSDRegistryExT TSDRegistryT; Allocator *Instance = reinterpret_cast(Ptr); // The glibc POSIX thread-local-storage deallocation routine calls user diff --git a/compiler-rt/lib/scudo/standalone/tsd_shared.h b/compiler-rt/lib/scudo/standalone/tsd_shared.h --- a/compiler-rt/lib/scudo/standalone/tsd_shared.h +++ b/compiler-rt/lib/scudo/standalone/tsd_shared.h @@ -24,7 +24,7 @@ template struct TSDRegistrySharedT { - void init(Allocator *Instance) { + void init(Allocator *Instance) REQUIRES(Mutex) { DCHECK(!Initialized); Instance->init(); for (u32 I = 0; I < TSDsArraySize; I++) @@ -35,19 +35,20 @@ Initialized = true; } - void initOnceMaybe(Allocator *Instance) { + void initOnceMaybe(Allocator *Instance) EXCLUDES(Mutex) { ScopedLock L(Mutex); if (LIKELY(Initialized)) return; init(Instance); // Sets Initialized. } - void unmapTestOnly(Allocator *Instance) { + void unmapTestOnly(Allocator *Instance) EXCLUDES(Mutex) { for (u32 I = 0; I < TSDsArraySize; I++) { TSDs[I].commitBack(Instance); TSDs[I] = {}; } setCurrentTSD(nullptr); + ScopedLock L(Mutex); Initialized = false; } @@ -58,7 +59,10 @@ initThread(Instance); } - ALWAYS_INLINE TSD *getTSDAndLock(bool *UnlockRequired) { + // TSDs is an array of locks and which is not supported for marking + // thread-safety capability. + ALWAYS_INLINE TSD * + getTSDAndLock(bool *UnlockRequired) NO_THREAD_SAFETY_ANALYSIS { TSD *TSD = getCurrentTSD(); DCHECK(TSD); *UnlockRequired = true; @@ -75,16 +79,26 @@ return getTSDAndLockSlow(TSD); } - void disable() { + void disable() NO_THREAD_SAFETY_ANALYSIS { Mutex.lock(); + disableAllTSDs(); + } + // TSDs is an array of locks which is not supported for marking thread-safety + // capability. + void disableAllTSDs() NO_THREAD_SAFETY_ANALYSIS { for (u32 I = 0; I < TSDsArraySize; I++) TSDs[I].lock(); } - void enable() { + void enable() NO_THREAD_SAFETY_ANALYSIS { + enableAllTSDs(); + Mutex.unlock(); + } + // TSDs is an array of locks and which is not supported for marking + // thread-safety capability. + void enableAllTSDs() NO_THREAD_SAFETY_ANALYSIS { for (s32 I = static_cast(TSDsArraySize - 1); I >= 0; I--) TSDs[I].unlock(); - Mutex.unlock(); } bool setOption(Option O, sptr Value) { @@ -119,7 +133,7 @@ return reinterpret_cast *>(*getTlsPtr() & ~1ULL); } - bool setNumberOfTSDs(u32 N) { + bool setNumberOfTSDs(u32 N) EXCLUDES(MutexTSDs) { ScopedLock L(MutexTSDs); if (N < NumberOfTSDs) return false; @@ -150,7 +164,7 @@ *getTlsPtr() |= B; } - NOINLINE void initThread(Allocator *Instance) { + NOINLINE void initThread(Allocator *Instance) NO_THREAD_SAFETY_ANALYSIS { initOnceMaybe(Instance); // Initial context assignment is done in a plain round-robin fashion. 
const u32 Index = atomic_fetch_add(&CurrentIndex, 1U, memory_order_relaxed); @@ -158,7 +172,10 @@ Instance->callPostInitCallback(); } - NOINLINE TSD *getTSDAndLockSlow(TSD *CurrentTSD) { + // TSDs is an array of locks which is not supported for marking thread-safety + // capability. + NOINLINE TSD *getTSDAndLockSlow(TSD *CurrentTSD) + EXCLUDES(MutexTSDs) { // Use the Precedence of the current TSD as our random seed. Since we are // in the slow path, it means that tryLock failed, and as a result it's // very likely that said Precedence is non-zero. @@ -202,10 +219,10 @@ } atomic_u32 CurrentIndex = {}; - u32 NumberOfTSDs = 0; - u32 NumberOfCoPrimes = 0; - u32 CoPrimes[TSDsArraySize] = {}; - bool Initialized = false; + u32 NumberOfTSDs GUARDED_BY(MutexTSDs) = 0; + u32 NumberOfCoPrimes GUARDED_BY(MutexTSDs) = 0; + u32 CoPrimes[TSDsArraySize] GUARDED_BY(MutexTSDs) = {}; + bool Initialized GUARDED_BY(Mutex) = false; HybridMutex Mutex; HybridMutex MutexTSDs; TSD TSDs[TSDsArraySize]; diff --git a/compiler-rt/lib/scudo/standalone/wrappers_c.inc b/compiler-rt/lib/scudo/standalone/wrappers_c.inc --- a/compiler-rt/lib/scudo/standalone/wrappers_c.inc +++ b/compiler-rt/lib/scudo/standalone/wrappers_c.inc @@ -238,7 +238,10 @@ if (size < max_size) sizes[size]++; }; + + SCUDO_ALLOCATOR.disable(); SCUDO_ALLOCATOR.iterateOverChunks(0, -1ul, callback, sizes); + SCUDO_ALLOCATOR.enable(); fputs("\n", stream); for (scudo::uptr i = 0; i != max_size; ++i)
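
Note for reviewers unfamiliar with Clang's -Wthread-safety (enabled above via -Werror=thread-safety for Clang builds): the macros introduced in thread_annotations.h compose roughly as in the sketch below. It assumes the patch's HybridMutex, ScopedLock and macro definitions; the class Counter and its members M and Value are illustrative names only, not part of this change.

#include "mutex.h"              // HybridMutex, ScopedLock (as annotated above).
#include "thread_annotations.h" // GUARDED_BY, REQUIRES, EXCLUDES, ...

namespace scudo {

class Counter {
public:
  // EXCLUDES(M) documents that callers must not already hold M. The
  // ScopedLock is accepted because the class is SCOPED_CAPABILITY with
  // ACQUIRE(M) on its constructor and RELEASE() on its destructor.
  void inc() EXCLUDES(M) {
    ScopedLock L(M);
    Value++; // OK: the analysis sees M as held here.
  }

  // REQUIRES(M) shifts the locking duty to the caller; calling get()
  // without holding M is a -Wthread-safety error.
  uptr get() REQUIRES(M) { return Value; }

private:
  HybridMutex M;
  uptr Value GUARDED_BY(M) = 0; // Any unguarded access is diagnosed.
};

} // namespace scudo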
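
The assertHeld()/ASSERT_CAPABILITY pairing added to HybridMutex covers the paths where the lock is taken far from the use, e.g. the iterateOverBlocks() methods in primary32.h, primary64.h and MapAllocator, which rely on the caller having called disable() first. A minimal sketch of that pattern, assuming the patch's headers; Walker, LastSeen and walk() are hypothetical names.

#include "mutex.h"
#include "thread_annotations.h"

namespace scudo {

class Walker {
public:
  // Lock/unlock happen on a separate path (cf. Allocator::disable()).
  void disable() ACQUIRE(M) { M.lock(); }
  void enable() RELEASE(M) { M.unlock(); }

  // The analysis cannot prove that disable() ran first, so the capability
  // is asserted instead; with SCUDO_DEBUG, assertHeld() also verifies it
  // at runtime through assertHeldImpl().
  void walk() {
    M.assertHeld();
    LastSeen++; // OK after the assertion.
  }

private:
  HybridMutex M;
  uptr LastSeen GUARDED_BY(M) = 0;
};

} // namespace scudo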
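
NO_THREAD_SAFETY_ANALYSIS is used above where locking is conditional (getTSDAndLock(&UnlockRequired)) or iterates over an array of mutexes, shapes the analysis cannot express. A sketch of the conditional case; maybeLock() is a hypothetical helper, not part of the patch.

#include "mutex.h"
#include "thread_annotations.h"

namespace scudo {

// Whether M is held on return depends on runtime state, so neither
// ACQUIRE() nor TRY_ACQUIRE() describes this function; the analysis is
// switched off for its body and callers rely on *UnlockRequired.
inline void maybeLock(HybridMutex &M, bool *UnlockRequired)
    NO_THREAD_SAFETY_ANALYSIS {
  // Only lock when contention is possible; the analysis cannot model a
  // conditionally held capability, hence the opt-out above.
  *UnlockRequired = getNumberOfCPUs() > 1U;
  if (*UnlockRequired)
    M.lock();
}

} // namespace scudo

Callers of this shape conditionally unlock afterwards (if (UnlockRequired) TSD->unlock();), which is why the functions consuming the flag, such as allocate() and quarantineOrDeallocateChunk() in combined.h, are themselves marked NO_THREAD_SAFETY_ANALYSIS.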
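
The GlobalQuarantine::drain() change above is a small but representative pattern: the guarded state (Cache.getSize()) is now sampled inside the CacheMutex critical section and the decision is acted on outside it, instead of reading the guarded member unlocked. A sketch of the shape, under the same assumptions as the previous examples; Drainer, Pending, Threshold and flush() are illustrative names.

#include "mutex.h"
#include "thread_annotations.h"

namespace scudo {

class Drainer {
public:
  void drain() EXCLUDES(M) {
    bool NeedFlush = false;
    {
      ScopedLock L(M);
      Pending++;
      // Sample the guarded state while the lock is held...
      NeedFlush = Pending > Threshold;
    }
    // ...and act on the local copy outside the critical section.
    if (NeedFlush)
      flush();
  }

private:
  void flush() {}
  static constexpr uptr Threshold = 1024U;
  HybridMutex M;
  uptr Pending GUARDED_BY(M) = 0;
};

} // namespace scudo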