diff --git a/compiler-rt/lib/scudo/standalone/chunk.h b/compiler-rt/lib/scudo/standalone/chunk.h
--- a/compiler-rt/lib/scudo/standalone/chunk.h
+++ b/compiler-rt/lib/scudo/standalone/chunk.h
@@ -65,7 +65,8 @@
 struct UnpackedHeader {
   uptr ClassId : 8;
   u8 State : 2;
-  u8 Origin : 2;
+  // Origin if State == Allocated, or WasZeroed otherwise.
+  u8 OriginOrWasZeroed : 2;
   uptr SizeOrUnusedBytes : 20;
   uptr Offset : 16;
   uptr Checksum : 16;
diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h
--- a/compiler-rt/lib/scudo/standalone/combined.h
+++ b/compiler-rt/lib/scudo/standalone/combined.h
@@ -275,8 +275,10 @@
     }
 #endif // GWP_ASAN_HOOKS
 
-    const FillContentsMode FillContents =
-        ZeroContents ? ZeroFill : Options.FillContents;
+    const FillContentsMode FillContents = ZeroContents ? ZeroFill
+                                          : TSDRegistry.getDisableMemInit()
+                                              ? NoFill
+                                              : Options.FillContents;
 
     if (UNLIKELY(Alignment > MaxAlignment)) {
       if (Options.MayReturnNull)
@@ -405,7 +407,17 @@
           PrevEnd = NextPage;
         TaggedPtr = reinterpret_cast<void *>(TaggedUserPtr);
         resizeTaggedChunk(PrevEnd, TaggedUserPtr + Size, BlockEnd);
-        if (Size) {
+        if (UNLIKELY(FillContents != NoFill && !Header.OriginOrWasZeroed)) {
+          // If an allocation needs to be zeroed (i.e. calloc) we can normally
+          // avoid zeroing the memory now since we can rely on memory having
+          // been zeroed on free, as this is normally done while setting the
+          // UAF tag. But if tagging was disabled per-thread when the memory
+          // was freed, it would not have been retagged and thus zeroed, and
+          // therefore it needs to be zeroed now.
+          memset(TaggedPtr, 0,
+                 Min(Size, roundUpTo(PrevEnd - TaggedUserPtr,
+                                     archMemoryTagGranuleSize())));
+        } else if (Size) {
           // Clear any stack metadata that may have previously been stored in
           // the chunk data.
           memset(TaggedPtr, 0, archMemoryTagGranuleSize());
@@ -438,7 +450,7 @@
     }
     Header.ClassId = ClassId & Chunk::ClassIdMask;
     Header.State = Chunk::State::Allocated;
-    Header.Origin = Origin & Chunk::OriginMask;
+    Header.OriginOrWasZeroed = Origin & Chunk::OriginMask;
     Header.SizeOrUnusedBytes =
         (ClassId ? Size : SecondaryBlockEnd - (UserPtr + Size)) &
         Chunk::SizeOrUnusedBytesMask;
@@ -483,12 +495,12 @@
     if (UNLIKELY(Header.State != Chunk::State::Allocated))
       reportInvalidChunkState(AllocatorAction::Deallocating, Ptr);
     if (Options.DeallocTypeMismatch) {
-      if (Header.Origin != Origin) {
+      if (Header.OriginOrWasZeroed != Origin) {
         // With the exception of memalign'd chunks, that can be still be free'd.
-        if (UNLIKELY(Header.Origin != Chunk::Origin::Memalign ||
+        if (UNLIKELY(Header.OriginOrWasZeroed != Chunk::Origin::Memalign ||
                      Origin != Chunk::Origin::Malloc))
           reportDeallocTypeMismatch(AllocatorAction::Deallocating, Ptr,
-                                    Header.Origin, Origin);
+                                    Header.OriginOrWasZeroed, Origin);
       }
     }
 
@@ -541,9 +553,10 @@
     // applications think that it is OK to realloc a memalign'ed pointer, which
     // will trigger this check. It really isn't.
     if (Options.DeallocTypeMismatch) {
-      if (UNLIKELY(OldHeader.Origin != Chunk::Origin::Malloc))
+      if (UNLIKELY(OldHeader.OriginOrWasZeroed != Chunk::Origin::Malloc))
         reportDeallocTypeMismatch(AllocatorAction::Reallocating, OldPtr,
-                                  OldHeader.Origin, Chunk::Origin::Malloc);
+                                  OldHeader.OriginOrWasZeroed,
+                                  Chunk::Origin::Malloc);
     }
 
     void *BlockBegin = getBlockBegin(OldPtr, &OldHeader);
@@ -1017,14 +1030,17 @@
     Chunk::UnpackedHeader NewHeader = *Header;
     if (UNLIKELY(NewHeader.ClassId && useMemoryTagging())) {
       u8 PrevTag = extractTag(loadTag(reinterpret_cast<uptr>(Ptr)));
-      uptr TaggedBegin, TaggedEnd;
-      const uptr OddEvenMask = computeOddEvenMaskForPointerMaybe(
-          reinterpret_cast<uptr>(getBlockBegin(Ptr, &NewHeader)),
-          SizeClassMap::getSizeByClassId(NewHeader.ClassId));
-      // Exclude the previous tag so that immediate use after free is detected
-      // 100% of the time.
-      setRandomTag(Ptr, Size, OddEvenMask | (1UL << PrevTag), &TaggedBegin,
-                   &TaggedEnd);
+      if (!TSDRegistry.getDisableMemInit()) {
+        uptr TaggedBegin, TaggedEnd;
+        const uptr OddEvenMask = computeOddEvenMaskForPointerMaybe(
+            reinterpret_cast<uptr>(getBlockBegin(Ptr, &NewHeader)),
+            SizeClassMap::getSizeByClassId(NewHeader.ClassId));
+        // Exclude the previous tag so that immediate use after free is detected
+        // 100% of the time.
+        setRandomTag(Ptr, Size, OddEvenMask | (1UL << PrevTag), &TaggedBegin,
+                     &TaggedEnd);
+      }
+      NewHeader.OriginOrWasZeroed = !TSDRegistry.getDisableMemInit();
       storeDeallocationStackMaybe(Ptr, PrevTag);
     }
     // If the quarantine is disabled, the actual size of a chunk is 0 or larger
diff --git a/compiler-rt/lib/scudo/standalone/common.h b/compiler-rt/lib/scudo/standalone/common.h
--- a/compiler-rt/lib/scudo/standalone/common.h
+++ b/compiler-rt/lib/scudo/standalone/common.h
@@ -185,6 +185,8 @@
 enum class Option : u8 {
   ReleaseInterval,      // Release to OS interval in milliseconds.
   MemtagTuning,         // Whether to tune tagging for UAF or overflow.
+  ThreadDisableMemInit, // Whether to disable automatic heap initialization and,
+                        // where possible, memory tagging, on this thread.
   MaxCacheEntriesCount, // Maximum number of blocks that can be cached.
   MaxCacheEntrySize,    // Maximum size of a block that can be cached.
   MaxTSDsCount,         // Number of usable TSDs for the shared registry.
diff --git a/compiler-rt/lib/scudo/standalone/include/scudo/interface.h b/compiler-rt/lib/scudo/standalone/include/scudo/interface.h
--- a/compiler-rt/lib/scudo/standalone/include/scudo/interface.h
+++ b/compiler-rt/lib/scudo/standalone/include/scudo/interface.h
@@ -121,6 +121,14 @@
 #define M_MEMTAG_TUNING -102
 #endif
 
+// Per-thread memory initialization tuning. The value argument should be one of:
+// 1: Disable automatic heap initialization and, where possible, memory tagging,
+//    on this thread.
+// 0: Normal behavior.
+#ifndef M_THREAD_DISABLE_MEM_INIT
+#define M_THREAD_DISABLE_MEM_INIT -103
+#endif
+
 #ifndef M_CACHE_COUNT_MAX
 #define M_CACHE_COUNT_MAX -200
 #endif
diff --git a/compiler-rt/lib/scudo/standalone/tests/chunk_test.cpp b/compiler-rt/lib/scudo/standalone/tests/chunk_test.cpp
--- a/compiler-rt/lib/scudo/standalone/tests/chunk_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/chunk_test.cpp
@@ -41,7 +41,7 @@
   initChecksum();
   const scudo::uptr Size = 0x100U;
   scudo::Chunk::UnpackedHeader OldHeader = {};
-  OldHeader.Origin = scudo::Chunk::Origin::Malloc;
+  OldHeader.OriginOrWasZeroed = scudo::Chunk::Origin::Malloc;
   OldHeader.ClassId = 0x42U;
   OldHeader.SizeOrUnusedBytes = Size;
   OldHeader.State = scudo::Chunk::State::Allocated;
diff --git a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
--- a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
@@ -512,3 +512,44 @@
     EXPECT_TRUE(Found);
   }
 }
+
+TEST(ScudoCombinedTest, DisableMemInit) {
+  using AllocatorT = TestAllocator<scudo::AndroidConfig>;
+  using SizeClassMap = AllocatorT::PrimaryT::SizeClassMap;
+  auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT());
+
+  std::vector<void *> Ptrs(65536, nullptr);
+
+  Allocator->setOption(scudo::Option::ThreadDisableMemInit, 1);
+
+  constexpr scudo::uptr MinAlignLog = FIRST_32_SECOND_64(3U, 4U);
+
+  // Test that if mem-init is disabled on a thread, calloc should still work as
+  // expected. This is tricky to ensure when MTE is enabled, so this test tries
+  // to exercise the relevant code on our MTE path.
+  for (scudo::uptr ClassId = 1U; ClassId <= 8; ClassId++) {
+    const scudo::uptr Size =
+        SizeClassMap::getSizeByClassId(ClassId) - scudo::Chunk::getHeaderSize();
+    if (Size < 8)
+      continue;
+    for (unsigned I = 0; I != Ptrs.size(); ++I) {
+      Ptrs[I] = Allocator->allocate(Size, Origin);
+      memset(Ptrs[I], 0xaa, Size);
+    }
+    for (unsigned I = 0; I != Ptrs.size(); ++I)
+      Allocator->deallocate(Ptrs[I], Origin, Size);
+    for (unsigned I = 0; I != Ptrs.size(); ++I) {
+      Ptrs[I] = Allocator->allocate(Size - 8, Origin);
+      memset(Ptrs[I], 0xbb, Size - 8);
+    }
+    for (unsigned I = 0; I != Ptrs.size(); ++I)
+      Allocator->deallocate(Ptrs[I], Origin, Size - 8);
+    for (unsigned I = 0; I != Ptrs.size(); ++I) {
+      Ptrs[I] = Allocator->allocate(Size, Origin, 1U << MinAlignLog, true);
+      for (scudo::uptr J = 0; J < Size; ++J)
+        ASSERT_EQ((reinterpret_cast<char *>(Ptrs[I]))[J], 0);
+    }
+  }
+
+  Allocator->setOption(scudo::Option::ThreadDisableMemInit, 0);
+}
diff --git a/compiler-rt/lib/scudo/standalone/tsd_exclusive.h b/compiler-rt/lib/scudo/standalone/tsd_exclusive.h
--- a/compiler-rt/lib/scudo/standalone/tsd_exclusive.h
+++ b/compiler-rt/lib/scudo/standalone/tsd_exclusive.h
@@ -13,10 +13,13 @@
 
 namespace scudo {
 
-enum class ThreadState : u8 {
-  NotInitialized = 0,
-  Initialized,
-  TornDown,
+struct ThreadState {
+  bool DisableMemInit : 1;
+  enum {
+    NotInitialized = 0,
+    Initialized,
+    TornDown,
+  } InitState : 2;
 };
 
 template <class Allocator> void teardownThread(void *Ptr);
@@ -36,13 +39,13 @@
   void unmapTestOnly() {}
 
   ALWAYS_INLINE void initThreadMaybe(Allocator *Instance, bool MinimalInit) {
-    if (LIKELY(State != ThreadState::NotInitialized))
+    if (LIKELY(State.InitState != ThreadState::NotInitialized))
       return;
     initThread(Instance, MinimalInit);
   }
 
   ALWAYS_INLINE TSD<Allocator> *getTSDAndLock(bool *UnlockRequired) {
-    if (LIKELY(State == ThreadState::Initialized &&
+    if (LIKELY(State.InitState == ThreadState::Initialized &&
                !atomic_load(&Disabled, memory_order_acquire))) {
       *UnlockRequired = false;
       return &ThreadTSD;
     }
@@ -67,11 +70,15 @@
   }
 
   bool setOption(Option O, UNUSED sptr Value) {
+    if (O == Option::ThreadDisableMemInit)
+      State.DisableMemInit = Value;
     if (O == Option::MaxTSDsCount)
       return false;
     return true;
   }
 
+  bool getDisableMemInit() { return State.DisableMemInit; }
+
 private:
   void initOnceMaybe(Allocator *Instance) {
     ScopedLock L(Mutex);
@@ -90,7 +97,7 @@
     CHECK_EQ(
         pthread_setspecific(PThreadKey, reinterpret_cast<void *>(Instance)),
         0);
     ThreadTSD.initLinkerInitialized(Instance);
-    State = ThreadState::Initialized;
+    State.InitState = ThreadState::Initialized;
     Instance->callPostInitCallback();
   }
 
@@ -126,7 +133,7 @@
       return;
   }
   TSDRegistryT::ThreadTSD.commitBack(Instance);
-  TSDRegistryT::State = ThreadState::TornDown;
+  TSDRegistryT::State.InitState = ThreadState::TornDown;
 }
 
 } // namespace scudo
diff --git a/compiler-rt/lib/scudo/standalone/tsd_shared.h b/compiler-rt/lib/scudo/standalone/tsd_shared.h
--- a/compiler-rt/lib/scudo/standalone/tsd_shared.h
+++ b/compiler-rt/lib/scudo/standalone/tsd_shared.h
@@ -83,10 +83,14 @@
   bool setOption(Option O, sptr Value) {
     if (O == Option::MaxTSDsCount)
       return setNumberOfTSDs(static_cast<u32>(Value));
+    if (O == Option::ThreadDisableMemInit)
+      setDisableMemInit(Value);
     // Not supported by the TSD Registry, but not an error either.
     return true;
   }
 
+  bool getDisableMemInit() const { return *getTlsPtr() & 1; }
+
 private:
   ALWAYS_INLINE uptr *getTlsPtr() const {
 #if SCUDO_HAS_PLATFORM_TLS_SLOT
@@ -97,12 +101,15 @@
 #endif
   }
 
+  static_assert(alignof(TSD<Allocator>) >= 2, "");
+
   ALWAYS_INLINE void setCurrentTSD(TSD<Allocator> *CurrentTSD) {
-    *getTlsPtr() = reinterpret_cast<uptr>(CurrentTSD);
+    *getTlsPtr() &= 1;
+    *getTlsPtr() |= reinterpret_cast<uptr>(CurrentTSD);
   }
 
   ALWAYS_INLINE TSD<Allocator> *getCurrentTSD() {
-    return reinterpret_cast<TSD<Allocator> *>(*getTlsPtr());
+    return reinterpret_cast<TSD<Allocator> *>(*getTlsPtr() & ~1ULL);
   }
 
   bool setNumberOfTSDs(u32 N) {
@@ -131,6 +138,11 @@
     return true;
   }
 
+  void setDisableMemInit(bool B) {
+    *getTlsPtr() &= ~1ULL;
+    *getTlsPtr() |= B;
+  }
+
   void initOnceMaybe(Allocator *Instance) {
     ScopedLock L(Mutex);
     if (LIKELY(Initialized))
diff --git a/compiler-rt/lib/scudo/standalone/wrappers_c.inc b/compiler-rt/lib/scudo/standalone/wrappers_c.inc
--- a/compiler-rt/lib/scudo/standalone/wrappers_c.inc
+++ b/compiler-rt/lib/scudo/standalone/wrappers_c.inc
@@ -179,6 +179,9 @@
     case M_MEMTAG_TUNING:
       option = scudo::Option::MemtagTuning;
      break;
+    case M_THREAD_DISABLE_MEM_INIT:
+      option = scudo::Option::ThreadDisableMemInit;
+      break;
     case M_CACHE_COUNT_MAX:
       option = scudo::Option::MaxCacheEntriesCount;
       break;
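
Usage sketch (not part of the patch): with the Scudo C wrappers providing the allocator, the new option is reached through mallopt() using the M_THREAD_DISABLE_MEM_INIT parameter added to scudo/interface.h above. The worker function below is hypothetical and only illustrates toggling the option around a batch of thread-local allocations.

#include <malloc.h>

// Matches the definition added to scudo/interface.h; guarded in case the
// toolchain's malloc.h does not provide it.
#ifndef M_THREAD_DISABLE_MEM_INIT
#define M_THREAD_DISABLE_MEM_INIT -103
#endif

// Hypothetical worker: skips automatic zero/pattern init (and, where possible,
// retag-on-free) for allocations made on this thread during the batch.
void processBatch(void) {
  mallopt(M_THREAD_DISABLE_MEM_INIT, 1); // 1: disable mem-init on this thread
  // ... allocate, fill and free scratch buffers here; calloc() still returns
  // zeroed memory, as exercised by the DisableMemInit test above ...
  mallopt(M_THREAD_DISABLE_MEM_INIT, 0); // 0: restore normal behavior
}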
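
The tsd_shared.h changes rely on a small pointer-tagging trick: because TSD objects are at least 2-byte aligned (the static_assert above), bit 0 of the per-thread TLS word is never used by the TSD pointer, so it can carry the DisableMemInit flag without a second slot. A minimal standalone illustration, not Scudo code, with TSDLike and TlsWord standing in for the real TSD type and platform TLS slot:

#include <cassert>
#include <cstdint>

struct alignas(8) TSDLike { int Dummy; }; // stand-in for the real TSD type

static thread_local uintptr_t TlsWord = 0; // stand-in for the platform TLS slot

void setCurrentTSD(TSDLike *TSD) {
  TlsWord &= 1;                                // preserve the flag bit
  TlsWord |= reinterpret_cast<uintptr_t>(TSD); // store the (aligned) pointer
}

TSDLike *getCurrentTSD() {
  return reinterpret_cast<TSDLike *>(TlsWord & ~uintptr_t(1)); // mask flag off
}

void setDisableMemInit(bool B) {
  TlsWord &= ~uintptr_t(1); // clear the flag bit
  TlsWord |= B;             // store the new value
}

bool getDisableMemInit() { return TlsWord & 1; }

int main() {
  static TSDLike T;
  setCurrentTSD(&T);
  setDisableMemInit(true);
  assert(getCurrentTSD() == &T); // the pointer is unaffected by the flag
  assert(getDisableMemInit());
  setDisableMemInit(false);
  assert(getCurrentTSD() == &T);
  assert(!getDisableMemInit());
  return 0;
}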