diff --git a/compiler-rt/lib/scudo/standalone/allocator_config.h b/compiler-rt/lib/scudo/standalone/allocator_config.h
--- a/compiler-rt/lib/scudo/standalone/allocator_config.h
+++ b/compiler-rt/lib/scudo/standalone/allocator_config.h
@@ -34,6 +34,14 @@
 //   typedef SizeClassAllocator64<ExampleConfig> Primary;
 //   // Log2 of the size of a size class region, as used by the Primary.
 //   static const uptr PrimaryRegionSizeLog = 30U;
+//   // Log2 of the size of a block group, as used by the Primary. Each group
+//   // covers a range of memory addresses; blocks in that range belong to the
+//   // same group. In general, a single region uses a group size of 1 or 2 MB,
+//   // while multiple regions use a group size equal to the region size,
+//   // because the region size is usually smaller than 1 MB.
+//   // A smaller value gives finer-grained control of memory usage, but the
+//   // trade-off is that deallocation may take longer.
+//   static const uptr PrimaryGroupSizeLog = 20U;
 //   // Defines the type and scale of a compact pointer. A compact pointer can
 //   // be understood as the offset of a pointer within the region it belongs
 //   // to, in increments of a power-of-2 scale.
@@ -65,6 +73,7 @@
 #if SCUDO_CAN_USE_PRIMARY64
   typedef SizeClassAllocator64<DefaultConfig> Primary;
   static const uptr PrimaryRegionSizeLog = 32U;
+  static const uptr PrimaryGroupSizeLog = 21U;
   typedef uptr PrimaryCompactPtrT;
   static const uptr PrimaryCompactPtrScale = 0;
   static const bool PrimaryEnableRandomOffset = true;
@@ -72,6 +81,7 @@
 #else
   typedef SizeClassAllocator32<DefaultConfig> Primary;
   static const uptr PrimaryRegionSizeLog = 19U;
+  static const uptr PrimaryGroupSizeLog = 19U;
   typedef uptr PrimaryCompactPtrT;
 #endif
   static const s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN;
@@ -96,11 +106,13 @@
   static const uptr PrimaryRegionSizeLog = 28U;
   typedef u32 PrimaryCompactPtrT;
   static const uptr PrimaryCompactPtrScale = SCUDO_MIN_ALIGNMENT_LOG;
+  static const uptr PrimaryGroupSizeLog = 20U;
   static const bool PrimaryEnableRandomOffset = true;
   static const uptr PrimaryMapSizeIncrement = 1UL << 18;
 #else
   typedef SizeClassAllocator32<AndroidConfig> Primary;
   static const uptr PrimaryRegionSizeLog = 18U;
+  static const uptr PrimaryGroupSizeLog = 18U;
   typedef uptr PrimaryCompactPtrT;
 #endif
   static const s32 PrimaryMinReleaseToOsIntervalMs = 1000;
@@ -127,11 +139,13 @@
   static const uptr PrimaryRegionSizeLog = 27U;
   typedef u32 PrimaryCompactPtrT;
   static const uptr PrimaryCompactPtrScale = SCUDO_MIN_ALIGNMENT_LOG;
+  static const uptr PrimaryGroupSizeLog = 18U;
   static const bool PrimaryEnableRandomOffset = true;
   static const uptr PrimaryMapSizeIncrement = 1UL << 18;
 #else
   typedef SizeClassAllocator32<AndroidSvelteConfig> Primary;
   static const uptr PrimaryRegionSizeLog = 16U;
+  static const uptr PrimaryGroupSizeLog = 16U;
   typedef uptr PrimaryCompactPtrT;
 #endif
   static const s32 PrimaryMinReleaseToOsIntervalMs = 1000;
@@ -156,6 +170,7 @@
   typedef SizeClassAllocator64<FuchsiaConfig> Primary;
   static const uptr PrimaryRegionSizeLog = 30U;
+  static const uptr PrimaryGroupSizeLog = 30U;
   typedef u32 PrimaryCompactPtrT;
   static const bool PrimaryEnableRandomOffset = true;
   static const uptr PrimaryMapSizeIncrement = 1UL << 18;
@@ -175,6 +190,7 @@
   typedef SizeClassAllocator64<TrustyConfig> Primary;
   // Some apps have 1 page of heap total so small regions are necessary.
   static const uptr PrimaryRegionSizeLog = 10U;
+  static const uptr PrimaryGroupSizeLog = 10U;
   typedef u32 PrimaryCompactPtrT;
   static const bool PrimaryEnableRandomOffset = false;
   // Trusty is extremely memory-constrained so minimally round up map calls.
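To make the new PrimaryGroupSizeLog knob concrete, here is a small self-contained sketch of how a group id is derived from a compact pointer (illustrative only, not part of the patch; the helper name groupIdForCompactPtr and the sample constants are made up, assuming a 1 MB group size and a 16-byte compact-pointer scale):

  #include <cassert>
  #include <cstdint>

  using uptr = std::uintptr_t;

  // Hypothetical stand-ins for Config::PrimaryGroupSizeLog and the compact
  // pointer scale used by SizeClassAllocator64.
  constexpr uptr GroupSizeLog = 20;   // 1 MB groups.
  constexpr uptr CompactPtrScale = 4; // Compact pointers count 16-byte units.

  // Mirrors the shape of compactPtrGroup(): blocks whose addresses fall in
  // the same (1 << GroupSizeLog)-sized range map to the same group id.
  constexpr uptr groupIdForCompactPtr(uptr CompactPtr) {
    return CompactPtr >> (GroupSizeLog - CompactPtrScale);
  }

  int main() {
    const uptr A = 0x100000 >> CompactPtrScale; // First block of a 1 MB group.
    const uptr B = 0x1FFFF0 >> CompactPtrScale; // Last block of that group.
    const uptr C = 0x200000 >> CompactPtrScale; // First block of the next one.
    assert(groupIdForCompactPtr(A) == groupIdForCompactPtr(B));
    assert(groupIdForCompactPtr(A) + 1 == groupIdForCompactPtr(C));
    return 0;
  }
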
diff --git a/compiler-rt/lib/scudo/standalone/list.h b/compiler-rt/lib/scudo/standalone/list.h
--- a/compiler-rt/lib/scudo/standalone/list.h
+++ b/compiler-rt/lib/scudo/standalone/list.h
@@ -110,6 +110,18 @@
     Size--;
   }
 
+  // Insert X immediately after Prev.
+  void insert(T *Prev, T *X) {
+    DCHECK(!empty());
+    DCHECK_NE(Prev, nullptr);
+    DCHECK_NE(X, nullptr);
+    X->Next = Prev->Next;
+    Prev->Next = X;
+    if (Last == Prev)
+      Last = X;
+    ++Size;
+  }
+
   void extract(T *Prev, T *X) {
     DCHECK(!empty());
     DCHECK_NE(Prev, nullptr);
diff --git a/compiler-rt/lib/scudo/standalone/local_cache.h b/compiler-rt/lib/scudo/standalone/local_cache.h
--- a/compiler-rt/lib/scudo/standalone/local_cache.h
+++ b/compiler-rt/lib/scudo/standalone/local_cache.h
@@ -10,6 +10,7 @@
 #define SCUDO_LOCAL_CACHE_H_
 
 #include "internal_defs.h"
+#include "list.h"
 #include "platform.h"
 #include "report.h"
 #include "stats.h"
@@ -27,6 +28,12 @@
       Count = N;
       memcpy(Batch, Array, sizeof(Batch[0]) * Count);
     }
+    void appendFromArray(CompactPtrT *Array, u16 N) {
+      DCHECK_LE(N, MaxNumCached - Count);
+      memcpy(Batch + Count, Array, sizeof(Batch[0]) * N);
+      // u16 will be promoted to int by arithmetic type conversion.
+      Count = static_cast<u16>(Count + N);
+    }
     void clear() { Count = 0; }
     void add(CompactPtrT P) {
       DCHECK_LT(Count, MaxNumCached);
@@ -50,6 +57,22 @@
     u16 Count;
   };
 
+  // A BatchGroup is used to collect blocks. Each group has a group id to
+  // identify the address range the contained blocks belong to.
+  struct BatchGroup {
+    // `Next` is used by IntrusiveList.
+    BatchGroup *Next;
+    // The identifier of this group.
+    uptr GroupId;
+    // Cached value of TransferBatch::getMaxCached().
+    u16 MaxCachedPerBatch;
+    // Blocks are managed as a list of TransferBatches.
+    SinglyLinkedList<TransferBatch> Batches;
+  };
+
+  static_assert(sizeof(BatchGroup) <= sizeof(TransferBatch),
+                "BatchGroup uses the same class size as TransferBatch");
+
   void init(GlobalStats *S, SizeClassAllocator *A) {
     DCHECK(isEmpty());
     Stats.init();
@@ -121,9 +144,18 @@
   TransferBatch *createBatch(uptr ClassId, void *B) {
     if (ClassId != BatchClassId)
       B = allocate(BatchClassId);
+    if (UNLIKELY(!B))
+      reportOutOfMemory(SizeClassAllocator::getSizeByClassId(BatchClassId));
     return reinterpret_cast<TransferBatch *>(B);
   }
 
+  BatchGroup *createGroup() {
+    void *Ptr = allocate(BatchClassId);
+    if (UNLIKELY(!Ptr))
+      reportOutOfMemory(SizeClassAllocator::getSizeByClassId(BatchClassId));
+    return reinterpret_cast<BatchGroup *>(Ptr);
+  }
+
   LocalStats &getStats() { return Stats; }
 
 private:
@@ -182,16 +214,11 @@
 
   NOINLINE void drain(PerClass *C, uptr ClassId) {
     const u16 Count = Min(static_cast<u16>(C->MaxCount / 2), C->Count);
-    TransferBatch *B =
-        createBatch(ClassId, Allocator->decompactPtr(ClassId, C->Chunks[0]));
-    if (UNLIKELY(!B))
-      reportOutOfMemory(SizeClassAllocator::getSizeByClassId(BatchClassId));
-    B->setFromArray(&C->Chunks[0], Count);
+    Allocator->pushBlocks(this, ClassId, &C->Chunks[0], Count);
     // u16 will be promoted to int by arithmetic type conversion.
     C->Count = static_cast<u16>(C->Count - Count);
     for (u16 I = 0; I < C->Count; I++)
       C->Chunks[I] = C->Chunks[I + Count];
-    Allocator->pushBatch(ClassId, B);
   }
 };
diff --git a/compiler-rt/lib/scudo/standalone/primary32.h b/compiler-rt/lib/scudo/standalone/primary32.h
--- a/compiler-rt/lib/scudo/standalone/primary32.h
+++ b/compiler-rt/lib/scudo/standalone/primary32.h
@@ -43,6 +43,7 @@
 public:
   typedef typename Config::PrimaryCompactPtrT CompactPtrT;
   typedef typename Config::SizeClassMap SizeClassMap;
+  static const uptr GroupSizeLog = Config::PrimaryGroupSizeLog;
   // The bytemap can only track UINT8_MAX - 1 classes.
   static_assert(SizeClassMap::LargestClassId <= (UINT8_MAX - 1), "");
   // Regions should be large enough to hold the largest Block.
@@ -51,6 +52,7 @@
   typedef SizeClassAllocator32<Config> ThisT;
   typedef SizeClassAllocatorLocalCache<ThisT> CacheT;
   typedef typename CacheT::TransferBatch TransferBatch;
+  typedef typename CacheT::BatchGroup BatchGroup;
 
   static uptr getSizeByClassId(uptr ClassId) {
     return (ClassId == SizeClassMap::BatchClassId)
@@ -111,30 +113,69 @@
     return reinterpret_cast<void *>(static_cast<uptr>(CompactPtr));
   }
 
+  uptr compactPtrGroup(CompactPtrT CompactPtr) {
+    return CompactPtr >> GroupSizeLog;
+  }
+
   TransferBatch *popBatch(CacheT *C, uptr ClassId) {
     DCHECK_LT(ClassId, NumClasses);
     SizeClassInfo *Sci = getSizeClassInfo(ClassId);
     ScopedLock L(Sci->Mutex);
-    TransferBatch *B = Sci->FreeList.front();
-    if (B) {
-      Sci->FreeList.pop_front();
-    } else {
-      B = populateFreeList(C, ClassId, Sci);
-      if (UNLIKELY(!B))
+    TransferBatch *B = popBatchImpl(C, ClassId);
+    if (UNLIKELY(!B)) {
+      if (UNLIKELY(!populateFreeList(C, ClassId, Sci)))
         return nullptr;
+      B = popBatchImpl(C, ClassId);
+      // If `populateFreeList` succeeded, we are supposed to get free blocks.
+      DCHECK_NE(B, nullptr);
     }
-    DCHECK_GT(B->getCount(), 0);
     Sci->Stats.PoppedBlocks += B->getCount();
     return B;
   }
 
-  void pushBatch(uptr ClassId, TransferBatch *B) {
+  // Push the array of free blocks to their designated batch groups.
+  void pushBlocks(CacheT *C, uptr ClassId, CompactPtrT *Array, u32 Size) {
     DCHECK_LT(ClassId, NumClasses);
-    DCHECK_GT(B->getCount(), 0);
+    DCHECK_GT(Size, 0);
+    SizeClassInfo *Sci = getSizeClassInfo(ClassId);
+    if (ClassId == SizeClassMap::BatchClassId) {
+      ScopedLock L(Sci->Mutex);
+      // Constructing a batch group in the free list will use two blocks in
+      // BatchClassId. If we are pushing BatchClassId blocks, we will use the
+      // blocks in the array directly (we can't delegate to the local cache,
+      // which would cause a recursive allocation). However, the number of
+      // free blocks may be less than two. Therefore, populate the free list
+      // before inserting the blocks.
+      if (Size == 1 && !populateFreeList(C, ClassId, Sci))
+        return;
+      pushBlocksImpl(C, ClassId, Array, Size);
+      Sci->Stats.PushedBlocks += Size;
+      return;
+    }
+
+    // TODO(chiahungduan): Consider not doing grouping if the group size is not
+    // greater than the block size with a certain scale.
+
+    // Sort the blocks so that blocks belonging to the same group can be pushed
+    // together.
+    bool SameGroup = true;
+    for (u32 I = 1; I < Size; ++I) {
+      if (compactPtrGroup(Array[I - 1]) != compactPtrGroup(Array[I]))
+        SameGroup = false;
+      CompactPtrT Cur = Array[I];
+      u32 J = I;
+      while (J > 0 && compactPtrGroup(Cur) < compactPtrGroup(Array[J - 1])) {
+        Array[J] = Array[J - 1];
+        --J;
+      }
+      Array[J] = Cur;
+    }
+
     ScopedLock L(Sci->Mutex);
-    Sci->FreeList.push_front(B);
-    Sci->Stats.PushedBlocks += B->getCount();
+    pushBlocksImpl(C, ClassId, Array, Size, SameGroup);
+
+    Sci->Stats.PushedBlocks += Size;
     if (ClassId != SizeClassMap::BatchClassId)
       releaseToOSMaybe(Sci, ClassId);
   }
@@ -256,7 +297,7 @@
 
   struct alignas(SCUDO_CACHE_LINE_SIZE) SizeClassInfo {
     HybridMutex Mutex;
-    SinglyLinkedList<TransferBatch> FreeList;
+    SinglyLinkedList<BatchGroup> FreeList;
     uptr CurrentRegion;
     uptr CurrentRegionAllocated;
     SizeClassStats Stats;
@@ -328,8 +369,187 @@
     return &SizeClassInfoArray[ClassId];
   }
 
-  NOINLINE TransferBatch *populateFreeList(CacheT *C, uptr ClassId,
-                                           SizeClassInfo *Sci) {
+  // Push the blocks to their batch group. The layout will be like:
+  //
+  //   FreeList -> BG -> BG -> BG
+  //               |     |     |
+  //               v     v     v
+  //               TB    TB    TB
+  //               |
+  //               v
+  //               TB
+  //
+  // Each BatchGroup (BG) is associated with a unique group id, and its free
+  // blocks are managed by a list of TransferBatches (TB). To reduce the time
+  // spent inserting blocks, the BGs are kept sorted and the input `Array` is
+  // expected to be sorted as well, so that maintaining the sorted order is
+  // cheaper.
+  // Use `SameGroup=true` to indicate that all blocks in the array are from the
+  // same group; the group id of each block is then not checked.
+  //
+  // Note that this aims at a better management of dirty pages, i.e., the RSS
+  // usage won't grow indefinitely. There is an exception: we may not put a
+  // block into its associated group. While populating new blocks, the blocks
+  // may cross different groups. However, in most cases they fall into the same
+  // group and are expected to be popped soon. In that case, it's not worth
+  // sorting an almost-sorted array, so we use `SameGroup=true` instead.
+  //
+  // The region mutex needs to be held while calling this method.
+  void pushBlocksImpl(CacheT *C, uptr ClassId, CompactPtrT *Array, u32 Size,
+                      bool SameGroup = false) {
+    DCHECK_GT(Size, 0U);
+    SizeClassInfo *Sci = getSizeClassInfo(ClassId);
+
+    auto CreateGroup = [&](uptr GroupId) {
+      BatchGroup *BG = nullptr;
+      TransferBatch *TB = nullptr;
+      if (ClassId == SizeClassMap::BatchClassId) {
+        DCHECK_GE(Size, 2U);
+        BG = reinterpret_cast<BatchGroup *>(
+            decompactPtr(ClassId, Array[Size - 1]));
+        BG->Batches.clear();
+
+        TB = reinterpret_cast<TransferBatch *>(
+            decompactPtr(ClassId, Array[Size - 2]));
+        TB->clear();
+      } else {
+        BG = C->createGroup();
+        BG->Batches.clear();
+
+        TB = C->createBatch(ClassId, nullptr);
+        TB->clear();
+      }
+
+      BG->GroupId = GroupId;
+      BG->Batches.push_front(TB);
+      BG->MaxCachedPerBatch =
+          TransferBatch::getMaxCached(getSizeByClassId(ClassId));
+
+      return BG;
+    };
+
+    auto InsertBlocks = [&](BatchGroup *BG, CompactPtrT *Array, u32 Size) {
+      SinglyLinkedList<TransferBatch> &Batches = BG->Batches;
+      TransferBatch *CurBatch = Batches.front();
+      DCHECK_NE(CurBatch, nullptr);
+
+      for (u32 I = 0; I < Size;) {
+        DCHECK_GE(BG->MaxCachedPerBatch, CurBatch->getCount());
+        u16 UnusedSlots =
+            static_cast<u16>(BG->MaxCachedPerBatch - CurBatch->getCount());
+        if (UnusedSlots == 0) {
+          CurBatch = C->createBatch(
+              ClassId,
+              reinterpret_cast<void *>(decompactPtr(ClassId, Array[I])));
+          CurBatch->clear();
+          Batches.push_front(CurBatch);
+          UnusedSlots = BG->MaxCachedPerBatch;
+        }
+        // `UnusedSlots` is u16 so the result will also fit in u16.
+        u16 AppendSize = static_cast<u16>(Min<u32>(UnusedSlots, Size - I));
+        CurBatch->appendFromArray(&Array[I], AppendSize);
+        I += AppendSize;
+      }
+    };
+
+    BatchGroup *Cur = Sci->FreeList.front();
+
+    if (ClassId == SizeClassMap::BatchClassId) {
+      if (Cur == nullptr) {
+        // Don't need to classify BatchClassId.
+        Cur = CreateGroup(/*GroupId=*/0);
+        Sci->FreeList.push_front(Cur);
+      }
+      InsertBlocks(Cur, Array, Size);
+      return;
+    }
+
+    // In the following, `Cur` always points to the BatchGroup for blocks that
+    // will be pushed next. `Prev` is the element right before `Cur`.
+    BatchGroup *Prev = nullptr;
+
+    while (Cur != nullptr && compactPtrGroup(Array[0]) > Cur->GroupId) {
+      Prev = Cur;
+      Cur = Cur->Next;
+    }
+
+    if (Cur == nullptr || compactPtrGroup(Array[0]) != Cur->GroupId) {
+      Cur = CreateGroup(compactPtrGroup(Array[0]));
+      if (Prev == nullptr)
+        Sci->FreeList.push_front(Cur);
+      else
+        Sci->FreeList.insert(Prev, Cur);
+    }
+
+    // All the blocks are from the same group; just push them without checking
+    // the group id.
+    if (SameGroup) {
+      InsertBlocks(Cur, Array, Size);
+      return;
+    }
+
+    // The blocks are sorted by group id. Determine the segments of blocks with
+    // the same group id and push each segment to its group together.
+    u32 Count = 1;
+    for (u32 I = 1; I < Size; ++I) {
+      if (compactPtrGroup(Array[I - 1]) != compactPtrGroup(Array[I])) {
+        DCHECK_EQ(compactPtrGroup(Array[I - 1]), Cur->GroupId);
+        InsertBlocks(Cur, Array + I - Count, Count);
+
+        while (Cur != nullptr && compactPtrGroup(Array[I]) > Cur->GroupId) {
+          Prev = Cur;
+          Cur = Cur->Next;
+        }
+
+        if (Cur == nullptr || compactPtrGroup(Array[I]) != Cur->GroupId) {
+          Cur = CreateGroup(compactPtrGroup(Array[I]));
+          DCHECK_NE(Prev, nullptr);
+          Sci->FreeList.insert(Prev, Cur);
+        }
+
+        Count = 1;
+      } else {
+        ++Count;
+      }
+    }
+
+    InsertBlocks(Cur, Array + Size - Count, Count);
+  }
+
+  // Pop one TransferBatch from a BatchGroup. The BatchGroup with the smallest
+  // group id will be considered first.
+  //
+  // The region mutex needs to be held while calling this method.
+  TransferBatch *popBatchImpl(CacheT *C, uptr ClassId) {
+    SizeClassInfo *Sci = getSizeClassInfo(ClassId);
+    if (Sci->FreeList.empty())
+      return nullptr;
+
+    SinglyLinkedList<TransferBatch> &Batches = Sci->FreeList.front()->Batches;
+    DCHECK(!Batches.empty());
+
+    TransferBatch *B = Batches.front();
+    Batches.pop_front();
+    DCHECK_NE(B, nullptr);
+    DCHECK_GT(B->getCount(), 0U);
+
+    if (Batches.empty()) {
+      BatchGroup *BG = Sci->FreeList.front();
+      Sci->FreeList.pop_front();
+
+      // We don't keep a BatchGroup with zero blocks, to avoid empty-checking
+      // while allocating. Note that the block used to construct the BatchGroup
+      // is recorded as a free block in the last element of BatchGroup::Batches.
+      // This means that once we pop the last TransferBatch, the block is
+      // implicitly deallocated.
+      if (ClassId != SizeClassMap::BatchClassId)
+        C->deallocate(SizeClassMap::BatchClassId, BG);
+    }
+
+    return B;
+  }
+
+  NOINLINE bool populateFreeList(CacheT *C, uptr ClassId, SizeClassInfo *Sci) {
     uptr Region;
     uptr Offset;
     // If the size-class currently has a region associated to it, use it. The
@@ -344,7 +564,7 @@
       DCHECK_EQ(Sci->CurrentRegionAllocated, 0U);
       Region = allocateRegion(Sci, ClassId);
       if (UNLIKELY(!Region))
-        return nullptr;
+        return false;
       C->getStats().add(StatMapped, RegionSize);
       Sci->CurrentRegion = Region;
       Offset = 0;
@@ -378,20 +598,15 @@
     if (ClassId != SizeClassMap::BatchClassId)
       shuffle(ShuffleArray, NumberOfBlocks, &Sci->RandState);
     for (u32 I = 0; I < NumberOfBlocks;) {
-      TransferBatch *B =
-          C->createBatch(ClassId, reinterpret_cast<void *>(ShuffleArray[I]));
-      if (UNLIKELY(!B))
-        return nullptr;
       // `MaxCount` is u16 so the result will also fit in u16.
       const u16 N = static_cast<u16>(Min<u32>(MaxCount, NumberOfBlocks - I));
-      B->setFromArray(&ShuffleArray[I], N);
-      Sci->FreeList.push_back(B);
+      // Note that the N blocks here may have different group ids. Given that
+      // this only happens when crossing a group-size boundary, instead of
+      // sorting them, treat them as the same group here to avoid sorting the
+      // almost-sorted blocks.
+      pushBlocksImpl(C, ClassId, &ShuffleArray[I], N, /*SameGroup=*/true);
       I += N;
     }
-    TransferBatch *B = Sci->FreeList.front();
-    Sci->FreeList.pop_front();
-    DCHECK(B);
-    DCHECK_GT(B->getCount(), 0);
 
     const uptr AllocatedUser = Size * NumberOfBlocks;
     C->getStats().add(StatFree, AllocatedUser);
@@ -407,7 +622,7 @@
     }
     Sci->AllocatedUser += AllocatedUser;
 
-    return B;
+    return true;
   }
 
   void getStats(ScopedString *Str, uptr ClassId, uptr Rss) {
@@ -477,8 +692,12 @@
     auto DecompactPtr = [](CompactPtrT CompactPtr) {
       return reinterpret_cast<uptr>(CompactPtr);
     };
-    releaseFreeMemoryToOS(Sci->FreeList, RegionSize, NumberOfRegions, BlockSize,
-                          Recorder, DecompactPtr, SkipRegion);
+    PageReleaseContext Context(BlockSize, RegionSize, NumberOfRegions);
+    for (BatchGroup &BG : Sci->FreeList)
+      Context.markFreeBlocks(BG.Batches, DecompactPtr, Base);
+
+    releaseFreeMemoryToOS(Context, Recorder, SkipRegion);
+
     if (Recorder.getReleasedRangesCount() > 0) {
       Sci->ReleaseInfo.PushedBlocksAtLastRelease = Sci->Stats.PushedBlocks;
       Sci->ReleaseInfo.RangesReleased += Recorder.getReleasedRangesCount();
diff --git a/compiler-rt/lib/scudo/standalone/primary64.h b/compiler-rt/lib/scudo/standalone/primary64.h
--- a/compiler-rt/lib/scudo/standalone/primary64.h
+++ b/compiler-rt/lib/scudo/standalone/primary64.h
@@ -45,10 +45,12 @@
 public:
   typedef typename Config::PrimaryCompactPtrT CompactPtrT;
   static const uptr CompactPtrScale = Config::PrimaryCompactPtrScale;
+  static const uptr GroupSizeLog = Config::PrimaryGroupSizeLog;
   typedef typename Config::SizeClassMap SizeClassMap;
   typedef SizeClassAllocator64<Config> ThisT;
   typedef SizeClassAllocatorLocalCache<ThisT> CacheT;
   typedef typename CacheT::TransferBatch TransferBatch;
+  typedef typename CacheT::BatchGroup BatchGroup;
 
   static uptr getSizeByClassId(uptr ClassId) {
     return (ClassId == SizeClassMap::BatchClassId)
@@ -99,25 +101,61 @@
     DCHECK_LT(ClassId, NumClasses);
     RegionInfo *Region = getRegionInfo(ClassId);
     ScopedLock L(Region->Mutex);
-    TransferBatch *B = Region->FreeList.front();
-    if (B) {
-      Region->FreeList.pop_front();
-    } else {
-      B = populateFreeList(C, ClassId, Region);
-      if (UNLIKELY(!B))
+    TransferBatch *B = popBatchImpl(C, ClassId);
+    if (UNLIKELY(!B)) {
+      if (UNLIKELY(!populateFreeList(C, ClassId, Region)))
         return nullptr;
+      B = popBatchImpl(C, ClassId);
+      // If `populateFreeList` succeeded, we are supposed to get free blocks.
+      DCHECK_NE(B, nullptr);
     }
-    DCHECK_GT(B->getCount(), 0);
     Region->Stats.PoppedBlocks += B->getCount();
     return B;
   }
 
-  void pushBatch(uptr ClassId, TransferBatch *B) {
-    DCHECK_GT(B->getCount(), 0);
+  // Push the array of free blocks to their designated batch groups.
+  void pushBlocks(CacheT *C, uptr ClassId, CompactPtrT *Array, u32 Size) {
+    DCHECK_LT(ClassId, NumClasses);
+    DCHECK_GT(Size, 0);
+    RegionInfo *Region = getRegionInfo(ClassId);
+    if (ClassId == SizeClassMap::BatchClassId) {
+      ScopedLock L(Region->Mutex);
+      // Constructing a batch group in the free list will use two blocks in
+      // BatchClassId. If we are pushing BatchClassId blocks, we will use the
+      // blocks in the array directly (we can't delegate to the local cache,
+      // which would cause a recursive allocation). However, the number of
+      // free blocks may be less than two. Therefore, populate the free list
+      // before inserting the blocks.
+      if (Size == 1 && UNLIKELY(!populateFreeList(C, ClassId, Region)))
+        return;
+      pushBlocksImpl(C, ClassId, Array, Size);
+      Region->Stats.PushedBlocks += Size;
+      return;
+    }
+
+    // TODO(chiahungduan): Consider not doing grouping if the group size is not
+    // greater than the block size with a certain scale.
+
+    // Sort the blocks so that blocks belonging to the same group can be pushed
+    // together.
+    bool SameGroup = true;
+    for (u32 I = 1; I < Size; ++I) {
+      if (compactPtrGroup(Array[I - 1]) != compactPtrGroup(Array[I]))
+        SameGroup = false;
+      CompactPtrT Cur = Array[I];
+      u32 J = I;
+      while (J > 0 && compactPtrGroup(Cur) < compactPtrGroup(Array[J - 1])) {
+        Array[J] = Array[J - 1];
+        --J;
+      }
+      Array[J] = Cur;
+    }
+
     ScopedLock L(Region->Mutex);
-    Region->FreeList.push_front(B);
-    Region->Stats.PushedBlocks += B->getCount();
+    pushBlocksImpl(C, ClassId, Array, Size, SameGroup);
+
+    Region->Stats.PushedBlocks += Size;
     if (ClassId != SizeClassMap::BatchClassId)
       releaseToOSMaybe(Region, ClassId);
   }
@@ -292,7 +330,7 @@
 
   struct UnpaddedRegionInfo {
     HybridMutex Mutex;
-    SinglyLinkedList<TransferBatch> FreeList;
+    SinglyLinkedList<BatchGroup> FreeList;
     uptr RegionBeg = 0;
     RegionStats Stats = {};
     u32 RandState = 0;
@@ -330,8 +368,192 @@
     return Base + (static_cast<uptr>(CompactPtr) << CompactPtrScale);
   }
 
-  NOINLINE TransferBatch *populateFreeList(CacheT *C, uptr ClassId,
-                                           RegionInfo *Region) {
+  static uptr compactPtrGroup(CompactPtrT CompactPtr) {
+    return CompactPtr >> (GroupSizeLog - CompactPtrScale);
+  }
+
+  // Push the blocks to their batch group. The layout will be like:
+  //
+  //   FreeList -> BG -> BG -> BG
+  //               |     |     |
+  //               v     v     v
+  //               TB    TB    TB
+  //               |
+  //               v
+  //               TB
+  //
+  // Each BatchGroup (BG) is associated with a unique group id, and its free
+  // blocks are managed by a list of TransferBatches (TB). To reduce the time
+  // spent inserting blocks, the BGs are kept sorted and the input `Array` is
+  // expected to be sorted as well, so that maintaining the sorted order is
+  // cheaper.
+  // Use `SameGroup=true` to indicate that all blocks in the array are from the
+  // same group; the group id of each block is then not checked.
+  //
+  // Note that this aims at a better management of dirty pages, i.e., the RSS
+  // usage won't grow indefinitely. There is an exception: we may not put a
+  // block into its associated group. While populating new blocks, the blocks
+  // may cross different groups. However, in most cases they fall into the same
+  // group and are expected to be popped soon. In that case, it's not worth
+  // sorting an almost-sorted array, so we use `SameGroup=true` instead.
+  //
+  // The region mutex needs to be held while calling this method.
+  void pushBlocksImpl(CacheT *C, uptr ClassId, CompactPtrT *Array, u32 Size,
+                      bool SameGroup = false) {
+    DCHECK_GT(Size, 0U);
+    RegionInfo *Region = getRegionInfo(ClassId);
+
+    auto CreateGroup = [&](uptr GroupId) {
+      BatchGroup *BG = nullptr;
+      TransferBatch *TB = nullptr;
+      if (ClassId == SizeClassMap::BatchClassId) {
+        DCHECK_GE(Size, 2U);
+        BG = reinterpret_cast<BatchGroup *>(
+            decompactPtr(ClassId, Array[Size - 1]));
+        BG->Batches.clear();
+
+        TB = reinterpret_cast<TransferBatch *>(
+            decompactPtr(ClassId, Array[Size - 2]));
+        TB->clear();
+      } else {
+        BG = C->createGroup();
+        BG->Batches.clear();
+
+        TB = C->createBatch(ClassId, nullptr);
+        TB->clear();
+      }
+
+      BG->GroupId = GroupId;
+      BG->Batches.push_front(TB);
+      BG->MaxCachedPerBatch =
+          TransferBatch::getMaxCached(getSizeByClassId(ClassId));
+
+      return BG;
+    };
+
+    auto InsertBlocks = [&](BatchGroup *BG, CompactPtrT *Array, u32 Size) {
+      SinglyLinkedList<TransferBatch> &Batches = BG->Batches;
+      TransferBatch *CurBatch = Batches.front();
+      DCHECK_NE(CurBatch, nullptr);
+
+      for (u32 I = 0; I < Size;) {
+        DCHECK_GE(BG->MaxCachedPerBatch, CurBatch->getCount());
+        u16 UnusedSlots =
+            static_cast<u16>(BG->MaxCachedPerBatch - CurBatch->getCount());
+        if (UnusedSlots == 0) {
+          CurBatch = C->createBatch(
+              ClassId,
+              reinterpret_cast<void *>(decompactPtr(ClassId, Array[I])));
+          CurBatch->clear();
+          Batches.push_front(CurBatch);
+          UnusedSlots = BG->MaxCachedPerBatch;
+        }
+        // `UnusedSlots` is u16 so the result will also fit in u16.
+        u16 AppendSize = static_cast<u16>(Min<u32>(UnusedSlots, Size - I));
+        CurBatch->appendFromArray(&Array[I], AppendSize);
+        I += AppendSize;
+      }
+    };
+
+    BatchGroup *Cur = Region->FreeList.front();
+
+    if (ClassId == SizeClassMap::BatchClassId) {
+      if (Cur == nullptr) {
+        // Don't need to classify BatchClassId.
+        Cur = CreateGroup(/*GroupId=*/0);
+        Region->FreeList.push_front(Cur);
+      }
+      InsertBlocks(Cur, Array, Size);
+      return;
+    }
+
+    // In the following, `Cur` always points to the BatchGroup for blocks that
+    // will be pushed next. `Prev` is the element right before `Cur`.
+    BatchGroup *Prev = nullptr;
+
+    while (Cur != nullptr && compactPtrGroup(Array[0]) > Cur->GroupId) {
+      Prev = Cur;
+      Cur = Cur->Next;
+    }
+
+    if (Cur == nullptr || compactPtrGroup(Array[0]) != Cur->GroupId) {
+      Cur = CreateGroup(compactPtrGroup(Array[0]));
+      if (Prev == nullptr)
+        Region->FreeList.push_front(Cur);
+      else
+        Region->FreeList.insert(Prev, Cur);
+    }
+
+    // All the blocks are from the same group; just push them without checking
+    // the group id.
+    if (SameGroup) {
+      InsertBlocks(Cur, Array, Size);
+      return;
+    }
+
+    // The blocks are sorted by group id. Determine the segments of blocks with
+    // the same group id and push each segment to its group together.
+    u32 Count = 1;
+    for (u32 I = 1; I < Size; ++I) {
+      if (compactPtrGroup(Array[I - 1]) != compactPtrGroup(Array[I])) {
+        DCHECK_EQ(compactPtrGroup(Array[I - 1]), Cur->GroupId);
+        InsertBlocks(Cur, Array + I - Count, Count);
+
+        while (Cur != nullptr && compactPtrGroup(Array[I]) > Cur->GroupId) {
+          Prev = Cur;
+          Cur = Cur->Next;
+        }
+
+        if (Cur == nullptr || compactPtrGroup(Array[I]) != Cur->GroupId) {
+          Cur = CreateGroup(compactPtrGroup(Array[I]));
+          DCHECK_NE(Prev, nullptr);
+          Region->FreeList.insert(Prev, Cur);
+        }
+
+        Count = 1;
+      } else {
+        ++Count;
+      }
+    }
+
+    InsertBlocks(Cur, Array + Size - Count, Count);
+  }
+
+  // Pop one TransferBatch from a BatchGroup. The BatchGroup with the smallest
+  // group id will be considered first.
+  //
+  // The region mutex needs to be held while calling this method.
+  TransferBatch *popBatchImpl(CacheT *C, uptr ClassId) {
+    RegionInfo *Region = getRegionInfo(ClassId);
+    if (Region->FreeList.empty())
+      return nullptr;
+
+    SinglyLinkedList<TransferBatch> &Batches =
+        Region->FreeList.front()->Batches;
+    DCHECK(!Batches.empty());
+
+    TransferBatch *B = Batches.front();
+    Batches.pop_front();
+    DCHECK_NE(B, nullptr);
+    DCHECK_GT(B->getCount(), 0U);
+
+    if (Batches.empty()) {
+      BatchGroup *BG = Region->FreeList.front();
+      Region->FreeList.pop_front();
+
+      // We don't keep a BatchGroup with zero blocks, to avoid empty-checking
+      // while allocating. Note that the block used to construct the BatchGroup
+      // is recorded as a free block in the last element of BatchGroup::Batches.
+      // This means that once we pop the last TransferBatch, the block is
+      // implicitly deallocated.
+      if (ClassId != SizeClassMap::BatchClassId)
+        C->deallocate(SizeClassMap::BatchClassId, BG);
+    }
+
+    return B;
+  }
+
+  NOINLINE bool populateFreeList(CacheT *C, uptr ClassId, RegionInfo *Region) {
     const uptr Size = getSizeByClassId(ClassId);
     const u16 MaxCount = TransferBatch::getMaxCached(Size);
@@ -354,7 +576,7 @@
                     RegionSize >> 20, Size);
         Str.output();
       }
-      return nullptr;
+      return false;
     }
     if (MappedUser == 0)
       Region->Data = Data;
@@ -363,8 +585,9 @@
             "scudo:primary",
             MAP_ALLOWNOMEM | MAP_RESIZABLE |
                 (useMemoryTagging<Config>(Options.load()) ? MAP_MEMTAG : 0),
-            &Region->Data)))
-      return nullptr;
+            &Region->Data))) {
+      return false;
+    }
     Region->MappedUser += MapSize;
     C->getStats().add(StatMapped, MapSize);
   }
@@ -387,27 +610,21 @@
     if (ClassId != SizeClassMap::BatchClassId)
       shuffle(ShuffleArray, NumberOfBlocks, &Region->RandState);
     for (u32 I = 0; I < NumberOfBlocks;) {
-      TransferBatch *B =
-          C->createBatch(ClassId, reinterpret_cast<void *>(decompactPtrInternal(
-                                      CompactPtrBase, ShuffleArray[I])));
-      if (UNLIKELY(!B))
-        return nullptr;
       // `MaxCount` is u16 so the result will also fit in u16.
       const u16 N = static_cast<u16>(Min<u32>(MaxCount, NumberOfBlocks - I));
-      B->setFromArray(&ShuffleArray[I], N);
-      Region->FreeList.push_back(B);
+      // Note that the N blocks here may have different group ids. Given that
+      // this only happens when crossing a group-size boundary, instead of
+      // sorting them, treat them as the same group here to avoid sorting the
+      // almost-sorted blocks.
+      pushBlocksImpl(C, ClassId, &ShuffleArray[I], N, /*SameGroup=*/true);
       I += N;
     }
-    TransferBatch *B = Region->FreeList.front();
-    Region->FreeList.pop_front();
-    DCHECK(B);
-    DCHECK_GT(B->getCount(), 0);
 
     const uptr AllocatedUser = Size * NumberOfBlocks;
     C->getStats().add(StatFree, AllocatedUser);
     Region->AllocatedUser += AllocatedUser;
 
-    return B;
+    return true;
   }
 
   void getStats(ScopedString *Str, uptr ClassId, uptr Rss) {
@@ -473,8 +690,11 @@
       return decompactPtrInternal(CompactPtrBase, CompactPtr);
     };
     auto SkipRegion = [](UNUSED uptr RegionIndex) { return false; };
-    releaseFreeMemoryToOS(Region->FreeList, Region->AllocatedUser, 1U,
-                          BlockSize, Recorder, DecompactPtr, SkipRegion);
+    PageReleaseContext Context(BlockSize, RegionSize, /*NumberOfRegions=*/1U);
+    for (BatchGroup &BG : Region->FreeList)
+      Context.markFreeBlocks(BG.Batches, DecompactPtr, Region->RegionBeg);
+
+    releaseFreeMemoryToOS(Context, Recorder, SkipRegion);
 
     if (Recorder.getReleasedRangesCount() > 0) {
       Region->ReleaseInfo.PushedBlocksAtLastRelease =
diff --git a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
--- a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
@@ -525,6 +525,7 @@
   static const scudo::uptr PrimaryCompactPtrScale = 0;
   static const bool PrimaryEnableRandomOffset = true;
   static const scudo::uptr PrimaryMapSizeIncrement = 1UL << 18;
+  static const scudo::uptr PrimaryGroupSizeLog = 18;
   typedef scudo::MapAllocatorNoCache SecondaryCache;
   template <class A> using TSDRegistryT = scudo::TSDRegistrySharedT<A, 1U, 1U>;
diff --git a/compiler-rt/lib/scudo/standalone/tests/list_test.cpp b/compiler-rt/lib/scudo/standalone/tests/list_test.cpp
--- a/compiler-rt/lib/scudo/standalone/tests/list_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/list_test.cpp
@@ -161,6 +161,10 @@
   setList(&L1, X);
   checkList(&L1, X);
 
+  setList(&L1, X, Y);
+  L1.insert(X, Z);
+  checkList(&L1, X, Z, Y);
+
   setList(&L1, X, Y, Z);
   setList(&L2, A, B, C);
   L1.append_back(&L2);
diff --git a/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp b/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp
--- a/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp
@@ -12,8 +12,11 @@
 #include "primary64.h"
 #include "size_class_map.h"
 
+#include <algorithm>
+#include <chrono>
 #include <condition_variable>
 #include <mutex>
+#include <random>
 #include <stdlib.h>
 #include <thread>
 #include <vector>
@@ -24,6 +27,7 @@
 
 struct TestConfig1 {
   static const scudo::uptr PrimaryRegionSizeLog = 18U;
+  static const scudo::uptr PrimaryGroupSizeLog = 18U;
   static const scudo::s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN;
   static const scudo::s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX;
   static const bool MaySupportMemoryTagging = false;
@@ -40,6 +44,7 @@
 #else
   static const scudo::uptr PrimaryRegionSizeLog = 24U;
 #endif
+  static const scudo::uptr PrimaryGroupSizeLog = 20U;
   static const scudo::s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN;
   static const scudo::s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX;
   static const bool MaySupportMemoryTagging = false;
@@ -56,6 +61,7 @@
 #else
   static const scudo::uptr PrimaryRegionSizeLog = 24U;
 #endif
+  static const scudo::uptr PrimaryGroupSizeLog = 20U;
   static const scudo::s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN;
   static const scudo::s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX;
   static const bool MaySupportMemoryTagging = true;
@@ -65,6 +71,23 @@
   static const scudo::uptr PrimaryMapSizeIncrement = 1UL << 18;
 };
 
+struct TestConfig4 {
+#if defined(__mips__)
+  // Unable to allocate greater size on QEMU-user.
+  static const scudo::uptr PrimaryRegionSizeLog = 23U;
+#else
+  static const scudo::uptr PrimaryRegionSizeLog = 24U;
+#endif
+  static const scudo::s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN;
+  static const scudo::s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX;
+  static const bool MaySupportMemoryTagging = true;
+  static const scudo::uptr PrimaryCompactPtrScale = 3U;
+  static const scudo::uptr PrimaryGroupSizeLog = 20U;
+  typedef scudo::u32 PrimaryCompactPtrT;
+  static const bool PrimaryEnableRandomOffset = true;
+  static const scudo::uptr PrimaryMapSizeIncrement = 1UL << 18;
+};
+
 template <typename BaseConfig, typename SizeClassMapT>
 struct Config : public BaseConfig {
   using SizeClassMap = SizeClassMapT;
@@ -100,7 +123,8 @@
 #define SCUDO_TYPED_TEST_ALL_TYPES(FIXTURE, NAME)                              \
   SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig1)                            \
   SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig2)                            \
-  SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig3)
+  SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig3)                            \
+  SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig4)
 #endif
 
 #define SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TYPE)                             \
@@ -153,6 +177,7 @@
   static const scudo::uptr PrimaryCompactPtrScale = 0;
   static const bool PrimaryEnableRandomOffset = true;
   static const scudo::uptr PrimaryMapSizeIncrement = 1UL << 18;
+  static const scudo::uptr PrimaryGroupSizeLog = 20U;
 };
 
 // The 64-bit SizeClassAllocator can be easily OOM'd with small region sizes.
@@ -170,6 +195,8 @@
   std::vector<TransferBatch *> Batches;
   const scudo::uptr ClassId = Primary::SizeClassMap::LargestClassId;
   const scudo::uptr Size = Primary::getSizeByClassId(ClassId);
+  typename Primary::CacheT::CompactPtrT Blocks[TransferBatch::MaxNumCached];
+
   for (scudo::uptr I = 0; I < 10000U; I++) {
     TransferBatch *B = Allocator.popBatch(&Cache, ClassId);
     if (!B) {
@@ -181,8 +208,11 @@
     Batches.push_back(B);
   }
   while (!Batches.empty()) {
-    Allocator.pushBatch(ClassId, Batches.back());
+    TransferBatch *B = Batches.back();
     Batches.pop_back();
+    B->copyToArray(Blocks);
+    Allocator.pushBlocks(&Cache, ClassId, Blocks, B->getCount());
+    Cache.deallocate(Primary::SizeClassMap::BatchClassId, B);
   }
   Cache.destroy(nullptr);
   Allocator.releaseToOS();
@@ -294,3 +324,47 @@
   Cache.destroy(nullptr);
   EXPECT_GT(Allocator->releaseToOS(), 0U);
 }
+
+SCUDO_TYPED_TEST(ScudoPrimaryTest, MemoryGroup) {
+  using Primary = TestAllocator<TypeParam, scudo::DefaultSizeClassMap>;
+  std::unique_ptr<Primary> Allocator(new Primary);
+  Allocator->init(/*ReleaseToOsInterval=*/-1);
+  typename Primary::CacheT Cache;
+  Cache.init(nullptr, Allocator.get());
+  const scudo::uptr Size = 32U;
+  const scudo::uptr ClassId = Primary::SizeClassMap::getClassIdBySize(Size);
+
+  // We will allocate 4 times the group size of memory and release all of it.
+  // We expect the free blocks to be classified into groups. Then we will
+  // allocate the same amount of memory as the group size and expect the
+  // maximum address difference of the blocks to be at most 2 times the group
+  // size. Note that the blocks don't have to fall within a single group,
+  // because the group id is derived by shifting the compact pointer and,
+  // depending on the configuration, the compact pointer may not be aligned to
+  // the group size. As a result, the blocks can span at most two groups.
+  const scudo::uptr GroupSizeMem = (1ULL << Primary::GroupSizeLog);
+  const scudo::uptr PeakAllocationMem = 4 * GroupSizeMem;
+  const scudo::uptr PeakNumberOfAllocations = PeakAllocationMem / Size;
+  const scudo::uptr FinalNumberOfAllocations = GroupSizeMem / Size;
+  std::vector<scudo::uptr> Blocks;
+  std::mt19937 R;
+
+  for (scudo::uptr I = 0; I < PeakNumberOfAllocations; ++I)
+    Blocks.push_back(reinterpret_cast<scudo::uptr>(Cache.allocate(ClassId)));
+
+  std::shuffle(Blocks.begin(), Blocks.end(), R);
+
+  // Release all the allocated blocks, including those held by the local cache.
+  while (!Blocks.empty()) {
+    Cache.deallocate(ClassId, reinterpret_cast<void *>(Blocks.back()));
+    Blocks.pop_back();
+  }
+  Cache.drain();
+
+  for (scudo::uptr I = 0; I < FinalNumberOfAllocations; ++I)
+    Blocks.push_back(reinterpret_cast<scudo::uptr>(Cache.allocate(ClassId)));
+
+  EXPECT_LE(*std::max_element(Blocks.begin(), Blocks.end()) -
+                *std::min_element(Blocks.begin(), Blocks.end()),
+            GroupSizeMem * 2);
+}
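To make the grouping logic easier to follow outside the diff, here is a self-contained sketch of the two steps performed on a push: the insertion sort done in pushBlocks() and the segmented insertion done by pushBlocksImpl() (illustrative only, not part of the patch; groupOf, sortByGroup, pushSegments and the std::map standing in for the sorted BatchGroup free list are simplified stand-ins):

  #include <cassert>
  #include <cstdint>
  #include <map>
  #include <vector>

  using uptr = std::uintptr_t;
  constexpr uptr GroupSizeLog = 20; // 1 MB groups, chosen for the example.

  static uptr groupOf(uptr CompactPtr) { return CompactPtr >> GroupSizeLog; }

  // Step 1: the insertion sort from pushBlocks(), which orders blocks by
  // group id and remembers whether they all share one group.
  static bool sortByGroup(std::vector<uptr> &Array) {
    bool SameGroup = true;
    for (size_t I = 1; I < Array.size(); ++I) {
      if (groupOf(Array[I - 1]) != groupOf(Array[I]))
        SameGroup = false;
      uptr Cur = Array[I];
      size_t J = I;
      while (J > 0 && groupOf(Cur) < groupOf(Array[J - 1])) {
        Array[J] = Array[J - 1];
        --J;
      }
      Array[J] = Cur;
    }
    return SameGroup;
  }

  // Step 2: walk the sorted array and push each same-group segment into its
  // group together, as pushBlocksImpl() does with the BatchGroup list.
  static void pushSegments(const std::vector<uptr> &Array,
                           std::map<uptr, std::vector<uptr>> &FreeList) {
    size_t Begin = 0;
    for (size_t I = 1; I <= Array.size(); ++I) {
      if (I == Array.size() || groupOf(Array[I]) != groupOf(Array[Begin])) {
        auto &Group = FreeList[groupOf(Array[Begin])];
        Group.insert(Group.end(), Array.begin() + Begin, Array.begin() + I);
        Begin = I;
      }
    }
  }

  int main() {
    std::vector<uptr> Blocks = {0x230000, 0x100040, 0x210000, 0x100080};
    const bool SameGroup = sortByGroup(Blocks);
    assert(!SameGroup); // Blocks span groups 1 and 2.
    std::map<uptr, std::vector<uptr>> FreeList;
    pushSegments(Blocks, FreeList);
    assert(FreeList.size() == 2 && FreeList[0x1].size() == 2);
    return 0;
  }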