diff --git a/compiler-rt/lib/scudo/standalone/local_cache.h b/compiler-rt/lib/scudo/standalone/local_cache.h
--- a/compiler-rt/lib/scudo/standalone/local_cache.h
+++ b/compiler-rt/lib/scudo/standalone/local_cache.h
@@ -35,6 +35,15 @@
       // u16 will be promoted to int by arithmetic type conversion.
       Count = static_cast<u16>(Count + N);
     }
+    void appendFromTransferBatch(TransferBatch *B, u16 N) {
+      DCHECK_LE(N, MaxNumCached - Count);
+      DCHECK_GE(B->Count, N);
+      // Append from the back of `B`.
+      memcpy(Batch + Count, B->Batch + (B->Count - N), sizeof(Batch[0]) * N);
+      // u16 will be promoted to int by arithmetic type conversion.
+      Count = static_cast<u16>(Count + N);
+      B->Count = static_cast<u16>(B->Count - N);
+    }
     void clear() { Count = 0; }
     void add(CompactPtrT P) {
       DCHECK_LT(Count, MaxNumCached);
@@ -44,7 +53,7 @@
       memcpy(Array, Batch, sizeof(Batch[0]) * Count);
     }
     u16 getCount() const { return Count; }
-    CompactPtrT *getRawArray() { return Batch; }
+    bool isEmpty() const { return Count == 0U; }
     CompactPtrT get(u16 I) const {
       DCHECK_LE(I, Count);
       return Batch[I];
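
The appendFromTransferBatch() helper added above moves the last N compact pointers out of a donor batch onto the end of the receiver, so two partially filled batches can be merged without copying any element twice. Below is a minimal, self-contained sketch of the same move-from-the-back idea; MiniBatch, appendFromBack() and the use of uint32_t in place of CompactPtrT are illustrative stand-ins, not Scudo code.

// Simplified stand-in for TransferBatch, with plain asserts instead of DCHECKs.
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstring>

struct MiniBatch {
  static constexpr uint16_t MaxNumCached = 8;
  uint32_t Slots[MaxNumCached];
  uint16_t Count = 0;

  void add(uint32_t P) {
    assert(Count < MaxNumCached);
    Slots[Count++] = P;
  }
  // Move the last N elements of `B` onto the end of this batch, mirroring the
  // appendFromTransferBatch() added above: the donor shrinks, the receiver
  // grows, and no element is touched twice.
  void appendFromBack(MiniBatch *B, uint16_t N) {
    assert(N <= MaxNumCached - Count);
    assert(B->Count >= N);
    std::memcpy(Slots + Count, B->Slots + (B->Count - N), sizeof(Slots[0]) * N);
    Count = static_cast<uint16_t>(Count + N);
    B->Count = static_cast<uint16_t>(B->Count - N);
  }
  bool isEmpty() const { return Count == 0; }
};

int main() {
  MiniBatch A, B;
  for (uint32_t I = 0; I < 3; ++I) A.add(I);    // A holds 0, 1, 2
  for (uint32_t I = 10; I < 16; ++I) B.add(I);  // B holds 10..15
  A.appendFromBack(&B, 5);                      // A takes 11..15, B keeps 10
  std::printf("A.Count=%u B.Count=%u\n", static_cast<unsigned>(A.Count),
              static_cast<unsigned>(B.Count));  // prints A.Count=8 B.Count=1
  return 0;
}

Taking elements from the back keeps the donor's remaining elements contiguous at the front, which is what lets the merge logic later in this patch treat only the front TransferBatch of each group as possibly non-full.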
diff --git a/compiler-rt/lib/scudo/standalone/primary64.h b/compiler-rt/lib/scudo/standalone/primary64.h
--- a/compiler-rt/lib/scudo/standalone/primary64.h
+++ b/compiler-rt/lib/scudo/standalone/primary64.h
@@ -984,8 +984,6 @@
   NOINLINE uptr releaseToOSMaybe(RegionInfo *Region, uptr ClassId,
                                  ReleaseToOS ReleaseType = ReleaseToOS::Normal)
       REQUIRES(Region->MMLock) EXCLUDES(Region->FLLock) {
-    // TODO(chiahungduan): Release `FLLock` in step 3 & 4 described in the
-    // comment below.
     ScopedLock L(Region->FLLock);
 
     const uptr BlockSize = getSizeByClassId(ClassId);
@@ -1010,10 +1008,6 @@
       return 0;
     }
 
-    // This is only used for debugging to ensure the consistency of the number
-    // of groups.
-    uptr NumberOfBatchGroups = Region->FreeListInfo.BlockList.size();
-
     // ====================================================================== //
     // 2. Determine which groups can release the pages. Use a heuristic to
     //    gather groups that are candidates for doing a release.
@@ -1029,39 +1023,71 @@
     if (GroupsToRelease.empty())
       return 0;
 
-    // ====================================================================== //
-    // 3. Mark the free blocks in `GroupsToRelease` in the `PageReleaseContext`.
-    //    Then we can tell which pages are in-use by querying
-    //    `PageReleaseContext`.
-    // ====================================================================== //
-    PageReleaseContext Context = markFreeBlocks(
-        Region, BlockSize, AllocatedUserEnd, CompactPtrBase, GroupsToRelease);
-    if (UNLIKELY(!Context.hasBlockMarked())) {
-      mergeGroupsToReleaseBack(Region, GroupsToRelease, NumberOfBatchGroups);
-      return 0;
-    }
-
+    // Ideally, we should use a class like `ScopedUnlock`. However, this form of
+    // unlocking is not supported by the thread-safety analysis. See
+    // https://clang.llvm.org/docs/ThreadSafetyAnalysis.html#no-alias-analysis
+    // for more details.
+    // Put it as local class so that we can mark the ctor/dtor with proper
+    // annotations associated to the target lock. Note that accessing the
+    // function variable in local class only works in thread-safety annotations.
+    // TODO: Implement general `ScopedUnlock` when it's supported.
+    class FLLockScopedUnlock {
+    public:
+      FLLockScopedUnlock(RegionInfo *Region) RELEASE(Region->FLLock)
+          : R(Region) {
+        R->FLLock.assertHeld();
+        R->FLLock.unlock();
+      }
+      ~FLLockScopedUnlock() ACQUIRE(Region->FLLock) { R->FLLock.lock(); }
-    // ====================================================================== //
-    // 4. Release the unused physical pages back to the OS.
-    // ====================================================================== //
-    RegionReleaseRecorder<MemMapT> Recorder(&Region->MemMapInfo.MemMap,
-                                            Region->RegionBeg,
-                                            Context.getReleaseOffset());
-    auto SkipRegion = [](UNUSED uptr RegionIndex) { return false; };
-    releaseFreeMemoryToOS(Context, Recorder, SkipRegion);
-    if (Recorder.getReleasedRangesCount() > 0) {
-      Region->ReleaseInfo.BytesInFreeListAtLastCheckpoint = BytesInFreeList;
-      Region->ReleaseInfo.RangesReleased += Recorder.getReleasedRangesCount();
-      Region->ReleaseInfo.LastReleasedBytes = Recorder.getReleasedBytes();
+
+    private:
+      RegionInfo *R;
+    };
+
+    // Note that we have extracted the `GroupsToRelease` from region freelist.
+    // It's safe to let pushBlocks()/popBatches() access the remaining region
+    // freelist. In the steps 3 and 4, we will temporarily release the FLLock
+    // and lock it again before step 5.
+
+    uptr ReleasedBytes = 0;
+    {
+      FLLockScopedUnlock UL(Region);
+      // ==================================================================== //
+      // 3. Mark the free blocks in `GroupsToRelease` in the
+      //    `PageReleaseContext`. Then we can tell which pages are in-use by
+      //    querying `PageReleaseContext`.
+      // ==================================================================== //
+      PageReleaseContext Context = markFreeBlocks(
+          Region, BlockSize, AllocatedUserEnd, CompactPtrBase, GroupsToRelease);
+      if (UNLIKELY(!Context.hasBlockMarked())) {
+        ScopedLock L(Region->FLLock);
+        mergeGroupsToReleaseBack(Region, GroupsToRelease);
+        return 0;
+      }
+
+      // ==================================================================== //
+      // 4. Release the unused physical pages back to the OS.
+      // ==================================================================== //
+      RegionReleaseRecorder<MemMapT> Recorder(&Region->MemMapInfo.MemMap,
+                                              Region->RegionBeg,
+                                              Context.getReleaseOffset());
+      auto SkipRegion = [](UNUSED uptr RegionIndex) { return false; };
+      releaseFreeMemoryToOS(Context, Recorder, SkipRegion);
+      if (Recorder.getReleasedRangesCount() > 0) {
+        Region->ReleaseInfo.BytesInFreeListAtLastCheckpoint = BytesInFreeList;
+        Region->ReleaseInfo.RangesReleased += Recorder.getReleasedRangesCount();
+        Region->ReleaseInfo.LastReleasedBytes = Recorder.getReleasedBytes();
+      }
+      Region->ReleaseInfo.LastReleaseAtNs = getMonotonicTimeFast();
+      ReleasedBytes = Recorder.getReleasedBytes();
     }
-    Region->ReleaseInfo.LastReleaseAtNs = getMonotonicTimeFast();
 
     // ====================================================================== //
     // 5. Merge the `GroupsToRelease` back to the freelist.
     // ====================================================================== //
-    mergeGroupsToReleaseBack(Region, GroupsToRelease, NumberOfBatchGroups);
+    mergeGroupsToReleaseBack(Region, GroupsToRelease);
 
-    return Recorder.getReleasedBytes();
+    return ReleasedBytes;
   }
 
   bool hasChanceToReleasePages(RegionInfo *Region, uptr BlockSize,
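
The releaseToOSMaybe() hunk above keeps FLLock only for the cheap bookkeeping (detaching candidate groups in steps 1-2 and re-merging them in step 5) and drops it around the expensive marking and release work in steps 3-4, using the function-local FLLockScopedUnlock so its ctor/dtor can carry the RELEASE/ACQUIRE thread-safety annotations. The following is a minimal sketch of that unlock-around-the-expensive-part shape only; it uses std::mutex instead of Scudo's HybridMutex, omits the annotations, and ScopedUnlock, releaseMaybe() and FreeListSize are illustrative names, not part of the patch.

#include <cstdio>
#include <mutex>

std::mutex FLLock;
int FreeListSize = 100; // stand-in for per-region free-list state

// Inverse of std::lock_guard: unlock on construction, relock on destruction.
class ScopedUnlock {
public:
  explicit ScopedUnlock(std::mutex &M) : M_(M) { M_.unlock(); }
  ~ScopedUnlock() { M_.lock(); }

private:
  std::mutex &M_;
};

int releaseMaybe() {
  std::lock_guard<std::mutex> L(FLLock);
  // Steps 1-2: inspect the free list and detach candidate groups while the
  // lock is held (cheap).
  int Detached = FreeListSize / 2;
  FreeListSize -= Detached;

  int Released = 0;
  {
    ScopedUnlock UL(FLLock);
    // Steps 3-4: mark blocks and return pages to the OS without blocking
    // other threads that push/pop on the remaining free list (expensive).
    Released = Detached;
  }

  // Step 5: merge the detached groups back, again under the lock.
  FreeListSize += Detached;
  return Released;
}

int main() {
  std::printf("released %d\n", releaseMaybe());
  return 0;
}

As the comment in the patch explains, the general ScopedUnlock form trips over the analysis's lack of alias analysis, which is why the patch defines the helper as a local class whose annotations name Region->FLLock directly.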
@@ -1298,7 +1324,7 @@
   markFreeBlocks(RegionInfo *Region, const uptr BlockSize,
                  const uptr AllocatedUserEnd, const uptr CompactPtrBase,
                  SinglyLinkedList<BatchGroup> &GroupsToRelease)
-      REQUIRES(Region->MMLock, Region->FLLock) {
+      REQUIRES(Region->MMLock) EXCLUDES(Region->FLLock) {
     const uptr GroupSize = (1U << GroupSizeLog);
     auto DecompactPtr = [CompactPtrBase](CompactPtrT CompactPtr) {
       return decompactPtrInternal(CompactPtrBase, CompactPtr);
@@ -1347,9 +1373,12 @@
                              BG.Batches.front()->getCount();
 
       if (NumBlocks == MaxContainedBlocks) {
-        for (const auto &It : BG.Batches)
+        for (const auto &It : BG.Batches) {
+          if (&It != BG.Batches.front())
+            DCHECK_EQ(It.getCount(), BG.MaxCachedPerBatch);
           for (u16 I = 0; I < It.getCount(); ++I)
             DCHECK_EQ(compactPtrGroup(It.get(I)), BG.CompactPtrGroupBase);
+        }
 
         Context.markRangeAsAllCounted(BatchGroupBase, BatchGroupUsedEnd,
                                       Region->RegionBeg, /*RegionIndex=*/0,
@@ -1371,9 +1400,22 @@
   }
 
   void mergeGroupsToReleaseBack(RegionInfo *Region,
-                                SinglyLinkedList<BatchGroup> &GroupsToRelease,
-                                const uptr NumberOfBatchGroups)
+                                SinglyLinkedList<BatchGroup> &GroupsToRelease)
       REQUIRES(Region->MMLock, Region->FLLock) {
+    // After merging two freelists, we may have redundant `BatchGroup`s that
+    // need to be recycled. The number of unused `BatchGroup`s is expected to be
+    // small. Pick a constant which is inferred from real programs.
+    constexpr uptr MaxUnusedSize = 8;
+    CompactPtrT Blocks[MaxUnusedSize];
+    u32 Idx = 0;
+    RegionInfo *BatchClassRegion = getRegionInfo(SizeClassMap::BatchClassId);
+    // We can't call pushBatchClassBlocks() to recycle the unused `BatchGroup`s
+    // when we are manipulating the freelist of `BatchClassRegion`. Instead, we
+    // should just push it back to the freelist when we merge two `BatchGroup`s.
+    // This logic hasn't been implemented because we haven't supported releasing
+    // pages in `BatchClassRegion`.
+    DCHECK_NE(BatchClassRegion, Region);
+
     // Merge GroupsToRelease back to the Region::FreeListInfo.BlockList. Note
     // that both `Region->FreeListInfo.BlockList` and `GroupsToRelease` are
     // sorted.
@@ -1386,8 +1428,7 @@
         break;
       }
 
-      DCHECK_NE(BG->CompactPtrGroupBase,
-                GroupsToRelease.front()->CompactPtrGroupBase);
+      DCHECK(!BG->Batches.empty());
 
       if (BG->CompactPtrGroupBase <
           GroupsToRelease.front()->CompactPtrGroupBase) {
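
The mergeGroupsToReleaseBack() preamble above sets up a small fixed buffer (MaxUnusedSize slots) so that BatchGroups and TransferBatches made redundant by the merge are recycled to the batch-class freelist in bulk rather than with one lock acquisition each: the buffer is flushed whenever the next group needs more slots than remain, and once more after the loop. Below is a self-contained sketch of that collect-then-flush pattern, with pushBatchClassBlocks(), BatchClassLock and recycle() as simplified stand-ins for the real batch-class freelist plumbing.

#include <cstdint>
#include <cstdio>
#include <mutex>

constexpr uint32_t MaxUnusedSize = 8;
std::mutex BatchClassLock;
uint32_t TotalRecycled = 0;

void pushBatchClassBlocks(const uint64_t *Blocks, uint32_t Count) {
  // In the patch this runs under BatchClassRegion->FLLock.
  std::lock_guard<std::mutex> L(BatchClassLock);
  TotalRecycled += Count;
  (void)Blocks;
}

void recycle(const uint64_t *Unused, uint32_t N) {
  uint64_t Blocks[MaxUnusedSize];
  uint32_t Idx = 0;
  for (uint32_t I = 0; I < N; ++I) {
    // Each merged group frees 1 slot (the BatchGroup) or 2 (plus an emptied
    // TransferBatch); assume 1 here for simplicity.
    const uint32_t NeededSlots = 1;
    if (Idx + NeededSlots > MaxUnusedSize) {
      pushBatchClassBlocks(Blocks, Idx); // flush when the buffer is full
      Idx = 0;
    }
    Blocks[Idx++] = Unused[I];
  }
  if (Idx != 0) // flush the remainder at the end
    pushBatchClassBlocks(Blocks, Idx);
}

int main() {
  uint64_t Unused[20];
  for (uint32_t I = 0; I < 20; ++I)
    Unused[I] = I;
  recycle(Unused, 20);
  std::printf("recycled %u blocks\n", TotalRecycled); // prints: recycled 20 blocks
  return 0;
}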
@@ -1396,6 +1437,65 @@
         continue;
       }
 
+      BatchGroup *Cur = GroupsToRelease.front();
+      TransferBatch *UnusedTransferBatch = nullptr;
+      GroupsToRelease.pop_front();
+
+      if (BG->CompactPtrGroupBase == Cur->CompactPtrGroupBase) {
+        BG->PushedBlocks += Cur->PushedBlocks;
+        // We have updated `BatchGroup::BytesInBGAtLastCheckpoint` while
+        // collecting the `GroupsToRelease`.
+        BG->BytesInBGAtLastCheckpoint = Cur->BytesInBGAtLastCheckpoint;
+        const uptr MaxCachedPerBatch = BG->MaxCachedPerBatch;
+
+        // Note that the first TransferBatches in both `Batches` may not be
+        // full and only the first TransferBatch can have non-full blocks. Thus
+        // we have to merge them before appending one to another.
+        if (Cur->Batches.front()->getCount() == MaxCachedPerBatch) {
+          BG->Batches.append_back(&Cur->Batches);
+        } else {
+          TransferBatch *NonFullBatch = Cur->Batches.front();
+          Cur->Batches.pop_front();
+          const u16 NonFullBatchCount = NonFullBatch->getCount();
+          // The remaining Batches in `Cur` are full.
+          BG->Batches.append_back(&Cur->Batches);
+
+          if (BG->Batches.front()->getCount() == MaxCachedPerBatch) {
+            // Only 1 non-full TransferBatch, push it to the front and append
+            // the remaining.
+            BG->Batches.push_front(NonFullBatch);
+          } else {
+            const u16 NumBlocksToMove = static_cast<u16>(
+                Min(static_cast<u16>(MaxCachedPerBatch -
+                                     BG->Batches.front()->getCount()),
+                    NonFullBatchCount));
+            BG->Batches.front()->appendFromTransferBatch(NonFullBatch,
+                                                         NumBlocksToMove);
+            if (NonFullBatch->isEmpty())
+              UnusedTransferBatch = NonFullBatch;
+            else
+              BG->Batches.push_front(NonFullBatch);
+          }
+        }
+
+        const u32 NeededSlots = UnusedTransferBatch == nullptr ? 1U : 2U;
+        if (UNLIKELY(Idx + NeededSlots > MaxUnusedSize)) {
+          ScopedLock L(BatchClassRegion->FLLock);
+          pushBatchClassBlocks(BatchClassRegion, Blocks, Idx);
+          Idx = 0;
+        }
+        Blocks[Idx++] =
+            compactPtr(SizeClassMap::BatchClassId, reinterpret_cast<uptr>(Cur));
+        if (UnusedTransferBatch) {
+          Blocks[Idx++] =
+              compactPtr(SizeClassMap::BatchClassId,
+                         reinterpret_cast<uptr>(UnusedTransferBatch));
+        }
+        Prev = BG;
+        BG = BG->Next;
+        continue;
+      }
+
       // At here, the `BG` is the first BatchGroup with CompactPtrGroupBase
       // larger than the first element in `GroupsToRelease`. We need to insert
       // `GroupsToRelease::front()` (which is `Cur` below) before `BG`.
@@ -1406,8 +1506,6 @@
       //
       // Afterwards, we don't need to advance `BG` because the order between
       // `BG` and the new `GroupsToRelease::front()` hasn't been checked.
-      BatchGroup *Cur = GroupsToRelease.front();
-      GroupsToRelease.pop_front();
       if (Prev == nullptr)
         Region->FreeListInfo.BlockList.push_front(Cur);
       else
@@ -1416,8 +1514,10 @@
       Prev = Cur;
     }
 
-    DCHECK_EQ(Region->FreeListInfo.BlockList.size(), NumberOfBatchGroups);
-    (void)NumberOfBatchGroups;
+    if (Idx != 0) {
+      ScopedLock L(BatchClassRegion->FLLock);
+      pushBatchClassBlocks(BatchClassRegion, Blocks, Idx);
+    }
 
     if (SCUDO_DEBUG) {
       BatchGroup *Prev = Region->FreeListInfo.BlockList.front();
diff --git a/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp b/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp
--- a/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp
@@ -308,10 +308,18 @@
         if (P)
           V.push_back(std::make_pair(ClassId, P));
       }
+
+      // Try to interleave pushBlocks(), popBatch() and releaseToOS().
+      Allocator->releaseToOS(scudo::ReleaseToOS::Force);
+
       while (!V.empty()) {
         auto Pair = V.back();
         Cache.deallocate(Pair.first, Pair.second);
         V.pop_back();
+        // This increases the chance of having non-full TransferBatches and it
+        // will jump into the code path of merging TransferBatches.
+        if (std::rand() % 8 == 0)
+          Cache.drain();
       }
       Cache.destroy(nullptr);
     });
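
For reference, the loop built up in the @@ -1396 and @@ -1406 hunks of primary64.h above is a merge of two lists sorted by CompactPtrGroupBase: a group already present in the region freelist absorbs the incoming group's batches (and the freed headers are recycled), while anything else is spliced in front of the first group with a larger base. The following is a simplified, self-contained sketch of that sorted-merge control flow only; Group, mergeBack() and the integer keys are illustrative stand-ins for BatchGroup and its compact group bases.

#include <cstdint>
#include <cstdio>

struct Group {
  uint64_t GroupBase;
  uint32_t Blocks;
  Group *Next;
};

// Merges `ToRelease` (sorted) back into `Head` (sorted), combining groups
// that share the same GroupBase. Returns the new head of the list.
Group *mergeBack(Group *Head, Group *ToRelease) {
  Group *Prev = nullptr;
  Group *BG = Head;
  while (ToRelease != nullptr) {
    Group *Cur = ToRelease;
    if (BG != nullptr && BG->GroupBase < Cur->GroupBase) {
      Prev = BG;
      BG = BG->Next; // keep scanning the resident list
      continue;
    }
    ToRelease = ToRelease->Next;
    if (BG != nullptr && BG->GroupBase == Cur->GroupBase) {
      BG->Blocks += Cur->Blocks; // same group: merge in place and retire `Cur`
      // (the patch would recycle `Cur` to the batch-class freelist here)
      continue;
    }
    // Splice `Cur` in before the first group with a larger base (or at the end).
    Cur->Next = BG;
    if (Prev == nullptr)
      Head = Cur;
    else
      Prev->Next = Cur;
    Prev = Cur;
  }
  return Head;
}

int main() {
  Group B{30, 1, nullptr};
  Group A{10, 1, &B};      // resident list: 10 -> 30
  Group Y{30, 3, nullptr};
  Group X{20, 2, &Y};      // groups to merge back: 20 -> 30
  for (Group *G = mergeBack(&A, &X); G != nullptr; G = G->Next)
    std::printf("base=%llu blocks=%u\n",
                static_cast<unsigned long long>(G->GroupBase), G->Blocks);
  // Expected output: base=10 blocks=1, base=20 blocks=2, base=30 blocks=4
  return 0;
}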