diff --git a/compiler-rt/lib/scudo/standalone/allocator_config.h b/compiler-rt/lib/scudo/standalone/allocator_config.h
--- a/compiler-rt/lib/scudo/standalone/allocator_config.h
+++ b/compiler-rt/lib/scudo/standalone/allocator_config.h
@@ -34,6 +34,14 @@
 //   typedef SizeClassAllocator64<ExampleConfig> Primary;
 //   // Log2 of the size of a size class region, as used by the Primary.
 //   static const uptr PrimaryRegionSizeLog = 30U;
+//   // Log2 of the size of a block group, as used by the Primary. Each group
+//   // covers a range of memory addresses; blocks in that range belong to the
+//   // same group. In general, a single region uses a group size of 1 or 2 MB,
+//   // while multiple regions use a group size equal to the region size,
+//   // because the region size is usually smaller than 1 MB.
+//   // A smaller value gives finer-grained control of memory usage, but the
+//   // trade-off is that deallocation may take longer.
+//   static const uptr PrimaryGroupSizeLog = 20U;
 //   // Defines the type and scale of a compact pointer. A compact pointer can
 //   // be understood as the offset of a pointer within the region it belongs
 //   // to, in increments of a power-of-2 scale.
@@ -65,6 +73,7 @@
 #if SCUDO_CAN_USE_PRIMARY64
   typedef SizeClassAllocator64<DefaultConfig> Primary;
   static const uptr PrimaryRegionSizeLog = 32U;
+  static const uptr PrimaryGroupSizeLog = 21U;
   typedef uptr PrimaryCompactPtrT;
   static const uptr PrimaryCompactPtrScale = 0;
   static const bool PrimaryEnableRandomOffset = true;
@@ -72,6 +81,7 @@
 #else
   typedef SizeClassAllocator32<DefaultConfig> Primary;
   static const uptr PrimaryRegionSizeLog = 19U;
+  static const uptr PrimaryGroupSizeLog = 19U;
   typedef uptr PrimaryCompactPtrT;
 #endif
   static const s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN;
@@ -96,11 +106,13 @@
   static const uptr PrimaryRegionSizeLog = 28U;
   typedef u32 PrimaryCompactPtrT;
   static const uptr PrimaryCompactPtrScale = SCUDO_MIN_ALIGNMENT_LOG;
+  static const uptr PrimaryGroupSizeLog = 20U;
   static const bool PrimaryEnableRandomOffset = true;
   static const uptr PrimaryMapSizeIncrement = 1UL << 18;
 #else
   typedef SizeClassAllocator32<AndroidConfig> Primary;
   static const uptr PrimaryRegionSizeLog = 18U;
+  static const uptr PrimaryGroupSizeLog = 18U;
   typedef uptr PrimaryCompactPtrT;
 #endif
   static const s32 PrimaryMinReleaseToOsIntervalMs = 1000;
@@ -127,11 +139,13 @@
   static const uptr PrimaryRegionSizeLog = 27U;
   typedef u32 PrimaryCompactPtrT;
   static const uptr PrimaryCompactPtrScale = SCUDO_MIN_ALIGNMENT_LOG;
+  static const uptr PrimaryGroupSizeLog = 18U;
   static const bool PrimaryEnableRandomOffset = true;
   static const uptr PrimaryMapSizeIncrement = 1UL << 18;
 #else
   typedef SizeClassAllocator32<AndroidSvelteConfig> Primary;
   static const uptr PrimaryRegionSizeLog = 16U;
+  static const uptr PrimaryGroupSizeLog = 16U;
   typedef uptr PrimaryCompactPtrT;
 #endif
   static const s32 PrimaryMinReleaseToOsIntervalMs = 1000;
@@ -156,6 +170,7 @@
   typedef SizeClassAllocator64<FuchsiaConfig> Primary;
   static const uptr PrimaryRegionSizeLog = 30U;
+  static const uptr PrimaryGroupSizeLog = 30U;
   typedef u32 PrimaryCompactPtrT;
   static const bool PrimaryEnableRandomOffset = true;
   static const uptr PrimaryMapSizeIncrement = 1UL << 18;
@@ -175,6 +190,7 @@
   typedef SizeClassAllocator64<TrustyConfig> Primary;
   // Some apps have 1 page of heap total so small regions are necessary.
   static const uptr PrimaryRegionSizeLog = 10U;
+  static const uptr PrimaryGroupSizeLog = 10U;
   typedef u32 PrimaryCompactPtrT;
   static const bool PrimaryEnableRandomOffset = false;
   // Trusty is extremely memory-constrained so minimally round up map calls.
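To make the new PrimaryGroupSizeLog knob concrete, here is a small self-contained sketch of how a group id is derived from a compact pointer (illustrative only, not part of the patch; the helper name groupIdForCompactPtr and the sample constants are made up, assuming a 1 MB group size and a 16-byte compact-pointer scale):

  #include <cassert>
  #include <cstdint>

  using uptr = std::uintptr_t;

  // Hypothetical stand-ins for Config::PrimaryGroupSizeLog and the compact
  // pointer scale used by SizeClassAllocator64.
  constexpr uptr GroupSizeLog = 20;   // 1 MB groups.
  constexpr uptr CompactPtrScale = 4; // Compact pointers count 16-byte units.

  // Mirrors the shape of compactPtrGroup(): blocks whose addresses fall in
  // the same (1 << GroupSizeLog)-sized range map to the same group id.
  constexpr uptr groupIdForCompactPtr(uptr CompactPtr) {
    return CompactPtr >> (GroupSizeLog - CompactPtrScale);
  }

  int main() {
    const uptr A = 0x100000 >> CompactPtrScale; // First block of a 1 MB group.
    const uptr B = 0x1FFFF0 >> CompactPtrScale; // Last block of that group.
    const uptr C = 0x200000 >> CompactPtrScale; // First block of the next one.
    assert(groupIdForCompactPtr(A) == groupIdForCompactPtr(B));
    assert(groupIdForCompactPtr(A) + 1 == groupIdForCompactPtr(C));
    return 0;
  }
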
diff --git a/compiler-rt/lib/scudo/standalone/list.h b/compiler-rt/lib/scudo/standalone/list.h
--- a/compiler-rt/lib/scudo/standalone/list.h
+++ b/compiler-rt/lib/scudo/standalone/list.h
@@ -110,6 +110,18 @@
     Size--;
   }
 
+  // Insert X immediately after Prev.
+  void insert(T *Prev, T *X) {
+    DCHECK(!empty());
+    DCHECK_NE(Prev, nullptr);
+    DCHECK_NE(X, nullptr);
+    X->Next = Prev->Next;
+    Prev->Next = X;
+    if (Last == Prev)
+      Last = X;
+    ++Size;
+  }
+
   void extract(T *Prev, T *X) {
     DCHECK(!empty());
     DCHECK_NE(Prev, nullptr);
diff --git a/compiler-rt/lib/scudo/standalone/local_cache.h b/compiler-rt/lib/scudo/standalone/local_cache.h
--- a/compiler-rt/lib/scudo/standalone/local_cache.h
+++ b/compiler-rt/lib/scudo/standalone/local_cache.h
@@ -10,6 +10,7 @@
 #define SCUDO_LOCAL_CACHE_H_
 
 #include "internal_defs.h"
+#include "list.h"
 #include "platform.h"
 #include "report.h"
 #include "stats.h"
@@ -27,6 +28,12 @@
       Count = N;
       memcpy(Batch, Array, sizeof(Batch[0]) * Count);
     }
+    void appendFromArray(CompactPtrT *Array, u16 N) {
+      DCHECK_LE(N, MaxNumCached - Count);
+      memcpy(Batch + Count, Array, sizeof(Batch[0]) * N);
+      // u16 will be promoted to int by arithmetic type conversion.
+      Count = static_cast<u16>(Count + N);
+    }
     void clear() { Count = 0; }
     void add(CompactPtrT P) {
       DCHECK_LT(Count, MaxNumCached);
@@ -50,6 +57,22 @@
     u16 Count;
   };
 
+  // A BatchGroup is used to collect blocks. Each group has a group id to
+  // identify the address range the contained blocks belong to.
+  struct BatchGroup {
+    // `Next` is used by IntrusiveList.
+    BatchGroup *Next;
+    // The identifier of this group.
+    uptr GroupId;
+    // Cached value of TransferBatch::getMaxCached().
+    u16 MaxCachedPerBatch;
+    // Blocks are managed as a list of TransferBatches.
+    SinglyLinkedList<TransferBatch> Batches;
+  };
+
+  static_assert(sizeof(BatchGroup) <= sizeof(TransferBatch),
+                "BatchGroup uses the same class size as TransferBatch");
+
   void init(GlobalStats *S, SizeClassAllocator *A) {
     DCHECK(isEmpty());
     Stats.init();
@@ -121,9 +144,18 @@
   TransferBatch *createBatch(uptr ClassId, void *B) {
     if (ClassId != BatchClassId)
       B = allocate(BatchClassId);
+    if (UNLIKELY(!B))
+      reportOutOfMemory(SizeClassAllocator::getSizeByClassId(BatchClassId));
     return reinterpret_cast<TransferBatch *>(B);
   }
 
+  BatchGroup *createGroup() {
+    void *Ptr = allocate(BatchClassId);
+    if (UNLIKELY(!Ptr))
+      reportOutOfMemory(SizeClassAllocator::getSizeByClassId(BatchClassId));
+    return reinterpret_cast<BatchGroup *>(Ptr);
+  }
+
   LocalStats &getStats() { return Stats; }
 
 private:
@@ -182,16 +214,11 @@
 
   NOINLINE void drain(PerClass *C, uptr ClassId) {
     const u16 Count = Min(static_cast<u16>(C->MaxCount / 2), C->Count);
-    TransferBatch *B =
-        createBatch(ClassId, Allocator->decompactPtr(ClassId, C->Chunks[0]));
-    if (UNLIKELY(!B))
-      reportOutOfMemory(SizeClassAllocator::getSizeByClassId(BatchClassId));
-    B->setFromArray(&C->Chunks[0], Count);
+    Allocator->pushBlocks(this, ClassId, &C->Chunks[0], Count);
     // u16 will be promoted to int by arithmetic type conversion.
     C->Count = static_cast<u16>(C->Count - Count);
     for (u16 I = 0; I < C->Count; I++)
       C->Chunks[I] = C->Chunks[I + Count];
-    Allocator->pushBatch(ClassId, B);
   }
 };
diff --git a/compiler-rt/lib/scudo/standalone/primary32.h b/compiler-rt/lib/scudo/standalone/primary32.h
--- a/compiler-rt/lib/scudo/standalone/primary32.h
+++ b/compiler-rt/lib/scudo/standalone/primary32.h
@@ -43,6 +43,7 @@
 public:
   typedef typename Config::PrimaryCompactPtrT CompactPtrT;
   typedef typename Config::SizeClassMap SizeClassMap;
+  static const uptr GroupSizeLog = Config::PrimaryGroupSizeLog;
   // The bytemap can only track UINT8_MAX - 1 classes.
   static_assert(SizeClassMap::LargestClassId <= (UINT8_MAX - 1), "");
   // Regions should be large enough to hold the largest Block.
@@ -51,6 +52,7 @@
   typedef SizeClassAllocator32<Config> ThisT;
   typedef SizeClassAllocatorLocalCache<ThisT> CacheT;
   typedef typename CacheT::TransferBatch TransferBatch;
+  typedef typename CacheT::BatchGroup BatchGroup;
 
   static uptr getSizeByClassId(uptr ClassId) {
     return (ClassId == SizeClassMap::BatchClassId)
@@ -111,30 +113,69 @@
     return reinterpret_cast<void *>(static_cast<uptr>(CompactPtr));
   }
 
+  uptr compactPtrGroup(CompactPtrT CompactPtr) {
+    return CompactPtr >> GroupSizeLog;
+  }
+
   TransferBatch *popBatch(CacheT *C, uptr ClassId) {
     DCHECK_LT(ClassId, NumClasses);
     SizeClassInfo *Sci = getSizeClassInfo(ClassId);
     ScopedLock L(Sci->Mutex);
-    TransferBatch *B = Sci->FreeList.front();
-    if (B) {
-      Sci->FreeList.pop_front();
-    } else {
-      B = populateFreeList(C, ClassId, Sci);
-      if (UNLIKELY(!B))
+    TransferBatch *B = popBatchImpl(C, ClassId);
+    if (UNLIKELY(!B)) {
+      if (UNLIKELY(!populateFreeList(C, ClassId, Sci)))
         return nullptr;
+      B = popBatchImpl(C, ClassId);
+      // If `populateFreeList` succeeded, we are supposed to get free blocks.
+      DCHECK_NE(B, nullptr);
     }
-    DCHECK_GT(B->getCount(), 0);
     Sci->Stats.PoppedBlocks += B->getCount();
     return B;
   }
 
-  void pushBatch(uptr ClassId, TransferBatch *B) {
+  // Push the array of free blocks to their designated batch groups.
+  void pushBlocks(CacheT *C, uptr ClassId, CompactPtrT *Array, u32 Size) {
     DCHECK_LT(ClassId, NumClasses);
-    DCHECK_GT(B->getCount(), 0);
+    DCHECK_GT(Size, 0);
+    SizeClassInfo *Sci = getSizeClassInfo(ClassId);
+    if (ClassId == SizeClassMap::BatchClassId) {
+      ScopedLock L(Sci->Mutex);
+      // Constructing a batch group in the free list will use two blocks in
+      // BatchClassId. If we are pushing BatchClassId blocks, we will use the
+      // blocks in the array directly (we can't delegate to the local cache,
+      // which would cause a recursive allocation). However, the number of
+      // free blocks may be less than two. Therefore, populate the free list
+      // before inserting the blocks.
+      if (Size == 1 && !populateFreeList(C, ClassId, Sci))
+        return;
+      pushBlocksImpl(C, ClassId, Array, Size);
+      Sci->Stats.PushedBlocks += Size;
+      return;
+    }
+
+    // TODO(chiahungduan): Consider not doing grouping if the group size is not
+    // greater than the block size with a certain scale.
+
+    // Sort the blocks so that blocks belonging to the same group can be pushed
+    // together.
+    bool SameGroup = true;
+    for (u32 I = 1; I < Size; ++I) {
+      if (compactPtrGroup(Array[I - 1]) != compactPtrGroup(Array[I]))
+        SameGroup = false;
+      CompactPtrT Cur = Array[I];
+      u32 J = I;
+      while (J > 0 && compactPtrGroup(Cur) < compactPtrGroup(Array[J - 1])) {
+        Array[J] = Array[J - 1];
+        --J;
+      }
+      Array[J] = Cur;
+    }
+
     ScopedLock L(Sci->Mutex);
-    Sci->FreeList.push_front(B);
-    Sci->Stats.PushedBlocks += B->getCount();
+    pushBlocksImpl(C, ClassId, Array, Size, SameGroup);
+
+    Sci->Stats.PushedBlocks += Size;
     if (ClassId != SizeClassMap::BatchClassId)
       releaseToOSMaybe(Sci, ClassId);
   }
@@ -256,7 +297,7 @@
 
   struct alignas(SCUDO_CACHE_LINE_SIZE) SizeClassInfo {
     HybridMutex Mutex;
-    SinglyLinkedList<TransferBatch> FreeList;
+    SinglyLinkedList<BatchGroup> FreeList;
     uptr CurrentRegion;
     uptr CurrentRegionAllocated;
     SizeClassStats Stats;
@@ -328,8 +369,187 @@
     return &SizeClassInfoArray[ClassId];
   }
 
-  NOINLINE TransferBatch *populateFreeList(CacheT *C, uptr ClassId,
-                                           SizeClassInfo *Sci) {
+  // Push the blocks to their batch group. The layout will be like:
+  //
+  //   FreeList -> BG -> BG -> BG
+  //               |     |     |
+  //               v     v     v
+  //               TB    TB    TB
+  //               |
+  //               v
+  //               TB
+  //
+  // Each BatchGroup (BG) is associated with a unique group id, and its free
+  // blocks are managed by a list of TransferBatches (TB). To reduce the time
+  // spent inserting blocks, the BGs are kept sorted and the input `Array` is
+  // expected to be sorted as well, so that maintaining the sorted order is
+  // cheaper.
+  // Use `SameGroup=true` to indicate that all blocks in the array are from the
+  // same group; the group id of each block is then not checked.
+  //
+  // Note that this aims at a better management of dirty pages, i.e., the RSS
+  // usage won't grow indefinitely. There is an exception: we may not put a
+  // block into its associated group. While populating new blocks, the blocks
+  // may cross different groups. However, in most cases they fall into the same
+  // group and are expected to be popped soon. In that case, it's not worth
+  // sorting an almost-sorted array, so we use `SameGroup=true` instead.
+  //
+  // The region mutex needs to be held while calling this method.
+  void pushBlocksImpl(CacheT *C, uptr ClassId, CompactPtrT *Array, u32 Size,
+                      bool SameGroup = false) {
+    DCHECK_GT(Size, 0U);
+    SizeClassInfo *Sci = getSizeClassInfo(ClassId);
+
+    auto CreateGroup = [&](uptr GroupId) {
+      BatchGroup *BG = nullptr;
+      TransferBatch *TB = nullptr;
+      if (ClassId == SizeClassMap::BatchClassId) {
+        DCHECK_GE(Size, 2U);
+        BG = reinterpret_cast<BatchGroup *>(
+            decompactPtr(ClassId, Array[Size - 1]));
+        BG->Batches.clear();
+
+        TB = reinterpret_cast<TransferBatch *>(
+            decompactPtr(ClassId, Array[Size - 2]));
+        TB->clear();
+      } else {
+        BG = C->createGroup();
+        BG->Batches.clear();
+
+        TB = C->createBatch(ClassId, nullptr);
+        TB->clear();
+      }
+
+      BG->GroupId = GroupId;
+      BG->Batches.push_front(TB);
+      BG->MaxCachedPerBatch =
+          TransferBatch::getMaxCached(getSizeByClassId(ClassId));
+
+      return BG;
+    };
+
+    auto InsertBlocks = [&](BatchGroup *BG, CompactPtrT *Array, u32 Size) {
+      SinglyLinkedList<TransferBatch> &Batches = BG->Batches;
+      TransferBatch *CurBatch = Batches.front();
+      DCHECK_NE(CurBatch, nullptr);
+
+      for (u32 I = 0; I < Size;) {
+        DCHECK_GE(BG->MaxCachedPerBatch, CurBatch->getCount());
+        u16 UnusedSlots =
+            static_cast<u16>(BG->MaxCachedPerBatch - CurBatch->getCount());
+        if (UnusedSlots == 0) {
+          CurBatch = C->createBatch(
+              ClassId,
+              reinterpret_cast<void *>(decompactPtr(ClassId, Array[I])));
+          CurBatch->clear();
+          Batches.push_front(CurBatch);
+          UnusedSlots = BG->MaxCachedPerBatch;
+        }
+        // `UnusedSlots` is u16 so the result will also fit in u16.
+        u16 AppendSize = static_cast<u16>(Min<u32>(UnusedSlots, Size - I));
+        CurBatch->appendFromArray(&Array[I], AppendSize);
+        I += AppendSize;
+      }
+    };
+
+    BatchGroup *Cur = Sci->FreeList.front();
+
+    if (ClassId == SizeClassMap::BatchClassId) {
+      if (Cur == nullptr) {
+        // Don't need to classify BatchClassId.
+        Cur = CreateGroup(/*GroupId=*/0);
+        Sci->FreeList.push_front(Cur);
+      }
+      InsertBlocks(Cur, Array, Size);
+      return;
+    }
+
+    // In the following, `Cur` always points to the BatchGroup for blocks that
+    // will be pushed next. `Prev` is the element right before `Cur`.
+    BatchGroup *Prev = nullptr;
+
+    while (Cur != nullptr && compactPtrGroup(Array[0]) > Cur->GroupId) {
+      Prev = Cur;
+      Cur = Cur->Next;
+    }
+
+    if (Cur == nullptr || compactPtrGroup(Array[0]) != Cur->GroupId) {
+      Cur = CreateGroup(compactPtrGroup(Array[0]));
+      if (Prev == nullptr)
+        Sci->FreeList.push_front(Cur);
+      else
+        Sci->FreeList.insert(Prev, Cur);
+    }
+
+    // All the blocks are from the same group; just push them without checking
+    // the group id.
+    if (SameGroup) {
+      InsertBlocks(Cur, Array, Size);
+      return;
+    }
+
+    // The blocks are sorted by group id. Determine the segments of blocks with
+    // the same group id and push each segment to its group together.
+    u32 Count = 1;
+    for (u32 I = 1; I < Size; ++I) {
+      if (compactPtrGroup(Array[I - 1]) != compactPtrGroup(Array[I])) {
+        DCHECK_EQ(compactPtrGroup(Array[I - 1]), Cur->GroupId);
+        InsertBlocks(Cur, Array + I - Count, Count);
+
+        while (Cur != nullptr && compactPtrGroup(Array[I]) > Cur->GroupId) {
+          Prev = Cur;
+          Cur = Cur->Next;
+        }
+
+        if (Cur == nullptr || compactPtrGroup(Array[I]) != Cur->GroupId) {
+          Cur = CreateGroup(compactPtrGroup(Array[I]));
+          DCHECK_NE(Prev, nullptr);
+          Sci->FreeList.insert(Prev, Cur);
+        }
+
+        Count = 1;
+      } else {
+        ++Count;
+      }
+    }
+
+    InsertBlocks(Cur, Array + Size - Count, Count);
+  }
+
+  // Pop one TransferBatch from a BatchGroup. The BatchGroup with the smallest
+  // group id will be considered first.
+  //
+  // The region mutex needs to be held while calling this method.
+  TransferBatch *popBatchImpl(CacheT *C, uptr ClassId) {
+    SizeClassInfo *Sci = getSizeClassInfo(ClassId);
+    if (Sci->FreeList.empty())
+      return nullptr;
+
+    SinglyLinkedList<TransferBatch> &Batches = Sci->FreeList.front()->Batches;
+    DCHECK(!Batches.empty());
+
+    TransferBatch *B = Batches.front();
+    Batches.pop_front();
+    DCHECK_NE(B, nullptr);
+    DCHECK_GT(B->getCount(), 0U);
+
+    if (Batches.empty()) {
+      BatchGroup *BG = Sci->FreeList.front();
+      Sci->FreeList.pop_front();
+
+      // We don't keep a BatchGroup with zero blocks, to avoid empty-checking
+      // while allocating. Note that the block used to construct the BatchGroup
+      // is recorded as a free block in the last element of BatchGroup::Batches.
+      // This means that once we pop the last TransferBatch, the block is
+      // implicitly deallocated.
+      if (ClassId != SizeClassMap::BatchClassId)
+        C->deallocate(SizeClassMap::BatchClassId, BG);
+    }
+
+    return B;
+  }
+
+  NOINLINE bool populateFreeList(CacheT *C, uptr ClassId, SizeClassInfo *Sci) {
     uptr Region;
     uptr Offset;
     // If the size-class currently has a region associated to it, use it. The
@@ -344,7 +564,7 @@
       DCHECK_EQ(Sci->CurrentRegionAllocated, 0U);
       Region = allocateRegion(Sci, ClassId);
       if (UNLIKELY(!Region))
-        return nullptr;
+        return false;
       C->getStats().add(StatMapped, RegionSize);
       Sci->CurrentRegion = Region;
       Offset = 0;
@@ -378,20 +598,15 @@
     if (ClassId != SizeClassMap::BatchClassId)
       shuffle(ShuffleArray, NumberOfBlocks, &Sci->RandState);
     for (u32 I = 0; I < NumberOfBlocks;) {
-      TransferBatch *B =
-          C->createBatch(ClassId, reinterpret_cast<void *>(ShuffleArray[I]));
-      if (UNLIKELY(!B))
-        return nullptr;
       // `MaxCount` is u16 so the result will also fit in u16.
       const u16 N = static_cast<u16>(Min<u32>(MaxCount, NumberOfBlocks - I));
-      B->setFromArray(&ShuffleArray[I], N);
-      Sci->FreeList.push_back(B);
+      // Note that the N blocks here may have different group ids. Given that
+      // this only happens when crossing a group-size boundary, instead of
+      // sorting them, treat them as the same group here to avoid sorting the
+      // almost-sorted blocks.
+      pushBlocksImpl(C, ClassId, &ShuffleArray[I], N, /*SameGroup=*/true);
       I += N;
     }
-    TransferBatch *B = Sci->FreeList.front();
-    Sci->FreeList.pop_front();
-    DCHECK(B);
-    DCHECK_GT(B->getCount(), 0);
 
     const uptr AllocatedUser = Size * NumberOfBlocks;
     C->getStats().add(StatFree, AllocatedUser);
@@ -407,7 +622,7 @@
     }
     Sci->AllocatedUser += AllocatedUser;
 
-    return B;
+    return true;
   }
 
   void getStats(ScopedString *Str, uptr ClassId, uptr Rss) {
@@ -477,8 +692,12 @@
     auto DecompactPtr = [](CompactPtrT CompactPtr) {
       return reinterpret_cast<uptr>(CompactPtr);
     };
-    releaseFreeMemoryToOS(Sci->FreeList, RegionSize, NumberOfRegions, BlockSize,
-                          Recorder, DecompactPtr, SkipRegion);
+    PageReleaseContext Context(BlockSize, RegionSize, NumberOfRegions);
+    for (BatchGroup &BG : Sci->FreeList)
+      Context.markFreeBlocks(BG.Batches, DecompactPtr, Base);
+
+    releaseFreeMemoryToOS(Context, Recorder, SkipRegion);
+
     if (Recorder.getReleasedRangesCount() > 0) {
       Sci->ReleaseInfo.PushedBlocksAtLastRelease = Sci->Stats.PushedBlocks;
       Sci->ReleaseInfo.RangesReleased += Recorder.getReleasedRangesCount();
diff --git a/compiler-rt/lib/scudo/standalone/primary64.h b/compiler-rt/lib/scudo/standalone/primary64.h
--- a/compiler-rt/lib/scudo/standalone/primary64.h
+++ b/compiler-rt/lib/scudo/standalone/primary64.h
@@ -45,10 +45,12 @@
 public:
   typedef typename Config::PrimaryCompactPtrT CompactPtrT;
   static const uptr CompactPtrScale = Config::PrimaryCompactPtrScale;
+  static const uptr GroupSizeLog = Config::PrimaryGroupSizeLog;
   typedef typename Config::SizeClassMap SizeClassMap;
   typedef SizeClassAllocator64<Config> ThisT;
   typedef SizeClassAllocatorLocalCache<ThisT> CacheT;
   typedef typename CacheT::TransferBatch TransferBatch;
+  typedef typename CacheT::BatchGroup BatchGroup;
 
   static uptr getSizeByClassId(uptr ClassId) {
     return (ClassId == SizeClassMap::BatchClassId)
@@ -99,25 +101,61 @@
     DCHECK_LT(ClassId, NumClasses);
     RegionInfo *Region = getRegionInfo(ClassId);
     ScopedLock L(Region->Mutex);
-    TransferBatch *B = Region->FreeList.front();
-    if (B) {
-      Region->FreeList.pop_front();
-    } else {
-      B = populateFreeList(C, ClassId, Region);
-      if (UNLIKELY(!B))
+    TransferBatch *B = popBatchImpl(C, ClassId);
+    if (UNLIKELY(!B)) {
+      if (UNLIKELY(!populateFreeList(C, ClassId, Region)))
         return nullptr;
+      B = popBatchImpl(C, ClassId);
+      // If `populateFreeList` succeeded, we are supposed to get free blocks.
+      DCHECK_NE(B, nullptr);
     }
-    DCHECK_GT(B->getCount(), 0);
     Region->Stats.PoppedBlocks += B->getCount();
     return B;
   }
 
-  void pushBatch(uptr ClassId, TransferBatch *B) {
-    DCHECK_GT(B->getCount(), 0);
+  // Push the array of free blocks to their designated batch groups.
+  void pushBlocks(CacheT *C, uptr ClassId, CompactPtrT *Array, u32 Size) {
+    DCHECK_LT(ClassId, NumClasses);
+    DCHECK_GT(Size, 0);
+    RegionInfo *Region = getRegionInfo(ClassId);
+    if (ClassId == SizeClassMap::BatchClassId) {
+      ScopedLock L(Region->Mutex);
+      // Constructing a batch group in the free list will use two blocks in
+      // BatchClassId. If we are pushing BatchClassId blocks, we will use the
+      // blocks in the array directly (we can't delegate to the local cache,
+      // which would cause a recursive allocation). However, the number of
+      // free blocks may be less than two. Therefore, populate the free list
+      // before inserting the blocks.
+      if (Size == 1 && UNLIKELY(!populateFreeList(C, ClassId, Region)))
+        return;
+      pushBlocksImpl(C, ClassId, Array, Size);
+      Region->Stats.PushedBlocks += Size;
+      return;
+    }
+
+    // TODO(chiahungduan): Consider not doing grouping if the group size is not
+    // greater than the block size with a certain scale.
+
+    // Sort the blocks so that blocks belonging to the same group can be pushed
+    // together.
+    bool SameGroup = true;
+    for (u32 I = 1; I < Size; ++I) {
+      if (compactPtrGroup(Array[I - 1]) != compactPtrGroup(Array[I]))
+        SameGroup = false;
+      CompactPtrT Cur = Array[I];
+      u32 J = I;
+      while (J > 0 && compactPtrGroup(Cur) < compactPtrGroup(Array[J - 1])) {
+        Array[J] = Array[J - 1];
+        --J;
+      }
+      Array[J] = Cur;
+    }
+
     ScopedLock L(Region->Mutex);
-    Region->FreeList.push_front(B);
-    Region->Stats.PushedBlocks += B->getCount();
+    pushBlocksImpl(C, ClassId, Array, Size, SameGroup);
+
+    Region->Stats.PushedBlocks += Size;
     if (ClassId != SizeClassMap::BatchClassId)
       releaseToOSMaybe(Region, ClassId);
   }
@@ -292,7 +330,7 @@
 
   struct UnpaddedRegionInfo {
     HybridMutex Mutex;
-    SinglyLinkedList<TransferBatch> FreeList;
+    SinglyLinkedList<BatchGroup> FreeList;
     uptr RegionBeg = 0;
     RegionStats Stats = {};
     u32 RandState = 0;
@@ -330,8 +368,192 @@
     return Base + (static_cast<uptr>(CompactPtr) << CompactPtrScale);
   }
 
-  NOINLINE TransferBatch *populateFreeList(CacheT *C, uptr ClassId,
-                                           RegionInfo *Region) {
+  static uptr compactPtrGroup(CompactPtrT CompactPtr) {
+    return CompactPtr >> (GroupSizeLog - CompactPtrScale);
+  }
+
+  // Push the blocks to their batch group. The layout will be like:
+  //
+  //   FreeList -> BG -> BG -> BG
+  //               |     |     |
+  //               v     v     v
+  //               TB    TB    TB
+  //               |
+  //               v
+  //               TB
+  //
+  // Each BatchGroup (BG) is associated with a unique group id, and its free
+  // blocks are managed by a list of TransferBatches (TB). To reduce the time
+  // spent inserting blocks, the BGs are kept sorted and the input `Array` is
+  // expected to be sorted as well, so that maintaining the sorted order is
+  // cheaper.
+  // Use `SameGroup=true` to indicate that all blocks in the array are from the
+  // same group; the group id of each block is then not checked.
+  //
+  // Note that this aims at a better management of dirty pages, i.e., the RSS
+  // usage won't grow indefinitely. There is an exception: we may not put a
+  // block into its associated group. While populating new blocks, the blocks
+  // may cross different groups. However, in most cases they fall into the same
+  // group and are expected to be popped soon. In that case, it's not worth
+  // sorting an almost-sorted array, so we use `SameGroup=true` instead.
+  //
+  // The region mutex needs to be held while calling this method.
+  void pushBlocksImpl(CacheT *C, uptr ClassId, CompactPtrT *Array, u32 Size,
+                      bool SameGroup = false) {
+    DCHECK_GT(Size, 0U);
+    RegionInfo *Region = getRegionInfo(ClassId);
+
+    auto CreateGroup = [&](uptr GroupId) {
+      BatchGroup *BG = nullptr;
+      TransferBatch *TB = nullptr;
+      if (ClassId == SizeClassMap::BatchClassId) {
+        DCHECK_GE(Size, 2U);
+        BG = reinterpret_cast<BatchGroup *>(
+            decompactPtr(ClassId, Array[Size - 1]));
+        BG->Batches.clear();
+
+        TB = reinterpret_cast<TransferBatch *>(
+            decompactPtr(ClassId, Array[Size - 2]));
+        TB->clear();
+      } else {
+        BG = C->createGroup();
+        BG->Batches.clear();
+
+        TB = C->createBatch(ClassId, nullptr);
+        TB->clear();
+      }
+
+      BG->GroupId = GroupId;
+      BG->Batches.push_front(TB);
+      BG->MaxCachedPerBatch =
+          TransferBatch::getMaxCached(getSizeByClassId(ClassId));
+
+      return BG;
+    };
+
+    auto InsertBlocks = [&](BatchGroup *BG, CompactPtrT *Array, u32 Size) {
+      SinglyLinkedList<TransferBatch> &Batches = BG->Batches;
+      TransferBatch *CurBatch = Batches.front();
+      DCHECK_NE(CurBatch, nullptr);
+
+      for (u32 I = 0; I < Size;) {
+        DCHECK_GE(BG->MaxCachedPerBatch, CurBatch->getCount());
+        u16 UnusedSlots =
+            static_cast<u16>(BG->MaxCachedPerBatch - CurBatch->getCount());
+        if (UnusedSlots == 0) {
+          CurBatch = C->createBatch(
+              ClassId,
+              reinterpret_cast<void *>(decompactPtr(ClassId, Array[I])));
+          CurBatch->clear();
+          Batches.push_front(CurBatch);
+          UnusedSlots = BG->MaxCachedPerBatch;
+        }
+        // `UnusedSlots` is u16 so the result will also fit in u16.
+        u16 AppendSize = static_cast<u16>(Min<u32>(UnusedSlots, Size - I));
+        CurBatch->appendFromArray(&Array[I], AppendSize);
+        I += AppendSize;
+      }
+    };
+
+    BatchGroup *Cur = Region->FreeList.front();
+
+    if (ClassId == SizeClassMap::BatchClassId) {
+      if (Cur == nullptr) {
+        // Don't need to classify BatchClassId.
+        Cur = CreateGroup(/*GroupId=*/0);
+        Region->FreeList.push_front(Cur);
+      }
+      InsertBlocks(Cur, Array, Size);
+      return;
+    }
+
+    // In the following, `Cur` always points to the BatchGroup for blocks that
+    // will be pushed next. `Prev` is the element right before `Cur`.
+    BatchGroup *Prev = nullptr;
+
+    while (Cur != nullptr && compactPtrGroup(Array[0]) > Cur->GroupId) {
+      Prev = Cur;
+      Cur = Cur->Next;
+    }
+
+    if (Cur == nullptr || compactPtrGroup(Array[0]) != Cur->GroupId) {
+      Cur = CreateGroup(compactPtrGroup(Array[0]));
+      if (Prev == nullptr)
+        Region->FreeList.push_front(Cur);
+      else
+        Region->FreeList.insert(Prev, Cur);
+    }
+
+    // All the blocks are from the same group; just push them without checking
+    // the group id.
+    if (SameGroup) {
+      InsertBlocks(Cur, Array, Size);
+      return;
+    }
+
+    // The blocks are sorted by group id. Determine the segments of blocks with
+    // the same group id and push each segment to its group together.
+    u32 Count = 1;
+    for (u32 I = 1; I < Size; ++I) {
+      if (compactPtrGroup(Array[I - 1]) != compactPtrGroup(Array[I])) {
+        DCHECK_EQ(compactPtrGroup(Array[I - 1]), Cur->GroupId);
+        InsertBlocks(Cur, Array + I - Count, Count);
+
+        while (Cur != nullptr && compactPtrGroup(Array[I]) > Cur->GroupId) {
+          Prev = Cur;
+          Cur = Cur->Next;
+        }
+
+        if (Cur == nullptr || compactPtrGroup(Array[I]) != Cur->GroupId) {
+          Cur = CreateGroup(compactPtrGroup(Array[I]));
+          DCHECK_NE(Prev, nullptr);
+          Region->FreeList.insert(Prev, Cur);
+        }
+
+        Count = 1;
+      } else {
+        ++Count;
+      }
+    }
+
+    InsertBlocks(Cur, Array + Size - Count, Count);
+  }
+
+  // Pop one TransferBatch from a BatchGroup. The BatchGroup with the smallest
+  // group id will be considered first.
+  //
+  // The region mutex needs to be held while calling this method.
+  TransferBatch *popBatchImpl(CacheT *C, uptr ClassId) {
+    RegionInfo *Region = getRegionInfo(ClassId);
+    if (Region->FreeList.empty())
+      return nullptr;
+
+    SinglyLinkedList<TransferBatch> &Batches =
+        Region->FreeList.front()->Batches;
+    DCHECK(!Batches.empty());
+
+    TransferBatch *B = Batches.front();
+    Batches.pop_front();
+    DCHECK_NE(B, nullptr);
+    DCHECK_GT(B->getCount(), 0U);
+
+    if (Batches.empty()) {
+      BatchGroup *BG = Region->FreeList.front();
+      Region->FreeList.pop_front();
+
+      // We don't keep a BatchGroup with zero blocks, to avoid empty-checking
+      // while allocating. Note that the block used to construct the BatchGroup
+      // is recorded as a free block in the last element of BatchGroup::Batches.
+      // This means that once we pop the last TransferBatch, the block is
+      // implicitly deallocated.
+      if (ClassId != SizeClassMap::BatchClassId)
+        C->deallocate(SizeClassMap::BatchClassId, BG);
+    }
+
+    return B;
+  }
+
+  NOINLINE bool populateFreeList(CacheT *C, uptr ClassId, RegionInfo *Region) {
     const uptr Size = getSizeByClassId(ClassId);
     const u16 MaxCount = TransferBatch::getMaxCached(Size);
@@ -354,7 +576,7 @@
                     RegionSize >> 20, Size);
         Str.output();
       }
-      return nullptr;
+      return false;
     }
     if (MappedUser == 0)
       Region->Data = Data;
@@ -363,8 +585,9 @@
             "scudo:primary",
             MAP_ALLOWNOMEM | MAP_RESIZABLE |
                 (useMemoryTagging<Config>(Options.load()) ? MAP_MEMTAG : 0),
-            &Region->Data)))
-      return nullptr;
+            &Region->Data))) {
+      return false;
+    }
     Region->MappedUser += MapSize;
     C->getStats().add(StatMapped, MapSize);
   }
@@ -387,27 +610,21 @@
     if (ClassId != SizeClassMap::BatchClassId)
       shuffle(ShuffleArray, NumberOfBlocks, &Region->RandState);
     for (u32 I = 0; I < NumberOfBlocks;) {
-      TransferBatch *B =
-          C->createBatch(ClassId, reinterpret_cast<void *>(decompactPtrInternal(
-                                      CompactPtrBase, ShuffleArray[I])));
-      if (UNLIKELY(!B))
-        return nullptr;
       // `MaxCount` is u16 so the result will also fit in u16.
       const u16 N = static_cast<u16>(Min<u32>(MaxCount, NumberOfBlocks - I));
-      B->setFromArray(&ShuffleArray[I], N);
-      Region->FreeList.push_back(B);
+      // Note that the N blocks here may have different group ids. Given that
+      // this only happens when crossing a group-size boundary, instead of
+      // sorting them, treat them as the same group here to avoid sorting the
+      // almost-sorted blocks.
+      pushBlocksImpl(C, ClassId, &ShuffleArray[I], N, /*SameGroup=*/true);
       I += N;
     }
-    TransferBatch *B = Region->FreeList.front();
-    Region->FreeList.pop_front();
-    DCHECK(B);
-    DCHECK_GT(B->getCount(), 0);
 
     const uptr AllocatedUser = Size * NumberOfBlocks;
     C->getStats().add(StatFree, AllocatedUser);
     Region->AllocatedUser += AllocatedUser;
 
-    return B;
+    return true;
   }
 
   void getStats(ScopedString *Str, uptr ClassId, uptr Rss) {
@@ -473,8 +690,11 @@
       return decompactPtrInternal(CompactPtrBase, CompactPtr);
     };
     auto SkipRegion = [](UNUSED uptr RegionIndex) { return false; };
-    releaseFreeMemoryToOS(Region->FreeList, Region->AllocatedUser, 1U,
-                          BlockSize, Recorder, DecompactPtr, SkipRegion);
+    PageReleaseContext Context(BlockSize, RegionSize, /*NumberOfRegions=*/1U);
+    for (BatchGroup &BG : Region->FreeList)
+      Context.markFreeBlocks(BG.Batches, DecompactPtr, Region->RegionBeg);
+
+    releaseFreeMemoryToOS(Context, Recorder, SkipRegion);
 
     if (Recorder.getReleasedRangesCount() > 0) {
       Region->ReleaseInfo.PushedBlocksAtLastRelease =
diff --git a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
--- a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
@@ -525,6 +525,7 @@
   static const scudo::uptr PrimaryCompactPtrScale = 0;
   static const bool PrimaryEnableRandomOffset = true;
   static const scudo::uptr PrimaryMapSizeIncrement = 1UL << 18;
+  static const scudo::uptr PrimaryGroupSizeLog = 18;
   typedef scudo::MapAllocatorNoCache SecondaryCache;
   template <class A> using TSDRegistryT = scudo::TSDRegistrySharedT<A, 1U, 1U>;
diff --git a/compiler-rt/lib/scudo/standalone/tests/list_test.cpp b/compiler-rt/lib/scudo/standalone/tests/list_test.cpp
--- a/compiler-rt/lib/scudo/standalone/tests/list_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/list_test.cpp
@@ -161,6 +161,10 @@
   setList(&L1, X);
   checkList(&L1, X);
 
+  setList(&L1, X, Y);
+  L1.insert(X, Z);
+  checkList(&L1, X, Z, Y);
+
   setList(&L1, X, Y, Z);
   setList(&L2, A, B, C);
   L1.append_back(&L2);
diff --git a/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp b/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp
--- a/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp
@@ -12,8 +12,11 @@
 #include "primary64.h"
 #include "size_class_map.h"
 
+#include <algorithm>
+#include <chrono>
 #include <condition_variable>
 #include <mutex>
+#include <random>
 #include <stdlib.h>
 #include <thread>
 #include <vector>
@@ -24,6 +27,7 @@
 
 struct TestConfig1 {
   static const scudo::uptr PrimaryRegionSizeLog = 18U;
+  static const scudo::uptr PrimaryGroupSizeLog = 18U;
   static const scudo::s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN;
   static const scudo::s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX;
   static const bool MaySupportMemoryTagging = false;
@@ -40,6 +44,7 @@
 #else
   static const scudo::uptr PrimaryRegionSizeLog = 24U;
 #endif
+  static const scudo::uptr PrimaryGroupSizeLog = 20U;
   static const scudo::s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN;
   static const scudo::s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX;
   static const bool MaySupportMemoryTagging = false;
@@ -56,6 +61,7 @@
 #else
   static const scudo::uptr PrimaryRegionSizeLog = 24U;
 #endif
+  static const scudo::uptr PrimaryGroupSizeLog = 20U;
   static const scudo::s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN;
   static const scudo::s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX;
   static const bool MaySupportMemoryTagging = true;
@@ -65,6 +71,23 @@
   static const scudo::uptr PrimaryMapSizeIncrement = 1UL << 18;
 };
 
+struct TestConfig4 {
+#if defined(__mips__)
+  // Unable to allocate greater size on QEMU-user.
+  static const scudo::uptr PrimaryRegionSizeLog = 23U;
+#else
+  static const scudo::uptr PrimaryRegionSizeLog = 24U;
+#endif
+  static const scudo::s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN;
+  static const scudo::s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX;
+  static const bool MaySupportMemoryTagging = true;
+  static const scudo::uptr PrimaryCompactPtrScale = 3U;
+  static const scudo::uptr PrimaryGroupSizeLog = 20U;
+  typedef scudo::u32 PrimaryCompactPtrT;
+  static const bool PrimaryEnableRandomOffset = true;
+  static const scudo::uptr PrimaryMapSizeIncrement = 1UL << 18;
+};
+
 template <typename BaseConfig, typename SizeClassMapT>
 struct Config : public BaseConfig {
   using SizeClassMap = SizeClassMapT;
@@ -100,7 +123,8 @@
 #define SCUDO_TYPED_TEST_ALL_TYPES(FIXTURE, NAME)                              \
   SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig1)                            \
   SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig2)                            \
-  SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig3)
+  SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig3)                            \
+  SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig4)
 #endif
 
 #define SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TYPE)                             \
@@ -153,6 +177,7 @@
   static const scudo::uptr PrimaryCompactPtrScale = 0;
   static const bool PrimaryEnableRandomOffset = true;
   static const scudo::uptr PrimaryMapSizeIncrement = 1UL << 18;
+  static const scudo::uptr PrimaryGroupSizeLog = 20U;
 };
 
 // The 64-bit SizeClassAllocator can be easily OOM'd with small region sizes.
@@ -170,6 +195,8 @@
   std::vector<TransferBatch *> Batches;
   const scudo::uptr ClassId = Primary::SizeClassMap::LargestClassId;
   const scudo::uptr Size = Primary::getSizeByClassId(ClassId);
+  typename Primary::CacheT::CompactPtrT Blocks[TransferBatch::MaxNumCached];
+
   for (scudo::uptr I = 0; I < 10000U; I++) {
     TransferBatch *B = Allocator.popBatch(&Cache, ClassId);
     if (!B) {
@@ -181,8 +208,11 @@
     Batches.push_back(B);
   }
   while (!Batches.empty()) {
-    Allocator.pushBatch(ClassId, Batches.back());
+    TransferBatch *B = Batches.back();
     Batches.pop_back();
+    B->copyToArray(Blocks);
+    Allocator.pushBlocks(&Cache, ClassId, Blocks, B->getCount());
+    Cache.deallocate(Primary::SizeClassMap::BatchClassId, B);
   }
   Cache.destroy(nullptr);
   Allocator.releaseToOS();
@@ -294,3 +324,47 @@
   Cache.destroy(nullptr);
   EXPECT_GT(Allocator->releaseToOS(), 0U);
 }
+
+SCUDO_TYPED_TEST(ScudoPrimaryTest, MemoryGroup) {
+  using Primary = TestAllocator<TypeParam, scudo::DefaultSizeClassMap>;
+  std::unique_ptr<Primary> Allocator(new Primary);
+  Allocator->init(/*ReleaseToOsInterval=*/-1);
+  typename Primary::CacheT Cache;
+  Cache.init(nullptr, Allocator.get());
+  const scudo::uptr Size = 32U;
+  const scudo::uptr ClassId = Primary::SizeClassMap::getClassIdBySize(Size);
+
+  // We will allocate 4 times the group size of memory and release all of it.
+  // We expect the free blocks to be classified into groups. Then we will
+  // allocate the same amount of memory as the group size and expect the
+  // maximum address difference of the blocks to be at most 2 times the group
+  // size. Note that the blocks don't have to fall within a single group,
+  // because the group id is derived by shifting the compact pointer and,
+  // depending on the configuration, the compact pointer may not be aligned to
+  // the group size. As a result, the blocks can span at most two groups.
+  const scudo::uptr GroupSizeMem = (1ULL << Primary::GroupSizeLog);
+  const scudo::uptr PeakAllocationMem = 4 * GroupSizeMem;
+  const scudo::uptr PeakNumberOfAllocations = PeakAllocationMem / Size;
+  const scudo::uptr FinalNumberOfAllocations = GroupSizeMem / Size;
+  std::vector<scudo::uptr> Blocks;
+  std::mt19937 R;
+
+  for (scudo::uptr I = 0; I < PeakNumberOfAllocations; ++I)
+    Blocks.push_back(reinterpret_cast<scudo::uptr>(Cache.allocate(ClassId)));
+
+  std::shuffle(Blocks.begin(), Blocks.end(), R);
+
+  // Release all the allocated blocks, including those held by the local cache.
+  while (!Blocks.empty()) {
+    Cache.deallocate(ClassId, reinterpret_cast<void *>(Blocks.back()));
+    Blocks.pop_back();
+  }
+  Cache.drain();
+
+  for (scudo::uptr I = 0; I < FinalNumberOfAllocations; ++I)
+    Blocks.push_back(reinterpret_cast<scudo::uptr>(Cache.allocate(ClassId)));
+
+  EXPECT_LE(*std::max_element(Blocks.begin(), Blocks.end()) -
+                *std::min_element(Blocks.begin(), Blocks.end()),
+            GroupSizeMem * 2);
+}
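To make the grouping logic easier to follow outside the diff, here is a self-contained sketch of the two steps performed on a push: the insertion sort done in pushBlocks() and the segmented insertion done by pushBlocksImpl() (illustrative only, not part of the patch; groupOf, sortByGroup, pushSegments and the std::map standing in for the sorted BatchGroup free list are simplified stand-ins):

  #include <cassert>
  #include <cstdint>
  #include <map>
  #include <vector>

  using uptr = std::uintptr_t;
  constexpr uptr GroupSizeLog = 20; // 1 MB groups, chosen for the example.

  static uptr groupOf(uptr CompactPtr) { return CompactPtr >> GroupSizeLog; }

  // Step 1: the insertion sort from pushBlocks(), which orders blocks by
  // group id and remembers whether they all share one group.
  static bool sortByGroup(std::vector<uptr> &Array) {
    bool SameGroup = true;
    for (size_t I = 1; I < Array.size(); ++I) {
      if (groupOf(Array[I - 1]) != groupOf(Array[I]))
        SameGroup = false;
      uptr Cur = Array[I];
      size_t J = I;
      while (J > 0 && groupOf(Cur) < groupOf(Array[J - 1])) {
        Array[J] = Array[J - 1];
        --J;
      }
      Array[J] = Cur;
    }
    return SameGroup;
  }

  // Step 2: walk the sorted array and push each same-group segment into its
  // group together, as pushBlocksImpl() does with the BatchGroup list.
  static void pushSegments(const std::vector<uptr> &Array,
                           std::map<uptr, std::vector<uptr>> &FreeList) {
    size_t Begin = 0;
    for (size_t I = 1; I <= Array.size(); ++I) {
      if (I == Array.size() || groupOf(Array[I]) != groupOf(Array[Begin])) {
        auto &Group = FreeList[groupOf(Array[Begin])];
        Group.insert(Group.end(), Array.begin() + Begin, Array.begin() + I);
        Begin = I;
      }
    }
  }

  int main() {
    std::vector<uptr> Blocks = {0x230000, 0x100040, 0x210000, 0x100080};
    const bool SameGroup = sortByGroup(Blocks);
    assert(!SameGroup); // Blocks span groups 1 and 2.
    std::map<uptr, std::vector<uptr>> FreeList;
    pushSegments(Blocks, FreeList);
    assert(FreeList.size() == 2 && FreeList[0x1].size() == 2);
    return 0;
  }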