diff --git a/openmp/libomptarget/src/CMakeLists.txt b/openmp/libomptarget/src/CMakeLists.txt
--- a/openmp/libomptarget/src/CMakeLists.txt
+++ b/openmp/libomptarget/src/CMakeLists.txt
@@ -13,6 +13,7 @@
 libomptarget_say("Building offloading runtime library libomptarget.")
 
 set(src_files
+  allocator.cpp
   api.cpp
   device.cpp
   interface.cpp
diff --git a/openmp/libomptarget/src/memory.h b/openmp/libomptarget/src/allocator.h
copy from openmp/libomptarget/src/memory.h
copy to openmp/libomptarget/src/allocator.h
--- a/openmp/libomptarget/src/memory.h
+++ b/openmp/libomptarget/src/allocator.h
@@ -1,4 +1,4 @@
-//===----------- memory.h - Target independent memory manager -------------===//
+//===----------- allocator.h - Target memory allocators -------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// Declarations for target independent memory manager.
+// Declarations for target memory allocators.
 //
 //===----------------------------------------------------------------------===//
 
@@ -20,21 +20,23 @@
 
 namespace memory {
 namespace impl {
-class MemoryManagerImplTy;
+class BumpAllocatorImplTy;
 } // namespace impl
 
-class MemoryManagerTy {
-  std::shared_ptr<impl::MemoryManagerImplTy> Impl;
+/// Allocator that carves small allocations out of larger slabs of device
+/// memory.
+class BumpAllocatorTy {
+  std::shared_ptr<impl::BumpAllocatorImplTy> Impl;
 
 public:
   /// Constructor
-  MemoryManagerTy(DeviceTy &D, size_t Threshold = 0);
+  BumpAllocatorTy(DeviceTy &Dev);
 
   /// Allocate memory of size \p Size from target device. \p HstPtr is used to
   /// assist the allocation.
   void *allocate(size_t Size, void *HstPtr);
 
-  /// Deallocate memory pointed by \p TgtPtr
-  int free(void *TgtPtr);
+  /// Deallocate memory pointed by \p Ptr
+  int32_t deallocate(void *Ptr);
 };
 } // namespace memory
diff --git a/openmp/libomptarget/src/allocator.cpp b/openmp/libomptarget/src/allocator.cpp
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/src/allocator.cpp
@@ -0,0 +1,221 @@
+//===----------- allocator.cpp - Target memory allocators -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Functionality for target memory allocators.
+//
+//===----------------------------------------------------------------------===//
+
+#include "allocator.h"
+#include "device.h"
+#include "private.h"
+#include "rtl.h"
+
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <map>
+#include <memory>
+#include <mutex>
+
+namespace {
+/// A chunk of device memory that small requests are carved out of. Pieces are
+/// handed out from \p PtrEnd downwards: \p Cur is the lowest address handed
+/// out so far, so the range [PtrBegin, Cur) is the space still available.
+struct SlabTy {
+  void *PtrBegin;
+  void *PtrEnd;
+  void *Cur;
+  const size_t Size;
+  /// Number of pieces of this slab that are currently handed out
+  size_t RefCount;
+
+  SlabTy(void *TgtPtr, size_t Size)
+      : PtrBegin(TgtPtr), PtrEnd(reinterpret_cast<char *>(PtrBegin) + Size),
+        Cur(PtrEnd), Size(Size), RefCount(0) {}
+};
+} // namespace
+
+namespace memory {
+/// Slab size. By default it is 8MB.
+size_t SlabSize = 1U << 23;
+
+/// The size of every piece carved out of a slab is a multiple of this
+constexpr const size_t Alignment = 8;
+
+namespace impl {
+/// Allocator that serves requests no larger than \p SlabSize by carving them
+/// out of slabs of device memory, and forwards larger ones to the device.
+class BumpAllocatorImplTy {
+  using SlabPtrTy = std::shared_ptr<SlabTy>;
+  using SlabTableTy = std::map<void *, SlabPtrTy>;
+
+  /// A reference to the device object
+  DeviceTy &Device;
+  /// A table mapping from the target pointer to its slab. The target pointer
+  /// must be the \p PtrBegin of each slab.
+  SlabTableTy PtrToSlabTable;
+  /// The mutex for the map table
+  std::mutex MapTableMtx;
+
+  /// Request memory from target device
+  void *allocateFromDevice(size_t Size, void *HstPtr) const {
+    return Device.RTL->data_alloc(Device.RTLDeviceID, Size, HstPtr);
+  }
+
+  /// Deallocate data from device
+  int deleteFromDevice(void *Ptr) const {
+    return Device.RTL->data_delete(Device.RTLDeviceID, Ptr);
+  }
+
+  /// Create a new slab of size \p Size. Returns nullptr if the device cannot
+  /// provide the memory.
+  SlabPtrTy createNewSlab(size_t Size) {
+    void *TgtPtr = allocateFromDevice(Size, nullptr);
+    if (TgtPtr == nullptr)
+      return nullptr;
+    return std::make_shared<SlabTy>(TgtPtr, Size);
+  }
+
+  /// Carve \p Size bytes off the top of the free space of \p Slab. The caller
+  /// must have checked that the slab has enough room and, if the slab is
+  /// already in the table, must hold \p MapTableMtx.
+  static void *carveFromSlab(SlabTy &Slab, size_t Size) {
+    Slab.Cur = reinterpret_cast<char *>(Slab.Cur) - Size;
+    Slab.RefCount += 1;
+    return Slab.Cur;
+  }
+
+public:
+  /// Constructor
+  BumpAllocatorImplTy(DeviceTy &Dev) : Device(Dev) {}
+
+  /// Destructor. Returns every slab that is still in the table to the device.
+  ~BumpAllocatorImplTy() {
+    for (const SlabTableTy::value_type &Entry : PtrToSlabTable)
+      deleteFromDevice(Entry.first);
+  }
+
+  /// Allocate \p Size bytes of device memory. A request that does not fit into
+  /// a slab is forwarded to the device together with \p HstPtr. Any other
+  /// request is padded to a multiple of \p Alignment and carved out of a slab.
+  /// Returns nullptr if a new slab is needed but cannot be created.
+  void *allocate(size_t Size, void *HstPtr) {
+    // If the size is larger than SlabSize, we will allocate it from device
+    // directly.
+    // TODO: We might want something similar to the CustomSizedSlabs in
+    // BumpPtrAllocatorImpl in LLVM. Or, we could use it as another slab when it
+    // is freed.
+    if (Size > SlabSize)
+      return allocateFromDevice(Size, HstPtr);
+
+    // Round the size up to the next multiple of Alignment. A zero-sized
+    // request still gets a piece of its own; otherwise the returned pointer
+    // could be the PtrEnd of a fresh slab, which deallocate would not
+    // recognize as part of that slab.
+    size_t PaddedSize = (Size + Alignment - 1) / Alignment * Alignment;
+    if (PaddedSize == 0)
+      PaddedSize = Alignment;
+
+    // The padding can push a request past the slab size when SlabSize is not
+    // a multiple of Alignment. Such a request cannot be carved out of a slab.
+    if (PaddedSize > SlabSize)
+      return allocateFromDevice(Size, HstPtr);
+
+    // Find an available slab
+    {
+      std::lock_guard<std::mutex> G(MapTableMtx);
+      for (const SlabTableTy::value_type &Entry : PtrToSlabTable) {
+        SlabTy &Slab = *Entry.second;
+
+        // If the size left in a slab is less than PaddedSize, we move to the
+        // next one
+        const size_t SizeLeft = reinterpret_cast<uintptr_t>(Slab.Cur) -
+                                reinterpret_cast<uintptr_t>(Slab.PtrBegin);
+        if (SizeLeft < PaddedSize)
+          continue;
+
+        // Now we find one. Allocate it from the slab.
+        return carveFromSlab(Slab, PaddedSize);
+      }
+    }
+
+    // We cannot find a fitted slab. Create a new one
+    SlabPtrTy Slab = createNewSlab(SlabSize);
+
+    // Failed to create a new slab. Return nullptr directly.
+    if (Slab == nullptr)
+      return nullptr;
+
+    // We could manipulate it w/o any lock because it has not been inserted to
+    // the table yet
+    void *TgtPtr = carveFromSlab(*Slab, PaddedSize);
+
+    // Add the slab into map table
+    {
+      std::lock_guard<std::mutex> G(MapTableMtx);
+      PtrToSlabTable[Slab->PtrBegin] = Slab;
+    }
+
+    return TgtPtr;
+  }
+
+  /// Deallocate the memory pointed to by \p Ptr. If \p Ptr is a piece of a
+  /// slab, the reference count of the slab is decremented and the slab is
+  /// returned to the device once its last piece is gone. Any other pointer is
+  /// handed to the device directly. Returns OFFLOAD_SUCCESS if the device did
+  /// not have to be involved, the result of the device deallocation otherwise.
+  int32_t deallocate(void *Ptr) {
+    // Find which slab Ptr is from
+    {
+      std::lock_guard<std::mutex> G(MapTableMtx);
+      // Only the slab with the largest PtrBegin that is not greater than Ptr
+      // can contain Ptr, i.e. the entry right before the upper bound. If the
+      // upper bound is the first entry (which includes the case of an empty
+      // table), Ptr lies below every slab and is not part of any of them.
+      SlabTableTy::iterator Itr = PtrToSlabTable.upper_bound(Ptr);
+      if (Itr != PtrToSlabTable.begin()) {
+        Itr = std::prev(Itr);
+        SlabPtrTy Slab = Itr->second;
+
+        // If Ptr is not in the range, it is not allocated from the slab
+        if (Ptr >= Slab->PtrBegin && Ptr < Slab->PtrEnd) {
+          size_t &RC = Slab->RefCount;
+          assert(RC > 0 && "Deallocating from a slab with no live piece!");
+          RC -= 1;
+
+          // This is not the last piece of the slab. Return directly.
+          if (RC != 0)
+            return OFFLOAD_SUCCESS;
+
+          // The last piece of the slab. Remove the slab from the map table.
+          PtrToSlabTable.erase(Itr);
+          // Set Ptr to the beginning of the slab.
+          Ptr = Slab->PtrBegin;
+        }
+      }
+    }
+
+    // There are only two cases that we can reach this point:
+    // 1. Ptr is not allocated from a slab;
+    // 2. Ptr has been "free" in the slab, and it is the last piece in the slab,
+    // so we need to free the slab. In this case, Ptr has already been set to
+    // the beginning of the slab in the code block above.
+    return deleteFromDevice(Ptr);
+  }
+};
+} // namespace impl
+
+BumpAllocatorTy::BumpAllocatorTy(DeviceTy &Dev)
+    : Impl(std::make_shared<impl::BumpAllocatorImplTy>(Dev)) {}
+
+void *BumpAllocatorTy::allocate(size_t Size, void *HstPtr) {
+  return Impl->allocate(Size, HstPtr);
+}
+
+int32_t BumpAllocatorTy::deallocate(void *Ptr) { return Impl->deallocate(Ptr); }
+} // namespace memory
diff --git a/openmp/libomptarget/src/memory.h b/openmp/libomptarget/src/memory.h
--- a/openmp/libomptarget/src/memory.h
+++ b/openmp/libomptarget/src/memory.h
@@ -19,12 +19,13 @@
 struct DeviceTy;
 
 namespace memory {
+class BumpAllocatorTy;
 namespace impl {
-class MemoryManagerImplTy;
+template <typename AllocatorTy> class MemoryManagerImplTy;
 } // namespace impl
 
 class MemoryManagerTy {
-  std::shared_ptr<impl::MemoryManagerImplTy> Impl;
+  std::shared_ptr<impl::MemoryManagerImplTy<BumpAllocatorTy>> Impl;
 
 public:
   /// Constructor
diff --git a/openmp/libomptarget/src/memory.cpp b/openmp/libomptarget/src/memory.cpp
--- a/openmp/libomptarget/src/memory.cpp
+++ b/openmp/libomptarget/src/memory.cpp
@@ -35,4 +35,5 @@
 
+#include "allocator.h"
 #include "device.h"
 #include "memory.h"
 #include "rtl.h"
@@ -105,7 +106,7 @@
   bool operator<(const NodeTy &RHS) { return Size < RHS.Size; }
 };
 
-class MemoryManagerImplTy {
+template <typename AllocatorTy> class MemoryManagerImplTy {
   /// Nodes are used in a format of \p std::shared_ptr
   using NodePtrTy = std::shared_ptr<NodeTy>;
   /// A \p FreeList is a set of Nodes. We're using \p std::multiset here to make
@@ -121,18 +122,16 @@
   std::vector<std::mutex> FreeListLocks;
   /// The mutex for the table \p PtrToNodeTable
   std::mutex MapTableLock;
-  /// A reference to its corresponding \p DeviceTy object
-  DeviceTy &Device;
+  /// Allocator
+  AllocatorTy Allocator;
 
-  /// Request memory from target device
-  void *allocateFromDevice(size_t Size, void *HstPtr) const {
-    return Device.RTL->data_alloc(Device.RTLDeviceID, Size, HstPtr);
+  /// Allocate data via \p Allocator
+  void *allocateOnDevice(size_t Size, void *HstPtr) {
+    return Allocator.allocate(Size, HstPtr);
   }
 
-  /// Deallocate data from device
-  int deleteFromDevice(void *Ptr) const {
-    return Device.RTL->data_delete(Device.RTLDeviceID, Ptr);
-  }
+  /// Deallocate data via \p Allocator
+  int deallocateOnDevice(void *Ptr) { return Allocator.deallocate(Ptr); }
 
   /// This function is called when it tries to allocate memory on device but the
   /// device returns out of memory. It will first free all memory in the
@@ -145,18 +144,18 @@
         continue;
       std::lock_guard<std::mutex> Lock(FreeListLocks[I]);
       for (const NodePtrTy &N : List)
-        deleteFromDevice(N->Ptr);
+        deallocateOnDevice(N->Ptr);
       FreeLists[I].clear();
     }
 
     // Try allocate memory again
-    return allocateFromDevice(Size, HstPtr);
+    return allocateOnDevice(Size, HstPtr);
   }
 
 public:
   /// Constructor
   MemoryManagerImplTy(DeviceTy &Dev)
-      : FreeLists(NumBuckets), FreeListLocks(NumBuckets), Device(Dev) {}
+      : FreeLists(NumBuckets), FreeListLocks(NumBuckets), Allocator(Dev) {}
 
   /// Destructor
   ~MemoryManagerImplTy() {
@@ -167,13 +166,13 @@
       // We don't need lock here because only one thread can execute it
       FreeListTy &List = FreeLists[I];
       for (const NodePtrTy &N : List)
-        deleteFromDevice(N->Ptr);
+        deallocateOnDevice(N->Ptr);
     }
 
     // Deallocate all memory in map
     for (std::pair<void *, NodePtrTy> P : PtrToNodeTable) {
       assert(P.second->Ptr && "nullptr in map table");
-      deleteFromDevice(P.second->Ptr);
+      deallocateOnDevice(P.second->Ptr);
     }
   }
 
@@ -193,7 +192,7 @@
       DP("%zu is greater than the threshold %zu. Allocate it directly from "
          "device\n",
          Size, SizeThreshold);
-      void *TgtPtr = allocateFromDevice(Size, HstPtr);
+      void *TgtPtr = allocateOnDevice(Size, HstPtr);
       // We cannot get memory from the device. It might be due to OOM. Let's
       // free all memory in FreeLists and try again.
       if (TgtPtr == nullptr) {
@@ -230,7 +229,7 @@
     if (NodePtr == nullptr) {
       DP("Cannot find a node in the FreeLists. Allocate from device.\n");
       // Allocate one from device
-      void *TgtPtr = allocateFromDevice(Size, HstPtr);
+      void *TgtPtr = allocateOnDevice(Size, HstPtr);
 
       // If TgtPtr is nullptr, it might be due to OOM. Call freeAndAllocate to
       // free some memory in FreeList and then allocate again
@@ -283,7 +282,7 @@
     // The memory is not managed by the manager
     if (P == nullptr) {
       DP("Cannot find its node. Delete it from device directly.\n");
-      return deleteFromDevice(TgtPtr);
+      return deallocateOnDevice(TgtPtr);
     }
 
     // Insert the node to the free list
@@ -308,7 +307,7 @@
 int MemoryManagerTy::free(void *TgtPtr) { return Impl->free(TgtPtr); }
 
 MemoryManagerTy::MemoryManagerTy(DeviceTy &D, size_t Threshold)
-    : Impl(new impl::MemoryManagerImplTy(D)) {
+    : Impl(new impl::MemoryManagerImplTy<BumpAllocatorTy>(D)) {
   if (Threshold)
     impl::SizeThreshold = Threshold;
 }