diff --git a/openmp/libomptarget/plugins/common/CMakeLists.txt b/openmp/libomptarget/plugins/common/CMakeLists.txt
--- a/openmp/libomptarget/plugins/common/CMakeLists.txt
+++ b/openmp/libomptarget/plugins/common/CMakeLists.txt
@@ -11,3 +11,4 @@
 ##===----------------------------------------------------------------------===##
 
 add_subdirectory(elf_common)
+add_subdirectory(MemoryManager)
diff --git a/openmp/libomptarget/plugins/common/CMakeLists.txt b/openmp/libomptarget/plugins/common/MemoryManager/CMakeLists.txt
copy from openmp/libomptarget/plugins/common/CMakeLists.txt
copy to openmp/libomptarget/plugins/common/MemoryManager/CMakeLists.txt
--- a/openmp/libomptarget/plugins/common/CMakeLists.txt
+++ b/openmp/libomptarget/plugins/common/MemoryManager/CMakeLists.txt
@@ -5,9 +5,7 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 #
 ##===----------------------------------------------------------------------===##
-#
-# Common parts which can be used by all plugins
-#
-##===----------------------------------------------------------------------===##
 
-add_subdirectory(elf_common)
+add_library(MemoryManager INTERFACE)
+
+target_include_directories(MemoryManager INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
diff --git a/openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h b/openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h
@@ -0,0 +1,361 @@
+//===----------- MemoryManager.h - Target independent memory manager ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Target independent memory manager.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_MEMORYMANAGER_MEMORYMANAGER_H
+#define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_MEMORYMANAGER_MEMORYMANAGER_H
+
+#include <cassert>
+#include <cerrno>
+#include <cstdint>
+#include <cstdlib>
+#include <functional>
+#include <mutex>
+#include <set>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "Debug.h"
+#include "omptargetplugin.h"
+
+/// Base class of per-device allocator.
+class DeviceAllocatorTy {
+public:
+  virtual ~DeviceAllocatorTy() = default;
+
+  /// Allocate a memory of size \p Size . \p HstPtr is used to assist the
+  /// allocation.
+  virtual void *allocate(size_t Size, void *HstPtr) = 0;
+
+  /// Delete the pointer \p TgtPtr on the device
+  virtual int free(void *TgtPtr) = 0;
+};
+
+/// Class of memory manager. The memory manager is per-device by using
+/// per-device allocator. Therefore, each plugin using memory manager should
+/// have an allocator for each device.
+class MemoryManagerTy {
+  static constexpr const size_t BucketSize[] = {
+      0,       1U << 2, 1U << 3,  1U << 4,  1U << 5,  1U << 6, 1U << 7,
+      1U << 8, 1U << 9, 1U << 10, 1U << 11, 1U << 12, 1U << 13};
+
+  static constexpr const int NumBuckets =
+      sizeof(BucketSize) / sizeof(BucketSize[0]);
+
+  /// Round \p Num down to a power of two: the result is the largest power of
+  /// two that is less than or equal to \p Num . A power of two maps to itself
+  /// and 0 maps to 0.
+  static size_t floorToPowerOfTwo(size_t Num) {
+    Num |= Num >> 1;
+    Num |= Num >> 2;
+    Num |= Num >> 4;
+    Num |= Num >> 8;
+    Num |= Num >> 16;
+#if INTPTR_MAX == INT64_MAX
+    Num |= Num >> 32;
+#elif INTPTR_MAX == INT32_MAX
+    // Do nothing with 32-bit
+#else
+#error Unsupported architecture
+#endif
+    Num += 1;
+    return Num >> 1;
+  }
+
+  /// Find the bucket for a request of \p Size bytes: the last entry of
+  /// \p BucketSize that does not exceed \p Size floored to a power of two.
+  static int findBucket(size_t Size) {
+    const size_t F = floorToPowerOfTwo(Size);
+
+    DP("findBucket: Size %zu is floored to %zu.\n", Size, F);
+
+    int L = 0, H = NumBuckets - 1;
+    while (H - L > 1) {
+      int M = (L + H) >> 1;
+      if (BucketSize[M] == F)
+        return M;
+      if (BucketSize[M] > F)
+        H = M - 1;
+      else
+        L = M;
+    }
+
+    // The loop stops with a window of at most two entries and has only proven
+    // that L fits. H may still be the better match (e.g. F == 4 or F == 8192).
+    if (BucketSize[H] <= F)
+      L = H;
+
+    assert(L >= 0 && L < NumBuckets && "L is out of range");
+
+    DP("findBucket: Size %zu goes to bucket %d\n", Size, L);
+
+    return L;
+  }
+
+  /// A structure stores the meta data of a target pointer
+  struct NodeTy {
+    /// Memory size
+    const size_t Size;
+    /// Target pointer
+    void *Ptr;
+
+    /// Constructor
+    NodeTy(size_t Size, void *Ptr) : Size(Size), Ptr(Ptr) {}
+  };
+
+  /// Orders \p NodeTy by \p Size so nodes can be kept in a \p std::multiset.
+  struct NodeCmpTy {
+    bool operator()(const NodeTy &LHS, const NodeTy &RHS) const {
+      return LHS.Size < RHS.Size;
+    }
+  };
+
+  /// A \p FreeList is a set of Nodes. We're using \p std::multiset here to make
+  /// the look up procedure more efficient.
+  using FreeListTy = std::multiset<std::reference_wrapper<NodeTy>, NodeCmpTy>;
+
+  /// One \p FreeListTy per entry of \p BucketSize . A freed node is inserted
+  /// into the list selected by \p findBucket for the node's size.
+  std::vector<FreeListTy> FreeLists;
+  /// A list of mutex for each \p FreeListTy entry
+  std::vector<std::mutex> FreeListLocks;
+  /// A table to map from a target pointer to its node
+  std::unordered_map<void *, NodeTy> PtrToNodeTable;
+  /// The mutex for the table \p PtrToNodeTable
+  std::mutex MapTableLock;
+
+  /// The reference to a device allocator
+  DeviceAllocatorTy &DeviceAllocator;
+
+  /// The threshold to manage memory using memory manager. If the request size
+  /// is larger than \p SizeThreshold, the allocation will not be managed by the
+  /// memory manager.
+  size_t SizeThreshold = 1U << 13;
+
+  /// Request memory from target device
+  void *allocateOnDevice(size_t Size, void *HstPtr) const {
+    return DeviceAllocator.allocate(Size, HstPtr);
+  }
+
+  /// Deallocate data on device
+  int deleteOnDevice(void *Ptr) const { return DeviceAllocator.free(Ptr); }
+
+  /// This function is called when it tries to allocate memory on device but the
+  /// device returns out of memory. It will first free all memory in the
+  /// FreeList and try to allocate again.
+  void *freeAndAllocate(size_t Size, void *HstPtr) {
+    std::vector<void *> RemoveList;
+
+    // Deallocate all memory in FreeList
+    for (int I = 0; I < NumBuckets; ++I) {
+      FreeListTy &List = FreeLists[I];
+      std::lock_guard<std::mutex> Lock(FreeListLocks[I]);
+      if (List.empty())
+        continue;
+      for (const NodeTy &N : List) {
+        deleteOnDevice(N.Ptr);
+        RemoveList.push_back(N.Ptr);
+      }
+      FreeLists[I].clear();
+    }
+
+    // Remove all nodes in the map table which have been released
+    if (!RemoveList.empty()) {
+      std::lock_guard<std::mutex> LG(MapTableLock);
+      for (void *P : RemoveList)
+        PtrToNodeTable.erase(P);
+    }
+
+    // Try allocate memory again
+    return allocateOnDevice(Size, HstPtr);
+  }
+
+  /// The goal is to allocate memory on the device. It first tries to
+  /// allocate directly on the device. If a \p nullptr is returned, it might
+  /// be because the device is OOM. In that case, it will free all unused
+  /// memory and then try again.
+  void *allocateOrFreeAndAllocateOnDevice(size_t Size, void *HstPtr) {
+    void *TgtPtr = allocateOnDevice(Size, HstPtr);
+    // We cannot get memory from the device. It might be due to OOM. Let's
+    // free all memory in FreeLists and try again.
+    if (TgtPtr == nullptr) {
+      DP("Failed to get memory on device. Free all memory in FreeLists and "
+         "try again.\n");
+      TgtPtr = freeAndAllocate(Size, HstPtr);
+    }
+
+    if (TgtPtr == nullptr)
+      DP("Still cannot get memory on device probably because the device is "
+         "OOM.\n");
+
+    return TgtPtr;
+  }
+
+public:
+  /// Constructor. If \p Threshold is non-zero, then the default threshold will
+  /// be overwritten by \p Threshold.
+  MemoryManagerTy(DeviceAllocatorTy &DeviceAllocator, size_t Threshold = 0)
+      : FreeLists(NumBuckets), FreeListLocks(NumBuckets),
+        DeviceAllocator(DeviceAllocator) {
+    if (Threshold)
+      SizeThreshold = Threshold;
+  }
+
+  /// Destructor
+  ~MemoryManagerTy() {
+    for (auto Itr = PtrToNodeTable.begin(); Itr != PtrToNodeTable.end();
+         ++Itr) {
+      assert(Itr->second.Ptr && "nullptr in map table");
+      deleteOnDevice(Itr->second.Ptr);
+    }
+  }
+
+  /// Allocate memory of size \p Size from target device. \p HstPtr is used to
+  /// assist the allocation.
+  void *allocate(size_t Size, void *HstPtr) {
+    // If the size is zero, we will not bother the target device. Just return
+    // nullptr directly.
+    if (Size == 0)
+      return nullptr;
+
+    DP("MemoryManagerTy::allocate: size %zu with host pointer " DPxMOD ".\n",
+       Size, DPxPTR(HstPtr));
+
+    // If the size is greater than the threshold, allocate it directly from
+    // device.
+    if (Size > SizeThreshold) {
+      DP("%zu is greater than the threshold %zu. Allocate it directly from "
+         "device\n",
+         Size, SizeThreshold);
+      void *TgtPtr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr);
+
+      DP("Got target pointer " DPxMOD ". Return directly.\n", DPxPTR(TgtPtr));
+
+      return TgtPtr;
+    }
+
+    NodeTy *NodePtr = nullptr;
+
+    // Try to get a node from FreeList
+    {
+      const int B = findBucket(Size);
+      FreeListTy &List = FreeLists[B];
+
+      NodeTy TempNode(Size, nullptr);
+      std::lock_guard<std::mutex> LG(FreeListLocks[B]);
+      const auto Itr = List.find(TempNode);
+
+      if (Itr != List.end()) {
+        NodePtr = &Itr->get();
+        List.erase(Itr);
+      }
+    }
+
+    if (NodePtr != nullptr)
+      DP("Find one node " DPxMOD " in the bucket.\n", DPxPTR(NodePtr));
+
+    // We cannot find a valid node in FreeLists. Let's allocate on device and
+    // create a node for it.
+    if (NodePtr == nullptr) {
+      DP("Cannot find a node in the FreeLists. Allocate on device.\n");
+      // Allocate one on device
+      void *TgtPtr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr);
+
+      if (TgtPtr == nullptr)
+        return nullptr;
+
+      // Create a new node and add it into the map table
+      {
+        std::lock_guard<std::mutex> Guard(MapTableLock);
+        auto Itr = PtrToNodeTable.emplace(TgtPtr, NodeTy(Size, TgtPtr));
+        NodePtr = &Itr.first->second;
+      }
+
+      DP("Node address " DPxMOD ", target pointer " DPxMOD ", size %zu\n",
+         DPxPTR(NodePtr), DPxPTR(TgtPtr), Size);
+    }
+
+    assert(NodePtr && "NodePtr should not be nullptr at this point");
+
+    return NodePtr->Ptr;
+  }
+
+  /// Deallocate memory pointed by \p TgtPtr
+  int free(void *TgtPtr) {
+    DP("MemoryManagerTy::free: target memory " DPxMOD ".\n", DPxPTR(TgtPtr));
+
+    NodeTy *P = nullptr;
+
+    // Look it up into the table
+    {
+      std::lock_guard<std::mutex> G(MapTableLock);
+      auto Itr = PtrToNodeTable.find(TgtPtr);
+
+      // We don't remove the node from the map table because the map does not
+      // change.
+      if (Itr != PtrToNodeTable.end())
+        P = &Itr->second;
+    }
+
+    // The memory is not managed by the manager
+    if (P == nullptr) {
+      DP("Cannot find its node. Delete it on device directly.\n");
+      return deleteOnDevice(TgtPtr);
+    }
+
+    // Insert the node to the free list
+    const int B = findBucket(P->Size);
+
+    DP("Found its node " DPxMOD ". Insert it to bucket %d.\n", DPxPTR(P), B);
+
+    {
+      std::lock_guard<std::mutex> G(FreeListLocks[B]);
+      FreeLists[B].insert(*P);
+    }
+
+    return OFFLOAD_SUCCESS;
+  }
+
+  /// Get the size threshold from the environment variable
+  /// \p LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD . Returns a
+  /// <tt>std::pair<size_t, bool></tt> where the first element is the threshold
+  /// and the second element tells whether the memory manager should be used.
+  /// It is \c false only when the user explicitly sets the variable to 0. If
+  /// the variable is unset, or its value cannot be parsed as an unsigned
+  /// number, <0, true> is returned so that the default threshold is used.
+  static std::pair<size_t, bool> getSizeThresholdFromEnv() {
+    const char *Env = std::getenv("LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD");
+    if (Env == nullptr)
+      return std::make_pair(size_t(0), true);
+
+    // std::stoul would throw on a malformed or out-of-range value and take the
+    // whole process down; std::strtoul reports both conditions without
+    // throwing.
+    char *End = nullptr;
+    errno = 0;
+    const unsigned long Value = std::strtoul(Env, &End, 10);
+    if (End == Env || errno == ERANGE) {
+      DP("Ignoring invalid LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD=%s.\n", Env);
+      return std::make_pair(size_t(0), true);
+    }
+
+    if (Value == 0) {
+      DP("Disabled memory manager as user set "
+         "LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD=0.\n");
+      return std::make_pair(size_t(0), false);
+    }
+
+    return std::make_pair(static_cast<size_t>(Value), true);
+  }
+};
+
+#endif // LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_MEMORYMANAGER_MEMORYMANAGER_H
diff --git a/openmp/libomptarget/plugins/cuda/CMakeLists.txt b/openmp/libomptarget/plugins/cuda/CMakeLists.txt
--- a/openmp/libomptarget/plugins/cuda/CMakeLists.txt
+++ b/openmp/libomptarget/plugins/cuda/CMakeLists.txt
@@ -38,6 +38,7 @@
 
 target_link_libraries(omptarget.rtl.cuda
   elf_common
+  MemoryManager
   ${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES}
   ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES}
   "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
--- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -25,6 +25,8 @@
 #define TARGET_NAME CUDA
 #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
 
+#include "MemoryManager.h"
+
 // Utility for retrieving and printing CUDA error string.
#ifdef OMPTARGET_DEBUG
 #define CUDA_ERR_STRING(err)                                                   \
@@ -290,6 +292,60 @@
   std::vector<DeviceDataTy> DeviceData;
   std::vector<CUmodule> Modules;
 
+  /// A class responsible for interacting with device native runtime library to
+  /// allocate and free memory.
+  class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
+    const int DeviceId;
+    const std::vector<DeviceDataTy> &DeviceData;
+
+  public:
+    CUDADeviceAllocatorTy(int DeviceId, std::vector<DeviceDataTy> &DeviceData)
+        : DeviceId(DeviceId), DeviceData(DeviceData) {}
+
+    /// Allocate \p Size bytes on the device with cuMemAlloc after making the
+    /// device context current. The host pointer argument is ignored. Returns
+    /// nullptr when \p Size is 0 or when checkResult rejects a CUDA call.
+    void *allocate(size_t Size, void *) override {
+      if (Size == 0)
+        return nullptr;
+
+      CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
+      if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
+        return nullptr;
+
+      CUdeviceptr DevicePtr;
+      Err = cuMemAlloc(&DevicePtr, Size);
+      if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
+        return nullptr;
+
+      return (void *)DevicePtr;
+    }
+
+    /// Free \p TgtPtr with cuMemFree after making the device context current.
+    /// Returns OFFLOAD_FAIL when checkResult rejects a CUDA call.
+    int free(void *TgtPtr) override {
+      CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
+      if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
+        return OFFLOAD_FAIL;
+
+      Err = cuMemFree((CUdeviceptr)TgtPtr);
+      if (!checkResult(Err, "Error returned from cuMemFree\n"))
+        return OFFLOAD_FAIL;
+
+      return OFFLOAD_SUCCESS;
+    }
+  };
+
+  /// A vector of device allocators
+  std::vector<CUDADeviceAllocatorTy> DeviceAllocators;
+
+  /// A vector of memory managers. Since the memory manager is non-copyable and
+  /// non-movable, we wrap them into std::unique_ptr.
+  std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;
+
+  /// Whether allocations go through the memory managers
+  bool UseMemoryManager = true;
+
   // Record entry point associated with device
   void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
@@ -379,10 +435,28 @@
 
     StreamManager =
         std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData);
+
+    for (int I = 0; I < NumberOfDevices; ++I)
+      DeviceAllocators.emplace_back(I, DeviceData);
+
+    // Get the size threshold from environment variable
+    std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv();
+    UseMemoryManager = Res.second;
+    size_t MemoryManagerThreshold = Res.first;
+
+    if (UseMemoryManager)
+      for (int I = 0; I < NumberOfDevices; ++I)
+        MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>(
+            DeviceAllocators[I], MemoryManagerThreshold));
   }
 
   ~DeviceRTLTy() {
-    // First destruct stream manager in case of Contexts is destructed before it
+    // Destroy the memory managers first: their destructors still free device
+    // memory through DeviceAllocators and DeviceData. Use reset(), which runs
+    // the destructor; release() would only give up ownership and leak them.
+    for (auto &M : MemoryManagers)
+      M.reset();
+
     StreamManager = nullptr;
 
     for (CUmodule &M : Modules)
@@ -731,20 +805,11 @@
     return getOffloadEntriesTable(DeviceId);
   }
 
-  void *dataAlloc(const int DeviceId, const int64_t Size) const {
-    if (Size == 0)
-      return nullptr;
-
-    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
-    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
-      return nullptr;
-
-    CUdeviceptr DevicePtr;
-    Err = cuMemAlloc(&DevicePtr, Size);
-    if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
-      return nullptr;
+  void *dataAlloc(const int DeviceId, const int64_t Size) {
+    if (UseMemoryManager)
+      return MemoryManagers[DeviceId]->allocate(Size, nullptr);
 
-    return (void *)DevicePtr;
+    return DeviceAllocators[DeviceId].allocate(Size, nullptr);
   }
 
   int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
@@ -843,16 +908,11 @@
     return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
   }
 
-  int dataDelete(const int DeviceId, void *TgtPtr) const {
-    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
-    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
-      return OFFLOAD_FAIL;
-
-    Err = cuMemFree((CUdeviceptr)TgtPtr);
-    if (!checkResult(Err, "Error returned from cuMemFree\n"))
-      return OFFLOAD_FAIL;
+  int dataDelete(const int DeviceId, void *TgtPtr) {
+    if (UseMemoryManager)
+      return MemoryManagers[DeviceId]->free(TgtPtr);
 
-    return OFFLOAD_SUCCESS;
+    return DeviceAllocators[DeviceId].free(TgtPtr);
   }
 
   int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
diff --git a/openmp/libomptarget/src/CMakeLists.txt b/openmp/libomptarget/src/CMakeLists.txt
--- a/openmp/libomptarget/src/CMakeLists.txt
+++ b/openmp/libomptarget/src/CMakeLists.txt
@@ -16,7 +16,6 @@
   ${CMAKE_CURRENT_SOURCE_DIR}/api.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/interface.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/MemoryManager.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/rtl.cpp
${CMAKE_CURRENT_SOURCE_DIR}/omptarget.cpp ) diff --git a/openmp/libomptarget/src/MemoryManager.h b/openmp/libomptarget/src/MemoryManager.h deleted file mode 100644 --- a/openmp/libomptarget/src/MemoryManager.h +++ /dev/null @@ -1,95 +0,0 @@ -//===----------- MemoryManager.h - Target independent memory manager ------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Declarations for target independent memory manager. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_OPENMP_LIBOMPTARGET_SRC_MEMORYMANAGER_H -#define LLVM_OPENMP_LIBOMPTARGET_SRC_MEMORYMANAGER_H - -#include -#include -#include -#include -#include -#include -#include - -// Forward declaration -struct DeviceTy; - -class MemoryManagerTy { - /// A structure stores the meta data of a target pointer - struct NodeTy { - /// Memory size - const size_t Size; - /// Target pointer - void *Ptr; - - /// Constructor - NodeTy(size_t Size, void *Ptr) : Size(Size), Ptr(Ptr) {} - }; - - /// To make \p NodePtrTy ordered when they're put into \p std::multiset. - struct NodeCmpTy { - bool operator()(const NodeTy &LHS, const NodeTy &RHS) const { - return LHS.Size < RHS.Size; - } - }; - - /// A \p FreeList is a set of Nodes. We're using \p std::multiset here to make - /// the look up procedure more efficient. - using FreeListTy = std::multiset, NodeCmpTy>; - - /// A list of \p FreeListTy entries, each of which is a \p std::multiset of - /// Nodes whose size is less or equal to a specific bucket size. 
- std::vector FreeLists; - /// A list of mutex for each \p FreeListTy entry - std::vector FreeListLocks; - /// A table to map from a target pointer to its node - std::unordered_map PtrToNodeTable; - /// The mutex for the table \p PtrToNodeTable - std::mutex MapTableLock; - /// A reference to its corresponding \p DeviceTy object - DeviceTy &Device; - - /// Request memory from target device - void *allocateOnDevice(size_t Size, void *HstPtr) const; - - /// Deallocate data on device - int deleteOnDevice(void *Ptr) const; - - /// This function is called when it tries to allocate memory on device but the - /// device returns out of memory. It will first free all memory in the - /// FreeList and try to allocate again. - void *freeAndAllocate(size_t Size, void *HstPtr); - - /// The goal is to allocate memory on the device. It first tries to allocate - /// directly on the device. If a \p nullptr is returned, it might be because - /// the device is OOM. In that case, it will free all unused memory and then - /// try again. - void *allocateOrFreeAndAllocateOnDevice(size_t Size, void *HstPtr); - -public: - /// Constructor. If \p Threshold is non-zero, then the default threshold will - /// be overwritten by \p Threshold. - MemoryManagerTy(DeviceTy &Dev, size_t Threshold = 0); - - /// Destructor - ~MemoryManagerTy(); - - /// Allocate memory of size \p Size from target device. \p HstPtr is used to - /// assist the allocation. - void *allocate(size_t Size, void *HstPtr); - - /// Deallocate memory pointed by \p TgtPtr - int free(void *TgtPtr); -}; - -#endif // LLVM_OPENMP_LIBOMPTARGET_SRC_MEMORYMANAGER_H diff --git a/openmp/libomptarget/src/MemoryManager.cpp b/openmp/libomptarget/src/MemoryManager.cpp deleted file mode 100644 --- a/openmp/libomptarget/src/MemoryManager.cpp +++ /dev/null @@ -1,262 +0,0 @@ -//===----------- MemoryManager.cpp - Target independent memory manager ----===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
-// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Functionality for managing target memory. -// It is very expensive to call alloc/free functions of target devices. The -// MemoryManagerTy in this file is to reduce the number of invocations of those -// functions by buffering allocated device memory. In this way, when a memory is -// not used, it will not be freed on the device directly. The buffer is -// organized in a number of buckets for efficient look up. A memory will go to -// corresponding bucket based on its size. When a new memory request comes in, -// it will first check whether there is free memory of same size. If yes, -// returns it directly. Otherwise, allocate one on device. -// -// It also provides a way to opt out the memory manager. Memory -// allocation/deallocation will only be managed if the requested size is less -// than SizeThreshold, which can be configured via an environment variable -// LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD. -// -//===----------------------------------------------------------------------===// - -#include "MemoryManager.h" -#include "device.h" -#include "private.h" -#include "rtl.h" - -namespace { -constexpr const size_t BucketSize[] = { - 0, 1U << 2, 1U << 3, 1U << 4, 1U << 5, 1U << 6, 1U << 7, - 1U << 8, 1U << 9, 1U << 10, 1U << 11, 1U << 12, 1U << 13}; - -constexpr const int NumBuckets = sizeof(BucketSize) / sizeof(BucketSize[0]); - -/// The threshold to manage memory using memory manager. If the request size is -/// larger than \p SizeThreshold, the allocation will not be managed by the -/// memory manager. This variable can be configured via an env \p -/// LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD. By default, the value is 8KB. -size_t SizeThreshold = 1U << 13; - -/// Find the previous number that is power of 2 given a number that is not power -/// of 2. 
-size_t floorToPowerOfTwo(size_t Num) { - Num |= Num >> 1; - Num |= Num >> 2; - Num |= Num >> 4; - Num |= Num >> 8; - Num |= Num >> 16; -#if INTPTR_MAX == INT64_MAX - Num |= Num >> 32; -#elif INTPTR_MAX == INT32_MAX - // Do nothing with 32-bit -#else -#error Unsupported architecture -#endif - Num += 1; - return Num >> 1; -} - -/// Find a suitable bucket -int findBucket(size_t Size) { - const size_t F = floorToPowerOfTwo(Size); - - DP("findBucket: Size %zu is floored to %zu.\n", Size, F); - - int L = 0, H = NumBuckets - 1; - while (H - L > 1) { - int M = (L + H) >> 1; - if (BucketSize[M] == F) - return M; - if (BucketSize[M] > F) - H = M - 1; - else - L = M; - } - - assert(L >= 0 && L < NumBuckets && "L is out of range"); - - DP("findBucket: Size %zu goes to bucket %d\n", Size, L); - - return L; -} -} // namespace - -MemoryManagerTy::MemoryManagerTy(DeviceTy &Dev, size_t Threshold) - : FreeLists(NumBuckets), FreeListLocks(NumBuckets), Device(Dev) { - if (Threshold) - SizeThreshold = Threshold; -} - -MemoryManagerTy::~MemoryManagerTy() { - // TODO: There is a little issue that target plugin is destroyed before this - // object, therefore the memory free will not succeed. 
- // Deallocate all memory in map - for (auto Itr = PtrToNodeTable.begin(); Itr != PtrToNodeTable.end(); ++Itr) { - assert(Itr->second.Ptr && "nullptr in map table"); - deleteOnDevice(Itr->second.Ptr); - } -} - -void *MemoryManagerTy::allocateOnDevice(size_t Size, void *HstPtr) const { - return Device.RTL->data_alloc(Device.RTLDeviceID, Size, HstPtr); -} - -int MemoryManagerTy::deleteOnDevice(void *Ptr) const { - return Device.RTL->data_delete(Device.RTLDeviceID, Ptr); -} - -void *MemoryManagerTy::freeAndAllocate(size_t Size, void *HstPtr) { - std::vector RemoveList; - - // Deallocate all memory in FreeList - for (int I = 0; I < NumBuckets; ++I) { - FreeListTy &List = FreeLists[I]; - std::lock_guard Lock(FreeListLocks[I]); - if (List.empty()) - continue; - for (const NodeTy &N : List) { - deleteOnDevice(N.Ptr); - RemoveList.push_back(N.Ptr); - } - FreeLists[I].clear(); - } - - // Remove all nodes in the map table which have been released - if (!RemoveList.empty()) { - std::lock_guard LG(MapTableLock); - for (void *P : RemoveList) - PtrToNodeTable.erase(P); - } - - // Try allocate memory again - return allocateOnDevice(Size, HstPtr); -} - -void *MemoryManagerTy::allocateOrFreeAndAllocateOnDevice(size_t Size, - void *HstPtr) { - void *TgtPtr = allocateOnDevice(Size, HstPtr); - // We cannot get memory from the device. It might be due to OOM. Let's - // free all memory in FreeLists and try again. - if (TgtPtr == nullptr) { - DP("Failed to get memory on device. Free all memory in FreeLists and " - "try again.\n"); - TgtPtr = freeAndAllocate(Size, HstPtr); - } - -#ifdef OMPTARGET_DEBUG - if (TgtPtr == nullptr) - DP("Still cannot get memory on device probably because the device is " - "OOM.\n"); -#endif - - return TgtPtr; -} - -void *MemoryManagerTy::allocate(size_t Size, void *HstPtr) { - // If the size is zero, we will not bother the target device. Just return - // nullptr directly. 
- if (Size == 0) - return nullptr; - - DP("MemoryManagerTy::allocate: size %zu with host pointer " DPxMOD ".\n", - Size, DPxPTR(HstPtr)); - - // If the size is greater than the threshold, allocate it directly from - // device. - if (Size > SizeThreshold) { - DP("%zu is greater than the threshold %zu. Allocate it directly from " - "device\n", - Size, SizeThreshold); - void *TgtPtr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr); - - DP("Got target pointer " DPxMOD ". Return directly.\n", DPxPTR(TgtPtr)); - - return TgtPtr; - } - - NodeTy *NodePtr = nullptr; - - // Try to get a node from FreeList - { - const int B = findBucket(Size); - FreeListTy &List = FreeLists[B]; - - NodeTy TempNode(Size, nullptr); - std::lock_guard LG(FreeListLocks[B]); - FreeListTy::const_iterator Itr = List.find(TempNode); - - if (Itr != List.end()) { - NodePtr = &Itr->get(); - List.erase(Itr); - } - } - -#ifdef OMPTARGET_DEBUG - if (NodePtr != nullptr) - DP("Find one node " DPxMOD " in the bucket.\n", DPxPTR(NodePtr)); -#endif - - // We cannot find a valid node in FreeLists. Let's allocate on device and - // create a node for it. - if (NodePtr == nullptr) { - DP("Cannot find a node in the FreeLists. 
Allocate on device.\n"); - // Allocate one on device - void *TgtPtr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr); - - if (TgtPtr == nullptr) - return nullptr; - - // Create a new node and add it into the map table - { - std::lock_guard Guard(MapTableLock); - auto Itr = PtrToNodeTable.emplace(TgtPtr, NodeTy(Size, TgtPtr)); - NodePtr = &Itr.first->second; - } - - DP("Node address " DPxMOD ", target pointer " DPxMOD ", size %zu\n", - DPxPTR(NodePtr), DPxPTR(TgtPtr), Size); - } - - assert(NodePtr && "NodePtr should not be nullptr at this point"); - - return NodePtr->Ptr; -} - -int MemoryManagerTy::free(void *TgtPtr) { - DP("MemoryManagerTy::free: target memory " DPxMOD ".\n", DPxPTR(TgtPtr)); - - NodeTy *P = nullptr; - - // Look it up into the table - { - std::lock_guard G(MapTableLock); - auto Itr = PtrToNodeTable.find(TgtPtr); - - // We don't remove the node from the map table because the map does not - // change. - if (Itr != PtrToNodeTable.end()) - P = &Itr->second; - } - - // The memory is not managed by the manager - if (P == nullptr) { - DP("Cannot find its node. Delete it on device directly.\n"); - return deleteOnDevice(TgtPtr); - } - - // Insert the node to the free list - const int B = findBucket(P->Size); - - DP("Found its node " DPxMOD ". Insert it to bucket %d.\n", DPxPTR(P), B); - - { - std::lock_guard G(FreeListLocks[B]); - FreeLists[B].insert(*P); - } - - return OFFLOAD_SUCCESS; -} diff --git a/openmp/libomptarget/src/device.h b/openmp/libomptarget/src/device.h --- a/openmp/libomptarget/src/device.h +++ b/openmp/libomptarget/src/device.h @@ -29,7 +29,6 @@ struct __tgt_bin_desc; struct __tgt_target_table; struct __tgt_async_info; -class MemoryManagerTy; using map_var_info_t = void *; @@ -157,9 +156,6 @@ // moved into the target task in libomp. std::map LoopTripCnt; - /// Memory manager - std::unique_ptr MemoryManager; - DeviceTy(RTLInfoTy *RTL); // The existence of mutexes makes DeviceTy non-copyable. 
We need to diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp --- a/openmp/libomptarget/src/device.cpp +++ b/openmp/libomptarget/src/device.cpp @@ -11,7 +11,6 @@ //===----------------------------------------------------------------------===// #include "device.h" -#include "MemoryManager.h" #include "private.h" #include "rtl.h" @@ -26,7 +25,7 @@ HostDataToTargetMap(D.HostDataToTargetMap), PendingCtorsDtors(D.PendingCtorsDtors), ShadowPtrMap(D.ShadowPtrMap), DataMapMtx(), PendingGlobalsMtx(), ShadowMtx(), - LoopTripCnt(D.LoopTripCnt), MemoryManager(nullptr) {} + LoopTripCnt(D.LoopTripCnt) {} DeviceTy &DeviceTy::operator=(const DeviceTy &D) { DeviceID = D.DeviceID; @@ -45,8 +44,7 @@ DeviceTy::DeviceTy(RTLInfoTy *RTL) : DeviceID(-1), RTL(RTL), RTLDeviceID(-1), IsInit(false), InitFlag(), HasPendingGlobals(false), HostDataToTargetMap(), PendingCtorsDtors(), - ShadowPtrMap(), DataMapMtx(), PendingGlobalsMtx(), ShadowMtx(), - MemoryManager(nullptr) {} + ShadowPtrMap(), DataMapMtx(), PendingGlobalsMtx(), ShadowMtx() {} DeviceTy::~DeviceTy() { if (DeviceID == -1 || !(getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)) @@ -377,16 +375,6 @@ if (Ret != OFFLOAD_SUCCESS) return; - // The memory manager will only be disabled when users provide a threshold via - // the environment variable \p LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD and set - // it to 0. - if (const char *Env = std::getenv("LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD")) { - size_t Threshold = std::stoul(Env); - if (Threshold) - MemoryManager = std::make_unique(*this, Threshold); - } else - MemoryManager = std::make_unique(*this); - IsInit = true; } @@ -415,18 +403,10 @@ } void *DeviceTy::allocData(int64_t Size, void *HstPtr) { - // If memory manager is enabled, we will allocate data via memory manager. 
- if (MemoryManager) - return MemoryManager->allocate(Size, HstPtr); - return RTL->data_alloc(RTLDeviceID, Size, HstPtr); } int32_t DeviceTy::deleteData(void *TgtPtrBegin) { - // If memory manager is enabled, we will deallocate data via memory manager. - if (MemoryManager) - return MemoryManager->free(TgtPtrBegin); - return RTL->data_delete(RTLDeviceID, TgtPtrBegin); }