diff --git a/openmp/libomptarget/src/CMakeLists.txt b/openmp/libomptarget/src/CMakeLists.txt --- a/openmp/libomptarget/src/CMakeLists.txt +++ b/openmp/libomptarget/src/CMakeLists.txt @@ -1,9 +1,9 @@ ##===----------------------------------------------------------------------===## -# +# # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# +# ##===----------------------------------------------------------------------===## # # Build offloading library libomptarget.so. @@ -16,6 +16,7 @@ api.cpp device.cpp interface.cpp + MemoryManager.cpp rtl.cpp omptarget.cpp ) diff --git a/openmp/libomptarget/src/MemoryManager.h b/openmp/libomptarget/src/MemoryManager.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/src/MemoryManager.h @@ -0,0 +1,42 @@ +//===----------- MemoryManager.h - Target independent memory manager ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Declarations for target independent memory manager. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +// Forward declaration +struct DeviceTy; + +namespace memory { +namespace impl { +class MemoryManagerImplTy; +} // namespace impl + +class MemoryManagerTy { + std::unique_ptr Impl; + +public: + /// Constructor + MemoryManagerTy(DeviceTy &D, size_t Threshold = 0); + + /// Destructor + ~MemoryManagerTy(); + + /// Allocate memory of size \p Size from target device. \p HstPtr is used to + /// assist the allocation. + void *allocate(size_t Size, void *HstPtr); + + /// Deallocate memory pointed by \p TgtPtr + int free(void *TgtPtr); +}; +} // namespace memory diff --git a/openmp/libomptarget/src/MemoryManager.cpp b/openmp/libomptarget/src/MemoryManager.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/src/MemoryManager.cpp @@ -0,0 +1,333 @@ +//===----------- MemoryManager.cpp - Target independent memory manager ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Functionality for managing target memory. +// It is very expensive to call alloc/free functions of target devices. The +// MemoryManagerTy in this file is to reduce the number of invocations of those +// functions by buffering allocated device memory. In this way, when a memory is +// not used, it will not be freed on the device directly. The buffer is +// organized in a number of buckets for efficient look up. A memory will go to +// corresponding bucket based on its size. When a new memory request comes in, +// it will first check whether there is free memory of same size. If yes, +// returns it directly. Otherwise, allocate one on device. +// +// It also provides a way to opt out the memory manager. Memory +// allocation/deallocation will only be managed if the requested size is less +// than SizeThreshold, which can be configured via an environment variable +// LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +#include "MemoryManager.h" +#include "device.h" +#include "rtl.h" + +#ifdef OMPTARGET_DEBUG +#include "private.h" +#endif + +namespace { +constexpr const size_t BucketSize[] = { + 0, 1U << 2, 1U << 3, 1U << 4, 1U << 5, 1U << 6, 1U << 7, + 1U << 8, 1U << 9, 1U << 10, 1U << 11, 1U << 12, 1U << 13}; + +constexpr const int NumBuckets = sizeof(BucketSize) / sizeof(BucketSize[0]); + +/// The threshold to manage memory using memory manager. If the request size is +/// larger than \p SizeThreshold, the allocation will not be managed by the +/// memory manager. This variable can be configured via an env \p +/// LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD. By default, the value is 8KB. +size_t SizeThreshold = 1U << 13; + +/// Find the previous number that is power of 2 given a number that is not power +/// of 2. +size_t floorToPowerOfTwo(size_t Num) { + Num |= Num >> 1; + Num |= Num >> 2; + Num |= Num >> 4; + Num |= Num >> 8; + Num |= Num >> 16; + Num |= Num >> 32; + Num += 1; + return Num >> 1; +} + +/// Find a suitable bucket +int findBucket(size_t Size) { + const size_t F = floorToPowerOfTwo(Size); + + DP("findBucket: Size %zu is floored to %zu.\n", Size, F); + + int L = 0, H = NumBuckets - 1; + while (H - L > 1) { + int M = (L + H) >> 1; + if (BucketSize[M] == F) + return M; + if (BucketSize[M] > F) + H = M - 1; + else + L = M; + } + + assert(L >= 0 && L < NumBuckets && "L is out of range"); + + DP("findBucket: Size %zu goes to bucket %d\n", Size, L); + + return L; +} +} // namespace + +namespace memory { +namespace impl { +class MemoryManagerImplTy { + /// A structure stores the meta data of a target pointer + struct NodeTy { + /// Memory size + const size_t Size; + /// Target pointer + void *Ptr; + + /// Constructor + NodeTy(size_t Size, void *Ptr) : Size(Size), Ptr(Ptr) {} + }; + + /// Nodes are used in a format of \p std::shared_ptr + using NodePtrTy = std::shared_ptr; + + /// To make \p NodePtrTy ordered when they're put into \p std::multiset. + struct NodePtrCmpTy { + bool operator()(const NodePtrTy &LHS, const NodePtrTy &RHS) const { + return LHS->Size < RHS->Size; + } + }; + + /// A \p FreeList is a set of Nodes. We're using \p std::multiset here to make + /// the look up procedure more efficient. + using FreeListTy = std::multiset; + + /// A list of \p FreeListTy entries, each of which is a \p std::multiset of + /// Nodes whose size is less or equal to a specific bucket size. + std::vector FreeLists; + /// A list of mutex for each \p FreeListTy entry + std::vector FreeListLocks; + /// A table to map from a target pointer to its node + std::unordered_map PtrToNodeTable; + /// The mutex for the table \p PtrToNodeTable + std::mutex MapTableLock; + /// A reference to its corresponding \p DeviceTy object + DeviceTy &Device; + + /// Request memory from target device + void *allocateOnDevice(size_t Size, void *HstPtr) const { + return Device.RTL->data_alloc(Device.RTLDeviceID, Size, HstPtr); + } + + /// Deallocate data on device + int deleteOnDevice(void *Ptr) const { + return Device.RTL->data_delete(Device.RTLDeviceID, Ptr); + } + + /// This function is called when it tries to allocate memory on device but the + /// device returns out of memory. It will first free all memory in the + /// FreeList and try to allocate again. + void *freeAndAllocate(size_t Size, void *HstPtr) { + // Deallocate all memory in FreeList + for (int I = 0; I < NumBuckets; ++I) { + FreeListTy &List = FreeLists[I]; + if (List.empty()) + continue; + std::lock_guard Lock(FreeListLocks[I]); + for (const NodePtrTy &N : List) + deleteOnDevice(N->Ptr); + FreeLists[I].clear(); + } + + // Try allocate memory again + return allocateOnDevice(Size, HstPtr); + } + + /// The goal is to allocate memory on the device. It first tries to allocate + /// directly on the device. If a \p nullptr is returned, it might be because + /// the device is OOM. In that case, it will free all unused memory and then + /// try again. + void *allocateOrFreeAndAllocateOnDevice(size_t Size, void *HstPtr) { + void *TgtPtr = allocateOnDevice(Size, HstPtr); + // We cannot get memory from the device. It might be due to OOM. Let's + // free all memory in FreeLists and try again. + if (TgtPtr == nullptr) { + DP("Failed to get memory on device. Free all memory in FreeLists and " + "try again.\n"); + TgtPtr = freeAndAllocate(Size, HstPtr); + } + +#ifdef OMPTARGET_DEBUG + if (TgtPtr == nullptr) + DP("Still cannot get memory on device probably because the device is " + "OOM.\n"); +#endif + + return TgtPtr; + } + +public: + /// Constructor + MemoryManagerImplTy(DeviceTy &Dev) + : FreeLists(NumBuckets), FreeListLocks(NumBuckets), Device(Dev) {} + + /// Destructor + ~MemoryManagerImplTy() { + // TODO: There is a little issue that target plugin is destroyed before this + // object, therefore the memory free will not succeed. + // Deallocate all memory in map + for (std::pair P : PtrToNodeTable) { + assert(P.second->Ptr && "nullptr in map table"); + deleteOnDevice(P.second->Ptr); + // We need to set Ptr to nullptr because one node might exist both in the + // map table and the free lists. + P.second->Ptr = nullptr; + } + + // Deallocate all memory in FreeList + for (int I = 0; I < NumBuckets; ++I) { + // We don't need lock here because only one thread can execute it + FreeListTy &List = FreeLists[I]; + for (const NodePtrTy &N : List) + if (N->Ptr) + deleteOnDevice(N->Ptr); + } + } + + void *allocate(size_t Size, void *HstPtr) { + // If the size is zero, we will not bother the target device. Just return + // nullptr directly. + if (Size == 0) + return nullptr; + + DP("MemoryManagerImplTy::allocate: size %zu with host pointer " DPxMOD + ".\n", + Size, DPxPTR(HstPtr)); + + // If the size is greater than the threshold, allocate it directly from + // device. + if (Size > SizeThreshold) { + DP("%zu is greater than the threshold %zu. Allocate it directly from " + "device\n", + Size, SizeThreshold); + void *TgtPtr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr); + + DP("Got target pointer " DPxMOD ". Return directly.\n", DPxPTR(TgtPtr)); + + return TgtPtr; + } + + NodePtrTy NodePtr(nullptr); + + // Try to get a node from FreeList + { + const int B = findBucket(Size); + FreeListTy &List = FreeLists[B]; + std::lock_guard LG(FreeListLocks[B]); + FreeListTy::const_iterator Itr = + List.find(std::make_shared(Size, nullptr)); + + if (Itr != List.end()) { + NodePtr = *Itr; + List.erase(Itr); + } + } + +#ifdef OMPTARGET_DEBUG + if (NodePtr != nullptr) + DP("Find one node " DPxMOD " in the bucket.\n", DPxPTR(NodePtr.get())); +#endif + + // We cannot find a valid node in FreeLists. Let's allocate on device and + // create a node for it. + if (NodePtr == nullptr) { + DP("Cannot find a node in the FreeLists. Allocate on device.\n"); + // Allocate one on device + void *TgtPtr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr); + + if (TgtPtr == nullptr) + return nullptr; + + NodePtr = std::make_shared(Size, TgtPtr); + + DP("Node address " DPxMOD ", target pointer " DPxMOD ", size %zu\n", + DPxPTR(NodePtr.get()), DPxPTR(TgtPtr), Size); + } + + // Insert the node into the map table if it is not there + if (PtrToNodeTable.find(NodePtr->Ptr) == PtrToNodeTable.end()) { + std::lock_guard Guard(MapTableLock); + PtrToNodeTable[NodePtr->Ptr] = NodePtr; + } + + return NodePtr->Ptr; + } + + int free(void *TgtPtr) { + DP("MemoryManagerImplTy::free: target memory " DPxMOD ".\n", + DPxPTR(TgtPtr)); + + NodePtrTy P(nullptr); + + // Look it up into the table + { + std::lock_guard G(MapTableLock); + std::unordered_map::const_iterator Itr = + PtrToNodeTable.find(TgtPtr); + + // We don't remove the node from the map table because the map does not + // change. + if (Itr != PtrToNodeTable.end()) + P = Itr->second; + } + + // The memory is not managed by the manager + if (P == nullptr) { + DP("Cannot find its node. Delete it on device directly.\n"); + return deleteOnDevice(TgtPtr); + } + + // Insert the node to the free list + const int B = findBucket(P->Size); + FreeListTy &List = FreeLists[B]; + + DP("Found its node " DPxMOD ". Insert it to bucket %d.\n", DPxPTR(P.get()), + B); + + std::lock_guard G(FreeListLocks[B]); + List.insert(P); + + return OFFLOAD_SUCCESS; + } +}; +} // namespace impl + +void *MemoryManagerTy::allocate(size_t Size, void *HstPtr) { + return Impl->allocate(Size, HstPtr); +} + +int MemoryManagerTy::free(void *TgtPtr) { return Impl->free(TgtPtr); } + +MemoryManagerTy::MemoryManagerTy(DeviceTy &D, size_t Threshold) + : Impl(new impl::MemoryManagerImplTy(D)) { + if (Threshold) + SizeThreshold = Threshold; +} + +MemoryManagerTy::~MemoryManagerTy() = default; +} // namespace memory diff --git a/openmp/libomptarget/src/device.h b/openmp/libomptarget/src/device.h --- a/openmp/libomptarget/src/device.h +++ b/openmp/libomptarget/src/device.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -26,6 +27,9 @@ struct __tgt_bin_desc; struct __tgt_target_table; struct __tgt_async_info; +namespace memory { +class MemoryManagerTy; +} // namespace memory /// Map between host data and target data. struct HostDataToTargetTy { @@ -142,10 +146,14 @@ // moved into the target task in libomp. std::map LoopTripCnt; + /// Memory manager + std::shared_ptr MemoryManager; + DeviceTy(RTLInfoTy *RTL) : DeviceID(-1), RTL(RTL), RTLDeviceID(-1), IsInit(false), InitFlag(), HasPendingGlobals(false), HostDataToTargetMap(), PendingCtorsDtors(), - ShadowPtrMap(), DataMapMtx(), PendingGlobalsMtx(), ShadowMtx() {} + ShadowPtrMap(), DataMapMtx(), PendingGlobalsMtx(), ShadowMtx(), + MemoryManager(nullptr) {} // The existence of mutexes makes DeviceTy non-copyable. We need to // provide a copy constructor and an assignment operator explicitly. diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp --- a/openmp/libomptarget/src/device.cpp +++ b/openmp/libomptarget/src/device.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "device.h" +#include "MemoryManager.h" #include "private.h" #include "rtl.h" @@ -321,10 +322,21 @@ // Make call to init_requires if it exists for this plugin. if (RTL->init_requires) RTL->init_requires(RTLs->RequiresFlags); - int32_t rc = RTL->init_device(RTLDeviceID); - if (rc == OFFLOAD_SUCCESS) { - IsInit = true; - } + int32_t Ret = RTL->init_device(RTLDeviceID); + if (Ret != OFFLOAD_SUCCESS) + return; + + size_t Threshold = 1U << 13; + + if (const char *Env = std::getenv("LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD")) + Threshold = std::stoul(Env); + + // Only when the threashold is not set to 0 by user via the environment + // variable can we use memory manager. + if (Threshold) + MemoryManager = std::make_shared(*this, Threshold); + + IsInit = true; } /// Thread-safe method to initialize the device only once. @@ -352,10 +364,18 @@ } void *DeviceTy::allocData(int64_t Size, void *HstPtr) { + // If memory manager is enabled, we will allocate data via memory manager. + if (MemoryManager) + return MemoryManager->allocate(Size, HstPtr); + return RTL->data_alloc(RTLDeviceID, Size, HstPtr); } int32_t DeviceTy::deleteData(void *TgtPtrBegin) { + // If memory manager is enabled, we will deallocate data via memory manager. + if (MemoryManager) + return MemoryManager->free(TgtPtrBegin); + return RTL->data_delete(RTLDeviceID, TgtPtrBegin); } diff --git a/openmp/libomptarget/test/offloading/memory_manager.cpp b/openmp/libomptarget/test/offloading/memory_manager.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/test/offloading/memory_manager.cpp @@ -0,0 +1,46 @@ +// RUN: %libomptarget-compilexx-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-nvptx64-nvidia-cuda + +#include + +#include +#include + +int main(int argc, char *argv[]) { +#pragma omp parallel for + for (int i = 0; i < 16; ++i) { + for (int n = 1; n < (1 << 13); n <<= 1) { + void *p = omp_target_alloc(n * sizeof(int), 0); + omp_target_free(p, 0); + } + } + +#pragma omp parallel for + for (int i = 0; i < 16; ++i) { + for (int n = 1; n < (1 << 13); n <<= 1) { + int *p = (int *)omp_target_alloc(n * sizeof(int), 0); +#pragma omp target teams distribute parallel for is_device_ptr(p) + for (int j = 0; j < n; ++j) { + p[j] = i; + } + int buffer[n]; +#pragma omp target teams distribute parallel for is_device_ptr(p) \ + map(from: buffer) + for (int j = 0; j < n; ++j) { + buffer[j] = p[j]; + } + for (int j = 0; j < n; ++j) { + assert(buffer[j] == i); + } + omp_target_free(p, 0); + } + } + + std::cout << "PASS\n"; + return 0; +} + +// CHECK: PASS