diff --git a/openmp/libomptarget/src/CMakeLists.txt b/openmp/libomptarget/src/CMakeLists.txt --- a/openmp/libomptarget/src/CMakeLists.txt +++ b/openmp/libomptarget/src/CMakeLists.txt @@ -1,9 +1,9 @@ ##===----------------------------------------------------------------------===## -# +# # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# +# ##===----------------------------------------------------------------------===## # # Build offloading library libomptarget.so. @@ -16,6 +16,7 @@ api.cpp device.cpp interface.cpp + MemoryManager.cpp rtl.cpp omptarget.cpp ) diff --git a/openmp/libomptarget/src/MemoryManager.h b/openmp/libomptarget/src/MemoryManager.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/src/MemoryManager.h @@ -0,0 +1,92 @@ +//===----------- MemoryManager.h - Target independent memory manager ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Declarations for target independent memory manager. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +// Forward declaration +struct DeviceTy; + +class MemoryManagerTy { + /// A structure stores the meta data of a target pointer + struct NodeTy { + /// Memory size + const size_t Size; + /// Target pointer + void *Ptr; + + /// Constructor + NodeTy(size_t Size, void *Ptr) : Size(Size), Ptr(Ptr) {} + }; + + /// To make \p NodePtrTy ordered when they're put into \p std::multiset. + struct NodeCmpTy { + bool operator()(const NodeTy &LHS, const NodeTy &RHS) const { + return LHS.Size < RHS.Size; + } + }; + + /// A \p FreeList is a set of Nodes. We're using \p std::multiset here to make + /// the look up procedure more efficient. + using FreeListTy = std::multiset, NodeCmpTy>; + + /// A list of \p FreeListTy entries, each of which is a \p std::multiset of + /// Nodes whose size is less or equal to a specific bucket size. + std::vector FreeLists; + /// A list of mutex for each \p FreeListTy entry + std::vector FreeListLocks; + /// A table to map from a target pointer to its node + std::unordered_map PtrToNodeTable; + /// The mutex for the table \p PtrToNodeTable + std::mutex MapTableLock; + /// A reference to its corresponding \p DeviceTy object + DeviceTy &Device; + + /// Request memory from target device + void *allocateOnDevice(size_t Size, void *HstPtr) const; + + /// Deallocate data on device + int deleteOnDevice(void *Ptr) const; + + /// This function is called when it tries to allocate memory on device but the + /// device returns out of memory. It will first free all memory in the + /// FreeList and try to allocate again. + void *freeAndAllocate(size_t Size, void *HstPtr); + + /// The goal is to allocate memory on the device. It first tries to allocate + /// directly on the device. If a \p nullptr is returned, it might be because + /// the device is OOM. In that case, it will free all unused memory and then + /// try again. + void *allocateOrFreeAndAllocateOnDevice(size_t Size, void *HstPtr); + +public: + /// Constructor. If \p Threshold is non-zero, then the default threshold will + /// be overwritten by \p Threshold. + MemoryManagerTy(DeviceTy &Dev, size_t Threshold = 0); + + /// Destructor + ~MemoryManagerTy(); + + /// Allocate memory of size \p Size from target device. \p HstPtr is used to + /// assist the allocation. + void *allocate(size_t Size, void *HstPtr); + + /// Deallocate memory pointed by \p TgtPtr + int free(void *TgtPtr); +}; diff --git a/openmp/libomptarget/src/MemoryManager.cpp b/openmp/libomptarget/src/MemoryManager.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/src/MemoryManager.cpp @@ -0,0 +1,256 @@ +//===----------- MemoryManager.cpp - Target independent memory manager ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Functionality for managing target memory. +// It is very expensive to call alloc/free functions of target devices. The +// MemoryManagerTy in this file is to reduce the number of invocations of those +// functions by buffering allocated device memory. In this way, when a memory is +// not used, it will not be freed on the device directly. The buffer is +// organized in a number of buckets for efficient look up. A memory will go to +// corresponding bucket based on its size. When a new memory request comes in, +// it will first check whether there is free memory of same size. If yes, +// returns it directly. Otherwise, allocate one on device. +// +// It also provides a way to opt out the memory manager. Memory +// allocation/deallocation will only be managed if the requested size is less +// than SizeThreshold, which can be configured via an environment variable +// LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD. +// +//===----------------------------------------------------------------------===// + +#include "MemoryManager.h" +#include "device.h" +#include "private.h" +#include "rtl.h" + +namespace { +constexpr const size_t BucketSize[] = { + 0, 1U << 2, 1U << 3, 1U << 4, 1U << 5, 1U << 6, 1U << 7, + 1U << 8, 1U << 9, 1U << 10, 1U << 11, 1U << 12, 1U << 13}; + +constexpr const int NumBuckets = sizeof(BucketSize) / sizeof(BucketSize[0]); + +/// The threshold to manage memory using memory manager. If the request size is +/// larger than \p SizeThreshold, the allocation will not be managed by the +/// memory manager. This variable can be configured via an env \p +/// LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD. By default, the value is 8KB. +size_t SizeThreshold = 1U << 13; + +/// Find the previous number that is power of 2 given a number that is not power +/// of 2. +size_t floorToPowerOfTwo(size_t Num) { + Num |= Num >> 1; + Num |= Num >> 2; + Num |= Num >> 4; + Num |= Num >> 8; + Num |= Num >> 16; + Num |= Num >> 32; + Num += 1; + return Num >> 1; +} + +/// Find a suitable bucket +int findBucket(size_t Size) { + const size_t F = floorToPowerOfTwo(Size); + + DP("findBucket: Size %zu is floored to %zu.\n", Size, F); + + int L = 0, H = NumBuckets - 1; + while (H - L > 1) { + int M = (L + H) >> 1; + if (BucketSize[M] == F) + return M; + if (BucketSize[M] > F) + H = M - 1; + else + L = M; + } + + assert(L >= 0 && L < NumBuckets && "L is out of range"); + + DP("findBucket: Size %zu goes to bucket %d\n", Size, L); + + return L; +} +} // namespace + +MemoryManagerTy::MemoryManagerTy(DeviceTy &Dev, size_t Threshold) + : FreeLists(NumBuckets), FreeListLocks(NumBuckets), Device(Dev) { + if (Threshold) + SizeThreshold = Threshold; +} + +MemoryManagerTy::~MemoryManagerTy() { + // TODO: There is a little issue that target plugin is destroyed before this + // object, therefore the memory free will not succeed. + // Deallocate all memory in map + for (auto Itr = PtrToNodeTable.begin(); Itr != PtrToNodeTable.end(); ++Itr) { + assert(Itr->second.Ptr && "nullptr in map table"); + deleteOnDevice(Itr->second.Ptr); + } +} + +void *MemoryManagerTy::allocateOnDevice(size_t Size, void *HstPtr) const { + return Device.RTL->data_alloc(Device.RTLDeviceID, Size, HstPtr); +} + +int MemoryManagerTy::deleteOnDevice(void *Ptr) const { + return Device.RTL->data_delete(Device.RTLDeviceID, Ptr); +} + +void *MemoryManagerTy::freeAndAllocate(size_t Size, void *HstPtr) { + std::vector RemoveList; + + // Deallocate all memory in FreeList + for (int I = 0; I < NumBuckets; ++I) { + FreeListTy &List = FreeLists[I]; + std::lock_guard Lock(FreeListLocks[I]); + if (List.empty()) + continue; + for (const NodeTy &N : List) { + deleteOnDevice(N.Ptr); + RemoveList.push_back(N.Ptr); + } + FreeLists[I].clear(); + } + + // Remove all nodes in the map table which have been released + if (!RemoveList.empty()) { + std::lock_guard LG(MapTableLock); + for (void *P : RemoveList) + PtrToNodeTable.erase(P); + } + + // Try allocate memory again + return allocateOnDevice(Size, HstPtr); +} + +void *MemoryManagerTy::allocateOrFreeAndAllocateOnDevice(size_t Size, + void *HstPtr) { + void *TgtPtr = allocateOnDevice(Size, HstPtr); + // We cannot get memory from the device. It might be due to OOM. Let's + // free all memory in FreeLists and try again. + if (TgtPtr == nullptr) { + DP("Failed to get memory on device. Free all memory in FreeLists and " + "try again.\n"); + TgtPtr = freeAndAllocate(Size, HstPtr); + } + +#ifdef OMPTARGET_DEBUG + if (TgtPtr == nullptr) + DP("Still cannot get memory on device probably because the device is " + "OOM.\n"); +#endif + + return TgtPtr; +} + +void *MemoryManagerTy::allocate(size_t Size, void *HstPtr) { + // If the size is zero, we will not bother the target device. Just return + // nullptr directly. + if (Size == 0) + return nullptr; + + DP("MemoryManagerTy::allocate: size %zu with host pointer " DPxMOD ".\n", + Size, DPxPTR(HstPtr)); + + // If the size is greater than the threshold, allocate it directly from + // device. + if (Size > SizeThreshold) { + DP("%zu is greater than the threshold %zu. Allocate it directly from " + "device\n", + Size, SizeThreshold); + void *TgtPtr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr); + + DP("Got target pointer " DPxMOD ". Return directly.\n", DPxPTR(TgtPtr)); + + return TgtPtr; + } + + NodeTy *NodePtr = nullptr; + + // Try to get a node from FreeList + { + const int B = findBucket(Size); + FreeListTy &List = FreeLists[B]; + + NodeTy TempNode(Size, nullptr); + std::lock_guard LG(FreeListLocks[B]); + FreeListTy::const_iterator Itr = List.find(TempNode); + + if (Itr != List.end()) { + NodePtr = &Itr->get(); + List.erase(Itr); + } + } + +#ifdef OMPTARGET_DEBUG + if (NodePtr != nullptr) + DP("Find one node " DPxMOD " in the bucket.\n", DPxPTR(NodePtr)); +#endif + + // We cannot find a valid node in FreeLists. Let's allocate on device and + // create a node for it. + if (NodePtr == nullptr) { + DP("Cannot find a node in the FreeLists. Allocate on device.\n"); + // Allocate one on device + void *TgtPtr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr); + + if (TgtPtr == nullptr) + return nullptr; + + // Create a new node and add it into the map table + { + std::lock_guard Guard(MapTableLock); + auto Itr = PtrToNodeTable.emplace(TgtPtr, NodeTy(Size, TgtPtr)); + NodePtr = &Itr.first->second; + } + + DP("Node address " DPxMOD ", target pointer " DPxMOD ", size %zu\n", + DPxPTR(NodePtr), DPxPTR(TgtPtr), Size); + } + + assert(NodePtr && "NodePtr should not be nullptr at this point"); + + return NodePtr->Ptr; +} + +int MemoryManagerTy::free(void *TgtPtr) { + DP("MemoryManagerTy::free: target memory " DPxMOD ".\n", DPxPTR(TgtPtr)); + + NodeTy *P = nullptr; + + // Look it up into the table + { + std::lock_guard G(MapTableLock); + auto Itr = PtrToNodeTable.find(TgtPtr); + + // We don't remove the node from the map table because the map does not + // change. + if (Itr != PtrToNodeTable.end()) + P = &Itr->second; + } + + // The memory is not managed by the manager + if (P == nullptr) { + DP("Cannot find its node. Delete it on device directly.\n"); + return deleteOnDevice(TgtPtr); + } + + // Insert the node to the free list + const int B = findBucket(P->Size); + + DP("Found its node " DPxMOD ". Insert it to bucket %d.\n", DPxPTR(P), B); + + { + std::lock_guard G(FreeListLocks[B]); + FreeLists[B].insert(*P); + } + + return OFFLOAD_SUCCESS; +} diff --git a/openmp/libomptarget/src/device.h b/openmp/libomptarget/src/device.h --- a/openmp/libomptarget/src/device.h +++ b/openmp/libomptarget/src/device.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -26,6 +27,7 @@ struct __tgt_bin_desc; struct __tgt_target_table; struct __tgt_async_info; +class MemoryManagerTy; /// Map between host data and target data. struct HostDataToTargetTy { @@ -142,34 +144,18 @@ // moved into the target task in libomp. std::map LoopTripCnt; - DeviceTy(RTLInfoTy *RTL) - : DeviceID(-1), RTL(RTL), RTLDeviceID(-1), IsInit(false), InitFlag(), - HasPendingGlobals(false), HostDataToTargetMap(), PendingCtorsDtors(), - ShadowPtrMap(), DataMapMtx(), PendingGlobalsMtx(), ShadowMtx() {} + /// Memory manager + std::unique_ptr MemoryManager; + + DeviceTy(RTLInfoTy *RTL); // The existence of mutexes makes DeviceTy non-copyable. We need to // provide a copy constructor and an assignment operator explicitly. - DeviceTy(const DeviceTy &d) - : DeviceID(d.DeviceID), RTL(d.RTL), RTLDeviceID(d.RTLDeviceID), - IsInit(d.IsInit), InitFlag(), HasPendingGlobals(d.HasPendingGlobals), - HostDataToTargetMap(d.HostDataToTargetMap), - PendingCtorsDtors(d.PendingCtorsDtors), ShadowPtrMap(d.ShadowPtrMap), - DataMapMtx(), PendingGlobalsMtx(), ShadowMtx(), - LoopTripCnt(d.LoopTripCnt) {} - - DeviceTy& operator=(const DeviceTy &d) { - DeviceID = d.DeviceID; - RTL = d.RTL; - RTLDeviceID = d.RTLDeviceID; - IsInit = d.IsInit; - HasPendingGlobals = d.HasPendingGlobals; - HostDataToTargetMap = d.HostDataToTargetMap; - PendingCtorsDtors = d.PendingCtorsDtors; - ShadowPtrMap = d.ShadowPtrMap; - LoopTripCnt = d.LoopTripCnt; - - return *this; - } + DeviceTy(const DeviceTy &D); + + DeviceTy &operator=(const DeviceTy &D); + + ~DeviceTy(); // Return true if data can be copied to DstDevice directly bool isDataExchangable(const DeviceTy& DstDevice); diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp --- a/openmp/libomptarget/src/device.cpp +++ b/openmp/libomptarget/src/device.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "device.h" +#include "MemoryManager.h" #include "private.h" #include "rtl.h" @@ -21,6 +22,36 @@ /// Map between Device ID (i.e. openmp device id) and its DeviceTy. DevicesTy Devices; +DeviceTy::DeviceTy(const DeviceTy &D) + : DeviceID(D.DeviceID), RTL(D.RTL), RTLDeviceID(D.RTLDeviceID), + IsInit(D.IsInit), InitFlag(), HasPendingGlobals(D.HasPendingGlobals), + HostDataToTargetMap(D.HostDataToTargetMap), + PendingCtorsDtors(D.PendingCtorsDtors), ShadowPtrMap(D.ShadowPtrMap), + DataMapMtx(), PendingGlobalsMtx(), ShadowMtx(), + LoopTripCnt(D.LoopTripCnt), MemoryManager(nullptr) {} + +DeviceTy &DeviceTy::operator=(const DeviceTy &D) { + DeviceID = D.DeviceID; + RTL = D.RTL; + RTLDeviceID = D.RTLDeviceID; + IsInit = D.IsInit; + HasPendingGlobals = D.HasPendingGlobals; + HostDataToTargetMap = D.HostDataToTargetMap; + PendingCtorsDtors = D.PendingCtorsDtors; + ShadowPtrMap = D.ShadowPtrMap; + LoopTripCnt = D.LoopTripCnt; + + return *this; +} + +DeviceTy::DeviceTy(RTLInfoTy *RTL) + : DeviceID(-1), RTL(RTL), RTLDeviceID(-1), IsInit(false), InitFlag(), + HasPendingGlobals(false), HostDataToTargetMap(), PendingCtorsDtors(), + ShadowPtrMap(), DataMapMtx(), PendingGlobalsMtx(), ShadowMtx(), + MemoryManager(nullptr) {} + +DeviceTy::~DeviceTy() = default; + int DeviceTy::associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size) { DataMapMtx.lock(); @@ -331,10 +362,21 @@ // Make call to init_requires if it exists for this plugin. if (RTL->init_requires) RTL->init_requires(RTLs->RequiresFlags); - int32_t rc = RTL->init_device(RTLDeviceID); - if (rc == OFFLOAD_SUCCESS) { - IsInit = true; - } + int32_t Ret = RTL->init_device(RTLDeviceID); + if (Ret != OFFLOAD_SUCCESS) + return; + + // The memory manager will only be disabled when users provide a threshold via + // the environment variable \p LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD and set + // it to 0. + if (const char *Env = std::getenv("LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD")) { + size_t Threshold = std::stoul(Env); + if (Threshold) + MemoryManager = std::make_unique(*this, Threshold); + } else + MemoryManager = std::make_unique(*this); + + IsInit = true; } /// Thread-safe method to initialize the device only once. @@ -362,10 +404,18 @@ } void *DeviceTy::allocData(int64_t Size, void *HstPtr) { + // If memory manager is enabled, we will allocate data via memory manager. + if (MemoryManager) + return MemoryManager->allocate(Size, HstPtr); + return RTL->data_alloc(RTLDeviceID, Size, HstPtr); } int32_t DeviceTy::deleteData(void *TgtPtrBegin) { + // If memory manager is enabled, we will deallocate data via memory manager. + if (MemoryManager) + return MemoryManager->free(TgtPtrBegin); + return RTL->data_delete(RTLDeviceID, TgtPtrBegin); } diff --git a/openmp/libomptarget/test/offloading/memory_manager.cpp b/openmp/libomptarget/test/offloading/memory_manager.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/test/offloading/memory_manager.cpp @@ -0,0 +1,47 @@ +// RUN: %libomptarget-compilexx-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-nvptx64-nvidia-cuda + +#include + +#include +#include + +int main(int argc, char *argv[]) { +#pragma omp parallel for + for (int i = 0; i < 16; ++i) { + for (int n = 1; n < (1 << 13); n <<= 1) { + void *p = omp_target_alloc(n * sizeof(int), 0); + omp_target_free(p, 0); + } + } + +#pragma omp parallel for + for (int i = 0; i < 16; ++i) { + for (int n = 1; n < (1 << 13); n <<= 1) { + int *p = (int *)omp_target_alloc(n * sizeof(int), 0); +#pragma omp target teams distribute parallel for is_device_ptr(p) + for (int j = 0; j < n; ++j) { + p[j] = i; + } + int buffer[n]; +#pragma omp target teams distribute parallel for is_device_ptr(p) \ + map(from \ + : buffer) + for (int j = 0; j < n; ++j) { + buffer[j] = p[j]; + } + for (int j = 0; j < n; ++j) { + assert(buffer[j] == i); + } + omp_target_free(p, 0); + } + } + + std::cout << "PASS\n"; + return 0; +} + +// CHECK: PASS