diff --git a/openmp/libomptarget/include/device.h b/openmp/libomptarget/include/device.h
--- a/openmp/libomptarget/include/device.h
+++ b/openmp/libomptarget/include/device.h
@@ -301,6 +301,9 @@
   std::mutex DataMapMtx, PendingGlobalsMtx, ShadowMtx;

+  /// Flag to indicate if the device supports asynchronous frees.
+  bool SupportsAsyncFree;
+
   // NOTE: Once libomp gains full target-task support, this state should be
   // moved into the target task in libomp.
   std::map LoopTripCnt;
@@ -345,7 +348,8 @@
   /// \c OFFLOAD_FAIL if not. It is the caller's responsibility to skip calling
   /// this function if the map entry is not expected to exist because
   /// \p HstPtrBegin uses shared memory.
-  int deallocTgtPtr(void *HstPtrBegin, int64_t Size, bool HasHoldModifier);
+  int deallocTgtPtr(void *HstPtrBegin, int64_t Size, bool HasHoldModifier,
+                    AsyncInfoTy &AsyncInfo);

   int associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size);
   int disassociatePtr(void *HstPtrBegin);
@@ -362,11 +366,12 @@
   /// pointer association. Actually, all the __tgt_rtl_data_alloc
   /// implementations ignore \p HstPtr. \p Kind dictates what allocator should
   /// be used (host, shared, device).
-  void *allocData(int64_t Size, void *HstPtr = nullptr,
-                  int32_t Kind = TARGET_ALLOC_DEFAULT);
+  void *allocData(int64_t Size, void *HstPtr, int32_t Kind,
+                  __tgt_async_info *AsyncInfo = nullptr);
+
   /// Deallocates memory which \p TgtPtrBegin points at and returns
   /// OFFLOAD_SUCCESS/OFFLOAD_FAIL when succeeds/fails.
-  int32_t deleteData(void *TgtPtrBegin);
+  int32_t deleteData(void *TgtPtrBegin, __tgt_async_info *AsyncInfoPtr);

   // Data transfer. When AsyncInfo is nullptr, the transfer will be
   // synchronous.
diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h
--- a/openmp/libomptarget/include/omptarget.h
+++ b/openmp/libomptarget/include/omptarget.h
@@ -183,6 +183,10 @@
   /// Return a void* reference with a lifetime that is at least as long as this
   /// AsyncInfoTy object. The location can be used as intermediate buffer.
   void *&getVoidPtrLocation();
+
+  /// Return true if the associated __tgt_async_info object has a queue and
+  /// thus has been initialized and may have pending asynchronous work.
+  bool hasPotentiallyPendingAsyncWork() const { return AsyncInfo.Queue; }
 };

 /// This struct is a record of non-contiguous information
diff --git a/openmp/libomptarget/include/omptargetplugin.h b/openmp/libomptarget/include/omptargetplugin.h
--- a/openmp/libomptarget/include/omptargetplugin.h
+++ b/openmp/libomptarget/include/omptargetplugin.h
@@ -69,6 +69,8 @@
 // to use (e.g. shared, host, device).
 void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr,
                            int32_t Kind);
+void *__tgt_rtl_data_alloc_async(int32_t ID, int64_t Size, void *HostPtr,
+                                 int32_t Kind, __tgt_async_info *AsyncInfo);

 // Pass the data content to the target device using the target address. In case
 // of success, return zero. Otherwise, return an error code.
@@ -103,6 +105,8 @@
 // De-allocate the data referenced by target ptr on the device. In case of
 // success, return zero. Otherwise, return an error code.
 int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr);
+int32_t __tgt_rtl_data_delete_async(int32_t ID, void *TargetPtr,
+                                    __tgt_async_info *AsyncInfo);

 // Transfer control to the offloaded entry Entry on the target device.
 // Args and Offsets are arrays of NumArgs size of target addresses and
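The two *_async entry points above are optional: the host runtime resolves them with dlsym (see the src/rtl.cpp hunk at the end of this patch) and only uses them when the plugin provides them together with __tgt_rtl_synchronize. As an illustration only, not part of this patch, a plugin without stream-ordered allocation could still export them as thin wrappers that fall back to the synchronous paths; the declarations mirror the ones above, but the bodies below are hypothetical:

// Hypothetical fallback implementations for a plugin that has no
// stream-ordered allocator; not part of this patch.
void *__tgt_rtl_data_alloc_async(int32_t ID, int64_t Size, void *HostPtr,
                                 int32_t Kind, __tgt_async_info *AsyncInfo) {
  (void)AsyncInfo; // Nothing is enqueued; allocate eagerly.
  return __tgt_rtl_data_alloc(ID, Size, HostPtr, Kind);
}

int32_t __tgt_rtl_data_delete_async(int32_t ID, void *TargetPtr,
                                    __tgt_async_info *AsyncInfo) {
  // Drain any work that might still reference TargetPtr before freeing it.
  if (AsyncInfo && AsyncInfo->Queue)
    __tgt_rtl_synchronize(ID, AsyncInfo);
  return __tgt_rtl_data_delete(ID, TargetPtr);
}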
diff --git a/openmp/libomptarget/include/rtl.h b/openmp/libomptarget/include/rtl.h
--- a/openmp/libomptarget/include/rtl.h
+++ b/openmp/libomptarget/include/rtl.h
@@ -14,6 +14,7 @@
 #define _OMPTARGET_RTL_H

 #include "omptarget.h"
+#include
 #include
 #include
 #include
@@ -31,6 +32,8 @@
   typedef int32_t(init_device_ty)(int32_t);
   typedef __tgt_target_table *(load_binary_ty)(int32_t, void *);
   typedef void *(data_alloc_ty)(int32_t, int64_t, void *, int32_t);
+  typedef void *(data_alloc_async_ty)(int32_t, int64_t, void *, int32_t,
+                                      __tgt_async_info *);
   typedef int32_t(data_submit_ty)(int32_t, void *, void *, int64_t);
   typedef int32_t(data_submit_async_ty)(int32_t, void *, void *, int64_t,
                                         __tgt_async_info *);
@@ -41,6 +44,7 @@
   typedef int32_t(data_exchange_async_ty)(int32_t, void *, int32_t, void *,
                                           int64_t, __tgt_async_info *);
   typedef int32_t(data_delete_ty)(int32_t, void *);
+  typedef int32_t(data_delete_async_ty)(int32_t, void *, __tgt_async_info *);
   typedef int32_t(run_region_ty)(int32_t, void *, void **, ptrdiff_t *,
                                  int32_t);
   typedef int32_t(run_region_async_ty)(int32_t, void *, void **, ptrdiff_t *,
@@ -82,6 +86,7 @@
   init_device_ty *init_device = nullptr;
   load_binary_ty *load_binary = nullptr;
   data_alloc_ty *data_alloc = nullptr;
+  data_alloc_async_ty *data_alloc_async = nullptr;
   data_submit_ty *data_submit = nullptr;
   data_submit_async_ty *data_submit_async = nullptr;
   data_retrieve_ty *data_retrieve = nullptr;
@@ -89,6 +94,7 @@
   data_exchange_ty *data_exchange = nullptr;
   data_exchange_async_ty *data_exchange_async = nullptr;
   data_delete_ty *data_delete = nullptr;
+  data_delete_async_ty *data_delete_async = nullptr;
   run_region_ty *run_region = nullptr;
   run_region_async_ty *run_region_async = nullptr;
   run_team_region_ty *run_team_region = nullptr;
diff --git a/openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h b/openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h
--- a/openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h
+++ b/openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h
@@ -22,6 +22,7 @@
 #include

 #include "Debug.h"
+#include "omptarget.h"
 #include "omptargetplugin.h"

 /// Base class of per-device allocator.
@@ -30,11 +31,14 @@
   virtual ~DeviceAllocatorTy() = default;

   /// Allocate a memory of size \p Size . \p HstPtr is used to assist the
-  /// allocation.
-  virtual void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) = 0;
-
-  /// Delete the pointer \p TgtPtr on the device
-  virtual int free(void *TgtPtr) = 0;
+  /// allocation. \p AsyncInfo is used to make it asynchronous, if the device
+  /// (driver) supports it.
+  virtual void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind,
+                         __tgt_async_info *AsyncInfo) = 0;
+
+  /// Delete the pointer \p TgtPtr on the device. \p AsyncInfo is used to make
+  /// it asynchronous, if the device (driver) supports it.
+  virtual int free(void *TgtPtr, __tgt_async_info *AsyncInfo) = 0;
 };

 /// Class of memory manager. The memory manager is per-device by using
@@ -132,17 +136,25 @@
   size_t SizeThreshold = 1U << 13;

   /// Request memory from target device
-  void *allocateOnDevice(size_t Size, void *HstPtr) const {
-    return DeviceAllocator.allocate(Size, HstPtr, TARGET_ALLOC_DEVICE);
+  void *allocateOnDevice(size_t Size, void *HstPtr,
+                         __tgt_async_info *AsyncInfo) const {
+    return DeviceAllocator.allocate(Size, HstPtr, TARGET_ALLOC_DEVICE,
+                                    AsyncInfo);
   }

   /// Deallocate data on device
-  int deleteOnDevice(void *Ptr) const { return DeviceAllocator.free(Ptr); }
+  int deleteOnDevice(void *Ptr) const {
+    // If we need to make space on the device we do not want the free to be
+    // asynchronous, as this only happens when we have run out of memory.
+    // Freeing memory is the priority now.
+    return DeviceAllocator.free(Ptr, /* AsyncInfo */ nullptr);
+  }

   /// This function is called when it tries to allocate memory on device but the
   /// device returns out of memory. It will first free all memory in the
   /// FreeList and try to allocate again.
-  void *freeAndAllocate(size_t Size, void *HstPtr) {
+  void *freeAndAllocate(size_t Size, void *HstPtr,
+                        __tgt_async_info *AsyncInfo) {
     std::vector RemoveList;

     // Deallocate all memory in FreeList
@@ -166,21 +178,22 @@
     }

     // Try allocate memory again
-    return allocateOnDevice(Size, HstPtr);
+    return allocateOnDevice(Size, HstPtr, AsyncInfo);
   }

   /// The goal is to allocate memory on the device. It first tries to
   /// allocate directly on the device. If a \p nullptr is returned, it might
   /// be because the device is OOM. In that case, it will free all unused
   /// memory and then try again.
-  void *allocateOrFreeAndAllocateOnDevice(size_t Size, void *HstPtr) {
-    void *TgtPtr = allocateOnDevice(Size, HstPtr);
+  void *allocateOrFreeAndAllocateOnDevice(size_t Size, void *HstPtr,
+                                          __tgt_async_info *AsyncInfo) {
+    void *TgtPtr = allocateOnDevice(Size, HstPtr, AsyncInfo);
     // We cannot get memory from the device. It might be due to OOM. Let's
     // free all memory in FreeLists and try again.
     if (TgtPtr == nullptr) {
       DP("Failed to get memory on device. Free all memory in FreeLists and "
          "try again.\n");
-      TgtPtr = freeAndAllocate(Size, HstPtr);
+      TgtPtr = freeAndAllocate(Size, HstPtr, AsyncInfo);
     }

     if (TgtPtr == nullptr)
@@ -211,7 +224,7 @@

   /// Allocate memory of size \p Size from target device. \p HstPtr is used to
   /// assist the allocation.
-  void *allocate(size_t Size, void *HstPtr) {
+  void *allocate(size_t Size, void *HstPtr, __tgt_async_info *AsyncInfo) {
     // If the size is zero, we will not bother the target device. Just return
     // nullptr directly.
     if (Size == 0)
@@ -226,7 +239,7 @@
       DP("%zu is greater than the threshold %zu. Allocate it directly from "
          "device\n",
          Size, SizeThreshold);
-      void *TgtPtr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr);
+      void *TgtPtr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr, AsyncInfo);

       DP("Got target pointer " DPxMOD ". Return directly.\n", DPxPTR(TgtPtr));
@@ -258,7 +271,7 @@
     if (NodePtr == nullptr) {
       DP("Cannot find a node in the FreeLists. Allocate on device.\n");
       // Allocate one on device
-      void *TgtPtr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr);
+      void *TgtPtr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr, AsyncInfo);

       if (TgtPtr == nullptr)
         return nullptr;
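For reference, the updated DeviceAllocatorTy contract can be satisfied without any asynchronous capability at all; \p AsyncInfo is a hint that an implementation may simply ignore. A minimal sketch, not taken from any in-tree plugin and backing "device" memory with plain host malloc purely for illustration:

#include <cstdlib>

// Sketch only: satisfies the new interface while ignoring the async hint.
struct TrivialAllocatorTy final : DeviceAllocatorTy {
  void *allocate(size_t Size, void * /*HstPtr*/, TargetAllocTy /*Kind*/,
                 __tgt_async_info * /*AsyncInfo*/) override {
    // No stream-ordered allocation available; allocate eagerly.
    return std::malloc(Size);
  }
  int free(void *TgtPtr, __tgt_async_info * /*AsyncInfo*/) override {
    std::free(TgtPtr);
    return OFFLOAD_SUCCESS;
  }
};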
diff --git a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h
--- a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h
+++ b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h
@@ -221,6 +221,7 @@
                      void **);

 CUresult cuMemAlloc(CUdeviceptr *, size_t);
+CUresult cuMemAllocAsync(CUdeviceptr *, size_t, CUstream);
 CUresult cuMemAllocHost(void **, size_t);
 CUresult cuMemAllocManaged(CUdeviceptr *, size_t, unsigned int);
@@ -231,6 +232,7 @@
 CUresult cuMemcpyHtoDAsync(CUdeviceptr, const void *, size_t, CUstream);

 CUresult cuMemFree(CUdeviceptr);
+CUresult cuMemFreeAsync(CUdeviceptr, CUstream);
 CUresult cuMemFreeHost(void *);
 CUresult cuModuleGetFunction(CUfunction *, CUmodule, const char *);
diff --git a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp
--- a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp
+++ b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp
@@ -39,6 +39,7 @@
 DLWRAP(cuLaunchKernel, 11);

 DLWRAP(cuMemAlloc, 2);
+DLWRAP(cuMemAllocAsync, 3);
 DLWRAP(cuMemAllocHost, 2);
 DLWRAP(cuMemAllocManaged, 3);
@@ -49,6 +50,7 @@
 DLWRAP(cuMemcpyHtoDAsync, 4);

 DLWRAP(cuMemFree, 1);
+DLWRAP(cuMemFreeAsync, 2);
 DLWRAP(cuMemFreeHost, 1);
 DLWRAP(cuModuleGetFunction, 3);
 DLWRAP(cuModuleGetGlobal, 4);
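The cuMemAllocAsync/cuMemFreeAsync entry points wrapped above are CUDA's stream-ordered memory allocator, available from CUDA 11.2 onwards: allocation and deallocation become operations enqueued on a stream and are ordered with respect to other work on that stream. A standalone sketch of the driver-level pattern the plugin relies on (not part of the patch; error handling collapsed into a small check helper for brevity):

#include <cassert>
#include <cuda.h>

static void check(CUresult Res) {
  // Keep the call itself outside the assert so it is never compiled out.
  assert(Res == CUDA_SUCCESS && "CUDA driver call failed");
  (void)Res;
}

void streamOrderedAllocDemo(CUcontext Ctx, size_t Size) {
  check(cuCtxSetCurrent(Ctx));

  CUstream Stream;
  check(cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING));

#if CUDA_VERSION >= 11020
  CUdeviceptr Ptr;
  // Both calls are ordered with any other work already enqueued on Stream.
  check(cuMemAllocAsync(&Ptr, Size, Stream));
  check(cuMemsetD8Async(Ptr, 0, Size, Stream));
  check(cuMemFreeAsync(Ptr, Stream));
#endif

  // A single synchronization covers the whole enqueued sequence.
  check(cuStreamSynchronize(Stream));
  check(cuStreamDestroy(Stream));
}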
diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
--- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -22,6 +22,7 @@
 #include "Debug.h"
 #include "DeviceEnvironment.h"
+#include "omptarget.h"
 #include "omptargetplugin.h"

 #define TARGET_NAME CUDA
@@ -314,6 +315,24 @@
   }
 };

+using StreamPoolTy = ResourcePoolTy<CUstream>;
+using StreamPoolVectorTy = std::vector<std::unique_ptr<StreamPoolTy>>;
+
+static CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo,
+                          const StreamPoolVectorTy &StreamPoolVector) {
+  assert(AsyncInfo && "AsyncInfo is nullptr");
+
+  if (!AsyncInfo->Queue) {
+    CUstream S;
+    if (StreamPoolVector[DeviceId]->acquire(S) != OFFLOAD_SUCCESS)
+      return nullptr;
+
+    AsyncInfo->Queue = S;
+  }
+
+  return reinterpret_cast<CUstream>(AsyncInfo->Queue);
+}
+
 class DeviceRTLTy {
   int NumberOfDevices;
   // OpenMP environment properties
@@ -332,8 +351,7 @@
   static constexpr const int DefaultNumTeams = 128;
   static constexpr const int DefaultNumThreads = 128;

-  using StreamPoolTy = ResourcePoolTy<CUstream>;
-  std::vector<std::unique_ptr<StreamPoolTy>> StreamPool;
+  StreamPoolVectorTy StreamPoolVector;

   ResourcePoolTy EventPool;
@@ -347,11 +365,16 @@
     const std::vector &DeviceData;
     std::unordered_map HostPinnedAllocs;

+    StreamPoolVectorTy &StreamPoolVector;
+
   public:
-    CUDADeviceAllocatorTy(int DeviceId, std::vector &DeviceData)
-        : DeviceId(DeviceId), DeviceData(DeviceData) {}
+    CUDADeviceAllocatorTy(int DeviceId, std::vector &DeviceData,
+                          StreamPoolVectorTy &StreamPoolVector)
+        : DeviceId(DeviceId), DeviceData(DeviceData),
+          StreamPoolVector(StreamPoolVector) {}

-    void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
+    void *allocate(size_t Size, void *, TargetAllocTy Kind,
+                   __tgt_async_info *AsyncInfo) override {
       if (Size == 0)
         return nullptr;
@@ -364,7 +387,16 @@
       case TARGET_ALLOC_DEFAULT:
       case TARGET_ALLOC_DEVICE:
         CUdeviceptr DevicePtr;
+#if CUDA_VERSION >= 11020
+        if (AsyncInfo) {
+          CUstream Stream = getStream(DeviceId, AsyncInfo, StreamPoolVector);
+          Err = cuMemAllocAsync(&DevicePtr, Size, Stream);
+        } else {
+          Err = cuMemAlloc(&DevicePtr, Size);
+        }
+#else
         Err = cuMemAlloc(&DevicePtr, Size);
+#endif
         MemAlloc = (void *)DevicePtr;
         if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
           return nullptr;
@@ -389,7 +421,7 @@
       return MemAlloc;
     }

-    int free(void *TgtPtr) override {
+    int free(void *TgtPtr, __tgt_async_info *AsyncInfo) override {
       CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
       if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
         return OFFLOAD_FAIL;
@@ -403,7 +435,16 @@
       case TARGET_ALLOC_DEFAULT:
       case TARGET_ALLOC_DEVICE:
       case TARGET_ALLOC_SHARED:
+#if CUDA_VERSION >= 11020
+        if (AsyncInfo) {
+          CUstream Stream = getStream(DeviceId, AsyncInfo, StreamPoolVector);
+          Err = cuMemFreeAsync((CUdeviceptr)TgtPtr, Stream);
+        } else {
+          Err = cuMemFree((CUdeviceptr)TgtPtr);
+        }
+#else
         Err = cuMemFree((CUdeviceptr)TgtPtr);
+#endif
         if (!checkResult(Err, "Error returned from cuMemFree\n"))
           return OFFLOAD_FAIL;
         break;
@@ -467,20 +508,6 @@
     E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
   }

-  CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
-    assert(AsyncInfo && "AsyncInfo is nullptr");
-
-    if (!AsyncInfo->Queue) {
-      CUstream S;
-      if (StreamPool[DeviceId]->acquire(S) != OFFLOAD_SUCCESS)
-        return nullptr;
-
-      AsyncInfo->Queue = S;
-    }
-
-    return reinterpret_cast<CUstream>(AsyncInfo->Queue);
-  }
-
 public:
   // This class should not be copied
   DeviceRTLTy(const DeviceRTLTy &) = delete;
@@ -513,7 +540,7 @@
     }

     DeviceData.resize(NumberOfDevices);
-    StreamPool.resize(NumberOfDevices);
+    StreamPoolVector.resize(NumberOfDevices);

     // Get environment variables regarding teams
     if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
@@ -544,7 +571,7 @@
     }

     for (int I = 0; I < NumberOfDevices; ++I)
-      DeviceAllocators.emplace_back(I, DeviceData);
+      DeviceAllocators.emplace_back(I, DeviceData, StreamPoolVector);

     // Get the size threshold from environment variable
     std::pair Res = MemoryManagerTy::getSizeThresholdFromEnv();
@@ -568,7 +595,7 @@
       if (M)
         checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");

-    for (auto &S : StreamPool)
+    for (auto &S : StreamPoolVector)
       S.reset();

     EventPool.clear();
@@ -637,8 +664,8 @@
       return OFFLOAD_FAIL;

     // Initialize stream pool
-    if (!StreamPool[DeviceId])
-      StreamPool[DeviceId] = std::make_unique<StreamPoolTy>(
+    if (!StreamPoolVector[DeviceId])
+      StreamPoolVector[DeviceId] = std::make_unique<StreamPoolTy>(
           AllocatorTy(DeviceData[DeviceId].Context), NumInitialStreams);
@@ -943,17 +970,19 @@
   }

   void *dataAlloc(const int DeviceId, const int64_t Size,
-                  const TargetAllocTy Kind) {
+                  const TargetAllocTy Kind, __tgt_async_info *AsyncInfo) {
     switch (Kind) {
     case TARGET_ALLOC_DEFAULT:
     case TARGET_ALLOC_DEVICE:
       if (UseMemoryManager)
-        return MemoryManagers[DeviceId]->allocate(Size, nullptr);
+        return MemoryManagers[DeviceId]->allocate(Size, nullptr, AsyncInfo);
       else
-        return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
+        return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind,
+                                                   AsyncInfo);
     case TARGET_ALLOC_HOST:
     case TARGET_ALLOC_SHARED:
-      return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
+      return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind,
+                                                 AsyncInfo);
     }

     REPORT("Invalid target data allocation kind or requested allocator not "
@@ -970,7 +999,7 @@
     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
       return OFFLOAD_FAIL;

-    CUstream Stream = getStream(DeviceId, AsyncInfo);
+    CUstream Stream = getStream(DeviceId, AsyncInfo, StreamPoolVector);
     Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
     if (Err != CUDA_SUCCESS) {
@@ -992,7 +1021,7 @@
     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
       return OFFLOAD_FAIL;

-    CUstream Stream = getStream(DeviceId, AsyncInfo);
+    CUstream Stream = getStream(DeviceId, AsyncInfo, StreamPoolVector);

     Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
     if (Err != CUDA_SUCCESS) {
@@ -1014,7 +1043,7 @@
     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
       return OFFLOAD_FAIL;

-    CUstream Stream = getStream(SrcDevId, AsyncInfo);
+    CUstream Stream = getStream(SrcDevId, AsyncInfo, StreamPoolVector);

     // If they are two devices, we try peer to peer copy first
     if (SrcDevId != DstDevId) {
@@ -1057,11 +1086,12 @@
     return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
   }

-  int dataDelete(const int DeviceId, void *TgtPtr) {
+  int dataDelete(const int DeviceId, void *TgtPtr,
+                 __tgt_async_info *AsyncInfo) {
     if (UseMemoryManager)
       return MemoryManagers[DeviceId]->free(TgtPtr);

-    return DeviceAllocators[DeviceId].free(TgtPtr);
+    return DeviceAllocators[DeviceId].free(TgtPtr, AsyncInfo);
   }

   int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
@@ -1186,7 +1216,7 @@
          CudaBlocksPerGrid, CudaThreadsPerBlock,
          (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD"));

-    CUstream Stream = getStream(DeviceId, AsyncInfo);
+    CUstream Stream = getStream(DeviceId, AsyncInfo, StreamPoolVector);
     Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
                          /* gridDimZ */ 1, CudaThreadsPerBlock,
                          /* blockDimY */ 1, /* blockDimZ */ 1,
@@ -1207,7 +1237,8 @@
     // Once the stream is synchronized, return it to stream pool and reset
     // AsyncInfo. This is to make sure the synchronization only works for its
     // own tasks.
-    StreamPool[DeviceId]->release(reinterpret_cast<CUstream>(AsyncInfo->Queue));
+    StreamPoolVector[DeviceId]->release(
+        reinterpret_cast<CUstream>(AsyncInfo->Queue));
     AsyncInfo->Queue = nullptr;

     if (Err != CUDA_SUCCESS) {
@@ -1408,7 +1439,7 @@
   int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo,
                 void *EventPtr) const {
-    CUstream Stream = getStream(DeviceId, AsyncInfo);
+    CUstream Stream = getStream(DeviceId, AsyncInfo, StreamPoolVector);
     CUevent Event = reinterpret_cast<CUevent>(EventPtr);

     // We don't use CU_EVENT_WAIT_DEFAULT here as it is only available from
@@ -1467,11 +1498,23 @@
   return DeviceRTL.loadBinary(device_id, image);
 }

-void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *,
+void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr,
                            int32_t kind) {
   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+  __tgt_async_info AsyncInfo;
+  void *ptr =
+      __tgt_rtl_data_alloc_async(device_id, size, hst_ptr, kind, &AsyncInfo);
+  if (AsyncInfo.Queue)
+    __tgt_rtl_synchronize(device_id, &AsyncInfo);
+  return ptr;
+}

-  return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind);
+void *__tgt_rtl_data_alloc_async(int32_t device_id, int64_t size, void *hst_ptr,
+                                 int32_t kind,
+                                 __tgt_async_info *async_info_ptr) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+  return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind,
+                             async_info_ptr);
 }

 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
@@ -1550,7 +1593,25 @@
 int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

-  return DeviceRTL.dataDelete(device_id, tgt_ptr);
+  __tgt_async_info AsyncInfo;
+  const int32_t rc =
+      __tgt_rtl_data_delete_async(device_id, tgt_ptr, &AsyncInfo);
+  if (rc != OFFLOAD_SUCCESS)
+    return OFFLOAD_FAIL;
+
+  if (AsyncInfo.Queue)
+    return __tgt_rtl_synchronize(device_id, &AsyncInfo);
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_data_delete_async(int32_t device_id, void *tgt_ptr,
+                                    __tgt_async_info *async_info_ptr) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+#if CUDA_VERSION < 11020
+  if (async_info_ptr && async_info_ptr->Queue)
+    __tgt_rtl_synchronize(device_id, async_info_ptr);
+#endif
+  return DeviceRTL.dataDelete(device_id, tgt_ptr, async_info_ptr);
 }

 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
diff --git a/openmp/libomptarget/plugins/exports b/openmp/libomptarget/plugins/exports
--- a/openmp/libomptarget/plugins/exports
+++ b/openmp/libomptarget/plugins/exports
@@ -7,6 +7,7 @@
     __tgt_rtl_init_device;
     __tgt_rtl_load_binary;
     __tgt_rtl_data_alloc;
+    __tgt_rtl_data_alloc_async;
     __tgt_rtl_data_submit;
     __tgt_rtl_data_submit_async;
     __tgt_rtl_data_retrieve;
@@ -14,6 +15,7 @@
     __tgt_rtl_data_exchange;
    __tgt_rtl_data_exchange_async;
     __tgt_rtl_data_delete;
+    __tgt_rtl_data_delete_async;
     __tgt_rtl_run_target_team_region;
     __tgt_rtl_run_target_team_region_async;
     __tgt_rtl_run_target_region;
diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp
--- a/openmp/libomptarget/src/api.cpp
+++ b/openmp/libomptarget/src/api.cpp
@@ -76,7 +76,7 @@
     return;
   }

-  PM->Devices[device_num]->deleteData(device_ptr);
+  PM->Devices[device_num]->deleteData(device_ptr, /* AsyncInfoPtr */ nullptr);
   DP("omp_target_free deallocated device ptr\n");
 }
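Note that the user-facing allocation API is unaffected: api.cpp keeps omp_target_free on the synchronous path by passing a null AsyncInfoPtr, so code like the following (plain OpenMP runtime calls, shown only for context and not part of the patch) behaves exactly as before:

#include <cstddef>
#include <omp.h>
#include <vector>

void roundTrip(std::vector<int> &Host) {
  const int Dev = omp_get_default_device();
  const int HostDev = omp_get_initial_device();
  const std::size_t Bytes = Host.size() * sizeof(int);

  void *Tgt = omp_target_alloc(Bytes, Dev);
  if (!Tgt)
    return;

  omp_target_memcpy(Tgt, Host.data(), Bytes, /*dst_offset=*/0,
                    /*src_offset=*/0, Dev, HostDev);
  omp_target_memcpy(Host.data(), Tgt, Bytes, 0, 0, HostDev, Dev);

  // Blocks until the memory is actually released; libomptarget passes a null
  // AsyncInfo for this path.
  omp_target_free(Tgt, Dev);
}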
diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp
--- a/openmp/libomptarget/src/device.cpp
+++ b/openmp/libomptarget/src/device.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//

 #include "device.h"
+#include "omptarget.h"
 #include "private.h"
 #include "rtl.h"
@@ -19,10 +20,10 @@
 #include
 #include

-int HostDataToTargetTy::addEventIfNecessary(
-    DeviceTy &Device, AsyncInfoTy &AsyncInfo) const {
+int HostDataToTargetTy::addEventIfNecessary(DeviceTy &Device,
+                                            AsyncInfoTy &AsyncInfo) const {
   // First, check if the user disabled atomic map transfer/malloc/dealloc.
-  if (!PM->UseEventsForAtomicTransfers)
+  if (!AsyncInfo.hasPotentiallyPendingAsyncWork() || !PM->UseEventsForAtomicTransfers)
     return OFFLOAD_SUCCESS;

   void *Event = getEvent();
@@ -49,7 +50,9 @@
 DeviceTy::DeviceTy(RTLInfoTy *RTL)
     : DeviceID(-1), RTL(RTL), RTLDeviceID(-1), IsInit(false), InitFlag(),
       HasPendingGlobals(false), HostDataToTargetMap(), PendingCtorsDtors(),
-      ShadowPtrMap(), DataMapMtx(), PendingGlobalsMtx(), ShadowMtx() {}
+      ShadowPtrMap(), DataMapMtx(), PendingGlobalsMtx(), ShadowMtx() {
+  SupportsAsyncFree = RTL->data_delete_async && RTL->synchronize;
+}

 DeviceTy::~DeviceTy() {
   if (DeviceID == -1 || !(getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE))
@@ -183,13 +186,11 @@
   return lr;
 }

-TargetPointerResultTy
-DeviceTy::getTargetPointer(void *HstPtrBegin, void *HstPtrBase, int64_t Size,
-                           map_var_info_t HstPtrName, bool HasFlagTo,
-                           bool HasFlagAlways, bool IsImplicit,
-                           bool UpdateRefCount, bool HasCloseModifier,
-                           bool HasPresentModifier, bool HasHoldModifier,
-                           AsyncInfoTy &AsyncInfo) {
+TargetPointerResultTy DeviceTy::getTargetPointer(
+    void *HstPtrBegin, void *HstPtrBase, int64_t Size,
+    map_var_info_t HstPtrName, bool HasFlagTo, bool HasFlagAlways,
+    bool IsImplicit, bool UpdateRefCount, bool HasCloseModifier,
+    bool HasPresentModifier, bool HasHoldModifier, AsyncInfoTy &AsyncInfo) {
   void *TargetPointer = nullptr;
   bool IsHostPtr = false;
   bool IsNew = false;
@@ -265,7 +266,8 @@
   } else if (Size) {
     // If it is not contained and Size > 0, we should create a new entry for it.
     IsNew = true;
-    uintptr_t Ptr = (uintptr_t)allocData(Size, HstPtrBegin);
+    uintptr_t Ptr = (uintptr_t)allocData(Size, HstPtrBegin,
+                                         TARGET_ALLOC_DEFAULT, AsyncInfo);
     Entry = HostDataToTargetMap
                 .emplace((uintptr_t)HstPtrBase, (uintptr_t)HstPtrBegin,
                          (uintptr_t)HstPtrBegin + Size, Ptr, HasHoldModifier,
@@ -279,6 +281,17 @@
          Entry->dynRefCountToStr().c_str(), Entry->holdRefCountToStr().c_str(),
          (HstPtrName) ? getNameFromMapping(HstPtrName).c_str() : "unknown");
     TargetPointer = (void *)Ptr;
+
+    // Ensure the atomicity of the allocation, if required. Allocations have
+    // to look atomic even if they are asynchronous and not accompanied by a
+    // data transfer.
+    if (SupportsAsyncFree) {
+      HostDataToTargetTy::LockGuard LG(*Entry);
+      if (Entry->addEventIfNecessary(*this, AsyncInfo) != OFFLOAD_SUCCESS)
+        return {{false /* IsNewEntry */, false /* IsHostPointer */},
+                {} /* MapTableEntry */,
+                nullptr /* TargetPointer */};
+    }
   }

   // If the target pointer is valid, and we need to transfer data, issue the
@@ -300,8 +313,7 @@
       // pointer points to a corrupted memory region so it doesn't make any
       // sense to continue to use it.
       TargetPointer = nullptr;
-    } else if (Entry->addEventIfNecessary(*this, AsyncInfo) !=
-               OFFLOAD_SUCCESS)
+    } else if (Entry->addEventIfNecessary(*this, AsyncInfo) != OFFLOAD_SUCCESS)
       return {{false /* IsNewEntry */, false /* IsHostPointer */},
               {} /* MapTableEntry */,
               nullptr /* TargetPointer */};
@@ -413,7 +425,7 @@
 }

 int DeviceTy::deallocTgtPtr(void *HstPtrBegin, int64_t Size,
-                            bool HasHoldModifier) {
+                            bool HasHoldModifier, AsyncInfoTy &AsyncInfo) {
   // Check if the pointer is contained in any sub-nodes.
   int Ret = OFFLOAD_SUCCESS;
   DataMapMtx.lock();
@@ -423,7 +435,7 @@
     if (HT.decRefCount(HasHoldModifier) == 0) {
       DP("Deleting tgt data " DPxMOD " of size %" PRId64 "\n",
          DPxPTR(HT.TgtPtrBegin), Size);
-      deleteData((void *)HT.TgtPtrBegin);
+      deleteData((void *)HT.TgtPtrBegin, AsyncInfo);
       INFO(OMP_INFOTYPE_MAPPING_CHANGED, DeviceID,
            "Removing map entry with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD
            ", Size=%" PRId64 ", Name=%s\n",
@@ -484,12 +496,17 @@
   return rc;
 }

-void *DeviceTy::allocData(int64_t Size, void *HstPtr, int32_t Kind) {
-  return RTL->data_alloc(RTLDeviceID, Size, HstPtr, Kind);
+void *DeviceTy::allocData(int64_t Size, void *HstPtr, int32_t Kind,
+                          __tgt_async_info *AsyncInfo) {
+  if (!AsyncInfo || !SupportsAsyncFree)
+    return RTL->data_alloc(RTLDeviceID, Size, HstPtr, Kind);
+  return RTL->data_alloc_async(RTLDeviceID, Size, HstPtr, Kind, AsyncInfo);
 }

-int32_t DeviceTy::deleteData(void *TgtPtrBegin) {
-  return RTL->data_delete(RTLDeviceID, TgtPtrBegin);
+int32_t DeviceTy::deleteData(void *TgtPtrBegin, __tgt_async_info *AsyncInfo) {
+  if (!AsyncInfo || !SupportsAsyncFree)
+    return RTL->data_delete(RTLDeviceID, TgtPtrBegin);
+  return RTL->data_delete_async(RTLDeviceID, TgtPtrBegin, AsyncInfo);
 }

 // Submit data to device
@@ -509,9 +526,8 @@
   if (!AsyncInfo || !RTL->data_submit_async || !RTL->synchronize)
     return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size);
-  else
-    return RTL->data_submit_async(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size,
-                                  AsyncInfo);
+  return RTL->data_submit_async(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size,
+                                AsyncInfo);
 }

 // Retrieve data from device
@@ -530,9 +546,8 @@
   if (!RTL->data_retrieve_async || !RTL->synchronize)
     return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size);
-  else
-    return RTL->data_retrieve_async(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size,
-                                    AsyncInfo);
+  return RTL->data_retrieve_async(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size,
+                                  AsyncInfo);
 }

 // Copy data from current device to destination device directly
@@ -542,9 +557,9 @@
     assert(RTL->data_exchange && "RTL->data_exchange is nullptr");
     return RTL->data_exchange(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, DstPtr,
                               Size);
-  } else
-    return RTL->data_exchange_async(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID,
-                                    DstPtr, Size, AsyncInfo);
+  }
+  return RTL->data_exchange_async(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID,
+                                  DstPtr, Size, AsyncInfo);
 }

 // Run region on device
@@ -554,9 +569,8 @@
   if (!RTL->run_region || !RTL->synchronize)
     return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets,
                            TgtVarsSize);
-  else
-    return RTL->run_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr,
-                                 TgtOffsets, TgtVarsSize, AsyncInfo);
+  return RTL->run_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets,
+                               TgtVarsSize, AsyncInfo);
 }

 // Run region on device
@@ -577,10 +591,9 @@
     return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr,
                                 TgtOffsets, TgtVarsSize, NumTeams, ThreadLimit,
                                 LoopTripCount);
-  else
-    return RTL->run_team_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr,
-                                      TgtOffsets, TgtVarsSize, NumTeams,
-                                      ThreadLimit, LoopTripCount, AsyncInfo);
+  return RTL->run_team_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr,
+                                    TgtOffsets, TgtVarsSize, NumTeams,
+                                    ThreadLimit, LoopTripCount, AsyncInfo);
 }

 // Whether data can be copied to DstDevice directly
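With SupportsAsyncFree in place, DeviceTy::allocData and DeviceTy::deleteData route through the new *_async RTL entry points whenever an AsyncInfo is supplied, and silently fall back to the synchronous ones otherwise. The omptarget.cpp changes that follow build on this; the simplified sequence below is illustrative only (the function name and control flow are invented, the real logic lives in targetDataBegin/targetDataEnd/target) and shows the intended pattern of enqueueing allocation, transfers, and frees on one queue and synchronizing once:

// Illustrative only; not actual libomptarget code.
int mapRunAndUnmap(DeviceTy &Device, void *HstPtr, int64_t Size) {
  AsyncInfoTy AsyncInfo(Device);

  void *TgtPtr =
      Device.allocData(Size, HstPtr, TARGET_ALLOC_DEFAULT, AsyncInfo);
  if (!TgtPtr)
    return OFFLOAD_FAIL;

  if (Device.submitData(TgtPtr, HstPtr, Size, AsyncInfo) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  // ... launch kernels / retrieve results on the same AsyncInfo ...

  if (Device.deleteData(TgtPtr, AsyncInfo) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  // One synchronization at the end covers everything that was enqueued.
  return AsyncInfo.synchronize();
}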
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -826,20 +826,20 @@
     }
   }

-  // TODO: We should not synchronize here but pass the AsyncInfo object to the
-  //       allocate/deallocate device APIs.
-  //
-  // We need to synchronize before deallocating data.
-  Ret = AsyncInfo.synchronize();
-  if (Ret != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
+  // We need to synchronize before deallocating data if the device does not
+  // support asynchronous frees.
+  if (!Device.SupportsAsyncFree) {
+    Ret = AsyncInfo.synchronize();
+    if (Ret != OFFLOAD_SUCCESS)
+      return OFFLOAD_FAIL;
+  }

   // Deallocate target pointer
   for (DeallocTgtPtrInfo &Info : DeallocTgtPtrs) {
     if (FromMapperBase && FromMapperBase == Info.HstPtrBegin)
       continue;
     Ret = Device.deallocTgtPtr(Info.HstPtrBegin, Info.DataSize,
-                               Info.HasHoldModifier);
+                               Info.HasHoldModifier, AsyncInfo);
     if (Ret != OFFLOAD_SUCCESS) {
       REPORT("Deallocating data from device failed.\n");
       return OFFLOAD_FAIL;
@@ -1141,7 +1141,8 @@
     // immediately.
     if (ArgSize > FirstPrivateArgSizeThreshold || !IsFirstPrivate ||
         AllocImmediately) {
-      TgtPtr = Device.allocData(ArgSize, HstPtr);
+      TgtPtr =
+          Device.allocData(ArgSize, HstPtr, TARGET_ALLOC_DEFAULT, AsyncInfo);
       if (!TgtPtr) {
         DP("Data allocation for %sprivate array " DPxMOD " failed.\n",
            (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtr));
@@ -1204,7 +1205,8 @@
     }
     // Allocate target memory
     void *TgtPtr =
-        Device.allocData(FirstPrivateArgSize, FirstPrivateArgBuffer.data());
+        Device.allocData(FirstPrivateArgSize, FirstPrivateArgBuffer.data(),
+                         TARGET_ALLOC_DEFAULT, AsyncInfo);
     if (TgtPtr == nullptr) {
       DP("Failed to allocate target memory for private arguments.\n");
       return OFFLOAD_FAIL;
@@ -1239,7 +1241,7 @@
   /// Free all target memory allocated for private arguments
   int free() {
     for (void *P : TgtPtrs) {
-      int Ret = Device.deleteData(P);
+      int Ret = Device.deleteData(P, AsyncInfo);
       if (Ret != OFFLOAD_SUCCESS) {
         DP("Deallocation of (first-)private arrays failed.\n");
         return OFFLOAD_FAIL;
@@ -1504,5 +1506,7 @@
     }
   }

-  return OFFLOAD_SUCCESS;
+  // Ensure we synchronize at the very end of a target directive. If we
+  // synchronized earlier, this may be a no-op as the queue is now a nullptr.
+  return AsyncInfo.synchronize();
 }
diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp
--- a/openmp/libomptarget/src/rtl.cpp
+++ b/openmp/libomptarget/src/rtl.cpp
@@ -167,6 +167,10 @@
     // Optional functions
     *((void **)&R.init_requires) =
         dlsym(dynlib_handle, "__tgt_rtl_init_requires");
+    *((void **)&R.data_alloc_async) =
+        dlsym(dynlib_handle, "__tgt_rtl_data_alloc_async");
+    *((void **)&R.data_delete_async) =
+        dlsym(dynlib_handle, "__tgt_rtl_data_delete_async");
    *((void **)&R.data_submit_async) =
         dlsym(dynlib_handle, "__tgt_rtl_data_submit_async");
     *((void **)&R.data_retrieve_async) =