diff --git a/openmp/libomptarget/include/omptargetplugin.h b/openmp/libomptarget/include/omptargetplugin.h --- a/openmp/libomptarget/include/omptargetplugin.h +++ b/openmp/libomptarget/include/omptargetplugin.h @@ -69,6 +69,8 @@ // to use (e.g. shared, host, device). void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr, int32_t Kind); +void *__tgt_rtl_data_alloc_async(int32_t ID, int64_t Size, void *HostPtr, + int32_t Kind, __tgt_async_info *AsyncInfo); // Pass the data content to the target device using the target address. In case // of success, return zero. Otherwise, return an error code. @@ -103,6 +105,8 @@ // De-allocate the data referenced by target ptr on the device. In case of // success, return zero. Otherwise, return an error code. int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr); +int32_t __tgt_rtl_data_delete_async(int32_t ID, void *TargetPtr, + __tgt_async_info *AsyncInfo); // Transfer control to the offloaded entry Entry on the target device. // Args and Offsets are arrays of NumArgs size of target addresses and diff --git a/openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h b/openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h --- a/openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h +++ b/openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h @@ -22,6 +22,7 @@ #include #include "Debug.h" +#include "omptarget.h" #include "omptargetplugin.h" /// Base class of per-device allocator. @@ -30,11 +31,14 @@ virtual ~DeviceAllocatorTy() = default; /// Allocate a memory of size \p Size . \p HstPtr is used to assist the - /// allocation. - virtual void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) = 0; - - /// Delete the pointer \p TgtPtr on the device - virtual int free(void *TgtPtr) = 0; + /// allocation. \p AsyncInfo is used to make it asynchronous, if the device + /// (driver) supports it. 
+ virtual void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind, + __tgt_async_info *AsyncInfo) = 0; + + /// Delete the pointer \p TgtPtr on the device. \p AsyncInfo is used to make + /// it asynchronous, if the device (driver) supports it. + virtual int free(void *TgtPtr, __tgt_async_info *AsyncInfo) = 0; }; /// Class of memory manager. The memory manager is per-device by using @@ -132,17 +136,22 @@ size_t SizeThreshold = 1U << 13; /// Request memory from target device - void *allocateOnDevice(size_t Size, void *HstPtr) const { - return DeviceAllocator.allocate(Size, HstPtr, TARGET_ALLOC_DEVICE); + void *allocateOnDevice(size_t Size, void *HstPtr, + __tgt_async_info *AsyncInfo) const { + return DeviceAllocator.allocate(Size, HstPtr, TARGET_ALLOC_DEVICE, + AsyncInfo); } /// Deallocate data on device - int deleteOnDevice(void *Ptr) const { return DeviceAllocator.free(Ptr); } + int deleteOnDevice(void *Ptr) const { + return DeviceAllocator.free(Ptr, /* AsyncInfo */ nullptr); + } /// This function is called when it tries to allocate memory on device but the /// device returns out of memory. It will first free all memory in the /// FreeList and try to allocate again. - void *freeAndAllocate(size_t Size, void *HstPtr) { + void *freeAndAllocate(size_t Size, void *HstPtr, + __tgt_async_info *AsyncInfo) { std::vector RemoveList; // Deallocate all memory in FreeList @@ -166,21 +175,22 @@ } // Try allocate memory again - return allocateOnDevice(Size, HstPtr); + return allocateOnDevice(Size, HstPtr, AsyncInfo); } /// The goal is to allocate memory on the device. It first tries to /// allocate directly on the device. If a \p nullptr is returned, it might /// be because the device is OOM. In that case, it will free all unused /// memory and then try again. 
- void *allocateOrFreeAndAllocateOnDevice(size_t Size, void *HstPtr) { - void *TgtPtr = allocateOnDevice(Size, HstPtr); + void *allocateOrFreeAndAllocateOnDevice(size_t Size, void *HstPtr, + __tgt_async_info *AsyncInfo) { + void *TgtPtr = allocateOnDevice(Size, HstPtr, AsyncInfo); // We cannot get memory from the device. It might be due to OOM. Let's // free all memory in FreeLists and try again. if (TgtPtr == nullptr) { DP("Failed to get memory on device. Free all memory in FreeLists and " "try again.\n"); - TgtPtr = freeAndAllocate(Size, HstPtr); + TgtPtr = freeAndAllocate(Size, HstPtr, AsyncInfo); } if (TgtPtr == nullptr) @@ -211,7 +221,7 @@ /// Allocate memory of size \p Size from target device. \p HstPtr is used to /// assist the allocation. - void *allocate(size_t Size, void *HstPtr) { + void *allocate(size_t Size, void *HstPtr, __tgt_async_info *AsyncInfo) { // If the size is zero, we will not bother the target device. Just return // nullptr directly. if (Size == 0) @@ -226,7 +236,7 @@ DP("%zu is greater than the threshold %zu. Allocate it directly from " "device\n", Size, SizeThreshold); - void *TgtPtr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr); + void *TgtPtr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr, AsyncInfo); DP("Got target pointer " DPxMOD ". Return directly.\n", DPxPTR(TgtPtr)); @@ -258,7 +268,7 @@ if (NodePtr == nullptr) { DP("Cannot find a node in the FreeLists. 
Allocate on device.\n"); // Allocate one on device - void *TgtPtr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr); + void *TgtPtr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr, AsyncInfo); if (TgtPtr == nullptr) return nullptr; @@ -280,7 +290,7 @@ } /// Deallocate memory pointed by \p TgtPtr - int free(void *TgtPtr) { + int free(void *TgtPtr, __tgt_async_info *AsyncInfo) { DP("MemoryManagerTy::free: target memory " DPxMOD ".\n", DPxPTR(TgtPtr)); NodeTy *P = nullptr; diff --git a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h --- a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h +++ b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h @@ -221,6 +221,7 @@ void **); CUresult cuMemAlloc(CUdeviceptr *, size_t); +CUresult cuMemAllocAsync(CUdeviceptr *, size_t, CUstream); CUresult cuMemAllocHost(void **, size_t); CUresult cuMemAllocManaged(CUdeviceptr *, size_t, unsigned int); @@ -231,6 +232,7 @@ CUresult cuMemcpyHtoDAsync(CUdeviceptr, const void *, size_t, CUstream); CUresult cuMemFree(CUdeviceptr); +CUresult cuMemFreeAsync(CUdeviceptr, CUstream); CUresult cuMemFreeHost(void *); CUresult cuModuleGetFunction(CUfunction *, CUmodule, const char *); diff --git a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp --- a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp +++ b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp @@ -37,6 +37,7 @@ DLWRAP(cuLaunchKernel, 11); DLWRAP(cuMemAlloc, 2); +DLWRAP(cuMemAllocAsync, 3); DLWRAP(cuMemAllocHost, 2); DLWRAP(cuMemAllocManaged, 3); @@ -47,6 +48,7 @@ DLWRAP(cuMemcpyHtoDAsync, 4); DLWRAP(cuMemFree, 1); +DLWRAP(cuMemFreeAsync, 2); DLWRAP(cuMemFreeHost, 1); DLWRAP(cuModuleGetFunction, 3); DLWRAP(cuModuleGetGlobal, 4); diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp --- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp +++ 
b/openmp/libomptarget/plugins/cuda/src/rtl.cpp @@ -321,6 +321,16 @@ } }; +static CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfoPtr, + StreamManagerTy *StreamManager) { + assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); + + if (!AsyncInfoPtr->Queue) + AsyncInfoPtr->Queue = StreamManager->getStream(DeviceId); + + return reinterpret_cast(AsyncInfoPtr->Queue); +} + class DeviceRTLTy { int NumberOfDevices; // OpenMP environment properties @@ -348,11 +358,16 @@ const std::vector &DeviceData; std::unordered_map HostPinnedAllocs; + StreamManagerTy *StreamManager; + public: - CUDADeviceAllocatorTy(int DeviceId, std::vector &DeviceData) - : DeviceId(DeviceId), DeviceData(DeviceData) {} + CUDADeviceAllocatorTy(int DeviceId, std::vector &DeviceData, + StreamManagerTy *StreamManager) + : DeviceId(DeviceId), DeviceData(DeviceData), + StreamManager(StreamManager) {} - void *allocate(size_t Size, void *, TargetAllocTy Kind) override { + void *allocate(size_t Size, void *, TargetAllocTy Kind, + __tgt_async_info *AsyncInfo) override { if (Size == 0) return nullptr; @@ -365,7 +380,16 @@ case TARGET_ALLOC_DEFAULT: case TARGET_ALLOC_DEVICE: CUdeviceptr DevicePtr; +#if CUDA_VERSION >= 11020 + if (AsyncInfo) { + CUstream Stream = getStream(DeviceId, AsyncInfo, StreamManager); + Err = cuMemAllocAsync(&DevicePtr, Size, Stream); + } else { + Err = cuMemAlloc(&DevicePtr, Size); + } +#else Err = cuMemAlloc(&DevicePtr, Size); +#endif MemAlloc = (void *)DevicePtr; if (!checkResult(Err, "Error returned from cuMemAlloc\n")) return nullptr; @@ -390,7 +414,7 @@ return MemAlloc; } - int free(void *TgtPtr) override { + int free(void *TgtPtr, __tgt_async_info *AsyncInfo) override { CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) return OFFLOAD_FAIL; @@ -404,7 +428,16 @@ case TARGET_ALLOC_DEFAULT: case TARGET_ALLOC_DEVICE: case TARGET_ALLOC_SHARED: +#if CUDA_VERSION >= 11020 + if (AsyncInfo) { + CUstream 
Stream = getStream(DeviceId, AsyncInfo, StreamManager); + Err = cuMemFreeAsync((CUdeviceptr)TgtPtr, Stream); + } else { + Err = cuMemFree((CUdeviceptr)TgtPtr); + } +#else Err = cuMemFree((CUdeviceptr)TgtPtr); +#endif if (!checkResult(Err, "Error returned from cuMemFree\n")) return OFFLOAD_FAIL; break; @@ -468,15 +501,6 @@ E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr; } - CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const { - assert(AsyncInfo && "AsyncInfo is nullptr"); - - if (!AsyncInfo->Queue) - AsyncInfo->Queue = StreamManager->getStream(DeviceId); - - return reinterpret_cast(AsyncInfo->Queue); - } - public: // This class should not be copied DeviceRTLTy(const DeviceRTLTy &) = delete; @@ -537,7 +561,7 @@ std::make_unique(NumberOfDevices, DeviceData); for (int I = 0; I < NumberOfDevices; ++I) - DeviceAllocators.emplace_back(I, DeviceData); + DeviceAllocators.emplace_back(I, DeviceData, StreamManager.get()); // Get the size threshold from environment variable std::pair Res = MemoryManagerTy::getSizeThresholdFromEnv(); @@ -931,17 +955,19 @@ } void *dataAlloc(const int DeviceId, const int64_t Size, - const TargetAllocTy Kind) { + const TargetAllocTy Kind, __tgt_async_info *AsyncInfo) { switch (Kind) { case TARGET_ALLOC_DEFAULT: case TARGET_ALLOC_DEVICE: if (UseMemoryManager) - return MemoryManagers[DeviceId]->allocate(Size, nullptr); + return MemoryManagers[DeviceId]->allocate(Size, nullptr, AsyncInfo); else - return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind); + return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind, + AsyncInfo); case TARGET_ALLOC_HOST: case TARGET_ALLOC_SHARED: - return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind); + return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind, + AsyncInfo); } REPORT("Invalid target data allocation kind or requested allocator not " @@ -958,7 +984,7 @@ if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) return OFFLOAD_FAIL; - CUstream 
Stream = getStream(DeviceId, AsyncInfo);
+    CUstream Stream = getStream(DeviceId, AsyncInfo, StreamManager.get());
 
     Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
     if (Err != CUDA_SUCCESS) {
@@ -980,7 +1006,7 @@
     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
       return OFFLOAD_FAIL;
 
-    CUstream Stream = getStream(DeviceId, AsyncInfo);
+    CUstream Stream = getStream(DeviceId, AsyncInfo, StreamManager.get());
 
     Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
     if (Err != CUDA_SUCCESS) {
@@ -1002,7 +1028,7 @@
     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
       return OFFLOAD_FAIL;
 
-    CUstream Stream = getStream(SrcDevId, AsyncInfo);
+    CUstream Stream = getStream(SrcDevId, AsyncInfo, StreamManager.get());
 
     // If they are two devices, we try peer to peer copy first
     if (SrcDevId != DstDevId) {
@@ -1045,11 +1071,12 @@
     return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
   }
 
-  int dataDelete(const int DeviceId, void *TgtPtr) {
+  int dataDelete(const int DeviceId, void *TgtPtr,
+                 __tgt_async_info *AsyncInfo) {
     if (UseMemoryManager)
-      return MemoryManagers[DeviceId]->free(TgtPtr);
+      return MemoryManagers[DeviceId]->free(TgtPtr, AsyncInfo);
 
-    return DeviceAllocators[DeviceId].free(TgtPtr);
+    return DeviceAllocators[DeviceId].free(TgtPtr, AsyncInfo);
   }
 
   int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
@@ -1174,7 +1201,7 @@
          CudaBlocksPerGrid, CudaThreadsPerBlock,
          (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD"));
 
-    CUstream Stream = getStream(DeviceId, AsyncInfo);
+    CUstream Stream = getStream(DeviceId, AsyncInfo, StreamManager.get());
     Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
                          /* gridDimZ */ 1, CudaThreadsPerBlock,
                          /* blockDimY */ 1, /* blockDimZ */ 1,
@@ -1384,7 +1411,7 @@
 
   int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo,
                 void *EventPtr) const {
-    CUstream Stream = getStream(DeviceId, AsyncInfo);
+    CUstream Stream = getStream(DeviceId, AsyncInfo, StreamManager.get());
     CUevent Event = reinterpret_cast<CUevent>(EventPtr);
 
     // We don't use CU_EVENT_WAIT_DEFAULT here as it is only available from
@@ -1443,11 +1470,31 @@
   return DeviceRTL.loadBinary(device_id, image);
 }
 
-void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *,
+// Synchronous allocation, built on the asynchronous entry point: issue the
+// request with a local async info and wait for it before returning.
+void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr,
                            int32_t kind) {
   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+  __tgt_async_info AsyncInfo;
+  void *ptr =
+      __tgt_rtl_data_alloc_async(device_id, size, hst_ptr, kind, &AsyncInfo);
+  // getStream() attaches a stream to AsyncInfo only when a request is
+  // actually enqueued. If Queue is still unset, nothing was enqueued and
+  // there is nothing to wait for.
+  if (AsyncInfo.Queue &&
+      __tgt_rtl_synchronize(device_id, &AsyncInfo) != OFFLOAD_SUCCESS)
+    return nullptr; // The stream failed, so the allocation cannot be trusted.
+  return ptr;
+}
 
-  return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind);
+// Asynchronous allocation: the request is forwarded together with
+// async_info_ptr, which the caller is expected to synchronize on afterwards.
+void *__tgt_rtl_data_alloc_async(int32_t device_id, int64_t size, void *hst_ptr,
+                                 int32_t kind,
+                                 __tgt_async_info *async_info_ptr) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+  return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind,
+                             async_info_ptr);
 }
 
 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
@@ -1526,7 +1573,27 @@
 int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
 
-  return DeviceRTL.dataDelete(device_id, tgt_ptr);
+  __tgt_async_info AsyncInfo;
+  const int32_t rc =
+      __tgt_rtl_data_delete_async(device_id, tgt_ptr, &AsyncInfo);
+  if (rc != OFFLOAD_SUCCESS)
+    return OFFLOAD_FAIL;
+
+  // A stream is attached only when the release was enqueued on one. If Queue
+  // is still unset, the release did not go through a stream: nothing to wait for.
+  if (!AsyncInfo.Queue)
+    return OFFLOAD_SUCCESS;
+
+  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
+}
+
+// Asynchronous version of __tgt_rtl_data_delete: the release is forwarded
+// together with AsyncInfo, which the caller is expected to synchronize on.
+int32_t __tgt_rtl_data_delete_async(int32_t device_id, void *tgt_ptr,
+                                    __tgt_async_info *AsyncInfo) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+
+  return DeviceRTL.dataDelete(device_id, tgt_ptr, AsyncInfo);
 }
 
 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
diff --git a/openmp/libomptarget/plugins/exports b/openmp/libomptarget/plugins/exports
--- a/openmp/libomptarget/plugins/exports
+++ b/openmp/libomptarget/plugins/exports
@@ -7,6 +7,7 @@
     __tgt_rtl_init_device;
     __tgt_rtl_load_binary;
     __tgt_rtl_data_alloc;
+    __tgt_rtl_data_alloc_async;
     __tgt_rtl_data_submit;
     __tgt_rtl_data_submit_async;
     __tgt_rtl_data_retrieve;
@@ -14,6 +15,7 @@
     __tgt_rtl_data_exchange;
     __tgt_rtl_data_exchange_async;
     __tgt_rtl_data_delete;
+    __tgt_rtl_data_delete_async;
     __tgt_rtl_run_target_team_region;
     __tgt_rtl_run_target_team_region_async;
     __tgt_rtl_run_target_region;
diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp
--- a/openmp/libomptarget/src/api.cpp
+++ b/openmp/libomptarget/src/api.cpp
@@ -76,7 +76,7 @@
     return;
   }
 
-  PM->Devices[device_num]->deleteData(device_ptr);
+  PM->Devices[device_num]->deleteData(device_ptr, /* AsyncInfoPtr */ nullptr);
   DP("omp_target_free deallocated device ptr\n");
 }
 
diff --git a/openmp/libomptarget/src/device.h b/openmp/libomptarget/src/device.h
deleted file mode 100644
--- a/openmp/libomptarget/src/device.h
+++ /dev/null
@@ -1,409 +0,0 @@
-//===----------- device.h - Target independent OpenMP target RTL ----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Declarations for managing devices that are handled by RTL plugins.
-// -//===----------------------------------------------------------------------===// - -#ifndef _OMPTARGET_DEVICE_H -#define _OMPTARGET_DEVICE_H - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "omptarget.h" -#include "rtl.h" - -// Forward declarations. -struct RTLInfoTy; -struct __tgt_bin_desc; -struct __tgt_target_table; - -using map_var_info_t = void *; - -// enum for OMP_TARGET_OFFLOAD; keep in sync with kmp.h definition -enum kmp_target_offload_kind { - tgt_disabled = 0, - tgt_default = 1, - tgt_mandatory = 2 -}; -typedef enum kmp_target_offload_kind kmp_target_offload_kind_t; - -/// Map between host data and target data. -struct HostDataToTargetTy { - const uintptr_t HstPtrBase; // host info. - const uintptr_t HstPtrBegin; - const uintptr_t HstPtrEnd; // non-inclusive. - const map_var_info_t HstPtrName; // Optional source name of mapped variable. - - const uintptr_t TgtPtrBegin; // target info. - -private: - static const uint64_t INFRefCount = ~(uint64_t)0; - static std::string refCountToStr(uint64_t RefCount) { - return RefCount == INFRefCount ? "INF" : std::to_string(RefCount); - } - - struct StatesTy { - StatesTy(uint64_t DRC, uint64_t HRC) - : DynRefCount(DRC), HoldRefCount(HRC) {} - /// The dynamic reference count is the standard reference count as of OpenMP - /// 4.5. The hold reference count is an OpenMP extension for the sake of - /// OpenACC support. - /// - /// The 'ompx_hold' map type modifier is permitted only on "omp target" and - /// "omp target data", and "delete" is permitted only on "omp target exit - /// data" and associated runtime library routines. As a result, we really - /// need to implement "reset" functionality only for the dynamic reference - /// counter. Likewise, only the dynamic reference count can be infinite - /// because, for example, omp_target_associate_ptr and "omp declare target - /// link" operate only on it. 
Nevertheless, it's actually easier to follow - /// the code (and requires less assertions for special cases) when we just - /// implement these features generally across both reference counters here. - /// Thus, it's the users of this class that impose those restrictions. - /// - uint64_t DynRefCount; - uint64_t HoldRefCount; - /// This mutex will be locked when data movement is issued. For targets that - /// doesn't support async data movement, this mutex can guarantee that after - /// it is released, memory region on the target is update to date. For - /// targets that support async data movement, this can guarantee that data - /// movement has been issued. This mutex *must* be locked right before - /// releasing the mapping table lock. - std::mutex UpdateMtx; - }; - // When HostDataToTargetTy is used by std::set, std::set::iterator is const - // use unique_ptr to make States mutable. - const std::unique_ptr States; - -public: - HostDataToTargetTy(uintptr_t BP, uintptr_t B, uintptr_t E, uintptr_t TB, - bool UseHoldRefCount, map_var_info_t Name = nullptr, - bool IsINF = false) - : HstPtrBase(BP), HstPtrBegin(B), HstPtrEnd(E), HstPtrName(Name), - TgtPtrBegin(TB), States(std::make_unique(UseHoldRefCount ? 0 - : IsINF ? INFRefCount - : 1, - !UseHoldRefCount ? 0 - : IsINF ? INFRefCount - : 1)) {} - - /// Get the total reference count. This is smarter than just getDynRefCount() - /// + getHoldRefCount() because it handles the case where at least one is - /// infinity and the other is non-zero. - uint64_t getTotalRefCount() const { - if (States->DynRefCount == INFRefCount || - States->HoldRefCount == INFRefCount) - return INFRefCount; - return States->DynRefCount + States->HoldRefCount; - } - - /// Get the dynamic reference count. - uint64_t getDynRefCount() const { return States->DynRefCount; } - - /// Get the hold reference count. - uint64_t getHoldRefCount() const { return States->HoldRefCount; } - - /// Reset the specified reference count unless it's infinity. 
Reset to 1 - /// (even if currently 0) so it can be followed by a decrement. - void resetRefCount(bool UseHoldRefCount) const { - uint64_t &ThisRefCount = - UseHoldRefCount ? States->HoldRefCount : States->DynRefCount; - if (ThisRefCount != INFRefCount) - ThisRefCount = 1; - } - - /// Increment the specified reference count unless it's infinity. - void incRefCount(bool UseHoldRefCount) const { - uint64_t &ThisRefCount = - UseHoldRefCount ? States->HoldRefCount : States->DynRefCount; - if (ThisRefCount != INFRefCount) { - ++ThisRefCount; - assert(ThisRefCount < INFRefCount && "refcount overflow"); - } - } - - /// Decrement the specified reference count unless it's infinity or zero, and - /// return the total reference count. - uint64_t decRefCount(bool UseHoldRefCount) const { - uint64_t &ThisRefCount = - UseHoldRefCount ? States->HoldRefCount : States->DynRefCount; - uint64_t OtherRefCount = - UseHoldRefCount ? States->DynRefCount : States->HoldRefCount; - (void)OtherRefCount; - if (ThisRefCount != INFRefCount) { - if (ThisRefCount > 0) - --ThisRefCount; - else - assert(OtherRefCount > 0 && "total refcount underflow"); - } - return getTotalRefCount(); - } - - /// Is the dynamic (and thus the total) reference count infinite? - bool isDynRefCountInf() const { return States->DynRefCount == INFRefCount; } - - /// Convert the dynamic reference count to a debug string. - std::string dynRefCountToStr() const { - return refCountToStr(States->DynRefCount); - } - - /// Convert the hold reference count to a debug string. - std::string holdRefCountToStr() const { - return refCountToStr(States->HoldRefCount); - } - - /// Should one decrement of the specified reference count (after resetting it - /// if \c AfterReset) remove this mapping? - bool decShouldRemove(bool UseHoldRefCount, bool AfterReset = false) const { - uint64_t ThisRefCount = - UseHoldRefCount ? States->HoldRefCount : States->DynRefCount; - uint64_t OtherRefCount = - UseHoldRefCount ? 
States->DynRefCount : States->HoldRefCount; - if (OtherRefCount > 0) - return false; - if (AfterReset) - return ThisRefCount != INFRefCount; - return ThisRefCount == 1; - } - - void lock() const { States->UpdateMtx.lock(); } - - void unlock() const { States->UpdateMtx.unlock(); } -}; - -typedef uintptr_t HstPtrBeginTy; -inline bool operator<(const HostDataToTargetTy &lhs, const HstPtrBeginTy &rhs) { - return lhs.HstPtrBegin < rhs; -} -inline bool operator<(const HstPtrBeginTy &lhs, const HostDataToTargetTy &rhs) { - return lhs < rhs.HstPtrBegin; -} -inline bool operator<(const HostDataToTargetTy &lhs, - const HostDataToTargetTy &rhs) { - return lhs.HstPtrBegin < rhs.HstPtrBegin; -} - -typedef std::set> HostDataToTargetListTy; - -struct LookupResult { - struct { - unsigned IsContained : 1; - unsigned ExtendsBefore : 1; - unsigned ExtendsAfter : 1; - } Flags; - - HostDataToTargetListTy::iterator Entry; - - LookupResult() : Flags({0, 0, 0}), Entry() {} -}; - -/// This struct will be returned by \p DeviceTy::getTargetPointer which provides -/// more data than just a target pointer. 
-struct TargetPointerResultTy { - struct { - /// If the map table entry is just created - unsigned IsNewEntry : 1; - /// If the pointer is actually a host pointer (when unified memory enabled) - unsigned IsHostPointer : 1; - } Flags = {0, 0}; - - /// The iterator to the corresponding map table entry - HostDataToTargetListTy::iterator MapTableEntry{}; - - /// The corresponding target pointer - void *TargetPointer = nullptr; -}; - -/// Map for shadow pointers -struct ShadowPtrValTy { - void *HstPtrVal; - void *TgtPtrAddr; - void *TgtPtrVal; -}; -typedef std::map ShadowPtrListTy; - -/// -struct PendingCtorDtorListsTy { - std::list PendingCtors; - std::list PendingDtors; -}; -typedef std::map<__tgt_bin_desc *, PendingCtorDtorListsTy> - PendingCtorsDtorsPerLibrary; - -struct DeviceTy { - int32_t DeviceID; - RTLInfoTy *RTL; - int32_t RTLDeviceID; - - bool IsInit; - std::once_flag InitFlag; - bool HasPendingGlobals; - - HostDataToTargetListTy HostDataToTargetMap; - PendingCtorsDtorsPerLibrary PendingCtorsDtors; - - ShadowPtrListTy ShadowPtrMap; - - std::mutex DataMapMtx, PendingGlobalsMtx, ShadowMtx; - - // NOTE: Once libomp gains full target-task support, this state should be - // moved into the target task in libomp. - std::map LoopTripCnt; - - DeviceTy(RTLInfoTy *RTL); - // DeviceTy is not copyable - DeviceTy(const DeviceTy &D) = delete; - DeviceTy &operator=(const DeviceTy &D) = delete; - - ~DeviceTy(); - - // Return true if data can be copied to DstDevice directly - bool isDataExchangable(const DeviceTy &DstDevice); - - LookupResult lookupMapping(void *HstPtrBegin, int64_t Size); - /// Get the target pointer based on host pointer begin and base. If the - /// mapping already exists, the target pointer will be returned directly. In - /// addition, if required, the memory region pointed by \p HstPtrBegin of size - /// \p Size will also be transferred to the device. 
If the mapping doesn't - /// exist, and if unified shared memory is not enabled, a new mapping will be - /// created and the data will also be transferred accordingly. nullptr will be - /// returned because of any of following reasons: - /// - Data allocation failed; - /// - The user tried to do an illegal mapping; - /// - Data transfer issue fails. - TargetPointerResultTy - getTargetPointer(void *HstPtrBegin, void *HstPtrBase, int64_t Size, - map_var_info_t HstPtrName, bool HasFlagTo, - bool HasFlagAlways, bool IsImplicit, bool UpdateRefCount, - bool HasCloseModifier, bool HasPresentModifier, - bool HasHoldModifier, AsyncInfoTy &AsyncInfo); - void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size); - void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast, - bool UpdateRefCount, bool UseHoldRefCount, - bool &IsHostPtr, bool MustContain = false, - bool ForceDelete = false); - /// For the map entry for \p HstPtrBegin, decrement the reference count - /// specified by \p HasHoldModifier and, if the the total reference count is - /// then zero, deallocate the corresponding device storage and remove the map - /// entry. Return \c OFFLOAD_SUCCESS if the map entry existed, and return - /// \c OFFLOAD_FAIL if not. It is the caller's responsibility to skip calling - /// this function if the map entry is not expected to exist because - /// \p HstPtrBegin uses shared memory. - int deallocTgtPtr(void *HstPtrBegin, int64_t Size, bool HasHoldModifier); - int associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size); - int disassociatePtr(void *HstPtrBegin); - - // calls to RTL - int32_t initOnce(); - __tgt_target_table *load_binary(void *Img); - - // device memory allocation/deallocation routines - /// Allocates \p Size bytes on the device, host or shared memory space - /// (depending on \p Kind) and returns the address/nullptr when - /// succeeds/fails. \p HstPtr is an address of the host data which the - /// allocated target data will be associated with. 
If it is unknown, the - /// default value of \p HstPtr is nullptr. Note: this function doesn't do - /// pointer association. Actually, all the __tgt_rtl_data_alloc - /// implementations ignore \p HstPtr. \p Kind dictates what allocator should - /// be used (host, shared, device). - void *allocData(int64_t Size, void *HstPtr = nullptr, - int32_t Kind = TARGET_ALLOC_DEFAULT); - /// Deallocates memory which \p TgtPtrBegin points at and returns - /// OFFLOAD_SUCCESS/OFFLOAD_FAIL when succeeds/fails. - int32_t deleteData(void *TgtPtrBegin); - - // Data transfer. When AsyncInfo is nullptr, the transfer will be - // synchronous. - // Copy data from host to device - int32_t submitData(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size, - AsyncInfoTy &AsyncInfo); - // Copy data from device back to host - int32_t retrieveData(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size, - AsyncInfoTy &AsyncInfo); - // Copy data from current device to destination device directly - int32_t dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr, - int64_t Size, AsyncInfoTy &AsyncInfo); - - int32_t runRegion(void *TgtEntryPtr, void **TgtVarsPtr, ptrdiff_t *TgtOffsets, - int32_t TgtVarsSize, AsyncInfoTy &AsyncInfo); - int32_t runTeamRegion(void *TgtEntryPtr, void **TgtVarsPtr, - ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, - int32_t NumTeams, int32_t ThreadLimit, - uint64_t LoopTripCount, AsyncInfoTy &AsyncInfo); - - /// Synchronize device/queue/event based on \p AsyncInfo and return - /// OFFLOAD_SUCCESS/OFFLOAD_FAIL when succeeds/fails. - int32_t synchronize(AsyncInfoTy &AsyncInfo); - - /// Calls the corresponding print in the \p RTLDEVID - /// device RTL to obtain the information of the specific device. - bool printDeviceInfo(int32_t RTLDevID); - - /// Event related interfaces. - /// { - /// Create an event. - int32_t createEvent(void **Event); - - /// Record the event based on status in AsyncInfo->Queue at the moment the - /// function is called. 
- int32_t recordEvent(void *Event, AsyncInfoTy &AsyncInfo); - - /// Wait for an event. This function can be blocking or non-blocking, - /// depending on the implmentation. It is expected to set a dependence on the - /// event such that corresponding operations shall only start once the event - /// is fulfilled. - int32_t waitEvent(void *Event, AsyncInfoTy &AsyncInfo); - - /// Synchronize the event. It is expected to block the thread. - int32_t syncEvent(void *Event); - - /// Destroy the event. - int32_t destroyEvent(void *Event); - /// } - -private: - // Call to RTL - void init(); // To be called only via DeviceTy::initOnce() -}; - -extern bool device_is_ready(int device_num); - -/// Struct for the data required to handle plugins -struct PluginManager { - /// RTLs identified on the host - RTLsTy RTLs; - - /// Devices associated with RTLs - std::vector> Devices; - std::mutex RTLsMtx; ///< For RTLs and Devices - - /// Translation table retreived from the binary - HostEntriesBeginToTransTableTy HostEntriesBeginToTransTable; - std::mutex TrlTblMtx; ///< For Translation Table - /// Host offload entries in order of image registration - std::vector<__tgt_offload_entry *> HostEntriesBeginRegistrationOrder; - - /// Map from ptrs on the host to an entry in the Translation Table - HostPtrToTableMapTy HostPtrToTableMap; - std::mutex TblMapMtx; ///< For HostPtrToTableMap - - // Store target policy (disabled, mandatory, default) - kmp_target_offload_kind_t TargetOffloadPolicy = tgt_default; - std::mutex TargetOffloadMtx; ///< For TargetOffloadPolicy -}; - -extern PluginManager *PM; - -#endif diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp --- a/openmp/libomptarget/src/device.cpp +++ b/openmp/libomptarget/src/device.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "device.h" +#include "omptarget.h" #include "private.h" #include "rtl.h" @@ -22,7 +23,9 @@ DeviceTy::DeviceTy(RTLInfoTy 
*RTL) : DeviceID(-1), RTL(RTL), RTLDeviceID(-1), IsInit(false), InitFlag(), HasPendingGlobals(false), HostDataToTargetMap(), PendingCtorsDtors(), - ShadowPtrMap(), DataMapMtx(), PendingGlobalsMtx(), ShadowMtx() {} + ShadowPtrMap(), DataMapMtx(), PendingGlobalsMtx(), ShadowMtx() { + SupportsAsyncFree = RTL->data_delete_async && RTL->synchronize; +} DeviceTy::~DeviceTy() { if (DeviceID == -1 || !(getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)) @@ -153,13 +156,11 @@ return lr; } -TargetPointerResultTy -DeviceTy::getTargetPointer(void *HstPtrBegin, void *HstPtrBase, int64_t Size, - map_var_info_t HstPtrName, bool HasFlagTo, - bool HasFlagAlways, bool IsImplicit, - bool UpdateRefCount, bool HasCloseModifier, - bool HasPresentModifier, bool HasHoldModifier, - AsyncInfoTy &AsyncInfo) { +TargetPointerResultTy DeviceTy::getTargetPointer( + void *HstPtrBegin, void *HstPtrBase, int64_t Size, + map_var_info_t HstPtrName, bool HasFlagTo, bool HasFlagAlways, + bool IsImplicit, bool UpdateRefCount, bool HasCloseModifier, + bool HasPresentModifier, bool HasHoldModifier, AsyncInfoTy &AsyncInfo) { void *TargetPointer = nullptr; bool IsHostPtr = false; bool IsNew = false; @@ -235,7 +236,8 @@ } else if (Size) { // If it is not contained and Size > 0, we should create a new entry for it. IsNew = true; - uintptr_t Ptr = (uintptr_t)allocData(Size, HstPtrBegin); + uintptr_t Ptr = (uintptr_t)allocData(Size, HstPtrBegin, + TARGET_ALLOC_DEFAULT, AsyncInfo); Entry = HostDataToTargetMap .emplace((uintptr_t)HstPtrBase, (uintptr_t)HstPtrBegin, (uintptr_t)HstPtrBegin + Size, Ptr, HasHoldModifier, @@ -363,7 +365,7 @@ } int DeviceTy::deallocTgtPtr(void *HstPtrBegin, int64_t Size, - bool HasHoldModifier) { + bool HasHoldModifier, AsyncInfoTy &AsyncInfo) { // Check if the pointer is contained in any sub-nodes. 
int rc; DataMapMtx.lock(); @@ -373,7 +375,7 @@ if (HT.decRefCount(HasHoldModifier) == 0) { DP("Deleting tgt data " DPxMOD " of size %" PRId64 "\n", DPxPTR(HT.TgtPtrBegin), Size); - deleteData((void *)HT.TgtPtrBegin); + deleteData((void *)HT.TgtPtrBegin, AsyncInfo); INFO(OMP_INFOTYPE_MAPPING_CHANGED, DeviceID, "Removing map entry with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", Size=%" PRId64 ", Name=%s\n", @@ -430,12 +432,17 @@ return rc; } -void *DeviceTy::allocData(int64_t Size, void *HstPtr, int32_t Kind) { - return RTL->data_alloc(RTLDeviceID, Size, HstPtr, Kind); +void *DeviceTy::allocData(int64_t Size, void *HstPtr, int32_t Kind, + __tgt_async_info *AsyncInfo) { + if (!AsyncInfo || !SupportsAsyncFree || !RTL->data_alloc_async) + return RTL->data_alloc(RTLDeviceID, Size, HstPtr, Kind); + return RTL->data_alloc_async(RTLDeviceID, Size, HstPtr, Kind, AsyncInfo); } -int32_t DeviceTy::deleteData(void *TgtPtrBegin) { - return RTL->data_delete(RTLDeviceID, TgtPtrBegin); +int32_t DeviceTy::deleteData(void *TgtPtrBegin, __tgt_async_info *AsyncInfo) { + if (!AsyncInfo || !SupportsAsyncFree) + return RTL->data_delete(RTLDeviceID, TgtPtrBegin); + return RTL->data_delete_async(RTLDeviceID, TgtPtrBegin, AsyncInfo); } // Submit data to device @@ -455,9 +462,8 @@ if (!AsyncInfo || !RTL->data_submit_async || !RTL->synchronize) return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size); - else - return RTL->data_submit_async(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size, - AsyncInfo); + return RTL->data_submit_async(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size, + AsyncInfo); } // Retrieve data from device @@ -476,9 +482,8 @@ if (!RTL->data_retrieve_async || !RTL->synchronize) return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size); - else - return RTL->data_retrieve_async(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size, - AsyncInfo); + return RTL->data_retrieve_async(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size, + AsyncInfo); } // Copy data from current device to destination device
directly @@ -488,9 +493,9 @@ assert(RTL->data_exchange && "RTL->data_exchange is nullptr"); return RTL->data_exchange(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, DstPtr, Size); - } else - return RTL->data_exchange_async(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, - DstPtr, Size, AsyncInfo); + } + return RTL->data_exchange_async(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, + DstPtr, Size, AsyncInfo); } // Run region on device @@ -500,9 +505,8 @@ if (!RTL->run_region || !RTL->synchronize) return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets, TgtVarsSize); - else - return RTL->run_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, - TgtOffsets, TgtVarsSize, AsyncInfo); + return RTL->run_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets, + TgtVarsSize, AsyncInfo); } // Run region on device @@ -523,10 +527,9 @@ return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets, TgtVarsSize, NumTeams, ThreadLimit, LoopTripCount); - else - return RTL->run_team_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, - TgtOffsets, TgtVarsSize, NumTeams, - ThreadLimit, LoopTripCount, AsyncInfo); + return RTL->run_team_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, + TgtOffsets, TgtVarsSize, NumTeams, + ThreadLimit, LoopTripCount, AsyncInfo); } // Whether data can be copied to DstDevice directly diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -517,11 +517,10 @@ const bool HasFlagTo = arg_types[i] & OMP_TGT_MAPTYPE_TO; const bool HasFlagAlways = arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS; - auto TPR = Device.getTargetPointer(HstPtrBegin, HstPtrBase, data_size, - HstPtrName, HasFlagTo, HasFlagAlways, - IsImplicit, UpdateRef, HasCloseModifier, - HasPresentModifier, HasHoldModifier, - AsyncInfo); + auto TPR = Device.getTargetPointer( + HstPtrBegin, HstPtrBase, data_size, HstPtrName, HasFlagTo, + HasFlagAlways, IsImplicit, 
UpdateRef, HasCloseModifier, + HasPresentModifier, HasHoldModifier, AsyncInfo); void *TgtPtrBegin = TPR.TargetPointer; IsHostPtr = TPR.Flags.IsHostPointer; // If data_size==0, then the argument could be a zero-length pointer to @@ -792,20 +791,20 @@ } } - // TODO: We should not synchronize here but pass the AsyncInfo object to the - // allocate/deallocate device APIs. - // - // We need to synchronize before deallocating data. - Ret = AsyncInfo.synchronize(); - if (Ret != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; + // We need to synchronize before deallocating data if the device does not + // support asynchronous frees. + if (!Device.SupportsAsyncFree) { + Ret = AsyncInfo.synchronize(); + if (Ret != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + } // Deallocate target pointer for (DeallocTgtPtrInfo &Info : DeallocTgtPtrs) { if (FromMapperBase && FromMapperBase == Info.HstPtrBegin) continue; Ret = Device.deallocTgtPtr(Info.HstPtrBegin, Info.DataSize, - Info.HasHoldModifier); + Info.HasHoldModifier, AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { REPORT("Deallocating data from device failed.\n"); return OFFLOAD_FAIL; @@ -1124,7 +1123,8 @@ // immediately. if (ArgSize > FirstPrivateArgSizeThreshold || !IsFirstPrivate || AllocImmediately) { - TgtPtr = Device.allocData(ArgSize, HstPtr); + TgtPtr = + Device.allocData(ArgSize, HstPtr, TARGET_ALLOC_DEFAULT, AsyncInfo); if (!TgtPtr) { DP("Data allocation for %sprivate array " DPxMOD " failed.\n", (IsFirstPrivate ? 
"first-" : ""), DPxPTR(HstPtr)); @@ -1187,7 +1187,8 @@ } // Allocate target memory void *TgtPtr = - Device.allocData(FirstPrivateArgSize, FirstPrivateArgBuffer.data()); + Device.allocData(FirstPrivateArgSize, FirstPrivateArgBuffer.data(), + TARGET_ALLOC_DEFAULT, AsyncInfo); if (TgtPtr == nullptr) { DP("Failed to allocate target memory for private arguments.\n"); return OFFLOAD_FAIL; @@ -1222,7 +1223,7 @@ /// Free all target memory allocated for private arguments int free() { for (void *P : TgtPtrs) { - int Ret = Device.deleteData(P); + int Ret = Device.deleteData(P, AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { DP("Deallocation of (first-)private arrays failed.\n"); return OFFLOAD_FAIL; @@ -1484,5 +1485,7 @@ } } - return OFFLOAD_SUCCESS; + // Ensure we synchronize at the very end of a target directive. If we + // synchronized before this might be a no-op as the "queue" is a nullptr now. + return AsyncInfo.synchronize(); } diff --git a/openmp/libomptarget/src/rtl.h b/openmp/libomptarget/src/rtl.h deleted file mode 100644 --- a/openmp/libomptarget/src/rtl.h +++ /dev/null @@ -1,178 +0,0 @@ -//===------------ rtl.h - Target independent OpenMP target RTL ------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Declarations for handling RTL plugins. -// -//===----------------------------------------------------------------------===// - -#ifndef _OMPTARGET_RTL_H -#define _OMPTARGET_RTL_H - -#include "omptarget.h" -#include -#include -#include -#include -#include - -// Forward declarations. 
-struct DeviceTy; -struct __tgt_bin_desc; - -struct RTLInfoTy { - typedef int32_t(is_valid_binary_ty)(void *); - typedef int32_t(is_data_exchangable_ty)(int32_t, int32_t); - typedef int32_t(number_of_devices_ty)(); - typedef int32_t(init_device_ty)(int32_t); - typedef __tgt_target_table *(load_binary_ty)(int32_t, void *); - typedef void *(data_alloc_ty)(int32_t, int64_t, void *, int32_t); - typedef int32_t(data_submit_ty)(int32_t, void *, void *, int64_t); - typedef int32_t(data_submit_async_ty)(int32_t, void *, void *, int64_t, - __tgt_async_info *); - typedef int32_t(data_retrieve_ty)(int32_t, void *, void *, int64_t); - typedef int32_t(data_retrieve_async_ty)(int32_t, void *, void *, int64_t, - __tgt_async_info *); - typedef int32_t(data_exchange_ty)(int32_t, void *, int32_t, void *, int64_t); - typedef int32_t(data_exchange_async_ty)(int32_t, void *, int32_t, void *, - int64_t, __tgt_async_info *); - typedef int32_t(data_delete_ty)(int32_t, void *); - typedef int32_t(run_region_ty)(int32_t, void *, void **, ptrdiff_t *, - int32_t); - typedef int32_t(run_region_async_ty)(int32_t, void *, void **, ptrdiff_t *, - int32_t, __tgt_async_info *); - typedef int32_t(run_team_region_ty)(int32_t, void *, void **, ptrdiff_t *, - int32_t, int32_t, int32_t, uint64_t); - typedef int32_t(run_team_region_async_ty)(int32_t, void *, void **, - ptrdiff_t *, int32_t, int32_t, - int32_t, uint64_t, - __tgt_async_info *); - typedef int64_t(init_requires_ty)(int64_t); - typedef int32_t(synchronize_ty)(int32_t, __tgt_async_info *); - typedef int32_t (*register_lib_ty)(__tgt_bin_desc *); - typedef int32_t(supports_empty_images_ty)(); - typedef void(print_device_info_ty)(int32_t); - typedef void(set_info_flag_ty)(uint32_t); - typedef int32_t(create_event_ty)(int32_t, void **); - typedef int32_t(record_event_ty)(int32_t, void *, __tgt_async_info *); - typedef int32_t(wait_event_ty)(int32_t, void *, __tgt_async_info *); - typedef int32_t(sync_event_ty)(int32_t, void *); - typedef 
int32_t(destroy_event_ty)(int32_t, void *); - - int32_t Idx = -1; // RTL index, index is the number of devices - // of other RTLs that were registered before, - // i.e. the OpenMP index of the first device - // to be registered with this RTL. - int32_t NumberOfDevices = -1; // Number of devices this RTL deals with. - - void *LibraryHandler = nullptr; - -#ifdef OMPTARGET_DEBUG - std::string RTLName; -#endif - - // Functions implemented in the RTL. - is_valid_binary_ty *is_valid_binary = nullptr; - is_data_exchangable_ty *is_data_exchangable = nullptr; - number_of_devices_ty *number_of_devices = nullptr; - init_device_ty *init_device = nullptr; - load_binary_ty *load_binary = nullptr; - data_alloc_ty *data_alloc = nullptr; - data_submit_ty *data_submit = nullptr; - data_submit_async_ty *data_submit_async = nullptr; - data_retrieve_ty *data_retrieve = nullptr; - data_retrieve_async_ty *data_retrieve_async = nullptr; - data_exchange_ty *data_exchange = nullptr; - data_exchange_async_ty *data_exchange_async = nullptr; - data_delete_ty *data_delete = nullptr; - run_region_ty *run_region = nullptr; - run_region_async_ty *run_region_async = nullptr; - run_team_region_ty *run_team_region = nullptr; - run_team_region_async_ty *run_team_region_async = nullptr; - init_requires_ty *init_requires = nullptr; - synchronize_ty *synchronize = nullptr; - register_lib_ty register_lib = nullptr; - register_lib_ty unregister_lib = nullptr; - supports_empty_images_ty *supports_empty_images = nullptr; - set_info_flag_ty *set_info_flag = nullptr; - print_device_info_ty *print_device_info = nullptr; - create_event_ty *create_event = nullptr; - record_event_ty *record_event = nullptr; - wait_event_ty *wait_event = nullptr; - sync_event_ty *sync_event = nullptr; - destroy_event_ty *destroy_event = nullptr; - - // Are there images associated with this RTL. - bool isUsed = false; - - // Mutex for thread-safety when calling RTL interface functions. 
- // It is easier to enforce thread-safety at the libomptarget level, - // so that developers of new RTLs do not have to worry about it. - std::mutex Mtx; -}; - -/// RTLs identified in the system. -struct RTLsTy { - // List of the detected runtime libraries. - std::list AllRTLs; - - // Array of pointers to the detected runtime libraries that have compatible - // binaries. - std::vector UsedRTLs; - - int64_t RequiresFlags = OMP_REQ_UNDEFINED; - - explicit RTLsTy() = default; - - // Register the clauses of the requires directive. - void RegisterRequires(int64_t flags); - - // Initialize RTL if it has not been initialized - void initRTLonce(RTLInfoTy &RTL); - - // Initialize all RTLs - void initAllRTLs(); - - // Register a shared library with all (compatible) RTLs. - void RegisterLib(__tgt_bin_desc *desc); - - // Unregister a shared library from all RTLs. - void UnregisterLib(__tgt_bin_desc *desc); - - // Mutex-like object to guarantee thread-safety and unique initialization - // (i.e. the library attempts to load the RTLs (plugins) only once). - std::once_flag initFlag; - void LoadRTLs(); // not thread-safe -}; - -/// Map between the host entry begin and the translation table. Each -/// registered library gets one TranslationTable. Use the map from -/// __tgt_offload_entry so that we may quickly determine whether we -/// are trying to (re)register an existing lib or really have a new one. -struct TranslationTable { - __tgt_target_table HostTable; - - // Image assigned to a given device. - std::vector<__tgt_device_image *> TargetsImages; // One image per device ID. - - // Table of entry points or NULL if it was not already computed. - std::vector<__tgt_target_table *> TargetsTable; // One table per device ID. -}; -typedef std::map<__tgt_offload_entry *, TranslationTable> - HostEntriesBeginToTransTableTy; - -/// Map between the host ptr and a table index -struct TableMap { - TranslationTable *Table = nullptr; // table associated with the host ptr. 
- uint32_t Index = 0; // index in which the host ptr translated entry is found. - TableMap() = default; - TableMap(TranslationTable *table, uint32_t index) - : Table(table), Index(index) {} -}; -typedef std::map HostPtrToTableMapTy; - -#endif diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp --- a/openmp/libomptarget/src/rtl.cpp +++ b/openmp/libomptarget/src/rtl.cpp @@ -158,6 +158,10 @@ // Optional functions *((void **)&R.init_requires) = dlsym(dynlib_handle, "__tgt_rtl_init_requires"); + *((void **)&R.data_alloc_async) = + dlsym(dynlib_handle, "__tgt_rtl_data_alloc_async"); + *((void **)&R.data_delete_async) = + dlsym(dynlib_handle, "__tgt_rtl_data_delete_async"); *((void **)&R.data_submit_async) = dlsym(dynlib_handle, "__tgt_rtl_data_submit_async"); *((void **)&R.data_retrieve_async) =