diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h
--- a/openmp/libomptarget/include/omptarget.h
+++ b/openmp/libomptarget/include/omptarget.h
@@ -168,7 +168,7 @@
   DeviceTy &Device;
 
 public:
-  AsyncInfoTy(DeviceTy &Device) : Device(Device) {}
+  AsyncInfoTy(DeviceTy &Device);
   ~AsyncInfoTy() { synchronize(); }
 
   /// Implicit conversion to the __tgt_async_info which is used in the
diff --git a/openmp/libomptarget/include/omptargetplugin.h b/openmp/libomptarget/include/omptargetplugin.h
--- a/openmp/libomptarget/include/omptargetplugin.h
+++ b/openmp/libomptarget/include/omptargetplugin.h
@@ -70,6 +70,11 @@
 void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr,
                            int32_t Kind);
 
+// Create object for device synchronization. In case of success, return zero
+// and AsyncInfo->Queue is not a nullptr. On failure, return an error code and
+// set AsyncInfo->Queue to nullptr.
+int32_t __tgt_rtl_create_async_info(int32_t ID, __tgt_async_info *AsyncInfo);
+
 // Pass the data content to the target device using the target address. In case
 // of success, return zero. Otherwise, return an error code.
 int32_t __tgt_rtl_data_submit(int32_t ID, void *TargetPtr, void *HostPtr,
diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -2204,6 +2204,15 @@
   return ptr;
 }
 
+// Create the object used for device synchronization by delegating to
+// initAsyncInfo(). Always returns OFFLOAD_SUCCESS.
+// NOTE(review): callers expect AsyncInfo->Queue to be non-null on success;
+// confirm initAsyncInfo() guarantees that.
+int32_t __tgt_rtl_create_async_info(int, __tgt_async_info *AsyncInfo) {
+  initAsyncInfo(AsyncInfo);
+  return OFFLOAD_SUCCESS;
+}
+
 int32_t __tgt_rtl_data_submit(int device_id, void *tgt_ptr, void *hst_ptr,
                               int64_t size) {
   assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
--- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -468,15 +468,6 @@
     E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
   }
 
-  CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
-    assert(AsyncInfo && "AsyncInfo is nullptr");
-
-    if (!AsyncInfo->Queue)
-      AsyncInfo->Queue = StreamManager->getStream(DeviceId);
-
-    return reinterpret_cast<CUstream>(AsyncInfo->Queue);
-  }
-
 public:
   // This class should not be copied
   DeviceRTLTy(const DeviceRTLTy &) = delete;
@@ -950,6 +941,15 @@
     return nullptr;
   }
 
+  CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
+    assert(AsyncInfo && "AsyncInfo is nullptr");
+
+    if (!AsyncInfo->Queue)
+      AsyncInfo->Queue = StreamManager->getStream(DeviceId);
+
+    return reinterpret_cast<CUstream>(AsyncInfo->Queue);
+  }
+
   int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
                  const int64_t Size, __tgt_async_info *AsyncInfo) const {
     assert(AsyncInfo && "AsyncInfo is nullptr");
@@ -1450,6 +1450,15 @@
   return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind);
 }
 
+// Create the object used for device synchronization: getStream() assigns a
+// stream to AsyncInfo->Queue if it is still nullptr. Returns OFFLOAD_SUCCESS
+// when a queue is set afterwards and OFFLOAD_FAIL otherwise.
+int32_t __tgt_rtl_create_async_info(int DeviceId, __tgt_async_info *AsyncInfo) {
+  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
+  DeviceRTL.getStream(DeviceId, AsyncInfo);
+  return AsyncInfo->Queue ? OFFLOAD_SUCCESS : OFFLOAD_FAIL;
+}
+
 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
                               int64_t size) {
   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
diff --git a/openmp/libomptarget/plugins/exports b/openmp/libomptarget/plugins/exports
--- a/openmp/libomptarget/plugins/exports
+++ b/openmp/libomptarget/plugins/exports
@@ -7,6 +7,7 @@
     __tgt_rtl_init_device;
     __tgt_rtl_load_binary;
     __tgt_rtl_data_alloc;
+    __tgt_rtl_create_async_info;
     __tgt_rtl_data_submit;
     __tgt_rtl_data_submit_async;
     __tgt_rtl_data_retrieve;
diff --git a/openmp/libomptarget/src/device.h b/openmp/libomptarget/src/device.h
--- a/openmp/libomptarget/src/device.h
+++ b/openmp/libomptarget/src/device.h
@@ -324,6 +324,10 @@
   /// OFFLOAD_SUCCESS/OFFLOAD_FAIL when succeeds/fails.
   int32_t deleteData(void *TgtPtrBegin);
 
+  /// Create async info, OFFLOAD_SUCCESS/OFFLOAD_FAIL when succeeds/fails
+  /// and sets the internal Queue to !nullptr/nullptr for success/fail
+  int32_t createAsyncInfo(AsyncInfoTy &AsyncInfo);
+
   // Data transfer. When AsyncInfo is nullptr, the transfer will be
   // synchronous.
   // Copy data from host to device
diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp
--- a/openmp/libomptarget/src/device.cpp
+++ b/openmp/libomptarget/src/device.cpp
@@ -438,6 +438,12 @@
   return RTL->data_delete(RTLDeviceID, TgtPtrBegin);
 }
 
+int32_t DeviceTy::createAsyncInfo(AsyncInfoTy &AsyncInfo) {
+  if (RTL->create_async_info)
+    return RTL->create_async_info(RTLDeviceID, AsyncInfo);
+  return OFFLOAD_FAIL;
+}
+
 // Submit data to device
 int32_t DeviceTy::submitData(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size,
                              AsyncInfoTy &AsyncInfo) {
@@ -453,7 +459,7 @@
              : "unknown");
   }
 
-  if (!AsyncInfo || !RTL->data_submit_async || !RTL->synchronize)
+  if (!RTL->data_submit_async || !RTL->synchronize)
     return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size);
   else
     return RTL->data_submit_async(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size,
@@ -484,7 +490,7 @@
 // Copy data from current device to destination device directly
 int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
                                int64_t Size, AsyncInfoTy &AsyncInfo) {
-  if (!AsyncInfo || !RTL->data_exchange_async || !RTL->synchronize) {
+  if (!RTL->data_exchange_async || !RTL->synchronize) {
     assert(RTL->data_exchange && "RTL->data_exchange is nullptr");
     return RTL->data_exchange(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, DstPtr,
                               Size);
@@ -497,7 +503,7 @@
 int32_t DeviceTy::runRegion(void *TgtEntryPtr, void **TgtVarsPtr,
                             ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,
                             AsyncInfoTy &AsyncInfo) {
-  if (!RTL->run_region || !RTL->synchronize)
+  if (!RTL->run_region_async || !RTL->synchronize)
     return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets,
                            TgtVarsSize);
   else
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -21,4 +21,15 @@
 
+// Ask the device to create its async info object at construction time. The
+// return code and the queue pointer must agree: success implies a non-null
+// queue and failure implies a null queue.
+AsyncInfoTy::AsyncInfoTy(DeviceTy &Device) : Device(Device) {
+  int Result = Device.createAsyncInfo(*this);
+  assert(
+      ((Result == OFFLOAD_SUCCESS) == (AsyncInfo.Queue != nullptr)) &&
+      "The device plugin returned inconsistent return value and queue pointer");
+  (void)Result;
+}
+
 int AsyncInfoTy::synchronize() {
   int Result = OFFLOAD_SUCCESS;
   if (AsyncInfo.Queue) {
diff --git a/openmp/libomptarget/src/rtl.h b/openmp/libomptarget/src/rtl.h
--- a/openmp/libomptarget/src/rtl.h
+++ b/openmp/libomptarget/src/rtl.h
@@ -31,6 +31,7 @@
   typedef int32_t(init_device_ty)(int32_t);
   typedef __tgt_target_table *(load_binary_ty)(int32_t, void *);
   typedef void *(data_alloc_ty)(int32_t, int64_t, void *, int32_t);
+  typedef int32_t(create_async_info_ty)(int32_t, __tgt_async_info *);
   typedef int32_t(data_submit_ty)(int32_t, void *, void *, int64_t);
   typedef int32_t(data_submit_async_ty)(int32_t, void *, void *, int64_t,
                                         __tgt_async_info *);
@@ -82,6 +83,7 @@
   init_device_ty *init_device = nullptr;
   load_binary_ty *load_binary = nullptr;
   data_alloc_ty *data_alloc = nullptr;
+  create_async_info_ty *create_async_info = nullptr;
   data_submit_ty *data_submit = nullptr;
   data_submit_async_ty *data_submit_async = nullptr;
   data_retrieve_ty *data_retrieve = nullptr;
diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp
--- a/openmp/libomptarget/src/rtl.cpp
+++ b/openmp/libomptarget/src/rtl.cpp
@@ -154,6 +154,8 @@
     // Optional functions
     *((void **)&R.init_requires) =
         dlsym(dynlib_handle, "__tgt_rtl_init_requires");
+    *((void **)&R.create_async_info) =
+        dlsym(dynlib_handle, "__tgt_rtl_create_async_info");
     *((void **)&R.data_submit_async) =
         dlsym(dynlib_handle, "__tgt_rtl_data_submit_async");
     *((void **)&R.data_retrieve_async) =