diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h --- a/openmp/libomptarget/include/omptarget.h +++ b/openmp/libomptarget/include/omptarget.h @@ -136,6 +136,10 @@ // We assume to use this structure to do synchronization. In CUDA backend, it // is CUstream. void *Queue = nullptr; + // A pointer to an event-like structure that will be used for tracking + // dependences among different asynchronous operations. In CUDA backend, it is + // CUevent. + void *Event = nullptr; }; struct DeviceTy; diff --git a/openmp/libomptarget/include/omptargetplugin.h b/openmp/libomptarget/include/omptargetplugin.h --- a/openmp/libomptarget/include/omptargetplugin.h +++ b/openmp/libomptarget/include/omptargetplugin.h @@ -142,6 +142,11 @@ // Set plugin's internal information flag externally. void __tgt_rtl_set_info_flag(uint32_t); +// Set a dependency on AsyncInfo->Event for AsyncInfo->Queue. All operations +// after the event can only run after the event is fulfilled. In case of +// success, return zero. Otherwise, return an error code. +int32_t __tgt_rtl_set_dependency(int32_t ID, __tgt_async_info *AsyncInfo); + #ifdef __cplusplus } #endif diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp --- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp @@ -125,6 +125,30 @@ return OFFLOAD_SUCCESS; } +int recordEvent(CUstream Stream, __tgt_async_info *AsyncInfo) { + CUevent Event = nullptr; + + CUresult Err = cuEventCreate(&Event, CU_EVENT_DEFAULT); + if (Err != CUDA_SUCCESS) { + DP("Error when creating event. stream = " DPxMOD ", event = " DPxMOD ".\n", + DPxPTR(Stream), DPxPTR(Event)); + CUDA_ERR_STRING(Err); + return OFFLOAD_FAIL; + } + + Err = cuEventRecord(Event, Stream); + if (Err != CUDA_SUCCESS) { + DP("Error when recording event. stream = " DPxMOD ", event = " DPxMOD ".\n", + DPxPTR(Stream), DPxPTR(Event)); + CUDA_ERR_STRING(Err); + return OFFLOAD_FAIL; + } + + AsyncInfo->Event = Event; + + return OFFLOAD_SUCCESS; + } + // Structure contains per-device data struct DeviceDataTy { /// List that contains all the kernels. @@ -883,7 +907,7 @@ return OFFLOAD_FAIL; } - return OFFLOAD_SUCCESS; + return recordEvent(Stream, AsyncInfo); } int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr, @@ -1103,6 +1127,20 @@ } return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL; } + + int setDependency(const int DeviceId, __tgt_async_info *AsyncInfo) const { + CUstream Stream = reinterpret_cast(AsyncInfo->Queue); + CUevent Event = reinterpret_cast(AsyncInfo->Event); + CUresult Err = cuStreamWaitEvent(Stream, Event, CU_EVENT_WAIT_DEFAULT); + + if (Err != CUDA_SUCCESS) { + DP("Error when waiting event. stream = " DPxMOD ", event = " DPxMOD "\n", + DPxPTR(Stream), DPxPTR(Event)); + CUDA_ERR_STRING(Err); + } + + return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL; + } }; DeviceRTLTy DeviceRTL; @@ -1303,6 +1341,16 @@ InfoLevel.store(NewInfoLevel); } +int32_t __tgt_rtl_set_dependency(int32_t device_id, + __tgt_async_info *async_info_ptr) { + assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); + assert(async_info_ptr && "async_info_ptr is nullptr"); + assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr"); + assert(async_info_ptr->Event && "async_info_ptr->Event is nullptr"); + + return DeviceRTL.setDependency(device_id, async_info_ptr); +} + #ifdef __cplusplus } #endif diff --git a/openmp/libomptarget/plugins/exports b/openmp/libomptarget/plugins/exports --- a/openmp/libomptarget/plugins/exports +++ b/openmp/libomptarget/plugins/exports @@ -23,6 +23,7 @@ __tgt_rtl_unregister_lib; __tgt_rtl_supports_empty_images; __tgt_rtl_set_info_flag; + __tgt_rtl_set_dependency; local: *; }; diff --git a/openmp/libomptarget/src/device.h b/openmp/libomptarget/src/device.h --- a/openmp/libomptarget/src/device.h +++ b/openmp/libomptarget/src/device.h @@ -49,6 +49,9 @@ uintptr_t TgtPtrBegin; // target info. + /// Pointer to the event corresponding to the data movement of this map. + volatile mutable void *Event; + private: /// use mutable to allow modification via std::set iterator which is const. mutable uint64_t RefCount; @@ -58,7 +61,7 @@ HostDataToTargetTy(uintptr_t BP, uintptr_t B, uintptr_t E, uintptr_t TB, map_var_info_t Name = nullptr, bool IsINF = false) : HstPtrBase(BP), HstPtrBegin(B), HstPtrEnd(E), HstPtrName(Name), - TgtPtrBegin(TB), RefCount(IsINF ? INFRefCount : 1) {} + TgtPtrBegin(TB), Event(nullptr), RefCount(IsINF ? INFRefCount : 1) {} uint64_t getRefCount() const { return RefCount; } @@ -241,6 +244,8 @@ /// OFFLOAD_SUCCESS/OFFLOAD_FAIL when succeeds/fails. int32_t synchronize(AsyncInfoTy &AsyncInfo); + int32_t setDependency(AsyncInfoTy &AsyncInfo); + private: // Call to RTL void init(); // To be called only via DeviceTy::initOnce() diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp --- a/openmp/libomptarget/src/device.cpp +++ b/openmp/libomptarget/src/device.cpp @@ -522,6 +522,12 @@ return OFFLOAD_SUCCESS; } +int32_t DeviceTy::setDependency(AsyncInfoTy &AsyncInfo) { + if (RTL->set_dependency) + return RTL->set_dependency(RTLDeviceID, AsyncInfo); + return OFFLOAD_SUCCESS; +} + /// Check whether a device has an associated RTL and initialize it if it's not /// already initialized. bool device_is_ready(int device_num) { diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -564,7 +564,26 @@ REPORT("Copying data to device failed.\n"); return OFFLOAD_FAIL; } - // TODO: Attach the event in AsyncInfo to the map table entry if needed + + if (TPR.MapTableEntry->Event) { + // TODO: destroy the event + } + + TPR.MapTableEntry->Event = + static_cast<__tgt_async_info *>(AsyncInfo)->Event; + static_cast<__tgt_async_info *>(AsyncInfo)->Event = nullptr; + } else if (!IsHostPtr) { + // If the data doesn't need to be moved, we need to gaurantee that the + // corresponding event is enqueued to avoid data race. + // Busy wait while the attached event is still nullptr, which means + // the data movement has not been scheduled yet. + while (TPR.MapTableEntry->Event == nullptr) + ; + // TODO: We probably want to set a member function to do that + static_cast<__tgt_async_info *>(AsyncInfo)->Event = + const_cast(TPR.MapTableEntry->Event); + Device.setDependency(AsyncInfo); + static_cast<__tgt_async_info *>(AsyncInfo)->Event = nullptr; } } @@ -581,6 +600,13 @@ return OFFLOAD_FAIL; } // TODO: Attach the event in AsyncInfo to the map table entry if needed + if (Pointer_TPR.MapTableEntry->Event) { + // TODO: destroy the event + } + + Pointer_TPR.MapTableEntry->Event = + static_cast<__tgt_async_info *>(AsyncInfo)->Event; + static_cast<__tgt_async_info *>(AsyncInfo)->Event = nullptr; // create shadow pointers for this entry Device.ShadowMtx.lock(); Device.ShadowPtrMap[Pointer_HstPtrBegin] = { diff --git a/openmp/libomptarget/src/rtl.h b/openmp/libomptarget/src/rtl.h --- a/openmp/libomptarget/src/rtl.h +++ b/openmp/libomptarget/src/rtl.h @@ -56,6 +56,7 @@ typedef int32_t (*register_lib_ty)(__tgt_bin_desc *); typedef int32_t(supports_empty_images_ty)(); typedef void(set_info_flag_ty)(uint32_t); + typedef int32_t(set_dependency_ty)(int32_t, __tgt_async_info *); int32_t Idx = -1; // RTL index, index is the number of devices // of other RTLs that were registered before, @@ -93,6 +94,7 @@ register_lib_ty unregister_lib = nullptr; supports_empty_images_ty *supports_empty_images = nullptr; set_info_flag_ty *set_info_flag = nullptr; + set_dependency_ty *set_dependency = nullptr; // Are there images associated with this RTL. bool isUsed = false; diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp --- a/openmp/libomptarget/src/rtl.cpp +++ b/openmp/libomptarget/src/rtl.cpp @@ -177,6 +177,8 @@ dlsym(dynlib_handle, "__tgt_rtl_supports_empty_images"); *((void **)&R.set_info_flag) = dlsym(dynlib_handle, "__tgt_rtl_set_info_flag"); + *((void **)&R.set_dependency) = + dlsym(dynlib_handle, "__tgt_rtl_set_dependency"); } DP("RTLs loaded!\n"); diff --git a/openmp/libomptarget/test/offloading/bug49334.cpp b/openmp/libomptarget/test/offloading/bug49334.cpp --- a/openmp/libomptarget/test/offloading/bug49334.cpp +++ b/openmp/libomptarget/test/offloading/bug49334.cpp @@ -70,8 +70,8 @@ } }; -constexpr const int BS = 256; -constexpr const int N = 1024; +constexpr const int BS = 16; +constexpr const int N = 256; int BlockMatMul_TargetNowait(BlockMatrix &A, BlockMatrix &B, BlockMatrix &C) { #pragma omp parallel