diff --git a/openmp/libomptarget/include/omptargetplugin.h b/openmp/libomptarget/include/omptargetplugin.h
--- a/openmp/libomptarget/include/omptargetplugin.h
+++ b/openmp/libomptarget/include/omptargetplugin.h
@@ -145,6 +145,20 @@
 // Print the device information
 void __tgt_rtl_print_device_info(int32_t ID);
 
+// Create an event at the moment when this function is called based on
+// AsyncInfo. The returned event can be used later for setting dependency or
+// synchronization.
+void *__tgt_rtl_create_event(int32_t ID, __tgt_async_info *AsyncInfo);
+
+// Destroy the event created by __tgt_rtl_create_event previously.
+int32_t __tgt_rtl_destroy_event(int32_t ID, void *Event,
+                                __tgt_async_info *AsyncInfo);
+
+// Wait for the event. It can be used for setting dependences. Depending on
+// targets, it can be blocking or non-blocking.
+int32_t __tgt_rtl_wait_event(int32_t ID, void *Event,
+                             __tgt_async_info *AsyncInfo);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
--- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -20,7 +20,6 @@
 #include
 #include
 #include
-#include
 
 #include "Debug.h"
 #include "omptargetplugin.h"
@@ -131,6 +130,34 @@
   return OFFLOAD_SUCCESS;
 }
 
+/// Create a CUDA event and record it on the stream held in AsyncInfo->Queue.
+/// Returns the event as an opaque pointer, or nullptr if creating or
+/// recording the event fails.
+void *createEvent(__tgt_async_info *AsyncInfo) {
+  CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
+  CUevent Event = nullptr;
+
+  CUresult Err = cuEventCreate(&Event, CU_EVENT_DEFAULT);
+  if (Err != CUDA_SUCCESS) {
+    DP("Error when creating event. stream = " DPxMOD ", event = " DPxMOD "\n",
+       DPxPTR(Stream), DPxPTR(Event));
+    CUDA_ERR_STRING(Err);
+    return nullptr;
+  }
+
+  Err = cuEventRecord(Event, Stream);
+  if (Err != CUDA_SUCCESS) {
+    DP("Error when recording event. stream = " DPxMOD ", event = " DPxMOD "\n",
+       DPxPTR(Stream), DPxPTR(Event));
+    CUDA_ERR_STRING(Err);
+    // Do not leak the event created above; the caller never sees it.
+    cuEventDestroy(Event);
+    return nullptr;
+  }
+
+  return (void *)Event;
+}
+
 // Structure contains per-device data
 struct DeviceDataTy {
   /// List that contains all the kernels.
@@ -1198,9 +1225,10 @@
             &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device),
         "Error returned from cuDeviceGetAttribute\n");
     printf(" Max Shared Memory per Block: \t%d bytes \n", TmpInt);
-    checkResult(cuDeviceGetAttribute(
-                  &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device),
-              "Error returned from cuDeviceGetAttribute\n");
+    checkResult(
+        cuDeviceGetAttribute(
+            &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device),
+        "Error returned from cuDeviceGetAttribute\n");
     printf(" Registers per Block: \t\t%d \n", TmpInt);
     checkResult(
         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device),
@@ -1210,28 +1238,28 @@
             &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device),
         "Error returned from cuDeviceGetAttribute\n");
     printf(" Maximum Threads per Block: \t\t%d \n", TmpInt);
-    checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
-                                   Device),
+    checkResult(cuDeviceGetAttribute(
+                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device),
                 "Error returned from cuDeviceGetAttribute\n");
-    checkResult(cuDeviceGetAttribute(&TmpInt2,
-                                   CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device),
+    checkResult(cuDeviceGetAttribute(
+                    &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device),
                 "Error returned from cuDeviceGetAttribute\n");
-    checkResult(cuDeviceGetAttribute(&TmpInt3,
-                                   CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device),
+    checkResult(cuDeviceGetAttribute(
+                    &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device),
                 "Error returned from cuDeviceGetAttribute\n");
     printf(" Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2,
-         TmpInt3);
-    checkResult(
-      cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device),
-      "Error returned from cuDeviceGetAttribute\n");
-    checkResult(cuDeviceGetAttribute(&TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y,
-                                   Device),
+           TmpInt3);
+    checkResult(cuDeviceGetAttribute(
+                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device),
                 "Error returned from cuDeviceGetAttribute\n");
-    checkResult(cuDeviceGetAttribute(&TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z,
-                                   Device),
+    checkResult(cuDeviceGetAttribute(
+                    &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device),
+                "Error returned from cuDeviceGetAttribute\n");
+    checkResult(cuDeviceGetAttribute(
+                    &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device),
                 "Error returned from cuDeviceGetAttribute\n");
     printf(" Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2,
-         TmpInt3);
+           TmpInt3);
     checkResult(
         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device),
         "Error returned from cuDeviceGetAttribute\n");
@@ -1280,18 +1308,19 @@
             &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device),
         "Error returned from cuDeviceGetAttribute\n");
     printf(" Memory Clock Rate: \t\t\t%d kHz\n", TmpInt);
-    checkResult(cuDeviceGetAttribute(
-                  &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device),
-              "Error returned from cuDeviceGetAttribute\n");
-    printf(" Memory Bus Width: \t\t\t%d bits\n", TmpInt);
-    checkResult(
-      cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, Device),
-      "Error returned from cuDeviceGetAttribute\n");
-    printf(" L2 Cache Size: \t\t\t%d bytes \n", TmpInt);
     checkResult(
         cuDeviceGetAttribute(
-          &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, Device),
+            &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device),
         "Error returned from cuDeviceGetAttribute\n");
+    printf(" Memory Bus Width: \t\t\t%d bits\n", TmpInt);
+    checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE,
+                                     Device),
+                "Error returned from cuDeviceGetAttribute\n");
+    printf(" L2 Cache Size: \t\t\t%d bytes \n", TmpInt);
+    checkResult(cuDeviceGetAttribute(
+                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
+                    Device),
+                "Error returned from cuDeviceGetAttribute\n");
     printf(" Max Threads Per SMP: \t\t%d \n", TmpInt);
     checkResult(cuDeviceGetAttribute(
                     &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device),
@@ -1301,9 +1330,9 @@
                     &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device),
                 "Error returned from cuDeviceGetAttribute\n");
     printf(" Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt));
-    checkResult(
-      cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device),
-      "Error returned from cuDeviceGetAttribute\n");
+    checkResult(cuDeviceGetAttribute(
+                    &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device),
+                "Error returned from cuDeviceGetAttribute\n");
     printf(" Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt));
     checkResult(
         cuDeviceGetAttribute(
@@ -1319,20 +1348,60 @@
             &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device),
         "Error returned from cuDeviceGetAttribute\n");
     printf(" Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt));
-    checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD,
-                                   Device),
+    checkResult(cuDeviceGetAttribute(
+                    &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device),
                 "Error returned from cuDeviceGetAttribute\n");
     printf(" Multi-Device Boars: \t\t%s \n", BOOL2TEXT(TmpInt));
-    checkResult(cuDeviceGetAttribute(&TmpInt,
-                                   CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
-                                   Device),
-              "Error returned from cuDeviceGetAttribute\n");
-    checkResult(cuDeviceGetAttribute(&TmpInt2,
-                                   CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
-                                   Device),
-              "Error returned from cuDeviceGetAttribute\n");
+    checkResult(
+        cuDeviceGetAttribute(
+            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device),
+        "Error returned from cuDeviceGetAttribute\n");
+    checkResult(
+        cuDeviceGetAttribute(
+            &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device),
+        "Error returned from cuDeviceGetAttribute\n");
     printf(" Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2);
   }
+
+  /// Make the stream returned by getStream() for \p AsyncInfo wait for the
+  /// event \p EventPtr. Returns OFFLOAD_SUCCESS or OFFLOAD_FAIL.
+  int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo,
+                void *EventPtr) const {
+    CUstream Stream = getStream(DeviceId, AsyncInfo);
+    CUevent Event = reinterpret_cast<CUevent>(EventPtr);
+
+    // We don't use CU_EVENT_WAIT_DEFAULT here as it is only available from
+    // specific CUDA version, and defined as 0x0. In previous version, per CUDA
+    // API document, that argument has to be 0x0.
+    CUresult Err = cuStreamWaitEvent(Stream, Event, 0);
+    if (Err != CUDA_SUCCESS) {
+      DP("Error when waiting event. stream = " DPxMOD ", event = " DPxMOD "\n",
+         DPxPTR(Stream), DPxPTR(Event));
+      CUDA_ERR_STRING(Err);
+      return OFFLOAD_FAIL;
+    }
+
+    return OFFLOAD_SUCCESS;
+  }
+
+  /// Destroy the event \p EventPtr. The queue in \p AsyncInfo is only used to
+  /// print the stream in the debug message when destruction fails.
+  int destroyEvent(const int DeviceId, __tgt_async_info *AsyncInfo,
+                   void *EventPtr) const {
+    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
+    CUevent Event = reinterpret_cast<CUevent>(EventPtr);
+
+    CUresult Err = cuEventDestroy(Event);
+    if (Err != CUDA_SUCCESS) {
+      DP("Error when destroying event. stream = " DPxMOD ", event = " DPxMOD
+         "\n",
+         DPxPTR(Stream), DPxPTR(Event));
+      CUDA_ERR_STRING(Err);
+      return OFFLOAD_FAIL;
+    }
+
+    return OFFLOAD_SUCCESS;
+  }
 };
 
 DeviceRTLTy DeviceRTL;
@@ -1538,6 +1607,34 @@
   DeviceRTL.printDeviceInfo(device_id);
 }
 
+void *__tgt_rtl_create_event(int32_t device_id,
+                             __tgt_async_info *async_info_ptr) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+  assert(async_info_ptr && "async_info_ptr is nullptr");
+  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
+
+  return createEvent(async_info_ptr);
+}
+
+int32_t __tgt_rtl_wait_event(int32_t device_id, void *event_ptr,
+                             __tgt_async_info *async_info_ptr) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+  assert(async_info_ptr && "async_info_ptr is nullptr");
+  assert(event_ptr && "event is nullptr");
+
+  return DeviceRTL.waitEvent(device_id, async_info_ptr, event_ptr);
+}
+
+int32_t __tgt_rtl_destroy_event(int32_t device_id, void *event_ptr,
+                                __tgt_async_info *async_info_ptr) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+  assert(async_info_ptr && "async_info_ptr is nullptr");
+  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
+  assert(event_ptr && "event is nullptr");
+
+  return DeviceRTL.destroyEvent(device_id, async_info_ptr, event_ptr);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/openmp/libomptarget/plugins/exports b/openmp/libomptarget/plugins/exports
--- a/openmp/libomptarget/plugins/exports
+++ b/openmp/libomptarget/plugins/exports
@@ -24,6 +24,9 @@
     __tgt_rtl_supports_empty_images;
     __tgt_rtl_set_info_flag;
     __tgt_rtl_print_device_info;
+    __tgt_rtl_create_event;
+    __tgt_rtl_destroy_event;
+    __tgt_rtl_wait_event;
   local:
     *;
 };
diff --git a/openmp/libomptarget/src/device.h b/openmp/libomptarget/src/device.h
--- a/openmp/libomptarget/src/device.h
+++ b/openmp/libomptarget/src/device.h
@@ -49,6 +49,9 @@
 
   uintptr_t TgtPtrBegin; // target info.
 
+  /// Pointer to the event corresponding to the data update of this map.
+  mutable void *Event;
+
 private:
   /// use mutable to allow modification via std::set iterator which is const.
   mutable uint64_t RefCount;
@@ -65,7 +68,7 @@
   HostDataToTargetTy(uintptr_t BP, uintptr_t B, uintptr_t E, uintptr_t TB,
                      map_var_info_t Name = nullptr, bool IsINF = false)
       : HstPtrBase(BP), HstPtrBegin(B), HstPtrEnd(E), HstPtrName(Name),
-        TgtPtrBegin(TB), RefCount(IsINF ? INFRefCount : 1),
+        TgtPtrBegin(TB), Event(nullptr), RefCount(IsINF ? INFRefCount : 1),
         UpdateMtx(std::make_shared<std::mutex>()) {}
 
   uint64_t getRefCount() const { return RefCount; }
@@ -275,10 +278,22 @@
   /// OFFLOAD_SUCCESS/OFFLOAD_FAIL when succeeds/fails.
   int32_t synchronize(AsyncInfoTy &AsyncInfo);
 
-  /// Calls the corresponding print in the \p RTLDEVID 
+  /// Calls the corresponding print in the \p RTLDEVID
   /// device RTL to obtain the information of the specific device.
   bool printDeviceInfo(int32_t RTLDevID);
 
+  /// Create an event through the device RTL for \p AsyncInfo. Returns nullptr
+  /// when the RTL does not provide event support.
+  void *createEvent(AsyncInfoTy &AsyncInfo);
+
+  /// Destroy \p Event through the device RTL. Returns OFFLOAD_SUCCESS without
+  /// doing anything when the RTL does not provide event support.
+  int32_t destroyEvent(void *Event, AsyncInfoTy &AsyncInfo);
+
+  /// Wait for \p Event through the device RTL. Returns OFFLOAD_SUCCESS without
+  /// doing anything when the RTL does not provide event support.
+  int32_t waitEvent(void *Event, AsyncInfoTy &AsyncInfo);
+
 private:
   // Call to RTL
   void init(); // To be called only via DeviceTy::initOnce()
diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp
--- a/openmp/libomptarget/src/device.cpp
+++ b/openmp/libomptarget/src/device.cpp
@@ -259,29 +259,63 @@
   if (IsNew && MoveData == MoveDataStateTy::UNKNOWN)
     MoveData = MoveDataStateTy::REQUIRED;
 
-  // If the target pointer is valid, and we need to transfer data, issue the
-  // data transfer.
-  if (TargetPointer && (MoveData == MoveDataStateTy::REQUIRED)) {
-    // Lock the entry before releasing the mapping table lock such that another
-    // thread that could issue data movement will get the right result.
-    Entry->lock();
-    // Release the mapping table lock right after the entry is locked.
-    DataMapMtx.unlock();
+  if (TargetPointer) {
+    // If the target pointer is valid, and we need to transfer data, issue the
+    // data transfer.
+    if (MoveData == MoveDataStateTy::REQUIRED) {
+      // Lock the entry before releasing the mapping table lock such that
+      // another thread that could issue data movement will get the right
+      // result.
+      Entry->lock();
+      // Release the mapping table lock right after the entry is locked.
+      DataMapMtx.unlock();
 
-    DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", Size,
-       DPxPTR(HstPtrBegin), DPxPTR(TargetPointer));
+      DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
+         Size, DPxPTR(HstPtrBegin), DPxPTR(TargetPointer));
 
-    int Ret = submitData(TargetPointer, HstPtrBegin, Size, AsyncInfo);
+      int Ret = submitData(TargetPointer, HstPtrBegin, Size, AsyncInfo);
 
-    // Unlock the entry immediately after the data movement is issued.
-    Entry->unlock();
+      if (Ret != OFFLOAD_SUCCESS) {
+        // Unlock the entry immediately if data movement issuing reports error.
+        Entry->unlock();
 
-    if (Ret != OFFLOAD_SUCCESS) {
-      REPORT("Copying data to device failed.\n");
-      // We will also return nullptr if the data movement fails because that
-      // pointer points to a corrupted memory region so it doesn't make any
-      // sense to continue to use it.
-      TargetPointer = nullptr;
+        REPORT("Copying data to device failed.\n");
+        // We will also return nullptr if the data movement fails because that
+        // pointer points to a corrupted memory region so it doesn't make any
+        // sense to continue to use it.
+        TargetPointer = nullptr;
+      } else {
+        // Create an event at this moment and attach it to the entry.
+        void *Event = createEvent(AsyncInfo);
+        void *OldEvent = Entry->Event;
+        Entry->Event = Event;
+        // We're done with the entry. Release the entry.
+        Entry->unlock();
+        // If there is an event attached, destroy it.
+        if (OldEvent)
+          destroyEvent(OldEvent, AsyncInfo);
+      }
+    } else {
+      // Release the mapping table lock directly.
+      DataMapMtx.unlock();
+      // If not a host pointer, we need to wait for the event if it exists.
+      if (!IsHostPtr) {
+        // Issue the wait while holding the entry lock so that another thread
+        // cannot replace and destroy the event before the wait is issued.
+        Entry->lock();
+        void *Event = Entry->Event;
+        int Ret = Event ? waitEvent(Event, AsyncInfo) : OFFLOAD_SUCCESS;
+        Entry->unlock();
+
+        if (Ret != OFFLOAD_SUCCESS) {
+          // If it fails to wait for the event, we need to return nullptr in
+          // case of any data race.
+          REPORT("Failed to wait for event " DPxMOD ".\n", DPxPTR(Event));
+          return {{false /* IsNewEntry */, false /* IsHostPointer */},
+                  {} /* MapTableEntry */,
+                  nullptr /* TargetPointer */};
+        }
+      }
     }
   } else {
     // Release the mapping table lock directly.
@@ -553,6 +587,30 @@
   return OFFLOAD_SUCCESS;
 }
 
+/// Create an event through the RTL; returns nullptr if the RTL has no support.
+void *DeviceTy::createEvent(AsyncInfoTy &AsyncInfo) {
+  if (RTL->create_event)
+    return RTL->create_event(RTLDeviceID, AsyncInfo);
+
+  return nullptr;
+}
+
+/// Destroy \p Event through the RTL; does nothing if the RTL has no support.
+int32_t DeviceTy::destroyEvent(void *Event, AsyncInfoTy &AsyncInfo) {
+  if (RTL->destroy_event)
+    return RTL->destroy_event(RTLDeviceID, Event, AsyncInfo);
+
+  return OFFLOAD_SUCCESS;
+}
+
+/// Wait for \p Event through the RTL; does nothing if the RTL has no support.
+int32_t DeviceTy::waitEvent(void *Event, AsyncInfoTy &AsyncInfo) {
+  if (RTL->wait_event)
+    return RTL->wait_event(RTLDeviceID, Event, AsyncInfo);
+
+  return OFFLOAD_SUCCESS;
+}
+
 /// Check whether a device has an associated RTL and initialize it if it's not
 /// already initialized.
 bool device_is_ready(int device_num) {
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -584,12 +584,20 @@
 
         int rt = Device.submitData(PointerTgtPtrBegin, &TgtPtrBase,
                                    sizeof(void *), AsyncInfo);
-        Pointer_TPR.MapTableEntry->unlock();
-
         if (rt != OFFLOAD_SUCCESS) {
+          Pointer_TPR.MapTableEntry->unlock();
           REPORT("Copying data to device failed.\n");
           return OFFLOAD_FAIL;
         }
+        // Create a new event for this moment
+        void *Event = Device.createEvent(AsyncInfo);
+        // Exchange the old event with new created event
+        void *OldEvent = Pointer_TPR.MapTableEntry->Event;
+        Pointer_TPR.MapTableEntry->Event = Event;
+        Pointer_TPR.MapTableEntry->unlock();
+        // If the old event is not null, we need to destroy it.
+        if (OldEvent)
+          Device.destroyEvent(OldEvent, AsyncInfo);
       } else
         Device.ShadowMtx.unlock();
     }
diff --git a/openmp/libomptarget/src/rtl.h b/openmp/libomptarget/src/rtl.h
--- a/openmp/libomptarget/src/rtl.h
+++ b/openmp/libomptarget/src/rtl.h
@@ -57,6 +57,9 @@
   typedef int32_t(supports_empty_images_ty)();
   typedef void(print_device_info_ty)(int32_t);
   typedef void(set_info_flag_ty)(uint32_t);
+  typedef void *(create_event_ty)(int32_t, __tgt_async_info *);
+  typedef int32_t(destroy_event_ty)(int32_t, void *, __tgt_async_info *);
+  typedef int32_t(wait_event_ty)(int32_t, void *, __tgt_async_info *);
 
   int32_t Idx = -1; // RTL index, index is the number of devices
                     // of other RTLs that were registered before,
@@ -95,6 +98,9 @@
   supports_empty_images_ty *supports_empty_images = nullptr;
   set_info_flag_ty *set_info_flag = nullptr;
   print_device_info_ty *print_device_info = nullptr;
+  create_event_ty *create_event = nullptr;
+  destroy_event_ty *destroy_event = nullptr;
+  wait_event_ty *wait_event = nullptr;
 
   // Are there images associated with this RTL.
   bool isUsed = false;
diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp
--- a/openmp/libomptarget/src/rtl.cpp
+++ b/openmp/libomptarget/src/rtl.cpp
@@ -183,6 +183,11 @@
         dlsym(dynlib_handle, "__tgt_rtl_set_info_flag");
     *((void **)&R.print_device_info) =
         dlsym(dynlib_handle, "__tgt_rtl_print_device_info");
+    *((void **)&R.create_event) =
+        dlsym(dynlib_handle, "__tgt_rtl_create_event");
+    *((void **)&R.destroy_event) =
+        dlsym(dynlib_handle, "__tgt_rtl_destroy_event");
+    *((void **)&R.wait_event) = dlsym(dynlib_handle, "__tgt_rtl_wait_event");
   }
 
 #if OMPT_SUPPORT