diff --git a/openmp/libomptarget/include/omptargetplugin.h b/openmp/libomptarget/include/omptargetplugin.h
--- a/openmp/libomptarget/include/omptargetplugin.h
+++ b/openmp/libomptarget/include/omptargetplugin.h
@@ -145,6 +145,33 @@
 // Print the device information
 void __tgt_rtl_print_device_info(int32_t ID);
 
+// Event related interfaces. They are expected to be used as follows:
+// 1) Create an event on the target device (__tgt_rtl_create_event).
+// 2) Record the event based on the status of \p AsyncInfo->Queue at the moment
+// of function call to __tgt_rtl_record_event. An event becomes "meaningful"
+// once it is recorded, such that others can depend on it.
+// 3) Call __tgt_rtl_wait_event to set dependence on the event. Whether the
+// operation is blocking or non-blocking depends on the target. It is expected
+// to be non-blocking, just set dependence and return.
+// 4) Call __tgt_rtl_sync_event to sync the event. It is expected to block the
+// thread calling the function.
+// 5) Destroy the event (__tgt_rtl_destroy_event).
+// {
+void *__tgt_rtl_create_event(int32_t ID, __tgt_async_info *AsyncInfo);
+
+int32_t __tgt_rtl_record_event(int32_t ID, void *Event,
+                               __tgt_async_info *AsyncInfo);
+
+int32_t __tgt_rtl_wait_event(int32_t ID, void *Event,
+                             __tgt_async_info *AsyncInfo);
+
+int32_t __tgt_rtl_sync_event(int32_t ID, void *Event,
+                             __tgt_async_info *AsyncInfo);
+
+int32_t __tgt_rtl_destroy_event(int32_t ID, void *Event,
+                                __tgt_async_info *AsyncInfo);
+// }
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h
--- a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h
+++ b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h
@@ -22,6 +22,7 @@
 typedef struct CUctx_st *CUcontext;
 typedef struct CUfunc_st *CUfunction;
 typedef struct CUstream_st *CUstream;
+typedef struct CUevent_st *CUevent;
 
 typedef enum cudaError_enum {
   CUDA_SUCCESS = 0,
@@ -248,4 +249,10 @@
 CUresult cuCtxGetLimit(size_t *, CUlimit);
 CUresult cuCtxSetLimit(CUlimit, size_t);
 
+CUresult cuEventCreate(CUevent *, unsigned int);
+CUresult cuEventRecord(CUevent, CUstream);
+CUresult cuStreamWaitEvent(CUstream, CUevent, unsigned int);
+CUresult cuEventSynchronize(CUevent);
+CUresult cuEventDestroy(CUevent);
+
 #endif
diff --git a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp
--- a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp
+++ b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp
@@ -69,6 +69,12 @@
 DLWRAP(cuCtxGetLimit, 2);
 DLWRAP(cuCtxSetLimit, 2);
 
+DLWRAP(cuEventCreate, 2);
+DLWRAP(cuEventRecord, 2);
+DLWRAP(cuStreamWaitEvent, 3);
+DLWRAP(cuEventSynchronize, 1);
+DLWRAP(cuEventDestroy, 1);
+
 DLWRAP_FINALIZE();
 
 #ifndef DYNAMIC_CUDA_PATH
diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
--- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -129,6 +129,64 @@
   return OFFLOAD_SUCCESS;
 }
 
+/// Create a CUDA event with default flags. Returns nullptr on failure.
+void *createEvent() {
+  CUevent Event = nullptr;
+
+  CUresult Err = cuEventCreate(&Event, CU_EVENT_DEFAULT);
+  if (Err != CUDA_SUCCESS) {
+    DP("Error when creating event = " DPxMOD "\n", DPxPTR(Event));
+    CUDA_ERR_STRING(Err);
+    return nullptr;
+  }
+
+  return reinterpret_cast<void *>(Event);
+}
+
+/// Record \p EventPtr on the stream stored in \p AsyncInfo->Queue.
+int recordEvent(void *EventPtr, __tgt_async_info *AsyncInfo) {
+  CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
+  CUevent Event = reinterpret_cast<CUevent>(EventPtr);
+
+  CUresult Err = cuEventRecord(Event, Stream);
+  if (Err != CUDA_SUCCESS) {
+    DP("Error when recording event. stream = " DPxMOD ", event = " DPxMOD "\n",
+       DPxPTR(Stream), DPxPTR(Event));
+    CUDA_ERR_STRING(Err);
+    return OFFLOAD_FAIL;
+  }
+
+  return OFFLOAD_SUCCESS;
+}
+
+/// Synchronize on \p EventPtr via cuEventSynchronize.
+int syncEvent(void *EventPtr) {
+  CUevent Event = reinterpret_cast<CUevent>(EventPtr);
+
+  CUresult Err = cuEventSynchronize(Event);
+  if (Err != CUDA_SUCCESS) {
+    DP("Error when syncing event = " DPxMOD "\n", DPxPTR(Event));
+    CUDA_ERR_STRING(Err);
+    return OFFLOAD_FAIL;
+  }
+
+  return OFFLOAD_SUCCESS;
+}
+
+/// Destroy the CUDA event referred to by \p EventPtr.
+int destroyEvent(void *EventPtr) {
+  CUevent Event = reinterpret_cast<CUevent>(EventPtr);
+
+  CUresult Err = cuEventDestroy(Event);
+  if (Err != CUDA_SUCCESS) {
+    DP("Error when destroying event = " DPxMOD "\n", DPxPTR(Event));
+    CUDA_ERR_STRING(Err);
+    return OFFLOAD_FAIL;
+  }
+
+  return OFFLOAD_SUCCESS;
+}
+
 // Structure contains per-device data
 struct DeviceDataTy {
   /// List that contains all the kernels.
@@ -1332,6 +1390,26 @@
                 "Error returned from cuDeviceGetAttribute\n");
     printf(" Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2);
   }
+
+  /// Make the stream obtained for \p AsyncInfo wait on \p EventPtr.
+  int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo,
+                void *EventPtr) const {
+    CUstream Stream = getStream(DeviceId, AsyncInfo);
+    CUevent Event = reinterpret_cast<CUevent>(EventPtr);
+
+    // We don't use CU_EVENT_WAIT_DEFAULT here as it is only available from
+    // specific CUDA version, and defined as 0x0. In previous version, per CUDA
+    // API document, that argument has to be 0x0.
+    CUresult Err = cuStreamWaitEvent(Stream, Event, 0);
+    if (Err != CUDA_SUCCESS) {
+      DP("Error when waiting event. stream = " DPxMOD ", event = " DPxMOD "\n",
+         DPxPTR(Stream), DPxPTR(Event));
+      CUDA_ERR_STRING(Err);
+      return OFFLOAD_FAIL;
+    }
+
+    return OFFLOAD_SUCCESS;
+  }
 };
 
 DeviceRTLTy DeviceRTL;
@@ -1537,6 +1615,43 @@
   DeviceRTL.printDeviceInfo(device_id);
 }
 
+void *__tgt_rtl_create_event(int32_t device_id,
+                             __tgt_async_info *async_info_ptr) {
+  return createEvent();
+}
+
+int32_t __tgt_rtl_record_event(int32_t device_id, void *event_ptr,
+                               __tgt_async_info *async_info_ptr) {
+  assert(async_info_ptr && "async_info_ptr is nullptr");
+  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
+  assert(event_ptr && "event_ptr is nullptr");
+
+  return recordEvent(event_ptr, async_info_ptr);
+}
+
+int32_t __tgt_rtl_wait_event(int32_t device_id, void *event_ptr,
+                             __tgt_async_info *async_info_ptr) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+  assert(async_info_ptr && "async_info_ptr is nullptr");
+  assert(event_ptr && "event is nullptr");
+
+  return DeviceRTL.waitEvent(device_id, async_info_ptr, event_ptr);
+}
+
+int32_t __tgt_rtl_sync_event(int32_t device_id, void *event_ptr,
+                             __tgt_async_info *async_info_ptr) {
+  assert(event_ptr && "event is nullptr");
+
+  return syncEvent(event_ptr);
+}
+
+int32_t __tgt_rtl_destroy_event(int32_t device_id, void *event_ptr,
+                                __tgt_async_info *async_info_ptr) {
+  assert(event_ptr && "event is nullptr");
+
+  return destroyEvent(event_ptr);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/openmp/libomptarget/plugins/exports b/openmp/libomptarget/plugins/exports
--- a/openmp/libomptarget/plugins/exports
+++ b/openmp/libomptarget/plugins/exports
@@ -24,6 +24,11 @@
     __tgt_rtl_supports_empty_images;
     __tgt_rtl_set_info_flag;
     __tgt_rtl_print_device_info;
+    __tgt_rtl_create_event;
+    __tgt_rtl_record_event;
+    __tgt_rtl_wait_event;
+    __tgt_rtl_sync_event;
+    __tgt_rtl_destroy_event;
   local:
     *;
 };
diff --git a/openmp/libomptarget/src/device.h b/openmp/libomptarget/src/device.h
--- a/openmp/libomptarget/src/device.h
+++ b/openmp/libomptarget/src/device.h
@@ -275,10 +275,32 @@
   /// OFFLOAD_SUCCESS/OFFLOAD_FAIL when succeeds/fails.
   int32_t synchronize(AsyncInfoTy &AsyncInfo);
 
-  /// Calls the corresponding print in the \p RTLDEVID 
+  /// Calls the corresponding print in the \p RTLDEVID
   /// device RTL to obtain the information of the specific device.
   bool printDeviceInfo(int32_t RTLDevID);
 
+  /// Event related interfaces.
+  /// {
+  /// Create an event. Returns nullptr on failure or if events are unsupported.
+  void *createEvent(AsyncInfoTy &AsyncInfo);
+
+  /// Record the event based on status in AsyncInfo->Queue at the moment the
+  /// function is called.
+  int32_t recordEvent(void *Event, AsyncInfoTy &AsyncInfo);
+
+  /// Wait for an event. This function can be blocking or non-blocking,
+  /// depending on the implementation. It is expected to set a dependence on the
+  /// event such that corresponding operations shall only start once the event
+  /// is fulfilled.
+  int32_t waitEvent(void *Event, AsyncInfoTy &AsyncInfo);
+
+  /// Synchronize the event. It is expected to block the thread.
+  int32_t syncEvent(void *Event, AsyncInfoTy &AsyncInfo);
+
+  /// Destroy the event.
+  int32_t destroyEvent(void *Event, AsyncInfoTy &AsyncInfo);
+  /// }
+
 private:
   // Call to RTL
   void init(); // To be called only via DeviceTy::initOnce()
diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp
--- a/openmp/libomptarget/src/device.cpp
+++ b/openmp/libomptarget/src/device.cpp
@@ -553,6 +553,41 @@
   return OFFLOAD_SUCCESS;
 }
 
+void *DeviceTy::createEvent(AsyncInfoTy &AsyncInfo) {
+  if (RTL->create_event)
+    return RTL->create_event(RTLDeviceID, AsyncInfo);
+
+  return nullptr;
+}
+
+int32_t DeviceTy::recordEvent(void *Event, AsyncInfoTy &AsyncInfo) {
+  if (RTL->record_event)
+    return RTL->record_event(RTLDeviceID, Event, AsyncInfo);
+
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t DeviceTy::waitEvent(void *Event, AsyncInfoTy &AsyncInfo) {
+  if (RTL->wait_event)
+    return RTL->wait_event(RTLDeviceID, Event, AsyncInfo);
+
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t DeviceTy::syncEvent(void *Event, AsyncInfoTy &AsyncInfo) {
+  if (RTL->sync_event)
+    return RTL->sync_event(RTLDeviceID, Event, AsyncInfo);
+
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t DeviceTy::destroyEvent(void *Event, AsyncInfoTy &AsyncInfo) {
+  if (RTL->destroy_event)
+    return RTL->destroy_event(RTLDeviceID, Event, AsyncInfo);
+
+  return OFFLOAD_SUCCESS;
+}
+
 /// Check whether a device has an associated RTL and initialize it if it's not
 /// already initialized.
 bool device_is_ready(int device_num) {
diff --git a/openmp/libomptarget/src/rtl.h b/openmp/libomptarget/src/rtl.h
--- a/openmp/libomptarget/src/rtl.h
+++ b/openmp/libomptarget/src/rtl.h
@@ -57,6 +57,11 @@
   typedef int32_t(supports_empty_images_ty)();
   typedef void(print_device_info_ty)(int32_t);
   typedef void(set_info_flag_ty)(uint32_t);
+  typedef void *(create_event_ty)(int32_t, __tgt_async_info *);
+  typedef int32_t(record_event_ty)(int32_t, void *, __tgt_async_info *);
+  typedef int32_t(wait_event_ty)(int32_t, void *, __tgt_async_info *);
+  typedef int32_t(sync_event_ty)(int32_t, void *, __tgt_async_info *);
+  typedef int32_t(destroy_event_ty)(int32_t, void *, __tgt_async_info *);
 
   int32_t Idx = -1; // RTL index, index is the number of devices
                     // of other RTLs that were registered before,
@@ -95,6 +100,11 @@
   supports_empty_images_ty *supports_empty_images = nullptr;
   set_info_flag_ty *set_info_flag = nullptr;
   print_device_info_ty *print_device_info = nullptr;
+  create_event_ty *create_event = nullptr;
+  record_event_ty *record_event = nullptr;
+  wait_event_ty *wait_event = nullptr;
+  sync_event_ty *sync_event = nullptr;
+  destroy_event_ty *destroy_event = nullptr;
 
   // Are there images associated with this RTL.
   bool isUsed = false;
diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp
--- a/openmp/libomptarget/src/rtl.cpp
+++ b/openmp/libomptarget/src/rtl.cpp
@@ -183,6 +183,14 @@
         dlsym(dynlib_handle, "__tgt_rtl_set_info_flag");
     *((void **)&R.print_device_info) =
         dlsym(dynlib_handle, "__tgt_rtl_print_device_info");
+    *((void **)&R.create_event) =
+        dlsym(dynlib_handle, "__tgt_rtl_create_event");
+    *((void **)&R.record_event) =
+        dlsym(dynlib_handle, "__tgt_rtl_record_event");
+    *((void **)&R.wait_event) = dlsym(dynlib_handle, "__tgt_rtl_wait_event");
+    *((void **)&R.sync_event) = dlsym(dynlib_handle, "__tgt_rtl_sync_event");
+    *((void **)&R.destroy_event) =
+        dlsym(dynlib_handle, "__tgt_rtl_destroy_event");
   }
 
 #if OMPT_SUPPORT