Index: openmp/libomptarget/include/omptarget.h
===================================================================
--- openmp/libomptarget/include/omptarget.h
+++ openmp/libomptarget/include/omptarget.h
@@ -136,6 +136,9 @@
   // We assume to use this structure to do synchronization. In CUDA backend, it
   // is CUstream.
   void *Queue = nullptr;
+  // Event recorded on the queue to track completion of its pending work. In
+  // CUDA backend, it is CUevent.
+  void *Event = nullptr;
 };
 
 struct DeviceTy;
@@ -151,8 +154,17 @@
   __tgt_async_info AsyncInfo;
   DeviceTy &Device;
 
+  /// True when this AsyncInfoTy was constructed by a nowait interface.
+  bool FromNoWait;
+  /// If > 0, opt in to the code path using events in synchronize().
+  static int32_t UseNoWaitEvent;
+  /// Set to false when recording/querying events is not supported by the
+  /// plugin, which disables the use of events in synchronize().
+  bool EventSupported = false;
+
 public:
-  AsyncInfoTy(DeviceTy &Device) : Device(Device) {}
+  AsyncInfoTy(DeviceTy &Device, bool from_nowait = false)
+      : Device(Device), FromNoWait(from_nowait) {}
   ~AsyncInfoTy() { synchronize(); }
 
   /// Implicit conversion to the __tgt_async_info which is used in the
@@ -167,6 +179,11 @@
   /// Return a void* reference with a lifetime that is at least as long as this
   /// AsyncInfoTy object. The location can be used as intermediate buffer.
   void *&getVoidPtrLocation();
+
+  /// Set whether the plugin supports recording and querying events.
+  void setEventSupported(bool supported) { EventSupported = supported; }
+  /// Return whether the plugin supports recording and querying events.
+  bool getEventSupported() const { return EventSupported; }
 };
 
 /// This struct is a record of non-contiguous information
@@ -331,6 +348,8 @@
 void __kmpc_push_target_tripcount_mapper(ident_t *loc, int64_t device_id,
                                          uint64_t loop_tripcount);
 
+void __kmpc_target_task_yield();
+
 void __tgt_set_info_flag(uint32_t);
 
 #ifdef __cplusplus
Index: openmp/libomptarget/plugins/cuda/src/rtl.cpp
===================================================================
--- openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -1138,6 +1138,74 @@
     return OFFLOAD_SUCCESS;
   }
 
+  /// Create an event, record it on the stream stored in \p AsyncInfo->Queue
+  /// and publish it through \p AsyncInfo->Event. On success the stream is
+  /// returned to the stream pool and \p AsyncInfo->Queue is reset to nullptr.
+  int recordEvent(const int DeviceId, __tgt_async_info *AsyncInfo) const {
+    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
+    CUevent Event;
+    CUresult Err = cuEventCreate(&Event, CU_EVENT_DEFAULT);
+
+    if (Err != CUDA_SUCCESS) {
+      DP("Error when creating an event. stream = " DPxMOD
+         ", async info ptr = " DPxMOD "\n",
+         DPxPTR(Stream), DPxPTR(AsyncInfo));
+      CUDA_ERR_STRING(Err);
+      return OFFLOAD_FAIL;
+    }
+    Err = cuEventRecord(Event, Stream);
+    if (Err != CUDA_SUCCESS) {
+      DP("Error when recording an event. stream = " DPxMOD
+         ", async info ptr = " DPxMOD "\n",
+         DPxPTR(Stream), DPxPTR(AsyncInfo));
+      CUDA_ERR_STRING(Err);
+      // The event was never published in AsyncInfo; destroy it here so it is
+      // not leaked.
+      cuEventDestroy(Event);
+      return OFFLOAD_FAIL;
+    }
+    AsyncInfo->Event = Event;
+
+    // Once the event is recorded, return the stream to the stream pool and
+    // reset AsyncInfo->Queue.
+    StreamManager->returnStream(DeviceId, Stream);
+    AsyncInfo->Queue = nullptr;
+
+    return OFFLOAD_SUCCESS;
+  }
+
+  /// Poll the event stored in \p AsyncInfo->Event. If the captured work is
+  /// still running, OFFLOAD_SUCCESS is returned and the event is kept. Once
+  /// the work has completed, the event is destroyed and \p AsyncInfo->Event is
+  /// reset to nullptr.
+  int queryEvent(const int DeviceId, __tgt_async_info *AsyncInfo) const {
+    CUevent Event = reinterpret_cast<CUevent>(AsyncInfo->Event);
+    CUresult Err = cuEventQuery(Event);
+
+    if (Err == CUDA_ERROR_NOT_READY) {
+      DP("captured work is incomplete. Event = " DPxMOD
+         ", async info ptr = " DPxMOD "\n",
+         DPxPTR(Event), DPxPTR(AsyncInfo));
+      return OFFLOAD_SUCCESS;
+    } else if (Err != CUDA_SUCCESS) {
+      DP("Error when querying an event. Event = " DPxMOD
+         ", async info ptr = " DPxMOD "\n",
+         DPxPTR(Event), DPxPTR(AsyncInfo));
+      CUDA_ERR_STRING(Err);
+      return OFFLOAD_FAIL;
+    }
+    Err = cuEventDestroy(Event);
+    if (Err != CUDA_SUCCESS) {
+      DP("Error when destroying an event. Event = " DPxMOD
+         ", async info ptr = " DPxMOD "\n",
+         DPxPTR(Event), DPxPTR(AsyncInfo));
+      CUDA_ERR_STRING(Err);
+      return OFFLOAD_FAIL;
+    }
+    AsyncInfo->Event = nullptr;
+    return OFFLOAD_SUCCESS;
+  }
+
   int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const {
     CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
     CUresult Err = cuStreamSynchronize(Stream);
@@ -1343,6 +1411,24 @@
       async_info_ptr);
 }
 
+int32_t __tgt_rtl_record_event(int32_t device_id,
+                               __tgt_async_info *async_info_ptr) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+  assert(async_info_ptr && "async_info_ptr is nullptr");
+  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
+
+  return DeviceRTL.recordEvent(device_id, async_info_ptr);
+}
+
+int32_t __tgt_rtl_query_event(int32_t device_id,
+                              __tgt_async_info *async_info_ptr) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+  assert(async_info_ptr && "async_info_ptr is nullptr");
+  assert(async_info_ptr->Event && "async_info_ptr->Event is nullptr");
+
+  return DeviceRTL.queryEvent(device_id, async_info_ptr);
+}
+
 int32_t __tgt_rtl_synchronize(int32_t device_id,
                               __tgt_async_info *async_info_ptr) {
   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
Index: openmp/libomptarget/plugins/exports
===================================================================
--- openmp/libomptarget/plugins/exports
+++ openmp/libomptarget/plugins/exports
@@ -18,6 +18,8 @@
     __tgt_rtl_run_target_team_region_async;
     __tgt_rtl_run_target_region;
     __tgt_rtl_run_target_region_async;
+    __tgt_rtl_record_event;
+    __tgt_rtl_query_event;
     __tgt_rtl_synchronize;
     __tgt_rtl_register_lib;
     __tgt_rtl_unregister_lib;
Index: openmp/libomptarget/src/CMakeLists.txt
===================================================================
--- openmp/libomptarget/src/CMakeLists.txt
+++ openmp/libomptarget/src/CMakeLists.txt
@@ -35,6 +35,7 @@
   target_link_libraries(omptarget PRIVATE LLVMSupport)
 endif()
 target_link_libraries(omptarget PRIVATE
+  omp
   ${CMAKE_DL_LIBS}
   "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exports")
 
Index: openmp/libomptarget/src/device.h
===================================================================
--- openmp/libomptarget/src/device.h
+++ openmp/libomptarget/src/device.h
@@ -273,6 +273,15 @@
 
+  /// Record an event for the work pending on the queue of \p AsyncInfo.
+  /// Return OFFLOAD_FAIL when recording fails and OFFLOAD_SUCCESS otherwise,
+  /// including when the plugin does not support events.
+  int32_t recordEvent(AsyncInfoTy &AsyncInfo);
+
+  /// Query the event of \p AsyncInfo. OFFLOAD_SUCCESS is returned both when
+  /// the event has been fulfilled and when it is still pending.
+  int32_t queryEvent(AsyncInfoTy &AsyncInfo);
+
   /// Synchronize device/queue/event based on \p AsyncInfo and return
   /// OFFLOAD_SUCCESS/OFFLOAD_FAIL when succeeds/fails.
   int32_t synchronize(AsyncInfoTy &AsyncInfo);
 
 private:
Index: openmp/libomptarget/src/device.cpp
===================================================================
--- openmp/libomptarget/src/device.cpp
+++ openmp/libomptarget/src/device.cpp
@@ -538,6 +538,28 @@
   return false;
 }
 
+// Record an event on the queue held by AsyncInfo. Events are only used when
+// the plugin implements both record_event and query_event; otherwise
+// AsyncInfo is marked as having no event support and OFFLOAD_SUCCESS is
+// returned so that the caller can fall back to synchronize().
+int32_t DeviceTy::recordEvent(AsyncInfoTy &AsyncInfo) {
+  const bool Supported = RTL->record_event && RTL->query_event;
+  AsyncInfo.setEventSupported(Supported);
+  if (!Supported)
+    return OFFLOAD_SUCCESS;
+  return RTL->record_event(RTLDeviceID, AsyncInfo);
+}
+
+// When OFFLOAD_SUCCESS is returned, either the event has been fulfilled
+// without error or the event has not been fulfilled yet and AsyncInfo.Event
+// is not nullptr.
+int32_t DeviceTy::queryEvent(AsyncInfoTy &AsyncInfo) {
+  // When events are not supported, queryEvent should not be called.
+  if (!AsyncInfo.getEventSupported() || !RTL->query_event)
+    return OFFLOAD_FAIL;
+  return RTL->query_event(RTLDeviceID, AsyncInfo);
+}
+
 int32_t DeviceTy::synchronize(AsyncInfoTy &AsyncInfo) {
   if (RTL->synchronize)
     return RTL->synchronize(RTLDeviceID, AsyncInfo);
Index: openmp/libomptarget/src/interface.cpp
===================================================================
--- openmp/libomptarget/src/interface.cpp
+++ openmp/libomptarget/src/interface.cpp
@@ -407,9 +407,35 @@
   if (depNum + noAliasDepNum > 0)
     __kmpc_omp_taskwait(loc, __kmpc_global_thread_num(loc));
 
-  return __tgt_target_teams_mapper(loc, device_id, host_ptr, arg_num, args_base,
-                                   args, arg_sizes, arg_types, arg_names,
-                                   arg_mappers, team_num, thread_limit);
+  DP("Entering target nowait region with entry point " DPxMOD
+     " and device Id %" PRId64 "\n",
+     DPxPTR(host_ptr), device_id);
+  if (checkDeviceAndCtors(device_id, loc) != OFFLOAD_SUCCESS) {
+    DP("Not offloading to device %" PRId64 "\n", device_id);
+    return OFFLOAD_FAIL;
+  }
+
+  if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS)
+    printKernelArguments(loc, device_id, arg_num, arg_sizes, arg_types,
+                         arg_names, "Entering OpenMP kernel");
+#ifdef OMPTARGET_DEBUG
+  for (int i = 0; i < arg_num; ++i) {
+    DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
+       ", Type=0x%" PRIx64 ", Name=%s\n",
+       i, DPxPTR(args_base[i]), DPxPTR(args[i]), arg_sizes[i], arg_types[i],
+       (arg_names) ? getNameFromMapping(arg_names[i]).c_str() : "unknown");
+  }
+#endif
+
+  DeviceTy &Device = PM->Devices[device_id];
+  AsyncInfoTy AsyncInfo(Device, /*from_nowait=*/true);
+  int rc = target(loc, Device, host_ptr, arg_num, args_base, args, arg_sizes,
+                  arg_types, arg_names, arg_mappers, team_num, thread_limit,
+                  true /*team*/, AsyncInfo);
+  if (rc == OFFLOAD_SUCCESS)
+    rc = AsyncInfo.synchronize();
+  handleTargetOutcome(rc == OFFLOAD_SUCCESS, loc);
+  return rc;
 }
 
 // Get the current number of components for a user-defined mapper.
Index: openmp/libomptarget/src/omptarget.cpp
===================================================================
--- openmp/libomptarget/src/omptarget.cpp
+++ openmp/libomptarget/src/omptarget.cpp
@@ -19,11 +19,59 @@
 #include <cassert>
 #include <vector>
 
+// Read LIBOMPTARGET_USE_NOWAIT_EVENT once at load time. strtol is used instead
+// of std::stoi because it does not throw on a malformed value; a value that
+// does not start with a number yields 0, which keeps the event path disabled.
+int32_t AsyncInfoTy::UseNoWaitEvent = []() {
+  char *EnvStr = getenv("LIBOMPTARGET_USE_NOWAIT_EVENT");
+  return EnvStr ? static_cast<int32_t>(strtol(EnvStr, nullptr, 10)) : 0;
+}();
+
 int AsyncInfoTy::synchronize() {
   int Result = OFFLOAD_SUCCESS;
   if (AsyncInfo.Queue) {
-    // If we have a queue we need to synchronize it now.
-    Result = Device.synchronize(*this);
+    // If we have a queue, there is work in flight and we need to synchronize
+    // it.
+    if (FromNoWait && UseNoWaitEvent > 0) {
+      // recordEvent() returns OFFLOAD_SUCCESS when 1) the plugin has no event
+      // support, or when the event was recorded. It returns OFFLOAD_FAIL when
+      // 2) event creation or 3) event recording failed.
+      int Ret = Device.recordEvent(*this);
+      // Handle cases 2 and 3.
+      if (Ret != OFFLOAD_SUCCESS) {
+        DP("Device.recordEvent failed!\n");
+        return OFFLOAD_FAIL;
+      }
+
+      // In case 1, skip task yielding and synchronize directly.
+      if (!EventSupported) {
+        DP("No event support by the pluggin! Calling synchronize\n");
+        Result = Device.synchronize(*this);
+      } else {
+        assert(AsyncInfo.Event && "Event should exist!");
+        // Poll the event, yielding to other tasks between queries. The loop
+        // ends when the query fails or the event has been reset to nullptr.
+        do {
+          __kmpc_target_task_yield();
+          Ret = Device.queryEvent(*this);
+        } while (Ret == OFFLOAD_SUCCESS && AsyncInfo.Event);
+
+        if (Ret != OFFLOAD_SUCCESS) {
+          DP("Device.queryEvent failed!\n");
+          return OFFLOAD_FAIL;
+        }
+
+        // Synchronization relied on the event, which should have been
+        // destroyed by now.
+        assert(AsyncInfo.Event == nullptr && "Event should have been nulled!");
+        DP("Event has been fulfilled and destroyed!\n");
+      }
+    } else {
+      DP("Calling Device.synchronize\n");
+      Result = Device.synchronize(*this);
+    }
+
+    // As the last step, the Queue should have been returned.
     assert(AsyncInfo.Queue == nullptr &&
            "The device plugin should have nulled the queue to indicate there "
            "are no outstanding actions!");
Index: openmp/libomptarget/src/rtl.h
===================================================================
--- openmp/libomptarget/src/rtl.h
+++ openmp/libomptarget/src/rtl.h
@@ -52,6 +52,8 @@
                                             int32_t, uint64_t,
                                             __tgt_async_info *);
   typedef int64_t(init_requires_ty)(int64_t);
+  typedef int32_t(record_event_ty)(int32_t, __tgt_async_info *);
+  typedef int32_t(query_event_ty)(int32_t, __tgt_async_info *);
   typedef int64_t(synchronize_ty)(int32_t, __tgt_async_info *);
   typedef int32_t (*register_lib_ty)(__tgt_bin_desc *);
   typedef int32_t(supports_empty_images_ty)();
@@ -88,6 +90,8 @@
   run_team_region_ty *run_team_region = nullptr;
   run_team_region_async_ty *run_team_region_async = nullptr;
   init_requires_ty *init_requires = nullptr;
+  record_event_ty *record_event = nullptr;
+  query_event_ty *query_event = nullptr;
   synchronize_ty *synchronize = nullptr;
   register_lib_ty register_lib = nullptr;
   register_lib_ty unregister_lib = nullptr;
Index: openmp/libomptarget/src/rtl.cpp
===================================================================
--- openmp/libomptarget/src/rtl.cpp
+++ openmp/libomptarget/src/rtl.cpp
@@ -162,6 +162,9 @@
         dlsym(dynlib_handle, "__tgt_rtl_run_target_region_async");
     *((void **)&R.run_team_region_async) =
         dlsym(dynlib_handle, "__tgt_rtl_run_target_team_region_async");
+    *((void **)&R.record_event) =
+        dlsym(dynlib_handle, "__tgt_rtl_record_event");
+    *((void **)&R.query_event) = dlsym(dynlib_handle, "__tgt_rtl_query_event");
     *((void **)&R.synchronize) = dlsym(dynlib_handle, "__tgt_rtl_synchronize");
     *((void **)&R.data_exchange) =
         dlsym(dynlib_handle, "__tgt_rtl_data_exchange");
Index: openmp/runtime/src/kmp.h
===================================================================
--- openmp/runtime/src/kmp.h
+++ openmp/runtime/src/kmp.h
@@ -3796,6 +3796,9 @@
 KMP_EXPORT kmp_task_t *__kmpc_omp_target_task_alloc(
     ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags, size_t sizeof_kmp_task_t,
     size_t sizeof_shareds, kmp_routine_entry_t task_entry, kmp_int64 device_id);
+
+KMP_EXPORT void __kmpc_target_task_yield();
+
 KMP_EXPORT void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
                                           kmp_task_t *task);
 KMP_EXPORT void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
Index: openmp/runtime/src/kmp_tasking.cpp
===================================================================
--- openmp/runtime/src/kmp_tasking.cpp
+++ openmp/runtime/src/kmp_tasking.cpp
@@ -1447,6 +1447,13 @@
                                sizeof_shareds, task_entry);
 }
 
+// Issue a taskyield on behalf of the calling thread. The global thread id is
+// looked up here so that callers do not have to provide one.
+void __kmpc_target_task_yield() {
+  int gtid = __kmp_get_gtid();
+  __kmpc_omp_taskyield(nullptr, gtid, 0);
+}
+
 /*!
 @ingroup TASKING
 @param loc_ref location of the original task directive