Index: openmp/libomptarget/include/omptarget.h
===================================================================
--- openmp/libomptarget/include/omptarget.h
+++ openmp/libomptarget/include/omptarget.h
@@ -19,6 +19,7 @@
 #define OFFLOAD_SUCCESS (0)
 #define OFFLOAD_FAIL (~0)
+#define OFFLOAD_NOT_DONE (1)
 
 #define OFFLOAD_DEVICE_DEFAULT -1
 #define HOST_DEVICE -10
@@ -114,10 +115,14 @@
 /// This struct contains information exchanged between different asynchronous
 /// operations for device-dependent optimization and potential synchronization
 struct __tgt_async_info {
+  // Device ID. Note that it is NOT the RTLDeviceID. We do not need to store
+  // the RTLDeviceID explicitly because it can always be recovered via
+  // DeviceID.
+  int DeviceID = -1;
   // A pointer to a queue-like structure where offloading operations are issued.
-  // We assume to use this structure to do synchronization. In CUDA backend, it
-  // is CUstream.
+  // We assume this structure is also used to do synchronization.
   void *Queue = nullptr;
+  // A pointer to a device-dependent event used for synchronization as well.
+  void *Event = nullptr;
 };
 
 #ifdef __cplusplus
@@ -208,6 +213,7 @@
     int32_t depNum, void *depList, int32_t noAliasDepNum, void *noAliasDepList);
 void __kmpc_push_target_tripcount(int64_t device_id, uint64_t loop_tripcount);
+void __kmpc_free_async_info(void *Ptr);
 
 #ifdef __cplusplus
 }
Index: openmp/libomptarget/include/omptargetplugin.h
===================================================================
--- openmp/libomptarget/include/omptargetplugin.h
+++ openmp/libomptarget/include/omptargetplugin.h
@@ -129,9 +129,32 @@
     int32_t NumTeams, int32_t ThreadLimit, uint64_t loop_tripcount,
     __tgt_async_info *AsyncInfoPtr);
 
-// Device synchronization. In case of success, return zero. Otherwise, return an
-// error code.
-int32_t __tgt_rtl_synchronize(int32_t ID, __tgt_async_info *AsyncInfoPtr);
+// Initialize a __tgt_async_info, i.e. create the underlying queue and event.
+int32_t __tgt_rtl_initialize_async_info(int32_t ID,
+                                        __tgt_async_info **AsyncInfo);
+
+// Release all resources held by a __tgt_async_info, including the structure
+// itself.
+int32_t __tgt_rtl_release_async_info(int32_t ID, __tgt_async_info *AsyncInfo);
+
+// Wait for an event. This is different from synchronizing on an event:
+// waiting is non-blocking for the host. All operations enqueued after the
+// wait are held back until the event has been fulfilled.
+int32_t __tgt_rtl_wait_event(int32_t ID, __tgt_async_info *AsyncInfo,
+                             __tgt_async_info *DepAsyncInfo);
+
+// Record an event so that it can later be waited on or synchronized with.
+// Note that once the event is recorded, subsequent users of the async info
+// must not touch the queue again; the implementation is expected to release
+// the queue in some way.
+int32_t __tgt_rtl_record_event(int32_t ID, __tgt_async_info *AsyncInfo);
+
+// Synchronize with an event or queue. This is a blocking operation.
+int32_t __tgt_rtl_synchronize(int32_t ID, __tgt_async_info *AsyncInfo);
+
+// Query whether an event has been fulfilled. Returns OFFLOAD_SUCCESS if it
+// has been fulfilled, OFFLOAD_NOT_DONE if it has not finished yet, and
+// OFFLOAD_FAIL if something went wrong.
+int32_t __tgt_rtl_check_event(int32_t ID, __tgt_async_info *AsyncInfo);
 
 #ifdef __cplusplus
 }
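For orientation, here is a minimal sketch (not part of the patch) of how libomptarget is expected to drive the new plugin entry points end to end; error handling is elided and `Dev`, `TgtPtr`, `HstPtr`, `Entry`, `Args`, `Offsets`, and `NumArgs` are placeholder names:

```cpp
__tgt_async_info *AsyncInfo = nullptr;
__tgt_rtl_initialize_async_info(Dev, &AsyncInfo);   // creates queue + event
__tgt_rtl_data_submit_async(Dev, TgtPtr, HstPtr, Size, AsyncInfo);
__tgt_rtl_run_target_region_async(Dev, Entry, Args, Offsets, NumArgs,
                                  AsyncInfo);
__tgt_rtl_record_event(Dev, AsyncInfo);             // snapshot the queue tail
while (__tgt_rtl_check_event(Dev, AsyncInfo) == OFFLOAD_NOT_DONE)
  ; // poll; a real caller yields to other tasks, or blocks via
    // __tgt_rtl_synchronize(Dev, AsyncInfo)
__tgt_rtl_release_async_info(Dev, AsyncInfo);       // frees event, queue, struct
```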
Index: openmp/libomptarget/plugins/cuda/src/rtl.cpp
===================================================================
--- openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -925,26 +925,120 @@
     return OFFLOAD_SUCCESS;
   }
 
+  // There are two handles that can be synchronized on, so always try the
+  // event first: if that succeeds, return directly; otherwise synchronize
+  // the stream.
   int synchronize(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const {
+    CUresult Err;
+
+    if (AsyncInfoPtr->Event) {
+      CUevent Event = reinterpret_cast<CUevent>(AsyncInfoPtr->Event);
+      Err = cuEventSynchronize(Event);
+      if (!checkResult(Err, "error returned from cuEventSynchronize"))
+        return OFFLOAD_FAIL;
+
+      return OFFLOAD_SUCCESS;
+    }
+
+    assert(AsyncInfoPtr->Queue && "AsyncInfoPtr->Queue is nullptr");
+
     CUstream Stream = reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
-    CUresult Err = cuStreamSynchronize(Stream);
-    if (Err != CUDA_SUCCESS) {
-      DP("Error when synchronizing stream. stream = " DPxMOD
-         ", async info ptr = " DPxMOD "\n",
-         DPxPTR(Stream), DPxPTR(AsyncInfoPtr));
-      CUDA_ERR_STRING(Err);
+    Err = cuStreamSynchronize(Stream);
+    if (!checkResult(Err, "error returned from cuStreamSynchronize"))
       return OFFLOAD_FAIL;
-    }
 
-    // Once the stream is synchronized, return it to stream pool and reset
-    // async_info. This is to make sure the synchronization only works for its
-    // own tasks.
-    StreamManager->returnStream(
-        DeviceId, reinterpret_cast<CUstream>(AsyncInfoPtr->Queue));
+    StreamManager->returnStream(DeviceId, Stream);
     AsyncInfoPtr->Queue = nullptr;
 
     return OFFLOAD_SUCCESS;
   }
+
+  int releaseAsyncInfo(int DeviceId, __tgt_async_info *AsyncInfo) const {
+    if (AsyncInfo->Queue) {
+      StreamManager->returnStream(
+          DeviceId, reinterpret_cast<CUstream>(AsyncInfo->Queue));
+      AsyncInfo->Queue = nullptr;
+    }
+
+    if (AsyncInfo->Event) {
+      CUresult Err =
+          cuEventDestroy(reinterpret_cast<CUevent>(AsyncInfo->Event));
+      if (!checkResult(Err, "error returned from cuEventDestroy"))
+        return OFFLOAD_FAIL;
+      AsyncInfo->Event = nullptr;
+    }
+
+    delete AsyncInfo;
+
+    return OFFLOAD_SUCCESS;
+  }
+
+  int waitEvent(int DeviceID, __tgt_async_info *AsyncInfo,
+                __tgt_async_info *DepAsyncInfo) const {
+    CUstream Stream = getStream(DeviceID, AsyncInfo);
+    CUevent Event = reinterpret_cast<CUevent>(DepAsyncInfo->Event);
+
+    CUresult Err = cuStreamWaitEvent(Stream, Event, 0);
+    if (!checkResult(Err, "error returned from cuStreamWaitEvent"))
+      return OFFLOAD_FAIL;
+
+    return OFFLOAD_SUCCESS;
+  }
+
+  int recordEvent(int DeviceId, __tgt_async_info *AsyncInfoPtr) const {
+    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
+    CUevent Event;
+    CUresult Err;
+
+    if (AsyncInfoPtr->Event == nullptr) {
+      Err = cuEventCreate(&Event, CU_EVENT_DISABLE_TIMING);
+      if (!checkResult(Err, "error returned from cuEventCreate"))
+        return OFFLOAD_FAIL;
+      AsyncInfoPtr->Event = Event;
+    } else {
+      Event = reinterpret_cast<CUevent>(AsyncInfoPtr->Event);
+    }
+
+    Err = cuEventRecord(Event, Stream);
+    if (!checkResult(Err, "error returned from cuEventRecord"))
+      return OFFLOAD_FAIL;
+
+    return OFFLOAD_SUCCESS;
+  }
+
+  int checkEvent(int DeviceId, __tgt_async_info *AsyncInfoPtr) const {
+    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
+    if (!checkResult(Err, "error returned from cuCtxSetCurrent"))
+      return OFFLOAD_FAIL;
+
+    CUevent Event = reinterpret_cast<CUevent>(AsyncInfoPtr->Event);
+    Err = cuEventQuery(Event);
+    // The event has been fulfilled.
+    if (Err == CUDA_SUCCESS)
+      return OFFLOAD_SUCCESS;
+    // The event has not been fulfilled yet.
+    if (Err == CUDA_ERROR_NOT_READY)
+      return OFFLOAD_NOT_DONE;
+    // Other errors.
+    checkResult(Err, "error returned from cuEventQuery");
+    return OFFLOAD_FAIL;
+  }
+
+  int initAsyncInfo(int DeviceId, __tgt_async_info **AsyncInfo) const {
+    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
+    if (!checkResult(Err, "error returned from cuCtxSetCurrent"))
+      return OFFLOAD_FAIL;
+
+    __tgt_async_info *P = new __tgt_async_info;
+    getStream(DeviceId, P);
+    CUevent Event;
+    Err = cuEventCreate(&Event, CU_EVENT_DISABLE_TIMING);
+    if (!checkResult(Err, "error returned from cuEventCreate"))
+      return OFFLOAD_FAIL;
+    P->Event = Event;
+    *AsyncInfo = P;
+    return OFFLOAD_SUCCESS;
+  }
 };
 
 DeviceRTLTy DeviceRTL;
@@ -1130,13 +1224,54 @@
                                           async_info_ptr);
 }
 
-int32_t __tgt_rtl_synchronize(int32_t device_id,
-                              __tgt_async_info *async_info_ptr) {
+int32_t __tgt_rtl_release_async_info(int32_t device_id,
+                                     __tgt_async_info *async_info) {
   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
-  assert(async_info_ptr && "async_info_ptr is nullptr");
-  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
+  assert(async_info && "async_info is nullptr");
+
+  return DeviceRTL.releaseAsyncInfo(device_id, async_info);
+}
+
+int32_t __tgt_rtl_wait_event(int32_t device_id, __tgt_async_info *async_info,
+                             __tgt_async_info *dep_async_info) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+  assert(async_info && "async_info is nullptr");
+  assert(dep_async_info->Event && "dep_async_info->Event is nullptr");
+
+  return DeviceRTL.waitEvent(device_id, async_info, dep_async_info);
+}
+
+int32_t __tgt_rtl_record_event(int32_t device_id,
+                               __tgt_async_info *async_info) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+  assert(async_info && "async_info is nullptr");
+  assert(async_info->Queue && "async_info->Queue is nullptr");
+
+  return DeviceRTL.recordEvent(device_id, async_info);
+}
+
+int32_t __tgt_rtl_synchronize(int32_t device_id,
+                              __tgt_async_info *async_info) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+  assert(async_info && "async_info is nullptr");
+  assert((async_info->Event || async_info->Queue) &&
+         "Both async_info->Event and async_info->Queue are nullptr");
+
+  return DeviceRTL.synchronize(device_id, async_info);
+}
+
+int32_t __tgt_rtl_check_event(int32_t device_id,
+                              __tgt_async_info *async_info) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+  assert(async_info && "async_info is nullptr");
+  assert(async_info->Event && "async_info->Event is nullptr");
+
+  return DeviceRTL.checkEvent(device_id, async_info);
+}
+
+int32_t __tgt_rtl_initialize_async_info(int32_t device_id,
+                                        __tgt_async_info **async_info) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+  assert(async_info && "async_info is nullptr");
 
-  return DeviceRTL.synchronize(device_id, async_info_ptr);
+  return DeviceRTL.initAsyncInfo(device_id, async_info);
 }
 
 #ifdef __cplusplus
Index: openmp/libomptarget/plugins/exports
===================================================================
--- openmp/libomptarget/plugins/exports
+++ openmp/libomptarget/plugins/exports
@@ -18,7 +18,12 @@
     __tgt_rtl_run_target_team_region_async;
     __tgt_rtl_run_target_region;
     __tgt_rtl_run_target_region_async;
+    __tgt_rtl_initialize_async_info;
+    __tgt_rtl_release_async_info;
+    __tgt_rtl_wait_event;
+    __tgt_rtl_record_event;
     __tgt_rtl_synchronize;
+    __tgt_rtl_check_event;
   local:
     *;
 };
Index: openmp/libomptarget/src/device.h
===================================================================
--- openmp/libomptarget/src/device.h
+++ openmp/libomptarget/src/device.h
@@ -198,6 +198,10 @@
                       uint64_t LoopTripCount,
                       __tgt_async_info *AsyncInfoPtr);
 
+  // Functions for initialization and release of async info
+  int32_t initAsyncInfo(__tgt_async_info **AsyncInfo);
+  int32_t releaseAsyncInfo(__tgt_async_info *AsyncInfo);
+
 private:
   // Call to RTL
   void init(); // To be called only via DeviceTy::initOnce()
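On the libomptarget side, the two new `DeviceTy` helpers are meant to bracket a batch of asynchronous operations. A sketch of the intended pairing (illustrative only, assuming a valid `DeviceID`); note that ownership of the handle can instead pass to the task dependency graph, in which case `__kmpc_free_async_info` releases it later by routing through the `DeviceID` stored inside the handle:

```cpp
DeviceTy &Device = Devices[DeviceID];
__tgt_async_info *AsyncInfo = nullptr;
if (Device.initAsyncInfo(&AsyncInfo) != OFFLOAD_SUCCESS)
  return OFFLOAD_FAIL; // plugin has no async support, or allocation failed
// ... enqueue data transfers / kernel launches passing AsyncInfo ...
Device.releaseAsyncInfo(AsyncInfo); // only if no dependent task owns it
```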
Index: openmp/libomptarget/src/device.cpp
===================================================================
--- openmp/libomptarget/src/device.cpp
+++ openmp/libomptarget/src/device.cpp
@@ -146,7 +146,7 @@
   return lr;
 }
 
-// Used by target_data_begin
+// Used by targetDataBegin
 // Return the target pointer begin (where the data will be moved).
 // Allocate memory if this is the first occurrence of this mapping.
 // Increment the reference counter.
@@ -213,9 +213,9 @@
   return rc;
 }
 
-// Used by target_data_begin, target_data_end, target_data_update and target.
+// Used by targetDataBegin, targetDataEnd, targetDataUpdate and target.
 // Return the target pointer begin (where the data will be moved).
-// Decrement the reference counter if called from target_data_end.
+// Decrement the reference counter if called from targetDataEnd.
 void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast,
                                bool UpdateRefCount, bool &IsHostPtr) {
   void *rc = NULL;
@@ -335,7 +335,7 @@
 // Submit data to device
 int32_t DeviceTy::data_submit(void *TgtPtrBegin, void *HstPtrBegin,
                               int64_t Size, __tgt_async_info *AsyncInfoPtr) {
-  if (!AsyncInfoPtr || !RTL->data_submit_async || !RTL->synchronize)
+  if (!AsyncInfoPtr || !RTL->data_submit_async)
    return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size);
   else
     return RTL->data_submit_async(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size,
@@ -345,7 +345,7 @@
 // Retrieve data from device
 int32_t DeviceTy::data_retrieve(void *HstPtrBegin, void *TgtPtrBegin,
                                 int64_t Size, __tgt_async_info *AsyncInfoPtr) {
-  if (!AsyncInfoPtr || !RTL->data_retrieve_async || !RTL->synchronize)
+  if (!AsyncInfoPtr || !RTL->data_retrieve_async)
     return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size);
   else
     return RTL->data_retrieve_async(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size,
@@ -355,7 +355,7 @@
 // Copy data from current device to destination device directly
 int32_t DeviceTy::data_exchange(void *SrcPtr, DeviceTy DstDev, void *DstPtr,
                                 int64_t Size, __tgt_async_info *AsyncInfoPtr) {
-  if (!AsyncInfoPtr || !RTL->data_exchange_async || !RTL->synchronize) {
+  if (!AsyncInfoPtr || !RTL->data_exchange_async) {
     assert(RTL->data_exchange && "RTL->data_exchange is nullptr");
     return RTL->data_exchange(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, DstPtr,
                               Size);
@@ -368,7 +368,7 @@
 int32_t DeviceTy::run_region(void *TgtEntryPtr, void **TgtVarsPtr,
                              ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,
                              __tgt_async_info *AsyncInfoPtr) {
-  if (!AsyncInfoPtr || !RTL->run_region || !RTL->synchronize)
+  if (!AsyncInfoPtr || !RTL->run_region)
     return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets,
                            TgtVarsSize);
   else
@@ -382,7 +382,7 @@
                                   int32_t NumTeams, int32_t ThreadLimit,
                                   uint64_t LoopTripCount,
                                   __tgt_async_info *AsyncInfoPtr) {
-  if (!AsyncInfoPtr || !RTL->run_team_region_async || !RTL->synchronize)
+  if (!AsyncInfoPtr || !RTL->run_team_region_async)
     return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr,
                                 TgtOffsets, TgtVarsSize, NumTeams, ThreadLimit,
                                 LoopTripCount);
@@ -404,6 +404,28 @@
   return false;
 }
 
+int32_t DeviceTy::initAsyncInfo(__tgt_async_info **AsyncInfo) {
+  if (!RTL->AsyncSupported) {
+    DP("Asynchronous offloading is not supported\n");
+    return OFFLOAD_FAIL;
+  }
+
+  if (RTL->initAsyncInfo(RTLDeviceID, AsyncInfo) != OFFLOAD_SUCCESS)
+    return OFFLOAD_FAIL;
+  (*AsyncInfo)->DeviceID = DeviceID;
+
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t DeviceTy::releaseAsyncInfo(__tgt_async_info *AsyncInfo) {
+  if (!RTL->AsyncSupported) {
+    DP("Asynchronous offloading is not supported\n");
+    return OFFLOAD_FAIL;
+  }
+
+  return RTL->releaseAsyncInfo(RTLDeviceID, AsyncInfo);
+}
+
 /// Check whether a device has an associated RTL and initialize it if it's not
 /// already initialized.
 bool device_is_ready(int device_num) {
Index: openmp/libomptarget/src/exports
===================================================================
--- openmp/libomptarget/src/exports
+++ openmp/libomptarget/src/exports
@@ -25,7 +25,7 @@
     omp_target_associate_ptr;
     omp_target_disassociate_ptr;
     __kmpc_push_target_tripcount;
+    __kmpc_free_async_info;
   local:
     *;
 };
-
Index: openmp/libomptarget/src/interface.cpp
===================================================================
--- openmp/libomptarget/src/interface.cpp
+++ openmp/libomptarget/src/interface.cpp
@@ -62,9 +62,39 @@
       break;
     case tgt_mandatory:
       if (!success) {
-        FATAL_MESSAGE0(1, "failure of target construct while offloading is mandatory");
+        FATAL_MESSAGE0(
+            1, "failure of target construct while offloading is mandatory");
       }
       break;
+  }
+}
+
+template <bool Begin> static bool checkAndInitDevice(int64_t &DeviceId) {
+  if (IsOffloadDisabled())
+    return false;
+
+  // No devices available?
+  if (DeviceId == OFFLOAD_DEVICE_DEFAULT) {
+    DeviceId = omp_get_default_device();
+    DP("Use default device id %" PRId64 "\n", DeviceId);
+  }
+
+  // We always expect a non-negative device id, and it must be smaller than
+  // the number of registered device RTLs.
+  if (DeviceId < 0 || static_cast<size_t>(DeviceId) >= Devices.size()) {
+    DP("Invalid device %" PRId64 "\n", DeviceId);
+    return false;
+  }
+
+  if (!Begin)
+    return true;
+
+  if (CheckDeviceAndCtors(DeviceId) != OFFLOAD_SUCCESS) {
+    DP("Failed to get device %" PRId64 " ready\n", DeviceId);
+    HandleTargetOutcome(false);
+    return false;
+  } else {
+    return true;
   }
 }
@@ -91,22 +121,12 @@
 /// and passes the data to the device.
 EXTERN void __tgt_target_data_begin(int64_t device_id, int32_t arg_num,
     void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) {
-  if (IsOffloadDisabled()) return;
+  // device_id will be corrected if it is the default value
+  if (!checkAndInitDevice<true>(device_id))
+    return;
 
   DP("Entering data begin region for device %" PRId64 " with %d mappings\n",
-      device_id, arg_num);
-
-  // No devices available?
-  if (device_id == OFFLOAD_DEVICE_DEFAULT) {
-    device_id = omp_get_default_device();
-    DP("Use default device id %" PRId64 "\n", device_id);
-  }
-
-  if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) {
-    DP("Failed to get device %" PRId64 " ready\n", device_id);
-    HandleTargetOutcome(false);
-    return;
-  }
+     device_id, arg_num);
 
   DeviceTy &Device = Devices[device_id];
 
@@ -118,8 +138,8 @@
   }
 #endif
 
-  int rc = target_data_begin(Device, arg_num, args_base, args, arg_sizes,
-      arg_types, nullptr);
+  const int rc = targetData<TargetDataFuncTy::Begin>(
+      Device, arg_num, args_base, args, arg_sizes, arg_types);
   HandleTargetOutcome(rc == OFFLOAD_SUCCESS);
 }
 
@@ -127,11 +147,19 @@
     void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types,
     int32_t depNum, void *depList, int32_t noAliasDepNum,
     void *noAliasDepList) {
-  if (depNum + noAliasDepNum > 0)
-    __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL));
+  // device_id will be corrected if it is the default value
+  if (!checkAndInitDevice<true>(device_id))
+    return;
+
+  DP("Entering data begin region for device %" PRId64 " with %d mappings\n",
+     device_id, arg_num);
+
+  DeviceTy &Device = Devices[device_id];
 
-  __tgt_target_data_begin(device_id, arg_num, args_base, args, arg_sizes,
-                          arg_types);
+  const int rc = targetDataNowait<TargetDataFuncTy::Begin>(
+      Device, arg_num, args_base, args, arg_sizes, arg_types, depNum, depList,
+      noAliasDepNum, noAliasDepList);
+  HandleTargetOutcome(rc == OFFLOAD_SUCCESS);
 }
 
 /// passes data from the target, releases target memory and destroys
 /// the host-target mapping (top entry from the stack of data maps)
 /// created by the last __tgt_target_data_begin.
 EXTERN void __tgt_target_data_end(int64_t device_id, int32_t arg_num,
     void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) {
-  if (IsOffloadDisabled()) return;
-  DP("Entering data end region with %d mappings\n", arg_num);
+  // device_id will be corrected if it is the default value
+  if (!checkAndInitDevice<false>(device_id))
+    return;
 
-  // No devices available?
-  if (device_id == OFFLOAD_DEVICE_DEFAULT) {
-    device_id = omp_get_default_device();
-  }
+  DP("Entering data end region for device %" PRId64 " with %d mappings\n",
+     device_id, arg_num);
 
   RTLsMtx->lock();
   size_t Devices_size = Devices.size();
@@ -164,15 +191,15 @@
   }
 
 #ifdef OMPTARGET_DEBUG
-  for (int i=0; i<arg_num; ++i) {
-    DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
-        ", Type=0x%" PRIx64 "\n", i, DPxPTR(args_base[i]), DPxPTR(args[i]),
-        arg_sizes[i], arg_types[i]);
+  for (int i = 0; i < arg_num; ++i) {
+    DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
+       ", Type=0x%" PRIx64 "\n",
+       i, DPxPTR(args_base[i]), DPxPTR(args[i]), arg_sizes[i], arg_types[i]);
   }
 #endif
 
-  int rc = target_data_end(Device, arg_num, args_base, args, arg_sizes,
-      arg_types, nullptr);
+  const int rc = targetData<TargetDataFuncTy::End>(Device, arg_num, args_base,
+                                                   args, arg_sizes, arg_types);
   HandleTargetOutcome(rc == OFFLOAD_SUCCESS);
 }
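All three synchronous data entry points now funnel into one templated helper; for instance, what used to be a `target_data_update` call becomes:

```cpp
const int rc = targetData<TargetDataFuncTy::Update>(
    Device, arg_num, args_base, args, arg_sizes, arg_types);
```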
@@ -180,32 +207,33 @@
     void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types,
     int32_t depNum, void *depList, int32_t noAliasDepNum,
     void *noAliasDepList) {
-  if (depNum + noAliasDepNum > 0)
-    __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL));
+  // device_id will be corrected if it is the default value
+  if (!checkAndInitDevice<false>(device_id))
+    return;
 
-  __tgt_target_data_end(device_id, arg_num, args_base, args, arg_sizes,
-                        arg_types);
+  DP("Entering data end region for device %" PRId64 " with %d mappings\n",
+     device_id, arg_num);
+
+  DeviceTy &Device = Devices[device_id];
+
+  const int rc = targetDataNowait<TargetDataFuncTy::End>(
+      Device, arg_num, args_base, args, arg_sizes, arg_types, depNum, depList,
+      noAliasDepNum, noAliasDepList);
+  HandleTargetOutcome(rc == OFFLOAD_SUCCESS);
 }
 
 EXTERN void __tgt_target_data_update(int64_t device_id, int32_t arg_num,
     void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) {
-  if (IsOffloadDisabled()) return;
-  DP("Entering data update with %d mappings\n", arg_num);
-
-  // No devices available?
-  if (device_id == OFFLOAD_DEVICE_DEFAULT) {
-    device_id = omp_get_default_device();
-  }
-
-  if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) {
-    DP("Failed to get device %" PRId64 " ready\n", device_id);
-    HandleTargetOutcome(false);
+  // device_id will be corrected if it is the default value
+  if (!checkAndInitDevice<true>(device_id))
     return;
-  }
-  DeviceTy& Device = Devices[device_id];
-  int rc = target_data_update(Device, arg_num, args_base,
-      args, arg_sizes, arg_types);
+
+  DP("Entering data update region for device %" PRId64 " with %d mappings\n",
+     device_id, arg_num);
+
+  DeviceTy &Device = Devices[device_id];
+  const int rc = targetData<TargetDataFuncTy::Update>(
+      Device, arg_num, args_base, args, arg_sizes, arg_types);
   HandleTargetOutcome(rc == OFFLOAD_SUCCESS);
 }
 
@@ -213,39 +241,47 @@
 EXTERN void __tgt_target_data_update_nowait(
     int64_t device_id, int32_t arg_num, void **args_base, void **args,
     int64_t *arg_sizes, int64_t *arg_types, int32_t depNum, void *depList,
     int32_t noAliasDepNum, void *noAliasDepList) {
-  if (depNum + noAliasDepNum > 0)
-    __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL));
+  // device_id will be corrected if it is the default value
+  if (!checkAndInitDevice<true>(device_id))
+    return;
+
+  DP("Entering data update region for device %" PRId64 " with %d mappings\n",
+     device_id, arg_num);
+
+  DeviceTy &Device = Devices[device_id];
 
-  __tgt_target_data_update(device_id, arg_num, args_base, args, arg_sizes,
-                           arg_types);
+  const int rc = targetDataNowait<TargetDataFuncTy::Update>(
+      Device, arg_num, args_base, args, arg_sizes, arg_types, depNum, depList,
+      noAliasDepNum, noAliasDepList);
+  HandleTargetOutcome(rc == OFFLOAD_SUCCESS);
 }
 
 EXTERN int __tgt_target(int64_t device_id, void *host_ptr, int32_t arg_num,
     void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) {
-  if (IsOffloadDisabled()) return OFFLOAD_FAIL;
-  DP("Entering target region with entry point " DPxMOD " and device Id %"
-      PRId64 "\n", DPxPTR(host_ptr), device_id);
-
-  if (device_id == OFFLOAD_DEVICE_DEFAULT) {
-    device_id = omp_get_default_device();
-  }
-
-  if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) {
-    DP("Failed to get device %" PRId64 " ready\n", device_id);
-    HandleTargetOutcome(false);
+  // device_id will be corrected if it is the default value
+  if (!checkAndInitDevice<true>(device_id))
    return OFFLOAD_FAIL;
-  }
+
+  DP("Entering target region with entry point " DPxMOD " and device Id %" PRId64
+     "\n",
+     DPxPTR(host_ptr), device_id);
 
 #ifdef OMPTARGET_DEBUG
-  for (int i=0; i<arg_num; ++i) {
-    DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
-        ", Type=0x%" PRIx64 "\n", i, DPxPTR(args_base[i]), DPxPTR(args[i]),
-        arg_sizes[i], arg_types[i]);
+  for (int i = 0; i < arg_num; ++i) {
+    DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
+       ", Type=0x%" PRIx64 "\n",
+       i, DPxPTR(args_base[i]), DPxPTR(args[i]), arg_sizes[i], arg_types[i]);
   }
 #endif
 
-  int rc = target(device_id, host_ptr, arg_num, args_base, args, arg_sizes,
-      arg_types, 0, 0, false /*team*/);
+  const int rc =
+      target(device_id, host_ptr, arg_num, args_base, args, arg_sizes,
+             arg_types, /* TeamNum */ 0, /* ThreadLimit */ 0,
+             /* IsTeamConstruct */ false);
   HandleTargetOutcome(rc == OFFLOAD_SUCCESS);
   return rc;
 }
 
 EXTERN int __tgt_target_nowait(int64_t device_id, void *host_ptr,
     int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes,
     int64_t *arg_types, int32_t depNum, void *depList, int32_t noAliasDepNum,
     void *noAliasDepList) {
-  if (depNum + noAliasDepNum > 0)
-    __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL));
+  // device_id will be corrected if it is the default value
+  if (!checkAndInitDevice<true>(device_id))
+    return OFFLOAD_FAIL;
+
+  DP("Entering target region with entry point " DPxMOD " and device Id %" PRId64
+     "\n",
+     DPxPTR(host_ptr), device_id);
 
-  return __tgt_target(device_id, host_ptr, arg_num, args_base, args, arg_sizes,
-                      arg_types);
+#ifdef OMPTARGET_DEBUG
+  for (int i = 0; i < arg_num; ++i) {
+    DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
+       ", Type=0x%" PRIx64 "\n",
+       i, DPxPTR(args_base[i]), DPxPTR(args[i]), arg_sizes[i], arg_types[i]);
  }
+#endif
+
+  const int rc = targetNowait(
+      device_id, host_ptr, arg_num, args_base, args, arg_sizes, arg_types,
+      /* TeamNum */ 0, /* ThreadLimit */ 0, /* IsTeamConstruct */ false, depNum,
+      depList, noAliasDepNum, noAliasDepList);
+  HandleTargetOutcome(rc == OFFLOAD_SUCCESS);
+  return rc;
 }
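To illustrate what this changes for user code (simplified; the real compiler codegen outlines the region into a task, and the exact arguments are hypothetical), a `target nowait` region like the following used to be serialized with a blocking `__kmpc_omp_taskwait` and now flows through `targetNowait` with its dependence list intact:

```cpp
#pragma omp target map(tofrom : x) nowait depend(inout : x)
{ x += 1; }
// roughly lowers to a task whose body ends up calling:
//   __tgt_target_nowait(device_id, /* outlined entry */ host_ptr,
//                       /* arg_num */ 1, args_base, args, arg_sizes, arg_types,
//                       depNum, depList, noAliasDepNum, noAliasDepList);
```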
 EXTERN int __tgt_target_teams(int64_t device_id, void *host_ptr,
     int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes,
     int64_t *arg_types, int32_t team_num, int32_t thread_limit) {
-  if (IsOffloadDisabled()) return OFFLOAD_FAIL;
-  DP("Entering target region with entry point " DPxMOD " and device Id %"
-      PRId64 "\n", DPxPTR(host_ptr), device_id);
-
-  if (device_id == OFFLOAD_DEVICE_DEFAULT) {
-    device_id = omp_get_default_device();
-  }
-
-  if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) {
-    DP("Failed to get device %" PRId64 " ready\n", device_id);
-    HandleTargetOutcome(false);
+  // device_id will be corrected if it is the default value
+  if (!checkAndInitDevice<true>(device_id))
     return OFFLOAD_FAIL;
-  }
+
+  DP("Entering target region with entry point " DPxMOD " and device Id %" PRId64
+     "\n",
+     DPxPTR(host_ptr), device_id);
 
 #ifdef OMPTARGET_DEBUG
-  for (int i=0; i<arg_num; ++i) {
-    DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
-        ", Type=0x%" PRIx64 "\n", i, DPxPTR(args_base[i]), DPxPTR(args[i]),
-        arg_sizes[i], arg_types[i]);
+  for (int i = 0; i < arg_num; ++i) {
+    DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
+       ", Type=0x%" PRIx64 "\n",
+       i, DPxPTR(args_base[i]), DPxPTR(args[i]), arg_sizes[i], arg_types[i]);
   }
 #endif
 
-  int rc = target(device_id, host_ptr, arg_num, args_base, args, arg_sizes,
-      arg_types, team_num, thread_limit, true /*team*/);
+  const int rc =
+      target(device_id, host_ptr, arg_num, args_base, args, arg_sizes,
+             arg_types, team_num, thread_limit, /* IsTeamConstruct */ true);
   HandleTargetOutcome(rc == OFFLOAD_SUCCESS);
   return rc;
 }
 
 EXTERN int __tgt_target_teams_nowait(int64_t device_id, void *host_ptr,
     int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes,
     int64_t *arg_types, int32_t team_num, int32_t thread_limit, int32_t depNum,
     void *depList, int32_t noAliasDepNum, void *noAliasDepList) {
-  if (depNum + noAliasDepNum > 0)
-    __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL));
+  // device_id will be corrected if it is the default value
+  if (!checkAndInitDevice<true>(device_id))
+    return OFFLOAD_FAIL;
+
+  DP("Entering target region with entry point " DPxMOD " and device Id %" PRId64
+     "\n",
+     DPxPTR(host_ptr), device_id);
 
-  return __tgt_target_teams(device_id, host_ptr, arg_num, args_base, args,
-                            arg_sizes, arg_types, team_num, thread_limit);
+#ifdef OMPTARGET_DEBUG
+  for (int i = 0; i < arg_num; ++i) {
+    DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
+       ", Type=0x%" PRIx64 "\n",
+       i, DPxPTR(args_base[i]), DPxPTR(args[i]), arg_sizes[i], arg_types[i]);
+  }
+#endif
+
+  const int rc = targetNowait(device_id, host_ptr, arg_num, args_base, args,
+                              arg_sizes, arg_types, team_num, thread_limit,
+                              /* IsTeamConstruct */ true, depNum, depList,
+                              noAliasDepNum, noAliasDepList);
+  HandleTargetOutcome(rc == OFFLOAD_SUCCESS);
+  return rc;
 }
 
 // Get the current number of components for a user-defined mapper.
@@ -342,9 +407,20 @@
   }
 
   DP("__kmpc_push_target_tripcount(%" PRId64 ", %" PRIu64 ")\n", device_id,
-      loop_tripcount);
+     loop_tripcount);
   TblMapMtx->lock();
   Devices[device_id].LoopTripCnt.emplace(__kmpc_global_thread_num(NULL),
                                          loop_tripcount);
   TblMapMtx->unlock();
 }
+
+EXTERN void __kmpc_free_async_info(void *Ptr) {
+  if (!Ptr)
+    return;
+
+  __tgt_async_info *AsyncInfo = reinterpret_cast<__tgt_async_info *>(Ptr);
+  int DeviceId = AsyncInfo->DeviceID;
+
+  assert(DeviceId >= 0 && "Invalid DeviceId");
+
+  Devices[DeviceId].releaseAsyncInfo(AsyncInfo);
+}
Index: openmp/libomptarget/src/omptarget.cpp
===================================================================
--- openmp/libomptarget/src/omptarget.cpp
+++ openmp/libomptarget/src/omptarget.cpp
@@ -166,8 +166,8 @@
       DP("Has pending ctors...
call now\n"); for (auto &entry : lib.second.PendingCtors) { void *ctor = entry; - int rc = target(device_id, ctor, 0, NULL, NULL, NULL, - NULL, 1, 1, true /*team*/); + int rc = target(device_id, ctor, 0, NULL, NULL, NULL, NULL, 1, 1, + /*IsTeamConstruct*/ true); if (rc != OFFLOAD_SUCCESS) { DP("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor)); Device.PendingGlobalsMtx.unlock(); @@ -214,10 +214,26 @@ return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1; } +static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ | + OMP_TGT_MAPTYPE_LITERAL | + OMP_TGT_MAPTYPE_IMPLICIT; +static bool isLambdaMapping(int64_t Mapping) { + return (Mapping & LambdaMapping) == LambdaMapping; +} + +// Runtime functions from libomp +extern "C" { +void __kmpc_get_target_task_waiting_list(void **list, int *num); +int __kmpc_set_async_info(void *async_info); +void __kmpc_target_task_yield(); +int __kmpc_get_target_task_npredecessors(); +} + +namespace { /// Internal function to do the mapping and transfer the data to the device -int target_data_begin(DeviceTy &Device, int32_t arg_num, void **args_base, - void **args, int64_t *arg_sizes, int64_t *arg_types, - __tgt_async_info *async_info_ptr) { +int targetDataBegin(DeviceTy &Device, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int64_t *arg_types, + __tgt_async_info *AsyncInfo) { // process each input. for (int32_t i = 0; i < arg_num; ++i) { // Ignore private variables and arrays - there is no mapping for them. @@ -233,14 +249,15 @@ // Look at the next argument - if that is MEMBER_OF this one, then this one // is a combined entry. int64_t padding = 0; - const int next_i = i+1; + const int next_i = i + 1; if (member_of(arg_types[i]) < 0 && next_i < arg_num && member_of(arg_types[next_i]) == i) { padding = (int64_t)HstPtrBegin % alignment; if (padding) { DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD - "\n", padding, DPxPTR(HstPtrBegin)); - HstPtrBegin = (char *) HstPtrBegin - padding; + "\n", + padding, DPxPTR(HstPtrBegin)); + HstPtrBegin = (char *)HstPtrBegin - padding; data_size += padding; } } @@ -262,35 +279,37 @@ if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) { DP("Has a pointer entry: \n"); // base is address of pointer. - Pointer_TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBase, HstPtrBase, - sizeof(void *), Pointer_IsNew, IsHostPtr, IsImplicit, UpdateRef, - HasCloseModifier); + Pointer_TgtPtrBegin = Device.getOrAllocTgtPtr( + HstPtrBase, HstPtrBase, sizeof(void *), Pointer_IsNew, IsHostPtr, + IsImplicit, UpdateRef, HasCloseModifier); if (!Pointer_TgtPtrBegin) { DP("Call to getOrAllocTgtPtr returned null pointer (device failure or " - "illegal mapping).\n"); + "illegal mapping).\n"); return OFFLOAD_FAIL; } DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new" - "\n", sizeof(void *), DPxPTR(Pointer_TgtPtrBegin), - (Pointer_IsNew ? "" : " not")); + "\n", + sizeof(void *), DPxPTR(Pointer_TgtPtrBegin), + (Pointer_IsNew ? "" : " not")); Pointer_HstPtrBegin = HstPtrBase; // modify current entry. 
HstPtrBase = *(void **)HstPtrBase; UpdateRef = true; // subsequently update ref count of pointee } - void *TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBegin, HstPtrBase, - data_size, IsNew, IsHostPtr, IsImplicit, UpdateRef, HasCloseModifier); + void *TgtPtrBegin = Device.getOrAllocTgtPtr( + HstPtrBegin, HstPtrBase, data_size, IsNew, IsHostPtr, IsImplicit, + UpdateRef, HasCloseModifier); if (!TgtPtrBegin && data_size) { // If data_size==0, then the argument could be a zero-length pointer to // NULL, so getOrAlloc() returning NULL is not an error. DP("Call to getOrAllocTgtPtr returned null pointer (device failure or " - "illegal mapping).\n"); + "illegal mapping).\n"); return OFFLOAD_FAIL; } DP("There are %" PRId64 " bytes allocated at target address " DPxMOD - " - is%s new\n", data_size, DPxPTR(TgtPtrBegin), - (IsNew ? "" : " not")); + " - is%s new\n", + data_size, DPxPTR(TgtPtrBegin), (IsNew ? "" : " not")); if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) { uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase; @@ -319,8 +338,8 @@ if (copy && !IsHostPtr) { DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); - int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, data_size, - async_info_ptr); + int rt = + Device.data_submit(TgtPtrBegin, HstPtrBegin, data_size, AsyncInfo); if (rt != OFFLOAD_SUCCESS) { DP("Copying data to device failed.\n"); return OFFLOAD_FAIL; @@ -330,19 +349,19 @@ if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) { DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n", - DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin)); + DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin)); uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; void *TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta); int rt = Device.data_submit(Pointer_TgtPtrBegin, &TgtPtrBase, - sizeof(void *), async_info_ptr); + sizeof(void *), AsyncInfo); if (rt != OFFLOAD_SUCCESS) { DP("Copying data to device failed.\n"); return OFFLOAD_FAIL; } // create shadow pointers for this entry Device.ShadowMtx.lock(); - Device.ShadowPtrMap[Pointer_HstPtrBegin] = {HstPtrBase, - Pointer_TgtPtrBegin, TgtPtrBase}; + Device.ShadowPtrMap[Pointer_HstPtrBegin] = { + HstPtrBase, Pointer_TgtPtrBegin, TgtPtrBase}; Device.ShadowMtx.unlock(); } } @@ -351,9 +370,9 @@ } /// Internal function to undo the mapping and retrieve the data from the device. -int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base, - void **args, int64_t *arg_sizes, int64_t *arg_types, - __tgt_async_info *async_info_ptr) { +int targetDataEnd(DeviceTy &Device, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int64_t *arg_types, + __tgt_async_info *AsyncInfo) { // process each input. for (int32_t i = arg_num - 1; i >= 0; --i) { // Ignore private variables and arrays - there is no mapping for them. @@ -368,30 +387,31 @@ // Look at the next argument - if that is MEMBER_OF this one, then this one // is a combined entry. 
int64_t padding = 0; - const int next_i = i+1; + const int next_i = i + 1; if (member_of(arg_types[i]) < 0 && next_i < arg_num && member_of(arg_types[next_i]) == i) { padding = (int64_t)HstPtrBegin % alignment; if (padding) { DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD - "\n", padding, DPxPTR(HstPtrBegin)); - HstPtrBegin = (char *) HstPtrBegin - padding; + "\n", + padding, DPxPTR(HstPtrBegin)); + HstPtrBegin = (char *)HstPtrBegin - padding; data_size += padding; } } bool IsLast, IsHostPtr; bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) || - (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ); + (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ); bool ForceDelete = arg_types[i] & OMP_TGT_MAPTYPE_DELETE; bool HasCloseModifier = arg_types[i] & OMP_TGT_MAPTYPE_CLOSE; // If PTR_AND_OBJ, HstPtrBegin is address of pointee void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, data_size, IsLast, - UpdateRef, IsHostPtr); + UpdateRef, IsHostPtr); DP("There are %" PRId64 " bytes allocated at target address " DPxMOD - " - is%s last\n", data_size, DPxPTR(TgtPtrBegin), - (IsLast ? "" : " not")); + " - is%s last\n", + data_size, DPxPTR(TgtPtrBegin), (IsLast ? "" : " not")); bool DelEntry = IsLast || ForceDelete; @@ -425,7 +445,7 @@ DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", data_size, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, data_size, - async_info_ptr); + AsyncInfo); if (rt != OFFLOAD_SUCCESS) { DP("Copying data from device failed.\n"); return OFFLOAD_FAIL; @@ -437,27 +457,27 @@ // need to restore the original host pointer values from their shadow // copies. If the struct is going to be deallocated, remove any remaining // shadow pointer entries for this struct. - uintptr_t lb = (uintptr_t) HstPtrBegin; - uintptr_t ub = (uintptr_t) HstPtrBegin + data_size; + uintptr_t lb = (uintptr_t)HstPtrBegin; + uintptr_t ub = (uintptr_t)HstPtrBegin + data_size; Device.ShadowMtx.lock(); for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin(); it != Device.ShadowPtrMap.end();) { - void **ShadowHstPtrAddr = (void**) it->first; + void **ShadowHstPtrAddr = (void **)it->first; // An STL map is sorted on its keys; use this property // to quickly determine when to break out of the loop. - if ((uintptr_t) ShadowHstPtrAddr < lb) { + if ((uintptr_t)ShadowHstPtrAddr < lb) { ++it; continue; } - if ((uintptr_t) ShadowHstPtrAddr >= ub) + if ((uintptr_t)ShadowHstPtrAddr >= ub) break; // If we copied the struct to the host, we need to restore the pointer. if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) { DP("Restoring original host pointer value " DPxMOD " for host " - "pointer " DPxMOD "\n", DPxPTR(it->second.HstPtrVal), - DPxPTR(ShadowHstPtrAddr)); + "pointer " DPxMOD "\n", + DPxPTR(it->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr)); *ShadowHstPtrAddr = it->second.HstPtrVal; } // If the struct is to be deallocated, remove the shadow entry. @@ -486,8 +506,9 @@ } /// Internal function to pass data to/from the target. -int target_data_update(DeviceTy &Device, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { +int targetDataUpdate(DeviceTy &Device, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int64_t *arg_types, + __tgt_async_info *AsyncInfo) { // process each input. 
for (int32_t i = 0; i < arg_num; ++i) { if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) || @@ -497,10 +518,11 @@ void *HstPtrBegin = args[i]; int64_t MapSize = arg_sizes[i]; bool IsLast, IsHostPtr; - void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, MapSize, IsLast, - false, IsHostPtr); + void *TgtPtrBegin = + Device.getTgtPtrBegin(HstPtrBegin, MapSize, IsLast, false, IsHostPtr); if (!TgtPtrBegin) { - DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin)); + DP("hst data:" DPxMOD " not found, becomes a noop\n", + DPxPTR(HstPtrBegin)); continue; } @@ -513,26 +535,27 @@ if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) { DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", - arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); - int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, MapSize, nullptr); + arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); + int rt = + Device.data_retrieve(HstPtrBegin, TgtPtrBegin, MapSize, AsyncInfo); if (rt != OFFLOAD_SUCCESS) { DP("Copying data from device failed.\n"); return OFFLOAD_FAIL; } - uintptr_t lb = (uintptr_t) HstPtrBegin; - uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize; + uintptr_t lb = (uintptr_t)HstPtrBegin; + uintptr_t ub = (uintptr_t)HstPtrBegin + MapSize; Device.ShadowMtx.lock(); for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin(); - it != Device.ShadowPtrMap.end(); ++it) { - void **ShadowHstPtrAddr = (void**) it->first; - if ((uintptr_t) ShadowHstPtrAddr < lb) + it != Device.ShadowPtrMap.end(); ++it) { + void **ShadowHstPtrAddr = (void **)it->first; + if ((uintptr_t)ShadowHstPtrAddr < lb) continue; - if ((uintptr_t) ShadowHstPtrAddr >= ub) + if ((uintptr_t)ShadowHstPtrAddr >= ub) break; - DP("Restoring original host pointer value " DPxMOD " for host pointer " - DPxMOD "\n", DPxPTR(it->second.HstPtrVal), - DPxPTR(ShadowHstPtrAddr)); + DP("Restoring original host pointer value " DPxMOD + " for host pointer " DPxMOD "\n", + DPxPTR(it->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr)); *ShadowHstPtrAddr = it->second.HstPtrVal; } Device.ShadowMtx.unlock(); @@ -540,28 +563,28 @@ if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", - arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); - int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, MapSize, nullptr); + arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); + int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, MapSize, AsyncInfo); if (rt != OFFLOAD_SUCCESS) { DP("Copying data to device failed.\n"); return OFFLOAD_FAIL; } - uintptr_t lb = (uintptr_t) HstPtrBegin; - uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize; + uintptr_t lb = (uintptr_t)HstPtrBegin; + uintptr_t ub = (uintptr_t)HstPtrBegin + MapSize; Device.ShadowMtx.lock(); for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin(); - it != Device.ShadowPtrMap.end(); ++it) { - void **ShadowHstPtrAddr = (void**) it->first; - if ((uintptr_t) ShadowHstPtrAddr < lb) + it != Device.ShadowPtrMap.end(); ++it) { + void **ShadowHstPtrAddr = (void **)it->first; + if ((uintptr_t)ShadowHstPtrAddr < lb) continue; - if ((uintptr_t) ShadowHstPtrAddr >= ub) + if ((uintptr_t)ShadowHstPtrAddr >= ub) break; DP("Restoring original target pointer value " DPxMOD " for target " - "pointer " DPxMOD "\n", DPxPTR(it->second.TgtPtrVal), - DPxPTR(it->second.TgtPtrAddr)); - rt = Device.data_submit(it->second.TgtPtrAddr, - &it->second.TgtPtrVal, sizeof(void *), nullptr); + "pointer " DPxMOD "\n", + DPxPTR(it->second.TgtPtrVal), 
DPxPTR(it->second.TgtPtrAddr)); + rt = Device.data_submit(it->second.TgtPtrAddr, &it->second.TgtPtrVal, + sizeof(void *), AsyncInfo); if (rt != OFFLOAD_SUCCESS) { DP("Copying data to device failed.\n"); Device.ShadowMtx.unlock(); @@ -574,22 +597,16 @@ return OFFLOAD_SUCCESS; } -static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ | - OMP_TGT_MAPTYPE_LITERAL | - OMP_TGT_MAPTYPE_IMPLICIT; -static bool isLambdaMapping(int64_t Mapping) { - return (Mapping & LambdaMapping) == LambdaMapping; -} - /// performs the same actions as data_begin in case arg_num is /// non-zero and initiates run of the offloaded region on the target platform; /// if arg_num is non-zero after the region execution is done it also /// performs the same action as data_update and data_end above. This function /// returns 0 if it was able to transfer the execution to a target and an /// integer different from zero otherwise. -int target(int64_t device_id, void *host_ptr, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, - int32_t team_num, int32_t thread_limit, int IsTeamConstruct) { +int target(int64_t device_id, void *host_ptr, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int64_t *arg_types, + int32_t team_num, int32_t thread_limit, int IsTeamConstruct, + __tgt_async_info *AsyncInfo) { DeviceTy &Device = Devices[device_id]; // Find the table information in the map or look it up in the translation @@ -643,13 +660,11 @@ TrlTblMtx->unlock(); assert(TargetTable && "Global data has not been mapped\n"); - __tgt_async_info AsyncInfo; - // Move data to device. - int rc = target_data_begin(Device, arg_num, args_base, args, arg_sizes, - arg_types, &AsyncInfo); + int rc = targetDataBegin(Device, arg_num, args_base, args, arg_sizes, + arg_types, AsyncInfo); if (rc != OFFLOAD_SUCCESS) { - DP("Call to target_data_begin failed, abort target.\n"); + DP("Call to targetDataBegin failed, abort target.\n"); return OFFLOAD_FAIL; } @@ -681,9 +696,8 @@ DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase)); uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta); - void *Pointer_TgtPtrBegin = - Device.getTgtPtrBegin(HstPtrVal, arg_sizes[i], IsLast, false, - IsHostPtr); + void *Pointer_TgtPtrBegin = Device.getTgtPtrBegin( + HstPtrVal, arg_sizes[i], IsLast, false, IsHostPtr); if (!Pointer_TgtPtrBegin) { DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n", DPxPTR(HstPtrVal)); @@ -692,13 +706,14 @@ if (RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && TgtPtrBegin == HstPtrBegin) { DP("Unified memory is active, no need to map lambda captured" - "variable (" DPxMOD ")\n", DPxPTR(HstPtrVal)); + "variable (" DPxMOD ")\n", + DPxPTR(HstPtrVal)); continue; } DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n", DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin)); int rt = Device.data_submit(TgtPtrBegin, &Pointer_TgtPtrBegin, - sizeof(void *), &AsyncInfo); + sizeof(void *), AsyncInfo); if (rt != OFFLOAD_SUCCESS) { DP("Copying data to device failed.\n"); return OFFLOAD_FAIL; @@ -713,18 +728,18 @@ bool IsLast, IsHostPtr; // unused. 
if (arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) { DP("Forwarding first-private value " DPxMOD " to the target construct\n", - DPxPTR(HstPtrBase)); + DPxPTR(HstPtrBase)); TgtPtrBegin = HstPtrBase; TgtBaseOffset = 0; } else if (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE) { // Allocate memory for (first-)private array - TgtPtrBegin = Device.RTL->data_alloc(Device.RTLDeviceID, - arg_sizes[i], HstPtrBegin); + TgtPtrBegin = + Device.RTL->data_alloc(Device.RTLDeviceID, arg_sizes[i], HstPtrBegin); if (!TgtPtrBegin) { - DP ("Data allocation for %sprivate array " DPxMOD " failed, " - "abort target.\n", - (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""), - DPxPTR(HstPtrBegin)); + DP("Data allocation for %sprivate array " DPxMOD " failed, " + "abort target.\n", + (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""), + DPxPTR(HstPtrBegin)); return OFFLOAD_FAIL; } fpArrays.push_back(TgtPtrBegin); @@ -732,15 +747,15 @@ #ifdef OMPTARGET_DEBUG void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset); DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD " for " - "%sprivate array " DPxMOD " - pushing target argument " DPxMOD "\n", - arg_sizes[i], DPxPTR(TgtPtrBegin), - (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""), - DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBase)); + "%sprivate array " DPxMOD " - pushing target argument " DPxMOD "\n", + arg_sizes[i], DPxPTR(TgtPtrBegin), + (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""), + DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBase)); #endif // If first-private, copy data from host if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i], - &AsyncInfo); + AsyncInfo); if (rt != OFFLOAD_SUCCESS) { DP("Copying data to device failed, failed.\n"); return OFFLOAD_FAIL; @@ -748,19 +763,19 @@ } } else if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) { TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBase, sizeof(void *), IsLast, - false, IsHostPtr); + false, IsHostPtr); TgtBaseOffset = 0; // no offset for ptrs. DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD " to " - "object " DPxMOD "\n", DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBase), - DPxPTR(HstPtrBase)); + "object " DPxMOD "\n", + DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBase), DPxPTR(HstPtrBase)); } else { TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, arg_sizes[i], IsLast, - false, IsHostPtr); + false, IsHostPtr); TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; #ifdef OMPTARGET_DEBUG void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset); DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n", - DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin)); + DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin)); #endif } tgtArgsPositions[i] = tgt_args.size(); @@ -769,7 +784,7 @@ } assert(tgt_args.size() == tgt_offsets.size() && - "Size mismatch in arguments and offsets"); + "Size mismatch in arguments and offsets"); // Pop loop trip count uint64_t ltc = 0; @@ -784,19 +799,19 @@ // Launch device execution. 
DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n", - TargetTable->EntriesBegin[TM->Index].name, - DPxPTR(TargetTable->EntriesBegin[TM->Index].addr), TM->Index); + TargetTable->EntriesBegin[TM->Index].name, + DPxPTR(TargetTable->EntriesBegin[TM->Index].addr), TM->Index); if (IsTeamConstruct) { rc = Device.run_team_region(TargetTable->EntriesBegin[TM->Index].addr, &tgt_args[0], &tgt_offsets[0], tgt_args.size(), - team_num, thread_limit, ltc, &AsyncInfo); + team_num, thread_limit, ltc, AsyncInfo); } else { rc = Device.run_region(TargetTable->EntriesBegin[TM->Index].addr, &tgt_args[0], &tgt_offsets[0], tgt_args.size(), - &AsyncInfo); + AsyncInfo); } if (rc != OFFLOAD_SUCCESS) { - DP ("Executing target region abort target.\n"); + DP("Executing target region abort target.\n"); return OFFLOAD_FAIL; } @@ -810,15 +825,257 @@ } // Move data from device. - int rt = target_data_end(Device, arg_num, args_base, args, arg_sizes, - arg_types, &AsyncInfo); + int rt = targetDataEnd(Device, arg_num, args_base, args, arg_sizes, arg_types, + AsyncInfo); if (rt != OFFLOAD_SUCCESS) { - DP("Call to target_data_end failed, abort targe.\n"); + DP("Call to targetDataEnd failed, abort target.\n"); return OFFLOAD_FAIL; } - if (Device.RTL->synchronize) - return Device.RTL->synchronize(device_id, &AsyncInfo); + return OFFLOAD_SUCCESS; +} + +int queryAndWait(DeviceTy &Device, __tgt_async_info *AsyncInfo) { + // TODO: Do we need to make it configurable? + constexpr const int MAX_TASK_YIELD_COUNT = 16; + int TaskYieldCount = 0; + while (1) { + int Ret = Device.RTL->check_event(Device.RTLDeviceID, AsyncInfo); + if (Ret == OFFLOAD_SUCCESS) + return OFFLOAD_SUCCESS; + // Something wrong + if (Ret == OFFLOAD_FAIL) + return OFFLOAD_FAIL; + // We have yielded enough time. Now do blocking waiting here. + if (TaskYieldCount > MAX_TASK_YIELD_COUNT) + return Device.RTL->synchronize(Device.RTLDeviceID, AsyncInfo); + // Still not finished yet, do task yield + __kmpc_target_task_yield(); + ++TaskYieldCount; + } + + assert("It should never reach this point!"); + // It should never reach this point + return OFFLOAD_FAIL; +} + +int recordEvent(DeviceTy &Device, __tgt_async_info *AsyncInfo) { + int Ret = Device.RTL->record_event(Device.RTLDeviceID, AsyncInfo); + if (Ret != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + + assert(AsyncInfo->Event && "AsyncInfo->Event is nullptr"); return OFFLOAD_SUCCESS; } + +int waitForDeps(DeviceTy &Device, __tgt_async_info *AsyncInfo) { + // Wait until current task has no depending task because during the task + // creation as we enqueue the task even if it has depending host tasks. 
+int waitForDeps(DeviceTy &Device, __tgt_async_info *AsyncInfo) {
+  // Wait until the current task has no unfinished predecessors: at task
+  // creation we enqueue the task even if it still depends on host tasks.
+  while (__kmpc_get_target_task_npredecessors() != 0)
+    __kmpc_target_task_yield();
+
+  int Num;
+
+  // Get the number of events that this task depends on.
+  __kmpc_get_target_task_waiting_list(nullptr, &Num);
+
+  // There are depending tasks, so we need to insert the event waits before
+  // pushing the operations of the current task into the queue.
+  if (Num > 0) {
+    // Get the list of events that this task depends on.
+    std::vector<void *> WaitingList(Num, nullptr);
+    __kmpc_get_target_task_waiting_list(WaitingList.data(), &Num);
+
+    for (int I = 0; I < Num; ++I) {
+      __tgt_async_info *WaitingAsyncInfo =
+          reinterpret_cast<__tgt_async_info *>(WaitingList[I]);
+
+      assert(WaitingAsyncInfo && "WaitingAsyncInfo is nullptr");
+      assert(WaitingAsyncInfo->Event && "WaitingAsyncInfo->Event is nullptr");
+      assert(WaitingAsyncInfo->DeviceID != -1 &&
+             "Invalid WaitingAsyncInfo->DeviceID");
+
+      int Ret;
+
+      // The dependence is on a target task handled by a different RTL, so its
+      // event cannot be waited on in our queue; query and wait on it here.
+      if (Device.RTL != Devices[WaitingAsyncInfo->DeviceID].RTL) {
+        Ret =
+            queryAndWait(Devices[WaitingAsyncInfo->DeviceID], WaitingAsyncInfo);
+        if (Ret != OFFLOAD_SUCCESS)
+          return OFFLOAD_FAIL;
+
+        continue;
+      }
+
+      Ret = Device.RTL->wait_event(Device.RTLDeviceID, AsyncInfo,
+                                   WaitingAsyncInfo);
+      if (Ret != OFFLOAD_SUCCESS)
+        return OFFLOAD_FAIL;
+    }
+  }
+
+  return OFFLOAD_SUCCESS;
+}
+} // namespace
+
+int target(int64_t DeviceID, void *HostPtr, int32_t ArgNum, void **ArgsBase,
+           void **Args, int64_t *ArgSizes, int64_t *ArgTypes, int32_t TeamNum,
+           int32_t ThreadLimit, int IsTeamConstruct) {
+  return target(DeviceID, HostPtr, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes,
+                TeamNum, ThreadLimit, IsTeamConstruct,
+                /* AsyncInfo */ nullptr);
+}
+int targetNowait(int64_t DeviceID, void *HostPtr, int32_t ArgNum,
+                 void **ArgsBase, void **Args, int64_t *ArgSizes,
+                 int64_t *ArgTypes, int32_t TeamNum, int32_t ThreadLimit,
+                 int IsTeamConstruct, int32_t DepNum, void *DepList,
+                 int32_t NoAliasDepNum, void *NoAliasDepList) {
+  DeviceTy &Device = Devices[DeviceID];
+
+  // Fall back to the synchronous version if the necessary interfaces are not
+  // supported.
+  if (!Device.RTL->AsyncSupported) {
+    // Wait until the current task has no unfinished predecessors: at task
+    // creation we enqueue the task even if it still depends on other target
+    // tasks, and without the asynchronous plugin API the only safe option is
+    // to make sure all of them have finished.
+    while (__kmpc_get_target_task_npredecessors() != 0)
+      __kmpc_target_task_yield();
+
+    // TODO: Need to wait for all dependencies in successors as well
+
+    return target(DeviceID, HostPtr, ArgNum, ArgsBase, Args, ArgSizes,
+                  ArgTypes, TeamNum, ThreadLimit, IsTeamConstruct);
+  }
+
+  __tgt_async_info *AsyncInfo;
+  int Ret = Device.initAsyncInfo(&AsyncInfo);
+  if (Ret != OFFLOAD_SUCCESS)
+    return OFFLOAD_FAIL;
+
+  Ret = waitForDeps(Device, AsyncInfo);
+  if (Ret != OFFLOAD_SUCCESS)
+    return OFFLOAD_FAIL;
+
+  Ret = target(DeviceID, HostPtr, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes,
+               TeamNum, ThreadLimit, IsTeamConstruct, AsyncInfo);
+  if (Ret != OFFLOAD_SUCCESS)
+    return OFFLOAD_FAIL;
+
+  Ret = recordEvent(Device, AsyncInfo);
+  if (Ret != OFFLOAD_SUCCESS)
+    return OFFLOAD_FAIL;
+
+  // Attach the async info to the current task so that dependent tasks can
+  // start waiting for the event if there is any dependency.
+  bool HasDependency = __kmpc_set_async_info(AsyncInfo);
+
+  Ret = queryAndWait(Device, AsyncInfo);
+  if (Ret != OFFLOAD_SUCCESS)
+    return OFFLOAD_FAIL;
+
+  if (!HasDependency)
+    return Device.releaseAsyncInfo(AsyncInfo);
+
+  return OFFLOAD_SUCCESS;
+}
+
+template <TargetDataFuncTy F>
+int targetData(DeviceTy &Device, int32_t ArgNum, void **ArgsBase, void **Args,
+               int64_t *ArgSizes, int64_t *ArgTypes) {
+  if (F == TargetDataFuncTy::Begin)
+    return targetDataBegin(Device, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes,
+                           nullptr);
+  if (F == TargetDataFuncTy::End)
+    return targetDataEnd(Device, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes,
+                         nullptr);
+  return targetDataUpdate(Device, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes,
+                          nullptr);
+}
+
+template int targetData<TargetDataFuncTy::Begin>(DeviceTy &Device,
+                                                 int32_t ArgNum,
+                                                 void **ArgsBase, void **Args,
+                                                 int64_t *ArgSizes,
+                                                 int64_t *ArgTypes);
+template int targetData<TargetDataFuncTy::End>(DeviceTy &Device,
+                                               int32_t ArgNum,
+                                               void **ArgsBase, void **Args,
+                                               int64_t *ArgSizes,
+                                               int64_t *ArgTypes);
+template int targetData<TargetDataFuncTy::Update>(DeviceTy &Device,
+                                                  int32_t ArgNum,
+                                                  void **ArgsBase, void **Args,
+                                                  int64_t *ArgSizes,
+                                                  int64_t *ArgTypes);
+template <TargetDataFuncTy F>
+int targetDataNowait(DeviceTy &Device, int32_t ArgNum, void **ArgsBase,
+                     void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
+                     int32_t DepNum, void *DepList, int32_t NoAliasDepNum,
+                     void *NoAliasDepList) {
+  // Fall back to the synchronous version if the necessary interfaces are not
+  // supported.
+  if (!Device.RTL->AsyncSupported) {
+    // Wait until the current task has no unfinished predecessors: at task
+    // creation we enqueue the task even if it still depends on other target
+    // tasks, and without the asynchronous plugin API the only safe option is
+    // to make sure all of them have finished.
+    while (__kmpc_get_target_task_npredecessors() != 0)
+      __kmpc_target_task_yield();
+
+    // TODO: Need to wait for all dependencies in successors as well
+
+    return targetData<F>(Device, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes);
+  }
+
+  __tgt_async_info *AsyncInfo;
+  int Ret = Device.initAsyncInfo(&AsyncInfo);
+  if (Ret != OFFLOAD_SUCCESS)
+    return OFFLOAD_FAIL;
+
+  Ret = waitForDeps(Device, AsyncInfo);
+  if (Ret != OFFLOAD_SUCCESS)
+    return OFFLOAD_FAIL;
+
+  if (F == TargetDataFuncTy::Begin)
+    Ret = targetDataBegin(Device, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes,
+                          AsyncInfo);
+  else if (F == TargetDataFuncTy::End)
+    Ret = targetDataEnd(Device, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes,
+                        AsyncInfo);
+  else if (F == TargetDataFuncTy::Update)
+    Ret = targetDataUpdate(Device, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes,
+                           AsyncInfo);
+  if (Ret != OFFLOAD_SUCCESS)
+    return OFFLOAD_FAIL;
+
+  Ret = recordEvent(Device, AsyncInfo);
+  if (Ret != OFFLOAD_SUCCESS)
+    return OFFLOAD_FAIL;
+
+  // Attach the async info to the current task so that dependent tasks can
+  // start waiting for the event if there is any dependency.
+  bool HasDependency = __kmpc_set_async_info(AsyncInfo);
+
+  Ret = queryAndWait(Device, AsyncInfo);
+  if (Ret != OFFLOAD_SUCCESS)
+    return OFFLOAD_FAIL;
+
+  if (!HasDependency)
+    return Device.releaseAsyncInfo(AsyncInfo);
+
+  return OFFLOAD_SUCCESS;
+}
+
+template int targetDataNowait<TargetDataFuncTy::Begin>(
+    DeviceTy &Device, int32_t ArgNum, void **ArgsBase, void **Args,
+    int64_t *ArgSizes, int64_t *ArgTypes, int32_t DepNum, void *DepList,
+    int32_t NoAliasDepNum, void *NoAliasDepList);
+template int targetDataNowait<TargetDataFuncTy::End>(
+    DeviceTy &Device, int32_t ArgNum, void **ArgsBase, void **Args,
+    int64_t *ArgSizes, int64_t *ArgTypes, int32_t DepNum, void *DepList,
+    int32_t NoAliasDepNum, void *NoAliasDepList);
+template int targetDataNowait<TargetDataFuncTy::Update>(
+    DeviceTy &Device, int32_t ArgNum, void **ArgsBase, void **Args,
+    int64_t *ArgSizes, int64_t *ArgTypes, int32_t DepNum, void *DepList,
+    int32_t NoAliasDepNum, void *NoAliasDepList);
Index: openmp/libomptarget/src/private.h
===================================================================
--- openmp/libomptarget/src/private.h
+++ openmp/libomptarget/src/private.h
@@ -17,21 +17,28 @@
 
 #include <cstdint>
 
-extern int target_data_begin(DeviceTy &Device, int32_t arg_num,
-                             void **args_base, void **args, int64_t *arg_sizes,
-                             int64_t *arg_types,
-                             __tgt_async_info *async_info_ptr);
-
-extern int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base,
-                           void **args, int64_t *arg_sizes, int64_t *arg_types,
-                           __tgt_async_info *async_info_ptr);
-
-extern int target_data_update(DeviceTy &Device, int32_t arg_num,
-    void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types);
-
-extern int target(int64_t device_id, void *host_ptr, int32_t arg_num,
-    void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types,
-    int32_t team_num, int32_t thread_limit, int IsTeamConstruct);
+enum TargetDataFuncTy { Begin, End, Update };
+
+template <TargetDataFuncTy F>
+int targetData(DeviceTy &Device, int32_t ArgNum, void **ArgsBase, void **Args,
+               int64_t *ArgSizes, int64_t *ArgTypes);
+
+template <TargetDataFuncTy F>
+int targetDataNowait(DeviceTy &Device, int32_t ArgNum, void **ArgsBase,
+                     void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
+                     int32_t DepNum, void *DepList, int32_t NoAliasDepNum,
+                     void *NoAliasDepList);
+
+extern int target(int64_t DeviceID, void *HostPtr, int32_t ArgNum,
+                  void **ArgsBase, void **Args, int64_t *ArgSizes,
+                  int64_t *ArgTypes, int32_t TeamNum, int32_t ThreadLimit,
+                  int
IsTeamConstruct); + +extern int targetNowait(int64_t DeviceID, void *HostPtr, int32_t ArgNum, + void **ArgsBase, void **Args, int64_t *ArgSizes, + int64_t *ArgTypes, int32_t TeamNum, int32_t ThreadLimit, + int IsTeamConstruct, int32_t DepNum, void *DepList, + int32_t NoAliasDepNum, void *NoAliasDepList); extern int CheckDeviceAndCtors(int64_t device_id); Index: openmp/libomptarget/src/rtl.h =================================================================== --- openmp/libomptarget/src/rtl.h +++ openmp/libomptarget/src/rtl.h @@ -26,33 +26,43 @@ struct RTLInfoTy { typedef int32_t(is_valid_binary_ty)(void *); - typedef int32_t(is_data_exchangable_ty)(int32_t, int32_t); typedef int32_t(number_of_devices_ty)(); typedef int32_t(init_device_ty)(int32_t); typedef __tgt_target_table *(load_binary_ty)(int32_t, void *); typedef void *(data_alloc_ty)(int32_t, int64_t, void *); typedef int32_t(data_submit_ty)(int32_t, void *, void *, int64_t); - typedef int32_t(data_submit_async_ty)(int32_t, void *, void *, int64_t, - __tgt_async_info *); typedef int32_t(data_retrieve_ty)(int32_t, void *, void *, int64_t); - typedef int32_t(data_retrieve_async_ty)(int32_t, void *, void *, int64_t, - __tgt_async_info *); - typedef int32_t(data_exchange_ty)(int32_t, void *, int32_t, void *, int64_t); - typedef int32_t(data_exchange_async_ty)(int32_t, void *, int32_t, void *, - int64_t, __tgt_async_info *); typedef int32_t(data_delete_ty)(int32_t, void *); typedef int32_t(run_region_ty)(int32_t, void *, void **, ptrdiff_t *, int32_t); - typedef int32_t(run_region_async_ty)(int32_t, void *, void **, ptrdiff_t *, - int32_t, __tgt_async_info *); typedef int32_t(run_team_region_ty)(int32_t, void *, void **, ptrdiff_t *, int32_t, int32_t, int32_t, uint64_t); + typedef int64_t(init_requires_ty)(int64_t); + + // Device to device memory copy interfaces + typedef int32_t(is_data_exchangable_ty)(int32_t, int32_t); + typedef int32_t(data_exchange_ty)(int32_t, void *, int32_t, void *, int64_t); + typedef int32_t(data_exchange_async_ty)(int32_t, void *, int32_t, void *, + int64_t, __tgt_async_info *); + + // The following interfaces are all about asynchronous operations + typedef int32_t(data_submit_async_ty)(int32_t, void *, void *, int64_t, + __tgt_async_info *); + typedef int32_t(data_retrieve_async_ty)(int32_t, void *, void *, int64_t, + __tgt_async_info *); + typedef int32_t(run_region_async_ty)(int32_t, void *, void **, ptrdiff_t *, + int32_t, __tgt_async_info *); typedef int32_t(run_team_region_async_ty)(int32_t, void *, void **, ptrdiff_t *, int32_t, int32_t, int32_t, uint64_t, __tgt_async_info *); - typedef int64_t(init_requires_ty)(int64_t); - typedef int64_t(synchronize_ty)(int64_t, __tgt_async_info *); + typedef int32_t(wait_event_ty)(int32_t, __tgt_async_info *, + __tgt_async_info *); + typedef int32_t(record_event_ty)(int32_t, __tgt_async_info *); + typedef int32_t(synchronize_ty)(int32_t, __tgt_async_info *); + typedef int32_t(check_event_ty)(int32_t, __tgt_async_info *); + typedef int32_t(release_async_info_ty)(int32_t, __tgt_async_info *); + typedef int32_t(init_async_info_ty)(int32_t, __tgt_async_info **); int32_t Idx = -1; // RTL index, index is the number of devices // of other RTLs that were registered before, @@ -85,7 +95,12 @@ run_team_region_ty *run_team_region = nullptr; run_team_region_async_ty *run_team_region_async = nullptr; init_requires_ty *init_requires = nullptr; + release_async_info_ty *releaseAsyncInfo = nullptr; + wait_event_ty *wait_event = nullptr; + record_event_ty *record_event = nullptr; 
Index: openmp/libomptarget/src/rtl.h
===================================================================
--- openmp/libomptarget/src/rtl.h
+++ openmp/libomptarget/src/rtl.h
@@ -26,33 +26,43 @@

 struct RTLInfoTy {
   typedef int32_t(is_valid_binary_ty)(void *);
-  typedef int32_t(is_data_exchangable_ty)(int32_t, int32_t);
   typedef int32_t(number_of_devices_ty)();
   typedef int32_t(init_device_ty)(int32_t);
   typedef __tgt_target_table *(load_binary_ty)(int32_t, void *);
   typedef void *(data_alloc_ty)(int32_t, int64_t, void *);
   typedef int32_t(data_submit_ty)(int32_t, void *, void *, int64_t);
-  typedef int32_t(data_submit_async_ty)(int32_t, void *, void *, int64_t,
-                                        __tgt_async_info *);
   typedef int32_t(data_retrieve_ty)(int32_t, void *, void *, int64_t);
-  typedef int32_t(data_retrieve_async_ty)(int32_t, void *, void *, int64_t,
-                                          __tgt_async_info *);
-  typedef int32_t(data_exchange_ty)(int32_t, void *, int32_t, void *, int64_t);
-  typedef int32_t(data_exchange_async_ty)(int32_t, void *, int32_t, void *,
-                                          int64_t, __tgt_async_info *);
   typedef int32_t(data_delete_ty)(int32_t, void *);
   typedef int32_t(run_region_ty)(int32_t, void *, void **, ptrdiff_t *,
                                  int32_t);
-  typedef int32_t(run_region_async_ty)(int32_t, void *, void **, ptrdiff_t *,
-                                       int32_t, __tgt_async_info *);
   typedef int32_t(run_team_region_ty)(int32_t, void *, void **, ptrdiff_t *,
                                       int32_t, int32_t, int32_t, uint64_t);
+  typedef int64_t(init_requires_ty)(int64_t);
+
+  // Device to device memory copy interfaces
+  typedef int32_t(is_data_exchangable_ty)(int32_t, int32_t);
+  typedef int32_t(data_exchange_ty)(int32_t, void *, int32_t, void *, int64_t);
+  typedef int32_t(data_exchange_async_ty)(int32_t, void *, int32_t, void *,
+                                          int64_t, __tgt_async_info *);
+
+  // The following interfaces are all about asynchronous operations
+  typedef int32_t(data_submit_async_ty)(int32_t, void *, void *, int64_t,
+                                        __tgt_async_info *);
+  typedef int32_t(data_retrieve_async_ty)(int32_t, void *, void *, int64_t,
+                                          __tgt_async_info *);
+  typedef int32_t(run_region_async_ty)(int32_t, void *, void **, ptrdiff_t *,
+                                       int32_t, __tgt_async_info *);
   typedef int32_t(run_team_region_async_ty)(int32_t, void *, void **,
                                             ptrdiff_t *, int32_t, int32_t,
                                             int32_t, uint64_t,
                                             __tgt_async_info *);
-  typedef int64_t(init_requires_ty)(int64_t);
-  typedef int64_t(synchronize_ty)(int64_t, __tgt_async_info *);
+  typedef int32_t(wait_event_ty)(int32_t, __tgt_async_info *,
+                                 __tgt_async_info *);
+  typedef int32_t(record_event_ty)(int32_t, __tgt_async_info *);
+  typedef int32_t(synchronize_ty)(int32_t, __tgt_async_info *);
+  typedef int32_t(check_event_ty)(int32_t, __tgt_async_info *);
+  typedef int32_t(release_async_info_ty)(int32_t, __tgt_async_info *);
+  typedef int32_t(init_async_info_ty)(int32_t, __tgt_async_info **);

   int32_t Idx = -1; // RTL index, index is the number of devices
                     // of other RTLs that were registered before,
@@ -85,7 +95,12 @@
   run_team_region_ty *run_team_region = nullptr;
   run_team_region_async_ty *run_team_region_async = nullptr;
   init_requires_ty *init_requires = nullptr;
+  release_async_info_ty *releaseAsyncInfo = nullptr;
+  wait_event_ty *wait_event = nullptr;
+  record_event_ty *record_event = nullptr;
   synchronize_ty *synchronize = nullptr;
+  check_event_ty *check_event = nullptr;
+  init_async_info_ty *initAsyncInfo = nullptr;

   // Are there images associated with this RTL.
   bool isUsed = false;
@@ -95,6 +110,9 @@
   // so that developers of new RTLs do not have to worry about it.
   std::mutex Mtx;

+  // Whether this RTL supports asynchronous operations
+  bool AsyncSupported = true;
+
   // The existence of the mutex above makes RTLInfoTy non-copyable.
   // We need to provide a copy constructor explicitly.
   RTLInfoTy() = default;
@@ -125,7 +143,13 @@
     run_team_region_async = r.run_team_region_async;
     init_requires = r.init_requires;
     isUsed = r.isUsed;
+    AsyncSupported = r.AsyncSupported;
+    releaseAsyncInfo = r.releaseAsyncInfo;
+    wait_event = r.wait_event;
+    record_event = r.record_event;
     synchronize = r.synchronize;
+    check_event = r.check_event;
+    initAsyncInfo = r.initAsyncInfo;
   }
 };
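[Reviewer note] Since `AsyncSupported` defaults to true and is cleared when a plugin lacks the new entry points (see the loader change in the next file), host-side call sites presumably branch on it. A hedged sketch of such a branch; the wrapper itself is assumed glue, while the `data_submit`/`data_submit_async` signatures follow the typedefs above:

```cpp
// Sketch: choose between the asynchronous and the blocking data-submit path
// depending on whether the plugin advertises async support.
static int32_t submitData(DeviceTy &Device, void *TgtPtr, void *HstPtr,
                          int64_t Size, __tgt_async_info *AsyncInfo) {
  if (Device.RTL->AsyncSupported)
    // Enqueued on the queue carried in AsyncInfo; completion is observed
    // later via the recorded event.
    return Device.RTL->data_submit_async(Device.RTLDeviceID, TgtPtr, HstPtr,
                                         Size, AsyncInfo);
  // Fallback: blocking transfer, no event bookkeeping needed.
  return Device.RTL->data_submit(Device.RTLDeviceID, TgtPtr, HstPtr, Size);
}
```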
Index: openmp/libomptarget/src/rtl.cpp
===================================================================
--- openmp/libomptarget/src/rtl.cpp
+++ openmp/libomptarget/src/rtl.cpp
@@ -146,6 +146,31 @@
         dlsym(dynlib_handle, "__tgt_rtl_data_exchange_async");
     *((void **)&R.is_data_exchangable) =
         dlsym(dynlib_handle, "__tgt_rtl_is_data_exchangable");
+    *((void **)&R.releaseAsyncInfo) =
+        dlsym(dynlib_handle, "__tgt_rtl_release_async_info");
+    *((void **)&R.wait_event) = dlsym(dynlib_handle, "__tgt_rtl_wait_event");
+    *((void **)&R.record_event) =
+        dlsym(dynlib_handle, "__tgt_rtl_record_event");
+    *((void **)&R.check_event) = dlsym(dynlib_handle, "__tgt_rtl_check_event");
+    *((void **)&R.initAsyncInfo) =
+        dlsym(dynlib_handle, "__tgt_rtl_initialize_async_info");
+
+    if (!R.synchronize || !R.check_event) {
+      DP("Asynchronous offloading not supported\n");
+      R.AsyncSupported = false;
+      R.data_exchange_async = nullptr;
+      R.data_retrieve_async = nullptr;
+      R.data_submit_async = nullptr;
+
+      R.run_region_async = nullptr;
+      R.run_team_region_async = nullptr;
+
+      R.releaseAsyncInfo = nullptr;
+      R.record_event = nullptr;
+      R.wait_event = nullptr;
+      R.check_event = nullptr;
+      R.initAsyncInfo = nullptr;
+    }

     // No devices are supported by this RTL?
     if (!(R.NumberOfDevices = R.number_of_devices())) {
@@ -387,8 +412,11 @@
     Device.PendingGlobalsMtx.lock();
     if (Device.PendingCtorsDtors[desc].PendingCtors.empty()) {
       for (auto &dtor : Device.PendingCtorsDtors[desc].PendingDtors) {
-        int rc = target(Device.DeviceID, dtor, 0, NULL, NULL, NULL, NULL, 1,
-                        1, true /*team*/);
+        int rc = target(Device.DeviceID, dtor, 0 /* arg_num */,
+                        NULL /* arg_base */, NULL /* args */,
+                        NULL /* arg_sizes */, NULL /* arg_types */,
+                        1 /* team_num */, 1 /* thread_limit */,
+                        true /* IsTeamConstruct */);
         if (rc != OFFLOAD_SUCCESS) {
           DP("Running destructor " DPxMOD " failed.\n", DPxPTR(dtor));
         }
Index: openmp/runtime/src/kmp.h
===================================================================
--- openmp/runtime/src/kmp.h
+++ openmp/runtime/src/kmp.h
@@ -2156,6 +2156,10 @@
 #endif
   std::atomic<kmp_int32> npredecessors;
   std::atomic<kmp_int32> nrefs;
+  // All target tasks this task depends on
+  kmp_depnode_list_t *predecessors;
+  // A pointer to __tgt_async_info
+  std::atomic<kmp_intptr_t> async_info;
 } kmp_base_depnode_t;

 union KMP_ALIGN_CACHE kmp_depnode {
@@ -2235,7 +2239,8 @@
   unsigned priority_specified : 1; /* set if the compiler provides priority
                                       setting for the task */
   unsigned detachable : 1; /* 1 == can detach */
-  unsigned reserved : 9; /* reserved for compiler use */
+  unsigned target : 1; /* 1 == target task */
+  unsigned reserved : 8; /* reserved for compiler use */

   /* Library flags */ /* Total library flags must be 16 bits */
   unsigned tasktype : 1; /* task is either explicit(1) or implicit (0) */
@@ -3906,6 +3911,12 @@

 extern void __kmp_omp_display_env(int verbose);

+// For interaction with libomptarget
+extern int __kmpc_set_async_info(void *async_info);
+extern void __kmpc_get_target_task_waiting_list(void **list, int *num);
+extern void __kmpc_target_task_yield();
+extern int __kmpc_get_target_task_npredecessors();
+
 #ifdef __cplusplus
 }
 #endif
Index: openmp/runtime/src/kmp_taskdeps.h
===================================================================
--- openmp/runtime/src/kmp_taskdeps.h
+++ openmp/runtime/src/kmp_taskdeps.h
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//

-
 #ifndef KMP_TASKDEPS_H
 #define KMP_TASKDEPS_H

@@ -20,6 +19,10 @@
 #define KMP_ACQUIRE_DEPNODE(gtid, n) __kmp_acquire_lock(&(n)->dn.lock, (gtid))
 #define KMP_RELEASE_DEPNODE(gtid, n) __kmp_release_lock(&(n)->dn.lock, (gtid))

+extern "C" {
+void __kmpc_free_async_info(void *);
+}
+
 static inline void __kmp_node_deref(kmp_info_t *thread, kmp_depnode_t *node) {
   if (!node)
     return;
@@ -27,6 +30,21 @@
   kmp_int32 n = KMP_ATOMIC_DEC(&node->dn.nrefs) - 1;
   if (n == 0) {
     KMP_ASSERT(node->dn.nrefs == 0);
+    // Free async info
+    if (node->dn.async_info)
+      __kmpc_free_async_info(
+          reinterpret_cast<void *>(KMP_ATOMIC_LD_ACQ(&node->dn.async_info)));
+    // Free the predecessor list
+    kmp_depnode_list_t *next;
+    for (kmp_depnode_list_t *p = node->dn.predecessors; p; p = next) {
+      __kmp_node_deref(thread, p->node);
+      next = p->next;
+#if USE_FAST_MEMORY
+      __kmp_fast_free(thread, p);
+#else
+      __kmp_thread_free(thread, p);
+#endif
+    }
 #if USE_FAST_MEMORY
     __kmp_fast_free(thread, node);
 #else
@@ -111,13 +129,15 @@
   kmp_depnode_list_t *next;
   for (kmp_depnode_list_t *p = node->dn.successors; p; p = next) {
     kmp_depnode_t *successor = p->node;
+    kmp_taskdata_t *successor_task_data = KMP_TASK_TO_TASKDATA(successor->dn.task);
     kmp_int32 npredecessors = KMP_ATOMIC_DEC(&successor->dn.npredecessors) - 1;

     // successor task can be NULL for wait_depends or because deps are still
     // being processed
     if (npredecessors == 0) {
       KMP_MB();
-      if (successor->dn.task) {
+      // All target tasks have already been enqueued at this point
+      if (successor->dn.task && !successor_task_data->td_flags.target) {
         KA_TRACE(20, ("__kmp_release_deps: T#%d successor %p of %p scheduled "
                       "for execution.\n",
                       gtid, successor->dn.task, task));
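[Reviewer note] At the user level, the machinery above targets programs like the following, where the dependence between two `target nowait` regions can be honored on the device via recorded events instead of blocking a host thread. This is an illustrative test case, not part of the patch:

```cpp
// Two dependent target tasks: the consumer is enqueued immediately and the
// runtime chains it on the producer's recorded event.
#define N 1024

int main() {
  int a[N], b[N];
#pragma omp parallel
#pragma omp single
  {
    // Producer: target task with an out-dependence on a.
#pragma omp target map(from : a) depend(out : a) nowait
    for (int i = 0; i < N; ++i)
      a[i] = i;
    // Consumer: depends on a; no host thread needs to block on the producer.
#pragma omp target map(to : a) map(from : b) depend(in : a) nowait
    for (int i = 0; i < N; ++i)
      b[i] = 2 * a[i];
#pragma omp taskwait
  }
  return b[0] == 0 ? 0 : 1;
}
```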
%p\n", + gtid, taskdata, sink_taskdata)); + } else { + sink->dn.successors = + __kmp_add_node(thread, sink->dn.successors, source); + npredecessors++; + KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to " + "%p\n", + gtid, KMP_TASK_TO_TASKDATA(sink->dn.task), + KMP_TASK_TO_TASKDATA(task))); + } } KMP_RELEASE_DEPNODE(gtid, sink); } @@ -612,7 +643,11 @@ current_task->ompt_task_info.frame.enter_frame = ompt_data_none; } #endif - return TASK_CURRENT_NOT_QUEUED; + // All target tasks will be enqueued directly no matter whether their + // dependencies have been fullfilled. They will be checked again in + // libomptarget. + if (!new_taskdata->td_flags.target) + return TASK_CURRENT_NOT_QUEUED; } } else { KA_TRACE(10, ("__kmpc_omp_task_with_deps(exit): T#%d ignored dependencies " Index: openmp/runtime/src/kmp_tasking.cpp =================================================================== --- openmp/runtime/src/kmp_tasking.cpp +++ openmp/runtime/src/kmp_tasking.cpp @@ -930,7 +930,7 @@ #endif // Only need to keep track of count if team parallel and tasking not - // serialized, or task is detachable and event has already been fulfilled + // serialized, or task is detachable and event has already been fulfilled if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) || taskdata->td_flags.detachable == TASK_DETACHABLE) { // Predecrement simulated by "- 1" calculation @@ -1310,6 +1310,7 @@ taskdata->td_flags.destructors_thunk = flags->destructors_thunk; taskdata->td_flags.proxy = flags->proxy; taskdata->td_flags.detachable = flags->detachable; + taskdata->td_flags.target = flags->target; taskdata->td_task_team = thread->th.th_task_team; taskdata->td_size_alloc = shareds_offset + sizeof_shareds; taskdata->td_flags.tasktype = TASK_EXPLICIT; @@ -1405,6 +1406,9 @@ size_t sizeof_shareds, kmp_routine_entry_t task_entry, kmp_int64 device_id) { + // All tasks created via this interface should be a target task + kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags; + input_flags->target = TRUE; return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t, sizeof_shareds, task_entry); } @@ -4570,3 +4574,75 @@ } KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid)); } + +// Bind the async info to current task. Return FALSE if this task does not have +// any dependency. Otherwise, return TRUE. +int __kmpc_set_async_info(void *async_info) { + int gtid = __kmp_get_gtid(); + kmp_info_t *thread = __kmp_threads[gtid]; + kmp_depnode_t *dep = thread->th.th_current_task->td_depnode; + if (!dep) + return FALSE; + KMP_ATOMIC_ST_REL(&dep->dn.async_info, + reinterpret_cast(async_info)); + return TRUE; +} + +// Get the list of waiting async info. If list is NULL, just query the number of +// predecessors current executing task has. 
Index: openmp/runtime/src/kmp_tasking.cpp
===================================================================
--- openmp/runtime/src/kmp_tasking.cpp
+++ openmp/runtime/src/kmp_tasking.cpp
@@ -930,7 +930,7 @@
 #endif

   // Only need to keep track of count if team parallel and tasking not
-  // serialized, or task is detachable and event has already been fulfilled 
+  // serialized, or task is detachable and event has already been fulfilled
   if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) ||
       taskdata->td_flags.detachable == TASK_DETACHABLE) {
     // Predecrement simulated by "- 1" calculation
@@ -1310,6 +1310,7 @@
   taskdata->td_flags.destructors_thunk = flags->destructors_thunk;
   taskdata->td_flags.proxy = flags->proxy;
   taskdata->td_flags.detachable = flags->detachable;
+  taskdata->td_flags.target = flags->target;
   taskdata->td_task_team = thread->th.th_task_team;
   taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
   taskdata->td_flags.tasktype = TASK_EXPLICIT;
@@ -1405,6 +1406,9 @@
                                     size_t sizeof_shareds,
                                     kmp_routine_entry_t task_entry,
                                     kmp_int64 device_id) {
+  // All tasks created via this interface are target tasks
+  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
+  input_flags->target = TRUE;
   return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
                                sizeof_shareds, task_entry);
 }
@@ -4570,3 +4574,75 @@
   }
   KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
 }
+
+// Bind the async info to the current task. Return FALSE if this task does
+// not have any dependency; otherwise, return TRUE.
+int __kmpc_set_async_info(void *async_info) {
+  int gtid = __kmp_get_gtid();
+  kmp_info_t *thread = __kmp_threads[gtid];
+  kmp_depnode_t *dep = thread->th.th_current_task->td_depnode;
+  if (!dep)
+    return FALSE;
+  KMP_ATOMIC_ST_REL(&dep->dn.async_info,
+                    reinterpret_cast<kmp_intptr_t>(async_info));
+  return TRUE;
+}
+
+// Get the list of async info objects the current task is waiting for. If
+// list is NULL, only query the number of predecessors the currently
+// executing task has. Otherwise, fill list with the async info of all
+// target-task predecessors.
+void __kmpc_get_target_task_waiting_list(void **list, int *num) {
+  KMP_DEBUG_ASSERT(num != NULL);
+
+  int gtid = __kmp_get_gtid();
+  kmp_info_t *thread = __kmp_threads[gtid];
+  kmp_taskdata_t *taskdata = thread->th.th_current_task;
+
+  // If the depnode is nullptr, the team must be working in serial mode
+  if (!taskdata->td_depnode) {
+    *num = 0;
+    return;
+  }
+
+  int n = 0;
+  for (kmp_depnode_list_t *p = taskdata->td_depnode->dn.predecessors; p;
+       p = p->next) {
+    kmp_depnode_t *dep = p->node;
+    if (dep->dn.task) {
+      kmp_taskdata_t *pred_task = KMP_TASK_TO_TASKDATA(dep->dn.task);
+      KMP_ASSERT(pred_task->td_flags.target);
+      if (list) {
+        while (KMP_ATOMIC_LD_ACQ(&dep->dn.async_info) == 0)
+          __kmpc_omp_taskyield(nullptr, gtid, 0);
+        list[n] =
+            reinterpret_cast<void *>(KMP_ATOMIC_LD_ACQ(&dep->dn.async_info));
+      }
+      ++n;
+    }
+  }
+
+  *num = n;
+}
+
+void __kmpc_target_task_yield() {
+  int gtid = __kmp_get_gtid();
+  __kmpc_omp_taskyield(nullptr, gtid, 0);
+}
+
+int __kmpc_get_target_task_npredecessors() {
+  int gtid = __kmp_get_gtid();
+  kmp_info_t *thread = __kmp_threads[gtid];
+  kmp_taskdata_t *taskdata = thread->th.th_current_task;
+  kmp_depnode_t *dep = taskdata->td_depnode;
+
+  if (!dep)
+    return 0;
+
+  int n;
+
+  KMP_ACQUIRE_DEPNODE(gtid, dep);
+  n = dep->dn.npredecessors;
+  KMP_RELEASE_DEPNODE(gtid, dep);
+
+  return n;
+}
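[Reviewer note] Closing the loop on deallocation: `__kmp_node_deref` (kmp_taskdeps.h above) frees a depnode's async info through `__kmpc_free_async_info`, which libomptarget is expected to implement. The sketch below is assumed glue, not patch text; it relies on the `DeviceID` field stored in `__tgt_async_info` to recover the owning device:

```cpp
// Sketch of the libomptarget side of __kmpc_free_async_info: look up the
// owning device from the stored DeviceID, then return the queue and destroy
// the event via the device's releaseAsyncInfo.
extern "C" void __kmpc_free_async_info(void *Ptr) {
  auto *AsyncInfo = static_cast<__tgt_async_info *>(Ptr);
  if (!AsyncInfo)
    return;
  DeviceTy &Device = Devices[AsyncInfo->DeviceID];
  Device.releaseAsyncInfo(AsyncInfo);
}
```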