diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h --- a/openmp/libomptarget/include/omptarget.h +++ b/openmp/libomptarget/include/omptarget.h @@ -111,6 +111,15 @@ *EntriesEnd; // End of the table with all the entries (non inclusive) }; +/// This struct contains information exchanged between different asynchronous +/// operations for device-dependent optimization and potential synchronization +struct __tgt_async_info { + // A pointer to a queue-like structure where offloading operations are issued. + // We assume to use this structure to do synchronization. In CUDA backend, it + // is CUstream. + void *Queue = nullptr; +}; + #ifdef __cplusplus extern "C" { #endif diff --git a/openmp/libomptarget/include/omptargetplugin.h b/openmp/libomptarget/include/omptargetplugin.h --- a/openmp/libomptarget/include/omptargetplugin.h +++ b/openmp/libomptarget/include/omptargetplugin.h @@ -63,11 +63,23 @@ int32_t __tgt_rtl_data_submit(int32_t ID, void *TargetPtr, void *HostPtr, int64_t Size); +// Asynchronously pass the data content to the target device using the target +// address. In case of success, return zero. Otherwise, return an error code. +int32_t __tgt_rtl_data_submit_async(int32_t ID, void *TargetPtr, void *HostPtr, + int64_t Size, + __tgt_async_info *AsyncInfoPtr); + // Retrieve the data content from the target device using its address. // In case of success, return zero. Otherwise, return an error code. int32_t __tgt_rtl_data_retrieve(int32_t ID, void *HostPtr, void *TargetPtr, int64_t Size); +// Asynchronously retrieve the data content from the target device using its +// address. In case of success, return zero. Otherwise, return an error code. +int32_t __tgt_rtl_data_retrieve_async(int32_t ID, void *HostPtr, + void *TargetPtr, int64_t Size, + __tgt_async_info *AsyncInfoPtr); + // De-allocate the data referenced by target ptr on the device. In case of // success, return zero. Otherwise, return an error code. 
int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr); @@ -78,14 +90,20 @@ // to the outlined function on device side. In case of success, return zero. // Otherwise, return an error code. int32_t __tgt_rtl_run_target_region(int32_t ID, void *Entry, void **Args, - ptrdiff_t *Offsets, int32_t NumArgs); + ptrdiff_t *Offsets, int32_t NumArgs, + __tgt_async_info *AsyncInfoPtr); // Similar to __tgt_rtl_run_target_region, but additionally specify the // number of teams to be created and a number of threads in each team. int32_t __tgt_rtl_run_target_team_region(int32_t ID, void *Entry, void **Args, ptrdiff_t *Offsets, int32_t NumArgs, int32_t NumTeams, int32_t ThreadLimit, - uint64_t loop_tripcount); + uint64_t loop_tripcount, + __tgt_async_info *AsyncInfoPtr); + +// Device synchronization. In case of success, return zero. Otherwise, return an +// error code. +int32_t __tgt_rtl_synchronize(int32_t ID, __tgt_async_info *AsyncInfoPtr); #ifdef __cplusplus } diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp --- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp @@ -309,6 +309,18 @@ static RTLDeviceInfoTy DeviceInfo; +namespace { +CUstream selectStream(int32_t Id, __tgt_async_info *AsyncInfo) { + if (!AsyncInfo) + return DeviceInfo.getNextStream(Id); + + if (!AsyncInfo->Queue) + AsyncInfo->Queue = DeviceInfo.getNextStream(Id); + + return reinterpret_cast<CUstream>(AsyncInfo->Queue); +} +} // namespace + #ifdef __cplusplus extern "C" { #endif @@ -662,8 +674,9 @@ return vptr; } -int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, - int64_t size) { +int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr, + void *hst_ptr, int64_t size, + __tgt_async_info *async_info) { // Set the context we are using. 
CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); if (err != CUDA_SUCCESS) { @@ -672,7 +685,7 @@ return OFFLOAD_FAIL; } - CUstream &Stream = DeviceInfo.getNextStream(device_id); + CUstream Stream = selectStream(device_id, async_info); err = cuMemcpyHtoDAsync((CUdeviceptr)tgt_ptr, hst_ptr, size, Stream); if (err != CUDA_SUCCESS) { @@ -683,20 +696,25 @@ return OFFLOAD_FAIL; } - err = cuStreamSynchronize(Stream); - if (err != CUDA_SUCCESS) { - DP("Error when synchronizing async data transfer from host to device. " - "Pointers: host = " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n", - DPxPTR(hst_ptr), DPxPTR(tgt_ptr), size); - CUDA_ERR_STRING(err); + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, + int64_t size) { + __tgt_async_info async_info; + + int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr, size, + &async_info); + if (rc != OFFLOAD_SUCCESS) { return OFFLOAD_FAIL; } - return OFFLOAD_SUCCESS; + return __tgt_rtl_synchronize(device_id, &async_info); } -int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, - int64_t size) { +int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr, + void *tgt_ptr, int64_t size, + __tgt_async_info *async_info) { // Set the context we are using. CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); if (err != CUDA_SUCCESS) { @@ -705,7 +723,7 @@ return OFFLOAD_FAIL; } - CUstream &Stream = DeviceInfo.getNextStream(device_id); + CUstream Stream = selectStream(device_id, async_info); err = cuMemcpyDtoHAsync(hst_ptr, (CUdeviceptr)tgt_ptr, size, Stream); if (err != CUDA_SUCCESS) { @@ -716,16 +734,20 @@ return OFFLOAD_FAIL; } - err = cuStreamSynchronize(Stream); - if (err != CUDA_SUCCESS) { - DP("Error when synchronizing async data transfer from device to host. 
" - "Pointers: host = " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n", - DPxPTR(hst_ptr), DPxPTR(tgt_ptr), size); - CUDA_ERR_STRING(err); + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, + int64_t size) { + __tgt_async_info async_info; + + int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr, size, + &async_info); + if (rc != OFFLOAD_SUCCESS) { return OFFLOAD_FAIL; } - return OFFLOAD_SUCCESS; + return __tgt_rtl_synchronize(device_id, &async_info); } int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { @@ -747,8 +769,12 @@ } int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, - void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, - int32_t thread_limit, uint64_t loop_tripcount) { + void **tgt_args, + ptrdiff_t *tgt_offsets, + int32_t arg_num, int32_t team_num, + int32_t thread_limit, + uint64_t loop_tripcount, + __tgt_async_info *async_info) { // Set the context we are using. 
CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); if (err != CUDA_SUCCESS) { @@ -844,8 +870,7 @@ DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid, cudaThreadsPerBlock); - CUstream &Stream = DeviceInfo.getNextStream(device_id); - + CUstream Stream = selectStream(device_id, async_info); err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1, cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/, Stream, &args[0], 0); @@ -858,25 +883,35 @@ DP("Launch of entry point at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr)); - CUresult sync_err = cuStreamSynchronize(Stream); - if (sync_err != CUDA_SUCCESS) { - DP("Kernel execution error at " DPxMOD "!\n", DPxPTR(tgt_entry_ptr)); - CUDA_ERR_STRING(sync_err); - return OFFLOAD_FAIL; - } else { - DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr)); - } - return OFFLOAD_SUCCESS; } int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, - void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) { + void **tgt_args, ptrdiff_t *tgt_offsets, + int32_t arg_num, + __tgt_async_info *async_info) { // use one team and the default number of threads. const int32_t team_num = 1; const int32_t thread_limit = 0; return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args, - tgt_offsets, arg_num, team_num, thread_limit, 0); + tgt_offsets, arg_num, team_num, + thread_limit, 0, async_info); +} + +int32_t __tgt_rtl_synchronize(int32_t device_id, __tgt_async_info *async_info) { + assert(async_info && "async_info is nullptr"); + assert(async_info->Queue && "async_info->Queue is nullptr"); + + auto Stream = reinterpret_cast<CUstream>(async_info->Queue); + CUresult Err = cuStreamSynchronize(Stream); + if (Err != CUDA_SUCCESS) { + DP("Error when synchronizing stream. 
stream = " DPxMOD + ", async info ptr = " DPxMOD "\n", + DPxPTR(Stream), DPxPTR(async_info)); + CUDA_ERR_STRING(Err); + return OFFLOAD_FAIL; + } + return OFFLOAD_SUCCESS; } #ifdef __cplusplus diff --git a/openmp/libomptarget/plugins/exports b/openmp/libomptarget/plugins/exports --- a/openmp/libomptarget/plugins/exports +++ b/openmp/libomptarget/plugins/exports @@ -7,10 +7,13 @@ __tgt_rtl_load_binary; __tgt_rtl_data_alloc; __tgt_rtl_data_submit; + __tgt_rtl_data_submit_async; __tgt_rtl_data_retrieve; + __tgt_rtl_data_retrieve_async; __tgt_rtl_data_delete; __tgt_rtl_run_target_team_region; __tgt_rtl_run_target_region; + __tgt_rtl_synchronize; local: *; }; diff --git a/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp --- a/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp +++ b/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp @@ -293,9 +293,11 @@ return OFFLOAD_SUCCESS; } -int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, - void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, - int32_t thread_limit, uint64_t loop_tripcount /*not used*/) { +int32_t __tgt_rtl_run_target_team_region( + int32_t device_id, void *tgt_entry_ptr, void **tgt_args, + ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, + int32_t thread_limit, uint64_t loop_tripcount /*not used*/, + __tgt_async_info *async_info /*not used*/) { // ignore team num and thread limit. // Use libffi to launch execution. @@ -328,10 +330,13 @@ } int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, - void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) { + void **tgt_args, ptrdiff_t *tgt_offsets, + int32_t arg_num, + __tgt_async_info *async_info) { // use one team and one thread. 
return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args, - tgt_offsets, arg_num, 1, 1, 0); + tgt_offsets, arg_num, 1, 1, 0, + async_info); } #ifdef __cplusplus diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp --- a/openmp/libomptarget/src/api.cpp +++ b/openmp/libomptarget/src/api.cpp @@ -161,19 +161,19 @@ } else if (src_device == omp_get_initial_device()) { DP("copy from host to device\n"); DeviceTy& DstDev = Devices[dst_device]; - rc = DstDev.data_submit(dstAddr, srcAddr, length); + rc = DstDev.data_submit(dstAddr, srcAddr, length, nullptr); } else if (dst_device == omp_get_initial_device()) { DP("copy from device to host\n"); DeviceTy& SrcDev = Devices[src_device]; - rc = SrcDev.data_retrieve(dstAddr, srcAddr, length); + rc = SrcDev.data_retrieve(dstAddr, srcAddr, length, nullptr); } else { DP("copy from device to device\n"); void *buffer = malloc(length); DeviceTy& SrcDev = Devices[src_device]; DeviceTy& DstDev = Devices[dst_device]; - rc = SrcDev.data_retrieve(buffer, srcAddr, length); + rc = SrcDev.data_retrieve(buffer, srcAddr, length, nullptr); if (rc == OFFLOAD_SUCCESS) - rc = DstDev.data_submit(dstAddr, buffer, length); + rc = DstDev.data_submit(dstAddr, buffer, length, nullptr); free(buffer); } diff --git a/openmp/libomptarget/src/device.h b/openmp/libomptarget/src/device.h --- a/openmp/libomptarget/src/device.h +++ b/openmp/libomptarget/src/device.h @@ -24,6 +24,7 @@ struct RTLInfoTy; struct __tgt_bin_desc; struct __tgt_target_table; +struct __tgt_async_info; /// Map between host data and target data. struct HostDataToTargetTy { @@ -173,14 +174,20 @@ int32_t initOnce(); __tgt_target_table *load_binary(void *Img); - int32_t data_submit(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size); - int32_t data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size); + // Asynchronous data transfer. When AsyncInfoPtr is nullptr, the transfer will + // be synchronous. 
+ int32_t data_submit(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size, + __tgt_async_info *AsyncInfoPtr); + int32_t data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size, + __tgt_async_info *AsyncInfoPtr); int32_t run_region(void *TgtEntryPtr, void **TgtVarsPtr, - ptrdiff_t *TgtOffsets, int32_t TgtVarsSize); + ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, + __tgt_async_info *AsyncInfo); int32_t run_team_region(void *TgtEntryPtr, void **TgtVarsPtr, - ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, int32_t NumTeams, - int32_t ThreadLimit, uint64_t LoopTripCount); + ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, + int32_t NumTeams, int32_t ThreadLimit, + uint64_t LoopTripCount, __tgt_async_info *AsyncInfo); private: // Call to RTL diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp --- a/openmp/libomptarget/src/device.cpp +++ b/openmp/libomptarget/src/device.cpp @@ -331,31 +331,47 @@ return rc; } -// Submit data to device. +// Submit data to device asynchronously. int32_t DeviceTy::data_submit(void *TgtPtrBegin, void *HstPtrBegin, - int64_t Size) { - return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size); + int64_t Size, __tgt_async_info *AsyncInfoPtr) { + // If no async info or the async handler is null, fall back to sync version + if (!AsyncInfoPtr || !RTL->data_submit_async) { + return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size); + } else { + return RTL->data_submit_async(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size, + AsyncInfoPtr); + } } -// Retrieve data from device. +// Retrieve data from device asynchronously. 
int32_t DeviceTy::data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, - int64_t Size) { - return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size); + int64_t Size, __tgt_async_info *AsyncInfoPtr) { + // If no async info or the async handler is null, fall back to sync version + if (!AsyncInfoPtr || !RTL->data_retrieve_async) { + return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size); + } else { + return RTL->data_retrieve_async(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size, + AsyncInfoPtr); + } } // Run region on device int32_t DeviceTy::run_region(void *TgtEntryPtr, void **TgtVarsPtr, - ptrdiff_t *TgtOffsets, int32_t TgtVarsSize) { + ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, + __tgt_async_info *AsyncInfo) { return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets, - TgtVarsSize); + TgtVarsSize, AsyncInfo); } // Run team region on device. int32_t DeviceTy::run_team_region(void *TgtEntryPtr, void **TgtVarsPtr, - ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, int32_t NumTeams, - int32_t ThreadLimit, uint64_t LoopTripCount) { + ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, + int32_t NumTeams, int32_t ThreadLimit, + uint64_t LoopTripCount, + __tgt_async_info *AsyncInfo) { return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets, - TgtVarsSize, NumTeams, ThreadLimit, LoopTripCount); + TgtVarsSize, NumTeams, ThreadLimit, LoopTripCount, + AsyncInfo); } /// Check whether a device has an associated RTL and initialize it if it's not diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -215,8 +215,9 @@ } /// Internal function to do the mapping and transfer the data to the device -int target_data_begin(DeviceTy &Device, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { +int target_data_begin(DeviceTy &Device, int32_t arg_num, void **args_base, + void 
**args, int64_t *arg_sizes, int64_t *arg_types, + __tgt_async_info *async_info_ptr) { // process each input. for (int32_t i = 0; i < arg_num; ++i) { // Ignore private variables and arrays - there is no mapping for them. @@ -316,8 +317,9 @@ if (copy && !IsHostPtr) { DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", - data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); - int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, data_size); + data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); + int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, data_size, + async_info_ptr); if (rt != OFFLOAD_SUCCESS) { DP("Copying data to device failed.\n"); return OFFLOAD_FAIL; @@ -331,7 +333,7 @@ uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; void *TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta); int rt = Device.data_submit(Pointer_TgtPtrBegin, &TgtPtrBase, - sizeof(void *)); + sizeof(void *), async_info_ptr); if (rt != OFFLOAD_SUCCESS) { DP("Copying data to device failed.\n"); return OFFLOAD_FAIL; @@ -349,7 +351,8 @@ /// Internal function to undo the mapping and retrieve the data from the device. int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base, - void **args, int64_t *arg_sizes, int64_t *arg_types) { + void **args, int64_t *arg_sizes, int64_t *arg_types, + __tgt_async_info *async_info_ptr) { // process each input. for (int32_t i = arg_num - 1; i >= 0; --i) { // Ignore private variables and arrays - there is no mapping for them. 
@@ -419,8 +422,9 @@ !(RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && TgtPtrBegin == HstPtrBegin)) { DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", - data_size, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); - int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, data_size); + data_size, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); + int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, data_size, + async_info_ptr); if (rt != OFFLOAD_SUCCESS) { DP("Copying data from device failed.\n"); return OFFLOAD_FAIL; @@ -509,7 +513,7 @@ if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) { DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); - int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, MapSize); + int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, MapSize, nullptr); if (rt != OFFLOAD_SUCCESS) { DP("Copying data from device failed.\n"); return OFFLOAD_FAIL; @@ -536,7 +540,7 @@ if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); - int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, MapSize); + int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, MapSize, nullptr); if (rt != OFFLOAD_SUCCESS) { DP("Copying data to device failed.\n"); return OFFLOAD_FAIL; @@ -556,7 +560,7 @@ "pointer " DPxMOD "\n", DPxPTR(it->second.TgtPtrVal), DPxPTR(it->second.TgtPtrAddr)); rt = Device.data_submit(it->second.TgtPtrAddr, - &it->second.TgtPtrVal, sizeof(void *)); + &it->second.TgtPtrVal, sizeof(void *), nullptr); if (rt != OFFLOAD_SUCCESS) { DP("Copying data to device failed.\n"); Device.ShadowMtx.unlock(); @@ -638,9 +642,11 @@ TrlTblMtx->unlock(); assert(TargetTable && "Global data has not been mapped\n"); + __tgt_async_info AsynInfo; + // Move data to device. 
int rc = target_data_begin(Device, arg_num, args_base, args, arg_sizes, - arg_types); + arg_types, &AsynInfo); if (rc != OFFLOAD_SUCCESS) { DP("Call to target_data_begin failed, abort target.\n"); return OFFLOAD_FAIL; @@ -691,7 +697,7 @@ DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n", DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin)); int rt = Device.data_submit(TgtPtrBegin, &Pointer_TgtPtrBegin, - sizeof(void *)); + sizeof(void *), &AsynInfo); if (rt != OFFLOAD_SUCCESS) { DP("Copying data to device failed.\n"); return OFFLOAD_FAIL; @@ -732,9 +738,10 @@ #endif // If first-private, copy data from host if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { - int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i]); + int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i], + &AsynInfo); if (rt != OFFLOAD_SUCCESS) { - DP ("Copying data to device failed, failed.\n"); + DP("Copying data to device failed, failed.\n"); return OFFLOAD_FAIL; } } @@ -780,11 +787,12 @@ DPxPTR(TargetTable->EntriesBegin[TM->Index].addr), TM->Index); if (IsTeamConstruct) { rc = Device.run_team_region(TargetTable->EntriesBegin[TM->Index].addr, - &tgt_args[0], &tgt_offsets[0], tgt_args.size(), team_num, - thread_limit, ltc); + &tgt_args[0], &tgt_offsets[0], tgt_args.size(), + team_num, thread_limit, ltc, &AsynInfo); } else { rc = Device.run_region(TargetTable->EntriesBegin[TM->Index].addr, - &tgt_args[0], &tgt_offsets[0], tgt_args.size()); + &tgt_args[0], &tgt_offsets[0], tgt_args.size(), + &AsynInfo); } if (rc != OFFLOAD_SUCCESS) { DP ("Executing target region abort target.\n"); @@ -802,11 +810,17 @@ // Move data from device. int rt = target_data_end(Device, arg_num, args_base, args, arg_sizes, - arg_types); + arg_types, &AsynInfo); if (rt != OFFLOAD_SUCCESS) { DP("Call to target_data_end failed, abort targe.\n"); return OFFLOAD_FAIL; } + // If synchronize is not nullptr, we're using asynchronous operations above so + // we need a final synchronization. 
+ if (Device.RTL->synchronize) { + return Device.RTL->synchronize(Device.RTLDeviceID, &AsynInfo); + } + return OFFLOAD_SUCCESS; } diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h --- a/openmp/libomptarget/src/private.h +++ b/openmp/libomptarget/src/private.h @@ -18,10 +18,13 @@ #include <cstdint> extern int target_data_begin(DeviceTy &Device, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types); + void **args_base, void **args, int64_t *arg_sizes, + int64_t *arg_types, + __tgt_async_info *async_info_ptr = nullptr); extern int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base, - void **args, int64_t *arg_sizes, int64_t *arg_types); + void **args, int64_t *arg_sizes, int64_t *arg_types, + __tgt_async_info *async_info_ptr = nullptr); extern int target_data_update(DeviceTy &Device, int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types); diff --git a/openmp/libomptarget/src/rtl.h b/openmp/libomptarget/src/rtl.h --- a/openmp/libomptarget/src/rtl.h +++ b/openmp/libomptarget/src/rtl.h @@ -31,13 +31,19 @@ typedef __tgt_target_table *(load_binary_ty)(int32_t, void *); typedef void *(data_alloc_ty)(int32_t, int64_t, void *); typedef int32_t(data_submit_ty)(int32_t, void *, void *, int64_t); + typedef int32_t(data_submit_async_ty)(int32_t, void *, void *, int64_t, + __tgt_async_info *); typedef int32_t(data_retrieve_ty)(int32_t, void *, void *, int64_t); + typedef int32_t(data_retrieve_async_ty)(int32_t, void *, void *, int64_t, + __tgt_async_info *); typedef int32_t(data_delete_ty)(int32_t, void *); - typedef int32_t(run_region_ty)(int32_t, void *, void **, ptrdiff_t *, - int32_t); + typedef int32_t(run_region_ty)(int32_t, void *, void **, ptrdiff_t *, int32_t, + __tgt_async_info *); typedef int32_t(run_team_region_ty)(int32_t, void *, void **, ptrdiff_t *, - int32_t, int32_t, int32_t, uint64_t); + int32_t, int32_t, int32_t, uint64_t, + __tgt_async_info *); typedef 
int64_t(init_requires_ty)(int64_t); + typedef int32_t(synchronize_ty)(int32_t, __tgt_async_info *); int32_t Idx = -1; // RTL index, index is the number of devices // of other RTLs that were registered before, @@ -59,10 +65,13 @@ data_alloc_ty *data_alloc = nullptr; data_submit_ty *data_submit = nullptr; data_retrieve_ty *data_retrieve = nullptr; + data_submit_async_ty *data_submit_async = nullptr; + data_retrieve_async_ty *data_retrieve_async = nullptr; data_delete_ty *data_delete = nullptr; run_region_ty *run_region = nullptr; run_team_region_ty *run_team_region = nullptr; init_requires_ty *init_requires = nullptr; + synchronize_ty *synchronize = nullptr; // Are there images associated with this RTL. bool isUsed = false; @@ -90,11 +99,14 @@ data_alloc = r.data_alloc; data_submit = r.data_submit; data_retrieve = r.data_retrieve; + data_submit_async = r.data_submit_async; + data_retrieve_async = r.data_retrieve_async; data_delete = r.data_delete; run_region = r.run_region; run_team_region = r.run_team_region; init_requires = r.init_requires; isUsed = r.isUsed; + synchronize = r.synchronize; } }; diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp --- a/openmp/libomptarget/src/rtl.cpp +++ b/openmp/libomptarget/src/rtl.cpp @@ -130,6 +130,11 @@ // Optional functions *((void**) &R.init_requires) = dlsym( dynlib_handle, "__tgt_rtl_init_requires"); + *((void **)&R.data_retrieve_async) = + dlsym(dynlib_handle, "__tgt_rtl_data_retrieve_async"); + *((void **)&R.data_submit_async) = + dlsym(dynlib_handle, "__tgt_rtl_data_submit_async"); + *((void **)&R.synchronize) = dlsym(dynlib_handle, "__tgt_rtl_synchronize"); // No devices are supported by this RTL? if (!(R.NumberOfDevices = R.number_of_devices())) {