Index: openmp/libomptarget/include/omptarget.h =================================================================== --- openmp/libomptarget/include/omptarget.h +++ openmp/libomptarget/include/omptarget.h @@ -111,6 +111,15 @@ *EntriesEnd; // End of the table with all the entries (non inclusive) }; +/// This struct contains information exchanged between different asynchronous +/// operations for device-dependent optimization and potential synchronization +struct __tgt_async_info { + // A pointer to a queue-like structure where offloading operations are issued. + // We assume to use this structure to do synchronization. In CUDA backend, it + // is CUstream. + void *Queue = nullptr; +}; + #ifdef __cplusplus extern "C" { #endif Index: openmp/libomptarget/include/omptargetplugin.h =================================================================== --- openmp/libomptarget/include/omptargetplugin.h +++ openmp/libomptarget/include/omptargetplugin.h @@ -58,15 +58,21 @@ // case an error occurred on the target device. void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr); -// Pass the data content to the target device using the target address. -// In case of success, return zero. Otherwise, return an error code. +// Pass the data content to the target device using the target address. If +// AsyncInfoPtr is nullptr, it is synchronous; otherwise it is asynchronous. +// However, AsyncInfoPtr may be ignored on some platforms, like x86_64. In that +// case, it is synchronous. In case of success, return zero. Otherwise, return +// an error code. int32_t __tgt_rtl_data_submit(int32_t ID, void *TargetPtr, void *HostPtr, - int64_t Size); + int64_t Size, __tgt_async_info *AsyncInfoPtr); -// Retrieve the data content from the target device using its address. -// In case of success, return zero. Otherwise, return an error code. +// Retrieve the data content from the target device using its address. 
If +// AsyncInfoPtr is nullptr, it is synchronous; otherwise it is asynchronous. +// However, AsyncInfoPtr may be ignored on some platforms, like x86_64. In that +// case, it is synchronous. In case of success, return zero. Otherwise, return +// an error code. int32_t __tgt_rtl_data_retrieve(int32_t ID, void *HostPtr, void *TargetPtr, - int64_t Size); + int64_t Size, __tgt_async_info *AsyncInfoPtr); // De-allocate the data referenced by target ptr on the device. In case of // success, return zero. Otherwise, return an error code. @@ -75,17 +81,28 @@ // Transfer control to the offloaded entry Entry on the target device. // Args and Offsets are arrays of NumArgs size of target addresses and // offsets. An offset should be added to the target address before passing it -// to the outlined function on device side. In case of success, return zero. -// Otherwise, return an error code. +// to the outlined function on device side. If AsyncInfoPtr is nullptr, it is +// synchronous; otherwise it is asynchronous. However, AsyncInfoPtr may be +// ignored on some platforms, like x86_64. In that case, it is synchronous. In +// case of success, return zero. Otherwise, return an error code. int32_t __tgt_rtl_run_target_region(int32_t ID, void *Entry, void **Args, - ptrdiff_t *Offsets, int32_t NumArgs); + ptrdiff_t *Offsets, int32_t NumArgs, + __tgt_async_info *AsyncInfoPtr); // Similar to __tgt_rtl_run_target_region, but additionally specify the -// number of teams to be created and a number of threads in each team. +// number of teams to be created and a number of threads in each team. If +// AsyncInfoPtr is nullptr, it is synchronous; otherwise it is asynchronous. +// However, AsyncInfoPtr may be ignored on some platforms, like x86_64. In that +// case, it is synchronous. 
int32_t __tgt_rtl_run_target_team_region(int32_t ID, void *Entry, void **Args, ptrdiff_t *Offsets, int32_t NumArgs, int32_t NumTeams, int32_t ThreadLimit, - uint64_t loop_tripcount); + uint64_t loop_tripcount, + __tgt_async_info *AsyncInfoPtr); + +// Device synchronization. In case of success, return zero. Otherwise, return an +// error code. +int32_t __tgt_rtl_synchronize(int32_t ID, __tgt_async_info *AsyncInfoPtr); #ifdef __cplusplus } Index: openmp/libomptarget/plugins/cuda/src/rtl.cpp =================================================================== --- openmp/libomptarget/plugins/cuda/src/rtl.cpp +++ openmp/libomptarget/plugins/cuda/src/rtl.cpp @@ -309,6 +309,68 @@ static RTLDeviceInfoTy DeviceInfo; +namespace { +CUstream selectStream(int32_t Id, __tgt_async_info *AsyncInfo) { + if (!AsyncInfo) + return DeviceInfo.getNextStream(Id); + + if (!AsyncInfo->Queue) + AsyncInfo->Queue = DeviceInfo.getNextStream(Id); + + return reinterpret_cast<CUstream>(AsyncInfo->Queue); /* Queue is an opaque void *; it was filled from getNextStream() on first use and is reused afterwards */ +} + +int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size, + __tgt_async_info *AsyncInfoPtr) { + assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); + // Set the context we are using. + CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[DeviceId]); + if (err != CUDA_SUCCESS) { + DP("Error when setting CUDA context\n"); + CUDA_ERR_STRING(err); + return OFFLOAD_FAIL; + } + + CUstream Stream = selectStream(DeviceId, AsyncInfoPtr); + + err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream); + if (err != CUDA_SUCCESS) { + DP("Error when copying data from device to host. Pointers: host = " DPxMOD + ", device = " DPxMOD ", size = %" PRId64 "\n", + DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); + CUDA_ERR_STRING(err); + return OFFLOAD_FAIL; + } + + return OFFLOAD_SUCCESS; +} + +int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size, + __tgt_async_info *AsyncInfoPtr) { + assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); + // Set the context we are using.
+ CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[DeviceId]); + if (err != CUDA_SUCCESS) { + DP("Error when setting CUDA context\n"); + CUDA_ERR_STRING(err); + return OFFLOAD_FAIL; + } + + CUstream Stream = selectStream(DeviceId, AsyncInfoPtr); + + err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream); + if (err != CUDA_SUCCESS) { + DP("Error when copying data from host to device. Pointers: host = " DPxMOD + ", device = " DPxMOD ", size = %" PRId64 "\n", + DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); + CUDA_ERR_STRING(err); + return OFFLOAD_FAIL; + } + + return OFFLOAD_SUCCESS; +} +} // namespace + #ifdef __cplusplus extern "C" { #endif @@ -663,69 +725,44 @@ } int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, - int64_t size) { - // Set the context we are using. - CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); - if (err != CUDA_SUCCESS) { - DP("Error when setting CUDA context\n"); - CUDA_ERR_STRING(err); - return OFFLOAD_FAIL; - } - - CUstream &Stream = DeviceInfo.getNextStream(device_id); + int64_t size, __tgt_async_info *async_info_ptr) { + // The function dataSubmit is always asynchronous. Considering some data + // transfer must be synchronous, we assume if async_info_ptr is nullptr, the + // transfer will be synchronous by creating a temporary async info and then + // synchronizing after call dataSubmit; otherwise, it is asynchronous. + if (async_info_ptr) { + return dataSubmit(device_id, tgt_ptr, hst_ptr, size, async_info_ptr); + } else { + __tgt_async_info async_info; - err = cuMemcpyHtoDAsync((CUdeviceptr)tgt_ptr, hst_ptr, size, Stream); - if (err != CUDA_SUCCESS) { - DP("Error when copying data from host to device. 
Pointers: host = " DPxMOD - ", device = " DPxMOD ", size = %" PRId64 "\n", - DPxPTR(hst_ptr), DPxPTR(tgt_ptr), size); - CUDA_ERR_STRING(err); - return OFFLOAD_FAIL; - } + const int32_t rc = + dataSubmit(device_id, tgt_ptr, hst_ptr, size, &async_info); + if (rc != OFFLOAD_SUCCESS) { + return OFFLOAD_FAIL; + } - err = cuStreamSynchronize(Stream); - if (err != CUDA_SUCCESS) { - DP("Error when synchronizing async data transfer from host to device. " - "Pointers: host = " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n", - DPxPTR(hst_ptr), DPxPTR(tgt_ptr), size); - CUDA_ERR_STRING(err); - return OFFLOAD_FAIL; + return __tgt_rtl_synchronize(device_id, &async_info); } - - return OFFLOAD_SUCCESS; } int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, - int64_t size) { - // Set the context we are using. - CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); - if (err != CUDA_SUCCESS) { - DP("Error when setting CUDA context\n"); - CUDA_ERR_STRING(err); - return OFFLOAD_FAIL; - } - - CUstream &Stream = DeviceInfo.getNextStream(device_id); - - err = cuMemcpyDtoHAsync(hst_ptr, (CUdeviceptr)tgt_ptr, size, Stream); - if (err != CUDA_SUCCESS) { - DP("Error when copying data from device to host. Pointers: host = " DPxMOD - ", device = " DPxMOD ", size = %" PRId64 "\n", - DPxPTR(hst_ptr), DPxPTR(tgt_ptr), size); - CUDA_ERR_STRING(err); - return OFFLOAD_FAIL; - } - - err = cuStreamSynchronize(Stream); - if (err != CUDA_SUCCESS) { - DP("Error when synchronizing async data transfer from device to host. " - "Pointers: host = " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n", - DPxPTR(hst_ptr), DPxPTR(tgt_ptr), size); - CUDA_ERR_STRING(err); - return OFFLOAD_FAIL; + int64_t size, + __tgt_async_info *async_info_ptr) { + // The function dataRetrieve is always asynchronous. 
Considering some data + // transfer must be synchronous, we assume if async_info_ptr is nullptr, the + // transfer will be synchronous by creating a temporary async info and then + // synchronizing after call dataRetrieve; otherwise, it is asynchronous. + if (async_info_ptr) { + return dataRetrieve(device_id, hst_ptr, tgt_ptr, size, async_info_ptr); + } else { + __tgt_async_info async_info; + const int32_t rc = + dataRetrieve(device_id, hst_ptr, tgt_ptr, size, &async_info); + if (rc != OFFLOAD_SUCCESS) { + return OFFLOAD_FAIL; + } + return __tgt_rtl_synchronize(device_id, &async_info); } - - return OFFLOAD_SUCCESS; } int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { @@ -747,8 +784,12 @@ } int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, - void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, - int32_t thread_limit, uint64_t loop_tripcount) { + void **tgt_args, + ptrdiff_t *tgt_offsets, + int32_t arg_num, int32_t team_num, + int32_t thread_limit, + uint64_t loop_tripcount, + __tgt_async_info *async_info) { // Set the context we are using. 
CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); if (err != CUDA_SUCCESS) { @@ -844,8 +885,7 @@ DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid, cudaThreadsPerBlock); - CUstream &Stream = DeviceInfo.getNextStream(device_id); - + CUstream Stream = selectStream(device_id, async_info); /* NOTE(review): when async_info is nullptr nothing below waits on Stream any more, although omptargetplugin.h documents a nullptr AsyncInfoPtr as synchronous - confirm whether a cuStreamSynchronize is needed for that case */ err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1, cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/, Stream, &args[0], 0); @@ -858,25 +898,35 @@ DP("Launch of entry point at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr)); - CUresult sync_err = cuStreamSynchronize(Stream); - if (sync_err != CUDA_SUCCESS) { - DP("Kernel execution error at " DPxMOD "!\n", DPxPTR(tgt_entry_ptr)); - CUDA_ERR_STRING(sync_err); - return OFFLOAD_FAIL; - } else { - DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr)); - } - return OFFLOAD_SUCCESS; } int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, - void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) { + void **tgt_args, ptrdiff_t *tgt_offsets, + int32_t arg_num, + __tgt_async_info *async_info) { // use one team and the default number of threads. const int32_t team_num = 1; const int32_t thread_limit = 0; return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args, - tgt_offsets, arg_num, team_num, thread_limit, 0); + tgt_offsets, arg_num, team_num, + thread_limit, 0, async_info); +} + +int32_t __tgt_rtl_synchronize(int32_t device_id, __tgt_async_info *async_info) { + assert(async_info && "async_info is nullptr"); + assert(async_info->Queue && "async_info->Queue is nullptr"); + + CUstream Stream = reinterpret_cast<CUstream>(async_info->Queue); /* Queue is stored as void *; cast it back to the stream type before waiting on it */ + CUresult Err = cuStreamSynchronize(Stream); + if (Err != CUDA_SUCCESS) { + DP("Error when synchronizing stream.
stream = " DPxMOD + ", async info ptr = " DPxMOD "\n", + DPxPTR(Stream), DPxPTR(async_info)); + CUDA_ERR_STRING(Err); + return OFFLOAD_FAIL; + } + return OFFLOAD_SUCCESS; } #ifdef __cplusplus Index: openmp/libomptarget/plugins/exports =================================================================== --- openmp/libomptarget/plugins/exports +++ openmp/libomptarget/plugins/exports @@ -11,6 +11,7 @@ __tgt_rtl_data_delete; __tgt_rtl_run_target_team_region; __tgt_rtl_run_target_region; + __tgt_rtl_synchronize; local: *; }; Index: openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp =================================================================== --- openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp +++ openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp @@ -277,13 +277,13 @@ } int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, - int64_t size) { + int64_t size, __tgt_async_info *) { memcpy(tgt_ptr, hst_ptr, size); return OFFLOAD_SUCCESS; } int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, - int64_t size) { + int64_t size, __tgt_async_info *) { memcpy(hst_ptr, tgt_ptr, size); return OFFLOAD_SUCCESS; } @@ -293,9 +293,11 @@ return OFFLOAD_SUCCESS; } -int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, - void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, - int32_t thread_limit, uint64_t loop_tripcount /*not used*/) { +int32_t __tgt_rtl_run_target_team_region( + int32_t device_id, void *tgt_entry_ptr, void **tgt_args, + ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, + int32_t thread_limit, uint64_t loop_tripcount /*not used*/, + __tgt_async_info *async_info /*not used*/) { // ignore team num and thread limit. // Use libffi to launch execution. 
@@ -328,10 +330,18 @@ } int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, - void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) { + void **tgt_args, ptrdiff_t *tgt_offsets, + int32_t arg_num, + __tgt_async_info *async_info_ptr) { // use one team and one thread. return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args, - tgt_offsets, arg_num, 1, 1, 0); + tgt_offsets, arg_num, 1, 1, 0, + async_info_ptr); +} + +int32_t __tgt_rtl_synchronize(int32_t device_id, + __tgt_async_info *async_info_ptr) { + return OFFLOAD_SUCCESS; } #ifdef __cplusplus Index: openmp/libomptarget/src/api.cpp =================================================================== --- openmp/libomptarget/src/api.cpp +++ openmp/libomptarget/src/api.cpp @@ -161,19 +161,19 @@ } else if (src_device == omp_get_initial_device()) { DP("copy from host to device\n"); DeviceTy& DstDev = Devices[dst_device]; - rc = DstDev.data_submit(dstAddr, srcAddr, length); + rc = DstDev.data_submit(dstAddr, srcAddr, length, nullptr); } else if (dst_device == omp_get_initial_device()) { DP("copy from device to host\n"); DeviceTy& SrcDev = Devices[src_device]; - rc = SrcDev.data_retrieve(dstAddr, srcAddr, length); + rc = SrcDev.data_retrieve(dstAddr, srcAddr, length, nullptr); } else { DP("copy from device to device\n"); void *buffer = malloc(length); DeviceTy& SrcDev = Devices[src_device]; DeviceTy& DstDev = Devices[dst_device]; - rc = SrcDev.data_retrieve(buffer, srcAddr, length); + rc = SrcDev.data_retrieve(buffer, srcAddr, length, nullptr); if (rc == OFFLOAD_SUCCESS) - rc = DstDev.data_submit(dstAddr, buffer, length); + rc = DstDev.data_submit(dstAddr, buffer, length, nullptr); free(buffer); } Index: openmp/libomptarget/src/device.h =================================================================== --- openmp/libomptarget/src/device.h +++ openmp/libomptarget/src/device.h @@ -24,6 +24,7 @@ struct RTLInfoTy; struct __tgt_bin_desc; struct __tgt_target_table; +struct 
__tgt_async_info; /// Map between host data and target data. struct HostDataToTargetTy { @@ -173,14 +174,20 @@ int32_t initOnce(); __tgt_target_table *load_binary(void *Img); - int32_t data_submit(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size); - int32_t data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size); + // Asynchronous data transfer. When AsyncInfoPtr is nullptr, the transfer will + // be synchronous. + int32_t data_submit(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size, + __tgt_async_info *AsyncInfoPtr); + int32_t data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size, + __tgt_async_info *AsyncInfoPtr); int32_t run_region(void *TgtEntryPtr, void **TgtVarsPtr, - ptrdiff_t *TgtOffsets, int32_t TgtVarsSize); + ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, + __tgt_async_info *AsyncInfo); int32_t run_team_region(void *TgtEntryPtr, void **TgtVarsPtr, - ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, int32_t NumTeams, - int32_t ThreadLimit, uint64_t LoopTripCount); + ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, + int32_t NumTeams, int32_t ThreadLimit, + uint64_t LoopTripCount, __tgt_async_info *AsyncInfo); private: // Call to RTL Index: openmp/libomptarget/src/device.cpp =================================================================== --- openmp/libomptarget/src/device.cpp +++ openmp/libomptarget/src/device.cpp @@ -331,31 +331,38 @@ return rc; } -// Submit data to device. +// Submit data to device int32_t DeviceTy::data_submit(void *TgtPtrBegin, void *HstPtrBegin, - int64_t Size) { - return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size); + int64_t Size, __tgt_async_info *AsyncInfoPtr) { + + return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size, + AsyncInfoPtr); } -// Retrieve data from device. 
+// Retrieve data from device int32_t DeviceTy::data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, - int64_t Size) { - return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size); + int64_t Size, __tgt_async_info *AsyncInfoPtr) { + return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size, + AsyncInfoPtr); } // Run region on device int32_t DeviceTy::run_region(void *TgtEntryPtr, void **TgtVarsPtr, - ptrdiff_t *TgtOffsets, int32_t TgtVarsSize) { + ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, + __tgt_async_info *AsyncInfo) { return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets, - TgtVarsSize); + TgtVarsSize, AsyncInfo); } // Run team region on device. int32_t DeviceTy::run_team_region(void *TgtEntryPtr, void **TgtVarsPtr, - ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, int32_t NumTeams, - int32_t ThreadLimit, uint64_t LoopTripCount) { + ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, + int32_t NumTeams, int32_t ThreadLimit, + uint64_t LoopTripCount, + __tgt_async_info *AsyncInfo) { return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets, - TgtVarsSize, NumTeams, ThreadLimit, LoopTripCount); + TgtVarsSize, NumTeams, ThreadLimit, LoopTripCount, + AsyncInfo); } /// Check whether a device has an associated RTL and initialize it if it's not Index: openmp/libomptarget/src/omptarget.cpp =================================================================== --- openmp/libomptarget/src/omptarget.cpp +++ openmp/libomptarget/src/omptarget.cpp @@ -215,8 +215,9 @@ } /// Internal function to do the mapping and transfer the data to the device -int target_data_begin(DeviceTy &Device, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { +int target_data_begin(DeviceTy &Device, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int64_t *arg_types, + __tgt_async_info *async_info_ptr) { // process each input. 
for (int32_t i = 0; i < arg_num; ++i) { // Ignore private variables and arrays - there is no mapping for them. @@ -316,8 +317,9 @@ if (copy && !IsHostPtr) { DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", - data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); - int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, data_size); + data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); + int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, data_size, + async_info_ptr); if (rt != OFFLOAD_SUCCESS) { DP("Copying data to device failed.\n"); return OFFLOAD_FAIL; @@ -331,7 +333,7 @@ uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; void *TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta); int rt = Device.data_submit(Pointer_TgtPtrBegin, &TgtPtrBase, - sizeof(void *)); + sizeof(void *), async_info_ptr); if (rt != OFFLOAD_SUCCESS) { DP("Copying data to device failed.\n"); return OFFLOAD_FAIL; @@ -349,7 +351,8 @@ /// Internal function to undo the mapping and retrieve the data from the device. int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base, - void **args, int64_t *arg_sizes, int64_t *arg_types) { + void **args, int64_t *arg_sizes, int64_t *arg_types, + __tgt_async_info *async_info_ptr) { // process each input. for (int32_t i = arg_num - 1; i >= 0; --i) { // Ignore private variables and arrays - there is no mapping for them. 
@@ -419,8 +422,9 @@ !(RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && TgtPtrBegin == HstPtrBegin)) { DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", - data_size, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); - int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, data_size); + data_size, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); + int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, data_size, + async_info_ptr); if (rt != OFFLOAD_SUCCESS) { DP("Copying data from device failed.\n"); return OFFLOAD_FAIL; @@ -509,7 +513,7 @@ if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) { DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); - int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, MapSize); + int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, MapSize, nullptr); if (rt != OFFLOAD_SUCCESS) { DP("Copying data from device failed.\n"); return OFFLOAD_FAIL; @@ -536,7 +540,7 @@ if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); - int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, MapSize); + int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, MapSize, nullptr); if (rt != OFFLOAD_SUCCESS) { DP("Copying data to device failed.\n"); return OFFLOAD_FAIL; @@ -556,7 +560,7 @@ "pointer " DPxMOD "\n", DPxPTR(it->second.TgtPtrVal), DPxPTR(it->second.TgtPtrAddr)); rt = Device.data_submit(it->second.TgtPtrAddr, - &it->second.TgtPtrVal, sizeof(void *)); + &it->second.TgtPtrVal, sizeof(void *), nullptr); if (rt != OFFLOAD_SUCCESS) { DP("Copying data to device failed.\n"); Device.ShadowMtx.unlock(); @@ -638,9 +642,11 @@ TrlTblMtx->unlock(); assert(TargetTable && "Global data has not been mapped\n"); + __tgt_async_info AsyncInfo; + // Move data to device. 
int rc = target_data_begin(Device, arg_num, args_base, args, arg_sizes, - arg_types); + arg_types, &AsyncInfo); if (rc != OFFLOAD_SUCCESS) { DP("Call to target_data_begin failed, abort target.\n"); return OFFLOAD_FAIL; @@ -691,7 +697,7 @@ DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n", DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin)); int rt = Device.data_submit(TgtPtrBegin, &Pointer_TgtPtrBegin, - sizeof(void *)); + sizeof(void *), &AsyncInfo); if (rt != OFFLOAD_SUCCESS) { DP("Copying data to device failed.\n"); return OFFLOAD_FAIL; @@ -732,9 +738,10 @@ #endif // If first-private, copy data from host if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { - int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i]); + int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i], + &AsyncInfo); if (rt != OFFLOAD_SUCCESS) { - DP ("Copying data to device failed, failed.\n"); + DP("Copying data to device failed, failed.\n"); return OFFLOAD_FAIL; } } @@ -780,11 +787,12 @@ DPxPTR(TargetTable->EntriesBegin[TM->Index].addr), TM->Index); if (IsTeamConstruct) { rc = Device.run_team_region(TargetTable->EntriesBegin[TM->Index].addr, - &tgt_args[0], &tgt_offsets[0], tgt_args.size(), team_num, - thread_limit, ltc); + &tgt_args[0], &tgt_offsets[0], tgt_args.size(), + team_num, thread_limit, ltc, &AsyncInfo); } else { rc = Device.run_region(TargetTable->EntriesBegin[TM->Index].addr, - &tgt_args[0], &tgt_offsets[0], tgt_args.size()); + &tgt_args[0], &tgt_offsets[0], tgt_args.size(), + &AsyncInfo); } if (rc != OFFLOAD_SUCCESS) { DP ("Executing target region abort target.\n"); @@ -802,11 +810,11 @@ // Move data from device. 
int rt = target_data_end(Device, arg_num, args_base, args, arg_sizes, - arg_types); + arg_types, &AsyncInfo); if (rt != OFFLOAD_SUCCESS) { DP("Call to target_data_end failed, abort targe.\n"); return OFFLOAD_FAIL; } - return OFFLOAD_SUCCESS; + return Device.RTL->synchronize(Device.RTLDeviceID, &AsyncInfo); /* pass the RTL-local device id, as the DeviceTy wrappers do for every other RTL entry point; NOTE(review): assumes DeviceTy::RTLDeviceID is accessible here - confirm */ } Index: openmp/libomptarget/src/private.h =================================================================== --- openmp/libomptarget/src/private.h +++ openmp/libomptarget/src/private.h @@ -18,10 +18,13 @@ #include extern int target_data_begin(DeviceTy &Device, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types); + void **args_base, void **args, int64_t *arg_sizes, + int64_t *arg_types, + __tgt_async_info *async_info_ptr = nullptr); extern int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base, - void **args, int64_t *arg_sizes, int64_t *arg_types); + void **args, int64_t *arg_sizes, int64_t *arg_types, + __tgt_async_info *async_info_ptr = nullptr); extern int target_data_update(DeviceTy &Device, int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types); Index: openmp/libomptarget/src/rtl.h =================================================================== --- openmp/libomptarget/src/rtl.h +++ openmp/libomptarget/src/rtl.h @@ -30,14 +30,18 @@ typedef int32_t(init_device_ty)(int32_t); typedef __tgt_target_table *(load_binary_ty)(int32_t, void *); typedef void *(data_alloc_ty)(int32_t, int64_t, void *); - typedef int32_t(data_submit_ty)(int32_t, void *, void *, int64_t); - typedef int32_t(data_retrieve_ty)(int32_t, void *, void *, int64_t); + typedef int32_t(data_submit_ty)(int32_t, void *, void *, int64_t, + __tgt_async_info *); + typedef int32_t(data_retrieve_ty)(int32_t, void *, void *, int64_t, + __tgt_async_info *); typedef int32_t(data_delete_ty)(int32_t, void *); - typedef int32_t(run_region_ty)(int32_t, void *, void **, ptrdiff_t *, - int32_t); + typedef
int32_t(run_region_ty)(int32_t, void *, void **, ptrdiff_t *, int32_t, + __tgt_async_info *); typedef int32_t(run_team_region_ty)(int32_t, void *, void **, ptrdiff_t *, - int32_t, int32_t, int32_t, uint64_t); + int32_t, int32_t, int32_t, uint64_t, + __tgt_async_info *); typedef int64_t(init_requires_ty)(int64_t); + typedef int32_t(synchronize_ty)(int32_t, __tgt_async_info *); /* must match the plugin declaration: int32_t __tgt_rtl_synchronize(int32_t, __tgt_async_info *) */ int32_t Idx = -1; // RTL index, index is the number of devices // of other RTLs that were registered before, @@ -63,6 +67,7 @@ run_region_ty *run_region = nullptr; run_team_region_ty *run_team_region = nullptr; init_requires_ty *init_requires = nullptr; + synchronize_ty *synchronize = nullptr; // Are there images associated with this RTL. bool isUsed = false; @@ -95,6 +100,7 @@ run_team_region = r.run_team_region; init_requires = r.init_requires; isUsed = r.isUsed; + synchronize = r.synchronize; } }; Index: openmp/libomptarget/src/rtl.cpp =================================================================== --- openmp/libomptarget/src/rtl.cpp +++ openmp/libomptarget/src/rtl.cpp @@ -126,6 +126,9 @@ if (!(*((void**) &R.run_team_region) = dlsym( dynlib_handle, "__tgt_rtl_run_target_team_region"))) continue; + if (!(*((void**) &R.synchronize) = dlsym( + dynlib_handle, "__tgt_rtl_synchronize"))) + continue; // Optional functions *((void**) &R.init_requires) = dlsym(