diff --git a/openmp/libomptarget/include/omptargetplugin.h b/openmp/libomptarget/include/omptargetplugin.h --- a/openmp/libomptarget/include/omptargetplugin.h +++ b/openmp/libomptarget/include/omptargetplugin.h @@ -31,6 +31,11 @@ // having to load the library, which can be expensive. int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image); +// Return an integer other than zero if the data can be exchaned from SrcDevId +// to DstDevId. If it is data exchangable, the device plugin should provide +// function to move data from source device to destination device directly. +int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDevId, int32_t DstDevId); + // Initialize the requires flags for the device. int64_t __tgt_rtl_init_requires(int64_t RequiresFlags); @@ -77,6 +82,18 @@ void *TargetPtr, int64_t Size, __tgt_async_info *AsyncInfoPtr); +// Copy the data content from one target device to another target device using +// its address. This operation does not need to copy data back to host and then +// from host to another device. In case of success, return zero. Otherwise, +// return an error code. +int32_t __tgt_rtl_data_exchange(int32_t SrcID, void *SrcPtr, int32_t DstID, + void *DstPtr, int64_t Size); + +// Asynchronous version of __tgt_rtl_data_exchange +int32_t __tgt_rtl_data_exchange_async(int32_t SrcID, void *SrcPtr, + int32_t DesID, void *DstPtr, int64_t Size, + __tgt_async_info *AsyncInfoPtr); + // De-allocate the data referenced by target ptr on the device. In case of // success, return zero. Otherwise, return an error code. int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr); diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp --- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp @@ -100,6 +100,22 @@ return false; } +int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size, + CUstream Stream) { + CUresult Err = + cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream); + + if (Err != CUDA_SUCCESS) { + DP("Error when copying data from device to device. Pointers: src " + "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n", + DPxPTR(SrcPtr), DPxPTR(DstPtr), Size); + CUDA_ERR_STRING(Err); + return OFFLOAD_FAIL; + } + + return OFFLOAD_SUCCESS; +} + // Structure contains per-device data struct DeviceDataTy { std::list FuncGblEntries; @@ -736,6 +752,57 @@ return OFFLOAD_SUCCESS; } + int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr, + int64_t Size, __tgt_async_info *AsyncInfoPtr) const { + assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); + + CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context); + if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) + return OFFLOAD_FAIL; + + CUstream Stream = getStream(SrcDevId, AsyncInfoPtr); + + // If they are two devices, we try peer to peer copy first + if (SrcDevId != DstDevId) { + int CanAccessPeer = 0; + Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId); + if (Err != CUDA_SUCCESS) { + DP("Error returned from cuDeviceCanAccessPeer. src = %" PRId32 + ", dst = %" PRId32 "\n", + SrcDevId, DstDevId); + CUDA_ERR_STRING(Err); + return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); + } + + if (!CanAccessPeer) { + DP("P2P memcpy not supported so fall back to D2D memcpy"); + return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); + } + + Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0); + if (Err != CUDA_SUCCESS) { + DP("Error returned from cuCtxEnablePeerAccess. src = %" PRId32 + ", dst = %" PRId32 "\n", + SrcDevId, DstDevId); + CUDA_ERR_STRING(Err); + return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); + } + + Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context, + (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context, + Size, Stream); + if (Err == CUDA_SUCCESS) + return OFFLOAD_SUCCESS; + + DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD + ", src_id =%" PRId32 ", dst_ptr = %" DPxMOD ", dst_id =%" PRId32 "\n", + SrcPtr, SrcDevId, DstPtr, DstDevId); + CUDA_ERR_STRING(Err); + } + + return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); + } + int dataDelete(const int DeviceId, void *TgtPtr) const { CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) @@ -900,6 +967,14 @@ return RequiresFlags; } +int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int dst_dev_id) { + if (DeviceRTL.isValidDeviceId(src_dev_id) && + DeviceRTL.isValidDeviceId(dst_dev_id)) + return 1; + + return 0; +} + int32_t __tgt_rtl_init_device(int32_t device_id) { assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); @@ -965,6 +1040,33 @@ async_info_ptr); } +int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr, + int dst_dev_id, void *dst_ptr, + int64_t size, + __tgt_async_info *async_info_ptr) { + assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid"); + assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid"); + assert(async_info_ptr && "async_info_ptr is nullptr"); + + return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, + async_info_ptr); +} + +int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr, + int32_t dst_dev_id, void *dst_ptr, + int64_t size) { + assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid"); + assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid"); + + __tgt_async_info async_info; + const int32_t rc = __tgt_rtl_data_exchange_async( + src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &async_info); + if (rc != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + + return __tgt_rtl_synchronize(src_dev_id, &async_info); +} + int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); diff --git a/openmp/libomptarget/plugins/exports b/openmp/libomptarget/plugins/exports --- a/openmp/libomptarget/plugins/exports +++ b/openmp/libomptarget/plugins/exports @@ -1,6 +1,7 @@ VERS1.0 { global: __tgt_rtl_is_valid_binary; + __tgt_rtl_is_data_exchangable; __tgt_rtl_number_of_devices; __tgt_rtl_init_requires; __tgt_rtl_init_device; @@ -10,6 +11,8 @@ __tgt_rtl_data_submit_async; __tgt_rtl_data_retrieve; __tgt_rtl_data_retrieve_async; + __tgt_rtl_data_exchange; + __tgt_rtl_data_exchange_async; __tgt_rtl_data_delete; __tgt_rtl_run_target_team_region; __tgt_rtl_run_target_team_region_async; diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp --- a/openmp/libomptarget/src/api.cpp +++ b/openmp/libomptarget/src/api.cpp @@ -168,9 +168,17 @@ rc = SrcDev.data_retrieve(dstAddr, srcAddr, length, nullptr); } else { DP("copy from device to device\n"); + DeviceTy &SrcDev = Devices[src_device]; + DeviceTy &DstDev = Devices[dst_device]; + // First try to use D2D memcpy which is more efficient. If fails, fall back + // to unefficient way. + if (SrcDev.isDataExchangable(DstDev)) { + rc = SrcDev.data_exchange(srcAddr, DstDev, dstAddr, length, nullptr); + if (rc == OFFLOAD_SUCCESS) + return OFFLOAD_SUCCESS; + } + void *buffer = malloc(length); - DeviceTy& SrcDev = Devices[src_device]; - DeviceTy& DstDev = Devices[dst_device]; rc = SrcDev.data_retrieve(buffer, srcAddr, length, nullptr); if (rc == OFFLOAD_SUCCESS) rc = DstDev.data_submit(dstAddr, buffer, length, nullptr); diff --git a/openmp/libomptarget/src/device.h b/openmp/libomptarget/src/device.h --- a/openmp/libomptarget/src/device.h +++ b/openmp/libomptarget/src/device.h @@ -157,6 +157,9 @@ return *this; } + // Return true if data can be copied to DstDevice directly + bool isDataExchangable(const DeviceTy& DstDevice); + uint64_t getMapEntryRefCnt(void *HstPtrBegin); LookupResult lookupMapping(void *HstPtrBegin, int64_t Size); void *getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase, int64_t Size, @@ -176,10 +179,15 @@ // Data transfer. When AsyncInfoPtr is nullptr, the transfer will be // synchronous. + // Copy data from host to device int32_t data_submit(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size, __tgt_async_info *AsyncInfoPtr); + // Copy data from device back to host int32_t data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size, __tgt_async_info *AsyncInfoPtr); + // Copy data from current device to destination device directly + int32_t data_exchange(void *SrcPtr, DeviceTy DstDev, void *DstPtr, + int64_t Size, __tgt_async_info *AsyncInfoPtr); int32_t run_region(void *TgtEntryPtr, void **TgtVarsPtr, ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp --- a/openmp/libomptarget/src/device.cpp +++ b/openmp/libomptarget/src/device.cpp @@ -352,6 +352,18 @@ AsyncInfoPtr); } +// Copy data from current device to destination device directly +int32_t DeviceTy::data_exchange(void *SrcPtr, DeviceTy DstDev, void *DstPtr, + int64_t Size, __tgt_async_info *AsyncInfoPtr) { + if (!AsyncInfoPtr || !RTL->data_exchange_async || !RTL->synchronize) { + assert(RTL->data_exchange && "RTL->data_exchange is nullptr"); + return RTL->data_exchange(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, DstPtr, + Size); + } else + return RTL->data_exchange_async(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, + DstPtr, Size, AsyncInfoPtr); +} + // Run region on device int32_t DeviceTy::run_region(void *TgtEntryPtr, void **TgtVarsPtr, ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, @@ -380,6 +392,18 @@ ThreadLimit, LoopTripCount, AsyncInfoPtr); } +// Whether data can be copied to DstDevice directly +bool DeviceTy::isDataExchangable(const DeviceTy &DstDevice) { + if (RTL != DstDevice.RTL || !RTL->is_data_exchangable) + return false; + + if (RTL->is_data_exchangable(RTLDeviceID, DstDevice.RTLDeviceID)) + return (RTL->data_exchange != nullptr) || + (RTL->data_exchange_async != nullptr); + + return false; +} + /// Check whether a device has an associated RTL and initialize it if it's not /// already initialized. bool device_is_ready(int device_num) { diff --git a/openmp/libomptarget/src/rtl.h b/openmp/libomptarget/src/rtl.h --- a/openmp/libomptarget/src/rtl.h +++ b/openmp/libomptarget/src/rtl.h @@ -26,6 +26,7 @@ struct RTLInfoTy { typedef int32_t(is_valid_binary_ty)(void *); + typedef int32_t(is_data_exchangable_ty)(int32_t, int32_t); typedef int32_t(number_of_devices_ty)(); typedef int32_t(init_device_ty)(int32_t); typedef __tgt_target_table *(load_binary_ty)(int32_t, void *); @@ -36,6 +37,9 @@ typedef int32_t(data_retrieve_ty)(int32_t, void *, void *, int64_t); typedef int32_t(data_retrieve_async_ty)(int32_t, void *, void *, int64_t, __tgt_async_info *); + typedef int32_t(data_exchange_ty)(int32_t, void *, int32_t, void *, int64_t); + typedef int32_t(data_exchange_async_ty)(int32_t, void *, int32_t, void *, + int64_t, __tgt_async_info *); typedef int32_t(data_delete_ty)(int32_t, void *); typedef int32_t(run_region_ty)(int32_t, void *, void **, ptrdiff_t *, int32_t); @@ -64,6 +68,7 @@ // Functions implemented in the RTL. is_valid_binary_ty *is_valid_binary = nullptr; + is_data_exchangable_ty *is_data_exchangable = nullptr; number_of_devices_ty *number_of_devices = nullptr; init_device_ty *init_device = nullptr; load_binary_ty *load_binary = nullptr; @@ -72,6 +77,8 @@ data_submit_async_ty *data_submit_async = nullptr; data_retrieve_ty *data_retrieve = nullptr; data_retrieve_async_ty *data_retrieve_async = nullptr; + data_exchange_ty *data_exchange = nullptr; + data_exchange_async_ty *data_exchange_async = nullptr; data_delete_ty *data_delete = nullptr; run_region_ty *run_region = nullptr; run_region_async_ty *run_region_async = nullptr; @@ -100,6 +107,7 @@ RTLName = r.RTLName; #endif is_valid_binary = r.is_valid_binary; + is_data_exchangable = r.is_data_exchangable; number_of_devices = r.number_of_devices; init_device = r.init_device; load_binary = r.load_binary; @@ -108,6 +116,8 @@ data_submit_async = r.data_submit_async; data_retrieve = r.data_retrieve; data_retrieve_async = r.data_retrieve_async; + data_exchange = r.data_exchange; + data_exchange_async = r.data_exchange_async; data_delete = r.data_delete; run_region = r.run_region; run_region_async = r.run_region_async; diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp --- a/openmp/libomptarget/src/rtl.cpp +++ b/openmp/libomptarget/src/rtl.cpp @@ -140,6 +140,12 @@ *((void **)&R.run_team_region_async) = dlsym(dynlib_handle, "__tgt_rtl_run_target_team_region_async"); *((void **)&R.synchronize) = dlsym(dynlib_handle, "__tgt_rtl_synchronize"); + *((void **)&R.data_exchange) = + dlsym(dynlib_handle, "__tgt_rtl_data_exchange"); + *((void **)&R.data_exchange_async) = + dlsym(dynlib_handle, "__tgt_rtl_data_exchange_async"); + *((void **)&R.is_data_exchangable) = + dlsym(dynlib_handle, "__tgt_rtl_is_data_exchangable"); // No devices are supported by this RTL? if (!(R.NumberOfDevices = R.number_of_devices())) { diff --git a/openmp/libomptarget/test/offloading/d2d_memcpy.c b/openmp/libomptarget/test/offloading/d2d_memcpy.c new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/test/offloading/d2d_memcpy.c @@ -0,0 +1,69 @@ +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-aarch64-unknown-linux-gnu | %fcheck-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-powerpc64-ibm-linux-gnu | %fcheck-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-powerpc64le-ibm-linux-gnu | %fcheck-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-x86_64-pc-linux-gnu | %fcheck-x86_64-pc-linux-gnu -allow-empty + +#include +#include +#include +#include + +const int magic_num = 7; + +int main(int argc, char *argv[]) { + const int N = 128; + const int num_devices = omp_get_num_devices(); + + // No target device, just return + if (num_devices == 0) { + printf("PASS\n"); + return 0; + } + + const int src_device = 0; + int dst_device = 1; + if (dst_device >= num_devices) + dst_device = num_devices - 1; + + int length = N * sizeof(int); + int *src_ptr = omp_target_alloc(length, src_device); + int *dst_ptr = omp_target_alloc(length, dst_device); + + assert(src_ptr && "src_ptr is NULL"); + assert(dst_ptr && "dst_ptr is NULL"); + +#pragma omp target teams distribute parallel for device(src_device) \ + is_device_ptr(src_ptr) + for (int i = 0; i < N; ++i) { + src_ptr[i] = magic_num; + } + + int rc = + omp_target_memcpy(dst_ptr, src_ptr, length, 0, 0, dst_device, src_device); + + assert(rc == 0 && "error in omp_target_memcpy"); + + int *buffer = malloc(length); + + assert(buffer && "failed to allocate host buffer"); + +#pragma omp target teams distribute parallel for device(dst_device) \ + map(from: buffer[0:N]) is_device_ptr(dst_ptr) + for (int i = 0; i < N; ++i) { + buffer[i] = dst_ptr[i] + magic_num; + } + + for (int i = 0; i < N; ++i) + assert(buffer[i] == 2 * magic_num); + + printf("PASS\n"); + + // Free host and device memory + free(buffer); + omp_target_free(src_ptr, src_device); + omp_target_free(dst_ptr, dst_device); + + return 0; +} + +// CHECK: PASS