diff --git a/openmp/libomptarget/include/omptargetplugin.h b/openmp/libomptarget/include/omptargetplugin.h --- a/openmp/libomptarget/include/omptargetplugin.h +++ b/openmp/libomptarget/include/omptargetplugin.h @@ -77,6 +77,18 @@ void *TargetPtr, int64_t Size, __tgt_async_info *AsyncInfoPtr); +// Copy the data content from one target device to another target device using +// its address. This operation does not need to copy data back to host and then +// from host to another device. In case of success, return zero. Otherwise, +// return an error code. +int32_t __tgt_rtl_data_exchange(int32_t SrcID, void *SrcPtr, int32_t DstID, + void *DstPtr, int64_t Size); + +// Asynchronous version of __tgt_rtl_data_exchange +int32_t __tgt_rtl_data_exchange_async(int32_t SrcID, void *SrcPtr, + int32_t DesID, void *DstPtr, int64_t Size, + __tgt_async_info *AsyncInfoPtr); + // De-allocate the data referenced by target ptr on the device. In case of // success, return zero. Otherwise, return an error code. int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr); diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp --- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp @@ -736,6 +736,69 @@ return OFFLOAD_SUCCESS; } + int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, + const void *DstPtr, int64_t Size, + __tgt_async_info *AsyncInfoPtr) const { + assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); + + CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context); + if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) + return OFFLOAD_FAIL; + + CUstream Stream = getStream(SrcDevId, AsyncInfoPtr); + + // If they are two devices, we try peer to peer copy first + if (SrcDevId != DstDevId) { + int CanAccessPeer = 0; + Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId); + if (Err != CUDA_SUCCESS) { + DP("Error returned from cuDeviceCanAccessPeer. src = %" PRId32 + ", dst = %" PRId32 "\n", + SrcDevId, DstDevId); + CUDA_ERR_STRING(Err); + goto MemcpyDtoD; + } + + if (!CanAccessPeer) { + DP("P2P memcpy not supported so fall back to D2D memcpy"); + goto MemcpyDtoD; + } + + Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0); + if (Err != CUDA_SUCCESS) { + DP("Error returned from cuCtxEnablePeerAccess. src = %" PRId32 + ", dst = %" PRId32 "\n", + SrcDevId, DstDevId); + CUDA_ERR_STRING(Err); + goto MemcpyDtoD; + } + + Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context, + (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context, + Size, Stream); + if (Err == CUDA_SUCCESS) + return OFFLOAD_SUCCESS; + + DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD + ", src_id =%" PRId32 ", dst_ptr = %" DPxMOD ", dst_id =%" PRId32 "\n", + SrcPtr, SrcDevId, DstPtr, DstDevId); + CUDA_ERR_STRING(Err); + } + + MemcpyDtoD: + Err = cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, + Stream); + if (Err != CUDA_SUCCESS) { + DP("Error when copying data from device to device. Pointers: src " + "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n", + DPxPTR(SrcPtr), DPxPTR(DstPtr), Size); + CUDA_ERR_STRING(Err); + return OFFLOAD_FAIL; + } + + return OFFLOAD_SUCCESS; + } + int dataDelete(const int DeviceId, void *TgtPtr) const { CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) @@ -965,6 +1028,33 @@ async_info_ptr); } +int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr, + int dst_dev_id, void *dst_ptr, + int64_t size, + __tgt_async_info *async_info_ptr) { + assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid"); + assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid"); + assert(async_info_ptr && "async_info_ptr is nullptr"); + + return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, + async_info_ptr); +} + +int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr, + int32_t dst_dev_id, void *dst_ptr, + int64_t size) { + assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid"); + assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid"); + + __tgt_async_info async_info; + const int32_t rc = __tgt_rtl_data_exchange_async( + src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &async_info); + if (rc != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + + return __tgt_rtl_synchronize(src_dev_id, &async_info); +} + int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); diff --git a/openmp/libomptarget/plugins/exports b/openmp/libomptarget/plugins/exports --- a/openmp/libomptarget/plugins/exports +++ b/openmp/libomptarget/plugins/exports @@ -10,6 +10,8 @@ __tgt_rtl_data_submit_async; __tgt_rtl_data_retrieve; __tgt_rtl_data_retrieve_async; + __tgt_rtl_data_exchange; + __tgt_rtl_data_exchange_async; __tgt_rtl_data_delete; __tgt_rtl_run_target_team_region; __tgt_rtl_run_target_team_region_async; diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp --- a/openmp/libomptarget/src/api.cpp +++ b/openmp/libomptarget/src/api.cpp @@ -168,9 +168,15 @@ rc = SrcDev.data_retrieve(dstAddr, srcAddr, length, nullptr); } else { DP("copy from device to device\n"); + DeviceTy &SrcDev = Devices[src_device]; + DeviceTy &DstDev = Devices[dst_device]; + // First try to use D2D memcpy which is more efficient. If fails, fall back + // to unefficient way. + if (SrcDev.isExchangable(DstDev)) + rc = SrcDev.data_exchange(srcAddr, DstDev, dstAddr, length, nullptr); + if (rc == OFFLOAD_SUCCESS) + return OFFLOAD_SUCCESS; void *buffer = malloc(length); - DeviceTy& SrcDev = Devices[src_device]; - DeviceTy& DstDev = Devices[dst_device]; rc = SrcDev.data_retrieve(buffer, srcAddr, length, nullptr); if (rc == OFFLOAD_SUCCESS) rc = DstDev.data_submit(dstAddr, buffer, length, nullptr); diff --git a/openmp/libomptarget/src/device.h b/openmp/libomptarget/src/device.h --- a/openmp/libomptarget/src/device.h +++ b/openmp/libomptarget/src/device.h @@ -157,6 +157,8 @@ return *this; } + bool isExchangable(const DeviceTy& OtherDevice); + uint64_t getMapEntryRefCnt(void *HstPtrBegin); LookupResult lookupMapping(void *HstPtrBegin, int64_t Size); void *getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase, int64_t Size, @@ -180,6 +182,8 @@ __tgt_async_info *AsyncInfoPtr); int32_t data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size, __tgt_async_info *AsyncInfoPtr); + int32_t data_exchange(void *SrcPtr, DeviceTy DstDev, void *DstPtr, + int64_t Size, __tgt_async_info *AsyncInfoPtr); int32_t run_region(void *TgtEntryPtr, void **TgtVarsPtr, ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp --- a/openmp/libomptarget/src/device.cpp +++ b/openmp/libomptarget/src/device.cpp @@ -352,6 +352,20 @@ AsyncInfoPtr); } +// Copy data from device +int32_t DeviceTy::data_exchange(void *SrcPtr, DeviceTy DstDev, void *DstPtr, + int64_t Size, __tgt_async_info *AsyncInfoPtr) { + if (!AsyncInfoPtr || !RTL->data_exchange_async || !RTL->synchronize) { + if (!RTL->data_exchange) + return OFFLOAD_FAIL; + + return RTL->data_exchange(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, DstPtr, + Size); + } else + return RTL->data_exchange_async(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, + DstPtr, Size, AsyncInfoPtr); +} + // Run region on device int32_t DeviceTy::run_region(void *TgtEntryPtr, void **TgtVarsPtr, ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, @@ -380,6 +394,11 @@ ThreadLimit, LoopTripCount, AsyncInfoPtr); } +// Whether can exchange data between two target devices +bool DeviceTy::isExchangable(const DeviceTy &OtherDevice) { + return RTL->RTLName == OtherDevice.RTL->RTLName; +} + /// Check whether a device has an associated RTL and initialize it if it's not /// already initialized. bool device_is_ready(int device_num) { diff --git a/openmp/libomptarget/src/rtl.h b/openmp/libomptarget/src/rtl.h --- a/openmp/libomptarget/src/rtl.h +++ b/openmp/libomptarget/src/rtl.h @@ -36,6 +36,9 @@ typedef int32_t(data_retrieve_ty)(int32_t, void *, void *, int64_t); typedef int32_t(data_retrieve_async_ty)(int32_t, void *, void *, int64_t, __tgt_async_info *); + typedef int32_t(data_exchange_ty)(int32_t, void *, int32_t, void *, int64_t); + typedef int32_t(data_exchange_async_ty)(int32_t, void *, int32_t, void *, + int64_t, __tgt_async_info *); typedef int32_t(data_delete_ty)(int32_t, void *); typedef int32_t(run_region_ty)(int32_t, void *, void **, ptrdiff_t *, int32_t); @@ -58,9 +61,7 @@ void *LibraryHandler = nullptr; -#ifdef OMPTARGET_DEBUG std::string RTLName; -#endif // Functions implemented in the RTL. is_valid_binary_ty *is_valid_binary = nullptr; @@ -72,6 +73,8 @@ data_submit_async_ty *data_submit_async = nullptr; data_retrieve_ty *data_retrieve = nullptr; data_retrieve_async_ty *data_retrieve_async = nullptr; + data_exchange_ty *data_exchange = nullptr; + data_exchange_async_ty *data_exchange_async = nullptr; data_delete_ty *data_delete = nullptr; run_region_ty *run_region = nullptr; run_region_async_ty *run_region_async = nullptr; @@ -96,9 +99,7 @@ Idx = r.Idx; NumberOfDevices = r.NumberOfDevices; LibraryHandler = r.LibraryHandler; -#ifdef OMPTARGET_DEBUG RTLName = r.RTLName; -#endif is_valid_binary = r.is_valid_binary; number_of_devices = r.number_of_devices; init_device = r.init_device; @@ -108,6 +109,8 @@ data_submit_async = r.data_submit_async; data_retrieve = r.data_retrieve; data_retrieve_async = r.data_retrieve_async; + data_exchange = r.data_exchange; + data_exchange_async = r.data_exchange_async; data_delete = r.data_delete; run_region = r.run_region; run_region_async = r.run_region_async; diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp --- a/openmp/libomptarget/src/rtl.cpp +++ b/openmp/libomptarget/src/rtl.cpp @@ -92,10 +92,7 @@ R.LibraryHandler = dynlib_handle; R.isUsed = false; - -#ifdef OMPTARGET_DEBUG R.RTLName = Name; -#endif if (!(*((void **)&R.is_valid_binary) = dlsym(dynlib_handle, "__tgt_rtl_is_valid_binary"))) @@ -140,6 +137,10 @@ *((void **)&R.run_team_region_async) = dlsym(dynlib_handle, "__tgt_rtl_run_target_team_region_async"); *((void **)&R.synchronize) = dlsym(dynlib_handle, "__tgt_rtl_synchronize"); + *((void **)&R.data_exchange) = + dlsym(dynlib_handle, "__tgt_rtl_data_exchange"); + *((void **)&R.data_exchange_async) = + dlsym(dynlib_handle, "__tgt_rtl_data_exchange_async"); // No devices are supported by this RTL? if (!(R.NumberOfDevices = R.number_of_devices())) { diff --git a/openmp/libomptarget/test/offloading/d2d_memcpy.c b/openmp/libomptarget/test/offloading/d2d_memcpy.c new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/test/offloading/d2d_memcpy.c @@ -0,0 +1,63 @@ +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-aarch64-unknown-linux-gnu | %fcheck-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-powerpc64-ibm-linux-gnu | %fcheck-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-powerpc64le-ibm-linux-gnu | %fcheck-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-x86_64-pc-linux-gnu | %fcheck-x86_64-pc-linux-gnu -allow-empty + +#include +#include +#include +#include + +const int magic_num = 7; + +int main(int argc, char *argv[]) { + const int N = 128; + const int num_devices = omp_get_num_devices(); + + if (num_devices == 0) { + printf("PASS\n"); + return 0; + } + + const int src_device = 0; + int dst_device = 1; + if (dst_device >= num_devices) + dst_device = num_devices - 1; + + int length = N * sizeof(int); + int *src_ptr = omp_target_alloc(length, src_device); + int *dst_ptr = omp_target_alloc(length, dst_device); + + assert(src_ptr && "src_ptr is NULL"); + assert(dst_ptr && "dst_ptr is NULL"); + +#pragma omp target teams distribute parallel for device(src_device) \ + is_device_ptr(src_ptr) + for (int i = 0; i < N; ++i) { + src_ptr[i] = magic_num; + } + + int rc = + omp_target_memcpy(dst_ptr, src_ptr, length, 0, 0, dst_device, src_device); + + assert(rc == 0 && "error in omp_target_memcpy"); + + int *buffer = malloc(length); + + assert(buffer && "failed to malloc host buffer"); + +#pragma omp target teams distribute parallel for device(dst_device) \ + map(from: buffer[0:N]) is_device_ptr(dst_ptr) + for (int i = 0; i < N; ++i) { + buffer[i] = dst_ptr[i] + magic_num; + } + + for (int i = 0; i < N; ++i) + assert(buffer[i] == 2 * magic_num); + + printf("PASS\n"); + + return 0; +} + +// CHECK: PASS