Changeset View
Changeset View
Standalone View
Standalone View
openmp/libomptarget/plugins/cuda/src/rtl.cpp
Show First 20 Lines • Show All 94 Lines • ▼ Show 20 Lines | bool checkResult(CUresult Err, const char *ErrMsg) { | ||||
if (Err == CUDA_SUCCESS) | if (Err == CUDA_SUCCESS) | ||||
return true; | return true; | ||||
DP(ErrMsg); | DP(ErrMsg); | ||||
CUDA_ERR_STRING(Err); | CUDA_ERR_STRING(Err); | ||||
return false; | return false; | ||||
} | } | ||||
int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size, | |||||
CUstream Stream) { | |||||
CUresult Err = | |||||
cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream); | |||||
if (Err != CUDA_SUCCESS) { | |||||
DP("Error when copying data from device to device. Pointers: src " | |||||
"= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n", | |||||
DPxPTR(SrcPtr), DPxPTR(DstPtr), Size); | |||||
CUDA_ERR_STRING(Err); | |||||
return OFFLOAD_FAIL; | |||||
} | |||||
return OFFLOAD_SUCCESS; | |||||
} | |||||
// Structure contains per-device data | // Structure contains per-device data | ||||
struct DeviceDataTy { | struct DeviceDataTy { | ||||
std::list<FuncOrGblEntryTy> FuncGblEntries; | std::list<FuncOrGblEntryTy> FuncGblEntries; | ||||
CUcontext Context = nullptr; | CUcontext Context = nullptr; | ||||
// Device properties | // Device properties | ||||
int ThreadsPerBlock = 0; | int ThreadsPerBlock = 0; | ||||
int BlocksPerGrid = 0; | int BlocksPerGrid = 0; | ||||
int WarpSize = 0; | int WarpSize = 0; | ||||
▲ Show 20 Lines • Show All 620 Lines • ▼ Show 20 Lines | if (Err != CUDA_SUCCESS) { | ||||
DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); | DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); | ||||
CUDA_ERR_STRING(Err); | CUDA_ERR_STRING(Err); | ||||
return OFFLOAD_FAIL; | return OFFLOAD_FAIL; | ||||
} | } | ||||
return OFFLOAD_SUCCESS; | return OFFLOAD_SUCCESS; | ||||
} | } | ||||
int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr, | |||||
int64_t Size, __tgt_async_info *AsyncInfoPtr) const { | |||||
assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); | |||||
CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context); | |||||
if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) | |||||
return OFFLOAD_FAIL; | |||||
CUstream Stream = getStream(SrcDevId, AsyncInfoPtr); | |||||
// If they are two devices, we try peer to peer copy first | |||||
if (SrcDevId != DstDevId) { | |||||
int CanAccessPeer = 0; | |||||
Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId); | |||||
if (Err != CUDA_SUCCESS) { | |||||
DP("Error returned from cuDeviceCanAccessPeer. src = %" PRId32 | |||||
", dst = %" PRId32 "\n", | |||||
SrcDevId, DstDevId); | |||||
CUDA_ERR_STRING(Err); | |||||
return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); | |||||
} | |||||
if (!CanAccessPeer) { | |||||
DP("P2P memcpy not supported so fall back to D2D memcpy"); | |||||
return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); | |||||
} | |||||
Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0); | |||||
if (Err != CUDA_SUCCESS) { | |||||
DP("Error returned from cuCtxEnablePeerAccess. src = %" PRId32 | |||||
", dst = %" PRId32 "\n", | |||||
SrcDevId, DstDevId); | |||||
CUDA_ERR_STRING(Err); | |||||
return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); | |||||
} | |||||
Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context, | |||||
(CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context, | |||||
Size, Stream); | |||||
if (Err == CUDA_SUCCESS) | |||||
return OFFLOAD_SUCCESS; | |||||
DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD | |||||
", src_id =%" PRId32 ", dst_ptr = %" DPxMOD ", dst_id =%" PRId32 "\n", | |||||
SrcPtr, SrcDevId, DstPtr, DstDevId); | |||||
CUDA_ERR_STRING(Err); | |||||
} | |||||
return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); | |||||
} | |||||
jdoerfert: Any reason not to make this a function and call it? | |||||
int dataDelete(const int DeviceId, void *TgtPtr) const { | int dataDelete(const int DeviceId, void *TgtPtr) const { | ||||
CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); | CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); | ||||
if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) | if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) | ||||
return OFFLOAD_FAIL; | return OFFLOAD_FAIL; | ||||
Err = cuMemFree((CUdeviceptr)TgtPtr); | Err = cuMemFree((CUdeviceptr)TgtPtr); | ||||
if (!checkResult(Err, "Error returned from cuMemFree\n")) | if (!checkResult(Err, "Error returned from cuMemFree\n")) | ||||
return OFFLOAD_FAIL; | return OFFLOAD_FAIL; | ||||
▲ Show 20 Lines • Show All 148 Lines • ▼ Show 20 Lines | |||||
int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); } | int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); } | ||||
int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { | int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { | ||||
DP("Init requires flags to %ld\n", RequiresFlags); | DP("Init requires flags to %ld\n", RequiresFlags); | ||||
DeviceRTL.setRequiresFlag(RequiresFlags); | DeviceRTL.setRequiresFlag(RequiresFlags); | ||||
return RequiresFlags; | return RequiresFlags; | ||||
} | } | ||||
int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int dst_dev_id) { | |||||
if (DeviceRTL.isValidDeviceId(src_dev_id) && | |||||
DeviceRTL.isValidDeviceId(dst_dev_id)) | |||||
return 1; | |||||
return 0; | |||||
} | |||||
int32_t __tgt_rtl_init_device(int32_t device_id) { | int32_t __tgt_rtl_init_device(int32_t device_id) { | ||||
assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); | assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); | ||||
return DeviceRTL.initDevice(device_id); | return DeviceRTL.initDevice(device_id); | ||||
} | } | ||||
__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, | __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, | ||||
__tgt_device_image *image) { | __tgt_device_image *image) { | ||||
▲ Show 20 Lines • Show All 49 Lines • ▼ Show 20 Lines | int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr, | ||||
__tgt_async_info *async_info_ptr) { | __tgt_async_info *async_info_ptr) { | ||||
assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); | assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); | ||||
assert(async_info_ptr && "async_info_ptr is nullptr"); | assert(async_info_ptr && "async_info_ptr is nullptr"); | ||||
return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size, | return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size, | ||||
async_info_ptr); | async_info_ptr); | ||||
} | } | ||||
int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr, | |||||
int dst_dev_id, void *dst_ptr, | |||||
int64_t size, | |||||
__tgt_async_info *async_info_ptr) { | |||||
assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid"); | |||||
assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid"); | |||||
assert(async_info_ptr && "async_info_ptr is nullptr"); | |||||
return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, | |||||
async_info_ptr); | |||||
} | |||||
int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr, | |||||
int32_t dst_dev_id, void *dst_ptr, | |||||
int64_t size) { | |||||
assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid"); | |||||
assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid"); | |||||
__tgt_async_info async_info; | |||||
const int32_t rc = __tgt_rtl_data_exchange_async( | |||||
src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &async_info); | |||||
if (rc != OFFLOAD_SUCCESS) | |||||
return OFFLOAD_FAIL; | |||||
return __tgt_rtl_synchronize(src_dev_id, &async_info); | |||||
} | |||||
int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { | int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { | ||||
assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); | assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); | ||||
return DeviceRTL.dataDelete(device_id, tgt_ptr); | return DeviceRTL.dataDelete(device_id, tgt_ptr); | ||||
} | } | ||||
int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, | int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, | ||||
void **tgt_args, | void **tgt_args, | ||||
▲ Show 20 Lines • Show All 67 Lines • Show Last 20 Lines |
Any reason not to make this a function and call it?