Changeset View
Standalone View
openmp/libomptarget/plugins/cuda/src/rtl.cpp
Show First 20 Lines • Show All 303 Lines • ▼ Show 20 Lines | for (auto &ctx : Contexts) | ||||
CUDA_ERR_STRING(err); | CUDA_ERR_STRING(err); | ||||
} | } | ||||
} | } | ||||
} | } | ||||
}; | }; | ||||
static RTLDeviceInfoTy DeviceInfo; | static RTLDeviceInfoTy DeviceInfo; | ||||
namespace { | |||||
// Pick the CUDA stream for an operation on device \p Id.
//
// If \p AsyncInfo is null the caller did not request asynchronous behavior,
// so hand back the device's next round-robin stream directly. Otherwise
// lazily bind a stream to the async-info object on first use so that every
// subsequent operation sharing this \p AsyncInfo lands on the same stream.
CUstream selectStream(int32_t Id, __tgt_async_info *AsyncInfo) {
  if (AsyncInfo == nullptr)
    return DeviceInfo.getNextStream(Id);

  if (AsyncInfo->Queue == nullptr)
    AsyncInfo->Queue = DeviceInfo.getNextStream(Id);

  // Queue is an opaque void *; within this plugin it always holds a CUstream
  // (which is itself a pointer type), so reinterpret_cast is a plain
  // pointer-to-pointer conversion here.
  return reinterpret_cast<CUstream>(AsyncInfo->Queue);
}
int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size, | |||||
__tgt_async_info *AsyncInfoPtr) { | |||||
assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); | |||||
// Set the context we are using. | |||||
CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[DeviceId]); | |||||
if (err != CUDA_SUCCESS) { | |||||
DP("Error when setting CUDA context\n"); | |||||
CUDA_ERR_STRING(err); | |||||
return OFFLOAD_FAIL; | |||||
} | |||||
CUstream Stream = selectStream(DeviceId, AsyncInfoPtr); | |||||
err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream); | |||||
if (err != CUDA_SUCCESS) { | |||||
DP("Error when copying data from device to host. Pointers: host = " DPxMOD | |||||
", device = " DPxMOD ", size = %" PRId64 "\n", | |||||
DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); | |||||
CUDA_ERR_STRING(err); | |||||
return OFFLOAD_FAIL; | |||||
} | |||||
return OFFLOAD_SUCCESS; | |||||
} | |||||
int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size, | |||||
__tgt_async_info *AsyncInfoPtr) { | |||||
assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); | |||||
// Set the context we are using. | |||||
CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[DeviceId]); | |||||
if (err != CUDA_SUCCESS) { | |||||
DP("Error when setting CUDA context\n"); | |||||
CUDA_ERR_STRING(err); | |||||
return OFFLOAD_FAIL; | |||||
} | |||||
CUstream Stream = selectStream(DeviceId, AsyncInfoPtr); | |||||
err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream); | |||||
if (err != CUDA_SUCCESS) { | |||||
DP("Error when copying data from host to device. Pointers: host = " DPxMOD | |||||
", device = " DPxMOD ", size = %" PRId64 "\n", | |||||
DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); | |||||
CUDA_ERR_STRING(err); | |||||
return OFFLOAD_FAIL; | |||||
} | |||||
return OFFLOAD_SUCCESS; | |||||
} | |||||
} // namespace | |||||
#ifdef __cplusplus | #ifdef __cplusplus | ||||
extern "C" { | extern "C" { | ||||
#endif | #endif | ||||
// Report whether \p image is a device image this plugin can handle, i.e.
// an ELF binary whose machine id matches CUDA.
int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  // 190 is EM_CUDA, the ELF e_machine value for NVIDIA CUDA images.
  return elf_check_machine(image, 190);
}
▲ Show 20 Lines • Show All 338 Lines • ▼ Show 20 Lines | if (err != CUDA_SUCCESS) { | ||||
return NULL; | return NULL; | ||||
} | } | ||||
void *vptr = (void *)ptr; | void *vptr = (void *)ptr; | ||||
return vptr; | return vptr; | ||||
} | } | ||||
int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, | int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, | ||||
int64_t size) { | int64_t size, __tgt_async_info *async_info_ptr) { | ||||
// Set the context we are using. | // The function dataSubmit is always asynchronous. Considering some data | ||||
CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); | // transfer must be synchronous, we assume if async_info_ptr is nullptr, the | ||||
if (err != CUDA_SUCCESS) { | // transfer will be synchronous by creating a temporary async info and then | ||||
DP("Error when setting CUDA context\n"); | // synchronizing after call dataSubmit; otherwise, it is asynchronous. | ||||
CUDA_ERR_STRING(err); | if (async_info_ptr) | ||||
return dataSubmit(device_id, tgt_ptr, hst_ptr, size, async_info_ptr); | |||||
__tgt_async_info async_info; | |||||
int32_t rc = dataSubmit(device_id, tgt_ptr, hst_ptr, size, &async_info); | |||||
if (rc != OFFLOAD_SUCCESS) | |||||
return OFFLOAD_FAIL; | return OFFLOAD_FAIL; | ||||
} | |||||
CUstream &Stream = DeviceInfo.getNextStream(device_id); | |||||
err = cuMemcpyHtoDAsync((CUdeviceptr)tgt_ptr, hst_ptr, size, Stream); | |||||
if (err != CUDA_SUCCESS) { | |||||
DP("Error when copying data from host to device. Pointers: host = " DPxMOD | |||||
", device = " DPxMOD ", size = %" PRId64 "\n", | |||||
DPxPTR(hst_ptr), DPxPTR(tgt_ptr), size); | |||||
CUDA_ERR_STRING(err); | |||||
return OFFLOAD_FAIL; | |||||
} | |||||
err = cuStreamSynchronize(Stream); | return __tgt_rtl_synchronize(device_id, &async_info); | ||||
Not Done ReplyInline ActionsStyle: Remove the else case, there is a return before. Same below. jdoerfert: Style: Remove the else case, there is a return before. Same below. | |||||
Not Done ReplyInline ActionsSorry, didn't get your point. The return above only applies when rc != OFFLOAD_SUCCESS. tianshilei1992: Sorry, didn't get your point. The `return` above only applies when `rc != OFFLOAD_SUCCESS`. | |||||
Not Done ReplyInline ActionsOkay, will do that. tianshilei1992: Okay, will do that. | |||||
if (err != CUDA_SUCCESS) { | |||||
DP("Error when synchronizing async data transfer from host to device. " | |||||
"Pointers: host = " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n", | |||||
DPxPTR(hst_ptr), DPxPTR(tgt_ptr), size); | |||||
CUDA_ERR_STRING(err); | |||||
return OFFLOAD_FAIL; | |||||
} | |||||
return OFFLOAD_SUCCESS; | |||||
} | } | ||||
int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, | int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, | ||||
int64_t size) { | int64_t size, | ||||
// Set the context we are using. | __tgt_async_info *async_info_ptr) { | ||||
CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); | // The function dataRetrieve is always asynchronous. Considering some data | ||||
if (err != CUDA_SUCCESS) { | // transfer must be synchronous, we assume if async_info_ptr is nullptr, the | ||||
DP("Error when setting CUDA context\n"); | // transfer will be synchronous by creating a temporary async info and then | ||||
CUDA_ERR_STRING(err); | // synchronizing after call dataRetrieve; otherwise, it is asynchronous. | ||||
if (async_info_ptr) | |||||
return dataRetrieve(device_id, hst_ptr, tgt_ptr, size, async_info_ptr); | |||||
__tgt_async_info async_info; | |||||
int32_t rc = dataRetrieve(device_id, hst_ptr, tgt_ptr, size, &async_info); | |||||
if (rc != OFFLOAD_SUCCESS) | |||||
return OFFLOAD_FAIL; | return OFFLOAD_FAIL; | ||||
} | |||||
CUstream &Stream = DeviceInfo.getNextStream(device_id); | return __tgt_rtl_synchronize(device_id, &async_info); | ||||
err = cuMemcpyDtoHAsync(hst_ptr, (CUdeviceptr)tgt_ptr, size, Stream); | |||||
if (err != CUDA_SUCCESS) { | |||||
DP("Error when copying data from device to host. Pointers: host = " DPxMOD | |||||
", device = " DPxMOD ", size = %" PRId64 "\n", | |||||
DPxPTR(hst_ptr), DPxPTR(tgt_ptr), size); | |||||
CUDA_ERR_STRING(err); | |||||
return OFFLOAD_FAIL; | |||||
} | |||||
err = cuStreamSynchronize(Stream); | |||||
if (err != CUDA_SUCCESS) { | |||||
DP("Error when synchronizing async data transfer from device to host. " | |||||
"Pointers: host = " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n", | |||||
DPxPTR(hst_ptr), DPxPTR(tgt_ptr), size); | |||||
CUDA_ERR_STRING(err); | |||||
return OFFLOAD_FAIL; | |||||
} | |||||
return OFFLOAD_SUCCESS; | |||||
} | } | ||||
int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { | int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { | ||||
// Set the context we are using. | // Set the context we are using. | ||||
CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); | CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); | ||||
if (err != CUDA_SUCCESS) { | if (err != CUDA_SUCCESS) { | ||||
DP("Error when setting CUDA context\n"); | DP("Error when setting CUDA context\n"); | ||||
CUDA_ERR_STRING(err); | CUDA_ERR_STRING(err); | ||||
return OFFLOAD_FAIL; | return OFFLOAD_FAIL; | ||||
} | } | ||||
err = cuMemFree((CUdeviceptr)tgt_ptr); | err = cuMemFree((CUdeviceptr)tgt_ptr); | ||||
if (err != CUDA_SUCCESS) { | if (err != CUDA_SUCCESS) { | ||||
DP("Error when freeing CUDA memory\n"); | DP("Error when freeing CUDA memory\n"); | ||||
CUDA_ERR_STRING(err); | CUDA_ERR_STRING(err); | ||||
return OFFLOAD_FAIL; | return OFFLOAD_FAIL; | ||||
} | } | ||||
return OFFLOAD_SUCCESS; | return OFFLOAD_SUCCESS; | ||||
} | } | ||||
int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, | int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, | ||||
void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, | void **tgt_args, | ||||
int32_t thread_limit, uint64_t loop_tripcount) { | ptrdiff_t *tgt_offsets, | ||||
int32_t arg_num, int32_t team_num, | |||||
int32_t thread_limit, | |||||
uint64_t loop_tripcount, | |||||
__tgt_async_info *async_info) { | |||||
// Set the context we are using. | // Set the context we are using. | ||||
CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); | CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); | ||||
if (err != CUDA_SUCCESS) { | if (err != CUDA_SUCCESS) { | ||||
DP("Error when setting CUDA context\n"); | DP("Error when setting CUDA context\n"); | ||||
CUDA_ERR_STRING(err); | CUDA_ERR_STRING(err); | ||||
return OFFLOAD_FAIL; | return OFFLOAD_FAIL; | ||||
} | } | ||||
▲ Show 20 Lines • Show All 79 Lines • ▼ Show 20 Lines | if (team_num <= 0) { | ||||
cudaBlocksPerGrid = team_num; | cudaBlocksPerGrid = team_num; | ||||
DP("Using requested number of teams %d\n", team_num); | DP("Using requested number of teams %d\n", team_num); | ||||
} | } | ||||
// Run on the device. | // Run on the device. | ||||
DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid, | DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid, | ||||
cudaThreadsPerBlock); | cudaThreadsPerBlock); | ||||
CUstream &Stream = DeviceInfo.getNextStream(device_id); | CUstream Stream = selectStream(device_id, async_info); | ||||
err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1, | err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1, | ||||
cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/, | cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/, | ||||
Stream, &args[0], 0); | Stream, &args[0], 0); | ||||
if (err != CUDA_SUCCESS) { | if (err != CUDA_SUCCESS) { | ||||
DP("Device kernel launch failed!\n"); | DP("Device kernel launch failed!\n"); | ||||
CUDA_ERR_STRING(err); | CUDA_ERR_STRING(err); | ||||
return OFFLOAD_FAIL; | return OFFLOAD_FAIL; | ||||
} | } | ||||
DP("Launch of entry point at " DPxMOD " successful!\n", | DP("Launch of entry point at " DPxMOD " successful!\n", | ||||
DPxPTR(tgt_entry_ptr)); | DPxPTR(tgt_entry_ptr)); | ||||
CUresult sync_err = cuStreamSynchronize(Stream); | |||||
if (sync_err != CUDA_SUCCESS) { | |||||
DP("Kernel execution error at " DPxMOD "!\n", DPxPTR(tgt_entry_ptr)); | |||||
CUDA_ERR_STRING(sync_err); | |||||
return OFFLOAD_FAIL; | |||||
} else { | |||||
DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr)); | |||||
} | |||||
return OFFLOAD_SUCCESS; | return OFFLOAD_SUCCESS; | ||||
} | } | ||||
int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, | int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, | ||||
void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) { | void **tgt_args, ptrdiff_t *tgt_offsets, | ||||
int32_t arg_num, | |||||
__tgt_async_info *async_info) { | |||||
// use one team and the default number of threads. | // use one team and the default number of threads. | ||||
const int32_t team_num = 1; | const int32_t team_num = 1; | ||||
const int32_t thread_limit = 0; | const int32_t thread_limit = 0; | ||||
return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args, | return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args, | ||||
tgt_offsets, arg_num, team_num, thread_limit, 0); | tgt_offsets, arg_num, team_num, | ||||
thread_limit, 0, async_info); | |||||
} | |||||
int32_t __tgt_rtl_synchronize(int32_t device_id, __tgt_async_info *async_info) { | |||||
assert(async_info && "async_info is nullptr"); | |||||
assert(async_info->Queue && "async_info->Queue is nullptr"); | |||||
CUstream Stream = reinterpret_cast<CUstream>(async_info->Queue); | |||||
It seems no synchronization by default, not aligned with the original behavior lildmh: It seems no synchronization by default, not aligned with the original behavior | |||||
The assumption here is: if this function is called at the end of target function, Identifier must be a valid pointer because anyhow during kernel launch it is assigned with a valid CUstream. tianshilei1992: The assumption here is: if this function is called at the end of `target` function… | |||||
It seems wrong to call this with Identifier = nullptr, doesn't it? If so we should make it an assert. jdoerfert: It seems wrong to call this with `Identifier = nullptr`, doesn't it? If so we should make it an… | |||||
Yeah, seems right. "If there is nothing to be synchronized, what do users expect to synchronize?" Yes, should do an assert here. tianshilei1992: Yeah, seems right. "If there is nothing to be synchronized, what do users expect to synchronize? | |||||
CUresult Err = cuStreamSynchronize(Stream); | |||||
if (Err != CUDA_SUCCESS) { | |||||
DP("Error when synchronizing stream. stream = " DPxMOD | |||||
", async info ptr = " DPxMOD "\n", | |||||
Why reinterpret_Cast? Is a CUstream a pointer? If not, why do we move objects around? jdoerfert: Why reinterpret_Cast? Is a CUstream a pointer? If not, why do we move objects around? | |||||
Yep. typedef struct CUstream_st *CUstream; /**< CUDA stream */ tianshilei1992: Yep.
```
typedef struct CUstream_st *CUstream; /**< CUDA stream */
``` | |||||
then we should be able to just use a static cast, shouldn't we? Maybe also add a static_assert just to be safe. Also other places. jdoerfert: then we should be able to just use a static cast, shouldn't we? Maybe also add a static_assert… | |||||
Correct me if I'm wrong. Compiler will generate code for the cast if using static_cast, but it will not for reinterpret_cast. My thought is, since it is already in CUDA plugin, we can make sure that this void * is actually a CUstream so we can directly reinterpret it to avoid one cast instruction. tianshilei1992: Correct me if I'm wrong. Compiler will generate code for the cast if using `static_cast`, but… | |||||
It could but not for such a pointer 2 pointer cast on a "normal" architecture. It's just that I don't see reinterpret_cast often, almost 1:4 against static_cast, but I'm fine with leaving it. jdoerfert: It could but not for such a pointer 2 pointer cast on a "normal" architecture. It's just that I… | |||||
DPxPTR(Stream), DPxPTR(async_info)); | |||||
CUDA_ERR_STRING(Err); | |||||
return OFFLOAD_FAIL; | |||||
} | |||||
return OFFLOAD_SUCCESS; | |||||
} | } | ||||
#ifdef __cplusplus | #ifdef __cplusplus | ||||
} | } | ||||
#endif | #endif |
Nit: I know the style in here is mixed but I would remove the braces around single statements, especially return.