Index: openmp/libomptarget/plugins/cuda/src/rtl.cpp
===================================================================
--- openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -10,10 +10,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include <atomic>
 #include <cassert>
 #include <cstddef>
 #include <cuda.h>
 #include <list>
+#include <memory>
 #include <string>
 #include <vector>
 
@@ -90,11 +92,13 @@
 /// Class containing all the device information.
 class RTLDeviceInfoTy {
   std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
+  std::vector<std::unique_ptr<std::atomic_uint>> NextStreamId;
 
 public:
   int NumberOfDevices;
   std::vector<CUmodule> Modules;
   std::vector<CUcontext> Contexts;
+  std::vector<std::vector<CUstream>> Streams;
 
   // Device properties
   std::vector<int> ThreadsPerBlock;
@@ -108,6 +112,7 @@
   // OpenMP Environment properties
   int EnvNumTeams;
   int EnvTeamLimit;
+  int EnvNumStreams;
 
   // OpenMP Requires Flags
   int64_t RequiresFlags;
@@ -173,6 +178,15 @@
     E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
   }
 
+  // Get the next stream on a given device in a round robin manner
+  CUstream &getNextStream(const int DeviceId) {
+    assert(DeviceId >= 0 &&
+           static_cast<size_t>(DeviceId) < NextStreamId.size() &&
+           "Unexpected device id!");
+    const unsigned int Id = NextStreamId[DeviceId]->fetch_add(1);
+    return Streams[DeviceId][Id % EnvNumStreams];
+  }
+
   RTLDeviceInfoTy() {
 #ifdef OMPTARGET_DEBUG
     if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) {
@@ -205,6 +219,8 @@
 
     FuncGblEntries.resize(NumberOfDevices);
     Contexts.resize(NumberOfDevices);
+    Streams.resize(NumberOfDevices);
+    NextStreamId.resize(NumberOfDevices);
    ThreadsPerBlock.resize(NumberOfDevices);
     BlocksPerGrid.resize(NumberOfDevices);
     WarpSize.resize(NumberOfDevices);
@@ -229,6 +245,27 @@
       EnvNumTeams = -1;
     }
 
+    // By default let's create 256 streams per device
+    EnvNumStreams = 256;
+    envStr = getenv("LIBOMPTARGET_NUM_STREAMS");
+    if (envStr) {
+      EnvNumStreams = std::stoi(envStr);
+    }
+
+    // Initialize streams for each device
+    for (std::vector<CUstream> &S : Streams) {
+      S.resize(EnvNumStreams);
+    }
+
+    // Initialize the next stream id
+    for (std::unique_ptr<std::atomic_uint> &Ptr : NextStreamId) {
+#if __cplusplus < 201402L
+      Ptr = std::unique_ptr<std::atomic_uint>(new std::atomic_uint(0));
+#else
+      Ptr = std::make_unique<std::atomic_uint>(0);
+#endif
+    }
+
     // Default state.
     RequiresFlags = OMP_REQ_UNDEFINED;
   }
@@ -244,6 +281,24 @@
         }
       }
 
+    // Destroy streams before contexts
+    for (int I = 0; I < NumberOfDevices; ++I) {
+      CUresult err = cuCtxSetCurrent(Contexts[I]);
+      if (err != CUDA_SUCCESS) {
+        DP("Error when setting current CUDA context\n");
+        CUDA_ERR_STRING(err);
+      }
+
+      for (auto &S : Streams[I])
+        if (S) {
+          err = cuStreamDestroy(S);
+          if (err != CUDA_SUCCESS) {
+            DP("Error when destroying CUDA stream\n");
+            CUDA_ERR_STRING(err);
+          }
+        }
+    }
+
     // Destroy contexts
     for (auto &ctx : Contexts)
       if (ctx) {
@@ -294,6 +349,20 @@
     return OFFLOAD_FAIL;
   }
 
+  err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when setting current CUDA context\n");
+    CUDA_ERR_STRING(err);
+  }
+
+  for (CUstream &Stream : DeviceInfo.Streams[device_id]) {
+    err = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING);
+    if (err != CUDA_SUCCESS) {
+      DP("Error when creating CUDA stream\n");
+      CUDA_ERR_STRING(err);
+    }
+  }
+
   // Query attributes to determine number of threads/block and blocks/grid.
   int maxGridDimX;
   err = cuDeviceGetAttribute(&maxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
@@ -607,14 +676,26 @@
     return OFFLOAD_FAIL;
   }
 
-  err = cuMemcpyHtoD((CUdeviceptr)tgt_ptr, hst_ptr, size);
+  CUstream &Stream = DeviceInfo.getNextStream(device_id);
+
+  err = cuMemcpyHtoDAsync((CUdeviceptr)tgt_ptr, hst_ptr, size, Stream);
   if (err != CUDA_SUCCESS) {
     DP("Error when copying data from host to device. Pointers: host = " DPxMOD
-       ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr),
-       DPxPTR(tgt_ptr), size);
+       ", device = " DPxMOD ", size = %" PRId64 "\n",
+       DPxPTR(hst_ptr), DPxPTR(tgt_ptr), size);
     CUDA_ERR_STRING(err);
     return OFFLOAD_FAIL;
   }
+
+  err = cuStreamSynchronize(Stream);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when synchronizing async data transfer from host to device. "
+       "Pointers: host = " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
+       DPxPTR(hst_ptr), DPxPTR(tgt_ptr), size);
+    CUDA_ERR_STRING(err);
+    return OFFLOAD_FAIL;
+  }
+
   return OFFLOAD_SUCCESS;
 }
 
@@ -628,14 +709,26 @@
     return OFFLOAD_FAIL;
   }
 
-  err = cuMemcpyDtoH(hst_ptr, (CUdeviceptr)tgt_ptr, size);
+  CUstream &Stream = DeviceInfo.getNextStream(device_id);
+
+  err = cuMemcpyDtoHAsync(hst_ptr, (CUdeviceptr)tgt_ptr, size, Stream);
   if (err != CUDA_SUCCESS) {
     DP("Error when copying data from device to host. Pointers: host = " DPxMOD
-       ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr),
-       DPxPTR(tgt_ptr), size);
+       ", device = " DPxMOD ", size = %" PRId64 "\n",
+       DPxPTR(hst_ptr), DPxPTR(tgt_ptr), size);
+    CUDA_ERR_STRING(err);
+    return OFFLOAD_FAIL;
+  }
+
+  err = cuStreamSynchronize(Stream);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when synchronizing async data transfer from device to host. "
+       "Pointers: host = " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
+       DPxPTR(hst_ptr), DPxPTR(tgt_ptr), size);
     CUDA_ERR_STRING(err);
     return OFFLOAD_FAIL;
   }
+
   return OFFLOAD_SUCCESS;
 }
 
@@ -755,8 +848,11 @@
   DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid,
      cudaThreadsPerBlock);
 
+  CUstream &Stream = DeviceInfo.getNextStream(device_id);
+
   err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1,
-      cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/, 0, &args[0], 0);
+                       cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/,
+                       Stream, &args[0], 0);
   if (err != CUDA_SUCCESS) {
     DP("Device kernel launch failed!\n");
     CUDA_ERR_STRING(err);
@@ -766,7 +862,7 @@
   DP("Launch of entry point at " DPxMOD " successful!\n",
      DPxPTR(tgt_entry_ptr));
 
-  CUresult sync_err = cuCtxSynchronize();
+  CUresult sync_err = cuStreamSynchronize(Stream);
   if (sync_err != CUDA_SUCCESS) {
     DP("Kernel execution error at " DPxMOD "!\n", DPxPTR(tgt_entry_ptr));
     CUDA_ERR_STRING(sync_err);
Index: openmp/libomptarget/test/offloading/parallel_offloading_map.c
===================================================================
--- /dev/null
+++ openmp/libomptarget/test/offloading/parallel_offloading_map.c
@@ -0,0 +1,41 @@
+// RUN: %libomptarget-compilexx-run-and-check-aarch64-unknown-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu
+#include <assert.h>
+#include <stdio.h>
+
+int main(int argc, char *argv[]) {
+  const int num_threads = 64, N = 128;
+  int array[num_threads] = {0};
+
+#pragma omp parallel for
+  for (int i = 0; i < num_threads; ++i) {
+    int tmp[N];
+
+    for (int j = 0; j < N; ++j) {
+      tmp[j] = i;
+    }
+
+#pragma omp target teams distribute parallel for map(tofrom : tmp)
+    for (int j = 0; j < N; ++j) {
+      tmp[j] += j;
+    }
+
+    for (int j = 0; j < N; ++j) {
+      array[i] += tmp[j];
+    }
+  }
+
+  // Verify
+  for (int i = 0; i < num_threads; ++i) {
+    const int ref = (0 + N - 1) * N / 2 + i * N;
+    assert(array[i] == ref);
+  }
+
+  printf("PASS\n");
+
+  return 0;
+}
+
+// CHECK: PASS
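
Note (not part of the patch): the snippet below is a minimal, CUDA-free sketch of the round-robin selection that getNextStream() above implements. Plain ints stand in for CUstream handles and the made-up constant kNumStreams stands in for EnvNumStreams; it only illustrates why an atomic fetch_add plus modulo hands out distinct pool slots to concurrent callers.

    // Illustrative sketch only; ints replace CUstream so it builds without CUDA.
    #include <atomic>
    #include <iostream>
    #include <vector>

    int main() {
      const unsigned kNumStreams = 4;        // stand-in for EnvNumStreams
      std::vector<int> Streams(kNumStreams); // stand-in for std::vector<CUstream>
      std::atomic_uint NextStreamId(0);

      // Same idiom as getNextStream(): the atomic counter gives every caller a
      // unique ticket, and the modulo wraps it onto the fixed-size pool.
      auto getNextSlot = [&]() -> int & {
        const unsigned Id = NextStreamId.fetch_add(1);
        return Streams[Id % kNumStreams];
      };

      for (int I = 0; I < 10; ++I)
        ++getNextSlot(); // ten requests spread over the four slots

      for (unsigned I = 0; I < kNumStreams; ++I)
        std::cout << "slot " << I << " used " << Streams[I] << " times\n";
      return 0;
    }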