Index: openmp/libomptarget/plugins/cuda/src/rtl.cpp
===================================================================
--- openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -10,10 +10,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include <atomic>
 #include <cassert>
 #include <cstddef>
 #include <cuda.h>
 #include <list>
+#include <memory>
 #include <string>
 #include <vector>
 
@@ -90,11 +92,13 @@
 /// Class containing all the device information.
 class RTLDeviceInfoTy {
   std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
+  std::vector<std::unique_ptr<std::atomic_int>> NextStreamId;
 
 public:
   int NumberOfDevices;
   std::vector<CUmodule> Modules;
   std::vector<CUcontext> Contexts;
+  std::vector<std::vector<CUstream>> Streams;
 
   // Device properties
   std::vector<int> ThreadsPerBlock;
@@ -108,6 +112,7 @@
   // OpenMP Environment properties
   int EnvNumTeams;
   int EnvTeamLimit;
+  int EnvNumStreams;
 
   // OpenMP Requires Flags
   int64_t RequiresFlags;
@@ -173,6 +178,12 @@
     E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
   }
 
+  // Get the next stream on a given device in a round robin manner
+  CUstream &getNextStream(const int DeviceId) {
+    const int Id = NextStreamId[DeviceId]->fetch_add(1);
+    return Streams[DeviceId][Id % EnvNumStreams];
+  }
+
   RTLDeviceInfoTy() {
 #ifdef OMPTARGET_DEBUG
     if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) {
@@ -205,6 +216,8 @@
     FuncGblEntries.resize(NumberOfDevices);
     Contexts.resize(NumberOfDevices);
+    Streams.resize(NumberOfDevices);
+    NextStreamId.resize(NumberOfDevices);
     ThreadsPerBlock.resize(NumberOfDevices);
     BlocksPerGrid.resize(NumberOfDevices);
     WarpSize.resize(NumberOfDevices);
 
@@ -229,6 +242,23 @@
       EnvNumTeams = -1;
     }
 
+    // By default let's create 32 streams per device
+    EnvNumStreams = 32;
+    envStr = getenv("LIBOMPTARGET_NUM_STREAMS");
+    if (envStr) {
+      EnvNumStreams = std::stoi(envStr);
+    }
+
+    // Initialize streams for each device
+    for (std::vector<CUstream> &S : Streams) {
+      S.resize(EnvNumStreams);
+    }
+
+    // Initialize the next stream id
+    for (std::unique_ptr<std::atomic_int> &Ptr : NextStreamId) {
+      Ptr = std::unique_ptr<std::atomic_int>(new std::atomic_int(0));
+    }
+
     // Default state.
     RequiresFlags = OMP_REQ_UNDEFINED;
   }
@@ -244,6 +274,24 @@
         }
       }
 
+    // Destroy streams before contexts
+    for (int I = 0; I < NumberOfDevices; ++I) {
+      CUresult err = cuCtxSetCurrent(Contexts[I]);
+      if (err != CUDA_SUCCESS) {
+        DP("Error when setting current CUDA context\n");
+        CUDA_ERR_STRING(err);
+      }
+
+      for (auto &S : Streams[I])
+        if (S) {
+          err = cuStreamDestroy(S);
+          if (err != CUDA_SUCCESS) {
+            DP("Error when destroying CUDA stream\n");
+            CUDA_ERR_STRING(err);
+          }
+        }
+    }
+
     // Destroy contexts
     for (auto &ctx : Contexts)
       if (ctx) {
@@ -294,6 +342,20 @@
     return OFFLOAD_FAIL;
   }
 
+  err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when setting current CUDA context\n");
+    CUDA_ERR_STRING(err);
+  }
+
+  for (CUstream &Stream : DeviceInfo.Streams[device_id]) {
+    err = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING);
+    if (err != CUDA_SUCCESS) {
+      DP("Error when creating CUDA stream\n");
+      CUDA_ERR_STRING(err);
+    }
+  }
+
   // Query attributes to determine number of threads/block and blocks/grid.
   int maxGridDimX;
   err = cuDeviceGetAttribute(&maxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
@@ -756,7 +818,8 @@
       cudaThreadsPerBlock);
 
   err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1,
-      cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/, 0, &args[0], 0);
+      cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/,
+      DeviceInfo.getNextStream(device_id), &args[0], 0);
   if (err != CUDA_SUCCESS) {
     DP("Device kernel launch failed!\n");
     CUDA_ERR_STRING(err);
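
Note on the selection scheme: the round-robin logic added in getNextStream is an atomic ticket counter reduced modulo the size of the per-device stream pool, so concurrent kernel launches from different host threads spread across streams. The following is a minimal standalone sketch of that pattern, assuming plain C++11 with no CUDA dependency; the StreamPool type, the placeholder int handles, and the pool size of 4 are illustrative assumptions, not part of the patch.

#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

// Illustrative stand-in for a per-device pool of CUstream handles.
struct StreamPool {
  std::vector<int> Streams;   // placeholder handles instead of CUstream
  std::atomic_int NextId{0};  // monotonically increasing ticket

  explicit StreamPool(int Size) : Streams(Size) {
    for (int I = 0; I < Size; ++I)
      Streams[I] = I;
  }

  // Same scheme as getNextStream in the patch: fetch_add hands every
  // caller a unique ticket; modulo wraps it onto the fixed-size pool.
  int &getNext() {
    const int Id = NextId.fetch_add(1);
    return Streams[Id % static_cast<int>(Streams.size())];
  }
};

int main() {
  StreamPool Pool(/*Size=*/4); // assumption: small pool for the demo

  // Several host threads picking streams concurrently.
  std::vector<std::thread> Workers;
  for (int T = 0; T < 8; ++T)
    Workers.emplace_back([&Pool, T] {
      int Handle = Pool.getNext();
      std::printf("thread %d -> stream handle %d\n", T, Handle);
    });
  for (std::thread &W : Workers)
    W.join();
  return 0;
}

One caveat with a signed ticket as in the patch's `const int Id`: atomic signed arithmetic wraps, so after roughly two billion launches the counter goes negative and `Id % EnvNumStreams` can produce a negative index; an unsigned counter type would keep the wraparound index in range.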