Add one CUDA stream per device to the libomptarget CUDA plugin and launch
kernels on that stream instead of passing 0 as the stream argument.

Review note: this patch was reconstructed from a copy that had been collapsed
onto a single line with its template arguments stripped. The std::vector
element types on context lines and the leading whitespace of every line are
inferred; confirm them against rtl.cpp (or apply with `patch -l`).

Index: openmp/libomptarget/plugins/cuda/src/rtl.cpp
===================================================================
--- openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -95,6 +95,7 @@
   int NumberOfDevices;
   std::vector<CUmodule> Modules;
   std::vector<CUcontext> Contexts;
+  std::vector<CUstream> Streams;
 
   // Device properties
   std::vector<int> ThreadsPerBlock;
@@ -205,6 +206,7 @@
 
     FuncGblEntries.resize(NumberOfDevices);
     Contexts.resize(NumberOfDevices);
+    Streams.resize(NumberOfDevices);
     ThreadsPerBlock.resize(NumberOfDevices);
     BlocksPerGrid.resize(NumberOfDevices);
     WarpSize.resize(NumberOfDevices);
@@ -253,6 +255,16 @@
           CUDA_ERR_STRING(err);
         }
       }
+
+    // Destroy streams. NOTE(review): confirm their contexts are still live.
+    for (auto &stream : Streams)
+      if (stream) {
+        CUresult err = cuStreamDestroy(stream);
+        if (err != CUDA_SUCCESS) {
+          DP("Error when destroying CUDA stream\n");
+          CUDA_ERR_STRING(err);
+        }
+      }
   }
 };
 
@@ -294,6 +306,22 @@
     return OFFLOAD_FAIL;
   }
 
+  // Make this device's context current so the stream below is created in it
+  err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when setting current CUDA context\n");
+    CUDA_ERR_STRING(err);
+    return OFFLOAD_FAIL;
+  }
+
+  // Create one non-blocking stream for this device
+  err = cuStreamCreate(&DeviceInfo.Streams[device_id], CU_STREAM_NON_BLOCKING);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when creating CUDA stream\n");
+    CUDA_ERR_STRING(err);
+    return OFFLOAD_FAIL;
+  }
+
   // Query attributes to determine number of threads/block and blocks/grid.
   int maxGridDimX;
   err = cuDeviceGetAttribute(&maxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
@@ -756,7 +784,8 @@
      cudaThreadsPerBlock);
 
   err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1,
-      cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/, 0, &args[0], 0);
+      cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/,
+      DeviceInfo.Streams[device_id], &args[0], 0);
   if (err != CUDA_SUCCESS) {
     DP("Device kernel launch failed!\n");
     CUDA_ERR_STRING(err);