diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp --- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp @@ -164,20 +164,21 @@ /// Resource allocator where \p T is the resource type. /// Functions \p create and \p destroy return OFFLOAD_SUCCESS and OFFLOAD_FAIL /// accordingly. The implementation should not raise any exception. -template <typename T> class AllocatorTy { -public: +template <typename T> struct AllocatorTy { + AllocatorTy(CUcontext C) noexcept : Context(C) {} + /// Create a resource and assign to R. int create(T &R) noexcept; /// Destroy the resource. int destroy(T) noexcept; -}; -/// Allocator for CUstream. -template <> class AllocatorTy<CUstream> { +protected: CUcontext Context; +}; -public: - AllocatorTy(CUcontext C) noexcept : Context(C) {} +/// Allocator for CUstream. +struct StreamAllocatorTy : public AllocatorTy<CUstream> { + StreamAllocatorTy(CUcontext C) noexcept : AllocatorTy(C) {} /// See AllocatorTy::create. int create(CUstream &Stream) noexcept { @@ -206,8 +207,9 @@ }; /// Allocator for CUevent. -template <> class AllocatorTy<CUevent> { -public: +struct EventAllocatorTy : public AllocatorTy<CUevent> { + EventAllocatorTy(CUcontext C) noexcept : AllocatorTy(C) {} + /// See AllocatorTy::create. int create(CUevent &Event) noexcept { if (!checkResult(cuEventCreate(&Event, CU_EVENT_DEFAULT), @@ -326,9 +328,13 @@ int64_t RequiresFlags; // Amount of dynamic shared memory to use at launch. uint64_t DynamicMemorySize; - // Number of initial streams for each device. + + /// Number of initial streams for each device. int NumInitialStreams = 32; + /// Number of initial events for each device. 
+ int NumInitialEvents = 8; + static constexpr const int HardTeamLimit = 1U << 16U; // 64k static constexpr const int HardThreadLimit = 1024; static constexpr const int DefaultNumTeams = 128; @@ -337,7 +343,8 @@ using StreamPoolTy = ResourcePoolTy<CUstream>; std::vector<std::unique_ptr<StreamPoolTy>> StreamPool; - ResourcePoolTy<CUevent> EventPool; + using EventPoolTy = ResourcePoolTy<CUevent>; + std::vector<std::unique_ptr<EventPoolTy>> EventPool; std::vector<DeviceDataTy> DeviceData; std::vector<CUmodule> Modules; @@ -495,7 +502,7 @@ DeviceRTLTy() : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1), EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED), - DynamicMemorySize(0), EventPool(AllocatorTy<CUevent>()) { + DynamicMemorySize(0) { DP("Start initializing CUDA\n"); @@ -520,6 +527,7 @@ DeviceData.resize(NumberOfDevices); StreamPool.resize(NumberOfDevices); + EventPool.resize(NumberOfDevices); // Get environment variables regarding teams if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) { @@ -621,12 +629,18 @@ if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) return OFFLOAD_FAIL; - // Initialize stream pool + // Initialize the stream pool. if (!StreamPool[DeviceId]) StreamPool[DeviceId] = std::make_unique<StreamPoolTy>( - AllocatorTy<CUstream>(DeviceData[DeviceId].Context), + StreamAllocatorTy(DeviceData[DeviceId].Context), NumInitialStreams); + // Initialize the event pool. + if (!EventPool[DeviceId]) + EventPool[DeviceId] = std::make_unique<EventPoolTy>( + EventAllocatorTy(DeviceData[DeviceId].Context), + NumInitialEvents); + // Query attributes to determine number of threads/block and blocks/grid. int MaxGridDimX; Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, @@ -765,12 +779,7 @@ checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n"); StreamPool[DeviceId].reset(); - - // The event pool is shared, we initialize it once all devices have been - // deinitialized. 
- if (std::none_of(InitializedFlags.begin(), InitializedFlags.end(), - [](bool IsInitialized) { return IsInitialized; })) - EventPool.clear(); + EventPool[DeviceId].reset(); // Destroy context DeviceDataTy &D = DeviceData[DeviceId]; @@ -1415,16 +1424,16 @@ printf(" Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2); } - int createEvent(void **P) { + int createEvent(int DeviceId, void **P) { CUevent Event = nullptr; - if (EventPool.acquire(Event) != OFFLOAD_SUCCESS) + if (EventPool[DeviceId]->acquire(Event) != OFFLOAD_SUCCESS) return OFFLOAD_FAIL; *P = Event; return OFFLOAD_SUCCESS; } - int destroyEvent(void *EventPtr) { - EventPool.release(reinterpret_cast<CUevent>(EventPtr)); + int destroyEvent(int DeviceId, void *EventPtr) { + EventPool[DeviceId]->release(reinterpret_cast<CUevent>(EventPtr)); return OFFLOAD_SUCCESS; } @@ -1698,7 +1707,7 @@ int32_t __tgt_rtl_create_event(int32_t device_id, void **event) { assert(event && "event is nullptr"); - return DeviceRTL.createEvent(event); + return DeviceRTL.createEvent(device_id, event); } int32_t __tgt_rtl_record_event(int32_t device_id, void *event_ptr, @@ -1728,7 +1737,7 @@ int32_t __tgt_rtl_destroy_event(int32_t device_id, void *event_ptr) { assert(event_ptr && "event is nullptr"); - return DeviceRTL.destroyEvent(event_ptr); + return DeviceRTL.destroyEvent(device_id, event_ptr); } int32_t __tgt_rtl_release_async_info(int32_t device_id,