diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp --- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp @@ -144,6 +144,8 @@ return OFFLOAD_SUCCESS; } +namespace { + // Structure contains per-device data struct DeviceDataTy { /// List that contains all the kernels. @@ -164,23 +166,27 @@ /// Resource allocator where \p T is the resource type. /// Functions \p create and \p destroy return OFFLOAD_SUCCESS and OFFLOAD_FAIL /// accordingly. The implementation should not raise any exception. -template class AllocatorTy { -public: +template struct AllocatorTy { + AllocatorTy(CUcontext C) noexcept : Context(C) {} + using ElementTy = T; + + virtual ~AllocatorTy() {} + /// Create a resource and assign to R. - int create(T &R) noexcept; + virtual int create(T &R) noexcept = 0; /// Destroy the resource. - int destroy(T) noexcept; -}; + virtual int destroy(T) noexcept = 0; -/// Allocator for CUstream. -template <> class AllocatorTy { +protected: CUcontext Context; +}; -public: - AllocatorTy(CUcontext C) noexcept : Context(C) {} +/// Allocator for CUstream. +struct StreamAllocatorTy final : public AllocatorTy { + StreamAllocatorTy(CUcontext C) noexcept : AllocatorTy(C) {} /// See AllocatorTy::create. - int create(CUstream &Stream) noexcept { + int create(CUstream &Stream) noexcept override { if (!checkResult(cuCtxSetCurrent(Context), "Error returned from cuCtxSetCurrent\n")) return OFFLOAD_FAIL; @@ -193,7 +199,7 @@ } /// See AllocatorTy::destroy. - int destroy(CUstream Stream) noexcept { + int destroy(CUstream Stream) noexcept override { if (!checkResult(cuCtxSetCurrent(Context), "Error returned from cuCtxSetCurrent\n")) return OFFLOAD_FAIL; @@ -206,10 +212,11 @@ }; /// Allocator for CUevent. -template <> class AllocatorTy { -public: +struct EventAllocatorTy final : public AllocatorTy { + EventAllocatorTy(CUcontext C) noexcept : AllocatorTy(C) {} + /// See AllocatorTy::create. - int create(CUevent &Event) noexcept { + int create(CUevent &Event) noexcept override { if (!checkResult(cuEventCreate(&Event, CU_EVENT_DEFAULT), "Error returned from cuEventCreate\n")) return OFFLOAD_FAIL; @@ -218,7 +225,7 @@ } /// See AllocatorTy::destroy. - int destroy(CUevent Event) noexcept { + int destroy(CUevent Event) noexcept override { if (!checkResult(cuEventDestroy(Event), "Error returned from cuEventDestroy\n")) return OFFLOAD_FAIL; @@ -229,15 +236,16 @@ /// A generic pool of resources where \p T is the resource type. /// \p T should be copyable as the object is stored in \p std::vector . -template class ResourcePoolTy { +template class ResourcePoolTy { + using ElementTy = typename AllocTy::ElementTy; /// Index of the next available resource. size_t Next = 0; /// Mutex to guard the pool. std::mutex Mutex; /// Pool of resources. - std::vector Resources; + std::vector Resources; /// A reference to the corresponding allocator. - AllocatorTy Allocator; + AllocTy Allocator; /// If `Resources` is used up, we will fill in more resources. It assumes that /// the new size `Size` should be always larger than the current size. @@ -246,7 +254,7 @@ assert(Size > CurSize && "Unexpected smaller size"); Resources.reserve(Size); for (auto I = CurSize; I < Size; ++I) { - T NewItem; + ElementTy NewItem; int Ret = Allocator.create(NewItem); if (Ret != OFFLOAD_SUCCESS) return false; @@ -256,7 +264,7 @@ } public: - ResourcePoolTy(AllocatorTy &&A, size_t Size = 0) noexcept + ResourcePoolTy(AllocTy &&A, size_t Size = 0) noexcept : Allocator(std::move(A)) { if (Size) (void)resize(Size); @@ -275,7 +283,7 @@ /// xxxxxs+++++++++ /// ^ /// Next - int acquire(T &R) noexcept { + int acquire(ElementTy &R) noexcept { std::lock_guard LG(Mutex); if (Next == Resources.size()) { auto NewSize = Resources.size() ? Resources.size() * 2 : 1; @@ -302,7 +310,7 @@ /// `Next`. The left one will in the end be overwritten by another resource. /// Therefore, after several execution, the order of pool might be different /// from its initial state. - void release(T R) noexcept { + void release(ElementTy R) noexcept { std::lock_guard LG(Mutex); Resources[--Next] = R; } @@ -316,6 +324,8 @@ } }; +} // namespace + class DeviceRTLTy { int NumberOfDevices; // OpenMP environment properties @@ -326,17 +336,22 @@ int64_t RequiresFlags; // Amount of dynamic shared memory to use at launch. uint64_t DynamicMemorySize; - // Number of initial streams for each device. + + /// Number of initial streams for each device. int NumInitialStreams = 32; + /// Number of initial events for each device. + int NumInitialEvents = 8; + static constexpr const int32_t HardThreadLimit = 1024; static constexpr const int32_t DefaultNumTeams = 128; static constexpr const int32_t DefaultNumThreads = 128; - using StreamPoolTy = ResourcePoolTy; + using StreamPoolTy = ResourcePoolTy; std::vector> StreamPool; - ResourcePoolTy EventPool; + using EventPoolTy = ResourcePoolTy; + std::vector> EventPool; std::vector DeviceData; std::vector Modules; @@ -494,7 +509,7 @@ DeviceRTLTy() : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1), EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED), - DynamicMemorySize(0), EventPool(AllocatorTy()) { + DynamicMemorySize(0) { DP("Start initializing CUDA\n"); @@ -519,6 +534,7 @@ DeviceData.resize(NumberOfDevices); StreamPool.resize(NumberOfDevices); + EventPool.resize(NumberOfDevices); // Get environment variables regarding teams if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) { @@ -622,11 +638,15 @@ if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) return OFFLOAD_FAIL; - // Initialize stream pool + // Initialize the stream pool. if (!StreamPool[DeviceId]) StreamPool[DeviceId] = std::make_unique( - AllocatorTy(DeviceData[DeviceId].Context), - NumInitialStreams); + StreamAllocatorTy(DeviceData[DeviceId].Context), NumInitialStreams); + + // Initialize the event pool. + if (!EventPool[DeviceId]) + EventPool[DeviceId] = std::make_unique( + EventAllocatorTy(DeviceData[DeviceId].Context), NumInitialEvents); // Query attributes to determine number of threads/block and blocks/grid. int MaxGridDimX; @@ -761,12 +781,7 @@ checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n"); StreamPool[DeviceId].reset(); - - // The event pool is shared, we initialize it once all devices have been - // deinitialized. - if (std::none_of(InitializedFlags.begin(), InitializedFlags.end(), - [](bool IsInitialized) { return IsInitialized; })) - EventPool.clear(); + EventPool[DeviceId].reset(); // Destroy context DeviceDataTy &D = DeviceData[DeviceId]; @@ -1412,16 +1427,16 @@ printf(" Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2); } - int createEvent(void **P) { + int createEvent(int DeviceId, void **P) { CUevent Event = nullptr; - if (EventPool.acquire(Event) != OFFLOAD_SUCCESS) + if (EventPool[DeviceId]->acquire(Event) != OFFLOAD_SUCCESS) return OFFLOAD_FAIL; *P = Event; return OFFLOAD_SUCCESS; } - int destroyEvent(void *EventPtr) { - EventPool.release(reinterpret_cast(EventPtr)); + int destroyEvent(int DeviceId, void *EventPtr) { + EventPool[DeviceId]->release(reinterpret_cast(EventPtr)); return OFFLOAD_SUCCESS; } @@ -1695,7 +1710,7 @@ int32_t __tgt_rtl_create_event(int32_t device_id, void **event) { assert(event && "event is nullptr"); - return DeviceRTL.createEvent(event); + return DeviceRTL.createEvent(device_id, event); } int32_t __tgt_rtl_record_event(int32_t device_id, void *event_ptr, @@ -1725,7 +1740,7 @@ int32_t __tgt_rtl_destroy_event(int32_t device_id, void *event_ptr) { assert(event_ptr && "event is nullptr"); - return DeviceRTL.destroyEvent(event_ptr); + return DeviceRTL.destroyEvent(device_id, event_ptr); } int32_t __tgt_rtl_release_async_info(int32_t device_id,