diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -1653,6 +1653,12 @@ return Err; GridValues.GV_Default_Num_Teams = ComputeUnits * OMPX_DefaultTeamsPerCU; + uint32_t WavesPerCU = 0; + if (auto Err = + getDeviceAttr(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, WavesPerCU)) + return Err; + HardwareParallelism = ComputeUnits * WavesPerCU; + // Get maximum size of any device queues and maximum number of queues. uint32_t MaxQueueSize; if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUE_MAX_SIZE, MaxQueueSize)) @@ -1813,6 +1819,12 @@ return libomptargetSupportsRPC(); } + /// AMDGPU returns the product of the number of compute units and the waves + /// per compute unit. + uint64_t requestedRPCPortCount() const override { + return HardwareParallelism; + } + /// Get the stream of the asynchronous info sructure or get a new one. AMDGPUStreamTy &getStream(AsyncInfoWrapperTy &AsyncInfoWrapper) { AMDGPUStreamTy *&Stream = AsyncInfoWrapper.getQueueAs(); @@ -2455,6 +2467,9 @@ /// The frequency of the steady clock inside the device. uint64_t ClockFrequency; + /// The total number of concurrent work items that can be running on the GPU. + uint64_t HardwareParallelism; + /// Reference to the host device. AMDHostDeviceTy &HostDevice; diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h @@ -764,6 +764,19 @@ /// Get the RPC server running on this device. RPCServerTy *getRPCServer() const { return RPCServer; } + /// The number of parallel RPC ports to use on the device. 
In general, this + /// should be roughly equivalent to the amount of hardware parallelism the + /// device can support. This is because GPUs in general do not have forward + /// progress guarantees, so we minimize thread level dependencies by + /// allocating enough space such that each device thread can have a port. This + /// is likely overly pessimistic in the average case, but guarantees no + /// deadlocks at the cost of memory. This must be overloaded by targets + /// expecting to use the RPC server. + virtual uint64_t requestedRPCPortCount() const { + assert(!shouldSetupRPCServer() && "Default implementation cannot be used"); + return 0; + } + private: /// Register offload entry for global variable. Error registerGlobalOffloadEntry(DeviceImageTy &DeviceImage, @@ -870,7 +883,6 @@ #endif private: - /// Return the kernel environment object for kernel \p Name. Expected getKernelEnvironmentForKernel(StringRef Name, DeviceImageTy &Image); diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp @@ -59,8 +59,9 @@ *reinterpret_cast(Data); return Device.allocate(Size, nullptr, TARGET_ALLOC_HOST); }; - // TODO: Allow the device to declare its requested port count. 
- if (rpc_status_t Err = rpc_server_init(DeviceId, RPC_MAXIMUM_PORT_COUNT, + uint64_t NumPorts = + std::min(Device.requestedRPCPortCount(), RPC_MAXIMUM_PORT_COUNT); + if (rpc_status_t Err = rpc_server_init(DeviceId, NumPorts, Device.getWarpSize(), Alloc, &Device)) return plugin::Plugin::error( "Failed to initialize RPC server for device %d: %d", DeviceId, Err); diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp --- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp @@ -287,6 +287,19 @@ ComputeCapability.Minor)) return Err; + uint32_t NumMuliprocessors = 0; + uint32_t MaxThreadsPerSM = 0; + uint32_t WarpSize = 0; + if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, + NumMuliprocessors)) + return Err; + if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, + MaxThreadsPerSM)) + return Err; + if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_WARP_SIZE, WarpSize)) + return Err; + HardwareParallelism = NumMuliprocessors * (MaxThreadsPerSM / WarpSize); + return Plugin::success(); } @@ -373,6 +386,12 @@ return libomptargetSupportsRPC(); } + /// NVIDIA returns the product of the number of SMs and the number of warps that + /// fit if the maximum number of threads were scheduled on each SM. + uint64_t requestedRPCPortCount() const override { + return HardwareParallelism; + } + /// Get the stream of the asynchronous info sructure or get a new one. CUstream getStream(AsyncInfoWrapperTy &AsyncInfoWrapper) { CUstream &Stream = AsyncInfoWrapper.getQueueAs(); @@ -875,6 +894,10 @@ return "sm_" + std::to_string(Major * 10 + Minor); } } ComputeCapability; + + /// The maximum number of warps that can be resident on all the SMs + /// simultaneously. + uint32_t HardwareParallelism = 0; }; Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,