diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -1785,6 +1785,12 @@
       return Err;
     GridValues.GV_Default_Num_Teams = ComputeUnits * OMPX_DefaultTeamsPerCU;
 
+    uint32_t WavesPerCU = 0;
+    if (auto Err =
+            getDeviceAttr(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, WavesPerCU))
+      return Err;
+    HardwareParallelism = ComputeUnits * WavesPerCU;
+
     // Get maximum size of any device queues and maximum number of queues.
     uint32_t MaxQueueSize;
     if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUE_MAX_SIZE, MaxQueueSize))
@@ -1932,6 +1938,12 @@
     return libomptargetSupportsRPC();
   }
 
+  /// AMDGPU returns the product of the number of compute units and the waves
+  /// per compute unit.
+  uint64_t requestedRPCPortCount() const override {
+    return HardwareParallelism;
+  }
+
   /// Get the stream of the asynchronous info sructure or get a new one.
   Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper,
                   AMDGPUStreamTy *&Stream) {
@@ -2577,6 +2589,9 @@
   /// The frequency of the steady clock inside the device.
   uint64_t ClockFrequency;
 
+  /// The total number of concurrent work items that can be running on the GPU.
+  uint64_t HardwareParallelism;
+
   /// Reference to the host device.
   AMDHostDeviceTy &HostDevice;
 };
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
@@ -782,6 +782,19 @@
   /// Get the RPC server running on this device.
   RPCServerTy *getRPCServer() const { return RPCServer; }
 
+  /// The number of parallel RPC ports to use on the device. In general, this
+  /// should be roughly equivalent to the amount of hardware parallelism the
+  /// device can support. This is because GPUs in general do not have forward
+  /// progress guarantees, so we minimize thread level dependencies by
+  /// allocating enough space such that each device thread can have a port. This
+  /// is likely overly pessimistic in the average case, but guarantees no
+  /// deadlocks at the cost of memory. This must be overloaded by targets
+  /// expecting to use the RPC server.
+  virtual uint64_t requestedRPCPortCount() const {
+    assert(!shouldSetupRPCServer() && "Default implementation cannot be used");
+    return 0;
+  }
+
 private:
   /// Register offload entry for global variable.
   Error registerGlobalOffloadEntry(DeviceImageTy &DeviceImage,
@@ -888,7 +901,6 @@
 #endif
 
 private:
-
   /// Return the kernel environment object for kernel \p Name.
   Expected<KernelEnvironmentTy>
   getKernelEnvironmentForKernel(StringRef Name, DeviceImageTy &Image);
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp
@@ -59,8 +59,9 @@
         *reinterpret_cast<plugin::GenericDeviceTy *>(Data);
     return Device.allocate(Size, nullptr, TARGET_ALLOC_HOST);
   };
-  // TODO: Allow the device to declare its requested port count.
-  if (rpc_status_t Err = rpc_server_init(DeviceId, RPC_MAXIMUM_PORT_COUNT,
+  uint64_t NumPorts =
+      std::min(Device.requestedRPCPortCount(), RPC_MAXIMUM_PORT_COUNT);
+  if (rpc_status_t Err = rpc_server_init(DeviceId, NumPorts,
                                          Device.getWarpSize(), Alloc, &Device))
     return plugin::Plugin::error(
         "Failed to initialize RPC server for device %d: %d", DeviceId, Err);
diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
--- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
@@ -295,6 +295,19 @@
                                  ComputeCapability.Minor))
       return Err;
 
+    uint32_t NumMultiprocessors = 0;
+    uint32_t MaxThreadsPerSM = 0;
+    uint32_t WarpSize = 0;
+    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
+                                 NumMultiprocessors))
+      return Err;
+    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
+                                 MaxThreadsPerSM))
+      return Err;
+    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_WARP_SIZE, WarpSize))
+      return Err;
+    HardwareParallelism = NumMultiprocessors * (MaxThreadsPerSM / WarpSize);
+
     return Plugin::success();
   }
 
@@ -366,6 +379,12 @@
     return libomptargetSupportsRPC();
   }
 
+  /// NVIDIA returns the product of the SM count and the number of warps that
+  /// fit if the maximum number of threads were scheduled on each SM.
+  uint64_t requestedRPCPortCount() const override {
+    return HardwareParallelism;
+  }
+
   /// Get the stream of the asynchronous info sructure or get a new one.
   Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper, CUstream &Stream) {
     // Get the stream (if any) from the async info.
@@ -876,6 +895,10 @@
       return "sm_" + std::to_string(Major * 10 + Minor);
     }
   } ComputeCapability;
+
+  /// The maximum number of warps that can be resident on all the SMs
+  /// simultaneously.
+  uint32_t HardwareParallelism = 0;
 };
 
 Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,