Index: llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
===================================================================
--- llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
+++ llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
@@ -66,6 +66,9 @@
   /// the maximum number of teams.
   unsigned GV_Max_Teams;
 
+  // The default number of teams.
+  unsigned GV_Default_Num_Teams;
+
   // An alternative to the heavy data sharing infrastructure that uses global
   // memory is one that uses device __shared__ memory. The amount of such space
   // (in bytes) reserved by the OpenMP runtime is noted here.
@@ -85,6 +88,7 @@
     256,  // GV_Slot_Size
     64,   // GV_Warp_Size
     128,  // GV_Max_Teams
+    440,  // GV_Default_Num_Teams
     896,  // GV_SimpleBufferSize
     1024, // GV_Max_WG_Size,
     256,  // GV_Default_WG_Size
@@ -94,6 +98,7 @@
     256,  // GV_Slot_Size
     32,   // GV_Warp_Size
     128,  // GV_Max_Teams
+    440,  // GV_Default_Num_Teams
     896,  // GV_SimpleBufferSize
     1024, // GV_Max_WG_Size,
     256,  // GV_Default_WG_Size
@@ -109,6 +114,7 @@
     256,  // GV_Slot_Size
     32,   // GV_Warp_Size
     1024, // GV_Max_Teams
+    3200, // GV_Default_Num_Teams
     896,  // GV_SimpleBufferSize
     1024, // GV_Max_WG_Size
     128,  // GV_Default_WG_Size
Index: openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
===================================================================
--- openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -121,6 +121,8 @@
   /// Create a reference to an existing resource.
   AMDGPUResourceRef(ResourceTy *Resource) : Resource(Resource) {}
 
+  virtual ~AMDGPUResourceRef() {}
+
   /// Create a new resource and save the reference. The reference must be empty
   /// before calling to this function.
   Error create(GenericDeviceTy &Device) override;
@@ -540,6 +542,10 @@
     // should be lightweight; do not block the thread, allocate memory, etc.
     std::lock_guard<std::mutex> Lock(Mutex);
 
+    // Avoid defining the input dependency if already satisfied.
+    if (InputSignal && !InputSignal->load())
+      InputSignal = nullptr;
+
     // Add a barrier packet before the kernel packet in case there is a pending
     // preceding operation. The barrier packet will delay the processing of
     // subsequent queue's packets until the barrier input signal are satisfied.
@@ -786,8 +792,19 @@
       return Plugin::success();
 
     // Perform the action.
-    if (auto Err = (*ActionFunction)(&ActionArgs))
-      return Err;
+    if (ActionFunction == memcpyAction) {
+      if (auto Err = memcpyAction(&ActionArgs))
+        return Err;
+    } else if (ActionFunction == releaseBufferAction) {
+      if (auto Err = releaseBufferAction(&ActionArgs))
+        return Err;
+    } else if (ActionFunction == releaseSignalAction) {
+      if (auto Err = releaseSignalAction(&ActionArgs))
+        return Err;
+    } else {
+      if (auto Err = (*ActionFunction)(&ActionArgs))
+        return Err;
+    }
 
     // Invalidate the action.
     ActionFunction = nullptr;
@@ -990,10 +1007,6 @@
     // Consume stream slot and compute dependencies.
     auto [Curr, InputSignal] = consume(OutputSignal);
 
-    // Avoid defining the input dependency if already satisfied.
-    if (InputSignal && !InputSignal->load())
-      InputSignal = nullptr;
-
     // Setup the post action to release the kernel args buffer.
     if (auto Err = Slots[Curr].schedReleaseBuffer(KernelArgs, MemoryManager))
       return Err;
@@ -1485,8 +1498,9 @@
   AMDGPUDeviceTy(int32_t DeviceId, int32_t NumDevices,
                  AMDHostDeviceTy &HostDevice, hsa_agent_t Agent)
       : GenericDeviceTy(DeviceId, NumDevices, {0}), AMDGenericDeviceTy(),
-        OMPX_NumQueues("LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES", 8),
-        OMPX_QueueSize("LIBOMPTARGET_AMDGPU_HSA_QUEUE_SIZE", 1024),
+        OMPX_NumQueues("LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES", 4),
+        OMPX_QueueSize("LIBOMPTARGET_AMDGPU_HSA_QUEUE_SIZE", 512),
+        OMPX_DefaultTeamsPerCU("LIBOMPTARGET_AMDGPU_TEAMS_PER_CU", 4),
        OMPX_MaxAsyncCopyBytes("LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES",
                               1 * 1024 * 1024), // 1MB
        OMPX_InitialNumSignals("LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS",
@@ -1528,10 +1542,18 @@
     hsa_dim3_t GridMaxDim;
     if (auto Err = getDeviceAttr(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim))
       return Err;
+
     GridValues.GV_Max_Teams = GridMaxDim.x / GridValues.GV_Max_WG_Size;
     if (GridValues.GV_Max_Teams == 0)
       return Plugin::error("Maximum number of teams cannot be zero");
 
+    // Compute the default number of teams.
+    uint32_t ComputeUnits = 0;
+    if (auto Err =
+            getDeviceAttr(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, ComputeUnits))
+      return Err;
+    GridValues.GV_Default_Num_Teams = ComputeUnits * OMPX_DefaultTeamsPerCU;
+
     // Get maximum size of any device queues and maximum number of queues.
     uint32_t MaxQueueSize;
     if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUE_MAX_SIZE, MaxQueueSize))
@@ -2014,6 +2036,11 @@
   /// process them.
   UInt32Envar OMPX_QueueSize;
 
+  /// Envar for controlling the default number of teams relative to the number
+  /// of compute units (CUs) the device has:
+  ///   #default_teams = OMPX_DefaultTeamsPerCU * #CUs.
+  UInt32Envar OMPX_DefaultTeamsPerCU;
+
   /// Envar specifying the maximum size in bytes where the memory copies are
   /// asynchronous operations. Up to this transfer size, the memory copies are
   /// asychronous operations pushed to the corresponding stream. For larger
@@ -2226,9 +2253,9 @@
     // Classify the agents into kernel (GPU) and host (CPU) kernels.
     if (DeviceType == HSA_DEVICE_TYPE_GPU) {
       // Ensure that the GPU agent supports kernel dispatch packets.
-      hsa_agent_feature_t features;
-      Status = hsa_agent_get_info(Agent, HSA_AGENT_INFO_FEATURE, &features);
-      if (features & HSA_AGENT_FEATURE_KERNEL_DISPATCH)
+      hsa_agent_feature_t Features;
+      Status = hsa_agent_get_info(Agent, HSA_AGENT_INFO_FEATURE, &Features);
+      if (Features & HSA_AGENT_FEATURE_KERNEL_DISPATCH)
         KernelAgents.push_back(Agent);
     } else if (DeviceType == HSA_DEVICE_TYPE_CPU) {
       HostAgents.push_back(Agent);
     }
@@ -2405,11 +2432,10 @@
   std::memset(ImplArgs, 0, ImplicitArgsSize);
 
   // Copy the explicit arguments.
-  for (int32_t ArgId = 0; ArgId < NumKernelArgs; ++ArgId) {
-    void *Dst = (char *)AllArgs + sizeof(void *) * ArgId;
-    void *Src = *((void **)KernelArgs + ArgId);
-    std::memcpy(Dst, Src, sizeof(void *));
-  }
+  // TODO: We should expose the args memory manager alloc to the common part as
+  // an alternative to copying them twice.
+  std::memcpy(AllArgs, *static_cast<void **>(KernelArgs),
+              sizeof(void *) * NumKernelArgs);
 
   AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(GenericDevice);
   AMDGPUStreamTy &Stream = AMDGPUDevice.getStream(AsyncInfoWrapper);
Index: openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
===================================================================
--- openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
+++ openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
@@ -372,8 +372,7 @@
     return GridValues.GV_Default_WG_Size;
   }
   uint64_t getDefaultNumBlocks() const {
-    // TODO: Introduce a default num blocks value.
-    return GridValues.GV_Default_WG_Size;
+    return GridValues.GV_Default_Num_Teams;
   }
 
   uint32_t getDynamicMemorySize() const { return OMPX_SharedMemorySize; }
Index: openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
===================================================================
--- openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
+++ openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
@@ -100,16 +100,21 @@
                                          uint64_t NumTeamsClause,
                                          uint64_t LoopTripCount,
                                          uint32_t NumThreads) const {
-  uint64_t PreferredNumBlocks = getDefaultNumBlocks(GenericDevice);
   if (NumTeamsClause > 0) {
-    PreferredNumBlocks = NumTeamsClause;
-  } else if (LoopTripCount > 0) {
+    // TODO: We need to honor any value and consequently allow more than the
+    // block limit. For this we might need to start multiple kernels or let the
+    // blocks start again until the requested number has been started.
+    return std::min(NumTeamsClause, GenericDevice.getBlockLimit());
+  }
+
+  uint64_t TripCountNumBlocks = std::numeric_limits<uint64_t>::max();
+  if (LoopTripCount > 0) {
     if (isSPMDMode()) {
       // We have a combined construct, i.e. `target teams distribute
       // parallel for [simd]`. We launch so many teams so that each thread
       // will execute one iteration of the loop. round up to the nearest
       // integer
-      PreferredNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
+      TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
     } else {
       assert((isGenericMode() || isGenericSPMDMode()) &&
              "Unexpected execution mode!");
@@ -125,9 +130,12 @@
       //
       // Threads within a team will execute the iterations of the `parallel`
       // loop.
-      PreferredNumBlocks = LoopTripCount;
+      TripCountNumBlocks = LoopTripCount;
     }
   }
+
+  // If the loops are long running, we'd rather reuse blocks than spawn too
+  // many.
+  uint64_t PreferredNumBlocks =
+      std::min(TripCountNumBlocks, getDefaultNumBlocks(GenericDevice));
   return std::min(PreferredNumBlocks, GenericDevice.getBlockLimit());
 }
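
Reviewer note: the net effect of the AMDGPU changes is that the default grid size now
scales with the device instead of reusing GV_Default_WG_Size. A minimal standalone
sketch of the new derivation follows; readEnvOr is a hypothetical stand-in for the
plugin's UInt32Envar helper, and the CU count would really come from hsa_agent_get_info
with HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, not a plain parameter.

#include <cstdint>
#include <cstdlib>
#include <string>

// Hypothetical stand-in for UInt32Envar: read an unsigned environment
// variable, falling back to the given default when it is unset.
static uint32_t readEnvOr(const char *Name, uint32_t Default) {
  if (const char *Value = std::getenv(Name))
    return static_cast<uint32_t>(std::stoul(Value));
  return Default;
}

// GV_Default_Num_Teams as the AMDGPU plugin now computes it: the device's
// compute unit count scaled by LIBOMPTARGET_AMDGPU_TEAMS_PER_CU (default 4).
static uint32_t defaultNumTeams(uint32_t ComputeUnits) {
  uint32_t TeamsPerCU = readEnvOr("LIBOMPTARGET_AMDGPU_TEAMS_PER_CU", 4);
  return ComputeUnits * TeamsPerCU;
}

For example, a 110-CU device yields 440 default teams, which lines up with the static
440 fallback this patch writes into the AMDGPU entries of OMPGridValues.h; setting
LIBOMPTARGET_AMDGPU_TEAMS_PER_CU=8 would double it.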
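Likewise, a self-contained model of the revised getNumBlocks() selection logic, handy
for checking the clamping behavior. The IsSPMDMode flag and the two limit parameters
are stand-ins for the GenericKernelTy/GenericDeviceTy queries; like the plugin code,
it assumes NumThreads > 0 whenever a trip count is supplied.

#include <algorithm>
#include <cstdint>
#include <limits>

// Model of GenericKernelTy::getNumBlocks() after this patch. BlockLimit
// stands in for GenericDevice.getBlockLimit(); DefaultNumBlocks stands in
// for getDefaultNumBlocks(), i.e. GV_Default_Num_Teams.
uint64_t getNumBlocks(uint64_t NumTeamsClause, uint64_t LoopTripCount,
                      uint32_t NumThreads, uint64_t DefaultNumBlocks,
                      uint64_t BlockLimit, bool IsSPMDMode) {
  // An explicit num_teams clause wins, clamped to the hardware limit.
  if (NumTeamsClause > 0)
    return std::min(NumTeamsClause, BlockLimit);

  uint64_t TripCountNumBlocks = std::numeric_limits<uint64_t>::max();
  if (LoopTripCount > 0) {
    if (IsSPMDMode)
      // Combined construct: one loop iteration per thread, rounded up.
      TripCountNumBlocks = (LoopTripCount - 1) / NumThreads + 1;
    else
      // Generic mode: one team per `distribute` iteration.
      TripCountNumBlocks = LoopTripCount;
  }

  // Prefer reusing blocks over spawning more than the device default.
  uint64_t PreferredNumBlocks = std::min(TripCountNumBlocks, DefaultNumBlocks);
  return std::min(PreferredNumBlocks, BlockLimit);
}

With NumThreads = 256 and a trip count of 10000 in SPMD mode, TripCountNumBlocks is 40,
well under a 440-block default; an unknown trip count now launches DefaultNumBlocks
blocks rather than GV_Default_WG_Size blocks as before.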