Index: openmp/libomptarget/plugins-nextgen/CMakeLists.txt
===================================================================
--- openmp/libomptarget/plugins-nextgen/CMakeLists.txt
+++ openmp/libomptarget/plugins-nextgen/CMakeLists.txt
@@ -77,6 +77,7 @@ endmacro()
 add_subdirectory(aarch64)
+add_subdirectory(amdgpu)
 add_subdirectory(cuda)
 add_subdirectory(ppc64)
 add_subdirectory(ppc64le)
Index: openmp/libomptarget/plugins-nextgen/amdgpu/CMakeLists.txt
===================================================================
--- /dev/null
+++ openmp/libomptarget/plugins-nextgen/amdgpu/CMakeLists.txt
@@ -0,0 +1,105 @@
+##===----------------------------------------------------------------------===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is dual licensed under the MIT and the University of Illinois Open
+# Source Licenses. See LICENSE.txt for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# Build a plugin for an AMDGPU machine if available.
+#
+##===----------------------------------------------------------------------===##
+
+################################################################################
+set(LIBOMPTARGET_BUILD_AMDGPU_PLUGIN TRUE CACHE BOOL
+  "Whether to build AMDGPU plugin")
+if (NOT LIBOMPTARGET_BUILD_AMDGPU_PLUGIN)
+  libomptarget_say("Not building AMDGPU NextGen offloading plugin: LIBOMPTARGET_BUILD_AMDGPU_PLUGIN is false")
+  return()
+endif()
+
+# As of ROCm 3.7, HSA is installed with CMake packages and KMT is found via HSA.
+find_package(hsa-runtime64 QUIET 1.2.0 HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm)
+
+if(NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux"))
+  libomptarget_say("Not building AMDGPU NextGen plugin: only supported on Linux x86_64, ppc64le, or aarch64 hosts")
+  return()
+endif()
+
+################################################################################
+# Define the suffix for the runtime messaging dumps.
+add_definitions(-DTARGET_NAME=AMDGPU)
+
+# Define debug prefix. TODO: This should be automated in Debug.h, but it
+# requires changing the original plugins.
+add_definitions(-DDEBUG_PREFIX="TARGET AMDGPU RTL")
+
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "(ppc64le)|(aarch64)$")
+  add_definitions(-DLITTLEENDIAN_CPU=1)
+endif()
+
+if(CMAKE_BUILD_TYPE MATCHES Debug)
+  add_definitions(-DDEBUG)
+endif()
+
+set(LIBOMPTARGET_DLOPEN_LIBHSA OFF)
+option(LIBOMPTARGET_FORCE_DLOPEN_LIBHSA "Build with dlopened libhsa" ${LIBOMPTARGET_DLOPEN_LIBHSA})
+
+if (${hsa-runtime64_FOUND} AND NOT LIBOMPTARGET_FORCE_DLOPEN_LIBHSA)
+  libomptarget_say("Building AMDGPU NextGen plugin linked against libhsa")
+  set(LIBOMPTARGET_EXTRA_SOURCE)
+  set(LIBOMPTARGET_DEP_LIBRARIES hsa-runtime64::hsa-runtime64)
+else()
+  libomptarget_say("Building AMDGPU NextGen plugin for dlopened libhsa")
+  include_directories(../../plugins/amdgpu/dynamic_hsa)
+  set(LIBOMPTARGET_EXTRA_SOURCE ../../plugins/amdgpu/dynamic_hsa/hsa.cpp)
+  set(LIBOMPTARGET_DEP_LIBRARIES)
+endif()
+
+if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
+  # On FreeBSD, the 'environ' symbol is undefined at link time, but resolved by
+  # the dynamic linker at runtime. Therefore, allow the symbol to be undefined
+  # when creating a shared library.
+  set(LDFLAGS_UNDEFINED "-Wl,--allow-shlib-undefined")
+else()
+  set(LDFLAGS_UNDEFINED "-Wl,-z,defs")
+endif()
+
+add_llvm_library(omptarget.rtl.amdgpu.nextgen SHARED
+  src/rtl.cpp
+  ${LIBOMPTARGET_EXTRA_SOURCE}
+
+  ADDITIONAL_HEADER_DIRS
+  ${LIBOMPTARGET_INCLUDE_DIR}
+
+  LINK_COMPONENTS
+  Support
+  Object
+
+  LINK_LIBS
+  PRIVATE
+  elf_common
+  MemoryManager
+  PluginInterface
+  ${LIBOMPTARGET_DEP_LIBRARIES}
+  ${OPENMP_PTHREAD_LIB}
+  "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
+  ${LDFLAGS_UNDEFINED}
+
+  NO_INSTALL_RPATH
+)
+add_dependencies(omptarget.rtl.amdgpu.nextgen omptarget.devicertl.amdgpu)
+
+target_include_directories(
+  omptarget.rtl.amdgpu.nextgen
+  PRIVATE
+  ${LIBOMPTARGET_INCLUDE_DIR}
+)
+
+# Install plugin under the lib destination folder.
+install(TARGETS omptarget.rtl.amdgpu.nextgen LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
+set_target_properties(omptarget.rtl.amdgpu.nextgen PROPERTIES
+  INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.."
+  CXX_VISIBILITY_PRESET protected)
Index: openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
===================================================================
--- /dev/null
+++ openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -0,0 +1,2486 @@
+//===----RTLs/amdgpu/src/rtl.cpp - Target RTLs Implementation ----- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// RTL NextGen for AMDGPU machine
+//
+//===----------------------------------------------------------------------===//
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "Debug.h"
+#include "DeviceEnvironment.h"
+#include "GlobalHandler.h"
+#include "PluginInterface.h"
+
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/Frontend/OpenMP/OMPConstants.h"
+#include "llvm/Frontend/OpenMP/OMPGridValues.h"
+
+namespace llvm {
+namespace omp {
+namespace target {
+namespace plugin {
+
+/// Forward declarations for all specialized data structures.
+struct AMDGPUKernelTy;
+struct AMDGPUDeviceTy;
+struct AMDGPUPluginTy;
+struct AMDGPUStreamTy;
+struct AMDGPUEventTy;
+struct AMDGPUStreamManagerTy;
+struct AMDGPUEventManagerTy;
+struct AMDGPUDeviceImageTy;
+struct AMDGPUMemoryManagerTy;
+struct AMDGPUMemoryPoolTy;
+
+namespace utils {
+
+// The implicit arguments of AMDGPU kernels.
+typedef struct impl_implicit_args_s {
+  uint64_t offset_x;
+  uint64_t offset_y;
+  uint64_t offset_z;
+  uint64_t hostcall_ptr;
+  uint64_t unused0;
+  uint64_t unused1;
+  uint64_t unused2;
+} impl_implicit_args_t;
+
+static_assert(sizeof(impl_implicit_args_t) == 56, "");
+
+/// Iterate elements using an HSA iterate function. Do not use this function
+/// directly but the specialized ones below instead.
+template <typename ElemTy, typename IterFuncTy, typename CallbackTy>
+hsa_status_t iterate(IterFuncTy Func, CallbackTy Cb) {
+  auto L = [](ElemTy Elem, void *Data) -> hsa_status_t {
+    CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
+    return (*Unwrapped)(Elem);
+  };
+  return Func(L, static_cast<void *>(&Cb));
+}
+
+/// Iterate elements using an HSA iterate function passing a parameter. Do not
+/// use this function directly but the specialized ones below instead.
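The `utils::iterate` helpers above and below exist to adapt HSA's C-style iteration entry points to C++ callables. A minimal, self-contained sketch of the same trampoline pattern; the toy `status_t` and `iterateNumbers` iterator are stand-ins for illustration, not HSA API:

```cpp
#include <cstdio>

using status_t = int;

// A C-style iterator: calls `cb` once per element with an opaque data pointer.
static status_t iterateNumbers(status_t (*cb)(int, void *), void *data) {
  for (int I = 0; I < 3; ++I)
    if (status_t S = cb(I, data))
      return S;
  return 0;
}

// The adaptor: a captureless lambda converts to the C function pointer and
// unwraps the real callable from the `void *`.
template <typename ElemTy, typename IterFuncTy, typename CallbackTy>
status_t iterate(IterFuncTy Func, CallbackTy Cb) {
  auto L = [](ElemTy Elem, void *Data) -> status_t {
    return (*static_cast<CallbackTy *>(Data))(Elem);
  };
  return Func(L, static_cast<void *>(&Cb));
}

int main() {
  return iterate<int>(iterateNumbers, [](int N) -> status_t {
    std::printf("element %d\n", N);
    return 0;
  });
}
```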
+template +hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) { + auto L = [](ElemTy Elem, void *Data) -> hsa_status_t { + CallbackTy *Unwrapped = static_cast(Data); + return (*Unwrapped)(Elem); + }; + return Func(FuncArg, L, static_cast(&Cb)); +} + +/// Iterate elements using an HSA iterate function passing a parameter. Do not +/// use this function directly but the specialized ones below instead. +template +hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) { + auto L = [](Elem1Ty Elem1, Elem2Ty Elem2, void *Data) -> hsa_status_t { + CallbackTy *Unwrapped = static_cast(Data); + return (*Unwrapped)(Elem1, Elem2); + }; + return Func(FuncArg, L, static_cast(&Cb)); +} + +/// Iterate agents. +template Error iterateAgents(CallbackTy Callback) { + hsa_status_t Status = iterate(hsa_iterate_agents, Callback); + return Plugin::check(Status, "Error in hsa_iterate_agents: %s"); +} + +/// Iterate ISAs of an agent. +template +Error iterateAgentISAs(hsa_agent_t Agent, CallbackTy Cb) { + hsa_status_t Status = iterate(hsa_agent_iterate_isas, Agent, Cb); + return Plugin::check(Status, "Error in hsa_agent_iterate_isas: %s"); +} + +/// Iterate memory pools of an agent. +template +Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) { + hsa_status_t Status = iterate( + hsa_amd_agent_iterate_memory_pools, Agent, Cb); + return Plugin::check(Status, + "Error in hsa_amd_agent_iterate_memory_pools: %s"); +} + +/// Iterate executable symbols. +template +Error iterateSymbols(hsa_executable_t Executable, CallbackTy Cb) { + hsa_status_t Status = iterate( + hsa_executable_iterate_symbols, Executable, Cb); + return Plugin::check(Status, "Error in hsa_executable_iterate_symbols: %s"); +} + +/// Parse a TargetID to get processor arch and feature map. +/// Returns processor subarch. +/// Returns TargetID features in \p FeatureMap argument. +/// If the \p TargetID contains feature+, FeatureMap it to true. +/// If the \p TargetID contains feature-, FeatureMap it to false. +/// If the \p TargetID does not contain a feature (default), do not map it. +StringRef parseTargetID(StringRef TargetID, StringMap &FeatureMap) { + if (TargetID.empty()) + return llvm::StringRef(); + + auto ArchFeature = TargetID.split(":"); + auto Arch = ArchFeature.first; + auto Features = ArchFeature.second; + if (Features.empty()) + return Arch; + + if (Features.contains("sramecc+")) { + FeatureMap.insert(std::pair("sramecc", true)); + } else if (Features.contains("sramecc-")) { + FeatureMap.insert(std::pair("sramecc", false)); + } + if (Features.contains("xnack+")) { + FeatureMap.insert(std::pair("xnack", true)); + } else if (Features.contains("xnack-")) { + FeatureMap.insert(std::pair("xnack", false)); + } + + return Arch; +} + +} // namespace utils + +/// Class holding an HSA memory pool. +struct AMDGPUMemoryPoolTy { + /// Create a memory pool from an HSA memory pool. + AMDGPUMemoryPoolTy(hsa_amd_memory_pool_t MemoryPool) + : MemoryPool(MemoryPool), GlobalFlags(0) {} + + /// Initialize the memory pool retrieving its properties. + Error init() { + if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_SEGMENT, Segment)) + return Err; + + if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, GlobalFlags)) + return Err; + + return Plugin::success(); + } + + /// Getter of the HSA memory pool. + hsa_amd_memory_pool_t get() const { return MemoryPool; } + + /// Indicate if it belongs to the global segment. 
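`parseTargetID` above follows the AMDGPU TargetID convention, `<arch>[:<feature>(+|-)]*`. A standalone sketch of how such a string decomposes, using plain `std::string`/`std::map` in place of `llvm::StringRef` and `llvm::StringMap`:

```cpp
#include <cstdio>
#include <map>
#include <string>

int main() {
  std::string TargetID = "gfx90a:sramecc+:xnack-";
  std::map<std::string, bool> FeatureMap;

  auto Colon = TargetID.find(':');
  std::string Arch = TargetID.substr(0, Colon);
  std::string Features =
      (Colon == std::string::npos) ? "" : TargetID.substr(Colon + 1);

  for (const char *F : {"sramecc", "xnack"}) {
    if (Features.find(std::string(F) + "+") != std::string::npos)
      FeatureMap[F] = true; // feature explicitly enabled
    else if (Features.find(std::string(F) + "-") != std::string::npos)
      FeatureMap[F] = false; // feature explicitly disabled
    // Otherwise: target default; leave the feature unmapped.
  }

  std::printf("arch=%s sramecc=%d xnack=%d\n", Arch.c_str(),
              (int)FeatureMap.at("sramecc"), (int)FeatureMap.at("xnack"));
}
```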
+ bool isGlobal() const { return (Segment == HSA_AMD_SEGMENT_GLOBAL); } + + /// Indicate if it is fine-grained memory. Valid only for global. + bool isFineGrained() const { + assert(isGlobal() && "Not global memory"); + return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED); + } + + /// Indicate if it is coarse-grained memory. Valid only for global. + bool isCoarseGrained() const { + assert(isGlobal() && "Not global memory"); + return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED); + } + + /// Indicate if it supports storing kernel arguments. Valid only for global. + bool supportsKernelArgs() const { + assert(isGlobal() && "Not global memory"); + return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT); + } + + /// Allocate memory on the memory pool. + Error allocate(size_t Size, void **PtrStorage) { + hsa_status_t Status = + hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, PtrStorage); + return Plugin::check(Status, "Error in hsa_amd_memory_pool_allocate: %s"); + } + + /// Return memory to the memory pool. + Error deallocate(void *Ptr) { + hsa_status_t Status = hsa_amd_memory_pool_free(Ptr); + return Plugin::check(Status, "Error in hsa_amd_memory_pool_free: %s"); + } + + /// Allow the device to access a specific allocation. + Error enableAccess(void *Ptr, int64_t Size, + const llvm::SmallVector &Agents) const { + // TODO: Ensure it is possible to enable the access. This can be retrieved + // through HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS. If it is not possible, + // enabling the access results in undefined behavior. + + // We can access but it is disabled by default. Enable the access then. + hsa_status_t Status = + hsa_amd_agents_allow_access(Agents.size(), Agents.data(), nullptr, Ptr); + return Plugin::check(Status, "Error in hsa_amd_agents_allow_access: %s"); + } + +private: + /// Get attribute from the memory pool. + template + Error getAttr(hsa_amd_memory_pool_info_t Kind, Ty &Value) const { + hsa_status_t Status; + Status = hsa_amd_memory_pool_get_info(MemoryPool, Kind, &Value); + return Plugin::check(Status, "Error in hsa_amd_memory_pool_get_info: %s"); + } + + /// Get attribute from the memory pool relating to an agent. + template + Error getAttr(hsa_agent_t Agent, hsa_amd_agent_memory_pool_info_t Kind, + Ty &Value) const { + hsa_status_t Status; + Status = + hsa_amd_agent_memory_pool_get_info(Agent, MemoryPool, Kind, &Value); + return Plugin::check(Status, + "Error in hsa_amd_agent_memory_pool_get_info: %s"); + } + + /// The HSA memory pool. + hsa_amd_memory_pool_t MemoryPool; + + /// The segment where the memory pool belongs to. + hsa_amd_segment_t Segment; + + /// The global flags of memory pool. Only valid if the memory pool belongs to + /// the global segment. + uint32_t GlobalFlags; +}; + +/// Class that implements a memory manager that gets memory from a specific +/// memory pool. +struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy { + + AMDGPUMemoryManagerTy() : MemoryPool(nullptr), MemoryManager(nullptr) {} + + /// Initialize the memory manager from a memory pool. + Error init(AMDGPUMemoryPoolTy &MemoryPool) { + const uint32_t Threshold = 1 << 30; + this->MemoryManager = new MemoryManagerTy(*this, Threshold); + this->MemoryPool = &MemoryPool; + return Plugin::success(); + } + + /// Deinitialize the memory manager and free its allocations. + Error deinit() { + assert(MemoryManager && "Invalid memory manager"); + + // Delete and invalidate the memory manager. 
At this point, the memory + // manager will deallocate all its allocations. + delete MemoryManager; + MemoryManager = nullptr; + + return Plugin::success(); + } + + /// Reuse or allocate memory through the memory manager. + Error allocate(size_t Size, void **PtrStorage) { + assert(MemoryManager && "Invalid memory manager"); + assert(PtrStorage && "Invalid pointer storage"); + + *PtrStorage = MemoryManager->allocate(Size, nullptr); + if (*PtrStorage == nullptr) + return Plugin::error("Failure to allocate from AMDGPU memory manager"); + + return Plugin::success(); + } + + /// Release an allocation to be reused. + Error deallocate(void *Ptr) { + assert(Ptr && "Invalid pointer"); + + if (MemoryManager->free(Ptr)) + return Plugin::error("Failure to deallocate from AMDGPU memory manager"); + + return Plugin::success(); + } + +private: + /// Allocation callback that will be called once the memory manager does not + /// have more previously allocated buffers. + void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) override; + + /// Deallocation callack that will be called by the memory manager. + int free(void *TgtPtr, TargetAllocTy Kind) override { + if (auto Err = MemoryPool->deallocate(TgtPtr)) { + consumeError(std::move(Err)); + return OFFLOAD_FAIL; + } + return OFFLOAD_SUCCESS; + } + + /// The memory pool used to allocate memory. + AMDGPUMemoryPoolTy *MemoryPool; + + /// Reference to the actual memory manager. + MemoryManagerTy *MemoryManager; +}; + +/// Class implementing the AMDGPU device images' properties. +struct AMDGPUDeviceImageTy : public DeviceImageTy { + /// Create the AMDGPU image with the id and the target image pointer. + AMDGPUDeviceImageTy(int32_t ImageId, const __tgt_device_image *TgtImage) + : DeviceImageTy(ImageId, TgtImage) /*, Module(nullptr)*/ {} + + /// Prepare and load the executable corresponding to the image. + Error loadExecutable(const AMDGPUDeviceTy &Device); + + /// Unload the executable. + Error unloadExecutable() { + hsa_status_t Status = hsa_executable_destroy(Executable); + if (auto Err = Plugin::check(Status, "Error in hsa_executable_destroy: %s")) + return Err; + + Status = hsa_code_object_destroy(CodeObject); + return Plugin::check(Status, "Error in hsa_code_object_destroy: %s"); + } + + /// Get the executable. + hsa_executable_t getExecutable() const { return Executable; } + + /// Find an HSA device symbol by its name on the executable. + Expected + findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const; + +private: + /// The exectuable loaded on the agent. + hsa_executable_t Executable; + hsa_code_object_t CodeObject; +}; + +/// Class holding an HSA queue to submit kernel and barrier packets. +struct AMDGPUQueueTy { + /// Create an empty queue. + AMDGPUQueueTy() : Queue(nullptr), Agent({0}), Mutex() {} + + /// Initialize a new queue belonging to a specific agent. + Error init(hsa_agent_t Agent, int32_t QueueSize) { + this->Agent = Agent; + + hsa_status_t Status = + hsa_queue_create(Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError, + nullptr, UINT32_MAX, UINT32_MAX, &Queue); + return Plugin::check(Status, "Error in hsa_queue_create: %s"); + } + + /// Deinitialize the queue and destroy its resources. + Error deinit() { + hsa_status_t Status = hsa_queue_destroy(Queue); + return Plugin::check(Status, "Error in hsa_queue_destroy: %s"); + } + + /// Acquire a packet from the queue. This call may block the thread there is + /// space in the underlying HSA queue. Requires the queue to be locked. 
See + /// the AMDGPUQueueTy::lock() function. + hsa_kernel_dispatch_packet_t *acquirePacket(uint64_t &PacketId) { + // Increase the queue index with relaxed memory order. Notice this will need + // another subsequent atomic operation with acquire order. + PacketId = hsa_queue_add_write_index_relaxed(Queue, 1); + + // Wait for the package to be available. Notice the atomic operation uses + // the acquire memory order. + while (PacketId - hsa_queue_load_read_index_scacquire(Queue) >= Queue->size) + ; + + // Return the packet reference. + const uint32_t Mask = Queue->size - 1; // The size is a power of 2. + return (hsa_kernel_dispatch_packet_t *)Queue->base_address + + (PacketId & Mask); + } + + /// Publish the kernel packet so the HSA runtime can start processing the + /// kernel launch. Do not modify the packet once this function is called. + /// Requires the queue to be locked. See the AMDGPUQueueTy::lock() function. + void publishKernelPacket(uint64_t PacketId, + hsa_kernel_dispatch_packet_t *Packet) { + uint32_t *PacketPtr = reinterpret_cast(Packet); + + uint16_t Setup = Packet->setup; + uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; + Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; + Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; + + // Publish the packet. Do not modify the package after this point. + __atomic_store_n(PacketPtr, Header | (Setup << 16), __ATOMIC_RELEASE); + + // Signal the doorbell about the published packet. + hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId); + } + + /// Publish the barrier packet so the HSA runtime can start processing the + /// barrier. Next packets in the queue will not be processed until all barrier + /// dependencies (signals) are satisfied. Requires the queue to be locked. See + /// the AMDGPUQueueTy::lock() function. + void publishBarrierPacket(uint64_t PacketId, hsa_barrier_and_packet_t *Packet, + bool SignalDoorbell = false) { + uint32_t *PacketPtr = reinterpret_cast(Packet); + + uint16_t Setup = 0; + uint16_t Header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE; + Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; + Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; + + // Publish the packet. Do not modify the package after this point. + __atomic_store_n(PacketPtr, Header | (Setup << 16), __ATOMIC_RELEASE); + + // Signal the doorbell about the published packet if necessary. No need + // to signal the doorbell if there will be a subsequent packet being + // published. + if (SignalDoorbell) + hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId); + } + + hsa_queue_t *get() { return Queue; } + + /// Get the size of the HSA queue. + uint32_t getSize() const { return Queue->size; } + + /// Get the agent corresponding to the queue. + hsa_agent_t getAgent() const { return Agent; } + + /// Lock and unlock the queue. Acquiring a packet, filling its fields and + /// publishing it should be performed under the same exclusive region without + /// releasing the queue's lock. We should keep the order of acquiring and + /// publishing of packets. When publishing a packet, the previous ones should + /// be ready to be processed too. + void lock() { Mutex.lock(); } + void unlock() { Mutex.unlock(); } + +private: + /// Callack that will be called when an error is detected on the HSA queue. 
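On the publish steps above: an AQL packet's first 32-bit word packs the 16-bit header with the 16-bit setup field, and the release store on that word is what hands the otherwise fully written packet to the packet processor. A self-contained illustration; the constants mirror `hsa_packet_type_t`, `hsa_fence_scope_t`, and the `HSA_PACKET_HEADER_*` bit positions:

```cpp
#include <atomic>
#include <cstdint>
#include <cstdio>

int main() {
  const uint16_t KernelDispatch = 2; // HSA_PACKET_TYPE_KERNEL_DISPATCH
  const uint16_t SystemScope = 2;    // HSA_FENCE_SCOPE_SYSTEM

  uint16_t Header = KernelDispatch << 0; // HSA_PACKET_HEADER_TYPE
  Header |= SystemScope << 9;  // HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE
  Header |= SystemScope << 11; // HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE

  uint16_t Setup = 1; // one grid dimension

  // ... every other packet field must be written before this point ...
  std::atomic<uint32_t> FirstWord{0};
  FirstWord.store(Header | (uint32_t(Setup) << 16), std::memory_order_release);

  std::printf("published first word: 0x%08x\n", FirstWord.load());
}
```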
+ static void callbackError(hsa_status_t Status, hsa_queue_t *Source, void *) { + auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source); + FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data()); + } + + /// The HSA queue. + hsa_queue_t *Queue; + + /// The agent corresponding to the queue. + hsa_agent_t Agent; + + /// Mutex to protect the acquiring and publishing of packets. + std::mutex Mutex; +}; + +/// Class representing an HSA signal. Signals are used to define dependencies +/// between asynchronous operations: kernel launches and memory transfers. +struct AMDGPUSignalTy { + /// Create an empty signal. + AMDGPUSignalTy() : Signal({0}) {} + + /// Initialize the signal with an initial value. + Error init(uint32_t InitialValue = 0) { + hsa_status_t Status = + hsa_amd_signal_create(InitialValue, 0, nullptr, 0, &Signal); + return Plugin::check(Status, "Error in hsa_signal_create: %s"); + } + + /// Deinitialize the signal. + Error deinit() { + hsa_status_t Status = hsa_signal_destroy(Signal); + return Plugin::check(Status, "Error in hsa_signal_destroy: %s"); + } + + /// Wait until the signal gets a zero value. + Error wait() const { + while (hsa_signal_wait_scacquire(Signal, HSA_SIGNAL_CONDITION_EQ, 0, + UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0) + ; + return Plugin::success(); + } + + /// Load the value on the signal. + hsa_signal_value_t load() const { return hsa_signal_load_scacquire(Signal); } + + /// Signal decrementing by one. + void signal() { + assert(load() > 0 && "Invalid signal value"); + hsa_signal_subtract_screlease(Signal, 1); + } + + /// Reseting the value to one. + void reset() { hsa_signal_store_screlease(Signal, 1); } + + hsa_signal_t get() const { return Signal; } + +private: + /// The underlying HSA signal. + hsa_signal_t Signal; +}; + +/// Class implementing the AMDGPU kernel functionalities which derives from the +/// generic kernel class. +struct AMDGPUKernelTy : public GenericKernelTy { + /// Create an AMDGPU kernel with a name and an execution mode. + AMDGPUKernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode) + : GenericKernelTy(Name, ExecutionMode), + ImplicitArgsSize(sizeof(utils::impl_implicit_args_t)) {} + + /// Initialize the AMDGPU kernel. + Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override { + AMDGPUDeviceImageTy &AMDImage = static_cast(Image); + + // Kernel symbols have a ".kd" suffix. + std::string KernelName(getName()); + KernelName += ".kd"; + + // Find the symbol on the device executable. + auto SymbolOrErr = AMDImage.findDeviceSymbol(Device, KernelName); + if (!SymbolOrErr) + return SymbolOrErr.takeError(); + + hsa_executable_symbol_t Symbol = *SymbolOrErr; + hsa_symbol_kind_t SymbolType; + hsa_status_t Status; + + // Retrieve different properties of the kernel symbol. + std::pair RequiredInfos[] = { + {HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &SymbolType}, + {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &KernelObject}, + {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &ArgsSize}, + {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &GroupSize}, + {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &PrivateSize}}; + + for (auto &Info : RequiredInfos) { + Status = hsa_executable_symbol_get_info(Symbol, Info.first, Info.second); + if (auto Err = Plugin::check( + Status, "Error in hsa_executable_symbol_get_info: %s")) + return Err; + } + + // Make sure it is a kernel symbol. 
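A behavioral sketch of the `AMDGPUSignalTy` protocol above, modeled with a plain atomic counter instead of an HSA signal: the producer `reset()`s to one before submitting work, completion decrements to zero, and waiters spin until zero (mirroring `HSA_WAIT_STATE_ACTIVE`):

```cpp
#include <atomic>
#include <cassert>
#include <cstdint>
#include <thread>

struct Signal {
  std::atomic<int64_t> Value{0};

  void reset() { Value.store(1, std::memory_order_release); }
  void signal() { // completion: decrement towards zero
    assert(Value.load() > 0 && "Invalid signal value");
    Value.fetch_sub(1, std::memory_order_release);
  }
  void wait() const { // busy wait, as with HSA_WAIT_STATE_ACTIVE
    while (Value.load(std::memory_order_acquire) != 0)
      ;
  }
};

int main() {
  Signal S;
  S.reset();                               // one pending operation
  std::thread Worker([&] { S.signal(); }); // the operation completes
  S.wait();                                // synchronize with it
  Worker.join();
}
```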
+ if (SymbolType != HSA_SYMBOL_KIND_KERNEL) + return Plugin::error("Symbol %s is not a kernel function"); + + // TODO: Read the kernel descriptor for the max threads per block. May be + // read from the image. + + return Plugin::success(); + } + + /// Launch the AMDGPU kernel function + Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads, + uint64_t NumBlocks, uint32_t DynamicMemorySize, + int32_t NumKernelArgs, void *KernelArgs, + AsyncInfoWrapperTy &AsyncInfoWrapper) const override; + + /// The default number of blocks is common to the whole device. + uint64_t getDefaultNumBlocks(GenericDeviceTy &GenericDevice) const override { + return GenericDevice.getDefaultNumBlocks(); + } + + /// The default number of threads is common to the whole device. + uint32_t getDefaultNumThreads(GenericDeviceTy &GenericDevice) const override { + return GenericDevice.getDefaultNumThreads(); + } + + /// Get group and private segment kernel size. + uint32_t getGroupSize() const { return GroupSize; } + uint32_t getPrivateSize() const { return PrivateSize; } + + /// Get the HSA kernel object representing the kernel function. + uint64_t getKernelObject() const { return KernelObject; } + +private: + /// The kernel object to execute. + uint64_t KernelObject; + + /// The args, group and private segments sizes required by a kernel instance. + uint32_t ArgsSize; + uint32_t GroupSize; + uint32_t PrivateSize; + + /// The size of implicit kernel arguments. + const uint32_t ImplicitArgsSize; +}; + +/// Class that implements a stream of asynchronous operations for AMDGPU +/// devices. This class relies on signals to implement streams and define +/// the dependencies between asynchronous operations. +class AMDGPUStreamTy { + /// Utility struct holding arguments for async H2H memory copies. + struct MemcpyArgsTy { + void *Dst; + const void *Src; + size_t Size; + }; + + /// Utility struct holding arguments for freeing buffers to memory managers. + struct RelBufArgsTy { + void *Buffer; + AMDGPUMemoryManagerTy *MemoryManager; + }; + + /// The stream is composed of N stream's slots or operations. The following + /// struct represents the fields of each slot/operation. Each slot has a + /// signal and an optional action function. When adding an asynchronous + /// operation to the stream, the signal is used as the output signal of the + /// HSA async operation. The input signal of the async operation is set to the + /// preceding operations' output signal. This way, we obtain a chain of + /// dependant async operations. The action is a function that will be executed + /// after the operation is completed, e.g., for releasing a buffer. + struct StreamOperationTy { + /// The output signal of the stream operation. May be used by the subsequent + /// operation as input signal. + AMDGPUSignalTy Signal; + + /// The action that must be performed after the operation's completion. Set + /// to nullptr when there is no action to perform. + Error (*ActionFunction)(void *); + + /// Space for the action's arguments. A pointer to these arguments is passed + /// to the action function. Notice the space of arguments is limited. + uint64_t ActionArgs[3]; + + /// Create an empty operation. + StreamOperationTy() : Signal(), ActionFunction(nullptr) {} + + // Perform the action if needed. + Error performAction() { + if (!ActionFunction) + return Plugin::success(); + + // Perform the action. + if (auto Err = (*ActionFunction)(ActionArgs)) + return Err; + + // Invalidate the action. 
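The `ActionFunction`/`ActionArgs` pair above is a small type-erased, allocation-free closure: trivially copyable arguments are memcpy'd into fixed inline storage and recovered through a `void *`. A standalone rendering of the same mechanism, with `int` standing in for `llvm::Error`:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

struct Operation {
  int (*ActionFunction)(void *) = nullptr;
  uint64_t ActionArgs[3]; // opaque inline storage for the action's arguments

  template <typename ArgsTy> void setup(int (*Fn)(void *), const ArgsTy &A) {
    static_assert(sizeof(ArgsTy) <= sizeof(ActionArgs), "args too large");
    ActionFunction = Fn;
    std::memcpy(ActionArgs, &A, sizeof(ArgsTy));
  }
  int perform() {
    if (!ActionFunction)
      return 0;
    int R = ActionFunction(ActionArgs);
    ActionFunction = nullptr; // one-shot: invalidate after running
    return R;
  }
};

struct MemcpyArgs { void *Dst; const void *Src; size_t Size; };

static int memcpyAction(void *Data) {
  auto *A = static_cast<MemcpyArgs *>(Data);
  std::memcpy(A->Dst, A->Src, A->Size);
  return 0;
}

int main() {
  char Src[] = "payload", Dst[8] = {};
  Operation Op;
  Op.setup(memcpyAction, MemcpyArgs{Dst, Src, sizeof(Src)});
  Op.perform();
  std::printf("%s\n", Dst);
}
```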
+ ActionFunction = nullptr; + + return Plugin::success(); + } + }; + + /// The queue that the stream uses to launch kernels. + AMDGPUQueueTy &Queue; + + /// Capacity of the stream. + const uint32_t Capacity; + + /// Array of stream slots/operations. The array works as a circular buffer. + llvm::SmallVector Ops; + + /// Properties of the circular buffer. + uint32_t Size; + uint32_t Head; + uint32_t Tail; + uint32_t SyncId; + + /// Mutex to protect stream's management. + mutable std::mutex Mutex; + + /// Get the last valid operation/slot on the stream. + uint32_t getLast() const { + assert(Size > 0 && "Stream is empty"); + return (Head > 0) ? Head - 1 : Capacity - 1; + } + + /// Consume one operation/slot from the stream. + void consume(uint32_t Num) { + assert(Size + Num < Capacity && "Stream is full"); + Head = (Head + Num) % Capacity; + Size += Num; + } + + /// Setup an action on a specific slot. Copy the action arguments to the + /// reserved space on the slot. + template + void setupAction(uint32_t Position, Error (*Action)(void *), + const ActionArgsTy &ActionArgs) { + static_assert(sizeof(ActionArgsTy) <= + sizeof(StreamOperationTy::ActionArgs)); + + Ops[Position].ActionFunction = Action; + std::memcpy(&Ops[Position].ActionArgs, &ActionArgs, sizeof(ActionArgsTy)); + } + + /// Make the current stream wait on a specific operation of another stream. + Error waitStreamOperation(const AMDGPUStreamTy &OtherStream, uint32_t Op) { + /// The signal that we must wait from the other stream. + const AMDGPUSignalTy &OtherSignal = OtherStream.Ops[Op].Signal; + + // Compute input and output dependencies for the current stream. + AMDGPUSignalTy *OutputSignal = &Ops[Head].Signal; + AMDGPUSignalTy *InputSignal = (Size > 0) ? &Ops[getLast()].Signal : nullptr; + + // Prepare the output signal. + OutputSignal->reset(); + + // Consume one slot on the current stream. + consume(1); + + // Lock the queue during the packet publishing process. + std::lock_guard Lock(Queue); + + /// Add a queue barrier waiting on both the other stream's operation and the + /// last operation on the current stream (if any). + uint64_t PacketId; + hsa_barrier_and_packet_t *Packet = + (hsa_barrier_and_packet_t *)Queue.acquirePacket(PacketId); + assert(Packet && "Invalid packet"); + + Packet->reserved0 = 0; + Packet->reserved1 = 0; + Packet->dep_signal[0] = OtherSignal.get(); + Packet->dep_signal[1] = {0}; + if (InputSignal) + Packet->dep_signal[1] = InputSignal->get(); + Packet->dep_signal[2] = {0}; + Packet->dep_signal[3] = {0}; + Packet->dep_signal[4] = {0}; + Packet->reserved2 = 0; + Packet->completion_signal = OutputSignal->get(); + + // Publish the packet. Do not modify the packet after this point. + Queue.publishBarrierPacket(PacketId, Packet, /* signal doorbell */ true); + + return Plugin::success(); + } + + /// Callback for running a specific asynchronous operation. This callback is + /// used for hsa_amd_signal_async_handler. The argument is the operation that + /// should be executed. Notice we use the post action mechanism to codify the + /// asynchronous operation. + static bool asyncActionCallback(hsa_signal_value_t Value, void *Args) { + StreamOperationTy *Op = reinterpret_cast(Args); + assert(Op && "Invalid operation"); + + // Peform the operation. + if (auto Err = Op->performAction()) + FATAL_MESSAGE(1, "Error peforming post action: %s", + toString(std::move(Err)).data()); + + // Signal the output signal to notify the asycnhronous operation finalized. + Op->Signal.signal(); + + // Unregister callback. 
+ return false; + } + + // Callback for host-to-host memory copies. + static Error memcpyAction(void *Data) { + MemcpyArgsTy *Args = reinterpret_cast(Data); + assert(Args && "Invalid arguments"); + assert(Args->Dst && "Invalid destination buffer"); + assert(Args->Src && "Invalid source buffer"); + + std::memcpy(Args->Dst, Args->Src, Args->Size); + + return Plugin::success(); + } + + // Callback for releasing a memory buffer to a memory manager. + static Error releaseBufferAction(void *Data) { + RelBufArgsTy *Args = reinterpret_cast(Data); + assert(Args && "Invalid arguments"); + assert(Args->MemoryManager && "Invalid memory manager"); + assert(Args->Buffer && "Invalid buffer"); + + // Release the allocation to the memory manager. + return Args->MemoryManager->deallocate(Args->Buffer); + } + +public: + /// Create an empty stream with a specific capacity. + AMDGPUStreamTy(AMDGPUQueueTy &Queue, uint32_t Capacity) + : Queue(Queue), Capacity(Capacity), Ops(Capacity), Size(0), Head(0), + Tail(0), SyncId(0) {} + + /// Intialize the stream's signals. + Error init() { + for (StreamOperationTy &Op : Ops) { + if (auto Err = Op.Signal.init()) + return Err; + } + return Plugin::success(); + } + + /// Deinitialize the stream's signals. + Error deinit() { + for (StreamOperationTy &Op : Ops) { + if (auto Err = Op.Signal.deinit()) + return Err; + } + return Plugin::success(); + } + + /// Push a asynchronous kernel to the stream. The kernel arguments must be + /// placed in a special allocation for kernel args and must keep alive until + /// the kernel finalizes. Once the kernel is finished, the stream will release + /// the kernel args buffer to the specified memory manager. + Error pushKernel(const AMDGPUKernelTy &Kernel, void *KernelArgs, + uint32_t NumThreads, uint64_t NumBlocks, + AMDGPUMemoryManagerTy &MemoryManager) { + std::lock_guard StreamLock(Mutex); + + if (Size == Capacity) + return Plugin::error("Stream is full"); + + // Compute input and output dependencies. + AMDGPUSignalTy *OutputSignal = &Ops[Head].Signal; + AMDGPUSignalTy *InputSignal = (Size > 0) ? &Ops[getLast()].Signal : nullptr; + + // Avoid defining the input dependency if already satisfied. + if (InputSignal && !InputSignal->load()) + InputSignal = nullptr; + + // Setup the post action to release the kernel args buffer. + setupAction(Head, releaseBufferAction, + RelBufArgsTy{KernelArgs, &MemoryManager}); + + // Prepare the output signal. + OutputSignal->reset(); + + // Consume one stream position. + consume(1); + + // Lock the queue during the packet publishing process. + std::lock_guard Lock(Queue); + + // Add a barrier packet before the kernel packet in case there is a pending + // preceding operation on the stream. The barrier packet will stop the + // processing of subsequent packets on the queue until the barrier input + // signals are satisfied. + if (InputSignal) { + uint64_t PacketId; + hsa_barrier_and_packet_t *Packet = + (hsa_barrier_and_packet_t *)Queue.acquirePacket(PacketId); + assert(Packet && "Invalid packet"); + + Packet->reserved0 = 0; + Packet->reserved1 = 0; + Packet->dep_signal[0] = InputSignal->get(); + Packet->dep_signal[1] = {0}; + Packet->dep_signal[2] = {0}; + Packet->dep_signal[3] = {0}; + Packet->dep_signal[4] = {0}; + Packet->reserved2 = 0; + // No need to setup an output signal. The dependency is already guaranteed + // by the queue barrier itself. + Packet->completion_signal = {0}; + + // Publish the packet. Do not modify the packet after this point. 
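For reference, a standalone sketch of the stream's circular-slot arithmetic (`getLast` and `consume` above, plus the Tail-to-Head drain performed by `synchronize()` further below); the capacity and the operations pushed are illustrative:

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>

struct Slots {
  uint32_t Capacity;
  uint32_t Size = 0, Head = 0, Tail = 0;

  uint32_t getLast() const { // most recently consumed slot
    assert(Size > 0 && "Stream is empty");
    return (Head > 0) ? Head - 1 : Capacity - 1;
  }
  void consume(uint32_t Num) { // claim Num slots for new operations
    Head = (Head + Num) % Capacity;
    Size += Num;
  }
  void drain() { // what synchronize() does after waiting on the last signal
    while (Tail != Head)
      Tail = (Tail + 1) % Capacity; // post actions run here, in order
    Size = 0;
  }
};

int main() {
  Slots S{8};
  S.consume(1); // a kernel launch
  S.consume(2); // a two-step unpinned copy
  std::printf("last slot in use: %u\n", S.getLast()); // prints 2
  S.drain();
}
```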
+ Queue.publishBarrierPacket(PacketId, Packet); + } + + // Now prepare the kernel packet. + uint64_t PacketId; + hsa_kernel_dispatch_packet_t *Packet = Queue.acquirePacket(PacketId); + assert(Packet && "Invalid packet"); + + // The header of the packet is written in the last moment. + Packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; + Packet->workgroup_size_x = NumThreads; + Packet->workgroup_size_y = 1; + Packet->workgroup_size_z = 1; + Packet->reserved0 = 0; + Packet->grid_size_x = NumBlocks * NumThreads; + Packet->grid_size_y = 1; + Packet->grid_size_z = 1; + Packet->private_segment_size = Kernel.getPrivateSize(); + Packet->group_segment_size = Kernel.getGroupSize(); + Packet->kernel_object = Kernel.getKernelObject(); + Packet->kernarg_address = KernelArgs; + Packet->reserved2 = 0; + Packet->completion_signal = OutputSignal->get(); + + // Publish the packet. Do not modify the packet after this point. + Queue.publishKernelPacket(PacketId, Packet); + + return Plugin::success(); + } + + /// Push an asynchronous memory copy between pinned memory buffers. + Error pushPinnedMemoryCopyAsync(void *Dst, const void *Src, + uint64_t CopySize) { + std::lock_guard Lock(Mutex); + + if (Size == Capacity) + return Plugin::error("Stream is full"); + + AMDGPUSignalTy *OutputSignal = &Ops[Head].Signal; + AMDGPUSignalTy *InputSignal = (Size > 0) ? &Ops[getLast()].Signal : nullptr; + + // Prepare the output signal. + OutputSignal->reset(); + + // Avoid defining the input dependency if already satisfied. + hsa_status_t Status; + if (InputSignal && InputSignal->load()) { + hsa_signal_t InputSignalRaw = InputSignal->get(); + Status = hsa_amd_memory_async_copy(Dst, Queue.getAgent(), Src, + Queue.getAgent(), CopySize, 1, + &InputSignalRaw, OutputSignal->get()); + } else + Status = hsa_amd_memory_async_copy(Dst, Queue.getAgent(), Src, + Queue.getAgent(), CopySize, 0, nullptr, + OutputSignal->get()); + if (auto Err = + Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s")) + return Err; + + // Consume one stream position. + consume(1); + + return Plugin::success(); + } + + /// Push an asynchronous memory copy device-to-host involving a unpinned + /// memory buffer. The operation consists of a two-step copy from the + /// device buffer to an intermediate pinned host buffer, and then, to a + /// unpinned host buffer. Both operations are asynchronous and dependant. + /// The intermediate pinned buffer will be released to the specified memory + /// manager once the operation completes. + Error pushMemoryCopyD2HAsync(void *Dst, const void *Src, void *Inter, + uint64_t CopySize, + AMDGPUMemoryManagerTy &MemoryManager) { + std::lock_guard Lock(Mutex); + + if (Size > Capacity - 2) + return Plugin::error("Stream is full"); + + AMDGPUSignalTy *OutputSignal = &Ops[Head].Signal; + AMDGPUSignalTy *InputSignal = (Size > 0) ? &Ops[getLast()].Signal : nullptr; + + // Setup the post action for releasing the intermediate buffer. + setupAction(Head, releaseBufferAction, RelBufArgsTy{Inter, &MemoryManager}); + + // Prepare the output signal. + OutputSignal->reset(); + + // Issue the first step: device to host transfer. Avoid defining the input + // dependency if already satisfied. 
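One easy-to-miss detail in the dispatch packet just filled in: HSA's `grid_size_x` counts total work-items rather than workgroups, which is why the code multiplies `NumBlocks` by `NumThreads` (CUDA's grid dimension, by contrast, is given in blocks). Illustrative arithmetic:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t NumThreads = 256; // workgroup_size_x
  uint64_t NumBlocks = 40;   // number of OpenMP teams

  uint64_t GridSizeX = NumBlocks * NumThreads; // total work-items: 10240
  std::printf("workgroup=%u, grid=%llu work-items (%llu workgroups)\n",
              NumThreads, (unsigned long long)GridSizeX,
              (unsigned long long)(GridSizeX / NumThreads));
}
```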
+ hsa_status_t Status; + if (InputSignal && InputSignal->load()) { + hsa_signal_t InputSignalRaw = InputSignal->get(); + Status = hsa_amd_memory_async_copy(Inter, Queue.getAgent(), Src, + Queue.getAgent(), CopySize, 1, + &InputSignalRaw, OutputSignal->get()); + } else { + Status = hsa_amd_memory_async_copy(Inter, Queue.getAgent(), Src, + Queue.getAgent(), CopySize, 0, nullptr, + OutputSignal->get()); + } + + if (auto Err = + Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s")) + return Err; + + // Consume one stream position. + consume(1); + + InputSignal = OutputSignal; + OutputSignal = &Ops[Head].Signal; + + // The std::memcpy is done asynchronously using an async handler. We store + // the function's information in the action but it is not actually a post + // action. + setupAction(Head, memcpyAction, MemcpyArgsTy{Dst, Inter, CopySize}); + + // Prepare the output signal. + OutputSignal->reset(); + + // Issue the second step: host to host transfer. + Status = hsa_amd_signal_async_handler( + InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback, + (void *)&Ops[Head]); + if (auto Err = + Plugin::check(Status, "Error in hsa_amd_signal_async_handler: %s")) + return Err; + + // Consume another stream position. + consume(1); + + return Plugin::success(); + } + + /// Push an asynchronous memory copy host-to-device involving a unpinned + /// memory buffer. The operation consists of a two-step copy from the + /// unpinned host buffer to an intermediate pinned host buffer, and then, to + /// the pinned host buffer. Both operations are asynchronous and dependant. + /// The intermediate pinned buffer will be released to the specified memory + /// manager once the operation completes. + Error pushMemoryCopyH2DAsync(void *Dst, const void *Src, void *Inter, + uint64_t CopySize, + AMDGPUMemoryManagerTy &MemoryManager) { + std::lock_guard Lock(Mutex); + + if (Size > Capacity - 2) + return Plugin::error("Stream is full"); + + AMDGPUSignalTy *OutputSignal = &Ops[Head].Signal; + AMDGPUSignalTy *InputSignal = (Size > 0) ? &Ops[getLast()].Signal : nullptr; + + // Issue the first step: host to host transfer. + if (InputSignal) { + // The std::memcpy is done asynchronously using an async handler. We store + // the function's information in the action but it is not actually a + // post action. + setupAction(Head, memcpyAction, MemcpyArgsTy{Inter, Src, CopySize}); + + // Prepare the output signal. + OutputSignal->reset(); + + hsa_status_t Status = hsa_amd_signal_async_handler( + InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback, + (void *)&Ops[Head]); + + if (auto Err = Plugin::check(Status, + "Error in hsa_amd_signal_async_handler: %s")) + return Err; + + // Consume one stream position. + consume(1); + + InputSignal = OutputSignal; + OutputSignal = &Ops[Head].Signal; + } else { + // All preceding operations completed, copy the memory synchronously. + std::memcpy(Inter, Src, CopySize); + } + + // Setup the post action to release the intermediate pinned buffer. + setupAction(Head, releaseBufferAction, RelBufArgsTy{Inter, &MemoryManager}); + + // Prepare the output signal. + OutputSignal->reset(); + + // Issue the second step: host to device transfer. Avoid defining the input + // dependency if already satisfied. 
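To summarize the two unpinned-transfer pipelines (`pushMemoryCopyD2HAsync` above and `pushMemoryCopyH2DAsync` here), a sketch of the dependency chains; "pinned" denotes the staging buffer taken from the host memory manager:

```cpp
// D2H: [hsa_amd_memory_async_copy: device -> pinned]
//        --signal--> [async handler runs std::memcpy: pinned -> unpinned Dst]
//
// H2D: [std::memcpy: unpinned Src -> pinned]  (via async handler if the
//        stream has pending work, synchronously otherwise)
//        --signal--> [hsa_amd_memory_async_copy: pinned -> device]
//
// Each step consumes one stream slot, which is why both functions check
// `Size > Capacity - 2` and call consume(1) twice.
```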
+ hsa_status_t Status; + if (InputSignal && InputSignal->load()) { + hsa_signal_t InputSignalRaw = InputSignal->get(); + Status = hsa_amd_memory_async_copy(Dst, Queue.getAgent(), Inter, + Queue.getAgent(), CopySize, 1, + &InputSignalRaw, OutputSignal->get()); + } else { + Status = hsa_amd_memory_async_copy(Dst, Queue.getAgent(), Inter, + Queue.getAgent(), CopySize, 0, nullptr, + OutputSignal->get()); + } + + if (auto Err = + Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s")) + return Err; + + // Consume another stream positions. + consume(1); + + return Plugin::success(); + } + + /// Synchronize with the stream. Wait until all operations are finalized and + /// perform the pending post actions (i.e., releasing intermediate buffers). + Error synchronize() { + std::lock_guard Lock(Mutex); + + SyncId += 1; + + // No need to synchronize anything. + if (Size == 0) + return Plugin::success(); + + // Wait until all previous operations on the stream have completed. + if (auto Err = Ops[getLast()].Signal.wait()) + return Err; + + while (Tail != Head) { + // Take the post action of the operation if any. + if (auto Err = Ops[Tail].performAction()) + return Err; + + Tail = (Tail + 1) % Capacity; + } + + // Reset the size. + Size = 0; + + return Plugin::success(); + } + + /// Record the state of the stream on an event. + Error recordEvent(AMDGPUEventTy &Event) const; + + /// Make the stream wait on an event. + Error waitEvent(const AMDGPUEventTy &Event); +}; + +/// Class representing an event on AMDGPU. The event basically stores some +/// information regarding the state of the recorded stream. +struct AMDGPUEventTy { + /// Create an empty event. + AMDGPUEventTy() + : RecordedStream(nullptr), RecordedOperation(-1), RecordedSyncId(-1) {} + + /// Initialize and deinitialize. + Error init() { return Plugin::success(); } + Error deinit() { return Plugin::success(); } + + /// Record the state of a stream on the event. + Error record(const AMDGPUStreamTy &Stream) { + std::lock_guard Lock(Mutex); + + // Ignore the last recorded stream. + RecordedStream = &Stream; + + return Stream.recordEvent(*this); + } + + /// Make a stream wait on the current event. + Error wait(AMDGPUStreamTy &Stream) { + std::lock_guard Lock(Mutex); + + if (!RecordedStream) + return Plugin::error("Event does not have any recorded stream"); + + // Synchronizing the same stream. Do nothing. + if (RecordedStream == &Stream) + return Plugin::success(); + + // No need to wait anything, the recorded stream already finished the + // corresponding operation. + if (RecordedOperation < 0) + return Plugin::success(); + + if (auto Err = Stream.waitEvent(*this)) + return Err; + + return Plugin::success(); + } + +protected: + /// The stream registered in this event. + const AMDGPUStreamTy *RecordedStream; + + /// The recordered operation on the recorded stream. + int32_t RecordedOperation; + + /// The sync number when the stream was recorded. + int32_t RecordedSyncId; + + /// Mutex to safely access event fields. + mutable std::mutex Mutex; + + friend class AMDGPUStreamTy; +}; + +Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event) const { + std::lock_guard Lock(Mutex); + + if (Size > 0) { + // Record the synchronize identifier (to detect stale recordings) and + // the last valid stream's operation. + Event.RecordedSyncId = SyncId; + Event.RecordedOperation = getLast(); + } else { + // The stream is empty, everything already completed, record nothing. 
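The record/wait protocol around `RecordedSyncId` and `RecordedOperation` condenses to the checks below; a standalone restatement (`mustWait` is an illustrative helper, not plugin code):

```cpp
#include <cstdint>
#include <cstdio>

struct Event {
  int32_t RecordedOperation = -1; // slot index, -1 when nothing was pending
  int32_t RecordedSyncId = -1;    // stream's SyncId at record time
};

// Returns true only when the waiting stream must insert a real barrier.
static bool mustWait(const Event &E, uint32_t CurrentSyncId,
                     bool SignalStillPending) {
  if (E.RecordedOperation < 0)
    return false; // the stream was empty when recorded
  if (CurrentSyncId != (uint32_t)E.RecordedSyncId)
    return false; // the stream synchronized since; the recording is stale
  return SignalStillPending; // wait only if the operation is still in flight
}

int main() {
  Event E{/*RecordedOperation=*/2, /*RecordedSyncId=*/1};
  std::printf("%d\n", mustWait(E, 1, true)); // 1: must insert a barrier
  std::printf("%d\n", mustWait(E, 2, true)); // 0: stale recording
}
```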
+ Event.RecordedSyncId = -1; + Event.RecordedOperation = -1; + } + return Plugin::success(); +} + +Error AMDGPUStreamTy::waitEvent(const AMDGPUEventTy &Event) { + // Retrieve the recorded stream on the event. + const AMDGPUStreamTy &RecordedStream = *Event.RecordedStream; + + std::scoped_lock MultiLock(Mutex, RecordedStream.Mutex); + + // The recorded stream already completed the operation because the synchronize + // identifier is already outdated. + if (RecordedStream.SyncId != (uint32_t)Event.RecordedSyncId) + return Plugin::success(); + + // Again, the recorded stream already completed the operation, the last + // operation's output signal is satisfied. + if (!RecordedStream.Ops[Event.RecordedOperation].Signal.load()) + return Plugin::success(); + + // Otherwise, make the current stream wait on the other stream's operation. + return waitStreamOperation(RecordedStream, Event.RecordedOperation); +} + +/// Class wrapping an AMDGPU stream reference. These are the objects handled by +/// the Stream Manager for the AMDGPU plugin. +class AMDGPUStreamRef final : public GenericDeviceResourceRef { + /// The reference to the AMDGPU stream. + AMDGPUStreamTy *Stream; + +public: + /// Create an empty reference to an invalid stream. + AMDGPUStreamRef() : Stream(nullptr) {} + + /// Create a reference to an existing stream. + AMDGPUStreamRef(AMDGPUStreamTy *Stream) : Stream(Stream) {} + + /// Create a new stream and save the reference. The reference must be empty + /// before calling to this function. + Error create(GenericDeviceTy &Device) override; + + /// Destroy the referenced stream and invalidate the reference. The reference + /// must be to a valid stream before calling to this function. + Error destroy(GenericDeviceTy &Device) override { + if (!Stream) + return Plugin::error("Destroying an invalid stream"); + + if (auto Err = Stream->deinit()) + return Err; + + delete Stream; + + Stream = nullptr; + return Plugin::success(); + } + + /// Get the underlying AMDGPUStreamTy. + operator AMDGPUStreamTy *() const { return Stream; } +}; + +/// Class wrapping a CUDA event reference. These are the objects handled by the +/// Event Manager for the CUDA plugin. +class AMDGPUEventRef final : public GenericDeviceResourceRef { + AMDGPUEventTy *Event; + +public: + /// Create an empty reference to an invalid event. + AMDGPUEventRef() : Event(nullptr) {} + + /// Create a reference to an existing event. + AMDGPUEventRef(AMDGPUEventTy *Event) : Event(Event) {} + + /// Create a new event and save the reference. The reference must be empty + /// before calling to this function. + Error create(GenericDeviceTy &Device) override { + if (Event) + return Plugin::error("Creating an existing event"); + + Event = new AMDGPUEventTy(); + return Event->init(); + } + + /// Destroy the referenced event and invalidate the reference. The reference + /// must be to a valid event before calling to this function. + Error destroy(GenericDeviceTy &Device) override { + if (!Event) + return Plugin::error("Destroying an invalid event"); + + if (auto Err = Event->deinit()) + return Err; + + delete Event; + + Event = nullptr; + return Plugin::success(); + } + + /// Get the underlying AMDGPUEventTy *. + operator AMDGPUEventTy *() const { return Event; } +}; + +/// Abstract class that holds the common members of the actual kernel devices +/// and the host device. Both types should inherit from this class. 
+struct AMDGenericDeviceTy { + AMDGenericDeviceTy() {} + + virtual ~AMDGenericDeviceTy() {} + + /// Create all memory pools which the device has access to and classify them. + Error initMemoryPools() { + // Retrieve all memory pools from the device agent(s). + Error Err = retrieveAllMemoryPools(); + if (Err) + return Err; + + for (AMDGPUMemoryPoolTy *MemoryPool : AllMemoryPools) { + // Initialize the memory pool and retrieve some basic info. + Error Err = MemoryPool->init(); + if (Err) + return Err; + + if (!MemoryPool->isGlobal()) + continue; + + // Classify the memory pools depending on their properties. + if (MemoryPool->isFineGrained()) { + FineGrainedMemoryPools.push_back(MemoryPool); + if (MemoryPool->supportsKernelArgs()) + ArgsMemoryPools.push_back(MemoryPool); + } else if (MemoryPool->isCoarseGrained()) { + CoarseGrainedMemoryPools.push_back(MemoryPool); + } + } + return Plugin::success(); + } + + /// Destroy all memory pools. + Error deinitMemoryPools() { + for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) + delete Pool; + + AllMemoryPools.clear(); + FineGrainedMemoryPools.clear(); + CoarseGrainedMemoryPools.clear(); + ArgsMemoryPools.clear(); + + return Plugin::success(); + } + + /// Retrieve and construct all memory pools from the device agent(s). + virtual Error retrieveAllMemoryPools() = 0; + + /// Get the device agent. + virtual hsa_agent_t getAgent() const = 0; + +protected: + /// Array of all memory pools available to the host agents. + llvm::SmallVector AllMemoryPools; + + /// Array of fine-grained memory pools available to the host agents. + llvm::SmallVector FineGrainedMemoryPools; + + /// Array of coarse-grained memory pools available to the host agents. + llvm::SmallVector CoarseGrainedMemoryPools; + + /// Array of kernel args memory pools available to the host agents. + llvm::SmallVector ArgsMemoryPools; +}; + +/// Class representing the host device. This host device may have more than one +/// HSA host agent. We aggregate all its resources into the same instance. +struct AMDHostDeviceTy : public AMDGenericDeviceTy { + /// Create a host device from an array of host agents. + AMDHostDeviceTy(const llvm::SmallVector &HostAgents) + : AMDGenericDeviceTy(), Agents(HostAgents), ArgsMemoryManager(), + PinnedMemoryManager() { + assert(HostAgents.size() && "No host agent found"); + } + + /// Initialize the host device memory pools and the memory managers for + /// kernel args and host pinned memory allocations. + Error init() { + if (auto Err = initMemoryPools()) + return Err; + + if (auto Err = ArgsMemoryManager.init(getArgsMemoryPool())) + return Err; + + if (auto Err = PinnedMemoryManager.init(getHostMemoryPool())) + return Err; + + return Plugin::success(); + } + + /// Deinitialize memory pools and managers. + Error deinit() { + if (auto Err = deinitMemoryPools()) + return Err; + + if (auto Err = ArgsMemoryManager.deinit()) + return Err; + + if (auto Err = PinnedMemoryManager.deinit()) + return Err; + + return Plugin::success(); + } + + /// Retrieve and construct all memory pools from the host agents. + Error retrieveAllMemoryPools() override { + // Iterate through the available pools across the host agents. 
+ for (hsa_agent_t Agent : Agents) { + Error Err = utils::iterateAgentMemoryPools( + Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) { + AMDGPUMemoryPoolTy *MemoryPool = + new AMDGPUMemoryPoolTy(HSAMemoryPool); + AllMemoryPools.push_back(MemoryPool); + return HSA_STATUS_SUCCESS; + }); + if (Err) + return Err; + } + return Plugin::success(); + } + + /// Get one of the host agents. + hsa_agent_t getAgent() const override { + // Return the first host agent. + return Agents[0]; + } + + /// Get a memory pool for host pinned allocations. + AMDGPUMemoryPoolTy &getHostMemoryPool() { + assert(!FineGrainedMemoryPools.empty() && "No fine-grained mempool"); + // Retrive any memory pool. + return *FineGrainedMemoryPools[0]; + } + + /// Get a memory pool for kernel args allocations. + AMDGPUMemoryPoolTy &getArgsMemoryPool() { + assert(!ArgsMemoryPools.empty() && "No kernelargs mempool"); + // Retrieve any memory pool. + return *ArgsMemoryPools[0]; + } + + /// Getters for kernel args and host pinned memory managers. + AMDGPUMemoryManagerTy &getArgsMemoryManager() { return ArgsMemoryManager; } + AMDGPUMemoryManagerTy &getPinnedMemoryManager() { + return PinnedMemoryManager; + } + +private: + /// Array of agents on the host side. + const llvm::SmallVector Agents; + + // Memory manager for kernel arguments. + AMDGPUMemoryManagerTy ArgsMemoryManager; + + // Memory manager for pinned memory. + AMDGPUMemoryManagerTy PinnedMemoryManager; +}; + +/// Class implementing the AMDGPU device functionalities which derives from the +/// generic device class. +struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { + // Create an AMDGPU device with a device id and default AMDGPU grid values. + AMDGPUDeviceTy(int32_t DeviceId, int32_t NumDevices, + AMDHostDeviceTy &HostDevice, hsa_agent_t Agent) + : GenericDeviceTy(DeviceId, NumDevices, {0}), AMDGenericDeviceTy(), + OMPX_NumQueues("LIBOMPTARGET_AMDGPU_NUM_QUEUES", 8), + OMPX_QueueSize("LIBOMPTARGET_AMDGPU_QUEUE_SIZE", 1024), + OMPX_StreamSize("LIBOMPTARGET_AMDGPU_STREAM_SIZE", 512), + OMPX_MaxAsyncCopySize("LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_SIZE", + 1 * 1024 * 1024), // 1MB + AMDGPUStreamManager(*this), AMDGPUEventManager(*this), Agent(Agent), + HostDevice(HostDevice), Queues() {} + + ~AMDGPUDeviceTy() {} + + /// Initialize the device, its resources and get its properties. + Error initImpl(GenericPluginTy &Plugin) override { + // First setup all the memory pools. + if (auto Err = initMemoryPools()) + return Err; + + // Get the wavefront size. + uint32_t WavefrontSize = 0; + if (auto Err = getDeviceAttr(HSA_AGENT_INFO_WAVEFRONT_SIZE, WavefrontSize)) + return Err; + GridValues.GV_Warp_Size = WavefrontSize; + + // Load the grid values dependending on the wavefront. + if (WavefrontSize == 32) + GridValues = getAMDGPUGridValues<32>(); + else if (WavefrontSize == 64) + GridValues = getAMDGPUGridValues<64>(); + else + return Plugin::error("Unexpected AMDGPU wavefront %d", WavefrontSize); + + // Get maximum number of workitems per workgroup. + uint16_t WorkgroupMaxDim[3]; + if (auto Err = + getDeviceAttr(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgroupMaxDim)) + return Err; + GridValues.GV_Max_WG_Size = WorkgroupMaxDim[0]; + + // Get maximum number of workgroups. 
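A worked example of the grid-value derivation in this initialization path, using representative (made-up) device numbers:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // Made-up values in the range HSA agents typically report.
  uint32_t GridMaxDimX = 2147483647; // HSA_AGENT_INFO_GRID_MAX_DIM, x axis
  uint16_t WorkgroupMaxDim0 = 1024;  // HSA_AGENT_INFO_WORKGROUP_MAX_DIM, x axis

  uint32_t MaxWGSize = WorkgroupMaxDim0;       // GV_Max_WG_Size
  uint32_t MaxTeams = GridMaxDimX / MaxWGSize; // GV_Max_Teams = 2097151
  std::printf("max workgroup size=%u, max teams=%u\n", MaxWGSize, MaxTeams);
}
```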
+ hsa_dim3_t GridMaxDim; + if (auto Err = getDeviceAttr(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim)) + return Err; + GridValues.GV_Max_Teams = GridMaxDim.x / GridValues.GV_Max_WG_Size; + if (GridValues.GV_Max_Teams == 0) + return Plugin::error("Maximum number of teams cannot be zero"); + + // Get maximum size of any device queues and maximum number of queues. + uint32_t MaxQueueSize; + if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUE_MAX_SIZE, MaxQueueSize)) + return Err; + + uint32_t MaxQueues; + if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUES_MAX, MaxQueues)) + return Err; + + // Compute the number of queues and their size. + const uint32_t NumQueues = std::min(OMPX_NumQueues.get(), MaxQueues); + const uint32_t QueueSize = std::min(OMPX_QueueSize.get(), MaxQueueSize); + + // Construct and initialize each device queue. + Queues = std::vector(NumQueues); + for (AMDGPUQueueTy &Queue : Queues) + if (auto Err = Queue.init(Agent, QueueSize)) + return Err; + + // Initialize stream pool. + if (auto Err = AMDGPUStreamManager.init()) + return Err; + + // Initialize event pool. + if (auto Err = AMDGPUEventManager.init()) + return Err; + + return Plugin::success(); + } + + /// Deinitialize the device and release its resources. + Error deinitImpl() override { + // Deinitialize the stream and event pools. + if (auto Err = AMDGPUStreamManager.deinit()) + return Err; + + if (auto Err = AMDGPUEventManager.deinit()) + return Err; + + // Close modules if necessary. + if (!LoadedImages.empty()) { + // Each image has its own module. + for (DeviceImageTy *Image : LoadedImages) { + AMDGPUDeviceImageTy &AMDImage = + static_cast(*Image); + + // Unload the executable of the image. + if (auto Err = AMDImage.unloadExecutable()) + return Err; + } + } + + for (AMDGPUQueueTy &Queue : Queues) { + if (auto Err = Queue.deinit()) + return Err; + } + + // Invalidate agent reference. + Agent = {0}; + + return Plugin::success(); + } + + /// Allocate and construct an AMDGPU kernel. + Expected + constructKernelEntry(const __tgt_offload_entry &KernelEntry, + DeviceImageTy &Image) override { + AMDGPUDeviceImageTy &AMDImage = static_cast(Image); + + // Create a metadata object for the exec mode global (auto-generated). + StaticGlobalTy ExecModeGlobal( + KernelEntry.name, "_exec_mode"); + + // Retrieve execution mode for the kernel. This may fail since some kernels + // may not have a execution mode. + GenericGlobalHandlerTy &GHandler = Plugin::get().getGlobalHandler(); + if (auto Err = GHandler.readGlobalFromImage(*this, Image, ExecModeGlobal)) { + DP("Failed to read execution mode for '%s': %s\n" + "Using default GENERIC (1) execution mode\n", + KernelEntry.name, toString(std::move(Err)).data()); + // Consume the error since it is acceptable to fail. + consumeError(std::move(Err)); + // In some cases the execution mode is not included, so use the default. + ExecModeGlobal.setValue(llvm::omp::OMP_TGT_EXEC_MODE_GENERIC); + } + + // Check that the retrieved execution mode is valid. + if (!GenericKernelTy::isValidExecutionMode(ExecModeGlobal.getValue())) + return Plugin::error("Invalid execution mode %d for '%s'", + ExecModeGlobal.getValue(), KernelEntry.name); + + // Allocate and initialize the AMDGPU kernel. + AMDGPUKernelTy *AMDKernel = Plugin::get().allocate(); + new (AMDKernel) AMDGPUKernelTy(KernelEntry.name, ExecModeGlobal.getValue()); + + return AMDKernel; + } + + /// Set the current context to this device's context. Do nothing since the + /// AMDGPU devices do not have the concept of contexts. 
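`constructKernelEntry` above depends on the compiler emitting a device global named `<kernel>_exec_mode` alongside each kernel, falling back to generic mode when it is absent. A trivial sketch of the lookup-name construction; the kernel name here is a fabricated example:

```cpp
#include <cstdint>
#include <cstdio>
#include <string>

int main() {
  // Fabricated kernel name for illustration only.
  const char *KernelName = "__omp_offloading_deadbeef_main_l10";
  std::string ExecModeGlobal = std::string(KernelName) + "_exec_mode";

  uint8_t Default = 1; // OMP_TGT_EXEC_MODE_GENERIC, used when the read fails
  std::printf("would read global '%s' (default mode %u)\n",
              ExecModeGlobal.c_str(), Default);
}
```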
+  /// Set the current context to this device's context. Do nothing since the
+  /// AMDGPU devices do not have the concept of contexts.
+  Error setContext() override { return Plugin::success(); }
+
+  /// Get the stream of the asynchronous info structure or get a new one.
+  AMDGPUStreamTy &getStream(AsyncInfoWrapperTy &AsyncInfoWrapper) {
+    AMDGPUStreamTy *&Stream = AsyncInfoWrapper.getQueueAs<AMDGPUStreamTy *>();
+    if (!Stream)
+      Stream = AMDGPUStreamManager.getStream();
+    return *Stream;
+  }
+
+  /// Load the binary image into the device and allocate an image object.
+  Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage,
+                                           int32_t ImageId) override {
+    // Allocate and initialize the image object.
+    AMDGPUDeviceImageTy *AMDImage =
+        Plugin::get().allocate<AMDGPUDeviceImageTy>();
+    new (AMDImage) AMDGPUDeviceImageTy(ImageId, TgtImage);
+
+    // Load the HSA executable.
+    if (Error Err = AMDImage->loadExecutable(*this))
+      return std::move(Err);
+
+    return AMDImage;
+  }
+
+  /// Allocate memory on the device or related to the device.
+  void *allocate(size_t Size, void *, TargetAllocTy Kind) override;
+
+  /// Deallocate memory on the device or related to the device.
+  int free(void *TgtPtr, TargetAllocTy Kind) override {
+    if (TgtPtr == nullptr)
+      return OFFLOAD_SUCCESS;
+
+    AMDGPUMemoryPoolTy *MemoryPool = nullptr;
+    switch (Kind) {
+    case TARGET_ALLOC_DEFAULT:
+    case TARGET_ALLOC_DEVICE:
+      MemoryPool = CoarseGrainedMemoryPools[0];
+      break;
+    case TARGET_ALLOC_HOST:
+      MemoryPool = &HostDevice.getHostMemoryPool();
+      break;
+    case TARGET_ALLOC_SHARED:
+      // TODO: Not supported yet. We could look at fine-grained host memory
+      // pools that are accessible by this device. The allocation should be
+      // made explicitly accessible if it is not yet.
+      break;
+    }
+
+    if (!MemoryPool) {
+      REPORT("No memory pool for the specified allocation kind\n");
+      return OFFLOAD_FAIL;
+    }
+
+    if (Error Err = MemoryPool->deallocate(TgtPtr)) {
+      REPORT("%s\n", toString(std::move(Err)).data());
+      return OFFLOAD_FAIL;
+    }
+
+    if (Kind == TARGET_ALLOC_HOST) {
+      std::lock_guard<std::shared_mutex> Lock(HostAllocationsMutex);
+      size_t Erased = HostAllocations.erase(TgtPtr);
+      if (!Erased) {
+        REPORT("Cannot find a host allocation in the map\n");
+        return OFFLOAD_FAIL;
+      }
+    }
+
+    return OFFLOAD_SUCCESS;
+  }
+
+  /// Synchronize current thread with the pending operations on the async info.
+  Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
+    AMDGPUStreamTy *Stream =
+        reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
+    assert(Stream && "Invalid stream");
+
+    if (auto Err = Stream->synchronize())
+      return Err;
+
+    // Once the stream is synchronized, return it to the stream pool and reset
+    // AsyncInfo. This is to make sure the synchronization only works for its
+    // own tasks.
+    AMDGPUStreamManager.returnStream(Stream);
+    AsyncInfo.Queue = nullptr;
+
+    return Plugin::success();
+  }
+
+  /// Submit data to the device (host to device transfer).
+  Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
+                       AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    void *PinnedHstPtr = nullptr;
+    if (isHostPinnedMemory(HstPtr)) {
+      // Use one-step asynchronous operation when host memory is already pinned.
+      AMDGPUStreamTy &Stream = getStream(AsyncInfoWrapper);
+      if (auto Err = Stream.pushPinnedMemoryCopyAsync(TgtPtr, HstPtr, Size))
+        return Err;
+    } else if (Size >= OMPX_MaxAsyncCopySize) {
+      // For large transfers use synchronous behavior.
+ if (AsyncInfoWrapper.hasQueue()) { + if (auto Err = synchronize(AsyncInfoWrapper)) + return Err; + } + + hsa_status_t Status; + Status = + hsa_amd_memory_lock((void *)HstPtr, Size, nullptr, 0, &PinnedHstPtr); + if (auto Err = + Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n")) + return Err; + + AMDGPUSignalTy Signal; + if (auto Err = Signal.init(1)) + return Err; + + Status = hsa_amd_memory_async_copy(TgtPtr, Agent, PinnedHstPtr, Agent, + Size, 0, nullptr, Signal.get()); + if (auto Err = + Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s")) + return Err; + + if (auto Err = Signal.wait()) + return Err; + + if (auto Err = Signal.deinit()) + return Err; + + Status = hsa_amd_memory_unlock((void *)HstPtr); + if (auto Err = + Plugin::check(Status, "Error in hsa_amd_memory_unlock: %s\n")) + return Err; + } else { + // Otherwise, use two-step copy with an intermediate pinned host buffer. + AMDGPUMemoryManagerTy &PinnedMemoryManager = + HostDevice.getPinnedMemoryManager(); + if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedHstPtr)) + return Err; + + AMDGPUStreamTy &Stream = getStream(AsyncInfoWrapper); + if (auto Err = Stream.pushMemoryCopyH2DAsync(TgtPtr, HstPtr, PinnedHstPtr, + Size, PinnedMemoryManager)) + return Err; + } + return Plugin::success(); + } + + /// Retrieve data from the device (device to host transfer). + Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + void *PinnedHstPtr = nullptr; + if (isHostPinnedMemory(HstPtr)) { + // Use one-step asynchronous operation when host memory is already pinned. + AMDGPUStreamTy &Stream = getStream(AsyncInfoWrapper); + return Stream.pushPinnedMemoryCopyAsync(HstPtr, TgtPtr, Size); + } else if (Size >= OMPX_MaxAsyncCopySize) { + // For large transfers use synchronous behavior. + if (AsyncInfoWrapper.hasQueue()) { + if (auto Err = synchronize(AsyncInfoWrapper)) + return Err; + } + + hsa_status_t Status; + Status = hsa_amd_memory_lock(HstPtr, Size, nullptr, 0, &PinnedHstPtr); + if (auto Err = + Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n")) + return Err; + + AMDGPUSignalTy Signal; + if (auto Err = Signal.init(1)) + return Err; + + Status = hsa_amd_memory_async_copy(PinnedHstPtr, Agent, TgtPtr, Agent, + Size, 0, nullptr, Signal.get()); + if (auto Err = + Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s")) + return Err; + + if (auto Err = Signal.wait()) + return Err; + + if (auto Err = Signal.deinit()) + return Err; + + Status = hsa_amd_memory_unlock(HstPtr); + if (auto Err = + Plugin::check(Status, "Error in hsa_amd_memory_unlock: %s\n")) + return Err; + } else { + // Otherwise, use two-step copy with an intermediate pinned host buffer. + AMDGPUMemoryManagerTy &PinnedMemoryManager = + HostDevice.getPinnedMemoryManager(); + if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedHstPtr)) + return Err; + + AMDGPUStreamTy &Stream = getStream(AsyncInfoWrapper); + if (auto Err = Stream.pushMemoryCopyD2HAsync(HstPtr, TgtPtr, PinnedHstPtr, + Size, PinnedMemoryManager)) + return Err; + } + return Plugin::success(); + } + + /// Exchange data between two devices within the plugin. This function is not + /// supported in this plugin. + Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice, + void *DstPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + // This function should never be called because the function + // AMDGPUPluginTy::isDataExchangable() returns false. 
+    return Plugin::error("dataExchangeImpl not supported");
+  }
+
+  /// Initialize the async info for interoperability purposes.
+  Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    // TODO: Implement this function.
+    return Plugin::success();
+  }
+
+  /// Initialize the device info for interoperability purposes.
+  Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override {
+    DeviceInfo->Context = nullptr;
+
+    if (!DeviceInfo->Device)
+      DeviceInfo->Device = reinterpret_cast<void *>(Agent.handle);
+
+    return Plugin::success();
+  }
+
+  /// Create an event.
+  Error createEventImpl(void **EventPtrStorage) override {
+    AMDGPUEventTy **Event = reinterpret_cast<AMDGPUEventTy **>(EventPtrStorage);
+    *Event = AMDGPUEventManager.getEvent();
+    return Plugin::success();
+  }
+
+  /// Destroy a previously created event.
+  Error destroyEventImpl(void *EventPtr) override {
+    AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
+    AMDGPUEventManager.returnEvent(Event);
+    return Plugin::success();
+  }
+
+  /// Record the event.
+  Error recordEventImpl(void *EventPtr,
+                        AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
+    assert(Event && "Invalid event");
+
+    AMDGPUStreamTy &Stream = getStream(AsyncInfoWrapper);
+
+    return Event->record(Stream);
+  }
+
+  /// Make the stream wait on the event.
+  Error waitEventImpl(void *EventPtr,
+                      AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
+
+    AMDGPUStreamTy &Stream = getStream(AsyncInfoWrapper);
+
+    return Event->wait(Stream);
+  }
+
+  /// Synchronize the current thread with the event.
+  Error syncEventImpl(void *EventPtr) override {
+    return Plugin::error("Synchronize event not implemented");
+  }
+
+  /// Print information about the device.
+  Error printInfoImpl() override {
+    // TODO: Implement the basic info.
+    return Plugin::success();
+  }
+
+  /// Getters and setters for stack and heap sizes.
+  Error getDeviceStackSize(uint64_t &Value) override {
+    Value = 0;
+    return Plugin::success();
+  }
+  Error setDeviceStackSize(uint64_t Value) override {
+    return Plugin::success();
+  }
+  Error getDeviceHeapSize(uint64_t &Value) override {
+    Value = 0;
+    return Plugin::success();
+  }
+  Error setDeviceHeapSize(uint64_t Value) override { return Plugin::success(); }
+
+  /// AMDGPU-specific function to get device attributes.
+  template <typename Ty> Error getDeviceAttr(uint32_t Kind, Ty &Value) {
+    hsa_status_t Status =
+        hsa_agent_get_info(Agent, (hsa_agent_info_t)Kind, &Value);
+    return Plugin::check(Status, "Error in hsa_agent_get_info: %s");
+  }
+
+  /// Get the device agent.
+  hsa_agent_t getAgent() const override { return Agent; }
+
+  /// Retrieve and construct all memory pools of the device agent.
+  Error retrieveAllMemoryPools() override {
+    // Iterate through the available pools of the device agent.
+    return utils::iterateAgentMemoryPools(
+        Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) {
+          AMDGPUMemoryPoolTy *MemoryPool =
+              Plugin::get().allocate<AMDGPUMemoryPoolTy>();
+          new (MemoryPool) AMDGPUMemoryPoolTy(HSAMemoryPool);
+          AllMemoryPools.push_back(MemoryPool);
+          return HSA_STATUS_SUCCESS;
+        });
+  }
+
+  /// Get the next queue in a round-robin fashion.
+  AMDGPUQueueTy &getNextQueue() {
+    static std::atomic<uint32_t> NextQueue(0);
+
+    uint32_t Current = NextQueue.fetch_add(1, std::memory_order_relaxed);
+    return Queues[Current % Queues.size()];
+  }
+
+  /// Get the stream capacity.
+  uint32_t getStreamSize() const { return OMPX_StreamSize; }
+
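As a summary of the dataSubmitImpl/dataRetrieveImpl logic above, the plugin picks one of three copy paths based on the buffer's pinnedness and size. A standalone sketch of just the selection (illustrative names, not the plugin API):

#include <cstdint>

// Already-pinned host buffers take a one-step asynchronous copy; transfers of
// at least LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_SIZE bytes (1 MiB by default)
// are locked and copied synchronously; anything smaller is staged through a
// pinned bounce buffer.
enum class CopyPath { OneStepAsync, SyncLocked, StagedAsync };

CopyPath selectCopyPath(bool HostPtrIsPinned, int64_t Size,
                        int64_t MaxAsyncCopySize) {
  if (HostPtrIsPinned)
    return CopyPath::OneStepAsync;
  if (Size >= MaxAsyncCopySize)
    return CopyPath::SyncLocked;
  return CopyPath::StagedAsync;
}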
+  /// Check whether a buffer is a host pinned buffer.
+  bool isHostPinnedMemory(const void *Ptr) const {
+    bool Found = false;
+    std::shared_lock<std::shared_mutex> Lock(HostAllocationsMutex);
+    if (!HostAllocations.empty()) {
+      auto It = HostAllocations.lower_bound(Ptr);
+      if (It != HostAllocations.end() && It->first == Ptr) {
+        Found = true;
+      } else if (It != HostAllocations.begin()) {
+        --It;
+        Found = ((const char *)It->first + It->second > (const char *)Ptr);
+      }
+    }
+    return Found;
+  }
+
+private:
+  using AMDGPUStreamManagerTy = GenericStreamManagerTy;
+  using AMDGPUEventManagerTy = GenericEventManagerTy;
+
+  /// Environment variables to control queues and streams.
+  UInt32Envar OMPX_NumQueues;
+  UInt32Envar OMPX_QueueSize;
+  UInt32Envar OMPX_StreamSize;
+  UInt32Envar OMPX_MaxAsyncCopySize;
+
+  /// Stream manager for AMDGPU streams.
+  AMDGPUStreamManagerTy AMDGPUStreamManager;
+
+  /// Event manager for AMDGPU events.
+  AMDGPUEventManagerTy AMDGPUEventManager;
+
+  /// The agent handle corresponding to the device.
+  hsa_agent_t Agent;
+
+  /// Reference to the host device.
+  AMDHostDeviceTy &HostDevice;
+
+  /// List of device packet queues.
+  std::vector<AMDGPUQueueTy> Queues;
+
+  /// Map of host pinned allocations. We track these pinned allocations so that
+  /// memory transfers involving these allocations do not need a two-step copy
+  /// with an intermediate pinned buffer.
+  std::map<const void *, size_t> HostAllocations;
+  mutable std::shared_mutex HostAllocationsMutex;
+};
+
+Error AMDGPUStreamRef::create(GenericDeviceTy &Device) {
+  if (Stream)
+    return Plugin::error("Creating an existing stream");
+
+  AMDGPUDeviceTy &AMDGPUDevice = reinterpret_cast<AMDGPUDeviceTy &>(Device);
+
+  Stream = new AMDGPUStreamTy(AMDGPUDevice.getNextQueue(),
+                              AMDGPUDevice.getStreamSize());
+
+  return Stream->init();
+}
+
+Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
+  hsa_status_t Status;
+  Status = hsa_code_object_deserialize(getStart(), getSize(), "", &CodeObject);
+  if (auto Err =
+          Plugin::check(Status, "Error in hsa_code_object_deserialize: %s"))
+    return Err;
+
+  Status = hsa_executable_create_alt(
+      HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "", &Executable);
+  if (auto Err =
+          Plugin::check(Status, "Error in hsa_executable_create_alt: %s"))
+    return Err;
+
+  Status = hsa_executable_load_code_object(Executable, Device.getAgent(),
+                                           CodeObject, "");
+  if (auto Err = Plugin::check(
+          Status, "Error in hsa_executable_load_code_object: %s"))
+    return Err;
+
+  Status = hsa_executable_freeze(Executable, "");
+  if (auto Err = Plugin::check(Status, "Error in hsa_executable_freeze: %s"))
+    return Err;
+
+  uint32_t Result;
+  Status = hsa_executable_validate(Executable, &Result);
+  if (auto Err = Plugin::check(Status, "Error in hsa_executable_validate: %s"))
+    return Err;
+
+  if (Result)
+    return Plugin::error("Loaded HSA executable does not validate");
+
+  return Plugin::success();
+}
+
+Expected<hsa_executable_symbol_t>
+AMDGPUDeviceImageTy::findDeviceSymbol(GenericDeviceTy &Device,
+                                      StringRef SymbolName) const {
+
+  AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(Device);
+  hsa_agent_t Agent = AMDGPUDevice.getAgent();
+
+  hsa_executable_symbol_t Symbol;
+  hsa_status_t Status = hsa_executable_get_symbol_by_name(
+      Executable, SymbolName.data(), &Agent, &Symbol);
+  if (auto Err = Plugin::check(
+          Status, "Error in hsa_executable_get_symbol_by_name(%s): %s",
+          SymbolName.data()))
+    return std::move(Err);
+
+  return Symbol;
+}
+
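The pinned-allocation lookup in isHostPinnedMemory() above is an interval-containment test over the sorted allocation map. A minimal standalone sketch of the same logic (hypothetical helper name):

#include <cstddef>
#include <map>

// Find the first allocation that does not start before Ptr; if it does not
// start exactly at Ptr, step back one entry and check whether Ptr falls
// inside the previous allocation's [base, base + size) range.
bool containsPtr(const std::map<const void *, std::size_t> &Allocs,
                 const void *Ptr) {
  auto It = Allocs.lower_bound(Ptr);
  if (It != Allocs.end() && It->first == Ptr)
    return true;
  if (It == Allocs.begin())
    return false;
  --It;
  return (const char *)It->first + It->second > (const char *)Ptr;
}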
+/// Class implementing the AMDGPU-specific functionalities of the global
+/// handler.
+class AMDGPUGlobalHandlerTy final : public GenericGlobalHandlerTy {
+  /// Extract the global's information from the ELF image, section, and symbol.
+  Error getGlobalMetadataFromELF(const DeviceImageTy &Image,
+                                 const ELF64LE::Sym &Symbol,
+                                 const ELF64LE::Shdr &Section,
+                                 GlobalTy &ImageGlobal) override {
+    // The global's address in AMDGPU is computed as the image begin + the ELF
+    // symbol value. Notice we do not add the ELF section offset.
+    ImageGlobal.setPtr((char *)Image.getStart() + Symbol.st_value);
+
+    // Set the global's size.
+    ImageGlobal.setSize(Symbol.st_size);
+
+    return Plugin::success();
+  }
+
+public:
+  /// Get the metadata of a global from the device. The name and size of the
+  /// global are read from DeviceGlobal and the address of the global is
+  /// written to DeviceGlobal.
+  Error getGlobalMetadataFromDevice(GenericDeviceTy &Device,
+                                    DeviceImageTy &Image,
+                                    GlobalTy &DeviceGlobal) override {
+    AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(Image);
+
+    // Find the symbol on the device executable.
+    auto SymbolOrErr =
+        AMDImage.findDeviceSymbol(Device, DeviceGlobal.getName());
+    if (!SymbolOrErr)
+      return SymbolOrErr.takeError();
+
+    hsa_executable_symbol_t Symbol = *SymbolOrErr;
+    hsa_symbol_kind_t SymbolType;
+    hsa_status_t Status;
+    uint64_t SymbolAddr;
+    uint32_t SymbolSize;
+
+    // Retrieve the type, address and size of the symbol.
+    std::pair<hsa_executable_symbol_info_t, void *> RequiredInfos[] = {
+        {HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &SymbolType},
+        {HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &SymbolAddr},
+        {HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &SymbolSize}};
+
+    for (auto &Info : RequiredInfos) {
+      Status = hsa_executable_symbol_get_info(Symbol, Info.first, Info.second);
+      if (auto Err = Plugin::check(
+              Status, "Error in hsa_executable_symbol_get_info: %s"))
+        return Err;
+    }
+
+    // Check the size of the symbol.
+    if (SymbolSize != DeviceGlobal.getSize())
+      return Plugin::error(
+          "Failed to load global '%s' due to size mismatch (%zu != %zu)",
+          DeviceGlobal.getName().data(), (size_t)SymbolSize,
+          (size_t)DeviceGlobal.getSize());
+
+    // Store the symbol address on the device global metadata.
+    DeviceGlobal.setPtr(reinterpret_cast<void *>(SymbolAddr));
+
+    return Plugin::success();
+  }
+};
+
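Note the contrast with the generic ELF handler later in this patch, which does add the section offset. A standalone sketch of the two computations (simplified types; the rationale for the AMDGPU variant is inferred from the comment above, not stated by the patch):

#include <cstdint>

// The generic handler locates a global inside the raw image bytes, while the
// AMDGPU handler adds only the symbol value, which is presumably already
// image-relative for HSA code objects (an assumption, see the lead-in).
uintptr_t genericGlobalAddr(uintptr_t ImageBase, uint64_t SectionOffset,
                            uint64_t SymbolValue) {
  return ImageBase + SectionOffset + SymbolValue;
}

uintptr_t amdgpuGlobalAddr(uintptr_t ImageBase, uint64_t SymbolValue) {
  return ImageBase + SymbolValue;
}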
+/// Class implementing the AMDGPU-specific functionalities of the plugin.
+struct AMDGPUPluginTy final : public GenericPluginTy {
+  /// Create an AMDGPU plugin and initialize the AMDGPU driver.
+  AMDGPUPluginTy() : GenericPluginTy(), HostDevice(nullptr) {}
+
+  /// This class should not be copied.
+  AMDGPUPluginTy(const AMDGPUPluginTy &) = delete;
+  AMDGPUPluginTy(AMDGPUPluginTy &&) = delete;
+
+  /// Initialize the plugin and return the number of devices.
+  Expected<int32_t> initImpl() override {
+    hsa_status_t Status = hsa_init();
+    if (Status != HSA_STATUS_SUCCESS) {
+      // Cannot call hsa_status_string if hsa_init failed.
+      DP("Failed to initialize AMDGPU's HSA library\n");
+      return 0;
+    }
+
+    // Register event handler to detect memory errors on the devices.
+    Status = hsa_amd_register_system_event_handler(eventHandler, nullptr);
+    if (auto Err = Plugin::check(
+            Status, "Error in hsa_amd_register_system_event_handler: %s"))
+      return std::move(Err);
+
+    // List of host (CPU) agents.
+    llvm::SmallVector<hsa_agent_t> HostAgents;
+
+    // Count the number of available agents.
+    auto Err = utils::iterateAgents([&](hsa_agent_t Agent) {
+      // Get the device type of the agent.
+      hsa_device_type_t DeviceType;
+      hsa_status_t Status =
+          hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType);
+      if (Status != HSA_STATUS_SUCCESS)
+        return Status;
+
+      // Classify the agents into kernel (GPU) and host (CPU) agents.
+      if (DeviceType == HSA_DEVICE_TYPE_GPU) {
+        // Ensure that the GPU agent supports kernel dispatch packets.
+        hsa_agent_feature_t Features;
+        Status = hsa_agent_get_info(Agent, HSA_AGENT_INFO_FEATURE, &Features);
+        if (Status != HSA_STATUS_SUCCESS)
+          return Status;
+        if (Features & HSA_AGENT_FEATURE_KERNEL_DISPATCH)
+          KernelAgents.push_back(Agent);
+      } else if (DeviceType == HSA_DEVICE_TYPE_CPU) {
+        HostAgents.push_back(Agent);
+      }
+      return HSA_STATUS_SUCCESS;
+    });
+
+    if (Err)
+      return std::move(Err);
+
+    int32_t NumDevices = KernelAgents.size();
+    if (NumDevices == 0) {
+      // Do not initialize if there are no devices.
+      DP("There are no devices supporting AMDGPU.\n");
+      return 0;
+    }
+
+    // There are kernel agents but there is no host agent. That should be
+    // treated as an error.
+    if (HostAgents.empty())
+      return Plugin::error("No AMDGPU host agents");
+
+    // Initialize the host device using host agents.
+    HostDevice = allocate<AMDHostDeviceTy>();
+    new (HostDevice) AMDHostDeviceTy(HostAgents);
+
+    // Set up the memory pools available to the host.
+    if (Err = HostDevice->init())
+      return std::move(Err);
+
+    return NumDevices;
+  }
+
+  /// Deinitialize the plugin.
+  Error deinitImpl() override {
+    if (auto Err = HostDevice->deinit())
+      return Err;
+
+    // Finalize the HSA runtime.
+    hsa_status_t Status = hsa_shut_down();
+    return Plugin::check(Status, "Error in hsa_shut_down: %s");
+  }
+
+  /// Get the ELF code for recognizing the compatible image binary.
+  uint16_t getMagicElfBits() const override { return ELF::EM_AMDGPU; }
+
+  /// Check whether the image is compatible with an AMDGPU device.
+  Expected<bool> isImageCompatible(__tgt_image_info *Info) const override {
+    for (hsa_agent_t Agent : KernelAgents) {
+      std::string Target;
+      auto Err = utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) {
+        uint32_t Length;
+        hsa_status_t Status;
+        Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME_LENGTH, &Length);
+        if (Status != HSA_STATUS_SUCCESS)
+          return Status;
+
+        // TODO: This is not allowed by the standard.
+        char ISAName[Length];
+        Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, ISAName);
+        if (Status != HSA_STATUS_SUCCESS)
+          return Status;
+
+        llvm::StringRef TripleTarget(ISAName);
+        if (TripleTarget.consume_front("amdgcn-amd-amdhsa"))
+          Target = TripleTarget.ltrim('-').str();
+        return HSA_STATUS_SUCCESS;
+      });
+      if (Err)
+        return std::move(Err);
+
+      if (!isImageCompatible(Info, Target))
+        return false;
+    }
+    return true;
+  }
+
+  /// This plugin does not support exchanging data between two devices.
+  bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) override {
+    return false;
+  }
+
+  /// Get the host device instance.
+  AMDHostDeviceTy &getHostDevice() {
+    assert(HostDevice && "Host device not initialized");
+    return *HostDevice;
+  }
+
+  /// Get the kernel agent with the corresponding agent id.
+  hsa_agent_t getKernelAgent(int32_t AgentId) const {
+    assert((uint32_t)AgentId < KernelAgents.size() && "Invalid agent id");
+    return KernelAgents[AgentId];
+  }
+
+  /// Get the list of the available kernel agents.
+  const llvm::SmallVector<hsa_agent_t> &getKernelAgents() const {
+    return KernelAgents;
+  }
+
+private:
+  /// Check whether an image is compatible with the current system's
+  /// environment.
+  bool isImageCompatible(__tgt_image_info *Info,
+                         const std::string &EnvInfo) const {
+    llvm::StringRef ImgTID(Info->Arch), EnvTID(EnvInfo);
+
+    // Compatible in case of exact match.
+    if (ImgTID == EnvTID) {
+      DP("Compatible: Exact match \t[Image: %s]\t:\t[Environment: %s]\n",
+         ImgTID.data(), EnvTID.data());
+      return true;
+    }
+
+    // Parse the target IDs into a processor and a feature map each.
+    StringMap<bool> ImgMap, EnvMap;
+    StringRef ImgArch = utils::parseTargetID(ImgTID, ImgMap);
+    StringRef EnvArch = utils::parseTargetID(EnvTID, EnvMap);
+
+    // Incompatible if either target ID lacks a processor or the processors
+    // do not match.
+    if (EnvArch.empty() || ImgArch.empty() || !ImgArch.contains(EnvArch)) {
+      DP("Incompatible: Processor mismatch \t[Image: %s]\t:\t[Environment: "
+         "%s]\n",
+         ImgTID.data(), EnvTID.data());
+      return false;
+    }
+
+    // Incompatible if the image has more features than the environment,
+    // irrespective of the type or sign of the features.
+    if (ImgMap.size() > EnvMap.size()) {
+      DP("Incompatible: Image has more features than the environment "
+         "\t[Image: %s]\t:\t[Environment: %s]\n",
+         ImgTID.data(), EnvTID.data());
+      return false;
+    }
+
+    // Compatible if each target feature specified by the image is compatible
+    // with the corresponding feature of the environment. A feature is
+    // compatible if the image does not specify it (meaning Any), or if it
+    // specifies it with the same value (meaning On or Off).
+    for (const auto &ImgFeature : ImgMap) {
+      auto EnvFeature = EnvMap.find(ImgFeature.first());
+      if (EnvFeature == EnvMap.end()) {
+        DP("Incompatible: Image's non-ANY feature does not match the "
+           "Environment feature's ANY value \t[Image: %s]\t:\t[Environment: "
+           "%s]\n",
+           ImgTID.data(), EnvTID.data());
+        return false;
+      } else if (EnvFeature->first() == ImgFeature.first() &&
+                 EnvFeature->second != ImgFeature.second) {
+        DP("Incompatible: Image's non-ANY feature does not match the "
+           "Environment feature's non-ANY value \t[Image: %s]\t:\t"
+           "[Environment: %s]\n",
+           ImgTID.data(), EnvTID.data());
+        return false;
+      }
+    }
+
+    // The image is compatible: every feature it specifies is present in the
+    // environment with the same sign, and the features it omits are treated
+    // as ANY.
+    DP("Compatible: Target IDs are compatible \t[Image: %s]\t:\t[Environment: "
+       "%s]\n",
+       ImgTID.data(), EnvTID.data());
+    return true;
+  }
+
+  /// Event handler that will be called by ROCr if an event is detected.
+  static hsa_status_t eventHandler(const hsa_amd_event_t *Event, void *) {
+    if (Event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT) {
+      // Abort the execution since we do not recover from this error.
+      FATAL_MESSAGE0(1, "Received HSA_AMD_GPU_MEMORY_FAULT_EVENT");
+      return HSA_STATUS_ERROR;
+    }
+    return HSA_STATUS_SUCCESS;
+  }
+
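The same rules condensed into a standalone sketch (a simplified stand-in for utils::parseTargetID and the matching above; the "gfx90a:xnack-" style format and exact-equality processor check are simplifying assumptions):

#include <map>
#include <sstream>
#include <string>

// Split "gfx90a:xnack-:sramecc+" into a processor and a feature map, then
// require matching processors and, for every feature the image pins, the same
// '+'/'-' setting in the environment; features the image omits count as ANY.
static void parse(const std::string &TID, std::string &Proc,
                  std::map<std::string, char> &Features) {
  std::stringstream SS(TID);
  std::getline(SS, Proc, ':');
  for (std::string F; std::getline(SS, F, ':');)
    if (!F.empty())
      Features[F.substr(0, F.size() - 1)] = F.back(); // '+' or '-'
}

static bool compatible(const std::string &Img, const std::string &Env) {
  std::string ImgProc, EnvProc;
  std::map<std::string, char> ImgF, EnvF;
  parse(Img, ImgProc, ImgF);
  parse(Env, EnvProc, EnvF);
  if (ImgProc != EnvProc || ImgF.size() > EnvF.size())
    return false;
  for (auto &[Name, Sign] : ImgF) {
    auto It = EnvF.find(Name);
    if (It == EnvF.end() || It->second != Sign)
      return false;
  }
  return true;
}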
+  /// Array of the available kernel (GPU) agents. These handles should not be
+  /// here but in the AMDGPUDeviceTy structures directly. However, the HSA
+  /// standard does not provide API functions to retrieve agents directly,
+  /// only iterating functions. We cache the agents here for convenience.
+  llvm::SmallVector<hsa_agent_t> KernelAgents;
+
+  /// The device representing all HSA host agents.
+  AMDHostDeviceTy *HostDevice;
+};
+
+Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
+                                 uint32_t NumThreads, uint64_t NumBlocks,
+                                 uint32_t DynamicMemorySize,
+                                 int32_t NumKernelArgs, void *KernelArgs,
+                                 AsyncInfoWrapperTy &AsyncInfoWrapper) const {
+  const uint32_t KernelArgsSize = NumKernelArgs * sizeof(void *);
+
+  if (ArgsSize < KernelArgsSize)
+    return Plugin::error("Mismatch of kernel arguments size");
+
+  // The args size reported by HSA may or may not contain the implicit args.
+  // For now, assume that HSA does not consider the implicit arguments when
+  // reporting the arguments of a kernel. In the worst case, we can waste
+  // 56 bytes per allocation.
+  uint32_t AllArgsSize = KernelArgsSize + ImplicitArgsSize;
+
+  AMDHostDeviceTy &HostDevice = Plugin::get<AMDGPUPluginTy>().getHostDevice();
+  AMDGPUMemoryManagerTy &ArgsMemoryManager = HostDevice.getArgsMemoryManager();
+
+  void *AllArgs = nullptr;
+  if (auto Err = ArgsMemoryManager.allocate(AllArgsSize, &AllArgs))
+    return Err;
+
+  // Initialize implicit arguments.
+  utils::impl_implicit_args_t *ImplArgs =
+      reinterpret_cast<utils::impl_implicit_args_t *>(
+          static_cast<char *>(AllArgs) + KernelArgsSize);
+
+  // Initialize the implicit arguments to zero.
+  std::memset(ImplArgs, 0, ImplicitArgsSize);
+
+  // Copy the explicit arguments.
+  for (int32_t ArgId = 0; ArgId < NumKernelArgs; ++ArgId) {
+    void *Dst = (char *)AllArgs + sizeof(void *) * ArgId;
+    void *Src = *((void **)KernelArgs + ArgId);
+    std::memcpy(Dst, Src, sizeof(void *));
+  }
+
+  AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(GenericDevice);
+  AMDGPUStreamTy &Stream = AMDGPUDevice.getStream(AsyncInfoWrapper);
+
+  // Push the kernel launch into the stream.
+  if (auto Err = Stream.pushKernel(*this, AllArgs, NumThreads, NumBlocks,
+                                   ArgsMemoryManager))
+    return Err;
+
+  return Plugin::success();
+}
+
+GenericPluginTy *Plugin::createPlugin() { return new AMDGPUPluginTy(); }
+
+GenericDeviceTy *Plugin::createDevice(int32_t DeviceId, int32_t NumDevices) {
+  AMDGPUPluginTy &Plugin = get<AMDGPUPluginTy>();
+  return new AMDGPUDeviceTy(DeviceId, NumDevices, Plugin.getHostDevice(),
+                            Plugin.getKernelAgent(DeviceId));
+}
+
+GenericGlobalHandlerTy *Plugin::createGlobalHandler() {
+  return new AMDGPUGlobalHandlerTy();
+}
+
+template <typename... ArgsTy>
+Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
+  hsa_status_t ResultCode = static_cast<hsa_status_t>(Code);
+  if (ResultCode == HSA_STATUS_SUCCESS || ResultCode == HSA_STATUS_INFO_BREAK)
+    return Error::success();
+
+  const char *Desc = "Unknown error";
+  hsa_status_t Ret = hsa_status_string(ResultCode, &Desc);
+  if (Ret != HSA_STATUS_SUCCESS)
+    REPORT("Unrecognized " GETNAME(TARGET_NAME) " error code %d\n", Code);
+
+  return createStringError(inconvertibleErrorCode(), ErrFmt, Args..., Desc);
+}
+
+void *AMDGPUMemoryManagerTy::allocate(size_t Size, void *HstPtr,
+                                      TargetAllocTy Kind) {
+  // Allocate memory from the pool.
+  void *Ptr = nullptr;
+  if (auto Err = MemoryPool->allocate(Size, &Ptr)) {
+    consumeError(std::move(Err));
+    return nullptr;
+  }
+  assert(Ptr && "Invalid pointer");
+
+  auto &KernelAgents = Plugin::get<AMDGPUPluginTy>().getKernelAgents();
+
+  // Allow all kernel agents to access the allocation.
+  if (auto Err = MemoryPool->enableAccess(Ptr, Size, KernelAgents)) {
+    REPORT("%s\n", toString(std::move(Err)).data());
+    return nullptr;
+  }
+  return Ptr;
+}
+
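The argument buffer built in launchImpl above has a fixed layout: NumKernelArgs pointer-sized explicit slots followed by the zero-initialized 56-byte implicit-argument block. A standalone sketch of that packing (hypothetical helper, not the plugin API):

#include <cstdint>
#include <cstring>

// Args[I] points at the value of the I-th kernel argument; each value is
// copied into its pointer-sized slot, and the trailing implicit-argument
// block is cleared, exactly as the launch path above does.
void packArgs(void *Buffer, void **Args, int32_t NumArgs,
              uint32_t ImplicitArgsSize) {
  char *Base = static_cast<char *>(Buffer);
  for (int32_t I = 0; I < NumArgs; ++I)
    std::memcpy(Base + I * sizeof(void *), Args[I], sizeof(void *));
  std::memset(Base + NumArgs * sizeof(void *), 0, ImplicitArgsSize);
}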
+void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
+  if (Size == 0)
+    return nullptr;
+
+  // Find the correct memory pool.
+  AMDGPUMemoryPoolTy *MemoryPool = nullptr;
+  switch (Kind) {
+  case TARGET_ALLOC_DEFAULT:
+  case TARGET_ALLOC_DEVICE:
+    MemoryPool = CoarseGrainedMemoryPools[0];
+    break;
+  case TARGET_ALLOC_HOST:
+    MemoryPool = &HostDevice.getHostMemoryPool();
+    break;
+  case TARGET_ALLOC_SHARED:
+    // TODO: Not supported yet. We could look at fine-grained host memory
+    // pools that are accessible by this device. The allocation should be made
+    // explicitly accessible if it is not yet.
+    break;
+  }
+
+  if (!MemoryPool) {
+    REPORT("No memory pool for the specified allocation kind\n");
+    return nullptr;
+  }
+
+  // Allocate from the corresponding memory pool.
+  void *Alloc = nullptr;
+  if (Error Err = MemoryPool->allocate(Size, &Alloc)) {
+    REPORT("%s\n", toString(std::move(Err)).data());
+    return nullptr;
+  }
+
+  if (Kind == TARGET_ALLOC_HOST && Alloc) {
+    auto &KernelAgents = Plugin::get<AMDGPUPluginTy>().getKernelAgents();
+
+    // Enable all kernel agents to access the host pinned buffer.
+    if (auto Err = MemoryPool->enableAccess(Alloc, Size, KernelAgents)) {
+      REPORT("%s\n", toString(std::move(Err)).data());
+    }
+
+    // Keep track of the host pinned allocations for optimizations in
+    // transfers.
+    std::lock_guard<std::shared_mutex> Lock(HostAllocationsMutex);
+    HostAllocations.insert({Alloc, Size});
+  }
+
+  return Alloc;
+}
+
+} // namespace plugin
+} // namespace target
+} // namespace omp
+} // namespace llvm
Index: openmp/libomptarget/plugins-nextgen/common/PluginInterface/GlobalHandler.h
===================================================================
--- openmp/libomptarget/plugins-nextgen/common/PluginInterface/GlobalHandler.h
+++ openmp/libomptarget/plugins-nextgen/common/PluginInterface/GlobalHandler.h
@@ -35,14 +35,7 @@
 /// Common abstraction for globals that live on the host and device.
 /// It simply encapsulates the symbol name, symbol size, and symbol address
 /// (which might be host or device depending on the context).
-class GlobalTy {
-  // NOTE: Maybe we can have a pointer to the offload entry name instead of
-  // holding a private copy of the name as a std::string.
-  std::string Name;
-  uint32_t Size;
-  void *Ptr;
-
-public:
+struct GlobalTy {
   GlobalTy(const std::string &Name, uint32_t Size, void *Ptr = nullptr)
       : Name(Name), Size(Size), Ptr(Ptr) {}
 
@@ -55,13 +48,17 @@
   void setSize(int32_t S) { Size = S; }
 
   void setPtr(void *P) { Ptr = P; }
+
+private:
+  // NOTE: Maybe we can have a pointer to the offload entry name instead of
+  // holding a private copy of the name as a std::string.
+  std::string Name;
+  uint32_t Size;
+  void *Ptr;
 };
 
 /// Subclass of GlobalTy that holds the memory for a global of \p Ty.
-template <typename Ty> class StaticGlobalTy : public GlobalTy {
-  Ty Data;
-
-public:
+template <typename Ty> struct StaticGlobalTy : public GlobalTy {
   template <typename... Args>
   StaticGlobalTy(const std::string &Name, Args &&...args)
       : GlobalTy(Name, sizeof(Ty), &Data),
@@ -80,6 +77,9 @@
   Ty &getValue() { return Data; }
   const Ty &getValue() const { return Data; }
   void setValue(const Ty &V) { Data = V; }
+
+private:
+  Ty Data;
 };
 
 /// Helper class to do the heavy lifting when it comes to moving globals between
@@ -96,6 +96,12 @@
   const ELF64LEObjectFile *
   getOrCreateELFObjectFile(const GenericDeviceTy &Device, DeviceImageTy &Image);
 
+  /// Extract the global's information from the ELF image, section, and symbol.
+  virtual Error getGlobalMetadataFromELF(const DeviceImageTy &Image,
+                                         const ELF64LE::Sym &Symbol,
+                                         const ELF64LE::Shdr &Section,
+                                         GlobalTy &ImageGlobal);
+
   /// Actually move memory between host and device. See readGlobalFromDevice and
   /// writeGlobalToDevice for the interface description.
   Error moveGlobalBetweenDeviceAndHost(GenericDeviceTy &Device,
Index: openmp/libomptarget/plugins-nextgen/common/PluginInterface/GlobalHandler.cpp
===================================================================
--- openmp/libomptarget/plugins-nextgen/common/PluginInterface/GlobalHandler.cpp
+++ openmp/libomptarget/plugins-nextgen/common/PluginInterface/GlobalHandler.cpp
@@ -46,6 +46,21 @@
   return &Result.first->second;
 }
 
+Error GenericGlobalHandlerTy::getGlobalMetadataFromELF(
+    const DeviceImageTy &Image, const ELF64LE::Sym &Symbol,
+    const ELF64LE::Shdr &Section, GlobalTy &ImageGlobal) {
+
+  // The global's address is computed as the image begin + the ELF section
+  // offset + the ELF symbol value.
+  ImageGlobal.setPtr((char *)Image.getStart() + Section.sh_offset +
+                     Symbol.st_value);
+
+  // Set the global's size.
+  ImageGlobal.setSize(Symbol.st_size);
+
+  return Plugin::success();
+}
+
 Error GenericGlobalHandlerTy::moveGlobalBetweenDeviceAndHost(
     GenericDeviceTy &Device, DeviceImageTy &Image, const GlobalTy &HostGlobal,
     bool Device2Host) {
@@ -111,19 +126,14 @@
            ImageGlobal.getName().data());
 
   // Get the section to which the symbol belongs.
-  auto SymSecOrErr = ELFObj->getELFFile().getSection((*SymOrErr)->st_shndx);
-  if (!SymSecOrErr)
+  auto SecOrErr = ELFObj->getELFFile().getSection((*SymOrErr)->st_shndx);
+  if (!SecOrErr)
     return Plugin::error("Failed to get ELF section from global '%s': %s",
                          ImageGlobal.getName().data(),
-                         toString(SymOrErr.takeError()).data());
+                         toString(SecOrErr.takeError()).data());
 
-  // Save the global symbol's address and size. The address of the global is the
-  // image base address + the section offset + the symbol value.
-  ImageGlobal.setPtr((char *)Image.getStart() + (*SymSecOrErr)->sh_offset +
-                     (*SymOrErr)->st_value);
-  ImageGlobal.setSize((*SymOrErr)->st_size);
-
-  return Plugin::success();
+  // Set up the global symbol's address and size.
+  return getGlobalMetadataFromELF(Image, **SymOrErr, **SecOrErr, ImageGlobal);
 }
 
 Error GenericGlobalHandlerTy::readGlobalFromImage(GenericDeviceTy &Device,
Index: openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
===================================================================
--- openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
+++ openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
@@ -65,6 +65,9 @@
     return reinterpret_cast<Ty &>(AsyncInfoPtr->Queue);
   }
 
+  /// Indicate whether there is a queue.
+  bool hasQueue() const { return (AsyncInfoPtr->Queue != nullptr); }
+
 private:
   Error &Err;
   ErrorAsOutParameter ErrOutParam;
@@ -444,31 +447,21 @@
 /// implement the necessary virtual function members.
 struct GenericPluginTy {
 
-  /// Construct a plugin instance. The number of active instances should be
-  /// always be either zero or one.
-  GenericPluginTy() : RequiresFlags(OMP_REQ_UNDEFINED), GlobalHandler(nullptr) {
-    ++NumActiveInstances;
-  }
+  /// Construct a plugin instance.
+  GenericPluginTy()
+      : RequiresFlags(OMP_REQ_UNDEFINED), GlobalHandler(nullptr) {}
 
-  /// Destroy the plugin instance and release all its resources. Also decrease
-  /// the number of instances.
-  virtual ~GenericPluginTy() {
-    // There is no global handler if no device is available.
-    if (GlobalHandler)
-      delete GlobalHandler;
-
-    // Deinitialize all active devices.
-    for (int32_t DeviceId = 0; DeviceId < NumDevices; ++DeviceId) {
-      if (Devices[DeviceId]) {
-        if (auto Err = deinitDevice(DeviceId))
-          REPORT("Failure to deinitialize device %d: %s\n", DeviceId,
-                 toString(std::move(Err)).data());
-      }
-      assert(!Devices[DeviceId] && "Device was not deinitialized");
-    }
+  virtual ~GenericPluginTy() {}
 
-    --NumActiveInstances;
-  }
+  /// Initialize the plugin.
+  Error init();
+
+  /// Initialize the plugin and return the number of available devices.
+  virtual Expected<int32_t> initImpl() = 0;
+
+  /// Deinitialize the plugin and release the resources.
+  Error deinit();
+  virtual Error deinitImpl() = 0;
 
   /// Get the reference to the device with a certain device id.
   GenericDeviceTy &getDevice(int32_t DeviceId) {
@@ -522,26 +515,7 @@
   /// Indicate whether the plugin supports empty images.
   virtual bool supportsEmptyImages() const { return false; }
 
-  /// Indicate whether there is any active plugin instance.
-  static bool hasAnyActiveInstance() {
-    assert(NumActiveInstances <= 1 && "Invalid number of instances");
-    return (NumActiveInstances > 0);
-  }
-
 protected:
-  /// Initialize the plugin and prepare for initializing its devices.
-  void init(int NumDevices, GenericGlobalHandlerTy *GlobalHandler) {
-    this->NumDevices = NumDevices;
-    this->GlobalHandler = GlobalHandler;
-
-    assert(Devices.size() == 0 && "Plugin already intialized");
-
-    Devices.resize(NumDevices, nullptr);
-  }
-
-  /// Create a new device with a specific device id.
-  virtual GenericDeviceTy &createDevice(int32_t DeviceId) = 0;
-
   /// Indicate whether a device id is valid.
   bool isValidDeviceId(int32_t DeviceId) const {
     return (DeviceId >= 0 && DeviceId < getNumDevices());
@@ -565,41 +539,98 @@
   /// Internal allocator for different structures.
   BumpPtrAllocator Allocator;
-
-  /// Indicates the number of active plugin instances. Actually, we should only
-  /// have one active instance per plugin library. But we use a counter for
-  /// simplicity.
-  static uint32_t NumActiveInstances;
 };
 
 /// Class for simplifying the getter operation of the plugin. Anywhere on the
-/// code, the current plugin can be retrieved by Plugin::get(). The init(),
-/// deinit(), get() and check() functions should be defined by each plugin
-/// implementation.
+/// code, the current plugin can be retrieved by Plugin::get(). The class also
+/// declares functions to create plugin-specific object instances. The check(),
+/// createPlugin(), createDevice() and createGlobalHandler() functions should be
+/// defined by each plugin implementation.
 class Plugin {
-  /// Avoid instances of this class.
-  Plugin() {}
+  // Reference to the plugin instance.
+  static GenericPluginTy *SpecificPlugin;
+
+  Plugin() {
+    if (auto Err = init())
+      REPORT("Failed to initialize plugin: %s\n",
+             toString(std::move(Err)).data());
+  }
+
+  ~Plugin() {
+    if (auto Err = deinit())
+      REPORT("Failed to deinitialize plugin: %s\n",
+             toString(std::move(Err)).data());
+  }
+
   Plugin(const Plugin &) = delete;
   void operator=(const Plugin &) = delete;
 
+  /// Create and initialize the plugin instance.
+  static Error init() {
+    assert(!SpecificPlugin && "Plugin already created");
+
+    // Create the specific plugin.
+    SpecificPlugin = createPlugin();
+    assert(SpecificPlugin && "Plugin was not created");
+
+    // Initialize the plugin.
+    return SpecificPlugin->init();
+  }
+
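The Plugin class above is a function-local-static singleton. A minimal standalone sketch of the pattern it adopts (simplified names; error reporting omitted):

// The first call to instance() constructs the wrapper (thread-safely, per
// C++11 static-initialization rules), which creates and initializes the
// plugin; the wrapper's destructor tears the plugin down when the plugin
// library is unloaded.
struct SketchPlugin {
  SketchPlugin() { /* createPlugin() + init() in the real code */ }
  ~SketchPlugin() { /* deinit() + delete in the real code */ }
};

SketchPlugin &instance() {
  static SketchPlugin P; // constructed once, on first use
  return P;
}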
+  /// Deinitialize and destroy the plugin instance.
+  static Error deinit() {
+    assert(SpecificPlugin && "Plugin no longer valid");
+
+    // Deinitialize the plugin.
+    if (auto Err = SpecificPlugin->deinit())
+      return Err;
+
+    // Delete the plugin instance.
+    delete SpecificPlugin;
+
+    // Invalidate the plugin reference.
+    SpecificPlugin = nullptr;
+
+    return Plugin::success();
+  }
+
 public:
-  /// Initialize the plugin if it was not initialized yet.
-  static Error init();
+  /// Initialize the plugin if needed. The plugin could have been initialized
+  /// by a previous call to Plugin::get().
+  static Error initIfNeeded() {
+    // Trigger the initialization if needed.
+    get();
 
-  /// Deinitialize the plugin if it was not deinitialized yet.
-  static Error deinit();
+    return Error::success();
+  }
+
+  /// Deinitialize the plugin if needed. The plugin may already have been
+  /// deinitialized because the plugin library is exiting.
+  static Error deinitIfNeeded() {
+    // Do nothing. The plugin is deinitialized automatically.
+    return Plugin::success();
+  }
 
   /// Get a reference (or create if it was not created) to the plugin instance.
-  static GenericPluginTy &get();
+  static GenericPluginTy &get() {
+    // This static variable will initialize the underlying plugin instance in
+    // case there was no previous explicit initialization. The initialization
+    // is thread-safe.
+    static Plugin Plugin;
+
+    assert(SpecificPlugin && "Plugin is not active");
+    return *SpecificPlugin;
+  }
 
   /// Get a reference to the plugin with a specific plugin-specific type.
   template <typename Ty> static Ty &get() { return static_cast<Ty &>(get()); }
 
-  /// Indicate if the plugin is currently active. Actually, we check if there is
-  /// any active instances.
-  static bool isActive() { return GenericPluginTy::hasAnyActiveInstance(); }
+  /// Indicate whether the plugin is active.
+  static bool isActive() { return SpecificPlugin != nullptr; }
 
-  /// Create a success error.
+  /// Create a success error. This is the same as calling Error::success(), but
+  /// it is recommended to use this one for consistency with Plugin::error() and
+  /// Plugin::check().
   static Error success() { return Error::success(); }
 
   /// Create a string error.
@@ -617,6 +648,15 @@
   /// the plugin-specific code.
   template <typename... ArgsTy>
   static Error check(int32_t ErrorCode, const char *ErrFmt, ArgsTy... Args);
+
+  /// Create a plugin instance.
+  static GenericPluginTy *createPlugin();
+
+  /// Create a plugin-specific device.
+  static GenericDeviceTy *createDevice(int32_t DeviceId, int32_t NumDevices);
+
+  /// Create a plugin-specific global handler.
+  static GenericGlobalHandlerTy *createGlobalHandler();
 };
 
 /// Auxiliary interface class for GenericDeviceResourcePoolTy. This class acts
@@ -626,13 +666,13 @@
 /// create a new resource on the ctor, but on the create() function instead.
 struct GenericDeviceResourceRef {
   /// Create a new resource and stores a reference.
-  virtual Error create() = 0;
+  virtual Error create(GenericDeviceTy &Device) = 0;
 
   /// Destroy and release the resources pointed by the reference.
-  virtual Error destroy() = 0;
+  virtual Error destroy(GenericDeviceTy &Device) = 0;
 
 protected:
-  ~GenericDeviceResourceRef() = default;
+  ~GenericDeviceResourceRef() {}
 };
 
 /// Class that implements a resource pool belonging to a device. This class
@@ -679,6 +719,10 @@
   /// Get resource from the pool or create new resources.
   ResourceRef getResource() {
     const std::lock_guard<std::mutex> Lock(Mutex);
+
+    assert(NextAvailable <= ResourcePool.size() &&
+           "Resource pool is corrupted");
+
     if (NextAvailable == ResourcePool.size()) {
       // By default we double the resource pool every time.
if (auto Err = ResourcePoolTy::resizeResourcePool(NextAvailable * 2)) { @@ -694,6 +738,8 @@ /// Return resource to the pool. void returnResource(ResourceRef Resource) { const std::lock_guard Lock(Mutex); + + assert(NextAvailable > 0 && "Resource pool is corrupted"); ResourcePool[--NextAvailable] = Resource; } @@ -709,13 +755,13 @@ if (OldSize < NewSize) { // Create new resources. for (uint32_t I = OldSize; I < NewSize; ++I) { - if (auto Err = ResourcePool[I].create()) + if (auto Err = ResourcePool[I].create(Device)) return Err; } } else { // Destroy the obsolete resources. for (uint32_t I = NewSize; I < OldSize; ++I) { - if (auto Err = ResourcePool[I].destroy()) + if (auto Err = ResourcePool[I].destroy(Device)) return Err; } } Index: openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp =================================================================== --- openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp +++ openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp @@ -23,13 +23,13 @@ using namespace target; using namespace plugin; -uint32_t GenericPluginTy::NumActiveInstances = 0; +GenericPluginTy *Plugin::SpecificPlugin = nullptr; AsyncInfoWrapperTy::~AsyncInfoWrapperTy() { // If we used a local async info object we want synchronous behavior. // In that case, and assuming the current status code is OK, we will // synchronize explicitly when the object is deleted. - if (AsyncInfoPtr == &LocalAsyncInfo && !Err) + if (AsyncInfoPtr == &LocalAsyncInfo && LocalAsyncInfo.Queue && !Err) Err = Device.synchronize(&LocalAsyncInfo); } @@ -236,12 +236,17 @@ DeviceEnvironment.DynamicMemSize = OMPX_SharedMemorySize; // Create the metainfo of the device environment global. - GlobalTy DeviceEnvGlobal("omptarget_device_environment", - sizeof(DeviceEnvironmentTy), &DeviceEnvironment); + GlobalTy DevEnvGlobal("omptarget_device_environment", + sizeof(DeviceEnvironmentTy), &DeviceEnvironment); // Write device environment values to the device. - GenericGlobalHandlerTy &GlobalHandler = Plugin.getGlobalHandler(); - return GlobalHandler.writeGlobalToDevice(*this, Image, DeviceEnvGlobal); + GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler(); + if (auto Err = GHandler.writeGlobalToDevice(*this, Image, DevEnvGlobal)) { + DP("Missing symbol %s, continue execution anyway.\n", + DevEnvGlobal.getName().data()); + consumeError(std::move(Err)); + } + return Plugin::success(); } Error GenericDeviceTy::registerOffloadEntries(DeviceImageTy &Image) { @@ -447,33 +452,6 @@ return initDeviceInfoImpl(DeviceInfo); } -Error GenericPluginTy::initDevice(int32_t DeviceId) { - assert(!Devices[DeviceId] && "Device already initialized"); - - // Create the device and save the reference. - GenericDeviceTy &Device = createDevice(DeviceId); - Devices[DeviceId] = &Device; - - // Initialize the device and its resources. - return Device.init(*this); -} - -Error GenericPluginTy::deinitDevice(int32_t DeviceId) { - // The device may be already deinitialized. - if (Devices[DeviceId] == nullptr) - return Plugin::success(); - - // Deinitialize the device and release its resources. - if (auto Err = Devices[DeviceId]->deinit()) - return Err; - - // Delete the device and invalidate its reference. 
-  delete Devices[DeviceId];
-  Devices[DeviceId] = nullptr;
-
-  return Plugin::success();
-}
-
 Error GenericDeviceTy::printInfo() {
   // TODO: Print generic information here
   return printInfoImpl();
@@ -506,6 +484,72 @@
   return syncEventImpl(EventPtr);
 }
 
+Error GenericPluginTy::init() {
+  auto NumDevicesOrErr = initImpl();
+  if (!NumDevicesOrErr)
+    return NumDevicesOrErr.takeError();
+
+  NumDevices = *NumDevicesOrErr;
+  if (NumDevices == 0)
+    return Plugin::success();
+
+  assert(Devices.size() == 0 && "Plugin already initialized");
+  Devices.resize(NumDevices, nullptr);
+
+  GlobalHandler = Plugin::createGlobalHandler();
+  assert(GlobalHandler && "Invalid global handler");
+
+  return Plugin::success();
+}
+
+Error GenericPluginTy::deinit() {
+  // There is no global handler if no device is available.
+  if (GlobalHandler)
+    delete GlobalHandler;
+
+  // Deinitialize all active devices.
+  for (int32_t DeviceId = 0; DeviceId < NumDevices; ++DeviceId) {
+    if (Devices[DeviceId]) {
+      if (auto Err = deinitDevice(DeviceId))
+        return Err;
+    }
+    assert(!Devices[DeviceId] && "Device was not deinitialized");
+  }
+
+  // Perform the final plugin-specific deinitialization.
+  return deinitImpl();
+}
+
+Error GenericPluginTy::initDevice(int32_t DeviceId) {
+  assert(!Devices[DeviceId] && "Device already initialized");
+
+  // Create the device and save the reference.
+  GenericDeviceTy *Device = Plugin::createDevice(DeviceId, NumDevices);
+  assert(Device && "Invalid device");
+
+  // Save the device reference into the list.
+  Devices[DeviceId] = Device;
+
+  // Initialize the device and its resources.
+  return Device->init(*this);
+}
+
+Error GenericPluginTy::deinitDevice(int32_t DeviceId) {
+  // The device may be already deinitialized.
+  if (Devices[DeviceId] == nullptr)
+    return Plugin::success();
+
+  // Deinitialize the device and release its resources.
+  if (auto Err = Devices[DeviceId]->deinit())
+    return Err;
+
+  // Delete the device and invalidate its reference.
+  delete Devices[DeviceId];
+  Devices[DeviceId] = nullptr;
+
+  return Plugin::success();
+}
+
 /// Exposed library API function, basically wrappers around the GenericDeviceTy
 /// functionality with the same name. All non-async functions are redirected
 /// to the async versions right away with a NULL AsyncInfoPtr.
@@ -514,7 +558,7 @@
 #endif
 
 int32_t __tgt_rtl_init_plugin() {
-  auto Err = Plugin::init();
+  auto Err = Plugin::initIfNeeded();
   if (Err)
     REPORT("Failure to initialize plugin " GETNAME(TARGET_NAME) ": %s\n",
            toString(std::move(Err)).data());
@@ -523,7 +567,7 @@
 }
 
 int32_t __tgt_rtl_deinit_plugin() {
-  auto Err = Plugin::deinit();
+  auto Err = Plugin::deinitIfNeeded();
   if (Err)
     REPORT("Failure to deinitialize plugin " GETNAME(TARGET_NAME) ": %s\n",
            toString(std::move(Err)).data());
Index: openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
===================================================================
--- openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
+++ openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
@@ -97,7 +97,7 @@
 
   /// Create a new stream and save the reference. The reference must be empty
   /// before calling to this function.
-  Error create() override {
+  Error create(GenericDeviceTy &Device) override {
     if (Stream)
       return Plugin::error("Creating an existing stream");
 
@@ -110,7 +110,7 @@
 
   /// Destroy the referenced stream and invalidate the reference. The reference
   /// must be to a valid stream before calling to this function.
-  Error destroy() override {
+  Error destroy(GenericDeviceTy &Device) override {
     if (!Stream)
       return Plugin::error("Destroying an invalid stream");
 
@@ -140,7 +140,7 @@
 
   /// Create a new event and save the reference. The reference must be empty
   /// before calling to this function.
-  Error create() override {
+  Error create(GenericDeviceTy &Device) override {
     if (Event)
       return Plugin::error("Creating an existing event");
 
@@ -153,7 +153,7 @@
 
   /// Destroy the referenced event and invalidate the reference. The reference
   /// must be to a valid event before calling to this function.
-  Error destroy() override {
+  Error destroy(GenericDeviceTy &Device) override {
     if (!Event)
       return Plugin::error("Destroying an invalid event");
 
@@ -848,59 +848,50 @@
 
 /// Class implementing the CUDA-specific functionalities of the plugin.
 struct CUDAPluginTy final : public GenericPluginTy {
-  /// Create a CUDA plugin and initialize the CUDA driver.
-  CUDAPluginTy() : GenericPluginTy() {
+  /// Create a CUDA plugin.
+  CUDAPluginTy() : GenericPluginTy() {}
+
+  /// This class should not be copied.
+  CUDAPluginTy(const CUDAPluginTy &) = delete;
+  CUDAPluginTy(CUDAPluginTy &&) = delete;
+
+  /// Initialize the plugin and return the number of devices.
+  Expected<int32_t> initImpl() override {
     CUresult Res = cuInit(0);
     if (Res == CUDA_ERROR_INVALID_HANDLE) {
       // Cannot call cuGetErrorString if dlsym failed.
       DP("Failed to load CUDA shared library\n");
-      return;
+      return 0;
     }
 
     if (Res == CUDA_ERROR_NO_DEVICE) {
       // Do not initialize if there are no devices.
       DP("There are no devices supporting CUDA.\n");
-      return;
+      return 0;
     }
 
-    if (auto Err = Plugin::check(Res, "Error in cuInit: %s")) {
-      REPORT("%s\n", toString(std::move(Err)).data());
-      return;
-    }
+    if (auto Err = Plugin::check(Res, "Error in cuInit: %s"))
+      return std::move(Err);
 
     // Get the number of devices.
     int NumDevices;
     Res = cuDeviceGetCount(&NumDevices);
-    if (auto Err = Plugin::check(Res, "Error in cuDeviceGetCount: %s")) {
-      REPORT("%s\n", toString(std::move(Err)).data());
-      return;
-    }
+    if (auto Err = Plugin::check(Res, "Error in cuDeviceGetCount: %s"))
+      return std::move(Err);
 
     // Do not initialize if there are no devices.
-    if (NumDevices == 0) {
+    if (NumDevices == 0)
       DP("There are no devices supporting CUDA.\n");
-      return;
-    }
 
-    // Initialize the generic plugin structure.
-    GenericPluginTy::init(NumDevices, new CUDAGlobalHandlerTy());
+    return NumDevices;
   }
 
-  /// This class should not be copied.
-  CUDAPluginTy(const CUDAPluginTy &) = delete;
-  CUDAPluginTy(CUDAPluginTy &&) = delete;
-
-  ~CUDAPluginTy() {}
+  /// Deinitialize the plugin.
+  Error deinitImpl() override { return Plugin::success(); }
 
   /// Get the ELF code for recognizing the compatible image binary.
   uint16_t getMagicElfBits() const override { return ELF::EM_CUDA; }
 
-  /// Create a CUDA device with a specific id.
-  CUDADeviceTy &createDevice(int32_t DeviceId) override {
-    CUDADeviceTy *Device = new CUDADeviceTy(DeviceId, getNumDevices());
-    return *Device;
-  }
-
   /// Check whether the image is compatible with the available CUDA devices.
   Expected<bool> isImageCompatible(__tgt_image_info *Info) const override {
     for (int32_t DevId = 0; DevId < getNumDevices(); ++DevId) {
@@ -1002,32 +993,14 @@
     return Plugin::check(Res, "Error in cuMemcpyDtoDAsync: %s");
   }
 
-Error Plugin::init() {
-  // Call the getter to intialize the CUDA plugin.
-  get();
-  return Plugin::success();
-}
-
-Error Plugin::deinit() {
-  // The CUDA plugin and the CUDA driver should already be deinitialized
-  // at this point. So do nothing for this plugin.
-  if (Plugin::isActive())
-    return Plugin::error("CUDA plugin is not deinitialized");
+GenericPluginTy *Plugin::createPlugin() { return new CUDAPluginTy(); }
 
-  return Plugin::success();
+GenericDeviceTy *Plugin::createDevice(int32_t DeviceId, int32_t NumDevices) {
+  return new CUDADeviceTy(DeviceId, NumDevices);
 }
 
-GenericPluginTy &Plugin::get() {
-  // The CUDA plugin instance is built the first time that Plugin::get() is
-  // called thanks to the following static variable. The ideal implementation
-  // would initialize the plugin in Plugin::init() (__tgt_rtl_plugin_init) and
-  // destroy it in Plugin::deinit() (__tgt_rtl_plugin_deinit). However, at the
-  // time Plugin::deinit() is called, the CUDA driver is already shut down. That
-  // is caused by the fact that __tgt_rtl_plugin_deinit is called from a dtor
-  // in libomptarget. Thus, this is a workaround until that aspect is fixed.
-  static CUDAPluginTy CUDAPlugin;
-  assert(Plugin::isActive() && "Plugin is not active");
-  return CUDAPlugin;
+GenericGlobalHandlerTy *Plugin::createGlobalHandler() {
+  return new CUDAGlobalHandlerTy();
 }
 
 template <typename... ArgsTy>
Index: openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
===================================================================
--- openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
+++ openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
@@ -333,28 +333,22 @@
 
 /// Class implementing the plugin functionalities for GenELF64.
 struct GenELF64PluginTy final : public GenericPluginTy {
-  /// Create the plugin.
-  GenELF64PluginTy() : GenericPluginTy() {
-    // Initialize the generic plugin structure with multiple devices and a
-    // global handler.
-    GenericPluginTy::init(NUM_DEVICES, new GenELF64GlobalHandlerTy());
-  }
+  /// Create the GenELF64 plugin.
+  GenELF64PluginTy() : GenericPluginTy() {}
 
   /// This class should not be copied.
   GenELF64PluginTy(const GenELF64PluginTy &) = delete;
   GenELF64PluginTy(GenELF64PluginTy &&) = delete;
 
-  ~GenELF64PluginTy() {}
+  /// Initialize the plugin and return the number of devices.
+  Expected<int32_t> initImpl() override { return NUM_DEVICES; }
+
+  /// Deinitialize the plugin.
+  Error deinitImpl() override { return Plugin::success(); }
 
   /// Get the ELF code to recognize the compatible binary images.
   uint16_t getMagicElfBits() const override { return TARGET_ELF_ID; }
 
-  /// Create a GenELF64 device with a specific id.
-  GenELF64DeviceTy &createDevice(int32_t DeviceId) override {
-    GenELF64DeviceTy *Device = new GenELF64DeviceTy(DeviceId, getNumDevices());
-    return *Device;
-  }
-
   /// This plugin does not support exchanging data between two devices.
   bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) override {
     return false;
@@ -366,24 +360,14 @@
   }
 };
 
-Error Plugin::init() {
-  // Call the getter to intialize the GenELF64 plugin.
-  get();
-  return Plugin::success();
-}
-
-Error Plugin::deinit() {
-  // The Generic ELF64 plugin should already be deinitialized at this point.
-  if (Plugin::isActive())
-    return Plugin::error("Generic ELF64 plugin is not deinitialized");
+GenericPluginTy *Plugin::createPlugin() { return new GenELF64PluginTy(); }
 
-  return Plugin::success();
+GenericDeviceTy *Plugin::createDevice(int32_t DeviceId, int32_t NumDevices) {
+  return new GenELF64DeviceTy(DeviceId, NumDevices);
 }
 
-GenericPluginTy &Plugin::get() {
-  static GenELF64PluginTy GenELF64Plugin;
-  assert(Plugin::isActive() && "Plugin is not active");
-  return GenELF64Plugin;
+GenericGlobalHandlerTy *Plugin::createGlobalHandler() {
+  return new GenELF64GlobalHandlerTy();
 }
 
 template <typename... ArgsTy>
Index: openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.h
===================================================================
--- openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.h
+++ openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.h
@@ -63,6 +63,7 @@
 typedef enum {
   HSA_AGENT_INFO_NAME = 0,
   HSA_AGENT_INFO_VENDOR_NAME = 1,
+  HSA_AGENT_INFO_FEATURE = 2,
   HSA_AGENT_INFO_PROFILE = 4,
   HSA_AGENT_INFO_WAVEFRONT_SIZE = 6,
   HSA_AGENT_INFO_WORKGROUP_MAX_DIM = 7,
@@ -150,6 +151,11 @@
   HSA_QUEUE_TYPE_SINGLE = 1,
 } hsa_queue_type_t;
 
+typedef enum {
+  HSA_AGENT_FEATURE_KERNEL_DISPATCH = 1,
+  HSA_AGENT_FEATURE_AGENT_DISPATCH = 2
+} hsa_agent_feature_t;
+
 typedef uint32_t hsa_queue_type32_t;
 
 typedef struct hsa_queue_s {