diff --git a/openmp/libomptarget/plugins/CMakeLists.txt b/openmp/libomptarget/plugins/CMakeLists.txt --- a/openmp/libomptarget/plugins/CMakeLists.txt +++ b/openmp/libomptarget/plugins/CMakeLists.txt @@ -66,6 +66,7 @@ endmacro() add_subdirectory(aarch64) +add_subdirectory(amdgpu) add_subdirectory(cuda) add_subdirectory(ppc64) add_subdirectory(ppc64le) diff --git a/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt b/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt @@ -0,0 +1,84 @@ +##===----------------------------------------------------------------------===## +# +# The LLVM Compiler Infrastructure +# +# This file is dual licensed under the MIT and the University of Illinois Open +# Source Licenses. See LICENSE.txt for details. +# +##===----------------------------------------------------------------------===## +# +# Build a plugin for an AMDGPU machine if available. +# +##===----------------------------------------------------------------------===## + +################################################################################ + +if(NOT LIBOMPTARGET_DEP_LIBELF_FOUND) + libomptarget_say("Not building AMDGPU plugin: LIBELF not found") + return() +endif() + +if(NOT ROCM_DIR) + libomptarget_say("Not building AMDGPU plugin: ROCM_DIR is not set") + return() +endif() + +set(LIBOMPTARGET_DEP_LIBHSA_INCLUDE_DIRS ${ROCM_DIR}/hsa/include ${ROCM_DIR}/hsa/include/hsa) +set(LIBOMPTARGET_DEP_LIBHSA_LIBRARIES_DIRS ${ROCM_DIR}/hsa/lib) +set(LIBOMPTARGET_DEP_LIBHSAKMT_LIBRARIES_DIRS ${ROCM_DIR}/lib) + +mark_as_advanced( LIBOMPTARGET_DEP_LIBHSA_INCLUDE_DIRS LIBOMPTARGET_DEP_LIBHSA_LIBRARIES_DIRS) + +if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux") + libomptarget_say("Not building amdgpu plugin: only support amdgpu in Linux x86_64, ppc64le, or aarch64 hosts.") + return() +endif() +libomptarget_say("Building amdgpu offloading plugin using ROCM_DIR = ${ROCM_DIR}") + +libomptarget_say("LIBOMPTARGET_DEP_LIBHSA_INCLUDE_DIRS: ${LIBOMPTARGET_DEP_LIBHSA_INCLUDE_DIRS}") +libomptarget_say("LIBOMPTARGET_DEP_LIBHSA_LIBRARIES_DIRS ${LIBOMPTARGET_DEP_LIBHSA_LIBRARIES_DIRS}") +libomptarget_say("LIBOMPTARGET_DEP_LIBHSAKMT_LIBRARIES_DIRS: ${LIBOMPTARGET_DEP_LIBHSAKMT_LIBRARIES_DIRS}") + +################################################################################ +# Define the suffix for the runtime messaging dumps. +add_definitions(-DTARGET_NAME=AMDGPU) +if(CMAKE_SYSTEM_PROCESSOR MATCHES "(ppc64le)|(aarch64)$") + add_definitions(-DLITTLEENDIAN_CPU=1) +endif() + +if(CMAKE_BUILD_TYPE MATCHES Debug) + add_definitions(-DDEBUG) +endif() + +include_directories( + ${LIBOMPTARGET_DEP_LIBHSA_INCLUDE_DIRS} + ${CMAKE_CURRENT_SOURCE_DIR}/impl +) + +add_library(omptarget.rtl.amdgpu SHARED + impl/atmi.cpp + impl/atmi_interop_hsa.cpp + impl/data.cpp + impl/machine.cpp + impl/system.cpp + impl/utils.cpp + impl/msgpack.cpp + src/rtl.cpp + ) + +# Install plugin under the lib destination folder. +# When we build for debug, OPENMP_LIBDIR_SUFFIX get set to -debug +install(TARGETS omptarget.rtl.amdgpu LIBRARY DESTINATION "lib${OPENMP_LIBDIR_SUFFIX}") + +target_link_libraries( + omptarget.rtl.amdgpu + -lpthread -ldl -Wl,-rpath,${OPENMP_INSTALL_LIBDIR} + -L${LIBOMPTARGET_DEP_LIBHSA_LIBRARIES_DIRS} -L${LIBOMPTARGET_DEP_LIBHSAKMT_LIBRARIES_DIRS} -lhsa-runtime64 -lhsakmt -Wl,-rpath,${LIBOMPTARGET_DEP_LIBHSA_LIBRARIES_DIRS},-rpath,${LIBOMPTARGET_DEP_LIBHSAKMT_LIBRARIES_DIRS} + -lelf + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports" + "-Wl,-z,defs" + ) + +# Report to the parent scope that we are building a plugin for amdgpu +set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} amdgcn-amd-amdhsa" PARENT_SCOPE) + diff --git a/openmp/libomptarget/plugins/amdgpu/impl/atmi.h b/openmp/libomptarget/plugins/amdgpu/impl/atmi.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins/amdgpu/impl/atmi.h @@ -0,0 +1,203 @@ +/*===-------------------------------------------------------------------------- + * ATMI (Asynchronous Task and Memory Interface) + * + * This file is distributed under the MIT License. See LICENSE.txt for details. + *===------------------------------------------------------------------------*/ +#ifndef INCLUDE_ATMI_H_ +#define INCLUDE_ATMI_H_ + +#define ROCM_VERSION_MAJOR 3 +#define ROCM_VERSION_MINOR 2 + +/** \defgroup enumerations Enumerated Types + * @{ + */ + +/** + * @brief Status codes. + */ +typedef enum atmi_status_t { + /** + * The function has been executed successfully. + */ + ATMI_STATUS_SUCCESS = 0, + /** + * A undocumented error has occurred. + */ + ATMI_STATUS_UNKNOWN = 1, + /** + * A generic error has occurred. + */ + ATMI_STATUS_ERROR = 2, +} atmi_status_t; + +/** + * @brief Device Types. + */ +typedef enum atmi_devtype_s { + ATMI_DEVTYPE_CPU = 0x0001, + ATMI_DEVTYPE_iGPU = 0x0010, // Integrated GPU + ATMI_DEVTYPE_dGPU = 0x0100, // Discrete GPU + ATMI_DEVTYPE_GPU = ATMI_DEVTYPE_iGPU | ATMI_DEVTYPE_dGPU, // Any GPU + ATMI_DEVTYPE_ALL = 0x111 // Union of all device types +} atmi_devtype_t; + +/** + * @brief Memory Access Type. + */ +typedef enum atmi_memtype_s { + ATMI_MEMTYPE_FINE_GRAINED = 0, + ATMI_MEMTYPE_COARSE_GRAINED = 1, + ATMI_MEMTYPE_ANY +} atmi_memtype_t; + +/** + * @brief ATMI Memory Fences for Tasks. + */ +typedef enum atmi_task_fence_scope_s { + /** + * No memory fence applied; external fences have to be applied around the task + * launch/completion. + */ + ATMI_FENCE_SCOPE_NONE = 0, + /** + * The fence is applied to the device. + */ + ATMI_FENCE_SCOPE_DEVICE = 1, + /** + * The fence is applied to the entire system. + */ + ATMI_FENCE_SCOPE_SYSTEM = 2 +} atmi_task_fence_scope_t; + +/** @} */ + +/** \defgroup common Common ATMI Structures + * @{ + */ + +/** + * @brief ATMI Compute Place + */ +typedef struct atmi_place_s { + /** + * The node in a cluster where computation should occur. + * Default is node_id = 0 for local computations. + */ + unsigned int node_id; + /** + * Device type: CPU, GPU or DSP + */ + atmi_devtype_t type; + /** + * The device ordinal number ordered by runtime; -1 for any + */ + int device_id; +} atmi_place_t; + +/** + * @brief ATMI Memory Place + */ +typedef struct atmi_mem_place_s { + /** + * The node in a cluster where computation should occur. + * Default is node_id = 0 for local computations. + */ + unsigned int node_id; + /** + * Device type: CPU, GPU or DSP + */ + atmi_devtype_t dev_type; + /** + * The device ordinal number ordered by runtime; -1 for any + */ + int dev_id; + // atmi_memtype_t mem_type; // Fine grained or Coarse grained + /** + * The memory space/region ordinal number ordered by runtime; -1 for any + */ + int mem_id; +} atmi_mem_place_t; + +/** + * @brief ATMI Memory Space/region Structure + */ +typedef struct atmi_memory_s { + /** + * Memory capacity + */ + unsigned long int capacity; + /** + * Memory type + */ + atmi_memtype_t type; +} atmi_memory_t; + +/** + * @brief ATMI Device Structure + */ +typedef struct atmi_device_s { + /** + * Device type: CPU, GPU or DSP + */ + atmi_devtype_t type; + /** + * The number of compute cores + */ + unsigned int core_count; + /** + * The number of memory spaces/regions that are accessible + * from this device + */ + unsigned int memory_count; + /** + * Array of memory spaces/regions that are accessible + * from this device. + */ + atmi_memory_t *memories; +} atmi_device_t; + +/** + * @brief ATMI Machine Structure + */ +typedef struct atmi_machine_s { + /** + * The number of devices categorized by the device type + */ + unsigned int device_count_by_type[ATMI_DEVTYPE_ALL]; + /** + * The device structures categorized by the device type + */ + atmi_device_t *devices_by_type[ATMI_DEVTYPE_ALL]; +} atmi_machine_t; + +// Below are some helper macros that can be used to setup +// some of the ATMI data structures. +#define ATMI_PLACE_CPU(node, cpu_id) \ + { .node_id = node, .type = ATMI_DEVTYPE_CPU, .device_id = cpu_id } +#define ATMI_PLACE_GPU(node, gpu_id) \ + { .node_id = node, .type = ATMI_DEVTYPE_GPU, .device_id = gpu_id } +#define ATMI_MEM_PLACE_CPU(node, cpu_id) \ + { \ + .node_id = node, .dev_type = ATMI_DEVTYPE_CPU, .dev_id = cpu_id, \ + .mem_id = -1 \ + } +#define ATMI_MEM_PLACE_GPU(node, gpu_id) \ + { \ + .node_id = node, .dev_type = ATMI_DEVTYPE_GPU, .dev_id = gpu_id, \ + .mem_id = -1 \ + } +#define ATMI_MEM_PLACE_CPU_MEM(node, cpu_id, cpu_mem_id) \ + { \ + .node_id = node, .dev_type = ATMI_DEVTYPE_CPU, .dev_id = cpu_id, \ + .mem_id = cpu_mem_id \ + } +#define ATMI_MEM_PLACE_GPU_MEM(node, gpu_id, gpu_mem_id) \ + { \ + .node_id = node, .dev_type = ATMI_DEVTYPE_GPU, .dev_id = gpu_id, \ + .mem_id = gpu_mem_id \ + } +#define ATMI_MEM_PLACE(d_type, d_id, m_id) \ + { .node_id = 0, .dev_type = d_type, .dev_id = d_id, .mem_id = m_id } + +#endif // INCLUDE_ATMI_H_ diff --git a/openmp/libomptarget/plugins/amdgpu/impl/atmi.cpp b/openmp/libomptarget/plugins/amdgpu/impl/atmi.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins/amdgpu/impl/atmi.cpp @@ -0,0 +1,44 @@ +/*===-------------------------------------------------------------------------- + * ATMI (Asynchronous Task and Memory Interface) + * + * This file is distributed under the MIT License. See LICENSE.txt for details. + *===------------------------------------------------------------------------*/ +#include "rt.h" +/* + * Initialize/Finalize + */ +atmi_status_t atmi_init() { return core::Runtime::Initialize(); } + +atmi_status_t atmi_finalize() { return core::Runtime::Finalize(); } + +/* + * Machine Info + */ +atmi_machine_t *atmi_machine_get_info() { + return core::Runtime::GetMachineInfo(); +} + +/* + * Modules + */ +atmi_status_t atmi_module_register_from_memory_to_place( + void *module_bytes, size_t module_size, atmi_place_t place, + atmi_status_t (*on_deserialized_data)(void *data, size_t size, + void *cb_state), + void *cb_state) { + return core::Runtime::getInstance().RegisterModuleFromMemory( + module_bytes, module_size, place, on_deserialized_data, cb_state); +} + +/* + * Data + */ +atmi_status_t atmi_memcpy(void *dest, const void *src, size_t size) { + return core::Runtime::Memcpy(dest, src, size); +} + +atmi_status_t atmi_free(void *ptr) { return core::Runtime::Memfree(ptr); } + +atmi_status_t atmi_malloc(void **ptr, size_t size, atmi_mem_place_t place) { + return core::Runtime::Malloc(ptr, size, place); +} diff --git a/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.h b/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.h @@ -0,0 +1,86 @@ +/*===-------------------------------------------------------------------------- + * ATMI (Asynchronous Task and Memory Interface) + * + * This file is distributed under the MIT License. See LICENSE.txt for details. + *===------------------------------------------------------------------------*/ +#ifndef INCLUDE_ATMI_INTEROP_HSA_H_ +#define INCLUDE_ATMI_INTEROP_HSA_H_ + +#include "atmi_runtime.h" +#include "hsa.h" +#include "hsa_ext_amd.h" + +#ifdef __cplusplus +extern "C" { +#endif +/** \defgroup interop_hsa_functions ATMI-HSA Interop + * @{ + */ + +/** + * @brief Get the device address and size of an HSA global symbol + * + * @detail Use this function to query the device address and size of an HSA + * global symbol. + * The symbol can be set at by the compiler or by the application writer in a + * language-specific manner. This function is meaningful only after calling one + * of the @p atmi_module_register functions. + * + * @param[in] place The ATMI memory place + * + * @param[in] symbol Pointer to a non-NULL global symbol name + * + * @param[in] var_addr Pointer to a non-NULL @p void* variable that will + * hold the device address of the global symbol object. + * + * @param[in] var_size Pointer to a non-NULL @p uint variable that will + * hold the size of the global symbol object. + * + * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully. + * + * @retval ::ATMI_STATUS_ERROR If @p symbol, @p var_addr or @p var_size are + * invalid + * location in the current node, or if ATMI is not initialized. + * + * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors. + */ +atmi_status_t atmi_interop_hsa_get_symbol_info(atmi_mem_place_t place, + const char *symbol, + void **var_addr, + unsigned int *var_size); + +/** + * @brief Get the HSA-specific kernel info from a kernel name + * + * @detail Use this function to query the HSA-specific kernel info from the + * kernel name. + * This function is meaningful only after calling one + * of the @p atmi_module_register functions. + * + * @param[in] place The ATMI memory place + * + * @param[in] kernel_name Pointer to a char array with the kernel name + * + * @param[in] info The different possible kernel properties + * + * @param[in] value Pointer to a non-NULL @p uint variable that will + * hold the return value of the kernel property. + * + * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully. + * + * @retval ::ATMI_STATUS_ERROR If @p symbol, @p var_addr or @p var_size are + * invalid + * location in the current node, or if ATMI is not initialized. + * + * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors. + */ +atmi_status_t atmi_interop_hsa_get_kernel_info( + atmi_mem_place_t place, const char *kernel_name, + hsa_executable_symbol_info_t info, uint32_t *value); +/** @} */ + +#ifdef __cplusplus +} +#endif + +#endif // INCLUDE_ATMI_INTEROP_HSA_H_ diff --git a/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp b/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp @@ -0,0 +1,96 @@ +/*===-------------------------------------------------------------------------- + * ATMI (Asynchronous Task and Memory Interface) + * + * This file is distributed under the MIT License. See LICENSE.txt for details. + *===------------------------------------------------------------------------*/ +#include "atmi_interop_hsa.h" +#include "internal.h" + +using core::atl_is_atmi_initialized; + +atmi_status_t atmi_interop_hsa_get_symbol_info(atmi_mem_place_t place, + const char *symbol, + void **var_addr, + unsigned int *var_size) { + /* + // Typical usage: + void *var_addr; + size_t var_size; + atmi_interop_hsa_get_symbol_addr(gpu_place, "symbol_name", &var_addr, + &var_size); + atmi_memcpy(host_add, var_addr, var_size); + */ + + if (!atl_is_atmi_initialized()) + return ATMI_STATUS_ERROR; + atmi_machine_t *machine = atmi_machine_get_info(); + if (!symbol || !var_addr || !var_size || !machine) + return ATMI_STATUS_ERROR; + if (place.dev_id < 0 || + place.dev_id >= machine->device_count_by_type[place.dev_type]) + return ATMI_STATUS_ERROR; + + // get the symbol info + std::string symbolStr = std::string(symbol); + if (SymbolInfoTable[place.dev_id].find(symbolStr) != + SymbolInfoTable[place.dev_id].end()) { + atl_symbol_info_t info = SymbolInfoTable[place.dev_id][symbolStr]; + *var_addr = reinterpret_cast(info.addr); + *var_size = info.size; + return ATMI_STATUS_SUCCESS; + } else { + *var_addr = NULL; + *var_size = 0; + return ATMI_STATUS_ERROR; + } +} + +atmi_status_t atmi_interop_hsa_get_kernel_info( + atmi_mem_place_t place, const char *kernel_name, + hsa_executable_symbol_info_t kernel_info, uint32_t *value) { + /* + // Typical usage: + uint32_t value; + atmi_interop_hsa_get_kernel_addr(gpu_place, "kernel_name", + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, + &val); + */ + + if (!atl_is_atmi_initialized()) + return ATMI_STATUS_ERROR; + atmi_machine_t *machine = atmi_machine_get_info(); + if (!kernel_name || !value || !machine) + return ATMI_STATUS_ERROR; + if (place.dev_id < 0 || + place.dev_id >= machine->device_count_by_type[place.dev_type]) + return ATMI_STATUS_ERROR; + + atmi_status_t status = ATMI_STATUS_SUCCESS; + // get the kernel info + std::string kernelStr = std::string(kernel_name); + if (KernelInfoTable[place.dev_id].find(kernelStr) != + KernelInfoTable[place.dev_id].end()) { + atl_kernel_info_t info = KernelInfoTable[place.dev_id][kernelStr]; + switch (kernel_info) { + case HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE: + *value = info.group_segment_size; + break; + case HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE: + *value = info.private_segment_size; + break; + case HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE: + // return the size for non-implicit args + *value = info.kernel_segment_size - sizeof(atmi_implicit_args_t); + break; + default: + *value = 0; + status = ATMI_STATUS_ERROR; + break; + } + } else { + *value = 0; + status = ATMI_STATUS_ERROR; + } + + return status; +} diff --git a/openmp/libomptarget/plugins/amdgpu/impl/atmi_kl.h b/openmp/libomptarget/plugins/amdgpu/impl/atmi_kl.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins/amdgpu/impl/atmi_kl.h @@ -0,0 +1,39 @@ +/*===-------------------------------------------------------------------------- + * ATMI (Asynchronous Task and Memory Interface) + * + * This file is distributed under the MIT License. See LICENSE.txt for details. + *===------------------------------------------------------------------------*/ +#ifndef INCLUDE_ATMI_KL_H_ +#define INCLUDE_ATMI_KL_H_ + +#include "atmi.h" +#ifdef __OPENCL_C_VERSION__ +#include "ockl_hsa.h" +#endif +#define MAX_NUM_KERNELS (1024 * 16) + +typedef struct atmi_implicit_args_s { + unsigned long offset_x; + unsigned long offset_y; + unsigned long offset_z; + unsigned long hostcall_ptr; + char num_gpu_queues; + unsigned long gpu_queue_ptr; + char num_cpu_queues; + unsigned long cpu_worker_signals; + unsigned long cpu_queue_ptr; + unsigned long kernarg_template_ptr; + // possible TODO: send signal pool to be used by DAGs on GPU + // uint8_t num_signals; + // unsigned long signal_ptr; +} atmi_implicit_args_t; + +typedef struct atmi_kernel_enqueue_template_s { + unsigned long kernel_handle; + hsa_kernel_dispatch_packet_t k_packet; + hsa_agent_dispatch_packet_t a_packet; + unsigned long kernarg_segment_size; + void *kernarg_regions; +} atmi_kernel_enqueue_template_t; + +#endif // INCLUDE_ATMI_KL_H_ diff --git a/openmp/libomptarget/plugins/amdgpu/impl/atmi_runtime.h b/openmp/libomptarget/plugins/amdgpu/impl/atmi_runtime.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins/amdgpu/impl/atmi_runtime.h @@ -0,0 +1,193 @@ +/*===-------------------------------------------------------------------------- + * ATMI (Asynchronous Task and Memory Interface) + * + * This file is distributed under the MIT License. See LICENSE.txt for details. + *===------------------------------------------------------------------------*/ +#ifndef INCLUDE_ATMI_RUNTIME_H_ +#define INCLUDE_ATMI_RUNTIME_H_ + +#include "atmi.h" +#include +#include +#ifndef __cplusplus +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/** \defgroup context_functions ATMI Context Setup and Finalize + * @{ + */ +/** + * @brief Initialize the ATMI runtime environment. + * + * @detal All ATMI runtime functions will fail if this function is not called + * at least once. The user may initialize difference device types at different + * regions in the program in order for optimization purposes. + * + * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully. + * + * @retval ::ATMI_STATUS_ERROR The function encountered errors. + * + * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors. + */ +atmi_status_t atmi_init(); + +/** + * @brief Finalize the ATMI runtime environment. + * + * @detail ATMI runtime functions will fail if called after finalize. + * + * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully. + * + * @retval ::ATMI_STATUS_ERROR The function encountered errors. + * + * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors. + */ +atmi_status_t atmi_finalize(); +/** @} */ + +/** \defgroup module_functions ATMI Module + * @{ + */ + +/** + * @brief Register the ATMI code module from memory on to a specific place + * (device). + * + * @detail Currently, only GPU devices need explicit module registration because + * of their specific ISAs that require a separate compilation phase. On the + * other + * hand, CPU devices execute regular x86 functions that are compiled with the + * host program. + * + * @param[in] module_bytes A memory region that contains the GPU modules + * targeting ::AMDGCN platform types. Value cannot be NULL. + * + * @param[in] module_size Size of module region + * + * @param[in] place Denotes the execution place (device) on which the module + * should be registered and loaded. + * + * @param[in] on_deserialized_data Callback run on deserialized code object, + * before loading it + * + * @param[in] cb_state void* passed to on_deserialized_data callback + * + * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully. + * + * @retval ::ATMI_STATUS_ERROR The function encountered errors. + * + * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors. + * + */ +atmi_status_t atmi_module_register_from_memory_to_place( + void *module_bytes, size_t module_size, atmi_place_t place, + atmi_status_t (*on_deserialized_data)(void *data, size_t size, + void *cb_state), + void *cb_state); + +/** @} */ + +/** \defgroup machine ATMI Machine + * @{ + */ +/** + * @brief ATMI's device discovery function to get the current machine's + * topology. + * + * @detail The @p atmi_machine_t structure is a tree-based representation of the + * compute and memory elements in the current node. Once ATMI is initialized, + * this function can be called to retrieve the pointer to this global structure. + * + * @return Returns a pointer to a global structure of tyoe @p atmi_machine_t. + * Returns NULL if ATMI is not initialized. + */ +atmi_machine_t *atmi_machine_get_info(); +/** @} */ + +/** \defgroup memory_functions ATMI Data Management + * @{ + */ +/** + * @brief Allocate memory from the specified memory place. + * + * @detail This function allocates memory from the specified memory place. If + * the memory + * place belongs primarily to the CPU, then the memory will be accessible by + * other GPUs and CPUs in the system. If the memory place belongs primarily to a + * GPU, + * then it cannot be accessed by other devices in the system. + * + * @param[in] ptr The pointer to the memory that will be allocated. + * + * @param[in] size The size of the allocation in bytes. + * + * @param[in] place The memory place in the system to perform the allocation. + * + * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully. + * + * @retval ::ATMI_STATUS_ERROR The function encountered errors. + * + * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors. + * + */ +atmi_status_t atmi_malloc(void **ptr, size_t size, atmi_mem_place_t place); + +/** + * @brief Frees memory that was previously allocated. + * + * @detail This function frees memory that was previously allocated by calling + * @p atmi_malloc. It throws an error otherwise. It is illegal to access a + * pointer after a call to this function. + * + * @param[in] ptr The pointer to the memory that has to be freed. + * + * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully. + * + * @retval ::ATMI_STATUS_ERROR The function encountered errors. + * + * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors. + * + */ +atmi_status_t atmi_free(void *ptr); + +/** + * @brief Syncrhonously copy memory from the source to destination memory + * locations. + * + * @detail This function assumes that the source and destination regions are + * non-overlapping. The runtime determines the memory place of the source and + * the + * destination and executes the appropriate optimized data movement methodology. + * + * @param[in] dest The destination pointer previously allocated by a system + * allocator or @p atmi_malloc. + * + * @param[in] src The source pointer previously allocated by a system + * allocator or @p atmi_malloc. + * + * @param[in] size The size of the data to be copied in bytes. + * + * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully. + * + * @retval ::ATMI_STATUS_ERROR The function encountered errors. + * + * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors. + * + */ +atmi_status_t atmi_memcpy(void *dest, const void *src, size_t size); + +/** @} */ + +/** \defgroup cpu_dev_runtime ATMI CPU Device Runtime + * @{ + */ + +#ifdef __cplusplus +} +#endif + +#endif // INCLUDE_ATMI_RUNTIME_H_ diff --git a/openmp/libomptarget/plugins/amdgpu/impl/data.h b/openmp/libomptarget/plugins/amdgpu/impl/data.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins/amdgpu/impl/data.h @@ -0,0 +1,83 @@ +/*===-------------------------------------------------------------------------- + * ATMI (Asynchronous Task and Memory Interface) + * + * This file is distributed under the MIT License. See LICENSE.txt for details. + *===------------------------------------------------------------------------*/ +#ifndef SRC_RUNTIME_INCLUDE_DATA_H_ +#define SRC_RUNTIME_INCLUDE_DATA_H_ +#include "atmi.h" +#include +#include +#include +#include +#include +// we maintain our own mapping of device addr to a user specified data object +// in order to work around a (possibly historic) bug in ROCr's +// hsa_amd_pointer_info_set_userdata for variable symbols +// this is expected to be temporary + +namespace core { +// Internal representation of any data that is created and managed by ATMI. +// Data can be located on any device memory or host memory. +class ATLData { +public: + ATLData(void *ptr, size_t size, atmi_mem_place_t place) + : ptr_(ptr), size_(size), place_(place) {} + + void *ptr() const { return ptr_; } + size_t size() const { return size_; } + atmi_mem_place_t place() const { return place_; } + +private: + void *ptr_; + size_t size_; + atmi_mem_place_t place_; +}; + +//--- +struct ATLMemoryRange { + const void *base_pointer; + const void *end_pointer; + ATLMemoryRange(const void *bp, size_t size_bytes) + : base_pointer(bp), + end_pointer(reinterpret_cast(bp) + size_bytes - + 1) {} +}; + +// Functor to compare ranges: +struct ATLMemoryRangeCompare { + // Return true is LHS range is less than RHS - used to order the ranges + bool operator()(const ATLMemoryRange &lhs, const ATLMemoryRange &rhs) const { + return lhs.end_pointer < rhs.base_pointer; + } +}; + +//------------------------------------------------------------------------------------------------- +// This structure tracks information for each pointer. +// Uses memory-range-based lookups - so pointers that exist anywhere in the +// range of hostPtr + size +// will find the associated ATLPointerInfo. +// The insertions and lookups use a self-balancing binary tree and should +// support O(logN) lookup speed. +// The structure is thread-safe - writers obtain a mutex before modifying the +// tree. Multiple simulatenous readers are supported. +class ATLPointerTracker { + typedef std::map + MapTrackerType; + +public: + void insert(void *pointer, ATLData *data); + void remove(void *pointer); + ATLData *find(const void *pointer); + +private: + MapTrackerType tracker_; + std::mutex mutex_; +}; + +extern ATLPointerTracker g_data_map; // Track all am pointer allocations. + +enum class Direction { ATMI_H2D, ATMI_D2H, ATMI_D2D, ATMI_H2H }; + +} // namespace core +#endif // SRC_RUNTIME_INCLUDE_DATA_H_ diff --git a/openmp/libomptarget/plugins/amdgpu/impl/data.cpp b/openmp/libomptarget/plugins/amdgpu/impl/data.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins/amdgpu/impl/data.cpp @@ -0,0 +1,203 @@ +/*===-------------------------------------------------------------------------- + * ATMI (Asynchronous Task and Memory Interface) + * + * This file is distributed under the MIT License. See LICENSE.txt for details. + *===------------------------------------------------------------------------*/ +#include "data.h" +#include "atmi_runtime.h" +#include "internal.h" +#include "machine.h" +#include "rt.h" +#include +#include +#include +#include +#include +#include +#include +#include + +using core::TaskImpl; +extern ATLMachine g_atl_machine; +extern hsa_signal_t IdentityCopySignal; + +namespace core { +ATLPointerTracker g_data_map; // Track all am pointer allocations. +void allow_access_to_all_gpu_agents(void *ptr); + +const char *getPlaceStr(atmi_devtype_t type) { + switch (type) { + case ATMI_DEVTYPE_CPU: + return "CPU"; + case ATMI_DEVTYPE_GPU: + return "GPU"; + default: + return NULL; + } +} + +std::ostream &operator<<(std::ostream &os, const ATLData *ap) { + atmi_mem_place_t place = ap->place(); + os << " devicePointer:" << ap->ptr() << " sizeBytes:" << ap->size() + << " place:(" << getPlaceStr(place.dev_type) << ", " << place.dev_id + << ", " << place.mem_id << ")"; + return os; +} + +void ATLPointerTracker::insert(void *pointer, ATLData *p) { + std::lock_guard l(mutex_); + + DEBUG_PRINT("insert: %p + %zu\n", pointer, p->size()); + tracker_.insert(std::make_pair(ATLMemoryRange(pointer, p->size()), p)); +} + +void ATLPointerTracker::remove(void *pointer) { + std::lock_guard l(mutex_); + DEBUG_PRINT("remove: %p\n", pointer); + tracker_.erase(ATLMemoryRange(pointer, 1)); +} + +ATLData *ATLPointerTracker::find(const void *pointer) { + std::lock_guard l(mutex_); + ATLData *ret = NULL; + auto iter = tracker_.find(ATLMemoryRange(pointer, 1)); + DEBUG_PRINT("find: %p\n", pointer); + if (iter != tracker_.end()) // found + ret = iter->second; + return ret; +} + +ATLProcessor &get_processor_by_mem_place(atmi_mem_place_t place) { + int dev_id = place.dev_id; + switch (place.dev_type) { + case ATMI_DEVTYPE_CPU: + return g_atl_machine.processors()[dev_id]; + case ATMI_DEVTYPE_GPU: + return g_atl_machine.processors()[dev_id]; + } +} + +static hsa_agent_t get_mem_agent(atmi_mem_place_t place) { + return get_processor_by_mem_place(place).agent(); +} + +hsa_amd_memory_pool_t get_memory_pool_by_mem_place(atmi_mem_place_t place) { + ATLProcessor &proc = get_processor_by_mem_place(place); + return get_memory_pool(proc, place.mem_id); +} + +void register_allocation(void *ptr, size_t size, atmi_mem_place_t place) { + ATLData *data = new ATLData(ptr, size, place); + g_data_map.insert(ptr, data); + if (place.dev_type == ATMI_DEVTYPE_CPU) + allow_access_to_all_gpu_agents(ptr); + // TODO(ashwinma): what if one GPU wants to access another GPU? +} + +atmi_status_t Runtime::Malloc(void **ptr, size_t size, atmi_mem_place_t place) { + atmi_status_t ret = ATMI_STATUS_SUCCESS; + hsa_amd_memory_pool_t pool = get_memory_pool_by_mem_place(place); + hsa_status_t err = hsa_amd_memory_pool_allocate(pool, size, 0, ptr); + ErrorCheck(atmi_malloc, err); + DEBUG_PRINT("Malloced [%s %d] %p\n", + place.dev_type == ATMI_DEVTYPE_CPU ? "CPU" : "GPU", place.dev_id, + *ptr); + if (err != HSA_STATUS_SUCCESS) + ret = ATMI_STATUS_ERROR; + + register_allocation(*ptr, size, place); + + return ret; +} + +atmi_status_t Runtime::Memfree(void *ptr) { + atmi_status_t ret = ATMI_STATUS_SUCCESS; + hsa_status_t err; + ATLData *data = g_data_map.find(ptr); + if (!data) + ErrorCheck(Checking pointer info userData, + HSA_STATUS_ERROR_INVALID_ALLOCATION); + + g_data_map.remove(ptr); + delete data; + + err = hsa_amd_memory_pool_free(ptr); + ErrorCheck(atmi_free, err); + DEBUG_PRINT("Freed %p\n", ptr); + + if (err != HSA_STATUS_SUCCESS || !data) + ret = ATMI_STATUS_ERROR; + return ret; +} + +static hsa_status_t invoke_hsa_copy(void *dest, const void *src, size_t size, + hsa_agent_t agent) { + // TODO: Use thread safe signal + hsa_signal_store_release(IdentityCopySignal, 1); + + hsa_status_t err = hsa_amd_memory_async_copy(dest, agent, src, agent, size, 0, + NULL, IdentityCopySignal); + ErrorCheck(Copy async between memory pools, err); + + // TODO: async reports errors in the signal, use NE 1 + hsa_signal_wait_acquire(IdentityCopySignal, HSA_SIGNAL_CONDITION_EQ, 0, + UINT64_MAX, ATMI_WAIT_STATE); + + return err; +} + +atmi_status_t Runtime::Memcpy(void *dest, const void *src, size_t size) { + atmi_status_t ret; + hsa_status_t err; + ATLData *src_data = g_data_map.find(src); + ATLData *dest_data = g_data_map.find(dest); + atmi_mem_place_t cpu = ATMI_MEM_PLACE_CPU_MEM(0, 0, 0); + void *temp_host_ptr; + + if (src_data && !dest_data) { + // Copy from device to scratch to host + hsa_agent_t agent = get_mem_agent(src_data->place()); + DEBUG_PRINT("Memcpy D2H device agent: %lu\n", agent.handle); + ret = atmi_malloc(&temp_host_ptr, size, cpu); + if (ret != ATMI_STATUS_SUCCESS) { + return ret; + } + + err = invoke_hsa_copy(temp_host_ptr, src, size, agent); + if (err != HSA_STATUS_SUCCESS) { + return ATMI_STATUS_ERROR; + } + + memcpy(dest, temp_host_ptr, size); + + } else if (!src_data && dest_data) { + // Copy from host to scratch to device + hsa_agent_t agent = get_mem_agent(dest_data->place()); + DEBUG_PRINT("Memcpy H2D device agent: %lu\n", agent.handle); + ret = atmi_malloc(&temp_host_ptr, size, cpu); + if (ret != ATMI_STATUS_SUCCESS) { + return ret; + } + + memcpy(temp_host_ptr, src, size); + + DEBUG_PRINT("Memcpy device agent: %lu\n", agent.handle); + err = invoke_hsa_copy(dest, temp_host_ptr, size, agent); + + } else if (!src_data && !dest_data) { + DEBUG_PRINT("atmi_memcpy invoked without metadata\n"); + // would be host to host, just call memcpy, or missing metadata + return ATMI_STATUS_ERROR; + } else { + DEBUG_PRINT("atmi_memcpy unimplemented device to device copy\n"); + return ATMI_STATUS_ERROR; + } + + ret = atmi_free(temp_host_ptr); + + if (err != HSA_STATUS_SUCCESS || ret != ATMI_STATUS_SUCCESS) + ret = ATMI_STATUS_ERROR; + return ret; +} + +} // namespace core diff --git a/openmp/libomptarget/plugins/amdgpu/impl/internal.h b/openmp/libomptarget/plugins/amdgpu/impl/internal.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins/amdgpu/impl/internal.h @@ -0,0 +1,266 @@ +/*===-------------------------------------------------------------------------- + * ATMI (Asynchronous Task and Memory Interface) + * + * This file is distributed under the MIT License. See LICENSE.txt for details. + *===------------------------------------------------------------------------*/ +#ifndef SRC_RUNTIME_INCLUDE_INTERNAL_H_ +#define SRC_RUNTIME_INCLUDE_INTERNAL_H_ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hsa.h" +#include "hsa_ext_amd.h" +#include "hsa_ext_finalize.h" + +#include "atmi.h" +#include "atmi_runtime.h" +#include "rt.h" + +#define MAX_NUM_KERNELS (1024 * 16) + +typedef struct atmi_implicit_args_s { + unsigned long offset_x; + unsigned long offset_y; + unsigned long offset_z; + unsigned long hostcall_ptr; + char num_gpu_queues; + unsigned long gpu_queue_ptr; + char num_cpu_queues; + unsigned long cpu_worker_signals; + unsigned long cpu_queue_ptr; + unsigned long kernarg_template_ptr; +} atmi_implicit_args_t; + +#ifdef __cplusplus +extern "C" { +#endif + +#define check(msg, status) \ + if (status != HSA_STATUS_SUCCESS) { \ + printf("%s failed.\n", #msg); \ + exit(1); \ + } + +#ifdef DEBUG +#define DEBUG_PRINT(fmt, ...) \ + if (core::Runtime::getInstance().getDebugMode()) { \ + fprintf(stderr, "[%s:%d] " fmt, __FILE__, __LINE__, ##__VA_ARGS__); \ + } +#else +#define DEBUG_PRINT(...) \ + do { \ + } while (false) +#endif + +#ifndef HSA_RUNTIME_INC_HSA_H_ +typedef struct hsa_signal_s { + uint64_t handle; +} hsa_signal_t; +#endif + +/* All global values go in this global structure */ +typedef struct atl_context_s { + bool struct_initialized; + bool g_hsa_initialized; + bool g_gpu_initialized; + bool g_tasks_initialized; +} atl_context_t; +extern atl_context_t atlc; +extern atl_context_t *atlc_p; + +#ifdef __cplusplus +} +#endif + +/* --------------------------------------------------------------------------------- + * Simulated CPU Data Structures and API + * --------------------------------------------------------------------------------- + */ + +#define ATMI_WAIT_STATE HSA_WAIT_STATE_BLOCKED + +// ---------------------- Kernel Start ------------- +typedef struct atl_kernel_info_s { + uint64_t kernel_object; + uint32_t group_segment_size; + uint32_t private_segment_size; + uint32_t kernel_segment_size; + uint32_t num_args; + std::vector arg_alignments; + std::vector arg_offsets; + std::vector arg_sizes; +} atl_kernel_info_t; + +typedef struct atl_symbol_info_s { + uint64_t addr; + uint32_t size; +} atl_symbol_info_t; + +extern std::vector> KernelInfoTable; +extern std::vector> SymbolInfoTable; + +// ---------------------- Kernel End ------------- + +extern struct timespec context_init_time; + +namespace core { +class TaskgroupImpl; +class TaskImpl; +class Kernel; +class KernelImpl; +} // namespace core + +struct SignalPoolT { + SignalPoolT() { + // If no signals are created, and none can be created later, + // will ultimately fail at pop() + + unsigned N = 1024; // default max pool size from atmi + for (unsigned i = 0; i < N; i++) { + hsa_signal_t new_signal; + hsa_status_t err = hsa_signal_create(0, 0, NULL, &new_signal); + if (err != HSA_STATUS_SUCCESS) { + break; + } + state.push(new_signal); + } + DEBUG_PRINT("Signal Pool Initial Size: %lu\n", state.size()); + } + SignalPoolT(const SignalPoolT &) = delete; + SignalPoolT(SignalPoolT &&) = delete; + ~SignalPoolT() { + size_t N = state.size(); + for (size_t i = 0; i < N; i++) { + hsa_signal_t signal = state.front(); + state.pop(); + hsa_status_t rc = hsa_signal_destroy(signal); + if (rc != HSA_STATUS_SUCCESS) { + DEBUG_PRINT("Signal pool destruction failed\n"); + } + } + } + size_t size() { + lock l(&mutex); + return state.size(); + } + void push(hsa_signal_t s) { + lock l(&mutex); + state.push(s); + } + hsa_signal_t pop(void) { + lock l(&mutex); + if (!state.empty()) { + hsa_signal_t res = state.front(); + state.pop(); + return res; + } + + // Pool empty, attempt to create another signal + hsa_signal_t new_signal; + hsa_status_t err = hsa_signal_create(0, 0, NULL, &new_signal); + if (err == HSA_STATUS_SUCCESS) { + return new_signal; + } + + // Fail + return {0}; + } + +private: + static pthread_mutex_t mutex; + std::queue state; + struct lock { + lock(pthread_mutex_t *m) : m(m) { pthread_mutex_lock(m); } + ~lock() { pthread_mutex_unlock(m); } + pthread_mutex_t *m; + }; +}; + +extern std::vector atl_gpu_kernarg_pools; + +namespace core { +atmi_status_t atl_init_gpu_context(); + +hsa_status_t init_hsa(); +hsa_status_t finalize_hsa(); +/* + * Generic utils + */ +template inline T alignDown(T value, size_t alignment) { + return (T)(value & ~(alignment - 1)); +} + +template inline T *alignDown(T *value, size_t alignment) { + return reinterpret_cast(alignDown((intptr_t)value, alignment)); +} + +template inline T alignUp(T value, size_t alignment) { + return alignDown((T)(value + alignment - 1), alignment); +} + +template inline T *alignUp(T *value, size_t alignment) { + return reinterpret_cast( + alignDown((intptr_t)(value + alignment - 1), alignment)); +} + +extern void register_allocation(void *addr, size_t size, + atmi_mem_place_t place); +extern hsa_amd_memory_pool_t +get_memory_pool_by_mem_place(atmi_mem_place_t place); +extern bool atl_is_atmi_initialized(); + +bool handle_group_signal(hsa_signal_value_t value, void *arg); + +void packet_store_release(uint32_t *packet, uint16_t header, uint16_t rest); +uint16_t +create_header(hsa_packet_type_t type, int barrier, + atmi_task_fence_scope_t acq_fence = ATMI_FENCE_SCOPE_SYSTEM, + atmi_task_fence_scope_t rel_fence = ATMI_FENCE_SCOPE_SYSTEM); + +void allow_access_to_all_gpu_agents(void *ptr); +} // namespace core + +const char *get_error_string(hsa_status_t err); +const char *get_atmi_error_string(atmi_status_t err); + +#define ATMIErrorCheck(msg, status) \ + if (status != ATMI_STATUS_SUCCESS) { \ + printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, #msg, \ + get_atmi_error_string(status)); \ + exit(1); \ + } else { \ + /* printf("%s succeeded.\n", #msg);*/ \ + } + +#define ErrorCheck(msg, status) \ + if (status != HSA_STATUS_SUCCESS) { \ + printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, #msg, \ + get_error_string(status)); \ + exit(1); \ + } else { \ + /* printf("%s succeeded.\n", #msg);*/ \ + } + +#define ErrorCheckAndContinue(msg, status) \ + if (status != HSA_STATUS_SUCCESS) { \ + DEBUG_PRINT("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, #msg, \ + get_error_string(status)); \ + continue; \ + } else { \ + /* printf("%s succeeded.\n", #msg);*/ \ + } + +#endif // SRC_RUNTIME_INCLUDE_INTERNAL_H_ diff --git a/openmp/libomptarget/plugins/amdgpu/impl/machine.h b/openmp/libomptarget/plugins/amdgpu/impl/machine.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins/amdgpu/impl/machine.h @@ -0,0 +1,109 @@ +/*===-------------------------------------------------------------------------- + * ATMI (Asynchronous Task and Memory Interface) + * + * This file is distributed under the MIT License. See LICENSE.txt for details. + *===------------------------------------------------------------------------*/ +#ifndef SRC_RUNTIME_INCLUDE_MACHINE_H_ +#define SRC_RUNTIME_INCLUDE_MACHINE_H_ +#include "atmi.h" +#include "internal.h" +#include +#include +#include + +class ATLMemory; + +class ATLProcessor { +public: + explicit ATLProcessor(hsa_agent_t agent, + atmi_devtype_t type = ATMI_DEVTYPE_ALL) + : agent_(agent), type_(type) { + queues_.clear(); + memories_.clear(); + } + void addMemory(const ATLMemory &p); + hsa_agent_t agent() const { return agent_; } + // TODO(ashwinma): Do we need this or are we building the machine structure + // just once in the program? + // void removeMemory(ATLMemory &p); + const std::vector &memories() const; + atmi_devtype_t type() const { return type_; } + + virtual void createQueues(const int count) {} + virtual void destroyQueues(); + std::vector queues() const { return queues_; } + + int num_cus() const; + +protected: + hsa_agent_t agent_; + atmi_devtype_t type_; + std::vector queues_; + std::vector memories_; +}; + +class ATLCPUProcessor : public ATLProcessor { +public: + explicit ATLCPUProcessor(hsa_agent_t agent) + : ATLProcessor(agent, ATMI_DEVTYPE_CPU) {} + void createQueues(const int count); +}; + +class ATLGPUProcessor : public ATLProcessor { +public: + explicit ATLGPUProcessor(hsa_agent_t agent, + atmi_devtype_t type = ATMI_DEVTYPE_dGPU) + : ATLProcessor(agent, type) {} + void createQueues(const int count); +}; + +class ATLMemory { +public: + ATLMemory(hsa_amd_memory_pool_t pool, ATLProcessor p, atmi_memtype_t t) + : memory_pool_(pool), processor_(p), type_(t) {} + ATLProcessor &processor() { return processor_; } + hsa_amd_memory_pool_t memory() const { return memory_pool_; } + + atmi_memtype_t type() const { return type_; } + + void *alloc(size_t s); + void free(void *p); + +private: + hsa_amd_memory_pool_t memory_pool_; + ATLProcessor processor_; + atmi_memtype_t type_; +}; + +class ATLMachine { +public: + ATLMachine() { + cpu_processors_.clear(); + gpu_processors_.clear(); + } + template void addProcessor(const T &p); + template std::vector &processors(); + template size_t processorCount() { + return processors().size(); + } + +private: + std::vector cpu_processors_; + std::vector gpu_processors_; +}; + +hsa_amd_memory_pool_t get_memory_pool(const ATLProcessor &proc, + const int mem_id); + +extern ATLMachine g_atl_machine; +template T &get_processor(atmi_place_t place) { + int dev_id = place.device_id; + if (dev_id == -1) { + // user is asking runtime to pick a device + // TODO(ashwinma): best device of this type? pick 0 for now + dev_id = 0; + } + return g_atl_machine.processors()[dev_id]; +} + +#endif // SRC_RUNTIME_INCLUDE_MACHINE_H_ diff --git a/openmp/libomptarget/plugins/amdgpu/impl/machine.cpp b/openmp/libomptarget/plugins/amdgpu/impl/machine.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins/amdgpu/impl/machine.cpp @@ -0,0 +1,128 @@ +/*===-------------------------------------------------------------------------- + * ATMI (Asynchronous Task and Memory Interface) + * + * This file is distributed under the MIT License. See LICENSE.txt for details. + *===------------------------------------------------------------------------*/ +#include "machine.h" +#include "atmi_runtime.h" +#include "internal.h" +#include +#include +#include +#include +#include +#include + +extern ATLMachine g_atl_machine; +extern hsa_region_t atl_cpu_kernarg_region; + +void *ATLMemory::alloc(size_t sz) { + void *ret; + hsa_status_t err = hsa_amd_memory_pool_allocate(memory_pool_, sz, 0, &ret); + ErrorCheck(Allocate from memory pool, err); + return ret; +} + +void ATLMemory::free(void *ptr) { + hsa_status_t err = hsa_amd_memory_pool_free(ptr); + ErrorCheck(Allocate from memory pool, err); +} + +void ATLProcessor::addMemory(const ATLMemory &mem) { + for (auto &mem_obj : memories_) { + // if the memory already exists, then just return + if (mem.memory().handle == mem_obj.memory().handle) + return; + } + memories_.push_back(mem); +} + +const std::vector &ATLProcessor::memories() const { + return memories_; +} + +template <> std::vector &ATLMachine::processors() { + return cpu_processors_; +} + +template <> std::vector &ATLMachine::processors() { + return gpu_processors_; +} + +hsa_amd_memory_pool_t get_memory_pool(const ATLProcessor &proc, + const int mem_id) { + hsa_amd_memory_pool_t pool; + const std::vector &mems = proc.memories(); + assert(mems.size() && mem_id >= 0 && mem_id < mems.size() && + "Invalid memory pools for this processor"); + pool = mems[mem_id].memory(); + return pool; +} + +template <> void ATLMachine::addProcessor(const ATLCPUProcessor &p) { + cpu_processors_.push_back(p); +} + +template <> void ATLMachine::addProcessor(const ATLGPUProcessor &p) { + gpu_processors_.push_back(p); +} + +void callbackQueue(hsa_status_t status, hsa_queue_t *source, void *data) { + if (status != HSA_STATUS_SUCCESS) { + fprintf(stderr, "[%s:%d] GPU error in queue %p %d\n", __FILE__, __LINE__, + source, status); + abort(); + } +} + +void ATLGPUProcessor::createQueues(const int count) { + int *num_cus = reinterpret_cast(calloc(count, sizeof(int))); + + hsa_status_t err; + /* Query the maximum size of the queue. */ + uint32_t queue_size = 0; + err = hsa_agent_get_info(agent_, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size); + ErrorCheck(Querying the agent maximum queue size, err); + if (queue_size > core::Runtime::getInstance().getMaxQueueSize()) { + queue_size = core::Runtime::getInstance().getMaxQueueSize(); + } + + /* Create queues for each device. */ + int qid; + for (qid = 0; qid < count; qid++) { + hsa_queue_t *this_Q; + err = + hsa_queue_create(agent_, queue_size, HSA_QUEUE_TYPE_MULTI, + callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &this_Q); + ErrorCheck(Creating the queue, err); + err = hsa_amd_profiling_set_profiler_enabled(this_Q, 1); + ErrorCheck(Enabling profiling support, err); + + queues_.push_back(this_Q); + + DEBUG_PRINT("Queue[%d]: %p\n", qid, this_Q); + } + + free(num_cus); +} + +void ATLCPUProcessor::createQueues(const int) {} + +void ATLProcessor::destroyQueues() { + for (auto queue : queues_) { + hsa_status_t err = hsa_queue_destroy(queue); + ErrorCheck(Destroying the queue, err); + } +} + +int ATLProcessor::num_cus() const { + hsa_status_t err; + /* Query the number of compute units. */ + uint32_t num_cus = 0; + err = hsa_agent_get_info( + agent_, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, + &num_cus); + ErrorCheck(Querying the agent number of compute units, err); + + return num_cus; +} diff --git a/openmp/libomptarget/plugins/amdgpu/impl/msgpack.h b/openmp/libomptarget/plugins/amdgpu/impl/msgpack.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins/amdgpu/impl/msgpack.h @@ -0,0 +1,275 @@ +#ifndef MSGPACK_H +#define MSGPACK_H + +#include + +namespace msgpack { + +// The message pack format is dynamically typed, schema-less. Format is: +// message: [type][header][payload] +// where type is one byte, header length is a fixed length function of type +// payload is zero to N bytes, with the length encoded in [type][header] + +// Scalar fields include boolean, signed integer, float, string etc +// Composite types are sequences of messages +// Array field is [header][element][element]... +// Map field is [header][key][value][key][value]... + +// Multibyte integer fields are big endian encoded +// The map key can be any message type +// Maps may contain duplicate keys +// Data is not uniquely encoded, e.g. integer "8" may be stored as one byte or +// in as many as nine, as signed or unsigned. Implementation defined. +// Similarly "foo" may embed the length in the type field or in multiple bytes + +// This parser is structured as an iterator over a sequence of bytes. +// It calls a user provided function on each message in order to extract fields +// The default implementation for each scalar type is to do nothing. For map or +// arrays, the default implementation returns just after that message to support +// iterating to the next message, but otherwise has no effect. + +struct byte_range { + const unsigned char *start; + const unsigned char *end; +}; + +const unsigned char *skip_next_message(const unsigned char *start, + const unsigned char *end); + +template class functors_defaults { +public: + void cb_string(size_t N, const unsigned char *str) { + derived().handle_string(N, str); + } + void cb_boolean(bool x) { derived().handle_boolean(x); } + void cb_signed(int64_t x) { derived().handle_signed(x); } + void cb_unsigned(uint64_t x) { derived().handle_unsigned(x); } + void cb_array_elements(byte_range bytes) { + derived().handle_array_elements(bytes); + } + void cb_map_elements(byte_range key, byte_range value) { + derived().handle_map_elements(key, value); + } + const unsigned char *cb_array(uint64_t N, byte_range bytes) { + return derived().handle_array(N, bytes); + } + const unsigned char *cb_map(uint64_t N, byte_range bytes) { + return derived().handle_map(N, bytes); + } + +private: + Derived &derived() { return *static_cast(this); } + + // Default implementations for scalar ops are no-ops + void handle_string(size_t, const unsigned char *) {} + void handle_boolean(bool) {} + void handle_signed(int64_t) {} + void handle_unsigned(uint64_t) {} + void handle_array_elements(byte_range) {} + void handle_map_elements(byte_range, byte_range) {} + + // Default implementation for sequences is to skip over the messages + const unsigned char *handle_array(uint64_t N, byte_range bytes) { + for (uint64_t i = 0; i < N; i++) { + const unsigned char *next = skip_next_message(bytes.start, bytes.end); + if (!next) { + return nullptr; + } + cb_array_elements(bytes); + bytes.start = next; + } + return bytes.start; + } + const unsigned char *handle_map(uint64_t N, byte_range bytes) { + for (uint64_t i = 0; i < N; i++) { + const unsigned char *start_key = bytes.start; + const unsigned char *end_key = skip_next_message(start_key, bytes.end); + if (!end_key) { + return nullptr; + } + const unsigned char *start_value = end_key; + const unsigned char *end_value = + skip_next_message(start_value, bytes.end); + if (!end_value) { + return nullptr; + } + cb_map_elements({start_key, end_key}, {start_value, end_value}); + bytes.start = end_value; + } + return bytes.start; + } +}; + +typedef enum : uint8_t { +#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) NAME, +#include "msgpack.def" +#undef X +} type; + +[[noreturn]] void internal_error(); +type parse_type(unsigned char x); +unsigned bytes_used_fixed(type ty); + +typedef uint64_t (*payload_info_t)(const unsigned char *); +payload_info_t payload_info(msgpack::type ty); + +template R bitcast(T x); + +template +const unsigned char *handle_msgpack_given_type(byte_range bytes, F f) { + const unsigned char *start = bytes.start; + const unsigned char *end = bytes.end; + const uint64_t available = end - start; + assert(available != 0); + assert(ty == parse_type(*start)); + + const uint64_t bytes_used = bytes_used_fixed(ty); + if (available < bytes_used) { + return 0; + } + const uint64_t available_post_header = available - bytes_used; + + const payload_info_t info = payload_info(ty); + const uint64_t N = info(start); + + switch (ty) { + case msgpack::t: + case msgpack::f: { + // t is 0b11000010, f is 0b11000011, masked with 0x1 + f.cb_boolean(N); + return start + bytes_used; + } + + case msgpack::posfixint: + case msgpack::uint8: + case msgpack::uint16: + case msgpack::uint32: + case msgpack::uint64: { + f.cb_unsigned(N); + return start + bytes_used; + } + + case msgpack::negfixint: + case msgpack::int8: + case msgpack::int16: + case msgpack::int32: + case msgpack::int64: { + f.cb_signed(bitcast(N)); + return start + bytes_used; + } + + case msgpack::fixstr: + case msgpack::str8: + case msgpack::str16: + case msgpack::str32: { + if (available_post_header < N) { + return 0; + } else { + f.cb_string(N, start + bytes_used); + return start + bytes_used + N; + } + } + + case msgpack::fixarray: + case msgpack::array16: + case msgpack::array32: { + return f.cb_array(N, {start + bytes_used, end}); + } + + case msgpack::fixmap: + case msgpack::map16: + case msgpack::map32: { + return f.cb_map(N, {start + bytes_used, end}); + } + + case msgpack::nil: + case msgpack::bin8: + case msgpack::bin16: + case msgpack::bin32: + case msgpack::float32: + case msgpack::float64: + case msgpack::ext8: + case msgpack::ext16: + case msgpack::ext32: + case msgpack::fixext1: + case msgpack::fixext2: + case msgpack::fixext4: + case msgpack::fixext8: + case msgpack::fixext16: + case msgpack::never_used: { + if (available_post_header < N) { + return 0; + } + return start + bytes_used + N; + } + } + internal_error(); +} + +template +const unsigned char *handle_msgpack(byte_range bytes, F f) { + const unsigned char *start = bytes.start; + const unsigned char *end = bytes.end; + const uint64_t available = end - start; + if (available == 0) { + return 0; + } + const type ty = parse_type(*start); + + switch (ty) { +#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \ + case msgpack::NAME: \ + return handle_msgpack_given_type(bytes, f); +#include "msgpack.def" +#undef X + } + + internal_error(); +} + +bool message_is_string(byte_range bytes, const char *str); + +template void foronly_string(byte_range bytes, C callback) { + struct inner : functors_defaults { + inner(C &cb) : cb(cb) {} + C &cb; + void handle_string(size_t N, const unsigned char *str) { cb(N, str); } + }; + handle_msgpack(bytes, {callback}); +} + +template void foronly_unsigned(byte_range bytes, C callback) { + struct inner : functors_defaults { + inner(C &cb) : cb(cb) {} + C &cb; + void handle_unsigned(uint64_t x) { cb(x); } + }; + handle_msgpack(bytes, {callback}); +} + +template void foreach_array(byte_range bytes, C callback) { + struct inner : functors_defaults { + inner(C &cb) : cb(cb) {} + C &cb; + void handle_array_elements(byte_range element) { cb(element); } + }; + handle_msgpack(bytes, {callback}); +} + +template void foreach_map(byte_range bytes, C callback) { + struct inner : functors_defaults { + inner(C &cb) : cb(cb) {} + C &cb; + void handle_map_elements(byte_range key, byte_range value) { + cb(key, value); + } + }; + handle_msgpack(bytes, {callback}); +} + +// Crude approximation to json +void dump(byte_range); + +} // namespace msgpack + +#endif diff --git a/openmp/libomptarget/plugins/amdgpu/impl/msgpack.cpp b/openmp/libomptarget/plugins/amdgpu/impl/msgpack.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins/amdgpu/impl/msgpack.cpp @@ -0,0 +1,264 @@ +#include +#include +#include +#include +#include + +#include "msgpack.h" + +namespace msgpack { + +[[noreturn]] void internal_error() { + printf("internal error\n"); + exit(1); +} + +const char *type_name(type ty) { + switch (ty) { +#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \ + case NAME: \ + return #NAME; +#include "msgpack.def" +#undef X + } + internal_error(); +} + +unsigned bytes_used_fixed(msgpack::type ty) { + using namespace msgpack; + switch (ty) { +#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \ + case NAME: \ + return WIDTH; +#include "msgpack.def" +#undef X + } + internal_error(); +} + +msgpack::type parse_type(unsigned char x) { + +#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \ + if (x >= LOWER && x <= UPPER) { \ + return NAME; \ + } else +#include "msgpack.def" +#undef X + { internal_error(); } +} + +template R bitcast(T x) { + static_assert(sizeof(T) == sizeof(R), ""); + R tmp; + memcpy(&tmp, &x, sizeof(T)); + return tmp; +} +template int64_t bitcast(uint64_t); +} // namespace msgpack + +// Helper functions for reading additional payload from the header +// Depending on the type, this can be a number of bytes, elements, +// key-value pairs or an embedded integer. +// Each takes a pointer to the start of the header and returns a uint64_t + +namespace { +namespace payload { +uint64_t read_zero(const unsigned char *) { return 0; } + +// Read the first byte and zero/sign extend it +uint64_t read_embedded_u8(const unsigned char *start) { return start[0]; } +uint64_t read_embedded_s8(const unsigned char *start) { + int64_t res = msgpack::bitcast(start[0]); + return msgpack::bitcast(res); +} + +// Read a masked part of the first byte +uint64_t read_via_mask_0x1(const unsigned char *start) { return *start & 0x1u; } +uint64_t read_via_mask_0xf(const unsigned char *start) { return *start & 0xfu; } +uint64_t read_via_mask_0x1f(const unsigned char *start) { + return *start & 0x1fu; +} + +// Read 1/2/4/8 bytes immediately following the type byte and zero/sign extend +// Big endian format. +uint64_t read_size_field_u8(const unsigned char *from) { + from++; + return from[0]; +} + +// TODO: detect whether host is little endian or not, and whether the intrinsic +// is available. And probably use the builtin to test the diy +const bool use_bswap = false; + +uint64_t read_size_field_u16(const unsigned char *from) { + from++; + if (use_bswap) { + uint16_t b; + memcpy(&b, from, 2); + return __builtin_bswap16(b); + } else { + return (from[0] << 8u) | from[1]; + } +} +uint64_t read_size_field_u32(const unsigned char *from) { + from++; + if (use_bswap) { + uint32_t b; + memcpy(&b, from, 4); + return __builtin_bswap32(b); + } else { + return (from[0] << 24u) | (from[1] << 16u) | (from[2] << 8u) | + (from[3] << 0u); + } +} +uint64_t read_size_field_u64(const unsigned char *from) { + from++; + if (use_bswap) { + uint64_t b; + memcpy(&b, from, 8); + return __builtin_bswap64(b); + } else { + return ((uint64_t)from[0] << 56u) | ((uint64_t)from[1] << 48u) | + ((uint64_t)from[2] << 40u) | ((uint64_t)from[3] << 32u) | + (from[4] << 24u) | (from[5] << 16u) | (from[6] << 8u) | + (from[7] << 0u); + } +} + +uint64_t read_size_field_s8(const unsigned char *from) { + uint8_t u = read_size_field_u8(from); + int64_t res = msgpack::bitcast(u); + return msgpack::bitcast(res); +} +uint64_t read_size_field_s16(const unsigned char *from) { + uint16_t u = read_size_field_u16(from); + int64_t res = msgpack::bitcast(u); + return msgpack::bitcast(res); +} +uint64_t read_size_field_s32(const unsigned char *from) { + uint32_t u = read_size_field_u32(from); + int64_t res = msgpack::bitcast(u); + return msgpack::bitcast(res); +} +uint64_t read_size_field_s64(const unsigned char *from) { + uint64_t u = read_size_field_u64(from); + int64_t res = msgpack::bitcast(u); + return msgpack::bitcast(res); +} +} // namespace payload +} // namespace + +namespace msgpack { + +payload_info_t payload_info(msgpack::type ty) { + using namespace msgpack; + switch (ty) { +#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \ + case NAME: \ + return payload::PAYLOAD; +#include "msgpack.def" +#undef X + } + internal_error(); +} + +} // namespace msgpack + +const unsigned char *msgpack::skip_next_message(const unsigned char *start, + const unsigned char *end) { + class f : public functors_defaults {}; + return handle_msgpack({start, end}, f()); +} + +namespace msgpack { +bool message_is_string(byte_range bytes, const char *needle) { + bool matched = false; + size_t needleN = strlen(needle); + + foronly_string(bytes, [=, &matched](size_t N, const unsigned char *str) { + if (N == needleN) { + if (memcmp(needle, str, N) == 0) { + matched = true; + } + } + }); + return matched; +} + +void dump(byte_range bytes) { + struct inner : functors_defaults { + inner(unsigned indent) : indent(indent) {} + const unsigned by = 2; + unsigned indent = 0; + + void handle_string(size_t N, const unsigned char *bytes) { + char *tmp = (char *)malloc(N + 1); + memcpy(tmp, bytes, N); + tmp[N] = '\0'; + printf("\"%s\"", tmp); + free(tmp); + } + + void handle_signed(int64_t x) { printf("%ld", x); } + void handle_unsigned(uint64_t x) { printf("%lu", x); } + + const unsigned char *handle_array(uint64_t N, byte_range bytes) { + printf("\n%*s[\n", indent, ""); + indent += by; + + for (uint64_t i = 0; i < N; i++) { + indent += by; + printf("%*s", indent, ""); + const unsigned char *next = handle_msgpack(bytes, {indent}); + printf(",\n"); + indent -= by; + bytes.start = next; + if (!next) { + break; + } + } + indent -= by; + printf("%*s]", indent, ""); + + return bytes.start; + } + + const unsigned char *handle_map(uint64_t N, byte_range bytes) { + printf("\n%*s{\n", indent, ""); + indent += by; + + for (uint64_t i = 0; i < 2 * N; i += 2) { + const unsigned char *start_key = bytes.start; + printf("%*s", indent, ""); + const unsigned char *end_key = + handle_msgpack({start_key, bytes.end}, {indent}); + if (!end_key) { + break; + } + + printf(" : "); + + const unsigned char *start_value = end_key; + const unsigned char *end_value = + handle_msgpack({start_value, bytes.end}, {indent}); + + if (!end_value) { + break; + } + + printf(",\n"); + bytes.start = end_value; + } + + indent -= by; + printf("%*s}", indent, ""); + + return bytes.start; + } + }; + + handle_msgpack(bytes, {0}); + printf("\n"); +} + +} // namespace msgpack diff --git a/openmp/libomptarget/plugins/amdgpu/impl/msgpack.def b/openmp/libomptarget/plugins/amdgpu/impl/msgpack.def new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins/amdgpu/impl/msgpack.def @@ -0,0 +1,38 @@ +// name, header width, reader, [lower, upper] encoding +X(posfixint, 1, read_embedded_u8, 0x00, 0x7f) +X(negfixint, 1, read_embedded_s8, 0xe0, 0xff) +X(fixmap, 1, read_via_mask_0xf, 0x80, 0x8f) +X(fixarray, 1, read_via_mask_0xf, 0x90, 0x9f) +X(fixstr, 1, read_via_mask_0x1f, 0xa0, 0xbf) +X(nil, 1, read_zero, 0xc0, 0xc0) +X(never_used, 1, read_zero, 0xc1, 0xc1) +X(f, 1, read_via_mask_0x1, 0xc2, 0xc2) +X(t, 1, read_via_mask_0x1, 0xc3, 0xc3) +X(bin8, 2, read_size_field_u8, 0xc4, 0xc4) +X(bin16, 3, read_size_field_u16, 0xc5, 0xc5) +X(bin32, 5, read_size_field_u32, 0xc6, 0xc6) +X(ext8, 3, read_size_field_u8, 0xc7, 0xc7) +X(ext16, 4, read_size_field_u16, 0xc8, 0xc8) +X(ext32, 6, read_size_field_u32, 0xc9, 0xc9) +X(float32, 5, read_zero, 0xca, 0xca) +X(float64, 9, read_zero, 0xcb, 0xcb) +X(uint8, 2, read_size_field_u8, 0xcc, 0xcc) +X(uint16, 3, read_size_field_u16, 0xcd, 0xcd) +X(uint32, 5, read_size_field_u32, 0xce, 0xce) +X(uint64, 9, read_size_field_u64, 0xcf, 0xcf) +X(int8, 2, read_size_field_s8, 0xd0, 0xd0) +X(int16, 3, read_size_field_s16, 0xd1, 0xd1) +X(int32, 5, read_size_field_s32, 0xd2, 0xd2) +X(int64, 9, read_size_field_s64, 0xd3, 0xd3) +X(fixext1, 3, read_zero, 0xd4, 0xd4) +X(fixext2, 4, read_zero, 0xd5, 0xd5) +X(fixext4, 6, read_zero, 0xd6, 0xd6) +X(fixext8, 10, read_zero, 0xd7, 0xd7) +X(fixext16, 18, read_zero, 0xd8, 0xd8) +X(str8, 2, read_size_field_u8, 0xd9, 0xd9) +X(str16, 3, read_size_field_u16, 0xda, 0xda) +X(str32, 5, read_size_field_u32, 0xdb, 0xdb) +X(array16, 3, read_size_field_u16, 0xdc, 0xdc) +X(array32, 5, read_size_field_u32, 0xdd, 0xdd) +X(map16, 3, read_size_field_u16, 0xde, 0xde) +X(map32, 5, read_size_field_u32, 0xdf, 0xdf) diff --git a/openmp/libomptarget/plugins/amdgpu/impl/rt.h b/openmp/libomptarget/plugins/amdgpu/impl/rt.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins/amdgpu/impl/rt.h @@ -0,0 +1,108 @@ +/*===-------------------------------------------------------------------------- + * ATMI (Asynchronous Task and Memory Interface) + * + * This file is distributed under the MIT License. See LICENSE.txt for details. + *===------------------------------------------------------------------------*/ +#ifndef SRC_RUNTIME_INCLUDE_RT_H_ +#define SRC_RUNTIME_INCLUDE_RT_H_ + +#include "atmi_runtime.h" +#include "hsa.h" +#include +#include + +namespace core { + +#define DEFAULT_MAX_QUEUE_SIZE 4096 +#define DEFAULT_MAX_KERNEL_TYPES 32 +#define DEFAULT_NUM_GPU_QUEUES -1 // computed in code +#define DEFAULT_NUM_CPU_QUEUES -1 // computed in code +#define DEFAULT_DEBUG_MODE 0 +class Environment { +public: + Environment() + : max_queue_size_(DEFAULT_MAX_QUEUE_SIZE), + max_kernel_types_(DEFAULT_MAX_KERNEL_TYPES), + num_gpu_queues_(DEFAULT_NUM_GPU_QUEUES), + num_cpu_queues_(DEFAULT_NUM_CPU_QUEUES), + debug_mode_(DEFAULT_DEBUG_MODE) { + GetEnvAll(); + } + + ~Environment() {} + + void GetEnvAll(); + + int getMaxQueueSize() const { return max_queue_size_; } + int getMaxKernelTypes() const { return max_kernel_types_; } + int getNumGPUQueues() const { return num_gpu_queues_; } + int getNumCPUQueues() const { return num_cpu_queues_; } + // TODO(ashwinma): int may change to enum if we have more debug modes + int getDebugMode() const { return debug_mode_; } + // TODO(ashwinma): int may change to enum if we have more profile modes + +private: + std::string GetEnv(const char *name) { + char *env = getenv(name); + std::string ret; + if (env) { + ret = env; + } + return ret; + } + + int max_queue_size_; + int max_kernel_types_; + int num_gpu_queues_; + int num_cpu_queues_; + int debug_mode_; +}; + +class Runtime final { +public: + static Runtime &getInstance() { + static Runtime instance; + return instance; + } + + // init/finalize + static atmi_status_t Initialize(); + static atmi_status_t Finalize(); + + // modules + static atmi_status_t RegisterModuleFromMemory( + void *, size_t, atmi_place_t, + atmi_status_t (*on_deserialized_data)(void *data, size_t size, + void *cb_state), + void *cb_state); + + // machine info + static atmi_machine_t *GetMachineInfo(); + + // data + static atmi_status_t Memcpy(void *, const void *, size_t); + static atmi_status_t Memfree(void *); + static atmi_status_t Malloc(void **, size_t, atmi_mem_place_t); + + // environment variables + int getMaxQueueSize() const { return env_.getMaxQueueSize(); } + int getMaxKernelTypes() const { return env_.getMaxKernelTypes(); } + int getNumGPUQueues() const { return env_.getNumGPUQueues(); } + int getNumCPUQueues() const { return env_.getNumCPUQueues(); } + // TODO(ashwinma): int may change to enum if we have more debug modes + int getDebugMode() const { return env_.getDebugMode(); } + +protected: + Runtime() = default; + ~Runtime() = default; + Runtime(const Runtime &) = delete; + Runtime &operator=(const Runtime &) = delete; + +protected: + // variable to track environment variables + Environment env_; +}; + +} // namespace core + +#endif // SRC_RUNTIME_INCLUDE_RT_H_ diff --git a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp @@ -0,0 +1,1121 @@ +/*===-------------------------------------------------------------------------- + * ATMI (Asynchronous Task and Memory Interface) + * + * This file is distributed under the MIT License. See LICENSE.txt for details. + *===------------------------------------------------------------------------*/ +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" +#include "machine.h" +#include "rt.h" + +#include "msgpack.h" + +#define msgpackErrorCheck(msg, status) \ + if (status != 0) { \ + printf("[%s:%d] %s failed\n", __FILE__, __LINE__, #msg); \ + return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; \ + } else { \ + } + +typedef unsigned char *address; +/* + * Note descriptors. + */ +typedef struct { + uint32_t n_namesz; /* Length of note's name. */ + uint32_t n_descsz; /* Length of note's value. */ + uint32_t n_type; /* Type of note. */ + // then name + // then padding, optional + // then desc, at 4 byte alignment (not 8, despite being elf64) +} Elf_Note; + +// The following include file and following structs/enums +// have been replicated on a per-use basis below. For example, +// llvm::AMDGPU::HSAMD::Kernel::Metadata has several fields, +// but we may care only about kernargSegmentSize_ for now, so +// we just include that field in our KernelMD implementation. We +// chose this approach to replicate in order to avoid forcing +// a dependency on LLVM_INCLUDE_DIR just to compile the runtime. +// #include "llvm/Support/AMDGPUMetadata.h" +// typedef llvm::AMDGPU::HSAMD::Metadata CodeObjectMD; +// typedef llvm::AMDGPU::HSAMD::Kernel::Metadata KernelMD; +// typedef llvm::AMDGPU::HSAMD::Kernel::Arg::Metadata KernelArgMD; +// using llvm::AMDGPU::HSAMD::AccessQualifier; +// using llvm::AMDGPU::HSAMD::AddressSpaceQualifier; +// using llvm::AMDGPU::HSAMD::ValueKind; +// using llvm::AMDGPU::HSAMD::ValueType; + +class KernelArgMD { +public: + enum class ValueKind { + HiddenGlobalOffsetX, + HiddenGlobalOffsetY, + HiddenGlobalOffsetZ, + HiddenNone, + HiddenPrintfBuffer, + HiddenDefaultQueue, + HiddenCompletionAction, + HiddenMultiGridSyncArg, + HiddenHostcallBuffer, + Unknown + }; + + KernelArgMD() + : name_(std::string()), typeName_(std::string()), size_(0), offset_(0), + align_(0), valueKind_(ValueKind::Unknown) {} + + // fields + std::string name_; + std::string typeName_; + uint32_t size_; + uint32_t offset_; + uint32_t align_; + ValueKind valueKind_; +}; + +class KernelMD { +public: + KernelMD() : kernargSegmentSize_(0ull) {} + + // fields + uint64_t kernargSegmentSize_; +}; + +static const std::map ArgValueKind = { + // Including only those fields that are relevant to the runtime. + // {"ByValue", KernelArgMD::ValueKind::ByValue}, + // {"GlobalBuffer", KernelArgMD::ValueKind::GlobalBuffer}, + // {"DynamicSharedPointer", + // KernelArgMD::ValueKind::DynamicSharedPointer}, + // {"Sampler", KernelArgMD::ValueKind::Sampler}, + // {"Image", KernelArgMD::ValueKind::Image}, + // {"Pipe", KernelArgMD::ValueKind::Pipe}, + // {"Queue", KernelArgMD::ValueKind::Queue}, + {"HiddenGlobalOffsetX", KernelArgMD::ValueKind::HiddenGlobalOffsetX}, + {"HiddenGlobalOffsetY", KernelArgMD::ValueKind::HiddenGlobalOffsetY}, + {"HiddenGlobalOffsetZ", KernelArgMD::ValueKind::HiddenGlobalOffsetZ}, + {"HiddenNone", KernelArgMD::ValueKind::HiddenNone}, + {"HiddenPrintfBuffer", KernelArgMD::ValueKind::HiddenPrintfBuffer}, + {"HiddenDefaultQueue", KernelArgMD::ValueKind::HiddenDefaultQueue}, + {"HiddenCompletionAction", KernelArgMD::ValueKind::HiddenCompletionAction}, + {"HiddenMultiGridSyncArg", KernelArgMD::ValueKind::HiddenMultiGridSyncArg}, + {"HiddenHostcallBuffer", KernelArgMD::ValueKind::HiddenHostcallBuffer}, + // v3 + // {"by_value", KernelArgMD::ValueKind::ByValue}, + // {"global_buffer", KernelArgMD::ValueKind::GlobalBuffer}, + // {"dynamic_shared_pointer", + // KernelArgMD::ValueKind::DynamicSharedPointer}, + // {"sampler", KernelArgMD::ValueKind::Sampler}, + // {"image", KernelArgMD::ValueKind::Image}, + // {"pipe", KernelArgMD::ValueKind::Pipe}, + // {"queue", KernelArgMD::ValueKind::Queue}, + {"hidden_global_offset_x", KernelArgMD::ValueKind::HiddenGlobalOffsetX}, + {"hidden_global_offset_y", KernelArgMD::ValueKind::HiddenGlobalOffsetY}, + {"hidden_global_offset_z", KernelArgMD::ValueKind::HiddenGlobalOffsetZ}, + {"hidden_none", KernelArgMD::ValueKind::HiddenNone}, + {"hidden_printf_buffer", KernelArgMD::ValueKind::HiddenPrintfBuffer}, + {"hidden_default_queue", KernelArgMD::ValueKind::HiddenDefaultQueue}, + {"hidden_completion_action", + KernelArgMD::ValueKind::HiddenCompletionAction}, + {"hidden_multigrid_sync_arg", + KernelArgMD::ValueKind::HiddenMultiGridSyncArg}, + {"hidden_hostcall_buffer", KernelArgMD::ValueKind::HiddenHostcallBuffer}, +}; + +// public variables -- TODO(ashwinma) move these to a runtime object? +atmi_machine_t g_atmi_machine; +ATLMachine g_atl_machine; + +hsa_region_t atl_gpu_kernarg_region; +std::vector atl_gpu_kernarg_pools; +hsa_region_t atl_cpu_kernarg_region; + +static std::vector g_executables; + +std::map KernelNameMap; +std::vector> KernelInfoTable; +std::vector> SymbolInfoTable; + +bool g_atmi_initialized = false; +bool g_atmi_hostcall_required = false; + +struct timespec context_init_time; +int context_init_time_init = 0; + +/* + atlc is all internal global values. + The structure atl_context_t is defined in atl_internal.h + Most references will use the global structure prefix atlc. + However the pointer value atlc_p-> is equivalent to atlc. + +*/ + +atl_context_t atlc = {.struct_initialized = false}; +atl_context_t *atlc_p = NULL; + +hsa_signal_t IdentityCopySignal; + +namespace core { +/* Machine Info */ +atmi_machine_t *Runtime::GetMachineInfo() { + if (!atlc.g_hsa_initialized) + return NULL; + return &g_atmi_machine; +} + +void atl_set_atmi_initialized() { + // FIXME: thread safe? locks? + g_atmi_initialized = true; +} + +void atl_reset_atmi_initialized() { + // FIXME: thread safe? locks? + g_atmi_initialized = false; +} + +bool atl_is_atmi_initialized() { return g_atmi_initialized; } + +void allow_access_to_all_gpu_agents(void *ptr) { + hsa_status_t err; + std::vector &gpu_procs = + g_atl_machine.processors(); + std::vector agents; + for (uint32_t i = 0; i < gpu_procs.size(); i++) { + agents.push_back(gpu_procs[i].agent()); + } + err = hsa_amd_agents_allow_access(agents.size(), &agents[0], NULL, ptr); + ErrorCheck(Allow agents ptr access, err); +} + +atmi_status_t Runtime::Initialize() { + atmi_devtype_t devtype = ATMI_DEVTYPE_GPU; + if (atl_is_atmi_initialized()) + return ATMI_STATUS_SUCCESS; + + if (devtype == ATMI_DEVTYPE_ALL || devtype & ATMI_DEVTYPE_GPU) { + ATMIErrorCheck(GPU context init, atl_init_gpu_context()); + } + + atl_set_atmi_initialized(); + return ATMI_STATUS_SUCCESS; +} + +atmi_status_t Runtime::Finalize() { + // TODO(ashwinma): Finalize all processors, queues, signals, kernarg memory + // regions + hsa_status_t err; + + for (uint32_t i = 0; i < g_executables.size(); i++) { + err = hsa_executable_destroy(g_executables[i]); + ErrorCheck(Destroying executable, err); + } + + // Finalize queues + for (auto &p : g_atl_machine.processors()) { + p.destroyQueues(); + } + for (auto &p : g_atl_machine.processors()) { + p.destroyQueues(); + } + + for (uint32_t i = 0; i < SymbolInfoTable.size(); i++) { + SymbolInfoTable[i].clear(); + } + SymbolInfoTable.clear(); + for (uint32_t i = 0; i < KernelInfoTable.size(); i++) { + KernelInfoTable[i].clear(); + } + KernelInfoTable.clear(); + + atl_reset_atmi_initialized(); + err = hsa_shut_down(); + ErrorCheck(Shutting down HSA, err); + + return ATMI_STATUS_SUCCESS; +} + +void atmi_init_context_structs() { + atlc_p = &atlc; + atlc.struct_initialized = true; /* This only gets called one time */ + atlc.g_hsa_initialized = false; + atlc.g_gpu_initialized = false; + atlc.g_tasks_initialized = false; +} + +// Implement memory_pool iteration function +static hsa_status_t get_memory_pool_info(hsa_amd_memory_pool_t memory_pool, + void *data) { + ATLProcessor *proc = reinterpret_cast(data); + hsa_status_t err = HSA_STATUS_SUCCESS; + // Check if the memory_pool is allowed to allocate, i.e. do not return group + // memory + bool alloc_allowed = false; + err = hsa_amd_memory_pool_get_info( + memory_pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, + &alloc_allowed); + ErrorCheck(Alloc allowed in memory pool check, err); + if (alloc_allowed) { + uint32_t global_flag = 0; + err = hsa_amd_memory_pool_get_info( + memory_pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag); + ErrorCheck(Get memory pool info, err); + if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & global_flag) { + ATLMemory new_mem(memory_pool, *proc, ATMI_MEMTYPE_FINE_GRAINED); + proc->addMemory(new_mem); + if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & global_flag) { + DEBUG_PRINT("GPU kernel args pool handle: %lu\n", memory_pool.handle); + atl_gpu_kernarg_pools.push_back(memory_pool); + } + } else { + ATLMemory new_mem(memory_pool, *proc, ATMI_MEMTYPE_COARSE_GRAINED); + proc->addMemory(new_mem); + } + } + + return err; +} + +static hsa_status_t get_agent_info(hsa_agent_t agent, void *data) { + hsa_status_t err = HSA_STATUS_SUCCESS; + hsa_device_type_t device_type; + err = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type); + ErrorCheck(Get device type info, err); + switch (device_type) { + case HSA_DEVICE_TYPE_CPU: { + ; + ATLCPUProcessor new_proc(agent); + err = hsa_amd_agent_iterate_memory_pools(agent, get_memory_pool_info, + &new_proc); + ErrorCheck(Iterate all memory pools, err); + g_atl_machine.addProcessor(new_proc); + } break; + case HSA_DEVICE_TYPE_GPU: { + ; + hsa_profile_t profile; + err = hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &profile); + ErrorCheck(Query the agent profile, err); + atmi_devtype_t gpu_type; + gpu_type = + (profile == HSA_PROFILE_FULL) ? ATMI_DEVTYPE_iGPU : ATMI_DEVTYPE_dGPU; + ATLGPUProcessor new_proc(agent, gpu_type); + err = hsa_amd_agent_iterate_memory_pools(agent, get_memory_pool_info, + &new_proc); + ErrorCheck(Iterate all memory pools, err); + g_atl_machine.addProcessor(new_proc); + } break; + case HSA_DEVICE_TYPE_DSP: { + err = HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } break; + } + + return err; +} + +hsa_status_t get_fine_grained_region(hsa_region_t region, void *data) { + hsa_region_segment_t segment; + hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment); + if (segment != HSA_REGION_SEGMENT_GLOBAL) { + return HSA_STATUS_SUCCESS; + } + hsa_region_global_flag_t flags; + hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags); + if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) { + hsa_region_t *ret = reinterpret_cast(data); + *ret = region; + return HSA_STATUS_INFO_BREAK; + } + return HSA_STATUS_SUCCESS; +} + +/* Determines if a memory region can be used for kernarg allocations. */ +static hsa_status_t get_kernarg_memory_region(hsa_region_t region, void *data) { + hsa_region_segment_t segment; + hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment); + if (HSA_REGION_SEGMENT_GLOBAL != segment) { + return HSA_STATUS_SUCCESS; + } + + hsa_region_global_flag_t flags; + hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags); + if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG) { + hsa_region_t *ret = reinterpret_cast(data); + *ret = region; + return HSA_STATUS_INFO_BREAK; + } + + return HSA_STATUS_SUCCESS; +} + +static hsa_status_t init_compute_and_memory() { + hsa_status_t err; + + /* Iterate over the agents and pick the gpu agent */ + err = hsa_iterate_agents(get_agent_info, NULL); + if (err == HSA_STATUS_INFO_BREAK) { + err = HSA_STATUS_SUCCESS; + } + ErrorCheck(Getting a gpu agent, err); + if (err != HSA_STATUS_SUCCESS) + return err; + + /* Init all devices or individual device types? */ + std::vector &cpu_procs = + g_atl_machine.processors(); + std::vector &gpu_procs = + g_atl_machine.processors(); + /* For CPU memory pools, add other devices that can access them directly + * or indirectly */ + for (auto &cpu_proc : cpu_procs) { + for (auto &cpu_mem : cpu_proc.memories()) { + hsa_amd_memory_pool_t pool = cpu_mem.memory(); + for (auto &gpu_proc : gpu_procs) { + hsa_agent_t agent = gpu_proc.agent(); + hsa_amd_memory_pool_access_t access; + hsa_amd_agent_memory_pool_get_info( + agent, pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access); + if (access != 0) { + // this means not NEVER, but could be YES or NO + // add this memory pool to the proc + gpu_proc.addMemory(cpu_mem); + } + } + } + } + + /* FIXME: are the below combinations of procs and memory pools needed? + * all to all compare procs with their memory pools and add those memory + * pools that are accessible by the target procs */ + for (auto &gpu_proc : gpu_procs) { + for (auto &gpu_mem : gpu_proc.memories()) { + hsa_amd_memory_pool_t pool = gpu_mem.memory(); + for (auto &cpu_proc : cpu_procs) { + hsa_agent_t agent = cpu_proc.agent(); + hsa_amd_memory_pool_access_t access; + hsa_amd_agent_memory_pool_get_info( + agent, pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access); + if (access != 0) { + // this means not NEVER, but could be YES or NO + // add this memory pool to the proc + cpu_proc.addMemory(gpu_mem); + } + } + } + } + + g_atmi_machine.device_count_by_type[ATMI_DEVTYPE_CPU] = cpu_procs.size(); + g_atmi_machine.device_count_by_type[ATMI_DEVTYPE_GPU] = gpu_procs.size(); + + size_t num_procs = cpu_procs.size() + gpu_procs.size(); + // g_atmi_machine.devices = (atmi_device_t *)malloc(num_procs * + // sizeof(atmi_device_t)); + atmi_device_t *all_devices = reinterpret_cast( + malloc(num_procs * sizeof(atmi_device_t))); + int num_iGPUs = 0; + int num_dGPUs = 0; + for (uint32_t i = 0; i < gpu_procs.size(); i++) { + if (gpu_procs[i].type() == ATMI_DEVTYPE_iGPU) + num_iGPUs++; + else + num_dGPUs++; + } + assert(num_iGPUs + num_dGPUs == gpu_procs.size() && + "Number of dGPUs and iGPUs do not add up"); + DEBUG_PRINT("CPU Agents: %lu\n", cpu_procs.size()); + DEBUG_PRINT("iGPU Agents: %d\n", num_iGPUs); + DEBUG_PRINT("dGPU Agents: %d\n", num_dGPUs); + DEBUG_PRINT("GPU Agents: %lu\n", gpu_procs.size()); + + g_atmi_machine.device_count_by_type[ATMI_DEVTYPE_iGPU] = num_iGPUs; + g_atmi_machine.device_count_by_type[ATMI_DEVTYPE_dGPU] = num_dGPUs; + + int cpus_begin = 0; + int cpus_end = cpu_procs.size(); + int gpus_begin = cpu_procs.size(); + int gpus_end = cpu_procs.size() + gpu_procs.size(); + g_atmi_machine.devices_by_type[ATMI_DEVTYPE_CPU] = &all_devices[cpus_begin]; + g_atmi_machine.devices_by_type[ATMI_DEVTYPE_GPU] = &all_devices[gpus_begin]; + g_atmi_machine.devices_by_type[ATMI_DEVTYPE_iGPU] = &all_devices[gpus_begin]; + g_atmi_machine.devices_by_type[ATMI_DEVTYPE_dGPU] = &all_devices[gpus_begin]; + int proc_index = 0; + for (int i = cpus_begin; i < cpus_end; i++) { + all_devices[i].type = cpu_procs[proc_index].type(); + all_devices[i].core_count = cpu_procs[proc_index].num_cus(); + + std::vector memories = cpu_procs[proc_index].memories(); + int fine_memories_size = 0; + int coarse_memories_size = 0; + DEBUG_PRINT("CPU memory types:\t"); + for (auto &memory : memories) { + atmi_memtype_t type = memory.type(); + if (type == ATMI_MEMTYPE_FINE_GRAINED) { + fine_memories_size++; + DEBUG_PRINT("Fine\t"); + } else { + coarse_memories_size++; + DEBUG_PRINT("Coarse\t"); + } + } + DEBUG_PRINT("\nFine Memories : %d", fine_memories_size); + DEBUG_PRINT("\tCoarse Memories : %d\n", coarse_memories_size); + all_devices[i].memory_count = memories.size(); + proc_index++; + } + proc_index = 0; + for (int i = gpus_begin; i < gpus_end; i++) { + all_devices[i].type = gpu_procs[proc_index].type(); + all_devices[i].core_count = gpu_procs[proc_index].num_cus(); + + std::vector memories = gpu_procs[proc_index].memories(); + int fine_memories_size = 0; + int coarse_memories_size = 0; + DEBUG_PRINT("GPU memory types:\t"); + for (auto &memory : memories) { + atmi_memtype_t type = memory.type(); + if (type == ATMI_MEMTYPE_FINE_GRAINED) { + fine_memories_size++; + DEBUG_PRINT("Fine\t"); + } else { + coarse_memories_size++; + DEBUG_PRINT("Coarse\t"); + } + } + DEBUG_PRINT("\nFine Memories : %d", fine_memories_size); + DEBUG_PRINT("\tCoarse Memories : %d\n", coarse_memories_size); + all_devices[i].memory_count = memories.size(); + proc_index++; + } + proc_index = 0; + atl_cpu_kernarg_region.handle = (uint64_t)-1; + if (cpu_procs.size() > 0) { + err = hsa_agent_iterate_regions( + cpu_procs[0].agent(), get_fine_grained_region, &atl_cpu_kernarg_region); + if (err == HSA_STATUS_INFO_BREAK) { + err = HSA_STATUS_SUCCESS; + } + err = (atl_cpu_kernarg_region.handle == (uint64_t)-1) ? HSA_STATUS_ERROR + : HSA_STATUS_SUCCESS; + ErrorCheck(Finding a CPU kernarg memory region handle, err); + } + /* Find a memory region that supports kernel arguments. */ + atl_gpu_kernarg_region.handle = (uint64_t)-1; + if (gpu_procs.size() > 0) { + hsa_agent_iterate_regions(gpu_procs[0].agent(), get_kernarg_memory_region, + &atl_gpu_kernarg_region); + err = (atl_gpu_kernarg_region.handle == (uint64_t)-1) ? HSA_STATUS_ERROR + : HSA_STATUS_SUCCESS; + ErrorCheck(Finding a kernarg memory region, err); + } + if (num_procs > 0) + return HSA_STATUS_SUCCESS; + else + return HSA_STATUS_ERROR_NOT_INITIALIZED; +} + +hsa_status_t init_hsa() { + if (atlc.g_hsa_initialized == false) { + DEBUG_PRINT("Initializing HSA..."); + hsa_status_t err = hsa_init(); + ErrorCheck(Initializing the hsa runtime, err); + if (err != HSA_STATUS_SUCCESS) + return err; + + err = init_compute_and_memory(); + if (err != HSA_STATUS_SUCCESS) + return err; + ErrorCheck(After initializing compute and memory, err); + + int gpu_count = g_atl_machine.processorCount(); + KernelInfoTable.resize(gpu_count); + SymbolInfoTable.resize(gpu_count); + for (uint32_t i = 0; i < SymbolInfoTable.size(); i++) + SymbolInfoTable[i].clear(); + for (uint32_t i = 0; i < KernelInfoTable.size(); i++) + KernelInfoTable[i].clear(); + atlc.g_hsa_initialized = true; + DEBUG_PRINT("done\n"); + } + return HSA_STATUS_SUCCESS; +} + +void init_tasks() { + if (atlc.g_tasks_initialized != false) + return; + hsa_status_t err; + int task_num; + std::vector gpu_agents; + int gpu_count = g_atl_machine.processorCount(); + for (int gpu = 0; gpu < gpu_count; gpu++) { + atmi_place_t place = ATMI_PLACE_GPU(0, gpu); + ATLGPUProcessor &proc = get_processor(place); + gpu_agents.push_back(proc.agent()); + } + err = hsa_signal_create(0, 0, NULL, &IdentityCopySignal); + ErrorCheck(Creating a HSA signal, err); + atlc.g_tasks_initialized = true; +} + +hsa_status_t callbackEvent(const hsa_amd_event_t *event, void *data) { +#if (ROCM_VERSION_MAJOR >= 3) || \ + (ROCM_VERSION_MAJOR >= 2 && ROCM_VERSION_MINOR >= 3) + if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT) { +#else + if (event->event_type == GPU_MEMORY_FAULT_EVENT) { +#endif + hsa_amd_gpu_memory_fault_info_t memory_fault = event->memory_fault; + // memory_fault.agent + // memory_fault.virtual_address + // memory_fault.fault_reason_mask + // fprintf("[GPU Error at %p: Reason is ", memory_fault.virtual_address); + std::stringstream stream; + stream << std::hex << (uintptr_t)memory_fault.virtual_address; + std::string addr("0x" + stream.str()); + + std::string err_string = "[GPU Memory Error] Addr: " + addr; + err_string += " Reason: "; + if (!(memory_fault.fault_reason_mask & 0x00111111)) { + err_string += "No Idea! "; + } else { + if (memory_fault.fault_reason_mask & 0x00000001) + err_string += "Page not present or supervisor privilege. "; + if (memory_fault.fault_reason_mask & 0x00000010) + err_string += "Write access to a read-only page. "; + if (memory_fault.fault_reason_mask & 0x00000100) + err_string += "Execute access to a page marked NX. "; + if (memory_fault.fault_reason_mask & 0x00001000) + err_string += "Host access only. "; + if (memory_fault.fault_reason_mask & 0x00010000) + err_string += "ECC failure (if supported by HW). "; + if (memory_fault.fault_reason_mask & 0x00100000) + err_string += "Can't determine the exact fault address. "; + } + fprintf(stderr, "%s\n", err_string.c_str()); + return HSA_STATUS_ERROR; + } + return HSA_STATUS_SUCCESS; +} + +atmi_status_t atl_init_gpu_context() { + if (atlc.struct_initialized == false) + atmi_init_context_structs(); + if (atlc.g_gpu_initialized != false) + return ATMI_STATUS_SUCCESS; + + hsa_status_t err; + err = init_hsa(); + if (err != HSA_STATUS_SUCCESS) + return ATMI_STATUS_ERROR; + + int gpu_count = g_atl_machine.processorCount(); + for (int gpu = 0; gpu < gpu_count; gpu++) { + atmi_place_t place = ATMI_PLACE_GPU(0, gpu); + ATLGPUProcessor &proc = get_processor(place); + int num_gpu_queues = core::Runtime::getInstance().getNumGPUQueues(); + if (num_gpu_queues == -1) { + num_gpu_queues = proc.num_cus(); + num_gpu_queues = (num_gpu_queues > 8) ? 8 : num_gpu_queues; + } + proc.createQueues(num_gpu_queues); + } + + if (context_init_time_init == 0) { + clock_gettime(CLOCK_MONOTONIC_RAW, &context_init_time); + context_init_time_init = 1; + } + + err = hsa_amd_register_system_event_handler(callbackEvent, NULL); + ErrorCheck(Registering the system for memory faults, err); + + init_tasks(); + atlc.g_gpu_initialized = true; + return ATMI_STATUS_SUCCESS; +} + +bool isImplicit(KernelArgMD::ValueKind value_kind) { + switch (value_kind) { + case KernelArgMD::ValueKind::HiddenGlobalOffsetX: + case KernelArgMD::ValueKind::HiddenGlobalOffsetY: + case KernelArgMD::ValueKind::HiddenGlobalOffsetZ: + case KernelArgMD::ValueKind::HiddenNone: + case KernelArgMD::ValueKind::HiddenPrintfBuffer: + case KernelArgMD::ValueKind::HiddenDefaultQueue: + case KernelArgMD::ValueKind::HiddenCompletionAction: + case KernelArgMD::ValueKind::HiddenMultiGridSyncArg: + case KernelArgMD::ValueKind::HiddenHostcallBuffer: + return true; + default: + return false; + } +} + +static std::pair +find_metadata(void *binary, size_t binSize) { + std::pair failure = {nullptr, nullptr}; + + Elf *e = elf_memory(static_cast(binary), binSize); + if (elf_kind(e) != ELF_K_ELF) { + return failure; + } + + size_t numpHdrs; + if (elf_getphdrnum(e, &numpHdrs) != 0) { + return failure; + } + + for (size_t i = 0; i < numpHdrs; ++i) { + GElf_Phdr pHdr; + if (gelf_getphdr(e, i, &pHdr) != &pHdr) { + continue; + } + // Look for the runtime metadata note + if (pHdr.p_type == PT_NOTE && pHdr.p_align >= sizeof(int)) { + // Iterate over the notes in this segment + address ptr = (address)binary + pHdr.p_offset; + address segmentEnd = ptr + pHdr.p_filesz; + + while (ptr < segmentEnd) { + Elf_Note *note = reinterpret_cast(ptr); + address name = (address)¬e[1]; + + if (note->n_type == 7 || note->n_type == 8) { + return failure; + } else if (note->n_type == 10 /* NT_AMD_AMDGPU_HSA_METADATA */ && + note->n_namesz == sizeof "AMD" && + !memcmp(name, "AMD", note->n_namesz)) { + // code object v2 uses yaml metadata, no longer supported + return failure; + } else if (note->n_type == 32 /* NT_AMDGPU_METADATA */ && + note->n_namesz == sizeof "AMDGPU" && + !memcmp(name, "AMDGPU", note->n_namesz)) { + + // n_descsz = 485 + // value is padded to 4 byte alignment, may want to move end up to + // match + size_t offset = sizeof(uint32_t) * 3 /* fields */ + + sizeof("AMDGPU") /* name */ + + 1 /* padding to 4 byte alignment */; + + // Including the trailing padding means both pointers are 4 bytes + // aligned, which may be useful later. + unsigned char *metadata_start = (unsigned char *)ptr + offset; + unsigned char *metadata_end = + metadata_start + core::alignUp(note->n_descsz, 4); + return {metadata_start, metadata_end}; + } + ptr += sizeof(*note) + core::alignUp(note->n_namesz, sizeof(int)) + + core::alignUp(note->n_descsz, sizeof(int)); + } + } + } + + return failure; +} + +namespace { +int map_lookup_array(msgpack::byte_range message, const char *needle, + msgpack::byte_range *res, uint64_t *size) { + unsigned count = 0; + struct s : msgpack::functors_defaults { + s(unsigned &count, uint64_t *size) : count(count), size(size) {} + unsigned &count; + uint64_t *size; + const unsigned char *handle_array(uint64_t N, msgpack::byte_range bytes) { + count++; + *size = N; + return bytes.end; + } + }; + + msgpack::foreach_map(message, + [&](msgpack::byte_range key, msgpack::byte_range value) { + if (msgpack::message_is_string(key, needle)) { + // If the message is an array, record number of + // elements in *size + msgpack::handle_msgpack(value, {count, size}); + // return the whole array + *res = value; + } + }); + // Only claim success if exactly one key/array pair matched + return count != 1; +} + +int map_lookup_string(msgpack::byte_range message, const char *needle, + std::string *res) { + unsigned count = 0; + struct s : public msgpack::functors_defaults { + s(unsigned &count, std::string *res) : count(count), res(res) {} + unsigned &count; + std::string *res; + void handle_string(size_t N, const unsigned char *str) { + count++; + *res = std::string(str, str + N); + } + }; + msgpack::foreach_map(message, + [&](msgpack::byte_range key, msgpack::byte_range value) { + if (msgpack::message_is_string(key, needle)) { + msgpack::handle_msgpack(value, {count, res}); + } + }); + return count != 1; +} + +int map_lookup_uint64_t(msgpack::byte_range message, const char *needle, + uint64_t *res) { + unsigned count = 0; + msgpack::foreach_map(message, + [&](msgpack::byte_range key, msgpack::byte_range value) { + if (msgpack::message_is_string(key, needle)) { + msgpack::foronly_unsigned(value, [&](uint64_t x) { + count++; + *res = x; + }); + } + }); + return count != 1; +} + +int array_lookup_element(msgpack::byte_range message, uint64_t elt, + msgpack::byte_range *res) { + int rc = 1; + uint64_t i = 0; + msgpack::foreach_array(message, [&](msgpack::byte_range value) { + if (i == elt) { + *res = value; + rc = 0; + } + i++; + }); + return rc; +} + +int populate_kernelArgMD(msgpack::byte_range args_element, + KernelArgMD *kernelarg) { + using namespace msgpack; + int error = 0; + foreach_map(args_element, [&](byte_range key, byte_range value) -> void { + if (message_is_string(key, ".name")) { + foronly_string(value, [&](size_t N, const unsigned char *str) { + kernelarg->name_ = std::string(str, str + N); + }); + } else if (message_is_string(key, ".type_name")) { + foronly_string(value, [&](size_t N, const unsigned char *str) { + kernelarg->typeName_ = std::string(str, str + N); + }); + } else if (message_is_string(key, ".size")) { + foronly_unsigned(value, [&](uint64_t x) { kernelarg->size_ = x; }); + } else if (message_is_string(key, ".offset")) { + foronly_unsigned(value, [&](uint64_t x) { kernelarg->offset_ = x; }); + } else if (message_is_string(key, ".value_kind")) { + foronly_string(value, [&](size_t N, const unsigned char *str) { + std::string s = std::string(str, str + N); + auto itValueKind = ArgValueKind.find(s); + if (itValueKind != ArgValueKind.end()) { + kernelarg->valueKind_ = itValueKind->second; + } + }); + } + }); + return error; +} +} // namespace + +static hsa_status_t get_code_object_custom_metadata(void *binary, + size_t binSize, int gpu) { + // parse code object with different keys from v2 + // also, the kernel name is not the same as the symbol name -- so a + // symbol->name map is needed + + std::pair metadata = + find_metadata(binary, binSize); + if (!metadata.first) { + return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + + uint64_t kernelsSize = 0; + int msgpack_errors = 0; + msgpack::byte_range kernel_array; + msgpack_errors = + map_lookup_array({metadata.first, metadata.second}, "amdhsa.kernels", + &kernel_array, &kernelsSize); + msgpackErrorCheck(kernels lookup in program metadata, msgpack_errors); + + for (size_t i = 0; i < kernelsSize; i++) { + assert(msgpack_errors == 0); + std::string kernelName; + std::string languageName; + std::string symbolName; + + msgpack::byte_range element; + msgpack_errors += array_lookup_element(kernel_array, i, &element); + msgpackErrorCheck(element lookup in kernel metadata, msgpack_errors); + + msgpack_errors += map_lookup_string(element, ".name", &kernelName); + msgpack_errors += map_lookup_string(element, ".language", &languageName); + msgpack_errors += map_lookup_string(element, ".symbol", &symbolName); + msgpackErrorCheck(strings lookup in kernel metadata, msgpack_errors); + + atl_kernel_info_t info = {0, 0, 0, 0, 0, {}, {}, {}}; + size_t kernel_explicit_args_size = 0; + uint64_t kernel_segment_size; + msgpack_errors += map_lookup_uint64_t(element, ".kernarg_segment_size", + &kernel_segment_size); + msgpackErrorCheck(kernarg segment size metadata lookup in kernel metadata, + msgpack_errors); + + // create a map from symbol to name + DEBUG_PRINT("Kernel symbol %s; Name: %s; Size: %lu\n", symbolName.c_str(), + kernelName.c_str(), kernel_segment_size); + KernelNameMap[symbolName] = kernelName; + + bool hasHiddenArgs = false; + if (kernel_segment_size > 0) { + uint64_t argsSize; + size_t offset = 0; + + msgpack::byte_range args_array; + msgpack_errors += + map_lookup_array(element, ".args", &args_array, &argsSize); + msgpackErrorCheck(kernel args metadata lookup in kernel metadata, + msgpack_errors); + + info.num_args = argsSize; + + for (size_t i = 0; i < argsSize; ++i) { + KernelArgMD lcArg; + + msgpack::byte_range args_element; + msgpack_errors += array_lookup_element(args_array, i, &args_element); + msgpackErrorCheck(iterate args map in kernel args metadata, + msgpack_errors); + + msgpack_errors += populate_kernelArgMD(args_element, &lcArg); + msgpackErrorCheck(iterate args map in kernel args metadata, + msgpack_errors); + + // TODO(ashwinma): should the below population actions be done only for + // non-implicit args? + // populate info with sizes and offsets + info.arg_sizes.push_back(lcArg.size_); + // v3 has offset field and not align field + size_t new_offset = lcArg.offset_; + size_t padding = new_offset - offset; + offset = new_offset; + info.arg_offsets.push_back(lcArg.offset_); + DEBUG_PRINT("Arg[%lu] \"%s\" (%u, %u)\n", i, lcArg.name_.c_str(), + lcArg.size_, lcArg.offset_); + offset += lcArg.size_; + + // check if the arg is a hidden/implicit arg + // this logic assumes that all hidden args are 8-byte aligned + if (!isImplicit(lcArg.valueKind_)) { + kernel_explicit_args_size += lcArg.size_; + } else { + hasHiddenArgs = true; + } + kernel_explicit_args_size += padding; + } + } + + // add size of implicit args, e.g.: offset x, y and z and pipe pointer, but + // in ATMI, do not count the compiler set implicit args, but set your own + // implicit args by discounting the compiler set implicit args + info.kernel_segment_size = + (hasHiddenArgs ? kernel_explicit_args_size : kernel_segment_size) + + sizeof(atmi_implicit_args_t); + DEBUG_PRINT("[%s: kernarg seg size] (%lu --> %u)\n", kernelName.c_str(), + kernel_segment_size, info.kernel_segment_size); + + // kernel received, now add it to the kernel info table + KernelInfoTable[gpu][kernelName] = info; + } + + return HSA_STATUS_SUCCESS; +} + +static hsa_status_t populate_InfoTables(hsa_executable_t executable, + hsa_executable_symbol_t symbol, + void *data) { + int gpu = *static_cast(data); + hsa_symbol_kind_t type; + + uint32_t name_length; + hsa_status_t err; + err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, + &type); + ErrorCheck(Symbol info extraction, err); + DEBUG_PRINT("Exec Symbol type: %d\n", type); + if (type == HSA_SYMBOL_KIND_KERNEL) { + err = hsa_executable_symbol_get_info( + symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &name_length); + ErrorCheck(Symbol info extraction, err); + char *name = reinterpret_cast(malloc(name_length + 1)); + err = hsa_executable_symbol_get_info(symbol, + HSA_EXECUTABLE_SYMBOL_INFO_NAME, name); + ErrorCheck(Symbol info extraction, err); + name[name_length] = 0; + + if (KernelNameMap.find(std::string(name)) == KernelNameMap.end()) { + // did not find kernel name in the kernel map; this can happen only + // if the ROCr API for getting symbol info (name) is different from + // the comgr method of getting symbol info + ErrorCheck(Invalid kernel name, HSA_STATUS_ERROR_INVALID_CODE_OBJECT); + } + atl_kernel_info_t info; + std::string kernelName = KernelNameMap[std::string(name)]; + // by now, the kernel info table should already have an entry + // because the non-ROCr custom code object parsing is called before + // iterating over the code object symbols using ROCr + if (KernelInfoTable[gpu].find(kernelName) == KernelInfoTable[gpu].end()) { + ErrorCheck(Finding the entry kernel info table, + HSA_STATUS_ERROR_INVALID_CODE_OBJECT); + } + // found, so assign and update + info = KernelInfoTable[gpu][kernelName]; + + /* Extract dispatch information from the symbol */ + err = hsa_executable_symbol_get_info( + symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, + &(info.kernel_object)); + ErrorCheck(Extracting the symbol from the executable, err); + err = hsa_executable_symbol_get_info( + symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, + &(info.group_segment_size)); + ErrorCheck(Extracting the group segment size from the executable, err); + err = hsa_executable_symbol_get_info( + symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, + &(info.private_segment_size)); + ErrorCheck(Extracting the private segment from the executable, err); + + DEBUG_PRINT( + "Kernel %s --> %lx symbol %u group segsize %u pvt segsize %u bytes " + "kernarg\n", + kernelName.c_str(), info.kernel_object, info.group_segment_size, + info.private_segment_size, info.kernel_segment_size); + + // assign it back to the kernel info table + KernelInfoTable[gpu][kernelName] = info; + free(name); + } else if (type == HSA_SYMBOL_KIND_VARIABLE) { + err = hsa_executable_symbol_get_info( + symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &name_length); + ErrorCheck(Symbol info extraction, err); + char *name = reinterpret_cast(malloc(name_length + 1)); + err = hsa_executable_symbol_get_info(symbol, + HSA_EXECUTABLE_SYMBOL_INFO_NAME, name); + ErrorCheck(Symbol info extraction, err); + name[name_length] = 0; + + atl_symbol_info_t info; + + err = hsa_executable_symbol_get_info( + symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &(info.addr)); + ErrorCheck(Symbol info address extraction, err); + + err = hsa_executable_symbol_get_info( + symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &(info.size)); + ErrorCheck(Symbol info size extraction, err); + + atmi_mem_place_t place = ATMI_MEM_PLACE(ATMI_DEVTYPE_GPU, gpu, 0); + DEBUG_PRINT("Symbol %s = %p (%u bytes)\n", name, (void *)info.addr, + info.size); + register_allocation(reinterpret_cast(info.addr), (size_t)info.size, + place); + SymbolInfoTable[gpu][std::string(name)] = info; + if (strcmp(name, "needs_hostcall_buffer") == 0) + g_atmi_hostcall_required = true; + free(name); + } else { + DEBUG_PRINT("Symbol is an indirect function\n"); + } + return HSA_STATUS_SUCCESS; +} + +atmi_status_t Runtime::RegisterModuleFromMemory( + void *module_bytes, size_t module_size, atmi_place_t place, + atmi_status_t (*on_deserialized_data)(void *data, size_t size, + void *cb_state), + void *cb_state) { + hsa_status_t err; + int gpu = place.device_id; + assert(gpu >= 0); + + DEBUG_PRINT("Trying to load module to GPU-%d\n", gpu); + ATLGPUProcessor &proc = get_processor(place); + hsa_agent_t agent = proc.agent(); + hsa_executable_t executable = {0}; + hsa_profile_t agent_profile; + + err = hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_profile); + ErrorCheck(Query the agent profile, err); + // FIXME: Assume that every profile is FULL until we understand how to build + // GCN with base profile + agent_profile = HSA_PROFILE_FULL; + /* Create the empty executable. */ + err = hsa_executable_create(agent_profile, HSA_EXECUTABLE_STATE_UNFROZEN, "", + &executable); + ErrorCheck(Create the executable, err); + + bool module_load_success = false; + do // Existing control flow used continue, preserve that for this patch + { + { + // Some metadata info is not available through ROCr API, so use custom + // code object metadata parsing to collect such metadata info + + err = get_code_object_custom_metadata(module_bytes, module_size, gpu); + ErrorCheckAndContinue(Getting custom code object metadata, err); + + // Deserialize code object. + hsa_code_object_t code_object = {0}; + err = hsa_code_object_deserialize(module_bytes, module_size, NULL, + &code_object); + ErrorCheckAndContinue(Code Object Deserialization, err); + assert(0 != code_object.handle); + + // Mutating the device image here avoids another allocation & memcpy + void *code_object_alloc_data = + reinterpret_cast(code_object.handle); + atmi_status_t atmi_err = + on_deserialized_data(code_object_alloc_data, module_size, cb_state); + ATMIErrorCheck(Error in deserialized_data callback, atmi_err); + + /* Load the code object. */ + err = + hsa_executable_load_code_object(executable, agent, code_object, NULL); + ErrorCheckAndContinue(Loading the code object, err); + + // cannot iterate over symbols until executable is frozen + } + module_load_success = true; + } while (0); + DEBUG_PRINT("Modules loaded successful? %d\n", module_load_success); + if (module_load_success) { + /* Freeze the executable; it can now be queried for symbols. */ + err = hsa_executable_freeze(executable, ""); + ErrorCheck(Freeze the executable, err); + + err = hsa_executable_iterate_symbols(executable, populate_InfoTables, + static_cast(&gpu)); + ErrorCheck(Iterating over symbols for execuatable, err); + + // save the executable and destroy during finalize + g_executables.push_back(executable); + return ATMI_STATUS_SUCCESS; + } else { + return ATMI_STATUS_ERROR; + } +} + +} // namespace core diff --git a/openmp/libomptarget/plugins/amdgpu/impl/utils.cpp b/openmp/libomptarget/plugins/amdgpu/impl/utils.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins/amdgpu/impl/utils.cpp @@ -0,0 +1,136 @@ +/*===-------------------------------------------------------------------------- + * ATMI (Asynchronous Task and Memory Interface) + * + * This file is distributed under the MIT License. See LICENSE.txt for details. + *===------------------------------------------------------------------------*/ +#include "internal.h" +#include "rt.h" + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include +#include +#include +#include +#include + +/* + * Helper functions + */ +const char *get_atmi_error_string(atmi_status_t err) { + switch (err) { + case ATMI_STATUS_SUCCESS: + return "ATMI_STATUS_SUCCESS"; + case ATMI_STATUS_UNKNOWN: + return "ATMI_STATUS_UNKNOWN"; + case ATMI_STATUS_ERROR: + return "ATMI_STATUS_ERROR"; + default: + return ""; + } +} + +const char *get_error_string(hsa_status_t err) { + switch (err) { + case HSA_STATUS_SUCCESS: + return "HSA_STATUS_SUCCESS"; + case HSA_STATUS_INFO_BREAK: + return "HSA_STATUS_INFO_BREAK"; + case HSA_STATUS_ERROR: + return "HSA_STATUS_ERROR"; + case HSA_STATUS_ERROR_INVALID_ARGUMENT: + return "HSA_STATUS_ERROR_INVALID_ARGUMENT"; + case HSA_STATUS_ERROR_INVALID_QUEUE_CREATION: + return "HSA_STATUS_ERROR_INVALID_QUEUE_CREATION"; + case HSA_STATUS_ERROR_INVALID_ALLOCATION: + return "HSA_STATUS_ERROR_INVALID_ALLOCATION"; + case HSA_STATUS_ERROR_INVALID_AGENT: + return "HSA_STATUS_ERROR_INVALID_AGENT"; + case HSA_STATUS_ERROR_INVALID_REGION: + return "HSA_STATUS_ERROR_INVALID_REGION"; + case HSA_STATUS_ERROR_INVALID_SIGNAL: + return "HSA_STATUS_ERROR_INVALID_SIGNAL"; + case HSA_STATUS_ERROR_INVALID_QUEUE: + return "HSA_STATUS_ERROR_INVALID_QUEUE"; + case HSA_STATUS_ERROR_OUT_OF_RESOURCES: + return "HSA_STATUS_ERROR_OUT_OF_RESOURCES"; + case HSA_STATUS_ERROR_INVALID_PACKET_FORMAT: + return "HSA_STATUS_ERROR_INVALID_PACKET_FORMAT"; + case HSA_STATUS_ERROR_RESOURCE_FREE: + return "HSA_STATUS_ERROR_RESOURCE_FREE"; + case HSA_STATUS_ERROR_NOT_INITIALIZED: + return "HSA_STATUS_ERROR_NOT_INITIALIZED"; + case HSA_STATUS_ERROR_REFCOUNT_OVERFLOW: + return "HSA_STATUS_ERROR_REFCOUNT_OVERFLOW"; + case HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS: + return "HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS"; + case HSA_STATUS_ERROR_INVALID_INDEX: + return "HSA_STATUS_ERROR_INVALID_INDEX"; + case HSA_STATUS_ERROR_INVALID_ISA: + return "HSA_STATUS_ERROR_INVALID_ISA"; + case HSA_STATUS_ERROR_INVALID_ISA_NAME: + return "HSA_STATUS_ERROR_INVALID_ISA_NAME"; + case HSA_STATUS_ERROR_INVALID_CODE_OBJECT: + return "HSA_STATUS_ERROR_INVALID_CODE_OBJECT"; + case HSA_STATUS_ERROR_INVALID_EXECUTABLE: + return "HSA_STATUS_ERROR_INVALID_EXECUTABLE"; + case HSA_STATUS_ERROR_FROZEN_EXECUTABLE: + return "HSA_STATUS_ERROR_FROZEN_EXECUTABLE"; + case HSA_STATUS_ERROR_INVALID_SYMBOL_NAME: + return "HSA_STATUS_ERROR_INVALID_SYMBOL_NAME"; + case HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED: + return "HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED"; + case HSA_STATUS_ERROR_VARIABLE_UNDEFINED: + return "HSA_STATUS_ERROR_VARIABLE_UNDEFINED"; + case HSA_STATUS_ERROR_EXCEPTION: + return "HSA_STATUS_ERROR_EXCEPTION"; + } +} + +namespace core { +/* + * Environment variables + */ +void Environment::GetEnvAll() { + std::string var = GetEnv("ATMI_HELP"); + if (!var.empty()) { + std::cout << "ATMI_MAX_HSA_QUEUE_SIZE : positive integer" << std::endl + << "ATMI_MAX_KERNEL_TYPES : positive integer" << std::endl + << "ATMI_DEVICE_GPU_WORKERS : positive integer" << std::endl + << "ATMI_DEVICE_CPU_WORKERS : positive integer" << std::endl + << "ATMI_DEBUG : 1 for printing out trace/debug info" + << std::endl; + exit(0); + } + + var = GetEnv("ATMI_MAX_HSA_QUEUE_SIZE"); + if (!var.empty()) + max_queue_size_ = std::stoi(var); + + var = GetEnv("ATMI_MAX_KERNEL_TYPES"); + if (!var.empty()) + max_kernel_types_ = std::stoi(var); + + /* TODO: If we get a good use case for device-specific worker count, we + * should explore it, but let us keep the worker count uniform for all + * devices of a type until that time + */ + var = GetEnv("ATMI_DEVICE_GPU_WORKERS"); + if (!var.empty()) + num_gpu_queues_ = std::stoi(var); + + /* TODO: If we get a good use case for device-specific worker count, we + * should explore it, but let us keep the worker count uniform for all + * devices of a type until that time + */ + var = GetEnv("ATMI_DEVICE_CPU_WORKERS"); + if (!var.empty()) + num_cpu_queues_ = std::stoi(var); + + var = GetEnv("ATMI_DEBUG"); + if (!var.empty()) + debug_mode_ = std::stoi(var); +} +} // namespace core diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -0,0 +1,1713 @@ +//===----RTLs/hsa/src/rtl.cpp - Target RTLs Implementation -------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// RTL for hsa machine +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Header from ATMI interface +#include "atmi_interop_hsa.h" +#include "atmi_runtime.h" + +#include "internal.h" + +#include "internal.h" + +#include "omptargetplugin.h" + +// Get static gpu grid values from clang target-specific constants managed +// in the header file llvm/Frontend/OpenMP/OMPGridValues.h +// Copied verbatim to meet the requirement that libomptarget builds without +// a copy of llvm checked out nearby +namespace llvm { +namespace omp { +enum GVIDX { + /// The maximum number of workers in a kernel. + /// (THREAD_ABSOLUTE_LIMIT) - (GV_Warp_Size), might be issue for blockDim.z + GV_Threads, + /// The size reserved for data in a shared memory slot. + GV_Slot_Size, + /// The default value of maximum number of threads in a worker warp. + GV_Warp_Size, + /// Alternate warp size for some AMDGCN architectures. Same as GV_Warp_Size + /// for NVPTX. + GV_Warp_Size_32, + /// The number of bits required to represent the max number of threads in warp + GV_Warp_Size_Log2, + /// GV_Warp_Size * GV_Slot_Size, + GV_Warp_Slot_Size, + /// the maximum number of teams. + GV_Max_Teams, + /// Global Memory Alignment + GV_Mem_Align, + /// (~0u >> (GV_Warp_Size - GV_Warp_Size_Log2)) + GV_Warp_Size_Log2_Mask, + // An alternative to the heavy data sharing infrastructure that uses global + // memory is one that uses device __shared__ memory. The amount of such space + // (in bytes) reserved by the OpenMP runtime is noted here. + GV_SimpleBufferSize, + // The absolute maximum team size for a working group + GV_Max_WG_Size, + // The default maximum team size for a working group + GV_Default_WG_Size, + // This is GV_Max_WG_Size / GV_WarpSize. 32 for NVPTX and 16 for AMDGCN. + GV_Max_Warp_Number, + /// The slot size that should be reserved for a working warp. + /// (~0u >> (GV_Warp_Size - GV_Warp_Size_Log2)) + GV_Warp_Size_Log2_MaskL +}; + +static constexpr unsigned AMDGPUGpuGridValues[] = { + 448, // GV_Threads + 256, // GV_Slot_Size + 64, // GV_Warp_Size + 32, // GV_Warp_Size_32 + 6, // GV_Warp_Size_Log2 + 64 * 256, // GV_Warp_Slot_Size + 128, // GV_Max_Teams + 256, // GV_Mem_Align + 63, // GV_Warp_Size_Log2_Mask + 896, // GV_SimpleBufferSize + 1024, // GV_Max_WG_Size, + 256, // GV_Defaut_WG_Size + 1024 / 64, // GV_Max_WG_Size / GV_WarpSize + 63 // GV_Warp_Size_Log2_MaskL +}; +} // namespace omp +} // namespace llvm + +#ifndef TARGET_NAME +#define TARGET_NAME AMDHSA +#endif + +int print_kernel_trace; + +// Size of the target call stack struture +uint32_t TgtStackItemSize = 0; + +#ifdef OMPTARGET_DEBUG +static int DebugLevel = 0; + +#define GETNAME2(name) #name +#define GETNAME(name) GETNAME2(name) +#define DP(...) \ + do { \ + if (DebugLevel > 0) { \ + DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \ + } \ + } while (false) +#else // OMPTARGET_DEBUG +#define DP(...) \ + {} +#endif // OMPTARGET_DEBUG + +#undef check // Drop definition from internal.h +#ifdef OMPTARGET_DEBUG +#define check(msg, status) \ + if (status != ATMI_STATUS_SUCCESS) { \ + /* fprintf(stderr, "[%s:%d] %s failed.\n", __FILE__, __LINE__, #msg);*/ \ + DP(#msg " failed\n"); \ + /*assert(0);*/ \ + } else { \ + /* fprintf(stderr, "[%s:%d] %s succeeded.\n", __FILE__, __LINE__, #msg); \ + */ \ + DP(#msg " succeeded\n"); \ + } +#else +#define check(msg, status) \ + {} +#endif + +#include "../../common/elf_common.c" + +static bool elf_machine_id_is_amdgcn(__tgt_device_image *image) { + const uint16_t amdgcnMachineID = 224; + int32_t r = elf_check_machine(image, amdgcnMachineID); + if (!r) { + DP("Supported machine ID not found\n"); + } + return r; +} + +/// Keep entries table per device +struct FuncOrGblEntryTy { + __tgt_target_table Table; + std::vector<__tgt_offload_entry> Entries; +}; + +enum ExecutionModeType { + SPMD, // constructors, destructors, + // combined constructs (`teams distribute parallel for [simd]`) + GENERIC, // everything else + NONE +}; + +struct KernelArgPool { +private: + static pthread_mutex_t mutex; + +public: + uint32_t kernarg_segment_size; + void *kernarg_region = nullptr; + std::queue free_kernarg_segments; + + uint32_t kernarg_size_including_implicit() { + return kernarg_segment_size + sizeof(atmi_implicit_args_t); + } + + ~KernelArgPool() { + if (kernarg_region) { + auto r = hsa_amd_memory_pool_free(kernarg_region); + assert(r == HSA_STATUS_SUCCESS); + ErrorCheck(Memory pool free, r); + } + } + + // Can't really copy or move a mutex + KernelArgPool() = default; + KernelArgPool(const KernelArgPool &) = delete; + KernelArgPool(KernelArgPool &&) = delete; + + KernelArgPool(uint32_t kernarg_segment_size) + : kernarg_segment_size(kernarg_segment_size) { + + // atmi uses one pool per kernel for all gpus, with a fixed upper size + // preserving that exact scheme here, including the queue + { + hsa_status_t err = hsa_amd_memory_pool_allocate( + atl_gpu_kernarg_pools[0], + kernarg_size_including_implicit() * MAX_NUM_KERNELS, 0, + &kernarg_region); + ErrorCheck(Allocating memory for the executable-kernel, err); + core::allow_access_to_all_gpu_agents(kernarg_region); + + for (int i = 0; i < MAX_NUM_KERNELS; i++) { + free_kernarg_segments.push(i); + } + } + } + + void *allocate(uint64_t arg_num) { + assert((arg_num * sizeof(void *)) == kernarg_segment_size); + lock l(&mutex); + void *res = nullptr; + if (!free_kernarg_segments.empty()) { + + int free_idx = free_kernarg_segments.front(); + res = static_cast(static_cast(kernarg_region) + + (free_idx * kernarg_size_including_implicit())); + assert(free_idx == pointer_to_index(res)); + free_kernarg_segments.pop(); + } + return res; + } + + void deallocate(void *ptr) { + lock l(&mutex); + int idx = pointer_to_index(ptr); + free_kernarg_segments.push(idx); + } + +private: + int pointer_to_index(void *ptr) { + ptrdiff_t bytes = + static_cast(ptr) - static_cast(kernarg_region); + assert(bytes >= 0); + assert(bytes % kernarg_size_including_implicit() == 0); + return bytes / kernarg_size_including_implicit(); + } + struct lock { + lock(pthread_mutex_t *m) : m(m) { pthread_mutex_lock(m); } + ~lock() { pthread_mutex_unlock(m); } + pthread_mutex_t *m; + }; +}; +pthread_mutex_t KernelArgPool::mutex = PTHREAD_MUTEX_INITIALIZER; + +std::unordered_map> + KernelArgPoolMap; + +/// Use a single entity to encode a kernel and a set of flags +struct KernelTy { + // execution mode of kernel + // 0 - SPMD mode (without master warp) + // 1 - Generic mode (with master warp) + int8_t ExecutionMode; + int16_t ConstWGSize; + int8_t MaxParLevel; + int32_t device_id; + void *CallStackAddr; + const char *Name; + + KernelTy(int8_t _ExecutionMode, int16_t _ConstWGSize, int8_t _MaxParLevel, + int32_t _device_id, void *_CallStackAddr, const char *_Name, + uint32_t _kernarg_segment_size) + : ExecutionMode(_ExecutionMode), ConstWGSize(_ConstWGSize), + MaxParLevel(_MaxParLevel), device_id(_device_id), + CallStackAddr(_CallStackAddr), Name(_Name) { + DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode); + + std::string N(_Name); + if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) { + KernelArgPoolMap.insert( + std::make_pair(N, std::unique_ptr( + new KernelArgPool(_kernarg_segment_size)))); + } + } +}; + +/// List that contains all the kernels. +/// FIXME: we may need this to be per device and per library. +std::list KernelsList; + +// ATMI API to get gpu and gpu memory place +static atmi_place_t get_gpu_place(int device_id) { + return ATMI_PLACE_GPU(0, device_id); +} +static atmi_mem_place_t get_gpu_mem_place(int device_id) { + return ATMI_MEM_PLACE_GPU_MEM(0, device_id, 0); +} + +static std::vector find_gpu_agents() { + std::vector res; + + hsa_status_t err = hsa_iterate_agents( + [](hsa_agent_t agent, void *data) -> hsa_status_t { + std::vector *res = + static_cast *>(data); + + hsa_device_type_t device_type; + // get_info fails iff HSA runtime not yet initialized + hsa_status_t err = + hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type); + if (print_kernel_trace > 0 && err != HSA_STATUS_SUCCESS) + printf("rtl.cpp: err %d\n", err); + assert(err == HSA_STATUS_SUCCESS); + + if (device_type == HSA_DEVICE_TYPE_GPU) { + res->push_back(agent); + } + return HSA_STATUS_SUCCESS; + }, + &res); + + // iterate_agents fails iff HSA runtime not yet initialized + if (print_kernel_trace > 0 && err != HSA_STATUS_SUCCESS) + printf("rtl.cpp: err %d\n", err); + assert(err == HSA_STATUS_SUCCESS); + return res; +} + +static void callbackQueue(hsa_status_t status, hsa_queue_t *source, + void *data) { + if (status != HSA_STATUS_SUCCESS) { + const char *status_string; + if (hsa_status_string(status, &status_string) != HSA_STATUS_SUCCESS) { + status_string = "unavailable"; + } + fprintf(stderr, "[%s:%d] GPU error in queue %p %d (%s)\n", __FILE__, + __LINE__, source, status, status_string); + abort(); + } +} + +namespace core { +void packet_store_release(uint32_t *packet, uint16_t header, uint16_t rest) { + __atomic_store_n(packet, header | (rest << 16), __ATOMIC_RELEASE); +} + +uint16_t create_header(hsa_packet_type_t type, int barrier, + atmi_task_fence_scope_t acq_fence, + atmi_task_fence_scope_t rel_fence) { + uint16_t header = type << HSA_PACKET_HEADER_TYPE; + header |= barrier << HSA_PACKET_HEADER_BARRIER; + header |= (hsa_fence_scope_t) static_cast( + acq_fence << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE); + header |= (hsa_fence_scope_t) static_cast( + rel_fence << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); + return header; +} +} // namespace core + +/// Class containing all the device information +class RTLDeviceInfoTy { + std::vector> FuncGblEntries; + +public: + int NumberOfDevices; + + // GPU devices + std::vector HSAAgents; + std::vector HSAQueues; // one per gpu + + // Device properties + std::vector ComputeUnits; + std::vector GroupsPerDevice; + std::vector ThreadsPerGroup; + std::vector WarpSize; + + // OpenMP properties + std::vector NumTeams; + std::vector NumThreads; + + // OpenMP Environment properties + int EnvNumTeams; + int EnvTeamLimit; + int EnvMaxTeamsDefault; + + // OpenMP Requires Flags + int64_t RequiresFlags; + + // Resource pools + SignalPoolT FreeSignalPool; + + static const int HardTeamLimit = 1 << 20; // 1 Meg + static const int DefaultNumTeams = 128; + static const int Max_Teams = + llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Max_Teams]; + static const int Warp_Size = + llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Warp_Size]; + static const int Max_WG_Size = + llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Max_WG_Size]; + static const int Default_WG_Size = + llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Default_WG_Size]; + + // Record entry point associated with device + void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) { + assert(device_id < (int32_t)FuncGblEntries.size() && + "Unexpected device id!"); + FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); + + E.Entries.push_back(entry); + } + + // Return true if the entry is associated with device + bool findOffloadEntry(int32_t device_id, void *addr) { + assert(device_id < (int32_t)FuncGblEntries.size() && + "Unexpected device id!"); + FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); + + for (auto &it : E.Entries) { + if (it.addr == addr) + return true; + } + + return false; + } + + // Return the pointer to the target entries table + __tgt_target_table *getOffloadEntriesTable(int32_t device_id) { + assert(device_id < (int32_t)FuncGblEntries.size() && + "Unexpected device id!"); + FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); + + int32_t size = E.Entries.size(); + + // Table is empty + if (!size) + return 0; + + __tgt_offload_entry *begin = &E.Entries[0]; + __tgt_offload_entry *end = &E.Entries[size - 1]; + + // Update table info according to the entries and return the pointer + E.Table.EntriesBegin = begin; + E.Table.EntriesEnd = ++end; + + return &E.Table; + } + + // Clear entries table for a device + void clearOffloadEntriesTable(int device_id) { + assert(device_id < (int32_t)FuncGblEntries.size() && + "Unexpected device id!"); + FuncGblEntries[device_id].emplace_back(); + FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); + // KernelArgPoolMap.clear(); + E.Entries.clear(); + E.Table.EntriesBegin = E.Table.EntriesEnd = 0; + } + + RTLDeviceInfoTy() { +#ifdef OMPTARGET_DEBUG + if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) + DebugLevel = std::stoi(envStr); +#endif // OMPTARGET_DEBUG + + // LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr + // anytime. You do not need a debug library build. + // 0 => no tracing + // 1 => tracing dispatch only + // >1 => verbosity increase + if (char *envStr = getenv("LIBOMPTARGET_KERNEL_TRACE")) + print_kernel_trace = atoi(envStr); + else + print_kernel_trace = 0; + + DP("Start initializing HSA-ATMI\n"); + atmi_status_t err = atmi_init(); + if (err != ATMI_STATUS_SUCCESS) { + DP("Error when initializing HSA-ATMI\n"); + return; + } + + HSAAgents = find_gpu_agents(); + NumberOfDevices = (int)HSAAgents.size(); + + if (NumberOfDevices == 0) { + DP("There are no devices supporting HSA.\n"); + return; + } else { + DP("There are %d devices supporting HSA.\n", NumberOfDevices); + } + + // Init the device info + HSAQueues.resize(NumberOfDevices); + FuncGblEntries.resize(NumberOfDevices); + ThreadsPerGroup.resize(NumberOfDevices); + ComputeUnits.resize(NumberOfDevices); + GroupsPerDevice.resize(NumberOfDevices); + WarpSize.resize(NumberOfDevices); + NumTeams.resize(NumberOfDevices); + NumThreads.resize(NumberOfDevices); + + for (int i = 0; i < NumberOfDevices; i++) { + uint32_t queue_size = 0; + { + hsa_status_t err; + err = hsa_agent_get_info(HSAAgents[i], HSA_AGENT_INFO_QUEUE_MAX_SIZE, + &queue_size); + ErrorCheck(Querying the agent maximum queue size, err); + if (queue_size > core::Runtime::getInstance().getMaxQueueSize()) { + queue_size = core::Runtime::getInstance().getMaxQueueSize(); + } + } + + hsa_status_t rc = hsa_queue_create( + HSAAgents[i], queue_size, HSA_QUEUE_TYPE_MULTI, callbackQueue, NULL, + UINT32_MAX, UINT32_MAX, &HSAQueues[i]); + if (rc != HSA_STATUS_SUCCESS) { + DP("Failed to create HSA queues\n"); + return; + } + } + + for (int i = 0; i < NumberOfDevices; i++) { + ThreadsPerGroup[i] = RTLDeviceInfoTy::Default_WG_Size; + GroupsPerDevice[i] = RTLDeviceInfoTy::DefaultNumTeams; + ComputeUnits[i] = 1; + DP("Device %d: Initial groupsPerDevice %d & threadsPerGroup %d\n", i, + GroupsPerDevice[i], ThreadsPerGroup[i]); + } + + // Get environment variables regarding teams + char *envStr = getenv("OMP_TEAM_LIMIT"); + if (envStr) { + // OMP_TEAM_LIMIT has been set + EnvTeamLimit = std::stoi(envStr); + DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit); + } else { + EnvTeamLimit = -1; + } + envStr = getenv("OMP_NUM_TEAMS"); + if (envStr) { + // OMP_NUM_TEAMS has been set + EnvNumTeams = std::stoi(envStr); + DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams); + } else { + EnvNumTeams = -1; + } + // Get environment variables regarding expMaxTeams + envStr = getenv("OMP_MAX_TEAMS_DEFAULT"); + if (envStr) { + EnvMaxTeamsDefault = std::stoi(envStr); + DP("Parsed OMP_MAX_TEAMS_DEFAULT=%d\n", EnvMaxTeamsDefault); + } else { + EnvMaxTeamsDefault = -1; + } + + // Default state. + RequiresFlags = OMP_REQ_UNDEFINED; + } + + ~RTLDeviceInfoTy() { + DP("Finalizing the HSA-ATMI DeviceInfo.\n"); + KernelArgPoolMap.clear(); // calls hsa to free memory + atmi_finalize(); + } +}; + +pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER; + +// TODO: May need to drop the trailing to fields until deviceRTL is updated +struct omptarget_device_environmentTy { + int32_t debug_level; // gets value of envvar LIBOMPTARGET_DEVICE_RTL_DEBUG + // only useful for Debug build of deviceRTLs + int32_t num_devices; // gets number of active offload devices + int32_t device_num; // gets a value 0 to num_devices-1 +}; + +static RTLDeviceInfoTy DeviceInfo; + +namespace { + +int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size, + __tgt_async_info *AsyncInfoPtr) { + assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); + assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large"); + // Return success if we are not copying back to host from target. + if (!HstPtr) + return OFFLOAD_SUCCESS; + atmi_status_t err; + DP("Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size, + (long long unsigned)(Elf64_Addr)TgtPtr, + (long long unsigned)(Elf64_Addr)HstPtr); + err = atmi_memcpy(HstPtr, TgtPtr, (size_t)Size); + if (err != ATMI_STATUS_SUCCESS) { + DP("Error when copying data from device to host. Pointers: " + "host = 0x%016lx, device = 0x%016lx, size = %lld\n", + (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size); + return OFFLOAD_FAIL; + } + DP("DONE Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size, + (long long unsigned)(Elf64_Addr)TgtPtr, + (long long unsigned)(Elf64_Addr)HstPtr); + return OFFLOAD_SUCCESS; +} + +int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size, + __tgt_async_info *AsyncInfoPtr) { + assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); + atmi_status_t err; + assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large"); + // Return success if we are not doing host to target. + if (!HstPtr) + return OFFLOAD_SUCCESS; + + DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size, + (long long unsigned)(Elf64_Addr)HstPtr, + (long long unsigned)(Elf64_Addr)TgtPtr); + err = atmi_memcpy(TgtPtr, HstPtr, (size_t)Size); + if (err != ATMI_STATUS_SUCCESS) { + DP("Error when copying data from host to device. Pointers: " + "host = 0x%016lx, device = 0x%016lx, size = %lld\n", + (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size); + return OFFLOAD_FAIL; + } + return OFFLOAD_SUCCESS; +} + +// Async. +// The implementation was written with cuda streams in mind. The semantics of +// that are to execute kernels on a queue in order of insertion. A synchronise +// call then makes writes visible between host and device. This means a series +// of N data_submit_async calls are expected to execute serially. HSA offers +// various options to run the data copies concurrently. This may require changes +// to libomptarget. + +// __tgt_async_info* contains a void * Queue. Queue = 0 is used to indicate that +// there are no outstanding kernels that need to be synchronized. Any async call +// may be passed a Queue==0, at which point the cuda implementation will set it +// to non-null (see getStream). The cuda streams are per-device. Upstream may +// change this interface to explicitly initialize the async_info_pointer, but +// until then hsa lazily initializes it as well. + +void initAsyncInfoPtr(__tgt_async_info *async_info_ptr) { + // set non-null while using async calls, return to null to indicate completion + assert(async_info_ptr); + if (!async_info_ptr->Queue) { + async_info_ptr->Queue = reinterpret_cast(UINT64_MAX); + } +} +void finiAsyncInfoPtr(__tgt_async_info *async_info_ptr) { + assert(async_info_ptr); + assert(async_info_ptr->Queue); + async_info_ptr->Queue = 0; +} +} // namespace + +int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { + return elf_machine_id_is_amdgcn(image); +} + +int __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; } + +int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { + DP("Init requires flags to %ld\n", RequiresFlags); + DeviceInfo.RequiresFlags = RequiresFlags; + return RequiresFlags; +} + +int32_t __tgt_rtl_init_device(int device_id) { + hsa_status_t err; + + // this is per device id init + DP("Initialize the device id: %d\n", device_id); + + hsa_agent_t agent = DeviceInfo.HSAAgents[device_id]; + + // Get number of Compute Unit + uint32_t compute_units = 0; + err = hsa_agent_get_info( + agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, + &compute_units); + if (err != HSA_STATUS_SUCCESS) { + DeviceInfo.ComputeUnits[device_id] = 1; + DP("Error getting compute units : settiing to 1\n"); + } else { + DeviceInfo.ComputeUnits[device_id] = compute_units; + DP("Using %d compute unis per grid\n", DeviceInfo.ComputeUnits[device_id]); + } + if (print_kernel_trace > 1) + fprintf(stderr, "Device#%-2d CU's: %2d\n", device_id, + DeviceInfo.ComputeUnits[device_id]); + + // Query attributes to determine number of threads/block and blocks/grid. + uint16_t workgroup_max_dim[3]; + err = hsa_agent_get_info(agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM, + &workgroup_max_dim); + if (err != HSA_STATUS_SUCCESS) { + DeviceInfo.GroupsPerDevice[device_id] = RTLDeviceInfoTy::DefaultNumTeams; + DP("Error getting grid dims: num groups : %d\n", + RTLDeviceInfoTy::DefaultNumTeams); + } else if (workgroup_max_dim[0] <= RTLDeviceInfoTy::HardTeamLimit) { + DeviceInfo.GroupsPerDevice[device_id] = workgroup_max_dim[0]; + DP("Using %d ROCm blocks per grid\n", + DeviceInfo.GroupsPerDevice[device_id]); + } else { + DeviceInfo.GroupsPerDevice[device_id] = RTLDeviceInfoTy::HardTeamLimit; + DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping " + "at the hard limit\n", + workgroup_max_dim[0], RTLDeviceInfoTy::HardTeamLimit); + } + + // Get thread limit + hsa_dim3_t grid_max_dim; + err = hsa_agent_get_info(agent, HSA_AGENT_INFO_GRID_MAX_DIM, &grid_max_dim); + if (err == HSA_STATUS_SUCCESS) { + DeviceInfo.ThreadsPerGroup[device_id] = + reinterpret_cast(&grid_max_dim)[0] / + DeviceInfo.GroupsPerDevice[device_id]; + if ((DeviceInfo.ThreadsPerGroup[device_id] > + RTLDeviceInfoTy::Max_WG_Size) || + DeviceInfo.ThreadsPerGroup[device_id] == 0) { + DP("Capped thread limit: %d\n", RTLDeviceInfoTy::Max_WG_Size); + DeviceInfo.ThreadsPerGroup[device_id] = RTLDeviceInfoTy::Max_WG_Size; + } else { + DP("Using ROCm Queried thread limit: %d\n", + DeviceInfo.ThreadsPerGroup[device_id]); + } + } else { + DeviceInfo.ThreadsPerGroup[device_id] = RTLDeviceInfoTy::Max_WG_Size; + DP("Error getting max block dimension, use default:%d \n", + RTLDeviceInfoTy::Max_WG_Size); + } + + // Get wavefront size + uint32_t wavefront_size = 0; + err = + hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size); + if (err == HSA_STATUS_SUCCESS) { + DP("Queried wavefront size: %d\n", wavefront_size); + DeviceInfo.WarpSize[device_id] = wavefront_size; + } else { + DP("Default wavefront size: %d\n", + llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Warp_Size]); + DeviceInfo.WarpSize[device_id] = + llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Warp_Size]; + } + + // Adjust teams to the env variables + if (DeviceInfo.EnvTeamLimit > 0 && + DeviceInfo.GroupsPerDevice[device_id] > DeviceInfo.EnvTeamLimit) { + DeviceInfo.GroupsPerDevice[device_id] = DeviceInfo.EnvTeamLimit; + DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n", + DeviceInfo.EnvTeamLimit); + } + + // Set default number of teams + if (DeviceInfo.EnvNumTeams > 0) { + DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams; + DP("Default number of teams set according to environment %d\n", + DeviceInfo.EnvNumTeams); + } else { + DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams; + DP("Default number of teams set according to library's default %d\n", + RTLDeviceInfoTy::DefaultNumTeams); + } + + if (DeviceInfo.NumTeams[device_id] > DeviceInfo.GroupsPerDevice[device_id]) { + DeviceInfo.NumTeams[device_id] = DeviceInfo.GroupsPerDevice[device_id]; + DP("Default number of teams exceeds device limit, capping at %d\n", + DeviceInfo.GroupsPerDevice[device_id]); + } + + // Set default number of threads + DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::Default_WG_Size; + DP("Default number of threads set according to library's default %d\n", + RTLDeviceInfoTy::Default_WG_Size); + if (DeviceInfo.NumThreads[device_id] > + DeviceInfo.ThreadsPerGroup[device_id]) { + DeviceInfo.NumTeams[device_id] = DeviceInfo.ThreadsPerGroup[device_id]; + DP("Default number of threads exceeds device limit, capping at %d\n", + DeviceInfo.ThreadsPerGroup[device_id]); + } + + DP("Device %d: default limit for groupsPerDevice %d & threadsPerGroup %d\n", + device_id, DeviceInfo.GroupsPerDevice[device_id], + DeviceInfo.ThreadsPerGroup[device_id]); + + DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", device_id, + DeviceInfo.WarpSize[device_id], DeviceInfo.ThreadsPerGroup[device_id], + DeviceInfo.GroupsPerDevice[device_id], + DeviceInfo.GroupsPerDevice[device_id] * + DeviceInfo.ThreadsPerGroup[device_id]); + + return OFFLOAD_SUCCESS; +} + +namespace { +Elf64_Shdr *find_only_SHT_HASH(Elf *elf) { + size_t N; + int rc = elf_getshdrnum(elf, &N); + if (rc != 0) { + return nullptr; + } + + Elf64_Shdr *result = nullptr; + for (size_t i = 0; i < N; i++) { + Elf_Scn *scn = elf_getscn(elf, i); + if (scn) { + Elf64_Shdr *shdr = elf64_getshdr(scn); + if (shdr) { + if (shdr->sh_type == SHT_HASH) { + if (result == nullptr) { + result = shdr; + } else { + // multiple SHT_HASH sections not handled + return nullptr; + } + } + } + } + } + return result; +} + +const Elf64_Sym *elf_lookup(Elf *elf, char *base, Elf64_Shdr *section_hash, + const char *symname) { + + assert(section_hash); + size_t section_symtab_index = section_hash->sh_link; + Elf64_Shdr *section_symtab = + elf64_getshdr(elf_getscn(elf, section_symtab_index)); + size_t section_strtab_index = section_symtab->sh_link; + + const Elf64_Sym *symtab = + reinterpret_cast(base + section_symtab->sh_offset); + + const uint32_t *hashtab = + reinterpret_cast(base + section_hash->sh_offset); + + // Layout: + // nbucket + // nchain + // bucket[nbucket] + // chain[nchain] + uint32_t nbucket = hashtab[0]; + const uint32_t *bucket = &hashtab[2]; + const uint32_t *chain = &hashtab[nbucket + 2]; + + const size_t max = strlen(symname) + 1; + const uint32_t hash = elf_hash(symname); + for (uint32_t i = bucket[hash % nbucket]; i != 0; i = chain[i]) { + char *n = elf_strptr(elf, section_strtab_index, symtab[i].st_name); + if (strncmp(symname, n, max) == 0) { + return &symtab[i]; + } + } + + return nullptr; +} + +typedef struct { + void *addr = nullptr; + uint32_t size = UINT32_MAX; +} symbol_info; + +int get_symbol_info_without_loading(Elf *elf, char *base, const char *symname, + symbol_info *res) { + if (elf_kind(elf) != ELF_K_ELF) { + return 1; + } + + Elf64_Shdr *section_hash = find_only_SHT_HASH(elf); + if (!section_hash) { + return 1; + } + + const Elf64_Sym *sym = elf_lookup(elf, base, section_hash, symname); + if (!sym) { + return 1; + } + + if (sym->st_size > UINT32_MAX) { + return 1; + } + + res->size = static_cast(sym->st_size); + res->addr = sym->st_value + base; + return 0; +} + +int get_symbol_info_without_loading(char *base, size_t img_size, + const char *symname, symbol_info *res) { + Elf *elf = elf_memory(base, img_size); + if (elf) { + int rc = get_symbol_info_without_loading(elf, base, symname, res); + elf_end(elf); + return rc; + } + return 1; +} + +atmi_status_t interop_get_symbol_info(char *base, size_t img_size, + const char *symname, void **var_addr, + uint32_t *var_size) { + symbol_info si; + int rc = get_symbol_info_without_loading(base, img_size, symname, &si); + if (rc == 0) { + *var_addr = si.addr; + *var_size = si.size; + return ATMI_STATUS_SUCCESS; + } else { + return ATMI_STATUS_ERROR; + } +} + +template +atmi_status_t module_register_from_memory_to_place(void *module_bytes, + size_t module_size, + atmi_place_t place, C cb) { + auto L = [](void *data, size_t size, void *cb_state) -> atmi_status_t { + C *unwrapped = static_cast(cb_state); + return (*unwrapped)(data, size); + }; + return atmi_module_register_from_memory_to_place( + module_bytes, module_size, place, L, static_cast(&cb)); +} +} // namespace + +static __tgt_target_table * +__tgt_rtl_load_binary_locked(int32_t device_id, __tgt_device_image *image); + +__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, + __tgt_device_image *image) { + static pthread_mutex_t load_binary_mutex = PTHREAD_MUTEX_INITIALIZER; + pthread_mutex_lock(&load_binary_mutex); + __tgt_target_table *res = __tgt_rtl_load_binary_locked(device_id, image); + pthread_mutex_unlock(&load_binary_mutex); + return res; +} + +__tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id, + __tgt_device_image *image) { + const size_t img_size = (char *)image->ImageEnd - (char *)image->ImageStart; + + DeviceInfo.clearOffloadEntriesTable(device_id); + + // We do not need to set the ELF version because the caller of this function + // had to do that to decide the right runtime to use + + if (!elf_machine_id_is_amdgcn(image)) { + return NULL; + } + + omptarget_device_environmentTy host_device_env; + host_device_env.num_devices = DeviceInfo.NumberOfDevices; + host_device_env.device_num = device_id; + host_device_env.debug_level = 0; +#ifdef OMPTARGET_DEBUG + if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) { + host_device_env.debug_level = std::stoi(envStr); + } +#endif + + auto on_deserialized_data = [&](void *data, size_t size) -> atmi_status_t { + const char *device_env_Name = "omptarget_device_environment"; + symbol_info si; + int rc = get_symbol_info_without_loading((char *)image->ImageStart, + img_size, device_env_Name, &si); + if (rc != 0) { + DP("Finding global device environment '%s' - symbol missing.\n", + device_env_Name); + // no need to return FAIL, consider this is a not a device debug build. + return ATMI_STATUS_SUCCESS; + } + if (si.size != sizeof(host_device_env)) { + return ATMI_STATUS_ERROR; + } + DP("Setting global device environment %lu bytes\n", si.size); + uint64_t offset = (char *)si.addr - (char *)image->ImageStart; + void *pos = (char *)data + offset; + memcpy(pos, &host_device_env, sizeof(host_device_env)); + return ATMI_STATUS_SUCCESS; + }; + + atmi_status_t err; + { + err = module_register_from_memory_to_place( + (void *)image->ImageStart, img_size, get_gpu_place(device_id), + on_deserialized_data); + + check("Module registering", err); + if (err != ATMI_STATUS_SUCCESS) { + char GPUName[64] = "--unknown gpu--"; + hsa_agent_t agent = DeviceInfo.HSAAgents[device_id]; + (void)hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME, + (void *)GPUName); + fprintf(stderr, + "Possible gpu arch mismatch: %s, please check" + " compiler: -march= flag\n", + GPUName); + return NULL; + } + } + + DP("ATMI module successfully loaded!\n"); + + // TODO: Check with Guansong to understand the below comment more thoroughly. + // Here, we take advantage of the data that is appended after img_end to get + // the symbols' name we need to load. This data consist of the host entries + // begin and end as well as the target name (see the offloading linker script + // creation in clang compiler). + + // Find the symbols in the module by name. The name can be obtain by + // concatenating the host entry name with the target name + + __tgt_offload_entry *HostBegin = image->EntriesBegin; + __tgt_offload_entry *HostEnd = image->EntriesEnd; + + for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) { + + if (!e->addr) { + // The host should have always something in the address to + // uniquely identify the target region. + fprintf(stderr, "Analyzing host entry '' (size = %lld)...\n", + (unsigned long long)e->size); + return NULL; + } + + if (e->size) { + __tgt_offload_entry entry = *e; + + void *varptr; + uint32_t varsize; + + err = atmi_interop_hsa_get_symbol_info(get_gpu_mem_place(device_id), + e->name, &varptr, &varsize); + + if (err != ATMI_STATUS_SUCCESS) { + DP("Loading global '%s' (Failed)\n", e->name); + // Inform the user what symbol prevented offloading + fprintf(stderr, "Loading global '%s' (Failed)\n", e->name); + return NULL; + } + + if (varsize != e->size) { + DP("Loading global '%s' - size mismatch (%u != %lu)\n", e->name, + varsize, e->size); + return NULL; + } + + DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", + DPxPTR(e - HostBegin), e->name, DPxPTR(varptr)); + entry.addr = (void *)varptr; + + DeviceInfo.addOffloadEntry(device_id, entry); + + if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && + e->flags & OMP_DECLARE_TARGET_LINK) { + // If unified memory is present any target link variables + // can access host addresses directly. There is no longer a + // need for device copies. + err = atmi_memcpy(varptr, e->addr, sizeof(void *)); + if (err != ATMI_STATUS_SUCCESS) + DP("Error when copying USM\n"); + DP("Copy linked variable host address (" DPxMOD ")" + "to device address (" DPxMOD ")\n", + DPxPTR(*((void **)e->addr)), DPxPTR(varptr)); + } + + continue; + } + + DP("to find the kernel name: %s size: %lu\n", e->name, strlen(e->name)); + + atmi_mem_place_t place = get_gpu_mem_place(device_id); + uint32_t kernarg_segment_size; + err = atmi_interop_hsa_get_kernel_info( + place, e->name, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, + &kernarg_segment_size); + + // each arg is a void * in this openmp implementation + uint32_t arg_num = kernarg_segment_size / sizeof(void *); + std::vector arg_sizes(arg_num); + for (std::vector::iterator it = arg_sizes.begin(); + it != arg_sizes.end(); it++) { + *it = sizeof(void *); + } + + // default value GENERIC (in case symbol is missing from cubin file) + int8_t ExecModeVal = ExecutionModeType::GENERIC; + + // get flat group size if present, else Default_WG_Size + int16_t WGSizeVal = RTLDeviceInfoTy::Default_WG_Size; + + // Max parallel level + int16_t MaxParLevVal = 0; + + // get Kernel Descriptor if present. + // Keep struct in sync wih getTgtAttributeStructQTy in CGOpenMPRuntime.cpp + struct KernDescValType { + uint16_t Version; + uint16_t TSize; + uint16_t WG_Size; + uint8_t Mode; + uint8_t HostServices; + uint8_t MaxParallelLevel; + }; + struct KernDescValType KernDescVal; + std::string KernDescNameStr(e->name); + KernDescNameStr += "_kern_desc"; + const char *KernDescName = KernDescNameStr.c_str(); + + void *KernDescPtr; + uint32_t KernDescSize; + void *CallStackAddr; + err = interop_get_symbol_info((char *)image->ImageStart, img_size, + KernDescName, &KernDescPtr, &KernDescSize); + + if (err == ATMI_STATUS_SUCCESS) { + if ((size_t)KernDescSize != sizeof(KernDescVal)) + DP("Loading global computation properties '%s' - size mismatch (%u != " + "%lu)\n", + KernDescName, KernDescSize, sizeof(KernDescVal)); + + memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize); + + // Check structure size against recorded size. + if ((size_t)KernDescSize != KernDescVal.TSize) + DP("KernDescVal size %lu does not match advertized size %d for '%s'\n", + sizeof(KernDescVal), KernDescVal.TSize, KernDescName); + + DP("After loading global for %s KernDesc \n", KernDescName); + DP("KernDesc: Version: %d\n", KernDescVal.Version); + DP("KernDesc: TSize: %d\n", KernDescVal.TSize); + DP("KernDesc: WG_Size: %d\n", KernDescVal.WG_Size); + DP("KernDesc: Mode: %d\n", KernDescVal.Mode); + DP("KernDesc: HostServices: %x\n", KernDescVal.HostServices); + DP("KernDesc: MaxParallelLevel: %x\n", KernDescVal.MaxParallelLevel); + + // gather location of callStack and size of struct + MaxParLevVal = KernDescVal.MaxParallelLevel; + if (MaxParLevVal > 0) { + uint32_t varsize; + const char *CsNam = "omptarget_nest_par_call_stack"; + err = atmi_interop_hsa_get_symbol_info(place, CsNam, &CallStackAddr, + &varsize); + if (err != ATMI_STATUS_SUCCESS) { + fprintf(stderr, "Addr of %s failed\n", CsNam); + return NULL; + } + void *StructSizePtr; + const char *SsNam = "omptarget_nest_par_call_struct_size"; + err = interop_get_symbol_info((char *)image->ImageStart, img_size, + SsNam, &StructSizePtr, &varsize); + if ((err != ATMI_STATUS_SUCCESS) || + (varsize != sizeof(TgtStackItemSize))) { + fprintf(stderr, "Addr of %s failed\n", SsNam); + return NULL; + } + memcpy(&TgtStackItemSize, StructSizePtr, sizeof(TgtStackItemSize)); + DP("Size of our struct is %d\n", TgtStackItemSize); + } + + // Get ExecMode + ExecModeVal = KernDescVal.Mode; + DP("ExecModeVal %d\n", ExecModeVal); + if (KernDescVal.WG_Size == 0) { + KernDescVal.WG_Size = RTLDeviceInfoTy::Default_WG_Size; + DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WG_Size); + } + WGSizeVal = KernDescVal.WG_Size; + DP("WGSizeVal %d\n", WGSizeVal); + check("Loading KernDesc computation property", err); + } else { + DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName); + + // Generic + std::string ExecModeNameStr(e->name); + ExecModeNameStr += "_exec_mode"; + const char *ExecModeName = ExecModeNameStr.c_str(); + + void *ExecModePtr; + uint32_t varsize; + err = interop_get_symbol_info((char *)image->ImageStart, img_size, + ExecModeName, &ExecModePtr, &varsize); + + if (err == ATMI_STATUS_SUCCESS) { + if ((size_t)varsize != sizeof(int8_t)) { + DP("Loading global computation properties '%s' - size mismatch(%u != " + "%lu)\n", + ExecModeName, varsize, sizeof(int8_t)); + return NULL; + } + + memcpy(&ExecModeVal, ExecModePtr, (size_t)varsize); + + DP("After loading global for %s ExecMode = %d\n", ExecModeName, + ExecModeVal); + + if (ExecModeVal < 0 || ExecModeVal > 1) { + DP("Error wrong exec_mode value specified in HSA code object file: " + "%d\n", + ExecModeVal); + return NULL; + } + } else { + DP("Loading global exec_mode '%s' - symbol missing, using default " + "value " + "GENERIC (1)\n", + ExecModeName); + } + check("Loading computation property", err); + + // Flat group size + std::string WGSizeNameStr(e->name); + WGSizeNameStr += "_wg_size"; + const char *WGSizeName = WGSizeNameStr.c_str(); + + void *WGSizePtr; + uint32_t WGSize; + err = interop_get_symbol_info((char *)image->ImageStart, img_size, + WGSizeName, &WGSizePtr, &WGSize); + + if (err == ATMI_STATUS_SUCCESS) { + if ((size_t)WGSize != sizeof(int16_t)) { + DP("Loading global computation properties '%s' - size mismatch (%u " + "!= " + "%lu)\n", + WGSizeName, WGSize, sizeof(int16_t)); + return NULL; + } + + memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize); + + DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal); + + if (WGSizeVal < RTLDeviceInfoTy::Default_WG_Size || + WGSizeVal > RTLDeviceInfoTy::Max_WG_Size) { + DP("Error wrong WGSize value specified in HSA code object file: " + "%d\n", + WGSizeVal); + WGSizeVal = RTLDeviceInfoTy::Default_WG_Size; + } + } else { + DP("Warning: Loading WGSize '%s' - symbol not found, " + "using default value %d\n", + WGSizeName, WGSizeVal); + } + + check("Loading WGSize computation property", err); + } + + KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, MaxParLevVal, + device_id, CallStackAddr, e->name, + kernarg_segment_size)); + __tgt_offload_entry entry = *e; + entry.addr = (void *)&KernelsList.back(); + DeviceInfo.addOffloadEntry(device_id, entry); + DP("Entry point %ld maps to %s\n", e - HostBegin, e->name); + } + + return DeviceInfo.getOffloadEntriesTable(device_id); +} + +void *__tgt_rtl_data_alloc(int device_id, int64_t size, void *) { + void *ptr = NULL; + assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); + atmi_status_t err = atmi_malloc(&ptr, size, get_gpu_mem_place(device_id)); + DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", size, + (long long unsigned)(Elf64_Addr)ptr); + ptr = (err == ATMI_STATUS_SUCCESS) ? ptr : NULL; + return ptr; +} + +int32_t __tgt_rtl_data_submit(int device_id, void *tgt_ptr, void *hst_ptr, + int64_t size) { + assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); + __tgt_async_info async_info; + int32_t rc = dataSubmit(device_id, tgt_ptr, hst_ptr, size, &async_info); + if (rc != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + + return __tgt_rtl_synchronize(device_id, &async_info); +} + +int32_t __tgt_rtl_data_submit_async(int device_id, void *tgt_ptr, void *hst_ptr, + int64_t size, + __tgt_async_info *async_info_ptr) { + assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); + if (async_info_ptr) { + initAsyncInfoPtr(async_info_ptr); + return dataSubmit(device_id, tgt_ptr, hst_ptr, size, async_info_ptr); + } else { + return __tgt_rtl_data_submit(device_id, tgt_ptr, hst_ptr, size); + } +} + +int32_t __tgt_rtl_data_retrieve(int device_id, void *hst_ptr, void *tgt_ptr, + int64_t size) { + assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); + __tgt_async_info async_info; + int32_t rc = dataRetrieve(device_id, hst_ptr, tgt_ptr, size, &async_info); + if (rc != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + + return __tgt_rtl_synchronize(device_id, &async_info); +} + +int32_t __tgt_rtl_data_retrieve_async(int device_id, void *hst_ptr, + void *tgt_ptr, int64_t size, + __tgt_async_info *async_info_ptr) { + assert(async_info_ptr && "async_info is nullptr"); + assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); + initAsyncInfoPtr(async_info_ptr); + return dataRetrieve(device_id, hst_ptr, tgt_ptr, size, async_info_ptr); +} + +int32_t __tgt_rtl_data_delete(int device_id, void *tgt_ptr) { + assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); + atmi_status_t err; + DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)tgt_ptr); + err = atmi_free(tgt_ptr); + if (err != ATMI_STATUS_SUCCESS) { + DP("Error when freeing CUDA memory\n"); + return OFFLOAD_FAIL; + } + return OFFLOAD_SUCCESS; +} + +// Determine launch values for threadsPerGroup and num_groups. +// Outputs: treadsPerGroup, num_groups +// Inputs: Max_Teams, Max_WG_Size, Warp_Size, ExecutionMode, +// EnvTeamLimit, EnvNumTeams, num_teams, thread_limit, +// loop_tripcount. +void getLaunchVals(int &threadsPerGroup, int &num_groups, int ConstWGSize, + int ExecutionMode, int EnvTeamLimit, int EnvNumTeams, + int num_teams, int thread_limit, uint64_t loop_tripcount) { + + int Max_Teams = DeviceInfo.EnvMaxTeamsDefault > 0 + ? DeviceInfo.EnvMaxTeamsDefault + : DeviceInfo.Max_Teams; + if (Max_Teams > DeviceInfo.HardTeamLimit) + Max_Teams = DeviceInfo.HardTeamLimit; + + if (print_kernel_trace > 1) { + fprintf(stderr, "RTLDeviceInfoTy::Max_Teams: %d\n", + RTLDeviceInfoTy::Max_Teams); + fprintf(stderr, "Max_Teams: %d\n", Max_Teams); + fprintf(stderr, "RTLDeviceInfoTy::Warp_Size: %d\n", + RTLDeviceInfoTy::Warp_Size); + fprintf(stderr, "RTLDeviceInfoTy::Max_WG_Size: %d\n", + RTLDeviceInfoTy::Max_WG_Size); + fprintf(stderr, "RTLDeviceInfoTy::Default_WG_Size: %d\n", + RTLDeviceInfoTy::Default_WG_Size); + fprintf(stderr, "thread_limit: %d\n", thread_limit); + fprintf(stderr, "threadsPerGroup: %d\n", threadsPerGroup); + fprintf(stderr, "ConstWGSize: %d\n", ConstWGSize); + } + // check for thread_limit() clause + if (thread_limit > 0) { + threadsPerGroup = thread_limit; + DP("Setting threads per block to requested %d\n", thread_limit); + if (ExecutionMode == GENERIC) { // Add master warp for GENERIC + threadsPerGroup += RTLDeviceInfoTy::Warp_Size; + DP("Adding master wavefront: +%d threads\n", RTLDeviceInfoTy::Warp_Size); + } + if (threadsPerGroup > RTLDeviceInfoTy::Max_WG_Size) { // limit to max + threadsPerGroup = RTLDeviceInfoTy::Max_WG_Size; + DP("Setting threads per block to maximum %d\n", threadsPerGroup); + } + } + // check flat_max_work_group_size attr here + if (threadsPerGroup > ConstWGSize) { + threadsPerGroup = ConstWGSize; + DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n", + threadsPerGroup); + } + if (print_kernel_trace > 1) + fprintf(stderr, "threadsPerGroup: %d\n", threadsPerGroup); + DP("Preparing %d threads\n", threadsPerGroup); + + // Set default num_groups (teams) + if (DeviceInfo.EnvTeamLimit > 0) + num_groups = (Max_Teams < DeviceInfo.EnvTeamLimit) + ? Max_Teams + : DeviceInfo.EnvTeamLimit; + else + num_groups = Max_Teams; + DP("Set default num of groups %d\n", num_groups); + + if (print_kernel_trace > 1) { + fprintf(stderr, "num_groups: %d\n", num_groups); + fprintf(stderr, "num_teams: %d\n", num_teams); + } + + // Reduce num_groups if threadsPerGroup exceeds RTLDeviceInfoTy::Max_WG_Size + // This reduction is typical for default case (no thread_limit clause). + // or when user goes crazy with num_teams clause. + // FIXME: We cant distinguish between a constant or variable thread limit. + // So we only handle constant thread_limits. + if (threadsPerGroup > + RTLDeviceInfoTy::Default_WG_Size) // 256 < threadsPerGroup <= 1024 + // Should we round threadsPerGroup up to nearest RTLDeviceInfoTy::Warp_Size + // here? + num_groups = (Max_Teams * RTLDeviceInfoTy::Max_WG_Size) / threadsPerGroup; + + // check for num_teams() clause + if (num_teams > 0) { + num_groups = (num_teams < num_groups) ? num_teams : num_groups; + } + if (print_kernel_trace > 1) { + fprintf(stderr, "num_groups: %d\n", num_groups); + fprintf(stderr, "DeviceInfo.EnvNumTeams %d\n", DeviceInfo.EnvNumTeams); + fprintf(stderr, "DeviceInfo.EnvTeamLimit %d\n", DeviceInfo.EnvTeamLimit); + } + + if (DeviceInfo.EnvNumTeams > 0) { + num_groups = (DeviceInfo.EnvNumTeams < num_groups) ? DeviceInfo.EnvNumTeams + : num_groups; + DP("Modifying teams based on EnvNumTeams %d\n", DeviceInfo.EnvNumTeams); + } else if (DeviceInfo.EnvTeamLimit > 0) { + num_groups = (DeviceInfo.EnvTeamLimit < num_groups) + ? DeviceInfo.EnvTeamLimit + : num_groups; + DP("Modifying teams based on EnvTeamLimit%d\n", DeviceInfo.EnvTeamLimit); + } else { + if (num_teams <= 0) { + if (loop_tripcount > 0) { + if (ExecutionMode == SPMD) { + // round up to the nearest integer + num_groups = ((loop_tripcount - 1) / threadsPerGroup) + 1; + } else { + num_groups = loop_tripcount; + } + DP("Using %d teams due to loop trip count %" PRIu64 " and number of " + "threads per block %d\n", + num_groups, loop_tripcount, threadsPerGroup); + } + } else { + num_groups = num_teams; + } + if (num_groups > Max_Teams) { + num_groups = Max_Teams; + if (print_kernel_trace > 1) + fprintf(stderr, "Limiting num_groups %d to Max_Teams %d \n", num_groups, + Max_Teams); + } + if (num_groups > num_teams && num_teams > 0) { + num_groups = num_teams; + if (print_kernel_trace > 1) + fprintf(stderr, "Limiting num_groups %d to clause num_teams %d \n", + num_groups, num_teams); + } + } + + // num_teams clause always honored, no matter what, unless DEFAULT is active. + if (num_teams > 0) { + num_groups = num_teams; + // Cap num_groups to EnvMaxTeamsDefault if set. + if (DeviceInfo.EnvMaxTeamsDefault > 0 && + num_groups > DeviceInfo.EnvMaxTeamsDefault) + num_groups = DeviceInfo.EnvMaxTeamsDefault; + } + if (print_kernel_trace > 1) { + fprintf(stderr, "threadsPerGroup: %d\n", threadsPerGroup); + fprintf(stderr, "num_groups: %d\n", num_groups); + fprintf(stderr, "loop_tripcount: %ld\n", loop_tripcount); + } + DP("Final %d num_groups and %d threadsPerGroup\n", num_groups, + threadsPerGroup); +} + +static void *AllocateNestedParallelCallMemory(int MaxParLevel, int NumGroups, + int ThreadsPerGroup, + int device_id, + void *CallStackAddr, int SPMD) { + if (print_kernel_trace > 1) + fprintf(stderr, "MaxParLevel %d SPMD %d NumGroups %d NumThrds %d\n", + MaxParLevel, SPMD, NumGroups, ThreadsPerGroup); + // Total memory needed is Teams * Threads * ParLevels + size_t NestedMemSize = + MaxParLevel * NumGroups * ThreadsPerGroup * TgtStackItemSize * 4; + + if (print_kernel_trace > 1) + fprintf(stderr, "NestedMemSize %ld \n", NestedMemSize); + assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); + void *TgtPtr = NULL; + atmi_status_t err = + atmi_malloc(&TgtPtr, NestedMemSize, get_gpu_mem_place(device_id)); + err = atmi_memcpy(CallStackAddr, &TgtPtr, sizeof(void *)); + if (print_kernel_trace > 2) + fprintf(stderr, "CallSck %lx TgtPtr %lx *TgtPtr %lx \n", + (long)CallStackAddr, (long)&TgtPtr, (long)TgtPtr); + if (err != ATMI_STATUS_SUCCESS) { + fprintf(stderr, "Mem not wrtten to target, err %d\n", err); + } + return TgtPtr; // we need to free this after kernel. +} + +static uint64_t acquire_available_packet_id(hsa_queue_t *queue) { + uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1); + bool full = true; + while (full) { + full = + packet_id >= (queue->size + hsa_queue_load_read_index_acquire(queue)); + } + return packet_id; +} + +int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, + void **tgt_args, + ptrdiff_t *tgt_offsets, + int32_t arg_num, int32_t num_teams, + int32_t thread_limit, + uint64_t loop_tripcount) { + // Set the context we are using + // update thread limit content in gpu memory if un-initialized or specified + // from host + + DP("Run target team region thread_limit %d\n", thread_limit); + + // All args are references. + std::vector args(arg_num); + std::vector ptrs(arg_num); + + DP("Arg_num: %d\n", arg_num); + for (int32_t i = 0; i < arg_num; ++i) { + ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]); + args[i] = &ptrs[i]; + DP("Offseted base: arg[%d]:" DPxMOD "\n", i, DPxPTR(ptrs[i])); + } + + KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr; + + /* + * Set limit based on ThreadsPerGroup and GroupsPerDevice + */ + int num_groups = 0; + + int threadsPerGroup = RTLDeviceInfoTy::Default_WG_Size; + + getLaunchVals(threadsPerGroup, num_groups, KernelInfo->ConstWGSize, + KernelInfo->ExecutionMode, DeviceInfo.EnvTeamLimit, + DeviceInfo.EnvNumTeams, + num_teams, // From run_region arg + thread_limit, // From run_region arg + loop_tripcount // From run_region arg + ); + + void *TgtCallStack = NULL; + if (KernelInfo->MaxParLevel > 0) + TgtCallStack = AllocateNestedParallelCallMemory( + KernelInfo->MaxParLevel, num_groups, threadsPerGroup, + KernelInfo->device_id, KernelInfo->CallStackAddr, + KernelInfo->ExecutionMode); + + if (print_kernel_trace > 0) + // enum modes are SPMD, GENERIC, NONE 0,1,2 + fprintf(stderr, + "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) " + "reqd:(%4dX%4d) n:%s\n", + device_id, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize, + arg_num, num_groups, threadsPerGroup, num_teams, thread_limit, + KernelInfo->Name); + + // Run on the device. + { + hsa_queue_t *queue = DeviceInfo.HSAQueues[device_id]; + uint64_t packet_id = acquire_available_packet_id(queue); + + const uint32_t mask = queue->size - 1; // size is a power of 2 + hsa_kernel_dispatch_packet_t *packet = + (hsa_kernel_dispatch_packet_t *)queue->base_address + + (packet_id & mask); + + // packet->header is written last + packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; + packet->workgroup_size_x = threadsPerGroup; + packet->workgroup_size_y = 1; + packet->workgroup_size_z = 1; + packet->reserved0 = 0; + packet->grid_size_x = num_groups * threadsPerGroup; + packet->grid_size_y = 1; + packet->grid_size_z = 1; + packet->private_segment_size = 0; + packet->group_segment_size = 0; + packet->kernel_object = 0; + packet->kernarg_address = 0; // use the block allocator + packet->reserved2 = 0; // atmi writes id_ here + packet->completion_signal = {0}; // may want a pool of signals + + std::string kernel_name = std::string(KernelInfo->Name); + { + assert(KernelInfoTable[device_id].find(kernel_name) != + KernelInfoTable[device_id].end()); + auto it = KernelInfoTable[device_id][kernel_name]; + packet->kernel_object = it.kernel_object; + packet->private_segment_size = it.private_segment_size; + packet->group_segment_size = it.group_segment_size; + assert(arg_num == (int)it.num_args); + } + + KernelArgPool *ArgPool = nullptr; + { + auto it = KernelArgPoolMap.find(std::string(KernelInfo->Name)); + if (it != KernelArgPoolMap.end()) { + ArgPool = (it->second).get(); + } + } + if (!ArgPool) { + fprintf(stderr, "Warning: No ArgPool for %s on device %d\n", + KernelInfo->Name, device_id); + } + { + void *kernarg = nullptr; + if (ArgPool) { + assert(ArgPool->kernarg_segment_size == (arg_num * sizeof(void *))); + kernarg = ArgPool->allocate(arg_num); + } + if (!kernarg) { + printf("Allocate kernarg failed\n"); + exit(1); + } + + // Copy explicit arguments + for (int i = 0; i < arg_num; i++) { + memcpy((char *)kernarg + sizeof(void *) * i, args[i], sizeof(void *)); + } + + // Initialize implicit arguments. ATMI seems to leave most fields + // uninitialized + atmi_implicit_args_t *impl_args = + reinterpret_cast( + static_cast(kernarg) + ArgPool->kernarg_segment_size); + memset(impl_args, 0, + sizeof(atmi_implicit_args_t)); // may not be necessary + impl_args->offset_x = 0; + impl_args->offset_y = 0; + impl_args->offset_z = 0; + + packet->kernarg_address = kernarg; + } + + { + hsa_signal_t s = DeviceInfo.FreeSignalPool.pop(); + if (s.handle == 0) { + printf("Failed to get signal instance\n"); + exit(1); + } + packet->completion_signal = s; + hsa_signal_store_relaxed(packet->completion_signal, 1); + } + + core::packet_store_release( + reinterpret_cast(packet), + core::create_header(HSA_PACKET_TYPE_KERNEL_DISPATCH, 0, + ATMI_FENCE_SCOPE_SYSTEM, ATMI_FENCE_SCOPE_SYSTEM), + packet->setup); + + hsa_signal_store_relaxed(queue->doorbell_signal, packet_id); + + while (hsa_signal_wait_acquire(packet->completion_signal, + HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, + HSA_WAIT_STATE_BLOCKED) != 0) + ; + + assert(ArgPool); + ArgPool->deallocate(packet->kernarg_address); + DeviceInfo.FreeSignalPool.push(packet->completion_signal); + } + + DP("Kernel completed\n"); + // Free call stack for nested + if (TgtCallStack) + atmi_free(TgtCallStack); + + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, + void **tgt_args, ptrdiff_t *tgt_offsets, + int32_t arg_num) { + // use one team and one thread + // fix thread num + int32_t team_num = 1; + int32_t thread_limit = 0; // use default + return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args, + tgt_offsets, arg_num, team_num, + thread_limit, 0); +} + +int32_t __tgt_rtl_run_target_region_async(int32_t device_id, + void *tgt_entry_ptr, void **tgt_args, + ptrdiff_t *tgt_offsets, + int32_t arg_num, + __tgt_async_info *async_info_ptr) { + assert(async_info_ptr && "async_info is nullptr"); + initAsyncInfoPtr(async_info_ptr); + + // use one team and one thread + // fix thread num + int32_t team_num = 1; + int32_t thread_limit = 0; // use default + return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args, + tgt_offsets, arg_num, team_num, + thread_limit, 0); +} + +int32_t __tgt_rtl_synchronize(int32_t device_id, + __tgt_async_info *async_info_ptr) { + assert(async_info_ptr && "async_info is nullptr"); + + // Cuda asserts that async_info_ptr->Queue is non-null, but this invariant + // is not ensured by devices.cpp for amdgcn + // assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr"); + if (async_info_ptr->Queue) { + finiAsyncInfoPtr(async_info_ptr); + } + return OFFLOAD_SUCCESS; +} diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp --- a/openmp/libomptarget/src/rtl.cpp +++ b/openmp/libomptarget/src/rtl.cpp @@ -27,7 +27,9 @@ /* PowerPC target */ "libomptarget.rtl.ppc64.so", /* x86_64 target */ "libomptarget.rtl.x86_64.so", /* CUDA target */ "libomptarget.rtl.cuda.so", - /* AArch64 target */ "libomptarget.rtl.aarch64.so"}; + /* AArch64 target */ "libomptarget.rtl.aarch64.so", + /* AMGGPU target */ "libomptarget.rtl.amdgpu.so", +}; RTLsTy *RTLs; std::mutex *RTLsMtx;