diff --git a/openmp/libomptarget/CMakeLists.txt b/openmp/libomptarget/CMakeLists.txt --- a/openmp/libomptarget/CMakeLists.txt +++ b/openmp/libomptarget/CMakeLists.txt @@ -106,7 +106,6 @@ "Path to folder containing llvm library libomptarget.so") # Build offloading plugins and device RTLs if they are available. -add_subdirectory(plugins) add_subdirectory(plugins-nextgen) add_subdirectory(DeviceRTL) add_subdirectory(tools) diff --git a/openmp/libomptarget/plugins/CMakeLists.txt b/openmp/libomptarget/plugins/CMakeLists.txt deleted file mode 100644 --- a/openmp/libomptarget/plugins/CMakeLists.txt +++ /dev/null @@ -1,93 +0,0 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Build plugins for the user system if available. -# -##===----------------------------------------------------------------------===## - -add_subdirectory(common) - -# void build_generic_elf64(string tmachine, string tmachine_name, string tmachine_libname, string elf_machine_id); -# - build a plugin for an ELF based generic 64-bit target based on libffi. -# - tmachine: name of the machine processor as used in the cmake build system. -# - tmachine_name: name of the machine to be printed with the debug messages. -# - tmachine_libname: machine name to be appended to the plugin library name. -macro(build_generic_elf64 tmachine tmachine_name tmachine_libname tmachine_triple elf_machine_id) -if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$") - if(LIBOMPTARGET_DEP_LIBFFI_FOUND) - - libomptarget_say("Building ${tmachine_name} offloading plugin.") - - # Define macro to be used as prefix of the runtime messages for this target. - add_definitions("-DTARGET_NAME=${tmachine_name}") - - # Define macro with the ELF ID for this target. - add_definitions("-DTARGET_ELF_ID=${elf_machine_id}") - - add_llvm_library("omptarget.rtl.${tmachine_libname}" - SHARED - - ${CMAKE_CURRENT_SOURCE_DIR}/../generic-elf-64bit/src/rtl.cpp - - ADDITIONAL_HEADER_DIRS - ${LIBOMPTARGET_INCLUDE_DIR} - ${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR} - - LINK_LIBS - PRIVATE - elf_common - ${LIBOMPTARGET_DEP_LIBFFI_LIBRARIES} - ${OPENMP_PTHREAD_LIB} - "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports" - - NO_INSTALL_RPATH - ) - - # Install plugin under the lib destination folder. - install(TARGETS "omptarget.rtl.${tmachine_libname}" - LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") - set_target_properties("omptarget.rtl.${tmachine_libname}" PROPERTIES - INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.." - CXX_VISIBILITY_PRESET protected) - - target_include_directories( "omptarget.rtl.${tmachine_libname}" PRIVATE - ${LIBOMPTARGET_INCLUDE_DIR} - ${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR}) - - list(APPEND LIBOMPTARGET_TESTED_PLUGINS - "omptarget.rtl.${tmachine_libname}") - - # Report to the parent scope that we are building a plugin. - set(LIBOMPTARGET_SYSTEM_TARGETS - "${LIBOMPTARGET_SYSTEM_TARGETS} ${tmachine_triple} ${tmachine_triple}-oldDriver" PARENT_SCOPE) - set(LIBOMPTARGET_SYSTEM_TARGETS - "${LIBOMPTARGET_SYSTEM_TARGETS} ${tmachine_triple} ${tmachine_triple}-LTO" PARENT_SCOPE) - set(LIBOMPTARGET_TESTED_PLUGINS - "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE) - - else(LIBOMPTARGET_DEP_LIBFFI_FOUND) - libomptarget_say("Not building ${tmachine_name} offloading plugin: libffi dependency not found.") - endif(LIBOMPTARGET_DEP_LIBFFI_FOUND) -else() - libomptarget_say("Not building ${tmachine_name} offloading plugin: machine not found in the system.") -endif() -endmacro() - -add_subdirectory(aarch64) -add_subdirectory(amdgpu) -add_subdirectory(cuda) -add_subdirectory(ppc64) -add_subdirectory(ppc64le) -add_subdirectory(ve) -add_subdirectory(x86_64) -add_subdirectory(remote) - -# Make sure the parent scope can see the plugins that will be created. -set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE) -set(LIBOMPTARGET_TESTED_PLUGINS "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE) - diff --git a/openmp/libomptarget/plugins/aarch64/CMakeLists.txt b/openmp/libomptarget/plugins/aarch64/CMakeLists.txt deleted file mode 100644 --- a/openmp/libomptarget/plugins/aarch64/CMakeLists.txt +++ /dev/null @@ -1,17 +0,0 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Build a plugin for an aarch64 machine if available. -# -##===----------------------------------------------------------------------===## - -if(CMAKE_SYSTEM_NAME MATCHES "Linux") - build_generic_elf64("aarch64" "aarch64" "aarch64" "aarch64-unknown-linux-gnu" "183") -else() - libomptarget_say("Not building aarch64 offloading plugin: machine not found in the system.") -endif() diff --git a/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt b/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt +++ /dev/null @@ -1,125 +0,0 @@ -##===----------------------------------------------------------------------===## -# -# The LLVM Compiler Infrastructure -# -# This file is dual licensed under the MIT and the University of Illinois Open -# Source Licenses. See LICENSE.txt for details. -# -##===----------------------------------------------------------------------===## -# -# Build a plugin for an AMDGPU machine if available. -# -##===----------------------------------------------------------------------===## - -################################################################################ -set(LIBOMPTARGET_BUILD_AMDGPU_PLUGIN TRUE CACHE BOOL - "Whether to build AMDGPU plugin") -if (NOT LIBOMPTARGET_BUILD_AMDGPU_PLUGIN) - libomptarget_say("Not building AMDGPU offloading plugin: LIBOMPTARGET_BUILD_AMDGPU_PLUGIN is false") - return() -endif() - -# as of rocm-3.7, hsa is installed with cmake packages and kmt is found via hsa -find_package(hsa-runtime64 QUIET 1.2.0 HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm) - -if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux") - libomptarget_say("Not building AMDGPU plugin: only support AMDGPU in Linux x86_64, ppc64le, or aarch64 hosts") - return() -endif() - -################################################################################ -# Define the suffix for the runtime messaging dumps. -add_definitions(-DTARGET_NAME=AMDGPU) -if(CMAKE_SYSTEM_PROCESSOR MATCHES "(ppc64le)|(aarch64)$") - add_definitions(-DLITTLEENDIAN_CPU=1) -endif() - -if(CMAKE_BUILD_TYPE MATCHES Debug) - add_definitions(-DDEBUG) -endif() - -set(LIBOMPTARGET_DLOPEN_LIBHSA OFF) -option(LIBOMPTARGET_FORCE_DLOPEN_LIBHSA "Build with dlopened libhsa" ${LIBOMPTARGET_DLOPEN_LIBHSA}) - -if (${hsa-runtime64_FOUND} AND NOT LIBOMPTARGET_FORCE_DLOPEN_LIBHSA) - libomptarget_say("Building AMDGPU plugin linked against libhsa") - set(LIBOMPTARGET_EXTRA_SOURCE) - set(LIBOMPTARGET_DEP_LIBRARIES hsa-runtime64::hsa-runtime64) -else() - libomptarget_say("Building AMDGPU plugin for dlopened libhsa") - include_directories(dynamic_hsa) - set(LIBOMPTARGET_EXTRA_SOURCE dynamic_hsa/hsa.cpp) - set(LIBOMPTARGET_DEP_LIBRARIES) -endif() - -if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") - # On FreeBSD, the 'environ' symbol is undefined at link time, but resolved by - # the dynamic linker at runtime. Therefore, allow the symbol to be undefined - # when creating a shared library. - set(LDFLAGS_UNDEFINED "-Wl,--allow-shlib-undefined") -else() - set(LDFLAGS_UNDEFINED "-Wl,-z,defs") -endif() - -add_llvm_library(omptarget.rtl.amdgpu SHARED - impl/impl.cpp - impl/interop_hsa.cpp - impl/data.cpp - impl/get_elf_mach_gfx_name.cpp - impl/system.cpp - impl/msgpack.cpp - src/rtl.cpp - ${LIBOMPTARGET_EXTRA_SOURCE} - - ADDITIONAL_HEADER_DIRS - ${LIBOMPTARGET_INCLUDE_DIR} - ${CMAKE_CURRENT_SOURCE_DIR}/impl - ${CMAKE_CURRENT_SOURCE_DIR}/../../plugins-nextgen/amdgpu/utils - - LINK_COMPONENTS - Support - Object - - LINK_LIBS - PRIVATE - elf_common - ${LIBOMPTARGET_DEP_LIBRARIES} - ${OPENMP_PTHREAD_LIB} - "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports" - ${LDFLAGS_UNDEFINED} - - NO_INSTALL_RPATH -) -add_dependencies(omptarget.rtl.amdgpu omptarget.devicertl.amdgpu) - -target_include_directories( - omptarget.rtl.amdgpu - PRIVATE - ${LIBOMPTARGET_INCLUDE_DIR} - ${CMAKE_CURRENT_SOURCE_DIR}/impl - ${CMAKE_CURRENT_SOURCE_DIR}/../../plugins-nextgen/amdgpu/utils -) - - -# Install plugin under the lib destination folder. -install(TARGETS omptarget.rtl.amdgpu LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") -set_target_properties(omptarget.rtl.amdgpu PROPERTIES - INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.." - CXX_VISIBILITY_PRESET protected) - -# Report to the parent scope that we are building a plugin for hsa. -# This controls whether tests are run for the nvptx offloading target -# Run them if libhsa is available, or if the user explicitly asked for dlopen -# Otherwise this plugin is being built speculatively and there may be no hsa available -option(LIBOMPTARGET_FORCE_AMDGPU_TESTS "Build AMDGPU libomptarget tests" OFF) -if (LIBOMPTARGET_FOUND_AMDGPU_GPU OR LIBOMPTARGET_FORCE_AMDGPU_TESTS) - # Report to the parent scope that we are building a plugin for amdgpu - set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} amdgcn-amd-amdhsa amdgcn-amd-amdhsa-oldDriver" PARENT_SCOPE) - set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} amdgcn-amd-amdhsa amdgcn-amd-amdhsa-LTO" PARENT_SCOPE) - list(APPEND LIBOMPTARGET_TESTED_PLUGINS "omptarget.rtl.amdgpu") - set(LIBOMPTARGET_TESTED_PLUGINS "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE) -else() - libomptarget_say("Not generating amdgcn test targets as libhsa is not linkable") - return() -endif() - diff --git a/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.h b/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.h deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.h +++ /dev/null @@ -1,368 +0,0 @@ -//===--- amdgpu/dynamic_hsa/hsa.h --------------------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// The parts of the hsa api that are presently in use by the amdgpu plugin -// -//===----------------------------------------------------------------------===// -#ifndef HSA_RUNTIME_INC_HSA_H_ -#define HSA_RUNTIME_INC_HSA_H_ - -#include -#include - -// Detect and set large model builds. -#undef HSA_LARGE_MODEL -#if defined(__LP64__) || defined(_M_X64) -#define HSA_LARGE_MODEL -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -typedef enum { - HSA_STATUS_SUCCESS = 0x0, - HSA_STATUS_INFO_BREAK = 0x1, - HSA_STATUS_ERROR = 0x1000, - HSA_STATUS_ERROR_INVALID_CODE_OBJECT = 0x1010, - HSA_STATUS_ERROR_NOT_INITIALIZED = 0x100B, -} hsa_status_t; - -hsa_status_t hsa_status_string(hsa_status_t status, const char **status_string); - -typedef struct hsa_dim3_s { - uint32_t x; - uint32_t y; - uint32_t z; -} hsa_dim3_t; - -hsa_status_t hsa_init(); - -hsa_status_t hsa_shut_down(); - -typedef struct hsa_agent_s { - uint64_t handle; -} hsa_agent_t; - -typedef enum { - HSA_DEVICE_TYPE_CPU = 0, - HSA_DEVICE_TYPE_GPU = 1, - HSA_DEVICE_TYPE_DSP = 2 -} hsa_device_type_t; - -typedef enum { - HSA_ISA_INFO_NAME_LENGTH = 0, - HSA_ISA_INFO_NAME = 1 -} hsa_isa_info_t; - -typedef enum { - HSA_AGENT_INFO_NAME = 0, - HSA_AGENT_INFO_VENDOR_NAME = 1, - HSA_AGENT_INFO_FEATURE = 2, - HSA_AGENT_INFO_PROFILE = 4, - HSA_AGENT_INFO_WAVEFRONT_SIZE = 6, - HSA_AGENT_INFO_WORKGROUP_MAX_DIM = 7, - HSA_AGENT_INFO_WORKGROUP_MAX_SIZE = 8, - HSA_AGENT_INFO_GRID_MAX_DIM = 9, - HSA_AGENT_INFO_GRID_MAX_SIZE = 10, - HSA_AGENT_INFO_FBARRIER_MAX_SIZE = 11, - HSA_AGENT_INFO_QUEUES_MAX = 12, - HSA_AGENT_INFO_QUEUE_MIN_SIZE = 13, - HSA_AGENT_INFO_QUEUE_MAX_SIZE = 14, - HSA_AGENT_INFO_DEVICE = 17, - HSA_AGENT_INFO_CACHE_SIZE = 18, - HSA_AGENT_INFO_FAST_F16_OPERATION = 24, -} hsa_agent_info_t; - -typedef enum { - HSA_SYSTEM_INFO_VERSION_MAJOR = 0, - HSA_SYSTEM_INFO_VERSION_MINOR = 1, -} hsa_system_info_t; - -typedef enum { - HSA_AGENT_FEATURE_KERNEL_DISPATCH = 1, - HSA_AGENT_FEATURE_AGENT_DISPATCH = 2, -} hsa_agent_feature_t; - -typedef struct hsa_region_s { - uint64_t handle; -} hsa_region_t; - -typedef struct hsa_isa_s { - uint64_t handle; -} hsa_isa_t; - -hsa_status_t hsa_system_get_info(hsa_system_info_t attribute, void *value); - -hsa_status_t hsa_agent_get_info(hsa_agent_t agent, hsa_agent_info_t attribute, - void *value); - -hsa_status_t hsa_isa_get_info_alt(hsa_isa_t isa, hsa_isa_info_t attribute, - void *value); - -hsa_status_t hsa_iterate_agents(hsa_status_t (*callback)(hsa_agent_t agent, - void *data), - void *data); - -hsa_status_t hsa_agent_iterate_isas(hsa_agent_t agent, - hsa_status_t (*callback)(hsa_isa_t isa, - void *data), - void *data); - -typedef struct hsa_signal_s { - uint64_t handle; -} hsa_signal_t; - -#ifdef HSA_LARGE_MODEL -typedef int64_t hsa_signal_value_t; -#else -typedef int32_t hsa_signal_value_t; -#endif - -hsa_status_t hsa_signal_create(hsa_signal_value_t initial_value, - uint32_t num_consumers, - const hsa_agent_t *consumers, - hsa_signal_t *signal); - -hsa_status_t hsa_amd_signal_create(hsa_signal_value_t initial_value, - uint32_t num_consumers, - const hsa_agent_t *consumers, - uint64_t attributes, hsa_signal_t *signal); - -hsa_status_t hsa_signal_destroy(hsa_signal_t signal); - -void hsa_signal_store_relaxed(hsa_signal_t signal, hsa_signal_value_t value); - -void hsa_signal_store_screlease(hsa_signal_t signal, hsa_signal_value_t value); - -hsa_signal_value_t hsa_signal_load_scacquire(hsa_signal_t signal); - -void hsa_signal_subtract_screlease(hsa_signal_t signal, - hsa_signal_value_t value); - -typedef enum { - HSA_SIGNAL_CONDITION_EQ = 0, - HSA_SIGNAL_CONDITION_NE = 1, -} hsa_signal_condition_t; - -typedef enum { - HSA_WAIT_STATE_BLOCKED = 0, - HSA_WAIT_STATE_ACTIVE = 1 -} hsa_wait_state_t; - -hsa_signal_value_t hsa_signal_wait_scacquire(hsa_signal_t signal, - hsa_signal_condition_t condition, - hsa_signal_value_t compare_value, - uint64_t timeout_hint, - hsa_wait_state_t wait_state_hint); - -typedef enum { - HSA_QUEUE_TYPE_MULTI = 0, - HSA_QUEUE_TYPE_SINGLE = 1, -} hsa_queue_type_t; - -typedef enum { - HSA_QUEUE_FEATURE_KERNEL_DISPATCH = 1, - HSA_QUEUE_FEATURE_AGENT_DISPATCH = 2 -} hsa_queue_feature_t; - -typedef uint32_t hsa_queue_type32_t; - -typedef struct hsa_queue_s { - hsa_queue_type32_t type; - uint32_t features; - -#ifdef HSA_LARGE_MODEL - void *base_address; -#elif defined HSA_LITTLE_ENDIAN - void *base_address; - uint32_t reserved0; -#else - uint32_t reserved0; - void *base_address; -#endif - hsa_signal_t doorbell_signal; - uint32_t size; - uint32_t reserved1; - uint64_t id; -} hsa_queue_t; - -hsa_status_t hsa_queue_create(hsa_agent_t agent, uint32_t size, - hsa_queue_type32_t type, - void (*callback)(hsa_status_t status, - hsa_queue_t *source, void *data), - void *data, uint32_t private_segment_size, - uint32_t group_segment_size, hsa_queue_t **queue); - -hsa_status_t hsa_queue_destroy(hsa_queue_t *queue); - -uint64_t hsa_queue_load_read_index_scacquire(const hsa_queue_t *queue); - -uint64_t hsa_queue_add_write_index_relaxed(const hsa_queue_t *queue, - uint64_t value); - -typedef enum { - HSA_PACKET_TYPE_KERNEL_DISPATCH = 2, - HSA_PACKET_TYPE_BARRIER_AND = 3, -} hsa_packet_type_t; - -typedef enum { HSA_FENCE_SCOPE_SYSTEM = 2 } hsa_fence_scope_t; - -typedef enum { - HSA_PACKET_HEADER_TYPE = 0, - HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE = 9, - HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE = 11 -} hsa_packet_header_t; - -typedef enum { - HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS = 0 -} hsa_kernel_dispatch_packet_setup_t; - -typedef enum { - HSA_KERNEL_DISPATCH_PACKET_SETUP_WIDTH_DIMENSIONS = 2 -} hsa_kernel_dispatch_packet_setup_width_t; - -typedef struct hsa_kernel_dispatch_packet_s { - uint16_t header; - uint16_t setup; - uint16_t workgroup_size_x; - uint16_t workgroup_size_y; - uint16_t workgroup_size_z; - uint16_t reserved0; - uint32_t grid_size_x; - uint32_t grid_size_y; - uint32_t grid_size_z; - uint32_t private_segment_size; - uint32_t group_segment_size; - uint64_t kernel_object; -#ifdef HSA_LARGE_MODEL - void *kernarg_address; -#elif defined HSA_LITTLE_ENDIAN - void *kernarg_address; - uint32_t reserved1; -#else - uint32_t reserved1; - void *kernarg_address; -#endif - uint64_t reserved2; - hsa_signal_t completion_signal; -} hsa_kernel_dispatch_packet_t; - -typedef struct hsa_barrier_and_packet_s { - uint16_t header; - uint16_t reserved0; - uint32_t reserved1; - hsa_signal_t dep_signal[5]; - uint64_t reserved2; - hsa_signal_t completion_signal; -} hsa_barrier_and_packet_t; - -typedef enum { HSA_PROFILE_BASE = 0, HSA_PROFILE_FULL = 1 } hsa_profile_t; - -typedef enum { - HSA_EXECUTABLE_STATE_UNFROZEN = 0, - HSA_EXECUTABLE_STATE_FROZEN = 1 -} hsa_executable_state_t; - -typedef struct hsa_executable_s { - uint64_t handle; -} hsa_executable_t; - -typedef struct hsa_executable_symbol_s { - uint64_t handle; -} hsa_executable_symbol_t; - -typedef enum { - HSA_EXECUTABLE_SYMBOL_INFO_TYPE = 0, - HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH = 1, - HSA_EXECUTABLE_SYMBOL_INFO_NAME = 2, - HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS = 21, - HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE = 9, - HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT = 22, - HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE = 11, - HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE = 13, - HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE = 14, -} hsa_executable_symbol_info_t; - -typedef struct hsa_code_object_s { - uint64_t handle; -} hsa_code_object_t; - -typedef enum { - HSA_SYMBOL_KIND_VARIABLE = 0, - HSA_SYMBOL_KIND_KERNEL = 1, - HSA_SYMBOL_KIND_INDIRECT_FUNCTION = 2 -} hsa_symbol_kind_t; - -typedef enum { - HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT = 0, - HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO = 1, - HSA_DEFAULT_FLOAT_ROUNDING_MODE_NEAR = 2, -} hsa_default_float_rounding_mode_t; - -hsa_status_t hsa_memory_copy(void *dst, const void *src, size_t size); - -hsa_status_t hsa_executable_create(hsa_profile_t profile, - hsa_executable_state_t executable_state, - const char *options, - hsa_executable_t *executable); - -hsa_status_t hsa_executable_create_alt( - hsa_profile_t profile, - hsa_default_float_rounding_mode_t default_float_rounding_mode, - const char *options, hsa_executable_t *executable); - -hsa_status_t hsa_executable_destroy(hsa_executable_t executable); - -hsa_status_t hsa_executable_freeze(hsa_executable_t executable, - const char *options); - -hsa_status_t hsa_executable_validate(hsa_executable_t executable, - uint32_t *result); - -hsa_status_t -hsa_executable_symbol_get_info(hsa_executable_symbol_t executable_symbol, - hsa_executable_symbol_info_t attribute, - void *value); - -hsa_status_t hsa_executable_iterate_symbols( - hsa_executable_t executable, - hsa_status_t (*callback)(hsa_executable_t exec, - hsa_executable_symbol_t symbol, void *data), - void *data); - -hsa_status_t hsa_executable_get_symbol_by_name(hsa_executable_t executable, - const char *symbol_name, - const hsa_agent_t *agent, - hsa_executable_symbol_t *symbol); - -hsa_status_t hsa_code_object_deserialize(void *serialized_code_object, - size_t serialized_code_object_size, - const char *options, - hsa_code_object_t *code_object); - -hsa_status_t hsa_executable_load_code_object(hsa_executable_t executable, - hsa_agent_t agent, - hsa_code_object_t code_object, - const char *options); - -hsa_status_t hsa_code_object_destroy(hsa_code_object_t code_object); - -typedef bool (*hsa_amd_signal_handler)(hsa_signal_value_t value, void *arg); - -hsa_status_t hsa_amd_signal_async_handler(hsa_signal_t signal, - hsa_signal_condition_t cond, - hsa_signal_value_t value, - hsa_amd_signal_handler handler, - void *arg); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.cpp b/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.cpp deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.cpp +++ /dev/null @@ -1,117 +0,0 @@ -//===--- amdgpu/dynamic_hsa/hsa.cpp ------------------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Implement subset of hsa api by calling into hsa library via dlopen -// Does the dlopen/dlsym calls as part of the call to hsa_init -// -//===----------------------------------------------------------------------===// - -#include "llvm/Support/DynamicLibrary.h" - -#include "Debug.h" -#include "dlwrap.h" -#include "hsa.h" -#include "hsa_ext_amd.h" -#include - -DLWRAP_INITIALIZE() - -DLWRAP_INTERNAL(hsa_init, 0) - -DLWRAP(hsa_status_string, 2) -DLWRAP(hsa_shut_down, 0) -DLWRAP(hsa_system_get_info, 2) -DLWRAP(hsa_agent_get_info, 3) -DLWRAP(hsa_isa_get_info_alt, 3) -DLWRAP(hsa_iterate_agents, 2) -DLWRAP(hsa_agent_iterate_isas, 3) -DLWRAP(hsa_signal_create, 4) -DLWRAP(hsa_signal_destroy, 1) -DLWRAP(hsa_signal_store_relaxed, 2) -DLWRAP(hsa_signal_store_screlease, 2) -DLWRAP(hsa_signal_wait_scacquire, 5) -DLWRAP(hsa_signal_load_scacquire, 1) -DLWRAP(hsa_signal_subtract_screlease, 2) -DLWRAP(hsa_queue_create, 8) -DLWRAP(hsa_queue_destroy, 1) -DLWRAP(hsa_queue_load_read_index_scacquire, 1) -DLWRAP(hsa_queue_add_write_index_relaxed, 2) -DLWRAP(hsa_memory_copy, 3) -DLWRAP(hsa_executable_create, 4) -DLWRAP(hsa_executable_create_alt, 4) -DLWRAP(hsa_executable_destroy, 1) -DLWRAP(hsa_executable_freeze, 2) -DLWRAP(hsa_executable_validate, 2) -DLWRAP(hsa_executable_symbol_get_info, 3) -DLWRAP(hsa_executable_get_symbol_by_name, 4) -DLWRAP(hsa_executable_iterate_symbols, 3) -DLWRAP(hsa_code_object_deserialize, 4) -DLWRAP(hsa_executable_load_code_object, 4) -DLWRAP(hsa_code_object_destroy, 1) -DLWRAP(hsa_amd_agent_memory_pool_get_info, 4) -DLWRAP(hsa_amd_agent_iterate_memory_pools, 3) -DLWRAP(hsa_amd_memory_pool_allocate, 4) -DLWRAP(hsa_amd_memory_pool_free, 1) -DLWRAP(hsa_amd_memory_async_copy, 8) -DLWRAP(hsa_amd_memory_pool_get_info, 3) -DLWRAP(hsa_amd_agents_allow_access, 4) -DLWRAP(hsa_amd_memory_lock, 5) -DLWRAP(hsa_amd_memory_unlock, 1) -DLWRAP(hsa_amd_memory_fill, 3) -DLWRAP(hsa_amd_register_system_event_handler, 2) -DLWRAP(hsa_amd_signal_create, 5) -DLWRAP(hsa_amd_signal_async_handler, 5) -DLWRAP(hsa_amd_pointer_info, 5) - -DLWRAP_FINALIZE() - -#ifndef DYNAMIC_HSA_PATH -#define DYNAMIC_HSA_PATH "libhsa-runtime64.so" -#endif - -#ifndef TARGET_NAME -#error "Missing TARGET_NAME macro" -#endif -#ifndef DEBUG_PREFIX -#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL" -#endif - -static bool checkForHSA() { - // return true if dlopen succeeded and all functions found - - const char *HsaLib = DYNAMIC_HSA_PATH; - std::string ErrMsg; - auto DynlibHandle = std::make_unique( - llvm::sys::DynamicLibrary::getPermanentLibrary(HsaLib, &ErrMsg)); - if (!DynlibHandle->isValid()) { - DP("Unable to load library '%s': %s!\n", HsaLib, ErrMsg.c_str()); - return false; - } - - for (size_t I = 0; I < dlwrap::size(); I++) { - const char *Sym = dlwrap::symbol(I); - - void *P = DynlibHandle->getAddressOfSymbol(Sym); - if (P == nullptr) { - DP("Unable to find '%s' in '%s'!\n", Sym, HsaLib); - return false; - } - DP("Implementing %s with dlsym(%s) -> %p\n", Sym, Sym, P); - - *dlwrap::pointer(I) = P; - } - - return true; -} - -hsa_status_t hsa_init() { - if (!checkForHSA()) { - return HSA_STATUS_ERROR; - } - return dlwrap_hsa_init(); -} diff --git a/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa_ext_amd.h b/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa_ext_amd.h deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa_ext_amd.h +++ /dev/null @@ -1,163 +0,0 @@ -//===--- amdgpu/dynamic_hsa/hsa_ext_amd.h ------------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// The parts of the hsa api that are presently in use by the amdgpu plugin -// -//===----------------------------------------------------------------------===// -#ifndef HSA_RUNTIME_EXT_AMD_H_ -#define HSA_RUNTIME_EXT_AMD_H_ - -#include "hsa.h" - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct hsa_amd_memory_pool_s { - uint64_t handle; -} hsa_amd_memory_pool_t; - -typedef enum hsa_amd_memory_pool_global_flag_s { - HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT = 1, - HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED = 2, - HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED = 4 -} hsa_amd_memory_pool_global_flag_t; - -typedef enum { - HSA_AMD_SEGMENT_GLOBAL = 0, - HSA_AMD_SEGMENT_READONLY = 1, - HSA_AMD_SEGMENT_PRIVATE = 2, - HSA_AMD_SEGMENT_GROUP = 3, -} hsa_amd_segment_t; - -typedef enum { - HSA_AMD_MEMORY_POOL_INFO_SEGMENT = 0, - HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS = 1, - HSA_AMD_MEMORY_POOL_INFO_SIZE = 2, - HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED = 5, - HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE = 6, - HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT = 7, - HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL = 15, -} hsa_amd_memory_pool_info_t; - -typedef enum { - HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS = 0, -} hsa_amd_agent_memory_pool_info_t; - -typedef enum { - HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED = 0, -} hsa_amd_memory_pool_access_t; - -typedef enum hsa_amd_agent_info_s { - HSA_AMD_AGENT_INFO_CACHELINE_SIZE = 0xA001, - HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT = 0xA002, - HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY = 0xA003, - HSA_AMD_AGENT_INFO_PRODUCT_NAME = 0xA009, - HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU = 0xA00A, - HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU = 0xA00B, - HSA_AMD_AGENT_INFO_COOPERATIVE_QUEUES = 0xA010 -} hsa_amd_agent_info_t; - -hsa_status_t hsa_amd_memory_pool_get_info(hsa_amd_memory_pool_t memory_pool, - hsa_amd_memory_pool_info_t attribute, - void *value); - -hsa_status_t hsa_amd_agent_iterate_memory_pools( - hsa_agent_t agent, - hsa_status_t (*callback)(hsa_amd_memory_pool_t memory_pool, void *data), - void *data); - -hsa_status_t hsa_amd_memory_pool_allocate(hsa_amd_memory_pool_t memory_pool, - size_t size, uint32_t flags, - void **ptr); - -hsa_status_t hsa_amd_memory_pool_free(void *ptr); - -hsa_status_t hsa_amd_memory_async_copy(void *dst, hsa_agent_t dst_agent, - const void *src, hsa_agent_t src_agent, - size_t size, uint32_t num_dep_signals, - const hsa_signal_t *dep_signals, - hsa_signal_t completion_signal); - -hsa_status_t hsa_amd_agent_memory_pool_get_info( - hsa_agent_t agent, hsa_amd_memory_pool_t memory_pool, - hsa_amd_agent_memory_pool_info_t attribute, void *value); - -hsa_status_t hsa_amd_agents_allow_access(uint32_t num_agents, - const hsa_agent_t *agents, - const uint32_t *flags, - const void *ptr); - -hsa_status_t hsa_amd_memory_lock(void* host_ptr, size_t size, - hsa_agent_t* agents, int num_agent, - void** agent_ptr); - -hsa_status_t hsa_amd_memory_unlock(void* host_ptr); - -hsa_status_t hsa_amd_memory_fill(void *ptr, uint32_t value, size_t count); - -typedef enum hsa_amd_event_type_s { - HSA_AMD_GPU_MEMORY_FAULT_EVENT = 0, -} hsa_amd_event_type_t; - -typedef struct hsa_amd_gpu_memory_fault_info_s { - hsa_agent_t agent; - uint64_t virtual_address; - uint32_t fault_reason_mask; -} hsa_amd_gpu_memory_fault_info_t; - -typedef struct hsa_amd_event_s { - hsa_amd_event_type_t event_type; - union { - hsa_amd_gpu_memory_fault_info_t memory_fault; - }; -} hsa_amd_event_t; - -typedef hsa_status_t (*hsa_amd_system_event_callback_t)( - const hsa_amd_event_t *event, void *data); - -hsa_status_t -hsa_amd_register_system_event_handler(hsa_amd_system_event_callback_t callback, - void *data); - -typedef enum { - HSA_AMD_MEMORY_FAULT_PAGE_NOT_PRESENT = 1 << 0, - HSA_AMD_MEMORY_FAULT_READ_ONLY = 1 << 1, - HSA_AMD_MEMORY_FAULT_NX = 1 << 2, - HSA_AMD_MEMORY_FAULT_HOST_ONLY = 1 << 3, - HSA_AMD_MEMORY_FAULT_DRAMECC = 1 << 4, - HSA_AMD_MEMORY_FAULT_IMPRECISE = 1 << 5, - HSA_AMD_MEMORY_FAULT_SRAMECC = 1 << 6, - HSA_AMD_MEMORY_FAULT_HANG = 1 << 31 -} hsa_amd_memory_fault_reason_t; - -typedef enum { - HSA_EXT_POINTER_TYPE_UNKNOWN = 0, - HSA_EXT_POINTER_TYPE_HSA = 1, - HSA_EXT_POINTER_TYPE_LOCKED = 2 -} hsa_amd_pointer_type_t; - -typedef struct hsa_amd_pointer_info_s { - uint32_t size; - hsa_amd_pointer_type_t type; - void* agentBaseAddress; - void* hostBaseAddress; - size_t sizeInBytes; -} hsa_amd_pointer_info_t; - -hsa_status_t hsa_amd_pointer_info(const void* ptr, - hsa_amd_pointer_info_t* info, - void* (*alloc)(size_t), - uint32_t* num_agents_accessible, - hsa_agent_t** accessible); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/openmp/libomptarget/plugins/amdgpu/impl/data.cpp b/openmp/libomptarget/plugins/amdgpu/impl/data.cpp deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/data.cpp +++ /dev/null @@ -1,37 +0,0 @@ -//===--- amdgpu/impl/data.cpp ------------------------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#include "impl_runtime.h" -#include "hsa_api.h" -#include "internal.h" -#include "rt.h" -#include -#include -#include -#include - -using core::TaskImpl; - -namespace core { -namespace Runtime { -hsa_status_t HostMalloc(void **ptr, size_t size, - hsa_amd_memory_pool_t MemoryPool) { - hsa_status_t err = hsa_amd_memory_pool_allocate(MemoryPool, size, 0, ptr); - DP("Malloced %p\n", *ptr); - if (err == HSA_STATUS_SUCCESS) { - err = core::allow_access_to_all_gpu_agents(*ptr); - } - return err; -} - -hsa_status_t Memfree(void *ptr) { - hsa_status_t err = hsa_amd_memory_pool_free(ptr); - DP("Freed %p\n", ptr); - return err; -} -} // namespace Runtime -} // namespace core diff --git a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h +++ /dev/null @@ -1,15 +0,0 @@ -//===--- amdgpu/impl/get_elf_mach_gfx_name.h ---------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#ifndef GET_ELF_MACH_GFX_NAME_H_INCLUDED -#define GET_ELF_MACH_GFX_NAME_H_INCLUDED - -#include - -const char *get_elf_mach_gfx_name(uint32_t EFlags); - -#endif diff --git a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp +++ /dev/null @@ -1,80 +0,0 @@ -//===--- amdgpu/impl/get_elf_mach_gfx_name.cpp -------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#include "get_elf_mach_gfx_name.h" - -// This header conflicts with the system elf.h (macros vs enums of the same -// identifier) and contains more up to date values for the enum checked here. -// rtl.cpp uses the system elf.h. -#include "llvm/BinaryFormat/ELF.h" - -const char *get_elf_mach_gfx_name(uint32_t EFlags) { - using namespace llvm::ELF; - uint32_t Gfx = (EFlags & EF_AMDGPU_MACH); - switch (Gfx) { - case EF_AMDGPU_MACH_AMDGCN_GFX801: - return "gfx801"; - case EF_AMDGPU_MACH_AMDGCN_GFX802: - return "gfx802"; - case EF_AMDGPU_MACH_AMDGCN_GFX803: - return "gfx803"; - case EF_AMDGPU_MACH_AMDGCN_GFX805: - return "gfx805"; - case EF_AMDGPU_MACH_AMDGCN_GFX810: - return "gfx810"; - case EF_AMDGPU_MACH_AMDGCN_GFX900: - return "gfx900"; - case EF_AMDGPU_MACH_AMDGCN_GFX902: - return "gfx902"; - case EF_AMDGPU_MACH_AMDGCN_GFX904: - return "gfx904"; - case EF_AMDGPU_MACH_AMDGCN_GFX906: - return "gfx906"; - case EF_AMDGPU_MACH_AMDGCN_GFX908: - return "gfx908"; - case EF_AMDGPU_MACH_AMDGCN_GFX909: - return "gfx909"; - case EF_AMDGPU_MACH_AMDGCN_GFX90A: - return "gfx90a"; - case EF_AMDGPU_MACH_AMDGCN_GFX90C: - return "gfx90c"; - case EF_AMDGPU_MACH_AMDGCN_GFX940: - return "gfx940"; - case EF_AMDGPU_MACH_AMDGCN_GFX1010: - return "gfx1010"; - case EF_AMDGPU_MACH_AMDGCN_GFX1011: - return "gfx1011"; - case EF_AMDGPU_MACH_AMDGCN_GFX1012: - return "gfx1012"; - case EF_AMDGPU_MACH_AMDGCN_GFX1013: - return "gfx1013"; - case EF_AMDGPU_MACH_AMDGCN_GFX1030: - return "gfx1030"; - case EF_AMDGPU_MACH_AMDGCN_GFX1031: - return "gfx1031"; - case EF_AMDGPU_MACH_AMDGCN_GFX1032: - return "gfx1032"; - case EF_AMDGPU_MACH_AMDGCN_GFX1033: - return "gfx1033"; - case EF_AMDGPU_MACH_AMDGCN_GFX1034: - return "gfx1034"; - case EF_AMDGPU_MACH_AMDGCN_GFX1035: - return "gfx1035"; - case EF_AMDGPU_MACH_AMDGCN_GFX1036: - return "gfx1036"; - case EF_AMDGPU_MACH_AMDGCN_GFX1100: - return "gfx1100"; - case EF_AMDGPU_MACH_AMDGCN_GFX1101: - return "gfx1101"; - case EF_AMDGPU_MACH_AMDGCN_GFX1102: - return "gfx1102"; - case EF_AMDGPU_MACH_AMDGCN_GFX1103: - return "gfx1103"; - default: - return "--unknown gfx"; - } -} diff --git a/openmp/libomptarget/plugins/amdgpu/impl/hsa_api.h b/openmp/libomptarget/plugins/amdgpu/impl/hsa_api.h deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/hsa_api.h +++ /dev/null @@ -1,26 +0,0 @@ -//===--- amdgpu/impl/hsa_api.h ------------------------------------ C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#ifndef AMDGPU_HSA_API_H_INCLUDED -#define AMDGPU_HSA_API_H_INCLUDED - -#if defined(__has_include) -#if __has_include("hsa/hsa.h") -#include "hsa/hsa.h" -#include "hsa/hsa_ext_amd.h" -#elif __has_include("hsa.h") -#include "hsa.h" -#include "hsa_ext_amd.h" -#endif -#else -#include "hsa/hsa.h" -#include "hsa_ext_amd.h" -#endif - - - -#endif diff --git a/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp b/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp +++ /dev/null @@ -1,182 +0,0 @@ -//===--- amdgpu/impl/impl.cpp ------------------------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#include "rt.h" -#include - -/* - * Data - */ - -hsa_status_t is_locked(void *ptr, void **agentBaseAddress) { - hsa_status_t err = HSA_STATUS_SUCCESS; - hsa_amd_pointer_info_t info; - info.size = sizeof(hsa_amd_pointer_info_t); - err = hsa_amd_pointer_info(ptr, &info, /*alloc=*/nullptr, - /*num_agents_accessible=*/nullptr, - /*accessible=*/nullptr); - if (err != HSA_STATUS_SUCCESS) { - DP("Error when getting pointer info\n"); - return err; - } - - if (info.type == HSA_EXT_POINTER_TYPE_LOCKED) { - // When user passes in a basePtr+offset we need to fix the - // locked pointer to include the offset: ROCr always returns - // the base locked address, not the shifted one. - if ((char *)info.hostBaseAddress <= (char *)ptr && - (char *)ptr < (char *)info.hostBaseAddress + info.sizeInBytes) - *agentBaseAddress = - (void *)((uint64_t)info.agentBaseAddress + (uint64_t)ptr - - (uint64_t)info.hostBaseAddress); - else // address is already device-agent accessible, no need to compute - // offset - *agentBaseAddress = ptr; - } else - *agentBaseAddress = nullptr; - - return HSA_STATUS_SUCCESS; -} - -// host pointer (either src or dest) must be locked via hsa_amd_memory_lock -static hsa_status_t invoke_hsa_copy(hsa_signal_t signal, void *dest, - hsa_agent_t agent, const void *src, - size_t size) { - const hsa_signal_value_t init = 1; - const hsa_signal_value_t success = 0; - hsa_signal_store_screlease(signal, init); - - hsa_status_t err = hsa_amd_memory_async_copy(dest, agent, src, agent, size, 0, - nullptr, signal); - if (err != HSA_STATUS_SUCCESS) - return err; - - // async_copy reports success by decrementing and failure by setting to < 0 - hsa_signal_value_t got = init; - while (got == init) - got = hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_NE, init, - UINT64_MAX, HSA_WAIT_STATE_BLOCKED); - - if (got != success) - return HSA_STATUS_ERROR; - - return err; -} - -struct implFreePtrDeletor { - void operator()(void *p) { - core::Runtime::Memfree(p); // ignore failure to free - } -}; - -enum CopyDirection { H2D, D2H }; - -static hsa_status_t locking_async_memcpy(enum CopyDirection direction, - hsa_signal_t signal, void *dest, - hsa_agent_t agent, void *src, - void *lockingPtr, size_t size) { - void *lockedPtr = nullptr; - hsa_status_t err = is_locked(lockingPtr, &lockedPtr); - bool HostPtrIsLocked = true; - if (err != HSA_STATUS_SUCCESS) - return err; - if (!lockedPtr) { // not locked - HostPtrIsLocked = false; - hsa_agent_t agents[1] = {agent}; - err = hsa_amd_memory_lock(lockingPtr, size, agents, /*num_agent=*/1, - (void **)&lockedPtr); - if (err != HSA_STATUS_SUCCESS) - return err; - DP("locking_async_memcpy: lockingPtr=%p lockedPtr=%p Size = %lu\n", - lockingPtr, lockedPtr, size); - } - - switch (direction) { - case H2D: - err = invoke_hsa_copy(signal, dest, agent, lockedPtr, size); - break; - case D2H: - err = invoke_hsa_copy(signal, lockedPtr, agent, src, size); - break; - } - - if (err != HSA_STATUS_SUCCESS && !HostPtrIsLocked) { - // do not leak locked host pointers, but discard potential error message - // because the initial error was in the copy function - hsa_amd_memory_unlock(lockingPtr); - return err; - } - - // unlock only if not user locked - if (!HostPtrIsLocked) - err = hsa_amd_memory_unlock(lockingPtr); - if (err != HSA_STATUS_SUCCESS) - return err; - - return HSA_STATUS_SUCCESS; -} - -hsa_status_t impl_memcpy_h2d(hsa_signal_t signal, void *deviceDest, - void *hostSrc, size_t size, - hsa_agent_t device_agent, - hsa_amd_memory_pool_t MemoryPool) { - hsa_status_t err; - - err = locking_async_memcpy(CopyDirection::H2D, signal, deviceDest, - device_agent, hostSrc, hostSrc, size); - - if (err == HSA_STATUS_SUCCESS) - return err; - - // async memcpy sometimes fails in situations where - // allocate + copy succeeds. Looks like it might be related to - // locking part of a read only segment. Fall back for now. - void *tempHostPtr; - hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool); - if (ret != HSA_STATUS_SUCCESS) { - DP("HostMalloc: Unable to alloc %zu bytes for temp scratch\n", size); - return ret; - } - std::unique_ptr del(tempHostPtr); - memcpy(tempHostPtr, hostSrc, size); - - return locking_async_memcpy(CopyDirection::H2D, signal, deviceDest, - device_agent, tempHostPtr, tempHostPtr, size); -} - -hsa_status_t impl_memcpy_d2h(hsa_signal_t signal, void *hostDest, - void *deviceSrc, size_t size, - hsa_agent_t deviceAgent, - hsa_amd_memory_pool_t MemoryPool) { - hsa_status_t err; - - // device has always visibility over both pointers, so use that - err = locking_async_memcpy(CopyDirection::D2H, signal, hostDest, deviceAgent, - deviceSrc, hostDest, size); - - if (err == HSA_STATUS_SUCCESS) - return err; - - // hsa_memory_copy sometimes fails in situations where - // allocate + copy succeeds. Looks like it might be related to - // locking part of a read only segment. Fall back for now. - void *tempHostPtr; - hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool); - if (ret != HSA_STATUS_SUCCESS) { - DP("HostMalloc: Unable to alloc %zu bytes for temp scratch\n", size); - return ret; - } - std::unique_ptr del(tempHostPtr); - - err = locking_async_memcpy(CopyDirection::D2H, signal, tempHostPtr, - deviceAgent, deviceSrc, tempHostPtr, size); - if (err != HSA_STATUS_SUCCESS) - return HSA_STATUS_ERROR; - - memcpy(hostDest, tempHostPtr, size); - return HSA_STATUS_SUCCESS; -} diff --git a/openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h b/openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h +++ /dev/null @@ -1,34 +0,0 @@ -//===--- amdgpu/impl/impl_runtime.h ------------------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#ifndef INCLUDE_IMPL_RUNTIME_H_ -#define INCLUDE_IMPL_RUNTIME_H_ - -#include "hsa_api.h" - -extern "C" { - -// Check if pointer ptr is already locked -hsa_status_t is_locked(void *ptr, void **agentBaseAddress); - -hsa_status_t impl_module_register_from_memory_to_place( - void *module_bytes, size_t module_size, int DeviceId, - hsa_status_t (*on_deserialized_data)(void *data, size_t size, - void *cb_state), - void *cb_state); - -hsa_status_t impl_memcpy_h2d(hsa_signal_t signal, void *deviceDest, - void *hostSrc, size_t size, - hsa_agent_t device_agent, - hsa_amd_memory_pool_t MemoryPool); - -hsa_status_t impl_memcpy_d2h(hsa_signal_t sig, void *hostDest, void *deviceSrc, - size_t size, hsa_agent_t device_agent, - hsa_amd_memory_pool_t MemoryPool); -} - -#endif // INCLUDE_IMPL_RUNTIME_H_ diff --git a/openmp/libomptarget/plugins/amdgpu/impl/internal.h b/openmp/libomptarget/plugins/amdgpu/impl/internal.h deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/internal.h +++ /dev/null @@ -1,154 +0,0 @@ -//===--- amdgpu/impl/internal.h ----------------------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#ifndef SRC_RUNTIME_INCLUDE_INTERNAL_H_ -#define SRC_RUNTIME_INCLUDE_INTERNAL_H_ -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "hsa_api.h" - -#include "impl_runtime.h" - -#ifndef TARGET_NAME -#error "Missing TARGET_NAME macro" -#endif -#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL" -#include "Debug.h" - -#define MAX_NUM_KERNELS (1024 * 16) - -// ---------------------- Kernel Start ------------- -typedef struct atl_kernel_info_s { - uint64_t kernel_object; - uint32_t group_segment_size; - uint32_t private_segment_size; - uint32_t sgpr_count; - uint32_t vgpr_count; - uint32_t sgpr_spill_count; - uint32_t vgpr_spill_count; - uint32_t kernel_segment_size; - uint32_t explicit_argument_count; - uint32_t implicit_argument_count; -} atl_kernel_info_t; - -typedef struct atl_symbol_info_s { - uint64_t addr; - uint32_t size; -} atl_symbol_info_t; - -// ---------------------- Kernel End ------------- - -namespace core { -class TaskgroupImpl; -class TaskImpl; -class Kernel; -class KernelImpl; -} // namespace core - -struct SignalPoolT { - SignalPoolT() {} - SignalPoolT(const SignalPoolT &) = delete; - SignalPoolT(SignalPoolT &&) = delete; - ~SignalPoolT() { - size_t N = state.size(); - for (size_t i = 0; i < N; i++) { - hsa_signal_t signal = state.front(); - state.pop(); - hsa_status_t rc = hsa_signal_destroy(signal); - if (rc != HSA_STATUS_SUCCESS) { - DP("Signal pool destruction failed\n"); - } - } - } - size_t size() { - lock l(&mutex); - return state.size(); - } - void push(hsa_signal_t s) { - lock l(&mutex); - state.push(s); - } - hsa_signal_t pop(void) { - lock l(&mutex); - if (!state.empty()) { - hsa_signal_t res = state.front(); - state.pop(); - return res; - } - - // Pool empty, attempt to create another signal - hsa_signal_t new_signal; - hsa_status_t err = hsa_signal_create(0, 0, NULL, &new_signal); - if (err == HSA_STATUS_SUCCESS) { - return new_signal; - } - - // Fail - return {0}; - } - -private: - static pthread_mutex_t mutex; - std::queue state; - struct lock { - lock(pthread_mutex_t *m) : m(m) { pthread_mutex_lock(m); } - ~lock() { pthread_mutex_unlock(m); } - pthread_mutex_t *m; - }; -}; - -namespace core { -hsa_status_t atl_init_gpu_context(); - -hsa_status_t init_hsa(); -hsa_status_t finalize_hsa(); -/* - * Generic utils - */ -template inline T alignDown(T value, size_t alignment) { - return (T)(value & ~(alignment - 1)); -} - -template inline T *alignDown(T *value, size_t alignment) { - return reinterpret_cast(alignDown((intptr_t)value, alignment)); -} - -template inline T alignUp(T value, size_t alignment) { - return alignDown((T)(value + alignment - 1), alignment); -} - -template inline T *alignUp(T *value, size_t alignment) { - return reinterpret_cast( - alignDown((intptr_t)(value + alignment - 1), alignment)); -} - -extern bool atl_is_impl_initialized(); - -bool handle_group_signal(hsa_signal_value_t value, void *arg); - -hsa_status_t allow_access_to_all_gpu_agents(void *ptr); -} // namespace core - -inline const char *get_error_string(hsa_status_t err) { - const char *res; - hsa_status_t rc = hsa_status_string(err, &res); - return (rc == HSA_STATUS_SUCCESS) ? res : "HSA_STATUS UNKNOWN."; -} - -#endif // SRC_RUNTIME_INCLUDE_INTERNAL_H_ diff --git a/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.h b/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.h deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.h +++ /dev/null @@ -1,26 +0,0 @@ -//===--- amdgpu/impl/interop_hsa.h -------------------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#ifndef INCLUDE_INTEROP_HSA_H_ -#define INCLUDE_INTEROP_HSA_H_ - -#include "impl_runtime.h" -#include "hsa_api.h" -#include "internal.h" - -#include -#include - -extern "C" { - -hsa_status_t interop_hsa_get_symbol_info( - const std::map &SymbolInfoTable, - int DeviceId, const char *symbol, void **var_addr, unsigned int *var_size); - -} - -#endif // INCLUDE_INTEROP_HSA_H_ diff --git a/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.cpp b/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.cpp deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.cpp +++ /dev/null @@ -1,39 +0,0 @@ -//===--- amdgpu/impl/interop_hsa.cpp ------------------------------ C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#include "interop_hsa.h" -#include "internal.h" - -hsa_status_t interop_hsa_get_symbol_info( - const std::map &SymbolInfoTable, - int DeviceId, const char *symbol, void **var_addr, unsigned int *var_size) { - /* - // Typical usage: - void *var_addr; - size_t var_size; - interop_hsa_get_symbol_addr(gpu_place, "symbol_name", &var_addr, - &var_size); - impl_memcpy(signal, host_add, var_addr, var_size); - */ - - if (!symbol || !var_addr || !var_size) - return HSA_STATUS_ERROR; - - // get the symbol info - std::string symbolStr = std::string(symbol); - auto It = SymbolInfoTable.find(symbolStr); - if (It != SymbolInfoTable.end()) { - atl_symbol_info_t info = It->second; - *var_addr = reinterpret_cast(info.addr); - *var_size = info.size; - return HSA_STATUS_SUCCESS; - } else { - *var_addr = NULL; - *var_size = 0; - return HSA_STATUS_ERROR; - } -} diff --git a/openmp/libomptarget/plugins/amdgpu/impl/msgpack.h b/openmp/libomptarget/plugins/amdgpu/impl/msgpack.h deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/msgpack.h +++ /dev/null @@ -1,282 +0,0 @@ -//===--- amdgpu/impl/msgpack.h ------------------------------------ C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#ifndef MSGPACK_H -#define MSGPACK_H - -#include - -namespace msgpack { - -// The message pack format is dynamically typed, schema-less. Format is: -// message: [type][header][payload] -// where type is one byte, header length is a fixed length function of type -// payload is zero to N bytes, with the length encoded in [type][header] - -// Scalar fields include boolean, signed integer, float, string etc -// Composite types are sequences of messages -// Array field is [header][element][element]... -// Map field is [header][key][value][key][value]... - -// Multibyte integer fields are big endian encoded -// The map key can be any message type -// Maps may contain duplicate keys -// Data is not uniquely encoded, e.g. integer "8" may be stored as one byte or -// in as many as nine, as signed or unsigned. Implementation defined. -// Similarly "foo" may embed the length in the type field or in multiple bytes - -// This parser is structured as an iterator over a sequence of bytes. -// It calls a user provided function on each message in order to extract fields -// The default implementation for each scalar type is to do nothing. For map or -// arrays, the default implementation returns just after that message to support -// iterating to the next message, but otherwise has no effect. - -struct byte_range { - const unsigned char *start; - const unsigned char *end; -}; - -const unsigned char *skip_next_message(const unsigned char *start, - const unsigned char *end); - -template class functors_defaults { -public: - void cb_string(size_t N, const unsigned char *str) { - derived().handle_string(N, str); - } - void cb_boolean(bool x) { derived().handle_boolean(x); } - void cb_signed(int64_t x) { derived().handle_signed(x); } - void cb_unsigned(uint64_t x) { derived().handle_unsigned(x); } - void cb_array_elements(byte_range bytes) { - derived().handle_array_elements(bytes); - } - void cb_map_elements(byte_range key, byte_range value) { - derived().handle_map_elements(key, value); - } - const unsigned char *cb_array(uint64_t N, byte_range bytes) { - return derived().handle_array(N, bytes); - } - const unsigned char *cb_map(uint64_t N, byte_range bytes) { - return derived().handle_map(N, bytes); - } - -private: - Derived &derived() { return *static_cast(this); } - - // Default implementations for scalar ops are no-ops - void handle_string(size_t, const unsigned char *) {} - void handle_boolean(bool) {} - void handle_signed(int64_t) {} - void handle_unsigned(uint64_t) {} - void handle_array_elements(byte_range) {} - void handle_map_elements(byte_range, byte_range) {} - - // Default implementation for sequences is to skip over the messages - const unsigned char *handle_array(uint64_t N, byte_range bytes) { - for (uint64_t i = 0; i < N; i++) { - const unsigned char *next = skip_next_message(bytes.start, bytes.end); - if (!next) { - return nullptr; - } - cb_array_elements(bytes); - bytes.start = next; - } - return bytes.start; - } - const unsigned char *handle_map(uint64_t N, byte_range bytes) { - for (uint64_t i = 0; i < N; i++) { - const unsigned char *start_key = bytes.start; - const unsigned char *end_key = skip_next_message(start_key, bytes.end); - if (!end_key) { - return nullptr; - } - const unsigned char *start_value = end_key; - const unsigned char *end_value = - skip_next_message(start_value, bytes.end); - if (!end_value) { - return nullptr; - } - cb_map_elements({start_key, end_key}, {start_value, end_value}); - bytes.start = end_value; - } - return bytes.start; - } -}; - -typedef enum : uint8_t { -#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) NAME, -#include "msgpack.def" -#undef X -} type; - -[[noreturn]] void internal_error(); -type parse_type(unsigned char x); -unsigned bytes_used_fixed(type ty); - -typedef uint64_t (*payload_info_t)(const unsigned char *); -payload_info_t payload_info(msgpack::type ty); - -template R bitcast(T x); - -template -const unsigned char *handle_msgpack_given_type(byte_range bytes, F f) { - const unsigned char *start = bytes.start; - const unsigned char *end = bytes.end; - const uint64_t available = end - start; - assert(available != 0); - assert(ty == parse_type(*start)); - - const uint64_t bytes_used = bytes_used_fixed(ty); - if (available < bytes_used) { - return 0; - } - const uint64_t available_post_header = available - bytes_used; - - const payload_info_t info = payload_info(ty); - const uint64_t N = info(start); - - switch (ty) { - case msgpack::t: - case msgpack::f: { - // t is 0b11000010, f is 0b11000011, masked with 0x1 - f.cb_boolean(N); - return start + bytes_used; - } - - case msgpack::posfixint: - case msgpack::uint8: - case msgpack::uint16: - case msgpack::uint32: - case msgpack::uint64: { - f.cb_unsigned(N); - return start + bytes_used; - } - - case msgpack::negfixint: - case msgpack::int8: - case msgpack::int16: - case msgpack::int32: - case msgpack::int64: { - f.cb_signed(bitcast(N)); - return start + bytes_used; - } - - case msgpack::fixstr: - case msgpack::str8: - case msgpack::str16: - case msgpack::str32: { - if (available_post_header < N) { - return 0; - } else { - f.cb_string(N, start + bytes_used); - return start + bytes_used + N; - } - } - - case msgpack::fixarray: - case msgpack::array16: - case msgpack::array32: { - return f.cb_array(N, {start + bytes_used, end}); - } - - case msgpack::fixmap: - case msgpack::map16: - case msgpack::map32: { - return f.cb_map(N, {start + bytes_used, end}); - } - - case msgpack::nil: - case msgpack::bin8: - case msgpack::bin16: - case msgpack::bin32: - case msgpack::float32: - case msgpack::float64: - case msgpack::ext8: - case msgpack::ext16: - case msgpack::ext32: - case msgpack::fixext1: - case msgpack::fixext2: - case msgpack::fixext4: - case msgpack::fixext8: - case msgpack::fixext16: - case msgpack::never_used: { - if (available_post_header < N) { - return 0; - } - return start + bytes_used + N; - } - } - internal_error(); -} - -template -const unsigned char *handle_msgpack(byte_range bytes, F f) { - const unsigned char *start = bytes.start; - const unsigned char *end = bytes.end; - const uint64_t available = end - start; - if (available == 0) { - return 0; - } - const type ty = parse_type(*start); - - switch (ty) { -#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \ - case msgpack::NAME: \ - return handle_msgpack_given_type(bytes, f); -#include "msgpack.def" -#undef X - } - - internal_error(); -} - -bool message_is_string(byte_range bytes, const char *str); - -template void foronly_string(byte_range bytes, C callback) { - struct inner : functors_defaults { - inner(C &cb) : cb(cb) {} - C &cb; - void handle_string(size_t N, const unsigned char *str) { cb(N, str); } - }; - handle_msgpack(bytes, {callback}); -} - -template void foronly_unsigned(byte_range bytes, C callback) { - struct inner : functors_defaults { - inner(C &cb) : cb(cb) {} - C &cb; - void handle_unsigned(uint64_t x) { cb(x); } - }; - handle_msgpack(bytes, {callback}); -} - -template void foreach_array(byte_range bytes, C callback) { - struct inner : functors_defaults { - inner(C &cb) : cb(cb) {} - C &cb; - void handle_array_elements(byte_range element) { cb(element); } - }; - handle_msgpack(bytes, {callback}); -} - -template void foreach_map(byte_range bytes, C callback) { - struct inner : functors_defaults { - inner(C &cb) : cb(cb) {} - C &cb; - void handle_map_elements(byte_range key, byte_range value) { - cb(key, value); - } - }; - handle_msgpack(bytes, {callback}); -} - -// Crude approximation to json -void dump(byte_range); - -} // namespace msgpack - -#endif diff --git a/openmp/libomptarget/plugins/amdgpu/impl/msgpack.cpp b/openmp/libomptarget/plugins/amdgpu/impl/msgpack.cpp deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/msgpack.cpp +++ /dev/null @@ -1,271 +0,0 @@ -//===--- amdgpu/impl/msgpack.cpp ---------------------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#include -#include -#include -#include -#include - -#include "msgpack.h" - -namespace msgpack { - -[[noreturn]] void internal_error() { - printf("internal error\n"); - exit(1); -} - -const char *type_name(type ty) { - switch (ty) { -#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \ - case NAME: \ - return #NAME; -#include "msgpack.def" -#undef X - } - internal_error(); -} - -unsigned bytes_used_fixed(msgpack::type ty) { - using namespace msgpack; - switch (ty) { -#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \ - case NAME: \ - return WIDTH; -#include "msgpack.def" -#undef X - } - internal_error(); -} - -msgpack::type parse_type(unsigned char x) { - -#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \ - if (x >= LOWER && x <= UPPER) { \ - return NAME; \ - } else -#include "msgpack.def" -#undef X - { internal_error(); } -} - -template R bitcast(T x) { - static_assert(sizeof(T) == sizeof(R), ""); - R tmp; - memcpy(&tmp, &x, sizeof(T)); - return tmp; -} -template int64_t bitcast(uint64_t); -} // namespace msgpack - -// Helper functions for reading additional payload from the header -// Depending on the type, this can be a number of bytes, elements, -// key-value pairs or an embedded integer. -// Each takes a pointer to the start of the header and returns a uint64_t - -namespace { -namespace payload { -uint64_t read_zero(const unsigned char *) { return 0; } - -// Read the first byte and zero/sign extend it -uint64_t read_embedded_u8(const unsigned char *start) { return start[0]; } -uint64_t read_embedded_s8(const unsigned char *start) { - int64_t res = msgpack::bitcast(start[0]); - return msgpack::bitcast(res); -} - -// Read a masked part of the first byte -uint64_t read_via_mask_0x1(const unsigned char *start) { return *start & 0x1u; } -uint64_t read_via_mask_0xf(const unsigned char *start) { return *start & 0xfu; } -uint64_t read_via_mask_0x1f(const unsigned char *start) { - return *start & 0x1fu; -} - -// Read 1/2/4/8 bytes immediately following the type byte and zero/sign extend -// Big endian format. -uint64_t read_size_field_u8(const unsigned char *from) { - from++; - return from[0]; -} - -// TODO: detect whether host is little endian or not, and whether the intrinsic -// is available. And probably use the builtin to test the diy -const bool use_bswap = false; - -uint64_t read_size_field_u16(const unsigned char *from) { - from++; - if (use_bswap) { - uint16_t b; - memcpy(&b, from, 2); - return __builtin_bswap16(b); - } else { - return (from[0] << 8u) | from[1]; - } -} -uint64_t read_size_field_u32(const unsigned char *from) { - from++; - if (use_bswap) { - uint32_t b; - memcpy(&b, from, 4); - return __builtin_bswap32(b); - } else { - return (from[0] << 24u) | (from[1] << 16u) | (from[2] << 8u) | - (from[3] << 0u); - } -} -uint64_t read_size_field_u64(const unsigned char *from) { - from++; - if (use_bswap) { - uint64_t b; - memcpy(&b, from, 8); - return __builtin_bswap64(b); - } else { - return ((uint64_t)from[0] << 56u) | ((uint64_t)from[1] << 48u) | - ((uint64_t)from[2] << 40u) | ((uint64_t)from[3] << 32u) | - (from[4] << 24u) | (from[5] << 16u) | (from[6] << 8u) | - (from[7] << 0u); - } -} - -uint64_t read_size_field_s8(const unsigned char *from) { - uint8_t u = read_size_field_u8(from); - int64_t res = msgpack::bitcast(u); - return msgpack::bitcast(res); -} -uint64_t read_size_field_s16(const unsigned char *from) { - uint16_t u = read_size_field_u16(from); - int64_t res = msgpack::bitcast(u); - return msgpack::bitcast(res); -} -uint64_t read_size_field_s32(const unsigned char *from) { - uint32_t u = read_size_field_u32(from); - int64_t res = msgpack::bitcast(u); - return msgpack::bitcast(res); -} -uint64_t read_size_field_s64(const unsigned char *from) { - uint64_t u = read_size_field_u64(from); - int64_t res = msgpack::bitcast(u); - return msgpack::bitcast(res); -} -} // namespace payload -} // namespace - -namespace msgpack { - -payload_info_t payload_info(msgpack::type ty) { - using namespace msgpack; - switch (ty) { -#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \ - case NAME: \ - return payload::PAYLOAD; -#include "msgpack.def" -#undef X - } - internal_error(); -} - -} // namespace msgpack - -const unsigned char *msgpack::skip_next_message(const unsigned char *start, - const unsigned char *end) { - class f : public functors_defaults {}; - return handle_msgpack({start, end}, f()); -} - -namespace msgpack { -bool message_is_string(byte_range bytes, const char *needle) { - bool matched = false; - size_t needleN = strlen(needle); - - foronly_string(bytes, [=, &matched](size_t N, const unsigned char *str) { - if (N == needleN) { - if (memcmp(needle, str, N) == 0) { - matched = true; - } - } - }); - return matched; -} - -void dump(byte_range bytes) { - struct inner : functors_defaults { - inner(unsigned indent) : indent(indent) {} - const unsigned by = 2; - unsigned indent = 0; - - void handle_string(size_t N, const unsigned char *bytes) { - char *tmp = (char *)malloc(N + 1); - memcpy(tmp, bytes, N); - tmp[N] = '\0'; - printf("\"%s\"", tmp); - free(tmp); - } - - void handle_signed(int64_t x) { printf("%ld", x); } - void handle_unsigned(uint64_t x) { printf("%lu", x); } - - const unsigned char *handle_array(uint64_t N, byte_range bytes) { - printf("\n%*s[\n", indent, ""); - indent += by; - - for (uint64_t i = 0; i < N; i++) { - indent += by; - printf("%*s", indent, ""); - const unsigned char *next = handle_msgpack(bytes, {indent}); - printf(",\n"); - indent -= by; - bytes.start = next; - if (!next) { - break; - } - } - indent -= by; - printf("%*s]", indent, ""); - - return bytes.start; - } - - const unsigned char *handle_map(uint64_t N, byte_range bytes) { - printf("\n%*s{\n", indent, ""); - indent += by; - - for (uint64_t i = 0; i < 2 * N; i += 2) { - const unsigned char *start_key = bytes.start; - printf("%*s", indent, ""); - const unsigned char *end_key = - handle_msgpack({start_key, bytes.end}, {indent}); - if (!end_key) { - break; - } - - printf(" : "); - - const unsigned char *start_value = end_key; - const unsigned char *end_value = - handle_msgpack({start_value, bytes.end}, {indent}); - - if (!end_value) { - break; - } - - printf(",\n"); - bytes.start = end_value; - } - - indent -= by; - printf("%*s}", indent, ""); - - return bytes.start; - } - }; - - handle_msgpack(bytes, {0}); - printf("\n"); -} - -} // namespace msgpack diff --git a/openmp/libomptarget/plugins/amdgpu/impl/msgpack.def b/openmp/libomptarget/plugins/amdgpu/impl/msgpack.def deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/msgpack.def +++ /dev/null @@ -1,46 +0,0 @@ -//===--- amdgpu/impl/msgpack.def ---------------------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// name, header width, reader, [lower, upper] encoding -X(posfixint, 1, read_embedded_u8, 0x00, 0x7f) -X(negfixint, 1, read_embedded_s8, 0xe0, 0xff) -X(fixmap, 1, read_via_mask_0xf, 0x80, 0x8f) -X(fixarray, 1, read_via_mask_0xf, 0x90, 0x9f) -X(fixstr, 1, read_via_mask_0x1f, 0xa0, 0xbf) -X(nil, 1, read_zero, 0xc0, 0xc0) -X(never_used, 1, read_zero, 0xc1, 0xc1) -X(f, 1, read_via_mask_0x1, 0xc2, 0xc2) -X(t, 1, read_via_mask_0x1, 0xc3, 0xc3) -X(bin8, 2, read_size_field_u8, 0xc4, 0xc4) -X(bin16, 3, read_size_field_u16, 0xc5, 0xc5) -X(bin32, 5, read_size_field_u32, 0xc6, 0xc6) -X(ext8, 3, read_size_field_u8, 0xc7, 0xc7) -X(ext16, 4, read_size_field_u16, 0xc8, 0xc8) -X(ext32, 6, read_size_field_u32, 0xc9, 0xc9) -X(float32, 5, read_zero, 0xca, 0xca) -X(float64, 9, read_zero, 0xcb, 0xcb) -X(uint8, 2, read_size_field_u8, 0xcc, 0xcc) -X(uint16, 3, read_size_field_u16, 0xcd, 0xcd) -X(uint32, 5, read_size_field_u32, 0xce, 0xce) -X(uint64, 9, read_size_field_u64, 0xcf, 0xcf) -X(int8, 2, read_size_field_s8, 0xd0, 0xd0) -X(int16, 3, read_size_field_s16, 0xd1, 0xd1) -X(int32, 5, read_size_field_s32, 0xd2, 0xd2) -X(int64, 9, read_size_field_s64, 0xd3, 0xd3) -X(fixext1, 3, read_zero, 0xd4, 0xd4) -X(fixext2, 4, read_zero, 0xd5, 0xd5) -X(fixext4, 6, read_zero, 0xd6, 0xd6) -X(fixext8, 10, read_zero, 0xd7, 0xd7) -X(fixext16, 18, read_zero, 0xd8, 0xd8) -X(str8, 2, read_size_field_u8, 0xd9, 0xd9) -X(str16, 3, read_size_field_u16, 0xda, 0xda) -X(str32, 5, read_size_field_u32, 0xdb, 0xdb) -X(array16, 3, read_size_field_u16, 0xdc, 0xdc) -X(array32, 5, read_size_field_u32, 0xdd, 0xdd) -X(map16, 3, read_size_field_u16, 0xde, 0xde) -X(map32, 5, read_size_field_u32, 0xdf, 0xdf) diff --git a/openmp/libomptarget/plugins/amdgpu/impl/rt.h b/openmp/libomptarget/plugins/amdgpu/impl/rt.h deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/rt.h +++ /dev/null @@ -1,34 +0,0 @@ -//===--- amdgpu/impl/rt.h ----------------------------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#ifndef SRC_RUNTIME_INCLUDE_RT_H_ -#define SRC_RUNTIME_INCLUDE_RT_H_ - -#include "hsa_api.h" -#include "impl_runtime.h" -#include "internal.h" - -#include - -namespace core { -namespace Runtime { -hsa_status_t Memfree(void *); -hsa_status_t HostMalloc(void **ptr, size_t size, - hsa_amd_memory_pool_t MemoryPool); - -} // namespace Runtime -hsa_status_t RegisterModuleFromMemory( - std::map &KernelInfoTable, - std::map &SymbolInfoTable, - void *module_bytes, size_t module_size, hsa_agent_t agent, - hsa_status_t (*on_deserialized_data)(void *data, size_t size, - void *cb_state), - void *cb_state, std::vector &HSAExecutables); - -} // namespace core - -#endif // SRC_RUNTIME_INCLUDE_RT_H_ diff --git a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp +++ /dev/null @@ -1,744 +0,0 @@ -//===--- amdgpu/impl/system.cpp ----------------------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "llvm/ADT/StringRef.h" -#include "llvm/BinaryFormat/ELF.h" -#include "llvm/Object/ELF.h" -#include "llvm/Object/ELFObjectFile.h" - -#include -#include -#include - -#include "internal.h" -#include "rt.h" - -#include "msgpack.h" - -using namespace llvm; -using namespace llvm::object; -using namespace llvm::ELF; - -namespace hsa { -// Wrap HSA iterate API in a shim that allows passing general callables -template -hsa_status_t executable_iterate_symbols(hsa_executable_t executable, C cb) { - auto L = [](hsa_executable_t executable, hsa_executable_symbol_t symbol, - void *data) -> hsa_status_t { - C *unwrapped = static_cast(data); - return (*unwrapped)(executable, symbol); - }; - return hsa_executable_iterate_symbols(executable, L, - static_cast(&cb)); -} -} // namespace hsa - -typedef unsigned char *address; -/* - * Note descriptors. - */ -// FreeBSD already declares Elf_Note (indirectly via ) -#if !defined(__FreeBSD__) -typedef struct { - uint32_t n_namesz; /* Length of note's name. */ - uint32_t n_descsz; /* Length of note's value. */ - uint32_t n_type; /* Type of note. */ - // then name - // then padding, optional - // then desc, at 4 byte alignment (not 8, despite being elf64) -} Elf_Note; -#endif - -class KernelArgMD { -public: - enum class ValueKind { - HiddenGlobalOffsetX, - HiddenGlobalOffsetY, - HiddenGlobalOffsetZ, - HiddenNone, - HiddenPrintfBuffer, - HiddenDefaultQueue, - HiddenCompletionAction, - HiddenMultiGridSyncArg, - HiddenHostcallBuffer, - HiddenHeapV1, - Unknown - }; - - KernelArgMD() - : name_(std::string()), size_(0), offset_(0), - valueKind_(ValueKind::Unknown) {} - - // fields - std::string name_; - uint32_t size_; - uint32_t offset_; - ValueKind valueKind_; -}; - -static const std::map ArgValueKind = { - // v3 - // {"by_value", KernelArgMD::ValueKind::ByValue}, - // {"global_buffer", KernelArgMD::ValueKind::GlobalBuffer}, - // {"dynamic_shared_pointer", - // KernelArgMD::ValueKind::DynamicSharedPointer}, - // {"sampler", KernelArgMD::ValueKind::Sampler}, - // {"image", KernelArgMD::ValueKind::Image}, - // {"pipe", KernelArgMD::ValueKind::Pipe}, - // {"queue", KernelArgMD::ValueKind::Queue}, - {"hidden_global_offset_x", KernelArgMD::ValueKind::HiddenGlobalOffsetX}, - {"hidden_global_offset_y", KernelArgMD::ValueKind::HiddenGlobalOffsetY}, - {"hidden_global_offset_z", KernelArgMD::ValueKind::HiddenGlobalOffsetZ}, - {"hidden_none", KernelArgMD::ValueKind::HiddenNone}, - {"hidden_printf_buffer", KernelArgMD::ValueKind::HiddenPrintfBuffer}, - {"hidden_default_queue", KernelArgMD::ValueKind::HiddenDefaultQueue}, - {"hidden_completion_action", - KernelArgMD::ValueKind::HiddenCompletionAction}, - {"hidden_multigrid_sync_arg", - KernelArgMD::ValueKind::HiddenMultiGridSyncArg}, - {"hidden_hostcall_buffer", KernelArgMD::ValueKind::HiddenHostcallBuffer}, - {"hidden_heap_v1", KernelArgMD::ValueKind::HiddenHeapV1}}; - -namespace core { - -hsa_status_t callbackEvent(const hsa_amd_event_t *event, void *data) { - if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT) { - hsa_amd_gpu_memory_fault_info_t memory_fault = event->memory_fault; - // memory_fault.agent - // memory_fault.virtual_address - // memory_fault.fault_reason_mask - // fprintf("[GPU Error at %p: Reason is ", memory_fault.virtual_address); - std::stringstream stream; - stream << std::hex << (uintptr_t)memory_fault.virtual_address; - std::string addr("0x" + stream.str()); - - std::string err_string = "[GPU Memory Error] Addr: " + addr; - err_string += " Reason: "; - if (!(memory_fault.fault_reason_mask & 0x00111111)) { - err_string += "No Idea! "; - } else { - if (memory_fault.fault_reason_mask & 0x00000001) - err_string += "Page not present or supervisor privilege. "; - if (memory_fault.fault_reason_mask & 0x00000010) - err_string += "Write access to a read-only page. "; - if (memory_fault.fault_reason_mask & 0x00000100) - err_string += "Execute access to a page marked NX. "; - if (memory_fault.fault_reason_mask & 0x00001000) - err_string += "Host access only. "; - if (memory_fault.fault_reason_mask & 0x00010000) - err_string += "ECC failure (if supported by HW). "; - if (memory_fault.fault_reason_mask & 0x00100000) - err_string += "Can't determine the exact fault address. "; - } - fprintf(stderr, "%s\n", err_string.c_str()); - return HSA_STATUS_ERROR; - } - return HSA_STATUS_SUCCESS; -} - -hsa_status_t atl_init_gpu_context() { - hsa_status_t err = hsa_amd_register_system_event_handler(callbackEvent, NULL); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Registering the system for memory faults", get_error_string(err)); - return HSA_STATUS_ERROR; - } - - return HSA_STATUS_SUCCESS; -} - -static bool isImplicit(KernelArgMD::ValueKind value_kind) { - switch (value_kind) { - case KernelArgMD::ValueKind::HiddenGlobalOffsetX: - case KernelArgMD::ValueKind::HiddenGlobalOffsetY: - case KernelArgMD::ValueKind::HiddenGlobalOffsetZ: - case KernelArgMD::ValueKind::HiddenNone: - case KernelArgMD::ValueKind::HiddenPrintfBuffer: - case KernelArgMD::ValueKind::HiddenDefaultQueue: - case KernelArgMD::ValueKind::HiddenCompletionAction: - case KernelArgMD::ValueKind::HiddenMultiGridSyncArg: - case KernelArgMD::ValueKind::HiddenHostcallBuffer: - case KernelArgMD::ValueKind::HiddenHeapV1: - return true; - default: - return false; - } -} - -static std::pair -findMetadata(const ELFObjectFile &ELFObj) { - constexpr std::pair Failure = { - nullptr, nullptr}; - const auto &Elf = ELFObj.getELFFile(); - auto PhdrsOrErr = Elf.program_headers(); - if (!PhdrsOrErr) { - consumeError(PhdrsOrErr.takeError()); - return Failure; - } - - for (auto Phdr : *PhdrsOrErr) { - if (Phdr.p_type != PT_NOTE) - continue; - - Error Err = Error::success(); - for (auto Note : Elf.notes(Phdr, Err)) { - if (Note.getType() == 7 || Note.getType() == 8) - return Failure; - - // Code object v2 uses yaml metadata and is no longer supported. - if (Note.getType() == NT_AMD_HSA_METADATA && Note.getName() == "AMD") - return Failure; - // Code object v3 should have AMDGPU metadata. - if (Note.getType() == NT_AMDGPU_METADATA && Note.getName() != "AMDGPU") - return Failure; - - ArrayRef Desc = Note.getDesc(); - return {Desc.data(), Desc.data() + Desc.size()}; - } - - if (Err) { - consumeError(std::move(Err)); - return Failure; - } - } - - return Failure; -} - -static std::pair -find_metadata(void *binary, size_t binSize) { - constexpr std::pair Failure = { - nullptr, nullptr}; - - StringRef Buffer = StringRef(static_cast(binary), binSize); - auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""), - /*InitContent=*/false); - if (!ElfOrErr) { - consumeError(ElfOrErr.takeError()); - return Failure; - } - - if (const auto *ELFObj = dyn_cast(ElfOrErr->get())) - return findMetadata(*ELFObj); - return Failure; -} - -namespace { -int map_lookup_array(msgpack::byte_range message, const char *needle, - msgpack::byte_range *res, uint64_t *size) { - unsigned count = 0; - struct s : msgpack::functors_defaults { - s(unsigned &count, uint64_t *size) : count(count), size(size) {} - unsigned &count; - uint64_t *size; - const unsigned char *handle_array(uint64_t N, msgpack::byte_range bytes) { - count++; - *size = N; - return bytes.end; - } - }; - - msgpack::foreach_map(message, - [&](msgpack::byte_range key, msgpack::byte_range value) { - if (msgpack::message_is_string(key, needle)) { - // If the message is an array, record number of - // elements in *size - msgpack::handle_msgpack(value, {count, size}); - // return the whole array - *res = value; - } - }); - // Only claim success if exactly one key/array pair matched - return count != 1; -} - -int map_lookup_string(msgpack::byte_range message, const char *needle, - std::string *res) { - unsigned count = 0; - struct s : public msgpack::functors_defaults { - s(unsigned &count, std::string *res) : count(count), res(res) {} - unsigned &count; - std::string *res; - void handle_string(size_t N, const unsigned char *str) { - count++; - *res = std::string(str, str + N); - } - }; - msgpack::foreach_map(message, - [&](msgpack::byte_range key, msgpack::byte_range value) { - if (msgpack::message_is_string(key, needle)) { - msgpack::handle_msgpack(value, {count, res}); - } - }); - return count != 1; -} - -int map_lookup_uint64_t(msgpack::byte_range message, const char *needle, - uint64_t *res) { - unsigned count = 0; - msgpack::foreach_map(message, - [&](msgpack::byte_range key, msgpack::byte_range value) { - if (msgpack::message_is_string(key, needle)) { - msgpack::foronly_unsigned(value, [&](uint64_t x) { - count++; - *res = x; - }); - } - }); - return count != 1; -} - -int array_lookup_element(msgpack::byte_range message, uint64_t elt, - msgpack::byte_range *res) { - int rc = 1; - uint64_t i = 0; - msgpack::foreach_array(message, [&](msgpack::byte_range value) { - if (i == elt) { - *res = value; - rc = 0; - } - i++; - }); - return rc; -} - -int populate_kernelArgMD(msgpack::byte_range args_element, - KernelArgMD *kernelarg) { - using namespace msgpack; - int error = 0; - foreach_map(args_element, [&](byte_range key, byte_range value) -> void { - if (message_is_string(key, ".name")) { - foronly_string(value, [&](size_t N, const unsigned char *str) { - kernelarg->name_ = std::string(str, str + N); - }); - } else if (message_is_string(key, ".size")) { - foronly_unsigned(value, [&](uint64_t x) { kernelarg->size_ = x; }); - } else if (message_is_string(key, ".offset")) { - foronly_unsigned(value, [&](uint64_t x) { kernelarg->offset_ = x; }); - } else if (message_is_string(key, ".value_kind")) { - foronly_string(value, [&](size_t N, const unsigned char *str) { - std::string s = std::string(str, str + N); - auto itValueKind = ArgValueKind.find(s); - if (itValueKind != ArgValueKind.end()) { - kernelarg->valueKind_ = itValueKind->second; - } - }); - } - }); - return error; -} -} // namespace - -static hsa_status_t get_code_object_custom_metadata( - void *binary, size_t binSize, - std::map &KernelInfoTable) { - // parse code object with different keys from v2 - // also, the kernel name is not the same as the symbol name -- so a - // symbol->name map is needed - - std::pair metadata = - find_metadata(binary, binSize); - if (!metadata.first) { - return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; - } - - uint64_t kernelsSize = 0; - int msgpack_errors = 0; - msgpack::byte_range kernel_array; - msgpack_errors = - map_lookup_array({metadata.first, metadata.second}, "amdhsa.kernels", - &kernel_array, &kernelsSize); - if (msgpack_errors != 0) { - printf("[%s:%d] %s failed\n", __FILE__, __LINE__, - "kernels lookup in program metadata"); - return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; - } - - for (size_t i = 0; i < kernelsSize; i++) { - assert(msgpack_errors == 0); - std::string kernelName; - std::string symbolName; - - msgpack::byte_range element; - msgpack_errors += array_lookup_element(kernel_array, i, &element); - if (msgpack_errors != 0) { - printf("[%s:%d] %s failed\n", __FILE__, __LINE__, - "element lookup in kernel metadata"); - return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; - } - - msgpack_errors += map_lookup_string(element, ".name", &kernelName); - msgpack_errors += map_lookup_string(element, ".symbol", &symbolName); - if (msgpack_errors != 0) { - printf("[%s:%d] %s failed\n", __FILE__, __LINE__, - "strings lookup in kernel metadata"); - return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; - } - - // Make sure that kernelName + ".kd" == symbolName - if ((kernelName + ".kd") != symbolName) { - printf("[%s:%d] Kernel name mismatching symbol: %s != %s + .kd\n", - __FILE__, __LINE__, symbolName.c_str(), kernelName.c_str()); - return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; - } - - atl_kernel_info_t info = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - - uint64_t sgpr_count, vgpr_count, sgpr_spill_count, vgpr_spill_count; - msgpack_errors += map_lookup_uint64_t(element, ".sgpr_count", &sgpr_count); - if (msgpack_errors != 0) { - printf("[%s:%d] %s failed\n", __FILE__, __LINE__, - "sgpr count metadata lookup in kernel metadata"); - return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; - } - - info.sgpr_count = sgpr_count; - - msgpack_errors += map_lookup_uint64_t(element, ".vgpr_count", &vgpr_count); - if (msgpack_errors != 0) { - printf("[%s:%d] %s failed\n", __FILE__, __LINE__, - "vgpr count metadata lookup in kernel metadata"); - return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; - } - - info.vgpr_count = vgpr_count; - - msgpack_errors += - map_lookup_uint64_t(element, ".sgpr_spill_count", &sgpr_spill_count); - if (msgpack_errors != 0) { - printf("[%s:%d] %s failed\n", __FILE__, __LINE__, - "sgpr spill count metadata lookup in kernel metadata"); - return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; - } - - info.sgpr_spill_count = sgpr_spill_count; - - msgpack_errors += - map_lookup_uint64_t(element, ".vgpr_spill_count", &vgpr_spill_count); - if (msgpack_errors != 0) { - printf("[%s:%d] %s failed\n", __FILE__, __LINE__, - "vgpr spill count metadata lookup in kernel metadata"); - return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; - } - - info.vgpr_spill_count = vgpr_spill_count; - - size_t kernel_explicit_args_size = 0; - uint64_t kernel_segment_size; - msgpack_errors += map_lookup_uint64_t(element, ".kernarg_segment_size", - &kernel_segment_size); - if (msgpack_errors != 0) { - printf("[%s:%d] %s failed\n", __FILE__, __LINE__, - "kernarg segment size metadata lookup in kernel metadata"); - return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; - } - - bool hasHiddenArgs = false; - if (kernel_segment_size > 0) { - uint64_t argsSize; - size_t offset = 0; - - msgpack::byte_range args_array; - msgpack_errors += - map_lookup_array(element, ".args", &args_array, &argsSize); - if (msgpack_errors != 0) { - printf("[%s:%d] %s failed\n", __FILE__, __LINE__, - "kernel args metadata lookup in kernel metadata"); - return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; - } - - for (size_t i = 0; i < argsSize; ++i) { - KernelArgMD lcArg; - - msgpack::byte_range args_element; - msgpack_errors += array_lookup_element(args_array, i, &args_element); - if (msgpack_errors != 0) { - printf("[%s:%d] %s failed\n", __FILE__, __LINE__, - "iterate args map in kernel args metadata"); - return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; - } - - msgpack_errors += populate_kernelArgMD(args_element, &lcArg); - if (msgpack_errors != 0) { - printf("[%s:%d] %s failed\n", __FILE__, __LINE__, - "iterate args map in kernel args metadata"); - return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; - } - // v3 has offset field and not align field - size_t new_offset = lcArg.offset_; - size_t padding = new_offset - offset; - offset = new_offset; - DP("Arg[%lu] \"%s\" (%u, %u)\n", i, lcArg.name_.c_str(), lcArg.size_, - lcArg.offset_); - offset += lcArg.size_; - - // check if the arg is a hidden/implicit arg - // this logic assumes that all hidden args are 8-byte aligned - if (!isImplicit(lcArg.valueKind_)) { - info.explicit_argument_count++; - kernel_explicit_args_size += lcArg.size_; - } else { - info.implicit_argument_count++; - hasHiddenArgs = true; - } - kernel_explicit_args_size += padding; - } - } - - // TODO: Probably don't want this arithmetic - info.kernel_segment_size = - (hasHiddenArgs ? kernel_explicit_args_size : kernel_segment_size); - DP("[%s: kernarg seg size] (%lu --> %u)\n", kernelName.c_str(), - kernel_segment_size, info.kernel_segment_size); - - // kernel received, now add it to the kernel info table - KernelInfoTable[kernelName] = info; - } - - return HSA_STATUS_SUCCESS; -} - -static hsa_status_t -populate_InfoTables(hsa_executable_symbol_t symbol, - std::map &KernelInfoTable, - std::map &SymbolInfoTable) { - hsa_symbol_kind_t type; - - uint32_t name_length; - hsa_status_t err; - err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, - &type); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Symbol info extraction", get_error_string(err)); - return err; - } - DP("Exec Symbol type: %d\n", type); - if (type == HSA_SYMBOL_KIND_KERNEL) { - err = hsa_executable_symbol_get_info( - symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &name_length); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Symbol info extraction", get_error_string(err)); - return err; - } - char *name = reinterpret_cast(malloc(name_length + 1)); - err = hsa_executable_symbol_get_info(symbol, - HSA_EXECUTABLE_SYMBOL_INFO_NAME, name); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Symbol info extraction", get_error_string(err)); - return err; - } - // remove the suffix .kd from symbol name. - name[name_length - 3] = 0; - - atl_kernel_info_t info; - std::string kernelName(name); - // by now, the kernel info table should already have an entry - // because the non-ROCr custom code object parsing is called before - // iterating over the code object symbols using ROCr - if (KernelInfoTable.find(kernelName) == KernelInfoTable.end()) { - DP("amdgpu internal consistency error\n"); - return HSA_STATUS_ERROR; - } - // found, so assign and update - info = KernelInfoTable[kernelName]; - - /* Extract dispatch information from the symbol */ - err = hsa_executable_symbol_get_info( - symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, - &(info.kernel_object)); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Extracting the symbol from the executable", - get_error_string(err)); - return err; - } - err = hsa_executable_symbol_get_info( - symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, - &(info.group_segment_size)); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Extracting the group segment size from the executable", - get_error_string(err)); - return err; - } - err = hsa_executable_symbol_get_info( - symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, - &(info.private_segment_size)); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Extracting the private segment from the executable", - get_error_string(err)); - return err; - } - - DP("Kernel %s --> %lx symbol %u group segsize %u pvt segsize %u bytes " - "kernarg\n", - kernelName.c_str(), info.kernel_object, info.group_segment_size, - info.private_segment_size, info.kernel_segment_size); - - // assign it back to the kernel info table - KernelInfoTable[kernelName] = info; - free(name); - } else if (type == HSA_SYMBOL_KIND_VARIABLE) { - err = hsa_executable_symbol_get_info( - symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &name_length); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Symbol info extraction", get_error_string(err)); - return err; - } - char *name = reinterpret_cast(malloc(name_length + 1)); - err = hsa_executable_symbol_get_info(symbol, - HSA_EXECUTABLE_SYMBOL_INFO_NAME, name); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Symbol info extraction", get_error_string(err)); - return err; - } - name[name_length] = 0; - - atl_symbol_info_t info; - - err = hsa_executable_symbol_get_info( - symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &(info.addr)); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Symbol info address extraction", get_error_string(err)); - return err; - } - - err = hsa_executable_symbol_get_info( - symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &(info.size)); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Symbol info size extraction", get_error_string(err)); - return err; - } - - DP("Symbol %s = %p (%u bytes)\n", name, (void *)info.addr, info.size); - SymbolInfoTable[std::string(name)] = info; - free(name); - } else { - DP("Symbol is an indirect function\n"); - } - return HSA_STATUS_SUCCESS; -} - -hsa_status_t RegisterModuleFromMemory( - std::map &KernelInfoTable, - std::map &SymbolInfoTable, - void *module_bytes, size_t module_size, hsa_agent_t agent, - hsa_status_t (*on_deserialized_data)(void *data, size_t size, - void *cb_state), - void *cb_state, std::vector &HSAExecutables) { - hsa_status_t err; - hsa_executable_t executable = {0}; - hsa_profile_t agent_profile; - - err = hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_profile); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Query the agent profile", get_error_string(err)); - return HSA_STATUS_ERROR; - } - // FIXME: Assume that every profile is FULL until we understand how to build - // GCN with base profile - agent_profile = HSA_PROFILE_FULL; - /* Create the empty executable. */ - err = hsa_executable_create(agent_profile, HSA_EXECUTABLE_STATE_UNFROZEN, "", - &executable); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Create the executable", get_error_string(err)); - return HSA_STATUS_ERROR; - } - - bool module_load_success = false; - do // Existing control flow used continue, preserve that for this patch - { - { - // Some metadata info is not available through ROCr API, so use custom - // code object metadata parsing to collect such metadata info - - err = get_code_object_custom_metadata(module_bytes, module_size, - KernelInfoTable); - if (err != HSA_STATUS_SUCCESS) { - DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Getting custom code object metadata", get_error_string(err)); - continue; - } - - // Deserialize code object. - hsa_code_object_t code_object = {0}; - err = hsa_code_object_deserialize(module_bytes, module_size, NULL, - &code_object); - if (err != HSA_STATUS_SUCCESS) { - DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Code Object Deserialization", get_error_string(err)); - continue; - } - assert(0 != code_object.handle); - - // Mutating the device image here avoids another allocation & memcpy - void *code_object_alloc_data = - reinterpret_cast(code_object.handle); - hsa_status_t impl_err = - on_deserialized_data(code_object_alloc_data, module_size, cb_state); - if (impl_err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Error in deserialized_data callback", - get_error_string(impl_err)); - return impl_err; - } - - /* Load the code object. */ - err = - hsa_executable_load_code_object(executable, agent, code_object, NULL); - if (err != HSA_STATUS_SUCCESS) { - DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Loading the code object", get_error_string(err)); - continue; - } - - // cannot iterate over symbols until executable is frozen - } - module_load_success = true; - } while (0); - DP("Modules loaded successful? %d\n", module_load_success); - if (module_load_success) { - /* Freeze the executable; it can now be queried for symbols. */ - err = hsa_executable_freeze(executable, ""); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Freeze the executable", get_error_string(err)); - return HSA_STATUS_ERROR; - } - - err = hsa::executable_iterate_symbols( - executable, - [&](hsa_executable_t, hsa_executable_symbol_t symbol) -> hsa_status_t { - return populate_InfoTables(symbol, KernelInfoTable, SymbolInfoTable); - }); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Iterating over symbols for execuatable", get_error_string(err)); - return HSA_STATUS_ERROR; - } - - // save the executable and destroy during finalize - HSAExecutables.push_back(executable); - return HSA_STATUS_SUCCESS; - } else { - return HSA_STATUS_ERROR; - } -} - -} // namespace core diff --git a/openmp/libomptarget/plugins/amdgpu/src/print_tracing.h b/openmp/libomptarget/plugins/amdgpu/src/print_tracing.h deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/src/print_tracing.h +++ /dev/null @@ -1,20 +0,0 @@ -//===--- amdgpu/src/print_tracing.h ------------------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#ifndef LIBOMPTARGET_PLUGINS_AMGGPU_SRC_PRINT_TRACING_H_INCLUDED -#define LIBOMPTARGET_PLUGINS_AMGGPU_SRC_PRINT_TRACING_H_INCLUDED - -enum PrintTraceControlBits { - LAUNCH = 1, // print a message to stderr for each kernel launch - RTL_TIMING = 2, // Print timing info around each RTL step - STARTUP_DETAILS = 4, // Details around loading up kernel - RTL_TO_STDOUT = 8 // Redirect RTL tracing to stdout -}; - -extern int print_kernel_trace; // set by environment variable - -#endif diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ /dev/null @@ -1,2615 +0,0 @@ -//===--- amdgpu/src/rtl.cpp --------------------------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// RTL for AMD hsa machine -// -//===----------------------------------------------------------------------===// - -#include "llvm/ADT/StringRef.h" -#include "llvm/Frontend/OpenMP/OMPConstants.h" -#include "llvm/Frontend/OpenMP/OMPGridValues.h" -#include "llvm/Object/ELF.h" -#include "llvm/Object/ELFObjectFile.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "ELFSymbols.h" -#include "impl_runtime.h" -#include "interop_hsa.h" - -#include "UtilitiesRTL.h" -#include "internal.h" -#include "rt.h" - -#include "DeviceEnvironment.h" -#include "get_elf_mach_gfx_name.h" -#include "omptargetplugin.h" -#include "print_tracing.h" - -using namespace llvm; -using namespace llvm::object; -using namespace llvm::ELF; -using namespace llvm::omp::target::plugin::utils; - -// hostrpc interface, FIXME: consider moving to its own include these are -// statically linked into amdgpu/plugin if present from hostrpc_services.a, -// linked as --whole-archive to override the weak symbols that are used to -// implement a fallback for toolchains that do not yet have a hostrpc library. -extern "C" { -uint64_t hostrpc_assign_buffer(hsa_agent_t Agent, hsa_queue_t *ThisQ, - uint32_t DeviceId); -hsa_status_t hostrpc_init(); -hsa_status_t hostrpc_terminate(); - -__attribute__((weak)) hsa_status_t hostrpc_init() { return HSA_STATUS_SUCCESS; } -__attribute__((weak)) hsa_status_t hostrpc_terminate() { - return HSA_STATUS_SUCCESS; -} -__attribute__((weak)) uint64_t hostrpc_assign_buffer(hsa_agent_t, hsa_queue_t *, - uint32_t DeviceId) { - DP("Warning: Attempting to assign hostrpc to device %u, but hostrpc library " - "missing\n", - DeviceId); - return 0; -} -} - -// Heuristic parameters used for kernel launch -// Number of teams per CU to allow scheduling flexibility -static const unsigned DefaultTeamsPerCU = 4; - -int print_kernel_trace; - -#ifdef OMPTARGET_DEBUG -#define check(msg, status) \ - if (status != HSA_STATUS_SUCCESS) { \ - DP(#msg " failed\n"); \ - } else { \ - DP(#msg " succeeded\n"); \ - } -#else -#define check(msg, status) \ - {} -#endif - -#include "elf_common.h" - -namespace hsa { -template hsa_status_t iterate_agents(C Cb) { - auto L = [](hsa_agent_t Agent, void *Data) -> hsa_status_t { - C *Unwrapped = static_cast(Data); - return (*Unwrapped)(Agent); - }; - return hsa_iterate_agents(L, static_cast(&Cb)); -} - -template -hsa_status_t amd_agent_iterate_memory_pools(hsa_agent_t Agent, C Cb) { - auto L = [](hsa_amd_memory_pool_t MemoryPool, void *Data) -> hsa_status_t { - C *Unwrapped = static_cast(Data); - return (*Unwrapped)(MemoryPool); - }; - - return hsa_amd_agent_iterate_memory_pools(Agent, L, static_cast(&Cb)); -} - -} // namespace hsa - -/// Keep entries table per device -struct FuncOrGblEntryTy { - __tgt_target_table Table; - std::vector<__tgt_offload_entry> Entries; -}; - -struct KernelArgPool { -private: - static pthread_mutex_t Mutex; - -public: - uint32_t KernargSegmentSize; - void *KernargRegion = nullptr; - std::queue FreeKernargSegments; - - uint32_t kernargSizeIncludingImplicit() { - return KernargSegmentSize + sizeof(AMDGPUImplicitArgsTy); - } - - ~KernelArgPool() { - if (KernargRegion) { - auto R = hsa_amd_memory_pool_free(KernargRegion); - if (R != HSA_STATUS_SUCCESS) { - DP("hsa_amd_memory_pool_free failed: %s\n", get_error_string(R)); - } - } - } - - // Can't really copy or move a mutex - KernelArgPool() = default; - KernelArgPool(const KernelArgPool &) = delete; - KernelArgPool(KernelArgPool &&) = delete; - - KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool) - : KernargSegmentSize(KernargSegmentSize) { - - // impl uses one pool per kernel for all gpus, with a fixed upper size - // preserving that exact scheme here, including the queue - - hsa_status_t Err = hsa_amd_memory_pool_allocate( - MemoryPool, kernargSizeIncludingImplicit() * MAX_NUM_KERNELS, 0, - &KernargRegion); - - if (Err != HSA_STATUS_SUCCESS) { - DP("hsa_amd_memory_pool_allocate failed: %s\n", get_error_string(Err)); - KernargRegion = nullptr; // paranoid - return; - } - - Err = core::allow_access_to_all_gpu_agents(KernargRegion); - if (Err != HSA_STATUS_SUCCESS) { - DP("hsa allow_access_to_all_gpu_agents failed: %s\n", - get_error_string(Err)); - auto R = hsa_amd_memory_pool_free(KernargRegion); - if (R != HSA_STATUS_SUCCESS) { - // if free failed, can't do anything more to resolve it - DP("hsa memory poll free failed: %s\n", get_error_string(Err)); - } - KernargRegion = nullptr; - return; - } - - for (int I = 0; I < MAX_NUM_KERNELS; I++) { - FreeKernargSegments.push(I); - } - } - - void *allocate(uint64_t ArgNum) { - assert((ArgNum * sizeof(void *)) == KernargSegmentSize); - Lock L(&Mutex); - void *Res = nullptr; - if (!FreeKernargSegments.empty()) { - - int FreeIdx = FreeKernargSegments.front(); - Res = static_cast(static_cast(KernargRegion) + - (FreeIdx * kernargSizeIncludingImplicit())); - assert(FreeIdx == pointerToIndex(Res)); - FreeKernargSegments.pop(); - } - return Res; - } - - void deallocate(void *Ptr) { - Lock L(&Mutex); - int Idx = pointerToIndex(Ptr); - FreeKernargSegments.push(Idx); - } - -private: - int pointerToIndex(void *Ptr) { - ptrdiff_t Bytes = - static_cast(Ptr) - static_cast(KernargRegion); - assert(Bytes >= 0); - assert(Bytes % kernargSizeIncludingImplicit() == 0); - return Bytes / kernargSizeIncludingImplicit(); - } - struct Lock { - Lock(pthread_mutex_t *M) : M(M) { pthread_mutex_lock(M); } - ~Lock() { pthread_mutex_unlock(M); } - pthread_mutex_t *M; - }; -}; -pthread_mutex_t KernelArgPool::Mutex = PTHREAD_MUTEX_INITIALIZER; - -std::unordered_map> - KernelArgPoolMap; - -/// Use a single entity to encode a kernel and a set of flags -struct KernelTy { - llvm::omp::OMPTgtExecModeFlags ExecutionMode; - int16_t ConstWGSize; - int32_t DeviceId; - void *CallStackAddr = nullptr; - const char *Name; - - KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize, - int32_t DeviceId, void *CallStackAddr, const char *Name, - uint32_t KernargSegmentSize, - hsa_amd_memory_pool_t &KernArgMemoryPool) - : ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize), - DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) { - DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode); - - std::string N(Name); - if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) { - KernelArgPoolMap.insert( - std::make_pair(N, std::unique_ptr(new KernelArgPool( - KernargSegmentSize, KernArgMemoryPool)))); - } - } -}; - -/// List that contains all the kernels. -/// FIXME: we may need this to be per device and per library. -std::list KernelsList; - -template static hsa_status_t findAgents(Callback CB) { - - hsa_status_t Err = - hsa::iterate_agents([&](hsa_agent_t Agent) -> hsa_status_t { - hsa_device_type_t DeviceType; - // get_info fails iff HSA runtime not yet initialized - hsa_status_t Err = - hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType); - - if (Err != HSA_STATUS_SUCCESS) { - if (print_kernel_trace > 0) - DP("rtl.cpp: err %s\n", get_error_string(Err)); - - return Err; - } - - CB(DeviceType, Agent); - return HSA_STATUS_SUCCESS; - }); - - // iterate_agents fails iff HSA runtime not yet initialized - if (print_kernel_trace > 0 && Err != HSA_STATUS_SUCCESS) { - DP("rtl.cpp: err %s\n", get_error_string(Err)); - } - - return Err; -} - -static void callbackQueue(hsa_status_t Status, hsa_queue_t *Source, - void *Data) { - if (Status != HSA_STATUS_SUCCESS) { - const char *StatusString; - if (hsa_status_string(Status, &StatusString) != HSA_STATUS_SUCCESS) { - StatusString = "unavailable"; - } - DP("[%s:%d] GPU error in queue %p %d (%s)\n", __FILE__, __LINE__, Source, - Status, StatusString); - abort(); - } -} - -namespace core { -namespace { - -bool checkResult(hsa_status_t Err, const char *ErrMsg) { - if (Err == HSA_STATUS_SUCCESS) - return true; - - REPORT("%s", ErrMsg); - REPORT("%s", get_error_string(Err)); - return false; -} - -void packetStoreRelease(uint32_t *Packet, uint16_t Header, uint16_t Rest) { - __atomic_store_n(Packet, Header | (Rest << 16), __ATOMIC_RELEASE); -} - -uint16_t createHeader() { - uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; - Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; - Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; - return Header; -} - -hsa_status_t isValidMemoryPool(hsa_amd_memory_pool_t MemoryPool) { - bool AllocAllowed = false; - hsa_status_t Err = hsa_amd_memory_pool_get_info( - MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, - &AllocAllowed); - if (Err != HSA_STATUS_SUCCESS) { - DP("Alloc allowed in memory pool check failed: %s\n", - get_error_string(Err)); - return Err; - } - - size_t Size = 0; - Err = hsa_amd_memory_pool_get_info(MemoryPool, HSA_AMD_MEMORY_POOL_INFO_SIZE, - &Size); - if (Err != HSA_STATUS_SUCCESS) { - DP("Get memory pool size failed: %s\n", get_error_string(Err)); - return Err; - } - - return (AllocAllowed && Size > 0) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; -} - -hsa_status_t addMemoryPool(hsa_amd_memory_pool_t MemoryPool, void *Data) { - std::vector *Result = - static_cast *>(Data); - - hsa_status_t Err; - if ((Err = isValidMemoryPool(MemoryPool)) != HSA_STATUS_SUCCESS) { - return Err; - } - - Result->push_back(MemoryPool); - return HSA_STATUS_SUCCESS; -} - -} // namespace -} // namespace core - -struct EnvironmentVariables { - int NumTeams; - int TeamLimit; - int TeamThreadLimit; - int MaxTeamsDefault; - int DynamicMemSize; -}; - -template -static constexpr const llvm::omp::GV &getGridValue() { - return llvm::omp::getAMDGPUGridValues(); -} - -struct HSALifetime { - // Wrapper around HSA used to ensure it is constructed before other types - // and destructed after, which means said other types can use raii for - // cleanup without risking running outside of the lifetime of HSA - const hsa_status_t S; - - bool HSAInitSuccess() { return S == HSA_STATUS_SUCCESS; } - HSALifetime() : S(hsa_init()) {} - - ~HSALifetime() { - if (S == HSA_STATUS_SUCCESS) { - hsa_status_t Err = hsa_shut_down(); - if (Err != HSA_STATUS_SUCCESS) { - // Can't call into HSA to get a string from the integer - DP("Shutting down HSA failed: %d\n", Err); - } - } - } -}; - -// Handle scheduling of multiple hsa_queue's per device to -// multiple threads (one scheduler per device) -class HSAQueueScheduler { -public: - HSAQueueScheduler() : Current(0) {} - - HSAQueueScheduler(const HSAQueueScheduler &) = delete; - - HSAQueueScheduler(HSAQueueScheduler &&Q) { - Current = Q.Current.load(); - for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) { - HSAQueues[I] = Q.HSAQueues[I]; - Q.HSAQueues[I] = nullptr; - } - } - - // \return false if any HSA queue creation fails - bool createQueues(hsa_agent_t HSAAgent, uint32_t QueueSize) { - for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) { - hsa_queue_t *Q = nullptr; - hsa_status_t Rc = - hsa_queue_create(HSAAgent, QueueSize, HSA_QUEUE_TYPE_MULTI, - callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &Q); - if (Rc != HSA_STATUS_SUCCESS) { - DP("Failed to create HSA queue %d\n", I); - return false; - } - HSAQueues[I] = Q; - } - return true; - } - - ~HSAQueueScheduler() { - for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) { - if (HSAQueues[I]) { - hsa_status_t Err = hsa_queue_destroy(HSAQueues[I]); - if (Err != HSA_STATUS_SUCCESS) - DP("Error destroying HSA queue"); - } - } - } - - // \return next queue to use for device - hsa_queue_t *next() { - return HSAQueues[(Current.fetch_add(1, std::memory_order_relaxed)) % - NUM_QUEUES_PER_DEVICE]; - } - -private: - // Number of queues per device - enum : uint8_t { NUM_QUEUES_PER_DEVICE = 4 }; - hsa_queue_t *HSAQueues[NUM_QUEUES_PER_DEVICE] = {}; - std::atomic Current; -}; - -/// Class containing all the device information -class RTLDeviceInfoTy : HSALifetime { - std::vector> FuncGblEntries; - - struct QueueDeleter { - void operator()(hsa_queue_t *Q) { - if (Q) { - hsa_status_t Err = hsa_queue_destroy(Q); - if (Err != HSA_STATUS_SUCCESS) { - DP("Error destroying hsa queue: %s\n", get_error_string(Err)); - } - } - } - }; - -public: - bool ConstructionSucceeded = false; - - // load binary populates symbol tables and mutates various global state - // run uses those symbol tables - std::shared_timed_mutex LoadRunLock; - - int NumberOfDevices = 0; - - // GPU devices - std::vector HSAAgents; - std::vector HSAQueueSchedulers; // one per gpu - - // CPUs - std::vector CPUAgents; - - // Device properties - std::vector ComputeUnits; - std::vector GroupsPerDevice; - std::vector ThreadsPerGroup; - std::vector WarpSize; - std::vector GPUName; - std::vector TargetID; - - // OpenMP properties - std::vector NumTeams; - std::vector NumThreads; - - // OpenMP Environment properties - EnvironmentVariables Env; - - // OpenMP Requires Flags - int64_t RequiresFlags; - - // Resource pools - SignalPoolT FreeSignalPool; - - bool HostcallRequired = false; - - std::vector HSAExecutables; - - std::vector> KernelInfoTable; - std::vector> SymbolInfoTable; - - hsa_amd_memory_pool_t KernArgPool; - - // fine grained memory pool for host allocations - hsa_amd_memory_pool_t HostFineGrainedMemoryPool; - - // fine and coarse-grained memory pools per offloading device - std::vector DeviceFineGrainedMemoryPools; - std::vector DeviceCoarseGrainedMemoryPools; - - struct ImplFreePtrDeletor { - void operator()(void *P) { - core::Runtime::Memfree(P); // ignore failure to free - } - }; - - // device_State shared across loaded binaries, error if inconsistent size - std::vector, uint64_t>> - DeviceStateStore; - - static const unsigned HardTeamLimit = - (1 << 16) - 1; // 64K needed to fit in uint16 - static const int DefaultNumTeams = 128; - - // These need to be per-device since different devices can have different - // wave sizes, but are currently the same number for each so that refactor - // can be postponed. - static_assert(getGridValue<32>().GV_Max_Teams == - getGridValue<64>().GV_Max_Teams, - ""); - static const int MaxTeams = getGridValue<64>().GV_Max_Teams; - - static_assert(getGridValue<32>().GV_Max_WG_Size == - getGridValue<64>().GV_Max_WG_Size, - ""); - static const int MaxWgSize = getGridValue<64>().GV_Max_WG_Size; - - static_assert(getGridValue<32>().GV_Default_WG_Size == - getGridValue<64>().GV_Default_WG_Size, - ""); - static const int DefaultWgSize = getGridValue<64>().GV_Default_WG_Size; - - using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, void *, size_t Size, - hsa_agent_t, hsa_amd_memory_pool_t); - hsa_status_t freesignalpoolMemcpy(void *Dest, void *Src, size_t Size, - MemcpyFunc Func, int32_t DeviceId) { - hsa_agent_t Agent = HSAAgents[DeviceId]; - hsa_signal_t S = FreeSignalPool.pop(); - if (S.handle == 0) { - return HSA_STATUS_ERROR; - } - hsa_status_t R = Func(S, Dest, Src, Size, Agent, HostFineGrainedMemoryPool); - FreeSignalPool.push(S); - return R; - } - - hsa_status_t freesignalpoolMemcpyD2H(void *Dest, void *Src, size_t Size, - int32_t DeviceId) { - return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_d2h, DeviceId); - } - - hsa_status_t freesignalpoolMemcpyH2D(void *Dest, void *Src, size_t Size, - int32_t DeviceId) { - return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_h2d, DeviceId); - } - - static void printDeviceInfo(int32_t DeviceId, hsa_agent_t Agent) { - char TmpChar[1000]; - uint16_t Major, Minor; - uint32_t TmpUInt; - uint32_t TmpUInt2; - uint32_t CacheSize[4]; - bool TmpBool; - uint16_t WorkgroupMaxDim[3]; - hsa_dim3_t GridMaxDim; - - // Getting basic information about HSA and Device - core::checkResult( - hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major), - "Error from hsa_system_get_info when obtaining " - "HSA_SYSTEM_INFO_VERSION_MAJOR\n"); - core::checkResult( - hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor), - "Error from hsa_system_get_info when obtaining " - "HSA_SYSTEM_INFO_VERSION_MINOR\n"); - printf(" HSA Runtime Version: \t\t%u.%u \n", Major, Minor); - printf(" HSA OpenMP Device Number: \t\t%d \n", DeviceId); - core::checkResult( - hsa_agent_get_info( - Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AMD_AGENT_INFO_PRODUCT_NAME\n"); - printf(" Product Name: \t\t\t%s \n", TmpChar); - core::checkResult(hsa_agent_get_info(Agent, HSA_AGENT_INFO_NAME, TmpChar), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_NAME\n"); - printf(" Device Name: \t\t\t%s \n", TmpChar); - core::checkResult( - hsa_agent_get_info(Agent, HSA_AGENT_INFO_VENDOR_NAME, TmpChar), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_NAME\n"); - printf(" Vendor Name: \t\t\t%s \n", TmpChar); - hsa_device_type_t DevType; - core::checkResult( - hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DevType), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_DEVICE\n"); - printf(" Device Type: \t\t\t%s \n", - DevType == HSA_DEVICE_TYPE_CPU - ? "CPU" - : (DevType == HSA_DEVICE_TYPE_GPU - ? "GPU" - : (DevType == HSA_DEVICE_TYPE_DSP ? "DSP" : "UNKNOWN"))); - core::checkResult( - hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUES_MAX, &TmpUInt), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_QUEUES_MAX\n"); - printf(" Max Queues: \t\t\t%u \n", TmpUInt); - core::checkResult( - hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &TmpUInt), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_QUEUE_MIN_SIZE\n"); - printf(" Queue Min Size: \t\t\t%u \n", TmpUInt); - core::checkResult( - hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &TmpUInt), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_QUEUE_MAX_SIZE\n"); - printf(" Queue Max Size: \t\t\t%u \n", TmpUInt); - - // Getting cache information - printf(" Cache:\n"); - - // FIXME: This is deprecated according to HSA documentation. But using - // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during - // runtime. - core::checkResult( - hsa_agent_get_info(Agent, HSA_AGENT_INFO_CACHE_SIZE, CacheSize), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_CACHE_SIZE\n"); - - for (int I = 0; I < 4; I++) { - if (CacheSize[I]) { - printf(" L%u: \t\t\t\t%u bytes\n", I, CacheSize[I]); - } - } - - core::checkResult( - hsa_agent_get_info(Agent, - (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CACHELINE_SIZE, - &TmpUInt), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AMD_AGENT_INFO_CACHELINE_SIZE\n"); - printf(" Cacheline Size: \t\t\t%u \n", TmpUInt); - core::checkResult( - hsa_agent_get_info( - Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, - &TmpUInt), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY\n"); - printf(" Max Clock Freq(MHz): \t\t%u \n", TmpUInt); - core::checkResult( - hsa_agent_get_info( - Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, - &TmpUInt), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT\n"); - printf(" Compute Units: \t\t\t%u \n", TmpUInt); - core::checkResult(hsa_agent_get_info( - Agent, - (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, - &TmpUInt), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n"); - printf(" SIMD per CU: \t\t\t%u \n", TmpUInt); - core::checkResult( - hsa_agent_get_info(Agent, HSA_AGENT_INFO_FAST_F16_OPERATION, &TmpBool), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n"); - printf(" Fast F16 Operation: \t\t%s \n", (TmpBool ? "TRUE" : "FALSE")); - core::checkResult( - hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &TmpUInt2), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_WAVEFRONT_SIZE\n"); - printf(" Wavefront Size: \t\t\t%u \n", TmpUInt2); - core::checkResult( - hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &TmpUInt), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_WORKGROUP_MAX_SIZE\n"); - printf(" Workgroup Max Size: \t\t%u \n", TmpUInt); - core::checkResult(hsa_agent_get_info(Agent, - HSA_AGENT_INFO_WORKGROUP_MAX_DIM, - WorkgroupMaxDim), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_WORKGROUP_MAX_DIM\n"); - printf(" Workgroup Max Size per Dimension:\n"); - printf(" x: \t\t\t\t%u\n", WorkgroupMaxDim[0]); - printf(" y: \t\t\t\t%u\n", WorkgroupMaxDim[1]); - printf(" z: \t\t\t\t%u\n", WorkgroupMaxDim[2]); - core::checkResult(hsa_agent_get_info( - Agent, - (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, - &TmpUInt), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU\n"); - printf(" Max Waves Per CU: \t\t\t%u \n", TmpUInt); - printf(" Max Work-item Per CU: \t\t%u \n", TmpUInt * TmpUInt2); - core::checkResult( - hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_SIZE, &TmpUInt), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_GRID_MAX_SIZE\n"); - printf(" Grid Max Size: \t\t\t%u \n", TmpUInt); - core::checkResult( - hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_GRID_MAX_DIM\n"); - printf(" Grid Max Size per Dimension: \t\t\n"); - printf(" x: \t\t\t\t%u\n", GridMaxDim.x); - printf(" y: \t\t\t\t%u\n", GridMaxDim.y); - printf(" z: \t\t\t\t%u\n", GridMaxDim.z); - core::checkResult( - hsa_agent_get_info(Agent, HSA_AGENT_INFO_FBARRIER_MAX_SIZE, &TmpUInt), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_FBARRIER_MAX_SIZE\n"); - printf(" Max fbarriers/Workgrp: \t\t%u\n", TmpUInt); - - printf(" Memory Pools:\n"); - auto CbMem = [](hsa_amd_memory_pool_t Region, void *Data) -> hsa_status_t { - std::string TmpStr; - size_t Size; - bool Alloc, Access; - hsa_amd_segment_t Segment; - hsa_amd_memory_pool_global_flag_t GlobalFlags; - core::checkResult( - hsa_amd_memory_pool_get_info( - Region, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags), - "Error returned from hsa_amd_memory_pool_get_info when obtaining " - "HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS\n"); - core::checkResult(hsa_amd_memory_pool_get_info( - Region, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &Segment), - "Error returned from hsa_amd_memory_pool_get_info when " - "obtaining HSA_AMD_MEMORY_POOL_INFO_SEGMENT\n"); - - switch (Segment) { - case HSA_AMD_SEGMENT_GLOBAL: - TmpStr = "GLOBAL; FLAGS: "; - if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & GlobalFlags) - TmpStr += "KERNARG, "; - if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & GlobalFlags) - TmpStr += "FINE GRAINED, "; - if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED & GlobalFlags) - TmpStr += "COARSE GRAINED, "; - break; - case HSA_AMD_SEGMENT_READONLY: - TmpStr = "READONLY"; - break; - case HSA_AMD_SEGMENT_PRIVATE: - TmpStr = "PRIVATE"; - break; - case HSA_AMD_SEGMENT_GROUP: - TmpStr = "GROUP"; - break; - } - printf(" Pool %s: \n", TmpStr.c_str()); - - core::checkResult(hsa_amd_memory_pool_get_info( - Region, HSA_AMD_MEMORY_POOL_INFO_SIZE, &Size), - "Error returned from hsa_amd_memory_pool_get_info when " - "obtaining HSA_AMD_MEMORY_POOL_INFO_SIZE\n"); - printf(" Size: \t\t\t\t %zu bytes\n", Size); - core::checkResult( - hsa_amd_memory_pool_get_info( - Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &Alloc), - "Error returned from hsa_amd_memory_pool_get_info when obtaining " - "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED\n"); - printf(" Allocatable: \t\t\t %s\n", (Alloc ? "TRUE" : "FALSE")); - core::checkResult( - hsa_amd_memory_pool_get_info( - Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &Size), - "Error returned from hsa_amd_memory_pool_get_info when obtaining " - "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE\n"); - printf(" Runtime Alloc Granule: \t\t %zu bytes\n", Size); - core::checkResult( - hsa_amd_memory_pool_get_info( - Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, &Size), - "Error returned from hsa_amd_memory_pool_get_info when obtaining " - "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT\n"); - printf(" Runtime Alloc alignment: \t %zu bytes\n", Size); - core::checkResult( - hsa_amd_memory_pool_get_info( - Region, HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, &Access), - "Error returned from hsa_amd_memory_pool_get_info when obtaining " - "HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL\n"); - printf(" Accessable by all: \t\t %s\n", - (Access ? "TRUE" : "FALSE")); - - return HSA_STATUS_SUCCESS; - }; - // Iterate over all the memory regions for this agent. Get the memory region - // type and size - hsa_amd_agent_iterate_memory_pools(Agent, CbMem, nullptr); - - printf(" ISAs:\n"); - auto CBIsas = [](hsa_isa_t Isa, void *Data) -> hsa_status_t { - char TmpChar[1000]; - core::checkResult(hsa_isa_get_info_alt(Isa, HSA_ISA_INFO_NAME, TmpChar), - "Error returned from hsa_isa_get_info_alt when " - "obtaining HSA_ISA_INFO_NAME\n"); - printf(" Name: \t\t\t\t %s\n", TmpChar); - - return HSA_STATUS_SUCCESS; - }; - // Iterate over all the memory regions for this agent. Get the memory region - // type and size - hsa_agent_iterate_isas(Agent, CBIsas, nullptr); - } - - // Record entry point associated with device - void addOffloadEntry(int32_t DeviceId, __tgt_offload_entry Entry) { - assert(DeviceId < (int32_t)FuncGblEntries.size() && - "Unexpected device id!"); - FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); - - E.Entries.push_back(Entry); - } - - // Return true if the entry is associated with device - bool findOffloadEntry(int32_t DeviceId, void *Addr) { - assert(DeviceId < (int32_t)FuncGblEntries.size() && - "Unexpected device id!"); - FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); - - for (auto &It : E.Entries) { - if (It.addr == Addr) - return true; - } - - return false; - } - - // Return the pointer to the target entries table - __tgt_target_table *getOffloadEntriesTable(int32_t DeviceId) { - assert(DeviceId < (int32_t)FuncGblEntries.size() && - "Unexpected device id!"); - FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); - - int32_t Size = E.Entries.size(); - - // Table is empty - if (!Size) - return 0; - - __tgt_offload_entry *Begin = &E.Entries[0]; - __tgt_offload_entry *End = &E.Entries[Size - 1]; - - // Update table info according to the entries and return the pointer - E.Table.EntriesBegin = Begin; - E.Table.EntriesEnd = ++End; - - return &E.Table; - } - - // Clear entries table for a device - void clearOffloadEntriesTable(int DeviceId) { - assert(DeviceId < (int32_t)FuncGblEntries.size() && - "Unexpected device id!"); - FuncGblEntries[DeviceId].emplace_back(); - FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); - // KernelArgPoolMap.clear(); - E.Entries.clear(); - E.Table.EntriesBegin = E.Table.EntriesEnd = 0; - } - - hsa_status_t addDeviceMemoryPool(hsa_amd_memory_pool_t MemoryPool, - unsigned int DeviceId) { - assert(DeviceId < DeviceFineGrainedMemoryPools.size() && "Error here."); - uint32_t GlobalFlags = 0; - hsa_status_t Err = hsa_amd_memory_pool_get_info( - MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags); - - if (Err != HSA_STATUS_SUCCESS) { - return Err; - } - - if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) { - DeviceFineGrainedMemoryPools[DeviceId] = MemoryPool; - } else if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) { - DeviceCoarseGrainedMemoryPools[DeviceId] = MemoryPool; - } - - return HSA_STATUS_SUCCESS; - } - - hsa_status_t setupDevicePools(const std::vector &Agents) { - for (unsigned int DeviceId = 0; DeviceId < Agents.size(); DeviceId++) { - hsa_status_t Err = hsa::amd_agent_iterate_memory_pools( - Agents[DeviceId], [&](hsa_amd_memory_pool_t MemoryPool) { - hsa_status_t ValidStatus = core::isValidMemoryPool(MemoryPool); - if (ValidStatus != HSA_STATUS_SUCCESS) { - DP("Alloc allowed in memory pool check failed: %s\n", - get_error_string(ValidStatus)); - return HSA_STATUS_SUCCESS; - } - return addDeviceMemoryPool(MemoryPool, DeviceId); - }); - - if (Err != HSA_STATUS_SUCCESS) { - DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Iterate all memory pools", get_error_string(Err)); - return Err; - } - } - return HSA_STATUS_SUCCESS; - } - - hsa_status_t setupHostMemoryPools(std::vector &Agents) { - std::vector HostPools; - - // collect all the "valid" pools for all the given agents. - for (const auto &Agent : Agents) { - hsa_status_t Err = hsa_amd_agent_iterate_memory_pools( - Agent, core::addMemoryPool, static_cast(&HostPools)); - if (Err != HSA_STATUS_SUCCESS) { - DP("addMemoryPool returned %s, continuing\n", get_error_string(Err)); - } - } - - // We need two fine-grained pools. - // 1. One with kernarg flag set for storing kernel arguments - // 2. Second for host allocations - bool FineGrainedMemoryPoolSet = false; - bool KernArgPoolSet = false; - for (const auto &MemoryPool : HostPools) { - hsa_status_t Err = HSA_STATUS_SUCCESS; - uint32_t GlobalFlags = 0; - Err = hsa_amd_memory_pool_get_info( - MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags); - if (Err != HSA_STATUS_SUCCESS) { - DP("Get memory pool info failed: %s\n", get_error_string(Err)); - return Err; - } - - if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) { - if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) { - KernArgPool = MemoryPool; - KernArgPoolSet = true; - } else { - HostFineGrainedMemoryPool = MemoryPool; - FineGrainedMemoryPoolSet = true; - } - } - } - - if (FineGrainedMemoryPoolSet && KernArgPoolSet) - return HSA_STATUS_SUCCESS; - - return HSA_STATUS_ERROR; - } - - hsa_amd_memory_pool_t getDeviceMemoryPool(unsigned int DeviceId) { - assert(DeviceId >= 0 && DeviceId < DeviceCoarseGrainedMemoryPools.size() && - "Invalid device Id"); - return DeviceCoarseGrainedMemoryPools[DeviceId]; - } - - hsa_amd_memory_pool_t getHostMemoryPool() { - return HostFineGrainedMemoryPool; - } - - static int readEnv(const char *Env, int Default = -1) { - const char *EnvStr = getenv(Env); - int Res = Default; - if (EnvStr) { - Res = std::stoi(EnvStr); - DP("Parsed %s=%d\n", Env, Res); - } - return Res; - } - - RTLDeviceInfoTy() { - DP("Start initializing " GETNAME(TARGET_NAME) "\n"); - - // LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr - // anytime. You do not need a debug library build. - // 0 => no tracing - // 1 => tracing dispatch only - // >1 => verbosity increase - - if (!HSAInitSuccess()) { - DP("Error when initializing HSA in " GETNAME(TARGET_NAME) "\n"); - return; - } - - if (char *EnvStr = getenv("LIBOMPTARGET_KERNEL_TRACE")) - print_kernel_trace = atoi(EnvStr); - else - print_kernel_trace = 0; - - hsa_status_t Err = core::atl_init_gpu_context(); - if (Err != HSA_STATUS_SUCCESS) { - DP("Error when initializing " GETNAME(TARGET_NAME) "\n"); - return; - } - - // Init hostcall soon after initializing hsa - hostrpc_init(); - - Err = findAgents([&](hsa_device_type_t DeviceType, hsa_agent_t Agent) { - if (DeviceType == HSA_DEVICE_TYPE_CPU) { - CPUAgents.push_back(Agent); - } else { - HSAAgents.push_back(Agent); - } - }); - if (Err != HSA_STATUS_SUCCESS) - return; - - NumberOfDevices = (int)HSAAgents.size(); - - if (NumberOfDevices == 0) { - DP("There are no devices supporting HSA.\n"); - return; - } - DP("There are %d devices supporting HSA.\n", NumberOfDevices); - - // Init the device info - HSAQueueSchedulers.reserve(NumberOfDevices); - FuncGblEntries.resize(NumberOfDevices); - ThreadsPerGroup.resize(NumberOfDevices); - ComputeUnits.resize(NumberOfDevices); - GPUName.resize(NumberOfDevices); - GroupsPerDevice.resize(NumberOfDevices); - WarpSize.resize(NumberOfDevices); - NumTeams.resize(NumberOfDevices); - NumThreads.resize(NumberOfDevices); - DeviceStateStore.resize(NumberOfDevices); - KernelInfoTable.resize(NumberOfDevices); - SymbolInfoTable.resize(NumberOfDevices); - DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices); - DeviceFineGrainedMemoryPools.resize(NumberOfDevices); - - Err = setupDevicePools(HSAAgents); - if (Err != HSA_STATUS_SUCCESS) { - DP("Setup for Device Memory Pools failed\n"); - return; - } - - Err = setupHostMemoryPools(CPUAgents); - if (Err != HSA_STATUS_SUCCESS) { - DP("Setup for Host Memory Pools failed\n"); - return; - } - - for (int I = 0; I < NumberOfDevices; I++) { - uint32_t QueueSize = 0; - { - hsa_status_t Err = hsa_agent_get_info( - HSAAgents[I], HSA_AGENT_INFO_QUEUE_MAX_SIZE, &QueueSize); - if (Err != HSA_STATUS_SUCCESS) { - DP("HSA query QUEUE_MAX_SIZE failed for agent %d\n", I); - return; - } - enum { MaxQueueSize = 4096 }; - if (QueueSize > MaxQueueSize) { - QueueSize = MaxQueueSize; - } - } - - { - HSAQueueScheduler QSched; - if (!QSched.createQueues(HSAAgents[I], QueueSize)) - return; - HSAQueueSchedulers.emplace_back(std::move(QSched)); - } - - DeviceStateStore[I] = {nullptr, 0}; - } - - for (int I = 0; I < NumberOfDevices; I++) { - ThreadsPerGroup[I] = RTLDeviceInfoTy::DefaultWgSize; - GroupsPerDevice[I] = RTLDeviceInfoTy::DefaultNumTeams; - ComputeUnits[I] = 1; - DP("Device %d: Initial groupsPerDevice %d & threadsPerGroup %d\n", I, - GroupsPerDevice[I], ThreadsPerGroup[I]); - } - - // Get environment variables regarding teams - Env.TeamLimit = readEnv("OMP_TEAM_LIMIT"); - Env.NumTeams = readEnv("OMP_NUM_TEAMS"); - Env.MaxTeamsDefault = readEnv("OMP_MAX_TEAMS_DEFAULT"); - Env.TeamThreadLimit = readEnv("OMP_TEAMS_THREAD_LIMIT"); - Env.DynamicMemSize = readEnv("LIBOMPTARGET_SHARED_MEMORY_SIZE", 0); - - // Default state. - RequiresFlags = OMP_REQ_UNDEFINED; - - ConstructionSucceeded = true; - } - - ~RTLDeviceInfoTy() { - DP("Finalizing the " GETNAME(TARGET_NAME) " DeviceInfo.\n"); - if (!HSAInitSuccess()) { - // Then none of these can have been set up and they can't be torn down - return; - } - // Run destructors on types that use HSA before - // impl_finalize removes access to it - DeviceStateStore.clear(); - KernelArgPoolMap.clear(); - // Terminate hostrpc before finalizing hsa - hostrpc_terminate(); - - hsa_status_t Err; - for (uint32_t I = 0; I < HSAExecutables.size(); I++) { - Err = hsa_executable_destroy(HSAExecutables[I]); - if (Err != HSA_STATUS_SUCCESS) { - DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Destroying executable", get_error_string(Err)); - } - } - } -}; - -pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER; - -// Putting accesses to DeviceInfo global behind a function call prior -// to changing to use init_plugin/deinit_plugin calls -static RTLDeviceInfoTy DeviceInfoState; -static RTLDeviceInfoTy &DeviceInfo() { return DeviceInfoState; } - -namespace { - -int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size, - __tgt_async_info *AsyncInfo) { - assert(AsyncInfo && "AsyncInfo is nullptr"); - assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); - // Return success if we are not copying back to host from target. - if (!HstPtr) - return OFFLOAD_SUCCESS; - hsa_status_t Err; - DP("Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size, - (long long unsigned)(Elf64_Addr)TgtPtr, - (long long unsigned)(Elf64_Addr)HstPtr); - - Err = DeviceInfo().freesignalpoolMemcpyD2H(HstPtr, TgtPtr, (size_t)Size, - DeviceId); - - if (Err != HSA_STATUS_SUCCESS) { - DP("Error when copying data from device to host. Pointers: " - "host = 0x%016lx, device = 0x%016lx, size = %lld\n", - (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size); - return OFFLOAD_FAIL; - } - DP("DONE Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size, - (long long unsigned)(Elf64_Addr)TgtPtr, - (long long unsigned)(Elf64_Addr)HstPtr); - return OFFLOAD_SUCCESS; -} - -int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size, - __tgt_async_info *AsyncInfo) { - assert(AsyncInfo && "AsyncInfo is nullptr"); - hsa_status_t Err; - assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); - // Return success if we are not doing host to target. - if (!HstPtr) - return OFFLOAD_SUCCESS; - - DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size, - (long long unsigned)(Elf64_Addr)HstPtr, - (long long unsigned)(Elf64_Addr)TgtPtr); - Err = DeviceInfo().freesignalpoolMemcpyH2D(TgtPtr, HstPtr, (size_t)Size, - DeviceId); - if (Err != HSA_STATUS_SUCCESS) { - DP("Error when copying data from host to device. Pointers: " - "host = 0x%016lx, device = 0x%016lx, size = %lld\n", - (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size); - return OFFLOAD_FAIL; - } - return OFFLOAD_SUCCESS; -} - -// Async. -// The implementation was written with cuda streams in mind. The semantics of -// that are to execute kernels on a queue in order of insertion. A synchronise -// call then makes writes visible between host and device. This means a series -// of N data_submit_async calls are expected to execute serially. HSA offers -// various options to run the data copies concurrently. This may require changes -// to libomptarget. - -// __tgt_async_info* contains a void * Queue. Queue = 0 is used to indicate that -// there are no outstanding kernels that need to be synchronized. Any async call -// may be passed a Queue==0, at which point the cuda implementation will set it -// to non-null (see getStream). The cuda streams are per-device. Upstream may -// change this interface to explicitly initialize the AsyncInfo_pointer, but -// until then hsa lazily initializes it as well. - -void initAsyncInfo(__tgt_async_info *AsyncInfo) { - // set non-null while using async calls, return to null to indicate completion - assert(AsyncInfo); - if (!AsyncInfo->Queue) { - AsyncInfo->Queue = reinterpret_cast(UINT64_MAX); - } -} -void finiAsyncInfo(__tgt_async_info *AsyncInfo) { - assert(AsyncInfo); - assert(AsyncInfo->Queue); - AsyncInfo->Queue = 0; -} - -// Determine launch values for kernel. -struct LaunchVals { - int WorkgroupSize; - int GridSize; -}; -LaunchVals getLaunchVals(int WarpSize, EnvironmentVariables Env, - int ConstWGSize, - llvm::omp::OMPTgtExecModeFlags ExecutionMode, - int NumTeams, int ThreadLimit, uint64_t LoopTripcount, - int DeviceNumTeams) { - - int ThreadsPerGroup = RTLDeviceInfoTy::DefaultWgSize; - int NumGroups = 0; - - int MaxTeams = Env.MaxTeamsDefault > 0 ? Env.MaxTeamsDefault : DeviceNumTeams; - if (MaxTeams > static_cast(RTLDeviceInfoTy::HardTeamLimit)) - MaxTeams = RTLDeviceInfoTy::HardTeamLimit; - - if (print_kernel_trace & STARTUP_DETAILS) { - DP("RTLDeviceInfoTy::Max_Teams: %d\n", RTLDeviceInfoTy::MaxTeams); - DP("Max_Teams: %d\n", MaxTeams); - DP("RTLDeviceInfoTy::Warp_Size: %d\n", WarpSize); - DP("RTLDeviceInfoTy::Max_WG_Size: %d\n", RTLDeviceInfoTy::MaxWgSize); - DP("RTLDeviceInfoTy::Default_WG_Size: %d\n", - RTLDeviceInfoTy::DefaultWgSize); - DP("thread_limit: %d\n", ThreadLimit); - DP("threadsPerGroup: %d\n", ThreadsPerGroup); - DP("ConstWGSize: %d\n", ConstWGSize); - } - // check for thread_limit() clause - if (ThreadLimit > 0) { - ThreadsPerGroup = ThreadLimit; - DP("Setting threads per block to requested %d\n", ThreadLimit); - // Add master warp for GENERIC - if (ExecutionMode == - llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) { - ThreadsPerGroup += WarpSize; - DP("Adding master wavefront: +%d threads\n", WarpSize); - } - if (ThreadsPerGroup > RTLDeviceInfoTy::MaxWgSize) { // limit to max - ThreadsPerGroup = RTLDeviceInfoTy::MaxWgSize; - DP("Setting threads per block to maximum %d\n", ThreadsPerGroup); - } - } - // check flat_max_work_group_size attr here - if (ThreadsPerGroup > ConstWGSize) { - ThreadsPerGroup = ConstWGSize; - DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n", - ThreadsPerGroup); - } - if (print_kernel_trace & STARTUP_DETAILS) - DP("threadsPerGroup: %d\n", ThreadsPerGroup); - DP("Preparing %d threads\n", ThreadsPerGroup); - - // Set default num_groups (teams) - if (Env.TeamLimit > 0) - NumGroups = (MaxTeams < Env.TeamLimit) ? MaxTeams : Env.TeamLimit; - else - NumGroups = MaxTeams; - DP("Set default num of groups %d\n", NumGroups); - - if (print_kernel_trace & STARTUP_DETAILS) { - DP("num_groups: %d\n", NumGroups); - DP("num_teams: %d\n", NumTeams); - } - - // Reduce num_groups if threadsPerGroup exceeds RTLDeviceInfoTy::Max_WG_Size - // This reduction is typical for default case (no thread_limit clause). - // or when user goes crazy with num_teams clause. - // FIXME: We cant distinguish between a constant or variable thread limit. - // So we only handle constant thread_limits. - if (ThreadsPerGroup > - RTLDeviceInfoTy::DefaultWgSize) // 256 < threadsPerGroup <= 1024 - // Should we round threadsPerGroup up to nearest WarpSize - // here? - NumGroups = (MaxTeams * RTLDeviceInfoTy::MaxWgSize) / ThreadsPerGroup; - - // check for num_teams() clause - if (NumTeams > 0) { - NumGroups = (NumTeams < NumGroups) ? NumTeams : NumGroups; - } - if (print_kernel_trace & STARTUP_DETAILS) { - DP("num_groups: %d\n", NumGroups); - DP("Env.NumTeams %d\n", Env.NumTeams); - DP("Env.TeamLimit %d\n", Env.TeamLimit); - } - - if (Env.NumTeams > 0) { - NumGroups = (Env.NumTeams < NumGroups) ? Env.NumTeams : NumGroups; - DP("Modifying teams based on Env.NumTeams %d\n", Env.NumTeams); - } else if (Env.TeamLimit > 0) { - NumGroups = (Env.TeamLimit < NumGroups) ? Env.TeamLimit : NumGroups; - DP("Modifying teams based on Env.TeamLimit%d\n", Env.TeamLimit); - } else { - if (NumTeams <= 0) { - if (LoopTripcount > 0) { - if (ExecutionMode == - llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD) { - // round up to the nearest integer - NumGroups = ((LoopTripcount - 1) / ThreadsPerGroup) + 1; - } else if (ExecutionMode == - llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) { - NumGroups = LoopTripcount; - } else /* OMP_TGT_EXEC_MODE_GENERIC_SPMD */ { - // This is a generic kernel that was transformed to use SPMD-mode - // execution but uses Generic-mode semantics for scheduling. - NumGroups = LoopTripcount; - } - DP("Using %d teams due to loop trip count %" PRIu64 " and number of " - "threads per block %d\n", - NumGroups, LoopTripcount, ThreadsPerGroup); - } - } else { - NumGroups = NumTeams; - } - if (NumGroups > MaxTeams) { - NumGroups = MaxTeams; - if (print_kernel_trace & STARTUP_DETAILS) - DP("Limiting num_groups %d to Max_Teams %d \n", NumGroups, MaxTeams); - } - if (NumGroups > NumTeams && NumTeams > 0) { - NumGroups = NumTeams; - if (print_kernel_trace & STARTUP_DETAILS) - DP("Limiting num_groups %d to clause num_teams %d \n", NumGroups, - NumTeams); - } - } - - // num_teams clause always honored, no matter what, unless DEFAULT is active. - if (NumTeams > 0) { - NumGroups = NumTeams; - // Cap num_groups to EnvMaxTeamsDefault if set. - if (Env.MaxTeamsDefault > 0 && NumGroups > Env.MaxTeamsDefault) - NumGroups = Env.MaxTeamsDefault; - } - if (print_kernel_trace & STARTUP_DETAILS) { - DP("threadsPerGroup: %d\n", ThreadsPerGroup); - DP("num_groups: %d\n", NumGroups); - DP("loop_tripcount: %ld\n", LoopTripcount); - } - DP("Final %d num_groups and %d threadsPerGroup\n", NumGroups, - ThreadsPerGroup); - - LaunchVals Res; - Res.WorkgroupSize = ThreadsPerGroup; - Res.GridSize = ThreadsPerGroup * NumGroups; - return Res; -} - -static uint64_t acquireAvailablePacketId(hsa_queue_t *Queue) { - uint64_t PacketId = hsa_queue_add_write_index_relaxed(Queue, 1); - bool Full = true; - while (Full) { - Full = - PacketId >= (Queue->size + hsa_queue_load_read_index_scacquire(Queue)); - } - return PacketId; -} - -int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, - ptrdiff_t *TgtOffsets, int32_t ArgNum, int32_t NumTeams, - int32_t ThreadLimit, uint64_t LoopTripcount) { - // Set the context we are using - // update thread limit content in gpu memory if un-initialized or specified - // from host - - DP("Run target team region thread_limit %d\n", ThreadLimit); - - // All args are references. - std::vector Args(ArgNum); - std::vector Ptrs(ArgNum); - - DP("Arg_num: %d\n", ArgNum); - for (int32_t I = 0; I < ArgNum; ++I) { - Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]); - Args[I] = &Ptrs[I]; - DP("Offseted base: arg[%d]:" DPxMOD "\n", I, DPxPTR(Ptrs[I])); - } - - KernelTy *KernelInfo = (KernelTy *)TgtEntryPtr; - - std::string KernelName = std::string(KernelInfo->Name); - auto &KernelInfoTable = DeviceInfo().KernelInfoTable; - if (KernelInfoTable[DeviceId].find(KernelName) == - KernelInfoTable[DeviceId].end()) { - DP("Kernel %s not found\n", KernelName.c_str()); - return OFFLOAD_FAIL; - } - - const atl_kernel_info_t KernelInfoEntry = - KernelInfoTable[DeviceId][KernelName]; - const uint32_t GroupSegmentSize = - KernelInfoEntry.group_segment_size + DeviceInfo().Env.DynamicMemSize; - const uint32_t SgprCount = KernelInfoEntry.sgpr_count; - const uint32_t VgprCount = KernelInfoEntry.vgpr_count; - const uint32_t SgprSpillCount = KernelInfoEntry.sgpr_spill_count; - const uint32_t VgprSpillCount = KernelInfoEntry.vgpr_spill_count; - - assert(ArgNum == (int)KernelInfoEntry.explicit_argument_count); - - /* - * Set limit based on ThreadsPerGroup and GroupsPerDevice - */ - LaunchVals LV = - getLaunchVals(DeviceInfo().WarpSize[DeviceId], DeviceInfo().Env, - KernelInfo->ConstWGSize, KernelInfo->ExecutionMode, - NumTeams, // From run_region arg - ThreadLimit, // From run_region arg - LoopTripcount, // From run_region arg - DeviceInfo().NumTeams[KernelInfo->DeviceId]); - const int GridSize = LV.GridSize; - const int WorkgroupSize = LV.WorkgroupSize; - - if (print_kernel_trace >= LAUNCH) { - int NumGroups = GridSize / WorkgroupSize; - // enum modes are SPMD, GENERIC, NONE 0,1,2 - // if doing rtl timing, print to stderr, unless stdout requested. - bool TraceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING); - fprintf(TraceToStdout ? stdout : stderr, - "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) " - "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u " - "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu n:%s\n", - DeviceId, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize, - ArgNum, NumGroups, WorkgroupSize, NumTeams, ThreadLimit, - GroupSegmentSize, SgprCount, VgprCount, SgprSpillCount, - VgprSpillCount, LoopTripcount, KernelInfo->Name); - } - - // Run on the device. - { - hsa_queue_t *Queue = DeviceInfo().HSAQueueSchedulers[DeviceId].next(); - if (!Queue) { - return OFFLOAD_FAIL; - } - uint64_t PacketId = acquireAvailablePacketId(Queue); - - const uint32_t Mask = Queue->size - 1; // size is a power of 2 - hsa_kernel_dispatch_packet_t *Packet = - (hsa_kernel_dispatch_packet_t *)Queue->base_address + (PacketId & Mask); - - // packet->header is written last - Packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; - Packet->workgroup_size_x = WorkgroupSize; - Packet->workgroup_size_y = 1; - Packet->workgroup_size_z = 1; - Packet->reserved0 = 0; - Packet->grid_size_x = GridSize; - Packet->grid_size_y = 1; - Packet->grid_size_z = 1; - Packet->private_segment_size = KernelInfoEntry.private_segment_size; - Packet->group_segment_size = GroupSegmentSize; - Packet->kernel_object = KernelInfoEntry.kernel_object; - Packet->kernarg_address = 0; // use the block allocator - Packet->reserved2 = 0; // impl writes id_ here - Packet->completion_signal = {0}; // may want a pool of signals - - KernelArgPool *ArgPool = nullptr; - void *KernArg = nullptr; - { - auto It = KernelArgPoolMap.find(std::string(KernelInfo->Name)); - if (It != KernelArgPoolMap.end()) { - ArgPool = (It->second).get(); - } - } - if (!ArgPool) { - DP("Warning: No ArgPool for %s on device %d\n", KernelInfo->Name, - DeviceId); - } - { - if (ArgPool) { - assert(ArgPool->KernargSegmentSize == (ArgNum * sizeof(void *))); - KernArg = ArgPool->allocate(ArgNum); - } - if (!KernArg) { - DP("Allocate kernarg failed\n"); - return OFFLOAD_FAIL; - } - - // Copy explicit arguments - for (int I = 0; I < ArgNum; I++) { - memcpy((char *)KernArg + sizeof(void *) * I, Args[I], sizeof(void *)); - } - - // Initialize implicit arguments. TODO: Which of these can be dropped - AMDGPUImplicitArgsTy *ImplArgs = reinterpret_cast( - static_cast(KernArg) + ArgPool->KernargSegmentSize); - memset(ImplArgs, 0, - sizeof(AMDGPUImplicitArgsTy)); // may not be necessary - ImplArgs->OffsetX = 0; - ImplArgs->OffsetY = 0; - ImplArgs->OffsetZ = 0; - - // assign a hostcall buffer for the selected Q - if (__atomic_load_n(&DeviceInfo().HostcallRequired, __ATOMIC_ACQUIRE)) { - // hostrpc_assign_buffer is not thread safe, and this function is - // under a multiple reader lock, not a writer lock. - static pthread_mutex_t HostcallInitLock = PTHREAD_MUTEX_INITIALIZER; - pthread_mutex_lock(&HostcallInitLock); - uint64_t Buffer = hostrpc_assign_buffer( - DeviceInfo().HSAAgents[DeviceId], Queue, DeviceId); - pthread_mutex_unlock(&HostcallInitLock); - if (!Buffer) { - DP("hostrpc_assign_buffer failed, gpu would dereference null and " - "error\n"); - return OFFLOAD_FAIL; - } - - DP("Implicit argument count: %d\n", - KernelInfoEntry.implicit_argument_count); - if (KernelInfoEntry.implicit_argument_count >= 4) { - // Initialise pointer for implicit_argument_count != 0 ABI - // Guess that the right implicit argument is at offset 24 after - // the explicit arguments. In the future, should be able to read - // the offset from msgpack. Clang is not annotating it at present. - uint64_t Offset = - sizeof(void *) * (KernelInfoEntry.explicit_argument_count + 3); - if ((Offset + 8) > ArgPool->kernargSizeIncludingImplicit()) { - DP("Bad offset of hostcall: %lu, exceeds kernarg size w/ implicit " - "args: %d\n", - Offset + 8, ArgPool->kernargSizeIncludingImplicit()); - } else { - memcpy(static_cast(KernArg) + Offset, &Buffer, 8); - } - } - - // initialise pointer for implicit_argument_count == 0 ABI - ImplArgs->HostcallPtr = Buffer; - } - - Packet->kernarg_address = KernArg; - } - - hsa_signal_t S = DeviceInfo().FreeSignalPool.pop(); - if (S.handle == 0) { - DP("Failed to get signal instance\n"); - return OFFLOAD_FAIL; - } - Packet->completion_signal = S; - hsa_signal_store_relaxed(Packet->completion_signal, 1); - - // Publish the packet indicating it is ready to be processed - core::packetStoreRelease(reinterpret_cast(Packet), - core::createHeader(), Packet->setup); - - // Since the packet is already published, its contents must not be - // accessed any more - hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId); - - while (hsa_signal_wait_scacquire(S, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, - HSA_WAIT_STATE_BLOCKED) != 0) - ; - - assert(ArgPool); - ArgPool->deallocate(KernArg); - DeviceInfo().FreeSignalPool.push(S); - } - - DP("Kernel completed\n"); - return OFFLOAD_SUCCESS; -} - -bool elfMachineIdIsAmdgcn(__tgt_device_image *Image) { - const uint16_t AmdgcnMachineID = EM_AMDGPU; - const int32_t R = elf_check_machine(Image, AmdgcnMachineID); - if (!R) { - DP("Supported machine ID not found\n"); - } - return R; -} - -uint32_t elfEFlags(__tgt_device_image *Image) { - const char *ImgBegin = (char *)Image->ImageStart; - size_t ImgSize = (char *)Image->ImageEnd - ImgBegin; - - StringRef Buffer = StringRef(ImgBegin, ImgSize); - auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""), - /*InitContent=*/false); - if (!ElfOrErr) { - consumeError(ElfOrErr.takeError()); - return 0; - } - - if (const auto *ELFObj = dyn_cast(ElfOrErr->get())) - return ELFObj->getPlatformFlags(); - return 0; -} - -template bool enforceUpperBound(T *Value, T Upper) { - bool Changed = *Value > Upper; - if (Changed) { - *Value = Upper; - } - return Changed; -} - -struct SymbolInfo { - const void *Addr = nullptr; - uint32_t Size = UINT32_MAX; - uint32_t ShType = SHT_NULL; -}; - -int getSymbolInfoWithoutLoading(const ELFObjectFile &ELFObj, - StringRef SymName, SymbolInfo *Res) { - auto SymOrErr = getELFSymbol(ELFObj, SymName); - if (!SymOrErr) { - std::string ErrorString = toString(SymOrErr.takeError()); - DP("Failed ELF lookup: %s\n", ErrorString.c_str()); - return 1; - } - if (!*SymOrErr) - return 1; - - auto SymSecOrErr = ELFObj.getELFFile().getSection((*SymOrErr)->st_shndx); - if (!SymSecOrErr) { - std::string ErrorString = toString(SymOrErr.takeError()); - DP("Failed ELF lookup: %s\n", ErrorString.c_str()); - return 1; - } - - Res->Addr = (*SymOrErr)->st_value + ELFObj.getELFFile().base(); - Res->Size = static_cast((*SymOrErr)->st_size); - Res->ShType = static_cast((*SymSecOrErr)->sh_type); - return 0; -} - -int getSymbolInfoWithoutLoading(char *Base, size_t ImgSize, const char *SymName, - SymbolInfo *Res) { - StringRef Buffer = StringRef(Base, ImgSize); - auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""), - /*InitContent=*/false); - if (!ElfOrErr) { - REPORT("Failed to load ELF: %s\n", toString(ElfOrErr.takeError()).c_str()); - return 1; - } - - if (const auto *ELFObj = dyn_cast(ElfOrErr->get())) - return getSymbolInfoWithoutLoading(*ELFObj, SymName, Res); - return 1; -} - -hsa_status_t interopGetSymbolInfo(char *Base, size_t ImgSize, - const char *SymName, const void **VarAddr, - uint32_t *VarSize) { - SymbolInfo SI; - int Rc = getSymbolInfoWithoutLoading(Base, ImgSize, SymName, &SI); - if (Rc == 0) { - *VarAddr = SI.Addr; - *VarSize = SI.Size; - return HSA_STATUS_SUCCESS; - } - return HSA_STATUS_ERROR; -} - -template -hsa_status_t moduleRegisterFromMemoryToPlace( - std::map &KernelInfoTable, - std::map &SymbolInfoTable, - void *ModuleBytes, size_t ModuleSize, int DeviceId, C Cb, - std::vector &HSAExecutables) { - auto L = [](void *Data, size_t Size, void *CbState) -> hsa_status_t { - C *Unwrapped = static_cast(CbState); - return (*Unwrapped)(Data, Size); - }; - return core::RegisterModuleFromMemory( - KernelInfoTable, SymbolInfoTable, ModuleBytes, ModuleSize, - DeviceInfo().HSAAgents[DeviceId], L, static_cast(&Cb), - HSAExecutables); -} - -uint64_t getDeviceStateBytes(char *ImageStart, size_t ImgSize) { - uint64_t DeviceStateBytes = 0; - { - // If this is the deviceRTL, get the state variable size - SymbolInfo SizeSi; - int Rc = getSymbolInfoWithoutLoading( - ImageStart, ImgSize, "omptarget_nvptx_device_State_size", &SizeSi); - - if (Rc == 0) { - if (SizeSi.Size != sizeof(uint64_t)) { - DP("Found device_State_size variable with wrong size\n"); - return 0; - } - - // Read number of bytes directly from the elf - memcpy(&DeviceStateBytes, SizeSi.Addr, sizeof(uint64_t)); - } - } - return DeviceStateBytes; -} - -struct DeviceEnvironment { - // initialise an DeviceEnvironmentTy in the deviceRTL - // patches around differences in the deviceRTL between trunk, aomp, - // rocmcc. Over time these differences will tend to zero and this class - // simplified. - // Symbol may be in .data or .bss, and may be missing fields, todo: - // review aomp/trunk/rocm and simplify the following - - // The symbol may also have been deadstripped because the device side - // accessors were unused. - - // If the symbol is in .data (aomp, rocm) it can be written directly. - // If it is in .bss, we must wait for it to be allocated space on the - // gpu (trunk) and initialize after loading. - const char *sym() { return "__omp_rtl_device_environment"; } - - DeviceEnvironmentTy HostDeviceEnv; - SymbolInfo SI; - bool Valid = false; - - __tgt_device_image *Image; - const size_t ImgSize; - - DeviceEnvironment(int DeviceId, int NumberDevices, int DynamicMemSize, - __tgt_device_image *Image, const size_t ImgSize) - : Image(Image), ImgSize(ImgSize) { - - HostDeviceEnv.NumDevices = NumberDevices; - HostDeviceEnv.DeviceNum = DeviceId; - HostDeviceEnv.DebugKind = 0; - HostDeviceEnv.DynamicMemSize = DynamicMemSize; - if (char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) - HostDeviceEnv.DebugKind = std::stoi(EnvStr); - - int Rc = getSymbolInfoWithoutLoading((char *)Image->ImageStart, ImgSize, - sym(), &SI); - if (Rc != 0) { - DP("Finding global device environment '%s' - symbol missing.\n", sym()); - return; - } - - if (SI.Size > sizeof(HostDeviceEnv)) { - DP("Symbol '%s' has size %u, expected at most %zu.\n", sym(), SI.Size, - sizeof(HostDeviceEnv)); - return; - } - - Valid = true; - } - - bool inImage() { return SI.ShType != SHT_NOBITS; } - - hsa_status_t beforeLoading(void *Data, size_t Size) { - if (Valid) { - if (inImage()) { - DP("Setting global device environment before load (%u bytes)\n", - SI.Size); - uint64_t Offset = reinterpret_cast(SI.Addr) - - reinterpret_cast(Image->ImageStart); - void *Pos = reinterpret_cast(Data) + Offset; - memcpy(Pos, &HostDeviceEnv, SI.Size); - } - } - return HSA_STATUS_SUCCESS; - } - - hsa_status_t afterLoading() { - if (Valid) { - if (!inImage()) { - DP("Setting global device environment after load (%u bytes)\n", - SI.Size); - int DeviceId = HostDeviceEnv.DeviceNum; - auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId]; - void *StatePtr; - uint32_t StatePtrSize; - hsa_status_t Err = interop_hsa_get_symbol_info( - SymbolInfo, DeviceId, sym(), &StatePtr, &StatePtrSize); - if (Err != HSA_STATUS_SUCCESS) { - DP("failed to find %s in loaded image\n", sym()); - return Err; - } - - if (StatePtrSize != SI.Size) { - DP("Symbol had size %u before loading, %u after\n", StatePtrSize, - SI.Size); - return HSA_STATUS_ERROR; - } - - return DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &HostDeviceEnv, - StatePtrSize, DeviceId); - } - } - return HSA_STATUS_SUCCESS; - } -}; - -hsa_status_t implCalloc(void **RetPtr, size_t Size, int DeviceId) { - uint64_t Rounded = 4 * ((Size + 3) / 4); - void *Ptr; - hsa_amd_memory_pool_t MemoryPool = DeviceInfo().getDeviceMemoryPool(DeviceId); - hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Rounded, 0, &Ptr); - if (Err != HSA_STATUS_SUCCESS) { - return Err; - } - - hsa_status_t Rc = hsa_amd_memory_fill(Ptr, 0, Rounded / 4); - if (Rc != HSA_STATUS_SUCCESS) { - DP("zero fill device_state failed with %u\n", Rc); - core::Runtime::Memfree(Ptr); - return HSA_STATUS_ERROR; - } - - *RetPtr = Ptr; - return HSA_STATUS_SUCCESS; -} - -bool imageContainsSymbol(void *Data, size_t Size, const char *Sym) { - SymbolInfo SI; - int Rc = getSymbolInfoWithoutLoading((char *)Data, Size, Sym, &SI); - return (Rc == 0) && (SI.Addr != nullptr); -} - -hsa_status_t lock_memory(void *HostPtr, size_t Size, hsa_agent_t Agent, - void **LockedHostPtr) { - hsa_status_t err = is_locked(HostPtr, LockedHostPtr); - if (err != HSA_STATUS_SUCCESS) - return err; - - // HostPtr is already locked, just return it - if (*LockedHostPtr) - return HSA_STATUS_SUCCESS; - - hsa_agent_t Agents[1] = {Agent}; - return hsa_amd_memory_lock(HostPtr, Size, Agents, /*num_agent=*/1, - LockedHostPtr); -} - -hsa_status_t unlock_memory(void *HostPtr) { - void *LockedHostPtr = nullptr; - hsa_status_t err = is_locked(HostPtr, &LockedHostPtr); - if (err != HSA_STATUS_SUCCESS) - return err; - - // if LockedHostPtr is nullptr, then HostPtr was not locked - if (!LockedHostPtr) - return HSA_STATUS_SUCCESS; - - err = hsa_amd_memory_unlock(HostPtr); - return err; -} - -} // namespace - -namespace core { -hsa_status_t allow_access_to_all_gpu_agents(void *Ptr) { - return hsa_amd_agents_allow_access(DeviceInfo().HSAAgents.size(), - &DeviceInfo().HSAAgents[0], NULL, Ptr); -} -} // namespace core - -static hsa_status_t GetIsaInfo(hsa_isa_t isa, void *data) { - hsa_status_t err; - uint32_t name_len; - err = hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME_LENGTH, &name_len); - if (err != HSA_STATUS_SUCCESS) { - DP("Error getting ISA info length\n"); - return err; - } - - char TargetID[name_len]; - err = hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME, TargetID); - if (err != HSA_STATUS_SUCCESS) { - DP("Error getting ISA info name\n"); - return err; - } - - auto TripleTargetID = llvm::StringRef(TargetID); - if (TripleTargetID.consume_front("amdgcn-amd-amdhsa")) { - DeviceInfo().TargetID.push_back(TripleTargetID.ltrim('-').str()); - } - return HSA_STATUS_SUCCESS; -} - -extern "C" { -int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) { - return elfMachineIdIsAmdgcn(Image); -} - -int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *image, - __tgt_image_info *info) { - if (!__tgt_rtl_is_valid_binary(image)) - return false; - - // A subarchitecture was not specified. Assume it is compatible. - if (!info->Arch) - return true; - - int32_t NumberOfDevices = __tgt_rtl_number_of_devices(); - - for (int32_t DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) { - __tgt_rtl_init_device(DeviceId); - hsa_agent_t agent = DeviceInfo().HSAAgents[DeviceId]; - hsa_status_t err = hsa_agent_iterate_isas(agent, GetIsaInfo, &DeviceId); - if (err != HSA_STATUS_SUCCESS) { - DP("Error iterating ISAs\n"); - return false; - } - if (!isImageCompatibleWithEnv(info, DeviceInfo().TargetID[DeviceId])) - return false; - } - DP("Image has Target ID compatible with the current environment: %s\n", - info->Arch); - return true; -} - -int32_t __tgt_rtl_init_plugin() { return OFFLOAD_SUCCESS; } -int32_t __tgt_rtl_deinit_plugin() { return OFFLOAD_SUCCESS; } - -int __tgt_rtl_number_of_devices() { - // If the construction failed, no methods are safe to call - if (DeviceInfo().ConstructionSucceeded) { - return DeviceInfo().NumberOfDevices; - } - DP("AMDGPU plugin construction failed. Zero devices available\n"); - return 0; -} - -int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { - DP("Init requires flags to %ld\n", RequiresFlags); - DeviceInfo().RequiresFlags = RequiresFlags; - return RequiresFlags; -} - -int32_t __tgt_rtl_init_device(int DeviceId) { - hsa_status_t Err = hsa_init(); - if (Err != HSA_STATUS_SUCCESS) { - DP("HSA Initialization Failed.\n"); - return HSA_STATUS_ERROR; - } - // this is per device id init - DP("Initialize the device id: %d\n", DeviceId); - - hsa_agent_t Agent = DeviceInfo().HSAAgents[DeviceId]; - - // Get number of Compute Unit - uint32_t ComputeUnits = 0; - Err = hsa_agent_get_info( - Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, - &ComputeUnits); - if (Err != HSA_STATUS_SUCCESS) { - DeviceInfo().ComputeUnits[DeviceId] = 1; - DP("Error getting compute units : settiing to 1\n"); - } else { - DeviceInfo().ComputeUnits[DeviceId] = ComputeUnits; - DP("Using %d compute unis per grid\n", DeviceInfo().ComputeUnits[DeviceId]); - } - - char GetInfoName[64]; // 64 max size returned by get info - Err = hsa_agent_get_info(Agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME, - (void *)GetInfoName); - if (Err) - DeviceInfo().GPUName[DeviceId] = "--unknown gpu--"; - else { - DeviceInfo().GPUName[DeviceId] = GetInfoName; - } - - if (print_kernel_trace & STARTUP_DETAILS) - DP("Device#%-2d CU's: %2d %s\n", DeviceId, - DeviceInfo().ComputeUnits[DeviceId], - DeviceInfo().GPUName[DeviceId].c_str()); - - // Query attributes to determine number of threads/block and blocks/grid. - uint16_t WorkgroupMaxDim[3]; - Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM, - &WorkgroupMaxDim); - if (Err != HSA_STATUS_SUCCESS) { - DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::DefaultNumTeams; - DP("Error getting grid dims: num groups : %d\n", - RTLDeviceInfoTy::DefaultNumTeams); - } else if (WorkgroupMaxDim[0] <= RTLDeviceInfoTy::HardTeamLimit) { - DeviceInfo().GroupsPerDevice[DeviceId] = WorkgroupMaxDim[0]; - DP("Using %d ROCm blocks per grid\n", - DeviceInfo().GroupsPerDevice[DeviceId]); - } else { - DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::HardTeamLimit; - DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping " - "at the hard limit\n", - WorkgroupMaxDim[0], RTLDeviceInfoTy::HardTeamLimit); - } - - // Get thread limit - hsa_dim3_t GridMaxDim; - Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim); - if (Err == HSA_STATUS_SUCCESS) { - DeviceInfo().ThreadsPerGroup[DeviceId] = - reinterpret_cast(&GridMaxDim)[0] / - DeviceInfo().GroupsPerDevice[DeviceId]; - - if (DeviceInfo().ThreadsPerGroup[DeviceId] == 0) { - DeviceInfo().ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize; - DP("Default thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize); - } else if (enforceUpperBound(&DeviceInfo().ThreadsPerGroup[DeviceId], - RTLDeviceInfoTy::MaxWgSize)) { - DP("Capped thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize); - } else { - DP("Using ROCm Queried thread limit: %d\n", - DeviceInfo().ThreadsPerGroup[DeviceId]); - } - } else { - DeviceInfo().ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize; - DP("Error getting max block dimension, use default:%d \n", - RTLDeviceInfoTy::MaxWgSize); - } - - // Get wavefront size - uint32_t WavefrontSize = 0; - Err = - hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &WavefrontSize); - if (Err == HSA_STATUS_SUCCESS) { - DP("Queried wavefront size: %d\n", WavefrontSize); - DeviceInfo().WarpSize[DeviceId] = WavefrontSize; - } else { - // TODO: Burn the wavefront size into the code object - DP("Warning: Unknown wavefront size, assuming 64\n"); - DeviceInfo().WarpSize[DeviceId] = 64; - } - - // Adjust teams to the env variables - - if (DeviceInfo().Env.TeamLimit > 0 && - (enforceUpperBound(&DeviceInfo().GroupsPerDevice[DeviceId], - DeviceInfo().Env.TeamLimit))) { - DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n", - DeviceInfo().Env.TeamLimit); - } - - // Set default number of teams - if (DeviceInfo().Env.NumTeams > 0) { - DeviceInfo().NumTeams[DeviceId] = DeviceInfo().Env.NumTeams; - DP("Default number of teams set according to environment %d\n", - DeviceInfo().Env.NumTeams); - } else { - char *TeamsPerCUEnvStr = getenv("OMP_TARGET_TEAMS_PER_PROC"); - int TeamsPerCU = DefaultTeamsPerCU; - if (TeamsPerCUEnvStr) { - TeamsPerCU = std::stoi(TeamsPerCUEnvStr); - } - - DeviceInfo().NumTeams[DeviceId] = - TeamsPerCU * DeviceInfo().ComputeUnits[DeviceId]; - DP("Default number of teams = %d * number of compute units %d\n", - TeamsPerCU, DeviceInfo().ComputeUnits[DeviceId]); - } - - if (enforceUpperBound(&DeviceInfo().NumTeams[DeviceId], - DeviceInfo().GroupsPerDevice[DeviceId])) { - DP("Default number of teams exceeds device limit, capping at %d\n", - DeviceInfo().GroupsPerDevice[DeviceId]); - } - - // Adjust threads to the env variables - if (DeviceInfo().Env.TeamThreadLimit > 0 && - (enforceUpperBound(&DeviceInfo().NumThreads[DeviceId], - DeviceInfo().Env.TeamThreadLimit))) { - DP("Capping max number of threads to OMP_TEAMS_THREAD_LIMIT=%d\n", - DeviceInfo().Env.TeamThreadLimit); - } - - // Set default number of threads - DeviceInfo().NumThreads[DeviceId] = RTLDeviceInfoTy::DefaultWgSize; - DP("Default number of threads set according to library's default %d\n", - RTLDeviceInfoTy::DefaultWgSize); - if (enforceUpperBound(&DeviceInfo().NumThreads[DeviceId], - DeviceInfo().ThreadsPerGroup[DeviceId])) { - DP("Default number of threads exceeds device limit, capping at %d\n", - DeviceInfo().ThreadsPerGroup[DeviceId]); - } - - DP("Device %d: default limit for groupsPerDevice %d & threadsPerGroup %d\n", - DeviceId, DeviceInfo().GroupsPerDevice[DeviceId], - DeviceInfo().ThreadsPerGroup[DeviceId]); - - DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", DeviceId, - DeviceInfo().WarpSize[DeviceId], DeviceInfo().ThreadsPerGroup[DeviceId], - DeviceInfo().GroupsPerDevice[DeviceId], - DeviceInfo().GroupsPerDevice[DeviceId] * - DeviceInfo().ThreadsPerGroup[DeviceId]); - - return OFFLOAD_SUCCESS; -} - -static __tgt_target_table * -__tgt_rtl_load_binary_locked(int32_t DeviceId, __tgt_device_image *Image); - -__tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId, - __tgt_device_image *Image) { - DeviceInfo().LoadRunLock.lock(); - __tgt_target_table *Res = __tgt_rtl_load_binary_locked(DeviceId, Image); - DeviceInfo().LoadRunLock.unlock(); - return Res; -} - -__tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId, - __tgt_device_image *Image) { - // This function loads the device image onto gpu[DeviceId] and does other - // per-image initialization work. Specifically: - // - // - Initialize an DeviceEnvironmentTy instance embedded in the - // image at the symbol "__omp_rtl_device_environment" - // Fields DebugKind, DeviceNum, NumDevices. Used by the deviceRTL. - // - // - Allocate a large array per-gpu (could be moved to init_device) - // - Read a uint64_t at symbol omptarget_nvptx_device_State_size - // - Allocate at least that many bytes of gpu memory - // - Zero initialize it - // - Write the pointer to the symbol omptarget_nvptx_device_State - // - // - Pulls some per-kernel information together from various sources and - // records it in the KernelsList for quicker access later - // - // The initialization can be done before or after loading the image onto the - // gpu. This function presently does a mixture. Using the hsa api to get/set - // the information is simpler to implement, in exchange for more complicated - // runtime behaviour. E.g. launching a kernel or using dma to get eight bytes - // back from the gpu vs a hashtable lookup on the host. - - const size_t ImgSize = (char *)Image->ImageEnd - (char *)Image->ImageStart; - - DeviceInfo().clearOffloadEntriesTable(DeviceId); - - // We do not need to set the ELF version because the caller of this function - // had to do that to decide the right runtime to use - - if (!elfMachineIdIsAmdgcn(Image)) - return NULL; - - { - auto Env = - DeviceEnvironment(DeviceId, DeviceInfo().NumberOfDevices, - DeviceInfo().Env.DynamicMemSize, Image, ImgSize); - - auto &KernelInfo = DeviceInfo().KernelInfoTable[DeviceId]; - auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId]; - hsa_status_t Err = moduleRegisterFromMemoryToPlace( - KernelInfo, SymbolInfo, (void *)Image->ImageStart, ImgSize, DeviceId, - [&](void *Data, size_t Size) { - if (imageContainsSymbol(Data, Size, "needs_hostcall_buffer")) { - __atomic_store_n(&DeviceInfo().HostcallRequired, true, - __ATOMIC_RELEASE); - } - return Env.beforeLoading(Data, Size); - }, - DeviceInfo().HSAExecutables); - - check("Module registering", Err); - if (Err != HSA_STATUS_SUCCESS) { - const char *DeviceName = DeviceInfo().GPUName[DeviceId].c_str(); - const char *ElfName = get_elf_mach_gfx_name(elfEFlags(Image)); - - if (strcmp(DeviceName, ElfName) != 0) { - DP("Possible gpu arch mismatch: device:%s, image:%s please check" - " compiler flag: -march=\n", - DeviceName, ElfName); - } else { - DP("Error loading image onto GPU: %s\n", get_error_string(Err)); - } - - return NULL; - } - - Err = Env.afterLoading(); - if (Err != HSA_STATUS_SUCCESS) { - return NULL; - } - } - - DP("AMDGPU module successfully loaded!\n"); - - { - // the device_State array is either large value in bss or a void* that - // needs to be assigned to a pointer to an array of size device_state_bytes - // If absent, it has been deadstripped and needs no setup. - - void *StatePtr; - uint32_t StatePtrSize; - auto &SymbolInfoMap = DeviceInfo().SymbolInfoTable[DeviceId]; - hsa_status_t Err = interop_hsa_get_symbol_info( - SymbolInfoMap, DeviceId, "omptarget_nvptx_device_State", &StatePtr, - &StatePtrSize); - - if (Err != HSA_STATUS_SUCCESS) { - DP("No device_state symbol found, skipping initialization\n"); - } else { - if (StatePtrSize < sizeof(void *)) { - DP("unexpected size of state_ptr %u != %zu\n", StatePtrSize, - sizeof(void *)); - return NULL; - } - - // if it's larger than a void*, assume it's a bss array and no further - // initialization is required. Only try to set up a pointer for - // sizeof(void*) - if (StatePtrSize == sizeof(void *)) { - uint64_t DeviceStateBytes = - getDeviceStateBytes((char *)Image->ImageStart, ImgSize); - if (DeviceStateBytes == 0) { - DP("Can't initialize device_State, missing size information\n"); - return NULL; - } - - auto &DSS = DeviceInfo().DeviceStateStore[DeviceId]; - if (DSS.first.get() == nullptr) { - assert(DSS.second == 0); - void *Ptr = NULL; - hsa_status_t Err = implCalloc(&Ptr, DeviceStateBytes, DeviceId); - if (Err != HSA_STATUS_SUCCESS) { - DP("Failed to allocate device_state array\n"); - return NULL; - } - DSS = { - std::unique_ptr{Ptr}, - DeviceStateBytes, - }; - } - - void *Ptr = DSS.first.get(); - if (DeviceStateBytes != DSS.second) { - DP("Inconsistent sizes of device_State unsupported\n"); - return NULL; - } - - // write ptr to device memory so it can be used by later kernels - Err = DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &Ptr, - sizeof(void *), DeviceId); - if (Err != HSA_STATUS_SUCCESS) { - DP("memcpy install of state_ptr failed\n"); - return NULL; - } - } - } - } - - // Here, we take advantage of the data that is appended after img_end to get - // the symbols' name we need to load. This data consist of the host entries - // begin and end as well as the target name (see the offloading linker script - // creation in clang compiler). - - // Find the symbols in the module by name. The name can be obtain by - // concatenating the host entry name with the target name - - __tgt_offload_entry *HostBegin = Image->EntriesBegin; - __tgt_offload_entry *HostEnd = Image->EntriesEnd; - - for (__tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) { - - if (!E->addr) { - // The host should have always something in the address to - // uniquely identify the target region. - DP("Analyzing host entry '' (size = %lld)...\n", - (unsigned long long)E->size); - return NULL; - } - - if (E->size) { - __tgt_offload_entry Entry = *E; - - void *Varptr; - uint32_t Varsize; - - auto &SymbolInfoMap = DeviceInfo().SymbolInfoTable[DeviceId]; - hsa_status_t Err = interop_hsa_get_symbol_info( - SymbolInfoMap, DeviceId, E->name, &Varptr, &Varsize); - - if (Err != HSA_STATUS_SUCCESS) { - // Inform the user what symbol prevented offloading - DP("Loading global '%s' (Failed)\n", E->name); - return NULL; - } - - if (Varsize != E->size) { - DP("Loading global '%s' - size mismatch (%u != %lu)\n", E->name, - Varsize, E->size); - return NULL; - } - - DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", - DPxPTR(E - HostBegin), E->name, DPxPTR(Varptr)); - Entry.addr = (void *)Varptr; - - DeviceInfo().addOffloadEntry(DeviceId, Entry); - - if (DeviceInfo().RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && - E->flags & OMP_DECLARE_TARGET_LINK) { - // If unified memory is present any target link variables - // can access host addresses directly. There is no longer a - // need for device copies. - Err = DeviceInfo().freesignalpoolMemcpyH2D(Varptr, E->addr, - sizeof(void *), DeviceId); - if (Err != HSA_STATUS_SUCCESS) - DP("Error when copying USM\n"); - DP("Copy linked variable host address (" DPxMOD ")" - "to device address (" DPxMOD ")\n", - DPxPTR(*((void **)E->addr)), DPxPTR(Varptr)); - } - - continue; - } - - DP("to find the kernel name: %s size: %lu\n", E->name, strlen(E->name)); - - // errors in kernarg_segment_size previously treated as = 0 (or as undef) - uint32_t KernargSegmentSize = 0; - auto &KernelInfoMap = DeviceInfo().KernelInfoTable[DeviceId]; - hsa_status_t Err = HSA_STATUS_SUCCESS; - if (!E->name) { - Err = HSA_STATUS_ERROR; - } else { - std::string KernelStr = std::string(E->name); - auto It = KernelInfoMap.find(KernelStr); - if (It != KernelInfoMap.end()) { - atl_kernel_info_t Info = It->second; - KernargSegmentSize = Info.kernel_segment_size; - } else { - Err = HSA_STATUS_ERROR; - } - } - - // default value GENERIC (in case symbol is missing from cubin file) - llvm::omp::OMPTgtExecModeFlags ExecModeVal = - llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC; - - // get flat group size if present, else Default_WG_Size - int16_t WGSizeVal = RTLDeviceInfoTy::DefaultWgSize; - - // get Kernel Descriptor if present. - // Keep struct in sync wih getTgtAttributeStructQTy in CGOpenMPRuntime.cpp - struct KernDescValType { - uint16_t Version; - uint16_t TSize; - uint16_t WGSize; - }; - struct KernDescValType KernDescVal; - std::string KernDescNameStr(E->name); - KernDescNameStr += "_kern_desc"; - const char *KernDescName = KernDescNameStr.c_str(); - - const void *KernDescPtr; - uint32_t KernDescSize; - void *CallStackAddr = nullptr; - Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, KernDescName, - &KernDescPtr, &KernDescSize); - - if (Err == HSA_STATUS_SUCCESS) { - if ((size_t)KernDescSize != sizeof(KernDescVal)) - DP("Loading global computation properties '%s' - size mismatch (%u != " - "%lu)\n", - KernDescName, KernDescSize, sizeof(KernDescVal)); - - memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize); - - // Check structure size against recorded size. - if ((size_t)KernDescSize != KernDescVal.TSize) - DP("KernDescVal size %lu does not match advertized size %d for '%s'\n", - sizeof(KernDescVal), KernDescVal.TSize, KernDescName); - - DP("After loading global for %s KernDesc \n", KernDescName); - DP("KernDesc: Version: %d\n", KernDescVal.Version); - DP("KernDesc: TSize: %d\n", KernDescVal.TSize); - DP("KernDesc: WG_Size: %d\n", KernDescVal.WGSize); - - if (KernDescVal.WGSize == 0) { - KernDescVal.WGSize = RTLDeviceInfoTy::DefaultWgSize; - DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WGSize); - } - WGSizeVal = KernDescVal.WGSize; - DP("WGSizeVal %d\n", WGSizeVal); - check("Loading KernDesc computation property", Err); - } else { - DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName); - - // Flat group size - std::string WGSizeNameStr(E->name); - WGSizeNameStr += "_wg_size"; - const char *WGSizeName = WGSizeNameStr.c_str(); - - const void *WGSizePtr; - uint32_t WGSize; - Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, WGSizeName, - &WGSizePtr, &WGSize); - - if (Err == HSA_STATUS_SUCCESS) { - if ((size_t)WGSize != sizeof(int16_t)) { - DP("Loading global computation properties '%s' - size mismatch (%u " - "!= " - "%lu)\n", - WGSizeName, WGSize, sizeof(int16_t)); - return NULL; - } - - memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize); - - DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal); - - if (WGSizeVal < RTLDeviceInfoTy::DefaultWgSize || - WGSizeVal > RTLDeviceInfoTy::MaxWgSize) { - DP("Error wrong WGSize value specified in HSA code object file: " - "%d\n", - WGSizeVal); - WGSizeVal = RTLDeviceInfoTy::DefaultWgSize; - } - } else { - DP("Warning: Loading WGSize '%s' - symbol not found, " - "using default value %d\n", - WGSizeName, WGSizeVal); - } - - check("Loading WGSize computation property", Err); - } - - // Read execution mode from global in binary - std::string ExecModeNameStr(E->name); - ExecModeNameStr += "_exec_mode"; - const char *ExecModeName = ExecModeNameStr.c_str(); - - const void *ExecModePtr; - uint32_t VarSize; - Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, ExecModeName, - &ExecModePtr, &VarSize); - - if (Err == HSA_STATUS_SUCCESS) { - if ((size_t)VarSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) { - DP("Loading global computation properties '%s' - size mismatch(%u != " - "%lu)\n", - ExecModeName, VarSize, sizeof(llvm::omp::OMPTgtExecModeFlags)); - return NULL; - } - - memcpy(&ExecModeVal, ExecModePtr, (size_t)VarSize); - - DP("After loading global for %s ExecMode = %d\n", ExecModeName, - ExecModeVal); - - if (ExecModeVal < 0 || - ExecModeVal > llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD) { - DP("Error wrong exec_mode value specified in HSA code object file: " - "%d\n", - ExecModeVal); - return NULL; - } - } else { - DP("Loading global exec_mode '%s' - symbol missing, using default " - "value " - "GENERIC (1)\n", - ExecModeName); - } - check("Loading computation property", Err); - - KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId, - CallStackAddr, E->name, KernargSegmentSize, - DeviceInfo().KernArgPool)); - __tgt_offload_entry Entry = *E; - Entry.addr = (void *)&KernelsList.back(); - DeviceInfo().addOffloadEntry(DeviceId, Entry); - DP("Entry point %ld maps to %s\n", E - HostBegin, E->name); - } - - return DeviceInfo().getOffloadEntriesTable(DeviceId); -} - -void *__tgt_rtl_data_alloc(int DeviceId, int64_t Size, void *, int32_t Kind) { - void *Ptr = NULL; - assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); - - hsa_amd_memory_pool_t MemoryPool; - switch (Kind) { - case TARGET_ALLOC_DEFAULT: - case TARGET_ALLOC_DEVICE: - // GPU memory - MemoryPool = DeviceInfo().getDeviceMemoryPool(DeviceId); - break; - case TARGET_ALLOC_HOST: - // non-migratable memory accessible by host and device(s) - MemoryPool = DeviceInfo().getHostMemoryPool(); - break; - default: - REPORT("Invalid target data allocation kind or requested allocator not " - "implemented yet\n"); - return NULL; - } - - hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, &Ptr); - DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", Size, - (long long unsigned)(Elf64_Addr)Ptr); - Ptr = (Err == HSA_STATUS_SUCCESS) ? Ptr : NULL; - return Ptr; -} - -int32_t __tgt_rtl_data_submit(int DeviceId, void *TgtPtr, void *HstPtr, - int64_t Size) { - assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); - __tgt_async_info AsyncInfo; - int32_t Rc = dataSubmit(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo); - if (Rc != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - - return __tgt_rtl_synchronize(DeviceId, &AsyncInfo); -} - -int32_t __tgt_rtl_data_submit_async(int DeviceId, void *TgtPtr, void *HstPtr, - int64_t Size, __tgt_async_info *AsyncInfo) { - assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); - if (AsyncInfo) { - initAsyncInfo(AsyncInfo); - return dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfo); - } - return __tgt_rtl_data_submit(DeviceId, TgtPtr, HstPtr, Size); -} - -int32_t __tgt_rtl_data_retrieve(int DeviceId, void *HstPtr, void *TgtPtr, - int64_t Size) { - assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); - __tgt_async_info AsyncInfo; - int32_t Rc = dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, &AsyncInfo); - if (Rc != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - - return __tgt_rtl_synchronize(DeviceId, &AsyncInfo); -} - -int32_t __tgt_rtl_data_retrieve_async(int DeviceId, void *HstPtr, void *TgtPtr, - int64_t Size, - __tgt_async_info *AsyncInfo) { - assert(AsyncInfo && "AsyncInfo is nullptr"); - assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); - initAsyncInfo(AsyncInfo); - return dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfo); -} - -int32_t __tgt_rtl_data_delete(int DeviceId, void *TgtPtr, int32_t) { - assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); - // HSA can free pointers allocated from different types of memory pool. - hsa_status_t Err; - DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)TgtPtr); - Err = core::Runtime::Memfree(TgtPtr); - if (Err != HSA_STATUS_SUCCESS) { - DP("Error when freeing CUDA memory\n"); - return OFFLOAD_FAIL; - } - return OFFLOAD_SUCCESS; -} - -int32_t __tgt_rtl_launch_kernel(int32_t DeviceId, void *TgtEntryPtr, - void **TgtArgs, ptrdiff_t *TgtOffsets, - KernelArgsTy *KernelArgs, - __tgt_async_info *AsyncInfo) { - assert(!KernelArgs->NumTeams[1] && !KernelArgs->NumTeams[2] && - !KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] && - "Only one dimensional kernels supported."); - assert(AsyncInfo && "AsyncInfo is nullptr"); - initAsyncInfo(AsyncInfo); - - DeviceInfo().LoadRunLock.lock_shared(); - int32_t Res = - runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, - KernelArgs->NumArgs, KernelArgs->NumTeams[0], - KernelArgs->ThreadLimit[0], KernelArgs->Tripcount); - - DeviceInfo().LoadRunLock.unlock_shared(); - return Res; -} - -int32_t __tgt_rtl_synchronize(int32_t DeviceId, __tgt_async_info *AsyncInfo) { - assert(AsyncInfo && "AsyncInfo is nullptr"); - - // Cuda asserts that AsyncInfo->Queue is non-null, but this invariant - // is not ensured by devices.cpp for amdgcn - // assert(AsyncInfo->Queue && "AsyncInfo->Queue is nullptr"); - if (AsyncInfo->Queue) { - finiAsyncInfo(AsyncInfo); - } - return OFFLOAD_SUCCESS; -} - -void __tgt_rtl_print_device_info(int32_t DeviceId) { - // TODO: Assertion to see if DeviceId is correct - // NOTE: We don't need to set context for print device info. - - DeviceInfo().printDeviceInfo(DeviceId, DeviceInfo().HSAAgents[DeviceId]); -} - -int32_t __tgt_rtl_data_lock(int32_t DeviceId, void *HostPtr, int64_t Size, - void **LockedHostPtr) { - assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); - - hsa_agent_t Agent = DeviceInfo().HSAAgents[DeviceId]; - hsa_status_t err = lock_memory(HostPtr, Size, Agent, LockedHostPtr); - if (err != HSA_STATUS_SUCCESS) { - DP("Error in tgt_rtl_data_lock\n"); - return OFFLOAD_FAIL; - } - DP("Tgt lock host data %ld bytes, (HostPtr:%016llx).\n", Size, - (long long unsigned)(Elf64_Addr)*LockedHostPtr); - return OFFLOAD_SUCCESS; -} - -int32_t __tgt_rtl_data_unlock(int DeviceId, void *HostPtr) { - assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); - hsa_status_t err = unlock_memory(HostPtr); - if (err != HSA_STATUS_SUCCESS) { - DP("Error in tgt_rtl_data_unlock\n"); - return OFFLOAD_FAIL; - } - - DP("Tgt unlock data (tgt:%016llx).\n", - (long long unsigned)(Elf64_Addr)HostPtr); - return OFFLOAD_SUCCESS; -} - -} // extern "C" diff --git a/openmp/libomptarget/plugins/common/CMakeLists.txt b/openmp/libomptarget/plugins/common/CMakeLists.txt deleted file mode 100644 --- a/openmp/libomptarget/plugins/common/CMakeLists.txt +++ /dev/null @@ -1,14 +0,0 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Common parts which can be used by all plugins -# -##===----------------------------------------------------------------------===## - -add_subdirectory(elf_common) -add_subdirectory(MemoryManager) diff --git a/openmp/libomptarget/plugins/common/MemoryManager/CMakeLists.txt b/openmp/libomptarget/plugins/common/MemoryManager/CMakeLists.txt deleted file mode 100644 --- a/openmp/libomptarget/plugins/common/MemoryManager/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## - -add_library(MemoryManager INTERFACE) - -target_include_directories(MemoryManager INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h b/openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h deleted file mode 100644 --- a/openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h +++ /dev/null @@ -1,347 +0,0 @@ -//===----------- MemoryManager.h - Target independent memory manager ------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Target independent memory manager. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_MEMORYMANAGER_MEMORYMANAGER_H -#define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_MEMORYMANAGER_MEMORYMANAGER_H - -#include -#include -#include -#include -#include -#include -#include - -#include "Debug.h" -#include "omptargetplugin.h" - -/// Base class of per-device allocator. -class DeviceAllocatorTy { -public: - virtual ~DeviceAllocatorTy() = default; - - /// Allocate a memory of size \p Size . \p HstPtr is used to assist the - /// allocation. - virtual void *allocate(size_t Size, void *HstPtr, - TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) = 0; - - /// Delete the pointer \p TgtPtr on the device - virtual int free(void *TgtPtr, TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) = 0; -}; - -/// Class of memory manager. The memory manager is per-device by using -/// per-device allocator. Therefore, each plugin using memory manager should -/// have an allocator for each device. -class MemoryManagerTy { - static constexpr const size_t BucketSize[] = { - 0, 1U << 2, 1U << 3, 1U << 4, 1U << 5, 1U << 6, 1U << 7, - 1U << 8, 1U << 9, 1U << 10, 1U << 11, 1U << 12, 1U << 13}; - - static constexpr const int NumBuckets = - sizeof(BucketSize) / sizeof(BucketSize[0]); - - /// Find the previous number that is power of 2 given a number that is not - /// power of 2. - static size_t floorToPowerOfTwo(size_t Num) { - Num |= Num >> 1; - Num |= Num >> 2; - Num |= Num >> 4; - Num |= Num >> 8; - Num |= Num >> 16; -#if INTPTR_MAX == INT64_MAX - Num |= Num >> 32; -#elif INTPTR_MAX == INT32_MAX - // Do nothing with 32-bit -#else -#error Unsupported architecture -#endif - Num += 1; - return Num >> 1; - } - - /// Find a suitable bucket - static int findBucket(size_t Size) { - const size_t F = floorToPowerOfTwo(Size); - - DP("findBucket: Size %zu is floored to %zu.\n", Size, F); - - int L = 0, H = NumBuckets - 1; - while (H - L > 1) { - int M = (L + H) >> 1; - if (BucketSize[M] == F) - return M; - if (BucketSize[M] > F) - H = M - 1; - else - L = M; - } - - assert(L >= 0 && L < NumBuckets && "L is out of range"); - - DP("findBucket: Size %zu goes to bucket %d\n", Size, L); - - return L; - } - - /// A structure stores the meta data of a target pointer - struct NodeTy { - /// Memory size - const size_t Size; - /// Target pointer - void *Ptr; - - /// Constructor - NodeTy(size_t Size, void *Ptr) : Size(Size), Ptr(Ptr) {} - }; - - /// To make \p NodePtrTy ordered when they're put into \p std::multiset. - struct NodeCmpTy { - bool operator()(const NodeTy &LHS, const NodeTy &RHS) const { - return LHS.Size < RHS.Size; - } - }; - - /// A \p FreeList is a set of Nodes. We're using \p std::multiset here to make - /// the look up procedure more efficient. - using FreeListTy = std::multiset, NodeCmpTy>; - - /// A list of \p FreeListTy entries, each of which is a \p std::multiset of - /// Nodes whose size is less or equal to a specific bucket size. - std::vector FreeLists; - /// A list of mutex for each \p FreeListTy entry - std::vector FreeListLocks; - /// A table to map from a target pointer to its node - std::unordered_map PtrToNodeTable; - /// The mutex for the table \p PtrToNodeTable - std::mutex MapTableLock; - - /// The reference to a device allocator - DeviceAllocatorTy &DeviceAllocator; - - /// The threshold to manage memory using memory manager. If the request size - /// is larger than \p SizeThreshold, the allocation will not be managed by the - /// memory manager. - size_t SizeThreshold = 1U << 13; - - /// Request memory from target device - void *allocateOnDevice(size_t Size, void *HstPtr) const { - return DeviceAllocator.allocate(Size, HstPtr, TARGET_ALLOC_DEVICE); - } - - /// Deallocate data on device - int deleteOnDevice(void *Ptr) const { return DeviceAllocator.free(Ptr); } - - /// This function is called when it tries to allocate memory on device but the - /// device returns out of memory. It will first free all memory in the - /// FreeList and try to allocate again. - void *freeAndAllocate(size_t Size, void *HstPtr) { - std::vector RemoveList; - - // Deallocate all memory in FreeList - for (int I = 0; I < NumBuckets; ++I) { - FreeListTy &List = FreeLists[I]; - std::lock_guard Lock(FreeListLocks[I]); - if (List.empty()) - continue; - for (const NodeTy &N : List) { - deleteOnDevice(N.Ptr); - RemoveList.push_back(N.Ptr); - } - FreeLists[I].clear(); - } - - // Remove all nodes in the map table which have been released - if (!RemoveList.empty()) { - std::lock_guard LG(MapTableLock); - for (void *P : RemoveList) - PtrToNodeTable.erase(P); - } - - // Try allocate memory again - return allocateOnDevice(Size, HstPtr); - } - - /// The goal is to allocate memory on the device. It first tries to - /// allocate directly on the device. If a \p nullptr is returned, it might - /// be because the device is OOM. In that case, it will free all unused - /// memory and then try again. - void *allocateOrFreeAndAllocateOnDevice(size_t Size, void *HstPtr) { - void *TgtPtr = allocateOnDevice(Size, HstPtr); - // We cannot get memory from the device. It might be due to OOM. Let's - // free all memory in FreeLists and try again. - if (TgtPtr == nullptr) { - DP("Failed to get memory on device. Free all memory in FreeLists and " - "try again.\n"); - TgtPtr = freeAndAllocate(Size, HstPtr); - } - - if (TgtPtr == nullptr) - DP("Still cannot get memory on device probably because the device is " - "OOM.\n"); - - return TgtPtr; - } - -public: - /// Constructor. If \p Threshold is non-zero, then the default threshold will - /// be overwritten by \p Threshold. - MemoryManagerTy(DeviceAllocatorTy &DeviceAllocator, size_t Threshold = 0) - : FreeLists(NumBuckets), FreeListLocks(NumBuckets), - DeviceAllocator(DeviceAllocator) { - if (Threshold) - SizeThreshold = Threshold; - } - - /// Destructor - ~MemoryManagerTy() { - for (auto Itr = PtrToNodeTable.begin(); Itr != PtrToNodeTable.end(); - ++Itr) { - assert(Itr->second.Ptr && "nullptr in map table"); - deleteOnDevice(Itr->second.Ptr); - } - } - - /// Allocate memory of size \p Size from target device. \p HstPtr is used to - /// assist the allocation. - void *allocate(size_t Size, void *HstPtr) { - // If the size is zero, we will not bother the target device. Just return - // nullptr directly. - if (Size == 0) - return nullptr; - - DP("MemoryManagerTy::allocate: size %zu with host pointer " DPxMOD ".\n", - Size, DPxPTR(HstPtr)); - - // If the size is greater than the threshold, allocate it directly from - // device. - if (Size > SizeThreshold) { - DP("%zu is greater than the threshold %zu. Allocate it directly from " - "device\n", - Size, SizeThreshold); - void *TgtPtr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr); - - DP("Got target pointer " DPxMOD ". Return directly.\n", DPxPTR(TgtPtr)); - - return TgtPtr; - } - - NodeTy *NodePtr = nullptr; - - // Try to get a node from FreeList - { - const int B = findBucket(Size); - FreeListTy &List = FreeLists[B]; - - NodeTy TempNode(Size, nullptr); - std::lock_guard LG(FreeListLocks[B]); - const auto Itr = List.find(TempNode); - - if (Itr != List.end()) { - NodePtr = &Itr->get(); - List.erase(Itr); - } - } - - if (NodePtr != nullptr) - DP("Find one node " DPxMOD " in the bucket.\n", DPxPTR(NodePtr)); - - // We cannot find a valid node in FreeLists. Let's allocate on device and - // create a node for it. - if (NodePtr == nullptr) { - DP("Cannot find a node in the FreeLists. Allocate on device.\n"); - // Allocate one on device - void *TgtPtr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr); - - if (TgtPtr == nullptr) - return nullptr; - - // Create a new node and add it into the map table - { - std::lock_guard Guard(MapTableLock); - auto Itr = PtrToNodeTable.emplace(TgtPtr, NodeTy(Size, TgtPtr)); - NodePtr = &Itr.first->second; - } - - DP("Node address " DPxMOD ", target pointer " DPxMOD ", size %zu\n", - DPxPTR(NodePtr), DPxPTR(TgtPtr), Size); - } - - assert(NodePtr && "NodePtr should not be nullptr at this point"); - - return NodePtr->Ptr; - } - - /// Deallocate memory pointed by \p TgtPtr - int free(void *TgtPtr) { - DP("MemoryManagerTy::free: target memory " DPxMOD ".\n", DPxPTR(TgtPtr)); - - NodeTy *P = nullptr; - - // Look it up into the table - { - std::lock_guard G(MapTableLock); - auto Itr = PtrToNodeTable.find(TgtPtr); - - // We don't remove the node from the map table because the map does not - // change. - if (Itr != PtrToNodeTable.end()) - P = &Itr->second; - } - - // The memory is not managed by the manager - if (P == nullptr) { - DP("Cannot find its node. Delete it on device directly.\n"); - return deleteOnDevice(TgtPtr); - } - - // Insert the node to the free list - const int B = findBucket(P->Size); - - DP("Found its node " DPxMOD ". Insert it to bucket %d.\n", DPxPTR(P), B); - - { - std::lock_guard G(FreeListLocks[B]); - FreeLists[B].insert(*P); - } - - return OFFLOAD_SUCCESS; - } - - /// Get the size threshold from the environment variable - /// \p LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD . Returns a - /// std::pair where the first element represents the - /// threshold and the second element represents whether user disables memory - /// manager explicitly by setting the var to 0. If user doesn't specify - /// anything, returns <0, true>. - static std::pair getSizeThresholdFromEnv() { - size_t Threshold = 0; - - if (const char *Env = - std::getenv("LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD")) { - Threshold = std::stoul(Env); - if (Threshold == 0) { - DP("Disabled memory manager as user set " - "LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD=0.\n"); - return std::make_pair(0, false); - } - } - - return std::make_pair(Threshold, true); - } -}; - -// GCC still cannot handle the static data member like Clang so we still need -// this part. -constexpr const size_t MemoryManagerTy::BucketSize[]; -constexpr const int MemoryManagerTy::NumBuckets; - -#endif // LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_MEMORYMANAGER_MEMORYMANAGER_H diff --git a/openmp/libomptarget/plugins/common/elf_common/CMakeLists.txt b/openmp/libomptarget/plugins/common/elf_common/CMakeLists.txt deleted file mode 100644 --- a/openmp/libomptarget/plugins/common/elf_common/CMakeLists.txt +++ /dev/null @@ -1,36 +0,0 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Common ELF functionality for target plugins -# -##===----------------------------------------------------------------------===## - -# NOTE: Don't try to build `elf_common` using `add_llvm_library`. -# See openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt -# for more explanation. -add_library(elf_common OBJECT elf_common.cpp ELFSymbols.cpp) - -# This is required when using LLVM libraries. -llvm_update_compile_flags(elf_common) - -if (LLVM_LINK_LLVM_DYLIB) - set(llvm_libs LLVM) -else() - llvm_map_components_to_libnames(llvm_libs BinaryFormat Object Support) -endif() - -target_link_libraries(elf_common PUBLIC ${llvm_libs} ${OPENMP_PTHREAD_LIB}) - -# Build elf_common with PIC to be able to link it with plugin shared libraries. -set_property(TARGET elf_common PROPERTY POSITION_INDEPENDENT_CODE ON) - -# Expose elf_common.h directory to the users of this library. -target_include_directories(elf_common - INTERFACE ${CMAKE_CURRENT_SOURCE_DIR} - PRIVATE ${LIBOMPTARGET_INCLUDE_DIR} -) diff --git a/openmp/libomptarget/plugins/common/elf_common/ELFSymbols.h b/openmp/libomptarget/plugins/common/elf_common/ELFSymbols.h deleted file mode 100644 --- a/openmp/libomptarget/plugins/common/elf_common/ELFSymbols.h +++ /dev/null @@ -1,27 +0,0 @@ -//===-- ELFSymbols.h - ELF Symbol look-up functionality ---------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// ELF routines for obtaining a symbol from an Elf file without loading it. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_ELF_COMMON_ELF_SYMBOLS_H -#define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_ELF_COMMON_ELF_SYMBOLS_H - -#include "llvm/Object/ELF.h" -#include "llvm/Object/ELFObjectFile.h" - -/// Returns the symbol associated with the \p Name in the \p ELFObj. It will -/// first search for the hash sections to identify symbols from the hash table. -/// If that fails it will fall back to a linear search in the case of an -/// executable file without a hash table. -llvm::Expected -getELFSymbol(const llvm::object::ELFObjectFile &ELFObj, - llvm::StringRef Name); - -#endif diff --git a/openmp/libomptarget/plugins/common/elf_common/ELFSymbols.cpp b/openmp/libomptarget/plugins/common/elf_common/ELFSymbols.cpp deleted file mode 100644 --- a/openmp/libomptarget/plugins/common/elf_common/ELFSymbols.cpp +++ /dev/null @@ -1,201 +0,0 @@ -//===-- ELFSymbols.cpp - ELF Symbol look-up functionality -------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "ELFSymbols.h" - -using namespace llvm; -using namespace llvm::object; -using namespace llvm::ELF; - -template -static Expected -getSymbolFromGnuHashTable(StringRef Name, const typename ELFT::GnuHash &HashTab, - ArrayRef SymTab, - StringRef StrTab) { - const uint32_t NameHash = hashGnu(Name); - const typename ELFT::Word NBucket = HashTab.nbuckets; - const typename ELFT::Word SymOffset = HashTab.symndx; - ArrayRef Filter = HashTab.filter(); - ArrayRef Bucket = HashTab.buckets(); - ArrayRef Chain = HashTab.values(SymTab.size()); - - // Check the bloom filter and exit early if the symbol is not present. - uint64_t ElfClassBits = ELFT::Is64Bits ? 64 : 32; - typename ELFT::Off Word = - Filter[(NameHash / ElfClassBits) % HashTab.maskwords]; - uint64_t Mask = (0x1ull << (NameHash % ElfClassBits)) | - (0x1ull << ((NameHash >> HashTab.shift2) % ElfClassBits)); - if ((Word & Mask) != Mask) - return nullptr; - - // The symbol may or may not be present, check the hash values. - for (typename ELFT::Word I = Bucket[NameHash % NBucket]; - I >= SymOffset && I < SymTab.size(); I = I + 1) { - const uint32_t ChainHash = Chain[I - SymOffset]; - - if ((NameHash | 0x1) != (ChainHash | 0x1)) - continue; - - if (SymTab[I].st_name >= StrTab.size()) - return createError("symbol [index " + Twine(I) + - "] has invalid st_name: " + Twine(SymTab[I].st_name)); - if (StrTab.drop_front(SymTab[I].st_name).data() == Name) - return &SymTab[I]; - - if (ChainHash & 0x1) - return nullptr; - } - return nullptr; -} - -template -static Expected -getSymbolFromSysVHashTable(StringRef Name, const typename ELFT::Hash &HashTab, - ArrayRef SymTab, - StringRef StrTab) { - const uint32_t Hash = hashSysV(Name); - const typename ELFT::Word NBucket = HashTab.nbucket; - ArrayRef Bucket = HashTab.buckets(); - ArrayRef Chain = HashTab.chains(); - for (typename ELFT::Word I = Bucket[Hash % NBucket]; I != ELF::STN_UNDEF; - I = Chain[I]) { - if (I >= SymTab.size()) - return createError( - "symbol [index " + Twine(I) + - "] is greater than the number of symbols: " + Twine(SymTab.size())); - if (SymTab[I].st_name >= StrTab.size()) - return createError("symbol [index " + Twine(I) + - "] has invalid st_name: " + Twine(SymTab[I].st_name)); - - if (StrTab.drop_front(SymTab[I].st_name).data() == Name) - return &SymTab[I]; - } - return nullptr; -} - -template -static Expected -getHashTableSymbol(const ELFFile &Elf, const typename ELFT::Shdr &Sec, - StringRef Name) { - if (Sec.sh_type != ELF::SHT_HASH && Sec.sh_type != ELF::SHT_GNU_HASH) - return createError( - "invalid sh_type for hash table, expected SHT_HASH or SHT_GNU_HASH"); - Expected SectionsOrError = Elf.sections(); - if (!SectionsOrError) - return SectionsOrError.takeError(); - - auto SymTabOrErr = getSection(*SectionsOrError, Sec.sh_link); - if (!SymTabOrErr) - return SymTabOrErr.takeError(); - - auto StrTabOrErr = - Elf.getStringTableForSymtab(**SymTabOrErr, *SectionsOrError); - if (!StrTabOrErr) - return StrTabOrErr.takeError(); - StringRef StrTab = *StrTabOrErr; - - auto SymsOrErr = Elf.symbols(*SymTabOrErr); - if (!SymsOrErr) - return SymsOrErr.takeError(); - ArrayRef SymTab = *SymsOrErr; - - // If this is a GNU hash table we verify its size and search the symbol - // table using the GNU hash table format. - if (Sec.sh_type == ELF::SHT_GNU_HASH) { - const typename ELFT::GnuHash *HashTab = - reinterpret_cast(Elf.base() + - Sec.sh_offset); - if (Sec.sh_offset + Sec.sh_size >= Elf.getBufSize()) - return createError("section has invalid sh_offset: " + - Twine(Sec.sh_offset)); - if (Sec.sh_size < sizeof(typename ELFT::GnuHash) || - Sec.sh_size < - sizeof(typename ELFT::GnuHash) + - sizeof(typename ELFT::Word) * HashTab->maskwords + - sizeof(typename ELFT::Word) * HashTab->nbuckets + - sizeof(typename ELFT::Word) * (SymTab.size() - HashTab->symndx)) - return createError("section has invalid sh_size: " + Twine(Sec.sh_size)); - return getSymbolFromGnuHashTable(Name, *HashTab, SymTab, StrTab); - } - - // If this is a Sys-V hash table we verify its size and search the symbol - // table using the Sys-V hash table format. - if (Sec.sh_type == ELF::SHT_HASH) { - const typename ELFT::Hash *HashTab = - reinterpret_cast(Elf.base() + - Sec.sh_offset); - if (Sec.sh_offset + Sec.sh_size >= Elf.getBufSize()) - return createError("section has invalid sh_offset: " + - Twine(Sec.sh_offset)); - if (Sec.sh_size < sizeof(typename ELFT::Hash) || - Sec.sh_size < sizeof(typename ELFT::Hash) + - sizeof(typename ELFT::Word) * HashTab->nbucket + - sizeof(typename ELFT::Word) * HashTab->nchain) - return createError("section has invalid sh_size: " + Twine(Sec.sh_size)); - - return getSymbolFromSysVHashTable(Name, *HashTab, SymTab, StrTab); - } - - return nullptr; -} - -template -static Expected -getSymTableSymbol(const ELFFile &Elf, const typename ELFT::Shdr &Sec, - StringRef Name) { - if (Sec.sh_type != ELF::SHT_SYMTAB && Sec.sh_type != ELF::SHT_DYNSYM) - return createError( - "invalid sh_type for hash table, expected SHT_SYMTAB or SHT_DYNSYM"); - Expected SectionsOrError = Elf.sections(); - if (!SectionsOrError) - return SectionsOrError.takeError(); - - auto StrTabOrErr = Elf.getStringTableForSymtab(Sec, *SectionsOrError); - if (!StrTabOrErr) - return StrTabOrErr.takeError(); - StringRef StrTab = *StrTabOrErr; - - auto SymsOrErr = Elf.symbols(&Sec); - if (!SymsOrErr) - return SymsOrErr.takeError(); - ArrayRef SymTab = *SymsOrErr; - - for (const typename ELFT::Sym &Sym : SymTab) - if (StrTab.drop_front(Sym.st_name).data() == Name) - return &Sym; - - return nullptr; -} - -Expected -getELFSymbol(const ELFObjectFile &ELFObj, StringRef Name) { - // First try to look up the symbol via the hash table. - for (ELFSectionRef Sec : ELFObj.sections()) { - if (Sec.getType() != SHT_HASH && Sec.getType() != SHT_GNU_HASH) - continue; - - auto HashTabOrErr = ELFObj.getELFFile().getSection(Sec.getIndex()); - if (!HashTabOrErr) - return HashTabOrErr.takeError(); - return getHashTableSymbol(ELFObj.getELFFile(), **HashTabOrErr, - Name); - } - - // If this is an executable file check the entire standard symbol table. - for (ELFSectionRef Sec : ELFObj.sections()) { - if (Sec.getType() != SHT_SYMTAB) - continue; - - auto SymTabOrErr = ELFObj.getELFFile().getSection(Sec.getIndex()); - if (!SymTabOrErr) - return SymTabOrErr.takeError(); - return getSymTableSymbol(ELFObj.getELFFile(), **SymTabOrErr, Name); - } - - return nullptr; -} diff --git a/openmp/libomptarget/plugins/common/elf_common/elf_common.h b/openmp/libomptarget/plugins/common/elf_common/elf_common.h deleted file mode 100644 --- a/openmp/libomptarget/plugins/common/elf_common/elf_common.h +++ /dev/null @@ -1,27 +0,0 @@ -//===-- elf_common.h - Common ELF functionality -----------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Common ELF functionality for target plugins. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_ELF_COMMON_ELF_COMMON_H -#define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_ELF_COMMON_ELF_COMMON_H - -#include "omptargetplugin.h" -#include - -/// Return non-zero, if the given \p image is an ELF object, which -/// e_machine matches \p target_id; return zero otherwise. -EXTERN int32_t elf_check_machine(__tgt_device_image *Image, uint16_t TargetId); - -/// Return non-zero, if the given \p image is an ET_DYN ELF object; -/// return zero otherwise. -EXTERN int32_t elf_is_dynamic(__tgt_device_image *Image); - -#endif // LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_ELF_COMMON_ELF_COMMON_H diff --git a/openmp/libomptarget/plugins/common/elf_common/elf_common.cpp b/openmp/libomptarget/plugins/common/elf_common/elf_common.cpp deleted file mode 100644 --- a/openmp/libomptarget/plugins/common/elf_common/elf_common.cpp +++ /dev/null @@ -1,88 +0,0 @@ -//===-- elf_common.cpp - Common ELF functionality -------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Common ELF functionality for target plugins. -// -//===----------------------------------------------------------------------===// -#include "elf_common.h" -#include "Debug.h" - -#include "llvm/BinaryFormat/Magic.h" -#include "llvm/Object/Binary.h" -#include "llvm/Object/ELFObjectFile.h" -#include "llvm/Object/ELFTypes.h" -#include "llvm/Object/ObjectFile.h" -#include "llvm/Support/MemoryBuffer.h" - -#ifndef TARGET_NAME -#define TARGET_NAME ELF Common -#endif -#define DEBUG_PREFIX "TARGET " GETNAME(TARGET_NAME) - -using namespace llvm; -using namespace llvm::ELF; -using namespace llvm::object; - -/// If the given range of bytes [\p BytesBegin, \p BytesEnd) represents -/// a valid ELF, then invoke \p Callback on the ELFObjectFileBase -/// created from this range, otherwise, return 0. -/// If \p Callback is invoked, then return whatever value \p Callback returns. -template -static int32_t withBytesAsElf(char *BytesBegin, char *BytesEnd, F Callback) { - size_t Size = BytesEnd - BytesBegin; - StringRef StrBuf(BytesBegin, Size); - - auto Magic = identify_magic(StrBuf); - if (Magic != file_magic::elf && Magic != file_magic::elf_relocatable && - Magic != file_magic::elf_executable && - Magic != file_magic::elf_shared_object && Magic != file_magic::elf_core) { - DP("Not an ELF image!\n"); - return 0; - } - - std::unique_ptr MemBuf = - MemoryBuffer::getMemBuffer(StrBuf, "", false); - Expected> BinOrErr = - ObjectFile::createELFObjectFile(MemBuf->getMemBufferRef(), - /*InitContent=*/false); - if (!BinOrErr) { - DP("Unable to get ELF handle: %s!\n", - toString(BinOrErr.takeError()).c_str()); - return 0; - } - - auto *Object = dyn_cast(BinOrErr->get()); - - if (!Object) { - DP("Unknown ELF format!\n"); - return 0; - } - - return Callback(Object); -} - -// Check whether an image is valid for execution on target_id -int32_t elf_check_machine(__tgt_device_image *Image, uint16_t TargetId) { - auto CheckMachine = [TargetId](const ELFObjectFileBase *Object) { - return TargetId == Object->getEMachine(); - }; - return withBytesAsElf(reinterpret_cast(Image->ImageStart), - reinterpret_cast(Image->ImageEnd), - CheckMachine); -} - -int32_t elf_is_dynamic(__tgt_device_image *Image) { - auto CheckDynType = [](const ELFObjectFileBase *Object) { - uint16_t Type = Object->getEType(); - DP("ELF Type: %d\n", Type); - return Type == ET_DYN; - }; - return withBytesAsElf(reinterpret_cast(Image->ImageStart), - reinterpret_cast(Image->ImageEnd), - CheckDynType); -} diff --git a/openmp/libomptarget/plugins/cuda/CMakeLists.txt b/openmp/libomptarget/plugins/cuda/CMakeLists.txt deleted file mode 100644 --- a/openmp/libomptarget/plugins/cuda/CMakeLists.txt +++ /dev/null @@ -1,77 +0,0 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Build a plugin for a CUDA machine if available. -# -##===----------------------------------------------------------------------===## -set(LIBOMPTARGET_BUILD_CUDA_PLUGIN TRUE CACHE BOOL - "Whether to build CUDA plugin") -if (NOT LIBOMPTARGET_BUILD_CUDA_PLUGIN) - libomptarget_say("Not building CUDA offloading plugin: LIBOMPTARGET_BUILD_CUDA_PLUGIN is false") - return() -endif() - -if (NOT(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux")) - libomptarget_say("Not building CUDA offloading plugin: only support CUDA in Linux x86_64, ppc64le, or aarch64 hosts.") - return() -endif() - -libomptarget_say("Building CUDA offloading plugin.") - -set(LIBOMPTARGET_DLOPEN_LIBCUDA OFF) -option(LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA "Build with dlopened libcuda" ${LIBOMPTARGET_DLOPEN_LIBCUDA}) - -add_llvm_library(omptarget.rtl.cuda SHARED - src/rtl.cpp - - LINK_COMPONENTS - Support - Object - - LINK_LIBS PRIVATE - elf_common - MemoryManager - ${OPENMP_PTHREAD_LIB} - "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports,-z,defs" - - NO_INSTALL_RPATH -) - -if(LIBOMPTARGET_DEP_CUDA_FOUND AND NOT LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA) - libomptarget_say("Building CUDA plugin linked against libcuda") - target_link_libraries(omptarget.rtl.cuda PRIVATE CUDA::cuda_driver) -else() - libomptarget_say("Building CUDA plugin for dlopened libcuda") - target_include_directories(omptarget.rtl.cuda PRIVATE dynamic_cuda) - target_sources(omptarget.rtl.cuda PRIVATE dynamic_cuda/cuda.cpp) -endif() -add_dependencies(omptarget.rtl.cuda omptarget.devicertl.nvptx) - -# Define the suffix for the runtime messaging dumps. -target_compile_definitions(omptarget.rtl.cuda PRIVATE TARGET_NAME="CUDA") -target_include_directories(omptarget.rtl.cuda PRIVATE ${LIBOMPTARGET_INCLUDE_DIR}) - -# Install plugin under the lib destination folder. -install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") -set_target_properties(omptarget.rtl.cuda PROPERTIES - INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.." - CXX_VISIBILITY_PRESET protected) - -# Report to the parent scope that we are building a plugin for CUDA. -# This controls whether tests are run for the nvptx offloading target -# Run them if libcuda is available, or if the user explicitly asked for dlopen -# Otherwise this plugin is being built speculatively and there may be no cuda available -option(LIBOMPTARGET_FORCE_NVIDIA_TESTS "Build NVIDIA libomptarget tests" OFF) -if (LIBOMPTARGET_FOUND_NVIDIA_GPU OR LIBOMPTARGET_FORCE_NVIDIA_TESTS) - libomptarget_say("Enable tests using CUDA plugin") - set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda nvptx64-nvidia-cuda-LTO" PARENT_SCOPE) - list(APPEND LIBOMPTARGET_TESTED_PLUGINS "omptarget.rtl.cuda") - set(LIBOMPTARGET_TESTED_PLUGINS "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE) -else() - libomptarget_say("Disabling tests using CUDA plugin as cuda may not be available") -endif() diff --git a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h deleted file mode 100644 --- a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h +++ /dev/null @@ -1,271 +0,0 @@ -//===--- cuda/dynamic_cuda/cuda.h --------------------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// The parts of the cuda api that are presently in use by the openmp cuda plugin -// -//===----------------------------------------------------------------------===// - -#ifndef DYNAMIC_CUDA_CUDA_H_INCLUDED -#define DYNAMIC_CUDA_CUDA_H_INCLUDED - -#include -#include - -typedef int CUdevice; -typedef uintptr_t CUdeviceptr; -typedef struct CUmod_st *CUmodule; -typedef struct CUctx_st *CUcontext; -typedef struct CUfunc_st *CUfunction; -typedef struct CUstream_st *CUstream; -typedef struct CUevent_st *CUevent; - -#define CU_DEVICE_INVALID ((CUdevice)-2) - -typedef enum cudaError_enum { - CUDA_SUCCESS = 0, - CUDA_ERROR_INVALID_VALUE = 1, - CUDA_ERROR_NO_DEVICE = 100, - CUDA_ERROR_INVALID_HANDLE = 400, - CUDA_ERROR_NOT_READY = 600, - CUDA_ERROR_TOO_MANY_PEERS = 711, -} CUresult; - -typedef enum CUstream_flags_enum { - CU_STREAM_DEFAULT = 0x0, - CU_STREAM_NON_BLOCKING = 0x1, -} CUstream_flags; - -typedef enum CUlimit_enum { - CU_LIMIT_STACK_SIZE = 0x0, - CU_LIMIT_PRINTF_FIFO_SIZE = 0x1, - CU_LIMIT_MALLOC_HEAP_SIZE = 0x2, - CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH = 0x3, - CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT = 0x4, - CU_LIMIT_MAX_L2_FETCH_GRANULARITY = 0x5, - CU_LIMIT_PERSISTING_L2_CACHE_SIZE = 0x6, - CU_LIMIT_MAX -} CUlimit; - -typedef enum CUdevice_attribute_enum { - CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4, - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, - CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8, - CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, - CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, - CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, - CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, - CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, - CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, - CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, - CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, - CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, - CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, - CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, - CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, - CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, - CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, - CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, - CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, - CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32, - CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33, - CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34, - CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35, - CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36, - CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37, - CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38, - CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39, - CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40, - CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43, - CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49, - CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50, - CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77, - CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78, - CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79, - CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80, - CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81, - CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82, - CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83, - CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84, - CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85, - CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86, - CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87, - CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88, - CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89, - CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90, - CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91, - CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = 92, - CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 93, - CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 94, - CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95, - CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96, - CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97, - CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98, - CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99, - CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 100, - CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101, - CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED = 102, - CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED = 102, - CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED = 103, - CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED = 104, - CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED = 105, - CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR = 106, - CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED = 107, - CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE = 108, - CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE = 109, - CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED = 110, - CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK = 111, - CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED = 112, - CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED = 113, - CU_DEVICE_ATTRIBUTE_TIMELINE_SEMAPHORE_INTEROP_SUPPORTED = 114, - CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED = 115, - CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED = 116, - CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS = 117, - CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING = 118, - CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES = 119, - CU_DEVICE_ATTRIBUTE_MAX, -} CUdevice_attribute; - -typedef enum CUfunction_attribute_enum { - CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0, -} CUfunction_attribute; - -typedef enum CUctx_flags_enum { - CU_CTX_SCHED_BLOCKING_SYNC = 0x04, - CU_CTX_SCHED_MASK = 0x07, -} CUctx_flags; - -typedef enum CUmemAttach_flags_enum { - CU_MEM_ATTACH_GLOBAL = 0x1, - CU_MEM_ATTACH_HOST = 0x2, - CU_MEM_ATTACH_SINGLE = 0x4, -} CUmemAttach_flags; - -typedef enum CUcomputeMode_enum { - CU_COMPUTEMODE_DEFAULT = 0, - CU_COMPUTEMODE_PROHIBITED = 2, - CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3, -} CUcompute_mode; - -typedef enum CUevent_flags_enum { - CU_EVENT_DEFAULT = 0x0, - CU_EVENT_BLOCKING_SYNC = 0x1, - CU_EVENT_DISABLE_TIMING = 0x2, - CU_EVENT_INTERPROCESS = 0x4 -} CUevent_flags; - -CUresult cuCtxGetDevice(CUdevice *); -CUresult cuDeviceGet(CUdevice *, int); -CUresult cuDeviceGetAttribute(int *, CUdevice_attribute, CUdevice); -CUresult cuDeviceGetCount(int *); -CUresult cuFuncGetAttribute(int *, CUfunction_attribute, CUfunction); - -// Device info -CUresult cuDeviceGetName(char *, int, CUdevice); -CUresult cuDeviceTotalMem(size_t *, CUdevice); -CUresult cuDriverGetVersion(int *); - -CUresult cuGetErrorString(CUresult, const char **); -CUresult cuInit(unsigned); -CUresult cuLaunchKernel(CUfunction, unsigned, unsigned, unsigned, unsigned, - unsigned, unsigned, unsigned, CUstream, void **, - void **); - -CUresult cuMemAlloc(CUdeviceptr *, size_t); -CUresult cuMemAllocHost(void **, size_t); -CUresult cuMemAllocManaged(CUdeviceptr *, size_t, unsigned int); - -CUresult cuMemcpyDtoDAsync(CUdeviceptr, CUdeviceptr, size_t, CUstream); -CUresult cuMemcpyDtoH(void *, CUdeviceptr, size_t); -CUresult cuMemcpyDtoHAsync(void *, CUdeviceptr, size_t, CUstream); -CUresult cuMemcpyHtoD(CUdeviceptr, const void *, size_t); -CUresult cuMemcpyHtoDAsync(CUdeviceptr, const void *, size_t, CUstream); - -CUresult cuMemFree(CUdeviceptr); -CUresult cuMemFreeHost(void *); - -CUresult cuModuleGetFunction(CUfunction *, CUmodule, const char *); -CUresult cuModuleGetGlobal(CUdeviceptr *, size_t *, CUmodule, const char *); - -CUresult cuModuleUnload(CUmodule); -CUresult cuStreamCreate(CUstream *, unsigned); -CUresult cuStreamDestroy(CUstream); -CUresult cuStreamSynchronize(CUstream); -CUresult cuStreamQuery(CUstream); -CUresult cuCtxSetCurrent(CUcontext); -CUresult cuDevicePrimaryCtxRelease(CUdevice); -CUresult cuDevicePrimaryCtxGetState(CUdevice, unsigned *, int *); -CUresult cuDevicePrimaryCtxSetFlags(CUdevice, unsigned); -CUresult cuDevicePrimaryCtxRetain(CUcontext *, CUdevice); -CUresult cuModuleLoadDataEx(CUmodule *, const void *, unsigned, void *, - void **); - -CUresult cuDeviceCanAccessPeer(int *, CUdevice, CUdevice); -CUresult cuCtxEnablePeerAccess(CUcontext, unsigned); -CUresult cuMemcpyPeerAsync(CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, - size_t, CUstream); - -CUresult cuCtxGetLimit(size_t *, CUlimit); -CUresult cuCtxSetLimit(CUlimit, size_t); - -CUresult cuEventCreate(CUevent *, unsigned int); -CUresult cuEventRecord(CUevent, CUstream); -CUresult cuStreamWaitEvent(CUstream, CUevent, unsigned int); -CUresult cuEventSynchronize(CUevent); -CUresult cuEventDestroy(CUevent); - -#endif diff --git a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp deleted file mode 100644 --- a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp +++ /dev/null @@ -1,152 +0,0 @@ -//===--- cuda/dynamic_cuda/cuda.pp ------------------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Implement subset of cuda api by calling into cuda library via dlopen -// Does the dlopen/dlsym calls as part of the call to cuInit -// -//===----------------------------------------------------------------------===// - -#include "llvm/Support/DynamicLibrary.h" - -#include "Debug.h" -#include "cuda.h" -#include "dlwrap.h" - -#include -#include -#include - -DLWRAP_INITIALIZE() - -DLWRAP_INTERNAL(cuInit, 1) - -DLWRAP(cuCtxGetDevice, 1) -DLWRAP(cuDeviceGet, 2) -DLWRAP(cuDeviceGetAttribute, 3) -DLWRAP(cuDeviceGetCount, 1) -DLWRAP(cuFuncGetAttribute, 3) - -// Device info -DLWRAP(cuDeviceGetName, 3) -DLWRAP(cuDeviceTotalMem, 2) -DLWRAP(cuDriverGetVersion, 1) - -DLWRAP(cuGetErrorString, 2) -DLWRAP(cuLaunchKernel, 11) - -DLWRAP(cuMemAlloc, 2) -DLWRAP(cuMemAllocHost, 2) -DLWRAP(cuMemAllocManaged, 3) - -DLWRAP(cuMemcpyDtoDAsync, 4) -DLWRAP(cuMemcpyDtoH, 3) -DLWRAP(cuMemcpyDtoHAsync, 4) -DLWRAP(cuMemcpyHtoD, 3) -DLWRAP(cuMemcpyHtoDAsync, 4) - -DLWRAP(cuMemFree, 1) -DLWRAP(cuMemFreeHost, 1) -DLWRAP(cuModuleGetFunction, 3) -DLWRAP(cuModuleGetGlobal, 4) - -DLWRAP(cuModuleUnload, 1) -DLWRAP(cuStreamCreate, 2) -DLWRAP(cuStreamDestroy, 1) -DLWRAP(cuStreamSynchronize, 1) -DLWRAP(cuStreamQuery, 1) -DLWRAP(cuCtxSetCurrent, 1) -DLWRAP(cuDevicePrimaryCtxRelease, 1) -DLWRAP(cuDevicePrimaryCtxGetState, 3) -DLWRAP(cuDevicePrimaryCtxSetFlags, 2) -DLWRAP(cuDevicePrimaryCtxRetain, 2) -DLWRAP(cuModuleLoadDataEx, 5) - -DLWRAP(cuDeviceCanAccessPeer, 3) -DLWRAP(cuCtxEnablePeerAccess, 2) -DLWRAP(cuMemcpyPeerAsync, 6) - -DLWRAP(cuCtxGetLimit, 2) -DLWRAP(cuCtxSetLimit, 2) - -DLWRAP(cuEventCreate, 2) -DLWRAP(cuEventRecord, 2) -DLWRAP(cuStreamWaitEvent, 3) -DLWRAP(cuEventSynchronize, 1) -DLWRAP(cuEventDestroy, 1) - -DLWRAP_FINALIZE() - -#ifndef DYNAMIC_CUDA_PATH -#define DYNAMIC_CUDA_PATH "libcuda.so" -#endif - -#define TARGET_NAME CUDA -#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL" - -static bool checkForCUDA() { - // return true if dlopen succeeded and all functions found - - // Prefer _v2 versions of functions if found in the library - std::unordered_map TryFirst = { - {"cuMemAlloc", "cuMemAlloc_v2"}, - {"cuMemFree", "cuMemFree_v2"}, - {"cuMemcpyDtoH", "cuMemcpyDtoH_v2"}, - {"cuMemcpyHtoD", "cuMemcpyHtoD_v2"}, - {"cuStreamDestroy", "cuStreamDestroy_v2"}, - {"cuModuleGetGlobal", "cuModuleGetGlobal_v2"}, - {"cuMemcpyDtoHAsync", "cuMemcpyDtoHAsync_v2"}, - {"cuMemcpyDtoDAsync", "cuMemcpyDtoDAsync_v2"}, - {"cuMemcpyHtoDAsync", "cuMemcpyHtoDAsync_v2"}, - {"cuDevicePrimaryCtxRelease", "cuDevicePrimaryCtxRelease_v2"}, - {"cuDevicePrimaryCtxSetFlags", "cuDevicePrimaryCtxSetFlags_v2"}, - }; - - const char *CudaLib = DYNAMIC_CUDA_PATH; - std::string ErrMsg; - auto DynlibHandle = std::make_unique( - llvm::sys::DynamicLibrary::getPermanentLibrary(CudaLib, &ErrMsg)); - if (!DynlibHandle->isValid()) { - DP("Unable to load library '%s': %s!\n", CudaLib, ErrMsg.c_str()); - return false; - } - - for (size_t I = 0; I < dlwrap::size(); I++) { - const char *Sym = dlwrap::symbol(I); - - auto It = TryFirst.find(Sym); - if (It != TryFirst.end()) { - const char *First = It->second; - void *P = DynlibHandle->getAddressOfSymbol(First); - if (P) { - DP("Implementing %s with dlsym(%s) -> %p\n", Sym, First, P); - *dlwrap::pointer(I) = P; - continue; - } - } - - void *P = DynlibHandle->getAddressOfSymbol(Sym); - if (P == nullptr) { - DP("Unable to find '%s' in '%s'!\n", Sym, CudaLib); - return false; - } - DP("Implementing %s with dlsym(%s) -> %p\n", Sym, Sym, P); - - *dlwrap::pointer(I) = P; - } - - return true; -} - -CUresult cuInit(unsigned X) { - // Note: Called exactly once from cuda rtl.cpp in a global constructor so - // does not need to handle being called repeatedly or concurrently - if (!checkForCUDA()) { - return CUDA_ERROR_INVALID_HANDLE; - } - return dlwrap_cuInit(X); -} diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp deleted file mode 100644 --- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp +++ /dev/null @@ -1,1906 +0,0 @@ -//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// RTL for CUDA machine -// -//===----------------------------------------------------------------------===// - -#include "llvm/ADT/StringRef.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "Debug.h" -#include "DeviceEnvironment.h" -#include "omptarget.h" -#include "omptargetplugin.h" - -#define TARGET_NAME CUDA -#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL" - -#include "MemoryManager.h" - -#include "llvm/Frontend/OpenMP/OMPConstants.h" - -using namespace llvm; - -// Utility for retrieving and printing CUDA error string. -#ifdef OMPTARGET_DEBUG -#define CUDA_ERR_STRING(err) \ - do { \ - if (getDebugLevel() > 0) { \ - const char *errStr = nullptr; \ - CUresult errStr_status = cuGetErrorString(err, &errStr); \ - if (errStr_status == CUDA_ERROR_INVALID_VALUE) \ - REPORT("Unrecognized CUDA error code: %d\n", err); \ - else if (errStr_status == CUDA_SUCCESS) \ - REPORT("CUDA error is: %s\n", errStr); \ - else { \ - REPORT("Unresolved CUDA error code: %d\n", err); \ - REPORT("Unsuccessful cuGetErrorString return status: %d\n", \ - errStr_status); \ - } \ - } else { \ - const char *errStr = nullptr; \ - CUresult errStr_status = cuGetErrorString(err, &errStr); \ - if (errStr_status == CUDA_SUCCESS) \ - REPORT("%s \n", errStr); \ - } \ - } while (false) -#else // OMPTARGET_DEBUG -#define CUDA_ERR_STRING(err) \ - do { \ - const char *errStr = nullptr; \ - CUresult errStr_status = cuGetErrorString(err, &errStr); \ - if (errStr_status == CUDA_SUCCESS) \ - REPORT("%s \n", errStr); \ - } while (false) -#endif // OMPTARGET_DEBUG - -#define BOOL2TEXT(b) ((b) ? "Yes" : "No") - -#include "elf_common.h" - -/// Keep entries table per device. -struct FuncOrGblEntryTy { - __tgt_target_table Table; - std::vector<__tgt_offload_entry> Entries; -}; - -/// Use a single entity to encode a kernel and a set of flags. -struct KernelTy { - CUfunction Func; - - // execution mode of kernel - llvm::omp::OMPTgtExecModeFlags ExecutionMode; - - /// Maximal number of threads per block for this kernel. - int MaxThreadsPerBlock = 0; - - KernelTy(CUfunction Func, llvm::omp::OMPTgtExecModeFlags ExecutionMode) - : Func(Func), ExecutionMode(ExecutionMode) {} -}; - -namespace { -bool checkResult(CUresult Err, const char *ErrMsg) { - if (Err == CUDA_SUCCESS) - return true; - - REPORT("%s", ErrMsg); - CUDA_ERR_STRING(Err); - return false; -} - -int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size, - CUstream Stream) { - CUresult Err = - cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream); - - if (Err != CUDA_SUCCESS) { - DP("Error when copying data from device to device. Pointers: src " - "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n", - DPxPTR(SrcPtr), DPxPTR(DstPtr), Size); - CUDA_ERR_STRING(Err); - return OFFLOAD_FAIL; - } - - return OFFLOAD_SUCCESS; -} - -int recordEvent(void *EventPtr, __tgt_async_info *AsyncInfo) { - CUstream Stream = reinterpret_cast(AsyncInfo->Queue); - CUevent Event = reinterpret_cast(EventPtr); - - CUresult Err = cuEventRecord(Event, Stream); - if (Err != CUDA_SUCCESS) { - DP("Error when recording event. stream = " DPxMOD ", event = " DPxMOD "\n", - DPxPTR(Stream), DPxPTR(Event)); - CUDA_ERR_STRING(Err); - return OFFLOAD_FAIL; - } - - return OFFLOAD_SUCCESS; -} - -int syncEvent(void *EventPtr) { - CUevent Event = reinterpret_cast(EventPtr); - - CUresult Err = cuEventSynchronize(Event); - if (Err != CUDA_SUCCESS) { - DP("Error when syncing event = " DPxMOD "\n", DPxPTR(Event)); - CUDA_ERR_STRING(Err); - return OFFLOAD_FAIL; - } - - return OFFLOAD_SUCCESS; -} - -namespace { - -// Structure contains per-device data -struct DeviceDataTy { - /// List that contains all the kernels. - std::list KernelsList; - - std::list FuncGblEntries; - - CUcontext Context = nullptr; - // Device properties - unsigned int ThreadsPerBlock = 0; - unsigned int BlocksPerGrid = 0; - unsigned int WarpSize = 0; - // OpenMP properties - unsigned int NumTeams = 0; - unsigned int NumThreads = 0; -}; - -/// Resource allocator where \p T is the resource type. -/// Functions \p create and \p destroy return OFFLOAD_SUCCESS and OFFLOAD_FAIL -/// accordingly. The implementation should not raise any exception. -template struct AllocatorTy { - using ElementTy = T; - virtual ~AllocatorTy() {} - - /// Create a resource and assign to R. - virtual int create(T &R) noexcept = 0; - /// Destroy the resource. - virtual int destroy(T) noexcept = 0; -}; - -/// Allocator for CUstream. -struct StreamAllocatorTy final : public AllocatorTy { - /// See AllocatorTy::create. - int create(CUstream &Stream) noexcept override { - if (!checkResult(cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING), - "Error returned from cuStreamCreate\n")) - return OFFLOAD_FAIL; - - return OFFLOAD_SUCCESS; - } - - /// See AllocatorTy::destroy. - int destroy(CUstream Stream) noexcept override { - if (!checkResult(cuStreamDestroy(Stream), - "Error returned from cuStreamDestroy\n")) - return OFFLOAD_FAIL; - - return OFFLOAD_SUCCESS; - } -}; - -/// Allocator for CUevent. -struct EventAllocatorTy final : public AllocatorTy { - /// See AllocatorTy::create. - int create(CUevent &Event) noexcept override { - if (!checkResult(cuEventCreate(&Event, CU_EVENT_DEFAULT), - "Error returned from cuEventCreate\n")) - return OFFLOAD_FAIL; - - return OFFLOAD_SUCCESS; - } - - /// See AllocatorTy::destroy. - int destroy(CUevent Event) noexcept override { - if (!checkResult(cuEventDestroy(Event), - "Error returned from cuEventDestroy\n")) - return OFFLOAD_FAIL; - - return OFFLOAD_SUCCESS; - } -}; - -/// A generic pool of resources where \p T is the resource type. -/// \p T should be copyable as the object is stored in \p std::vector . -template class ResourcePoolTy { - using ElementTy = typename AllocTy::ElementTy; - /// Index of the next available resource. - size_t Next = 0; - /// Mutex to guard the pool. - std::mutex Mutex; - /// Pool of resources. The difference between \p Resources and \p Pool is, - /// when a resource is acquired and released, it is all on \p Resources. When - /// a batch of new resources are needed, they are both added to \p Resources - /// and \p Pool. The reason for this setting is, \p Resources could contain - /// redundant elements because resources are not released, which can cause - /// double free. This setting makes sure that \p Pool always has every - /// resource allocated from the device. - std::vector Resources; - std::vector Pool; - /// A reference to the corresponding allocator. - AllocTy Allocator; - - /// If `Resources` is used up, we will fill in more resources. It assumes that - /// the new size `Size` should be always larger than the current size. - bool resize(size_t Size) { - assert(Resources.size() == Pool.size() && "size mismatch"); - auto CurSize = Resources.size(); - assert(Size > CurSize && "Unexpected smaller size"); - Pool.reserve(Size); - Resources.reserve(Size); - for (auto I = CurSize; I < Size; ++I) { - ElementTy NewItem; - int Ret = Allocator.create(NewItem); - if (Ret != OFFLOAD_SUCCESS) - return false; - Pool.push_back(NewItem); - Resources.push_back(NewItem); - } - return true; - } - -public: - ResourcePoolTy(AllocTy &&A, size_t Size = 0) noexcept - : Allocator(std::move(A)) { - if (Size) - (void)resize(Size); - } - - ~ResourcePoolTy() noexcept { clear(); } - - /// Get a resource from pool. `Next` always points to the next available - /// resource. That means, `[0, next-1]` have been assigned, and `[id,]` are - /// still available. If there is no resource left, we will ask for more. Each - /// time a resource is assigned, the id will increase one. - /// xxxxxs+++++++++ - /// ^ - /// Next - /// After assignment, the pool becomes the following and s is assigned. - /// xxxxxs+++++++++ - /// ^ - /// Next - int acquire(ElementTy &R) noexcept { - std::lock_guard LG(Mutex); - if (Next == Resources.size()) { - auto NewSize = Resources.size() ? Resources.size() * 2 : 1; - if (!resize(NewSize)) - return OFFLOAD_FAIL; - } - - assert(Next < Resources.size()); - - R = Resources[Next++]; - - return OFFLOAD_SUCCESS; - } - - /// Return the resource back to the pool. When we return a resource, we need - /// to first decrease `Next`, and then copy the resource back. It is worth - /// noting that, the order of resources return might be different from that - /// they're assigned, that saying, at some point, there might be two identical - /// resources. - /// xxax+a+++++ - /// ^ - /// Next - /// However, it doesn't matter, because they're always on the two sides of - /// `Next`. The left one will in the end be overwritten by another resource. - /// Therefore, after several execution, the order of pool might be different - /// from its initial state. - void release(ElementTy R) noexcept { - std::lock_guard LG(Mutex); - Resources[--Next] = R; - } - - /// Released all stored resources and clear the pool. - /// Note: This function is not thread safe. Be sure to guard it if necessary. - void clear() noexcept { - for (auto &R : Pool) - (void)Allocator.destroy(R); - Pool.clear(); - Resources.clear(); - } -}; - -} // namespace - -class DeviceRTLTy { - int NumberOfDevices; - // OpenMP environment properties - int EnvNumTeams; - unsigned int EnvTeamLimit; - unsigned int EnvTeamThreadLimit; - // OpenMP requires flags - int64_t RequiresFlags; - // Amount of dynamic shared memory to use at launch. - uint64_t DynamicMemorySize; - - /// Number of initial streams for each device. - int NumInitialStreams = 32; - - /// Number of initial events for each device. - int NumInitialEvents = 8; - - static constexpr const int32_t HardThreadLimit = 1024; - static constexpr const int32_t DefaultNumTeams = 128; - static constexpr const int32_t DefaultNumThreads = 128; - - using StreamPoolTy = ResourcePoolTy; - std::vector> StreamPool; - - using EventPoolTy = ResourcePoolTy; - std::vector> EventPool; - - std::vector DeviceData; - std::vector> Modules; - - /// Vector of flags indicating the initalization status of all associated - /// devices. - std::vector InitializedFlags; - - enum class PeerAccessState : uint8_t { Unkown, Yes, No }; - std::vector> PeerAccessMatrix; - std::mutex PeerAccessMatrixLock; - - /// A class responsible for interacting with device native runtime library to - /// allocate and free memory. - class CUDADeviceAllocatorTy : public DeviceAllocatorTy { - public: - void *allocate(size_t Size, void *, TargetAllocTy Kind) override { - if (Size == 0) - return nullptr; - - void *MemAlloc = nullptr; - CUresult Err; - switch (Kind) { - case TARGET_ALLOC_DEFAULT: - case TARGET_ALLOC_DEVICE: - CUdeviceptr DevicePtr; - Err = cuMemAlloc(&DevicePtr, Size); - MemAlloc = (void *)DevicePtr; - if (!checkResult(Err, "Error returned from cuMemAlloc\n")) - return nullptr; - break; - case TARGET_ALLOC_HOST: - void *HostPtr; - Err = cuMemAllocHost(&HostPtr, Size); - MemAlloc = HostPtr; - if (!checkResult(Err, "Error returned from cuMemAllocHost\n")) - return nullptr; - break; - case TARGET_ALLOC_SHARED: - CUdeviceptr SharedPtr; - Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL); - MemAlloc = (void *)SharedPtr; - if (!checkResult(Err, "Error returned from cuMemAllocManaged\n")) - return nullptr; - break; - } - - return MemAlloc; - } - - int free(void *TgtPtr, TargetAllocTy Kind) override { - CUresult Err; - // Host pinned memory must be freed differently. - switch (Kind) { - case TARGET_ALLOC_DEFAULT: - case TARGET_ALLOC_DEVICE: - case TARGET_ALLOC_SHARED: - Err = cuMemFree((CUdeviceptr)TgtPtr); - if (!checkResult(Err, "Error returned from cuMemFree\n")) - return OFFLOAD_FAIL; - break; - case TARGET_ALLOC_HOST: - Err = cuMemFreeHost(TgtPtr); - if (!checkResult(Err, "Error returned from cuMemFreeHost\n")) - return OFFLOAD_FAIL; - break; - } - - return OFFLOAD_SUCCESS; - } - }; - - /// A vector of device allocators - std::vector DeviceAllocators; - - /// A vector of memory managers. Since the memory manager is non-copyable and - // non-removable, we wrap them into std::unique_ptr. - std::vector> MemoryManagers; - - /// Whether use memory manager - bool UseMemoryManager = true; - - // Record entry point associated with device - void addOffloadEntry(const int DeviceId, const __tgt_offload_entry Entry) { - FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); - E.Entries.push_back(Entry); - } - - // Return a pointer to the entry associated with the pointer - const __tgt_offload_entry *getOffloadEntry(const int DeviceId, - const void *Addr) const { - for (const __tgt_offload_entry &Itr : - DeviceData[DeviceId].FuncGblEntries.back().Entries) - if (Itr.addr == Addr) - return &Itr; - - return nullptr; - } - - // Return the pointer to the target entries table - __tgt_target_table *getOffloadEntriesTable(const int DeviceId) { - FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); - - if (E.Entries.empty()) - return nullptr; - - // Update table info according to the entries and return the pointer - E.Table.EntriesBegin = E.Entries.data(); - E.Table.EntriesEnd = E.Entries.data() + E.Entries.size(); - - return &E.Table; - } - - // Clear entries table for a device - void clearOffloadEntriesTable(const int DeviceId) { - DeviceData[DeviceId].FuncGblEntries.emplace_back(); - FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); - E.Entries.clear(); - E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr; - } - -public: - CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const { - assert(AsyncInfo && "AsyncInfo is nullptr"); - - if (!AsyncInfo->Queue) { - CUstream S; - if (StreamPool[DeviceId]->acquire(S) != OFFLOAD_SUCCESS) - return nullptr; - - AsyncInfo->Queue = S; - } - - return reinterpret_cast(AsyncInfo->Queue); - } - - // This class should not be copied - DeviceRTLTy(const DeviceRTLTy &) = delete; - DeviceRTLTy(DeviceRTLTy &&) = delete; - - DeviceRTLTy() - : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1), - EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED), - DynamicMemorySize(0) { - - DP("Start initializing CUDA\n"); - - CUresult Err = cuInit(0); - if (Err == CUDA_ERROR_INVALID_HANDLE) { - // Can't call cuGetErrorString if dlsym failed - DP("Failed to load CUDA shared library\n"); - return; - } - if (Err == CUDA_ERROR_NO_DEVICE) { - DP("There are no devices supporting CUDA.\n"); - return; - } - if (!checkResult(Err, "Error returned from cuInit\n")) { - return; - } - - Err = cuDeviceGetCount(&NumberOfDevices); - if (!checkResult(Err, "Error returned from cuDeviceGetCount\n")) - return; - - if (NumberOfDevices == 0) { - DP("There are no devices supporting CUDA.\n"); - return; - } - - DeviceData.resize(NumberOfDevices); - Modules.resize(NumberOfDevices); - StreamPool.resize(NumberOfDevices); - EventPool.resize(NumberOfDevices); - PeerAccessMatrix.resize(NumberOfDevices); - for (auto &V : PeerAccessMatrix) - V.resize(NumberOfDevices, PeerAccessState::Unkown); - - // Get environment variables regarding teams - if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) { - // OMP_TEAM_LIMIT has been set - EnvTeamLimit = std::stoi(EnvStr); - DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit); - } - if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) { - // OMP_TEAMS_THREAD_LIMIT has been set - EnvTeamThreadLimit = std::stoi(EnvStr); - DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit); - } - if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) { - // OMP_NUM_TEAMS has been set - EnvNumTeams = std::stoi(EnvStr); - DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams); - } - if (const char *EnvStr = getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) { - // LIBOMPTARGET_SHARED_MEMORY_SIZE has been set - DynamicMemorySize = std::stoi(EnvStr); - DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE = %" PRIu64 "\n", - DynamicMemorySize); - } - if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS")) { - // LIBOMPTARGET_NUM_INITIAL_STREAMS has been set - NumInitialStreams = std::stoi(EnvStr); - DP("Parsed LIBOMPTARGET_NUM_INITIAL_STREAMS=%d\n", NumInitialStreams); - } - - for (int I = 0; I < NumberOfDevices; ++I) - DeviceAllocators.emplace_back(); - - // Get the size threshold from environment variable - std::pair Res = MemoryManagerTy::getSizeThresholdFromEnv(); - UseMemoryManager = Res.second; - size_t MemoryManagerThreshold = Res.first; - - if (UseMemoryManager) - for (int I = 0; I < NumberOfDevices; ++I) - MemoryManagers.emplace_back(std::make_unique( - DeviceAllocators[I], MemoryManagerThreshold)); - - // We lazily initialize all devices later. - InitializedFlags.assign(NumberOfDevices, false); - } - - ~DeviceRTLTy() { - for (int DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) - deinitDevice(DeviceId); - } - - // Check whether a given DeviceId is valid - bool isValidDeviceId(const int DeviceId) const { - return DeviceId >= 0 && DeviceId < NumberOfDevices; - } - - int getNumOfDevices() const { return NumberOfDevices; } - - void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; } - - int initDevice(const int DeviceId) { - CUdevice Device; - - DP("Getting device %d\n", DeviceId); - CUresult Err = cuDeviceGet(&Device, DeviceId); - if (!checkResult(Err, "Error returned from cuDeviceGet\n")) - return OFFLOAD_FAIL; - - assert(InitializedFlags[DeviceId] == false && "Reinitializing device!"); - InitializedFlags[DeviceId] = true; - - // Query the current flags of the primary context and set its flags if - // it is inactive - unsigned int FormerPrimaryCtxFlags = 0; - int FormerPrimaryCtxIsActive = 0; - Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags, - &FormerPrimaryCtxIsActive); - if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n")) - return OFFLOAD_FAIL; - - if (FormerPrimaryCtxIsActive) { - DP("The primary context is active, no change to its flags\n"); - if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) != - CU_CTX_SCHED_BLOCKING_SYNC) - DP("Warning the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n"); - } else { - DP("The primary context is inactive, set its flags to " - "CU_CTX_SCHED_BLOCKING_SYNC\n"); - Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC); - if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n")) - return OFFLOAD_FAIL; - } - - // Retain the per device primary context and save it to use whenever this - // device is selected. - Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device); - if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n")) - return OFFLOAD_FAIL; - - Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); - if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) - return OFFLOAD_FAIL; - - // Initialize the stream pool. - if (!StreamPool[DeviceId]) - StreamPool[DeviceId] = std::make_unique(StreamAllocatorTy(), - NumInitialStreams); - - // Initialize the event pool. - if (!EventPool[DeviceId]) - EventPool[DeviceId] = - std::make_unique(EventAllocatorTy(), NumInitialEvents); - - // Query attributes to determine number of threads/block and blocks/grid. - int MaxGridDimX; - Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, - Device); - if (Err != CUDA_SUCCESS) { - DP("Error getting max grid dimension, use default value %d\n", - DeviceRTLTy::DefaultNumTeams); - DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams; - } else { - DP("Using %d CUDA blocks per grid\n", MaxGridDimX); - DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX; - } - - // We are only exploiting threads along the x axis. - int MaxBlockDimX; - Err = cuDeviceGetAttribute(&MaxBlockDimX, - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device); - if (Err != CUDA_SUCCESS) { - DP("Error getting max block dimension, use default value %d\n", - DeviceRTLTy::DefaultNumThreads); - DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads; - } else { - DP("Using %d CUDA threads per block\n", MaxBlockDimX); - DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX; - - if (EnvTeamThreadLimit > 0 && - DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) { - DP("Max CUDA threads per block %d exceeds the thread limit %d set by " - "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n", - DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit); - DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit; - } - if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) { - DP("Max CUDA threads per block %d exceeds the hard thread limit %d, " - "capping at the hard limit\n", - DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit); - DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit; - } - } - - // Get and set warp size - int WarpSize; - Err = - cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device); - if (Err != CUDA_SUCCESS) { - DP("Error getting warp size, assume default value 32\n"); - DeviceData[DeviceId].WarpSize = 32; - } else { - DP("Using warp size %d\n", WarpSize); - DeviceData[DeviceId].WarpSize = WarpSize; - } - - // Adjust teams to the env variables - if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) { - DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n", - EnvTeamLimit); - DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit; - } - - size_t StackLimit; - size_t HeapLimit; - if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) { - StackLimit = std::stol(EnvStr); - if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS) - return OFFLOAD_FAIL; - } else { - if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS) - return OFFLOAD_FAIL; - } - if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) { - HeapLimit = std::stol(EnvStr); - if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS) - return OFFLOAD_FAIL; - } else { - if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS) - return OFFLOAD_FAIL; - } - - INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, - "Device supports up to %d CUDA blocks and %d threads with a " - "warp size of %d\n", - DeviceData[DeviceId].BlocksPerGrid, - DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize); - INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, - "Device heap size is %d Bytes, device stack size is %d Bytes per " - "thread\n", - (int)HeapLimit, (int)StackLimit); - - // Set default number of teams - if (EnvNumTeams > 0) { - DP("Default number of teams set according to environment %d\n", - EnvNumTeams); - DeviceData[DeviceId].NumTeams = EnvNumTeams; - } else { - DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams; - DP("Default number of teams set according to library's default %d\n", - DeviceRTLTy::DefaultNumTeams); - } - - if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) { - DP("Default number of teams exceeds device limit, capping at %d\n", - DeviceData[DeviceId].BlocksPerGrid); - DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid; - } - - // Set default number of threads - DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads; - DP("Default number of threads set according to library's default %d\n", - DeviceRTLTy::DefaultNumThreads); - if (DeviceData[DeviceId].NumThreads > - DeviceData[DeviceId].ThreadsPerBlock) { - DP("Default number of threads exceeds device limit, capping at %d\n", - DeviceData[DeviceId].ThreadsPerBlock); - DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock; - } - - return OFFLOAD_SUCCESS; - } - - int deinitDevice(const int DeviceId) { - auto IsInitialized = InitializedFlags[DeviceId]; - if (!IsInitialized) - return OFFLOAD_SUCCESS; - InitializedFlags[DeviceId] = false; - - if (UseMemoryManager) - MemoryManagers[DeviceId].release(); - - StreamPool[DeviceId].reset(); - EventPool[DeviceId].reset(); - - DeviceDataTy &D = DeviceData[DeviceId]; - if (!checkResult(cuCtxSetCurrent(D.Context), - "Error returned from cuCtxSetCurrent\n")) - return OFFLOAD_FAIL; - - // Unload all modules. - for (auto &M : Modules[DeviceId]) - if (!checkResult(cuModuleUnload(M), - "Error returned from cuModuleUnload\n")) - return OFFLOAD_FAIL; - - // Destroy context. - CUdevice Device; - if (!checkResult(cuCtxGetDevice(&Device), - "Error returned from cuCtxGetDevice\n")) - return OFFLOAD_FAIL; - - if (!checkResult(cuDevicePrimaryCtxRelease(Device), - "Error returned from cuDevicePrimaryCtxRelease\n")) - return OFFLOAD_FAIL; - - return OFFLOAD_SUCCESS; - } - - __tgt_target_table *loadBinary(const int DeviceId, - const __tgt_device_image *Image) { - // Clear the offload table as we are going to create a new one. - clearOffloadEntriesTable(DeviceId); - - // Create the module and extract the function pointers. - CUmodule Module; - DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart)); - CUresult Err = - cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr); - if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n")) - return nullptr; - - DP("CUDA module successfully loaded!\n"); - - Modules[DeviceId].push_back(Module); - - // Find the symbols in the module by name. - const __tgt_offload_entry *HostBegin = Image->EntriesBegin; - const __tgt_offload_entry *HostEnd = Image->EntriesEnd; - - std::list &KernelsList = DeviceData[DeviceId].KernelsList; - for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) { - if (!E->addr) { - // We return nullptr when something like this happens, the host should - // have always something in the address to uniquely identify the target - // region. - DP("Invalid binary: host entry '' (size = %zd)...\n", E->size); - return nullptr; - } - - if (E->size) { - __tgt_offload_entry Entry = *E; - CUdeviceptr CUPtr; - size_t CUSize; - Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name); - // We keep this style here because we need the name - if (Err != CUDA_SUCCESS) { - REPORT("Loading global '%s' Failed\n", E->name); - CUDA_ERR_STRING(Err); - return nullptr; - } - - if (CUSize != E->size) { - DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name, - CUSize, E->size); - return nullptr; - } - - DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", - DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr)); - - Entry.addr = (void *)(CUPtr); - - // Note: In the current implementation declare target variables - // can either be link or to. This means that once unified - // memory is activated via the requires directive, the variable - // can be used directly from the host in both cases. - // TODO: when variables types other than to or link are added, - // the below condition should be changed to explicitly - // check for to and link variables types: - // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags & - // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO)) - if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) { - // If unified memory is present any target link or to variables - // can access host addresses directly. There is no longer a - // need for device copies. - cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *)); - DP("Copy linked variable host address (" DPxMOD - ") to device address (" DPxMOD ")\n", - DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr)); - } - - addOffloadEntry(DeviceId, Entry); - - continue; - } - - CUfunction Func; - Err = cuModuleGetFunction(&Func, Module, E->name); - // We keep this style here because we need the name - if (Err != CUDA_SUCCESS) { - REPORT("Loading '%s' Failed\n", E->name); - CUDA_ERR_STRING(Err); - return nullptr; - } - - DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", - DPxPTR(E - HostBegin), E->name, DPxPTR(Func)); - - // default value GENERIC (in case symbol is missing from cubin file) - llvm::omp::OMPTgtExecModeFlags ExecModeVal; - std::string ExecModeNameStr(E->name); - ExecModeNameStr += "_exec_mode"; - const char *ExecModeName = ExecModeNameStr.c_str(); - - CUdeviceptr ExecModePtr; - size_t CUSize; - Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName); - if (Err == CUDA_SUCCESS) { - if (CUSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) { - DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n", - ExecModeName, CUSize, sizeof(llvm::omp::OMPTgtExecModeFlags)); - return nullptr; - } - - Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize); - if (Err != CUDA_SUCCESS) { - REPORT("Error when copying data from device to host. Pointers: " - "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n", - DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize); - CUDA_ERR_STRING(Err); - return nullptr; - } - } else { - DP("Loading global exec_mode '%s' - symbol missing, using default " - "value GENERIC (1)\n", - ExecModeName); - } - - KernelsList.emplace_back(Func, ExecModeVal); - - __tgt_offload_entry Entry = *E; - Entry.addr = &KernelsList.back(); - addOffloadEntry(DeviceId, Entry); - } - - // send device environment data to the device - { - // TODO: The device ID used here is not the real device ID used by OpenMP. - DeviceEnvironmentTy DeviceEnv{0, static_cast(NumberOfDevices), - static_cast(DeviceId), - static_cast(DynamicMemorySize)}; - - if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) - DeviceEnv.DebugKind = std::stoi(EnvStr); - - const char *DeviceEnvName = "__omp_rtl_device_environment"; - CUdeviceptr DeviceEnvPtr; - size_t CUSize; - - Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName); - if (Err == CUDA_SUCCESS) { - if (CUSize != sizeof(DeviceEnv)) { - REPORT( - "Global device_environment '%s' - size mismatch (%zu != %zu)\n", - DeviceEnvName, CUSize, sizeof(int32_t)); - CUDA_ERR_STRING(Err); - return nullptr; - } - - Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize); - if (Err != CUDA_SUCCESS) { - REPORT("Error when copying data from host to device. Pointers: " - "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n", - DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize); - CUDA_ERR_STRING(Err); - return nullptr; - } - - DP("Sending global device environment data %zu bytes\n", CUSize); - } else { - DP("Finding global device environment '%s' - symbol missing.\n", - DeviceEnvName); - DP("Continue, considering this is a device RTL which does not accept " - "environment setting.\n"); - } - } - - return getOffloadEntriesTable(DeviceId); - } - - void *dataAlloc(const int DeviceId, const int64_t Size, - const TargetAllocTy Kind) { - switch (Kind) { - case TARGET_ALLOC_DEFAULT: - case TARGET_ALLOC_DEVICE: - if (UseMemoryManager) - return MemoryManagers[DeviceId]->allocate(Size, nullptr); - else - return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind); - case TARGET_ALLOC_HOST: - case TARGET_ALLOC_SHARED: - return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind); - } - - REPORT("Invalid target data allocation kind or requested allocator not " - "implemented yet\n"); - - return nullptr; - } - - int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr, - const int64_t Size, __tgt_async_info *AsyncInfo) const { - assert(AsyncInfo && "AsyncInfo is nullptr"); - - CUstream Stream = getStream(DeviceId, AsyncInfo); - CUresult Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream); - if (Err != CUDA_SUCCESS) { - DP("Error when copying data from host to device. Pointers: host " - "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n", - DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); - CUDA_ERR_STRING(Err); - return OFFLOAD_FAIL; - } - - return OFFLOAD_SUCCESS; - } - - int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr, - const int64_t Size, __tgt_async_info *AsyncInfo) const { - assert(AsyncInfo && "AsyncInfo is nullptr"); - - CUstream Stream = getStream(DeviceId, AsyncInfo); - CUresult Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream); - if (Err != CUDA_SUCCESS) { - DP("Error when copying data from device to host. Pointers: host " - "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n", - DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); - CUDA_ERR_STRING(Err); - return OFFLOAD_FAIL; - } - - return OFFLOAD_SUCCESS; - } - - int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr, - int64_t Size, __tgt_async_info *AsyncInfo) { - assert(AsyncInfo && "AsyncInfo is nullptr"); - - CUresult Err; - CUstream Stream = getStream(SrcDevId, AsyncInfo); - - // If they are two devices, we try peer to peer copy first - if (SrcDevId != DstDevId) { - std::lock_guard LG(PeerAccessMatrixLock); - - switch (PeerAccessMatrix[SrcDevId][DstDevId]) { - case PeerAccessState::No: { - REPORT("Peer access from %" PRId32 " to %" PRId32 - " is not supported. Fall back to D2D memcpy.\n", - SrcDevId, DstDevId); - return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); - } - case PeerAccessState::Unkown: { - int CanAccessPeer = 0; - Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId); - if (Err != CUDA_SUCCESS) { - REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32 - ", dst = %" PRId32 ". Fall back to D2D memcpy.\n", - SrcDevId, DstDevId); - CUDA_ERR_STRING(Err); - PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No; - return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); - } - - if (!CanAccessPeer) { - REPORT("P2P access from %d to %d is not supported. Fall back to D2D " - "memcpy.\n", - SrcDevId, DstDevId); - PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No; - return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); - } - - Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0); - if (Err != CUDA_SUCCESS) { - REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32 - ", dst = %" PRId32 ". Fall back to D2D memcpy.\n", - SrcDevId, DstDevId); - CUDA_ERR_STRING(Err); - PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No; - return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); - } - - PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::Yes; - - [[fallthrough]]; - } - case PeerAccessState::Yes: { - Err = cuMemcpyPeerAsync( - (CUdeviceptr)DstPtr, DeviceData[DstDevId].Context, - (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context, Size, Stream); - if (Err == CUDA_SUCCESS) - return OFFLOAD_SUCCESS; - - DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD - ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 - ". Fall back to D2D memcpy.\n", - DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId); - CUDA_ERR_STRING(Err); - - return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); - } - } - } - - return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); - } - - int dataDelete(const int DeviceId, void *TgtPtr, TargetAllocTy Kind) { - switch (Kind) { - case TARGET_ALLOC_DEFAULT: - case TARGET_ALLOC_DEVICE: - if (UseMemoryManager) - return MemoryManagers[DeviceId]->free(TgtPtr); - else - return DeviceAllocators[DeviceId].free(TgtPtr, Kind); - case TARGET_ALLOC_HOST: - case TARGET_ALLOC_SHARED: - return DeviceAllocators[DeviceId].free(TgtPtr, Kind); - } - - REPORT("Invalid target data allocation kind or requested allocator not " - "implemented yet\n"); - - return OFFLOAD_FAIL; - } - - int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs, - ptrdiff_t *TgtOffsets, const int ArgNum, - const int TeamNum, const int ThreadLimit, - const unsigned int LoopTripCount, - __tgt_async_info *AsyncInfo) const { - // All args are references. - std::vector Args(ArgNum); - std::vector Ptrs(ArgNum); - - for (int I = 0; I < ArgNum; ++I) { - Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]); - Args[I] = &Ptrs[I]; - } - - KernelTy *KernelInfo = reinterpret_cast(TgtEntryPtr); - - const bool IsSPMDGenericMode = - KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD; - const bool IsSPMDMode = - KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD; - const bool IsGenericMode = - KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC; - - int CudaThreadsPerBlock; - if (ThreadLimit > 0) { - DP("Setting CUDA threads per block to requested %d\n", ThreadLimit); - CudaThreadsPerBlock = ThreadLimit; - // Add master warp if necessary - if (IsGenericMode) { - DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize); - CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize; - } - } else { - DP("Setting CUDA threads per block to default %d\n", - DeviceData[DeviceId].NumThreads); - CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads; - } - - if ((unsigned)CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) { - DP("Threads per block capped at device limit %d\n", - DeviceData[DeviceId].ThreadsPerBlock); - CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock; - } - - CUresult Err; - if (!KernelInfo->MaxThreadsPerBlock) { - Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock, - CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - KernelInfo->Func); - if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n")) - return OFFLOAD_FAIL; - } - - if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) { - DP("Threads per block capped at kernel limit %d\n", - KernelInfo->MaxThreadsPerBlock); - CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock; - } - - unsigned int CudaBlocksPerGrid; - if (TeamNum <= 0) { - if (LoopTripCount > 0 && EnvNumTeams < 0) { - if (IsSPMDGenericMode) { - // If we reach this point, then we are executing a kernel that was - // transformed from Generic-mode to SPMD-mode. This kernel has - // SPMD-mode execution, but needs its blocks to be scheduled - // differently because the current loop trip count only applies to the - // `teams distribute` region and will create var too few blocks using - // the regular SPMD-mode method. - CudaBlocksPerGrid = LoopTripCount; - } else if (IsSPMDMode) { - // We have a combined construct, i.e. `target teams distribute - // parallel for [simd]`. We launch so many teams so that each thread - // will execute one iteration of the loop. round up to the nearest - // integer - CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1; - } else if (IsGenericMode) { - // If we reach this point, then we have a non-combined construct, i.e. - // `teams distribute` with a nested `parallel for` and each team is - // assigned one iteration of the `distribute` loop. E.g.: - // - // #pragma omp target teams distribute - // for(...loop_tripcount...) { - // #pragma omp parallel for - // for(...) {} - // } - // - // Threads within a team will execute the iterations of the `parallel` - // loop. - CudaBlocksPerGrid = LoopTripCount; - } else { - REPORT("Unknown execution mode: %d\n", - static_cast(KernelInfo->ExecutionMode)); - return OFFLOAD_FAIL; - } - DP("Using %d teams due to loop trip count %" PRIu32 - " and number of threads per block %d\n", - CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock); - } else { - DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams); - CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams; - } - } else { - DP("Using requested number of teams %d\n", TeamNum); - CudaBlocksPerGrid = TeamNum; - } - - if (CudaBlocksPerGrid > DeviceData[DeviceId].BlocksPerGrid) { - DP("Capping number of teams to team limit %d\n", - DeviceData[DeviceId].BlocksPerGrid); - CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid; - } - - INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, - "Launching kernel %s with %d blocks and %d threads in %s mode\n", - (getOffloadEntry(DeviceId, TgtEntryPtr)) - ? getOffloadEntry(DeviceId, TgtEntryPtr)->name - : "(null)", - CudaBlocksPerGrid, CudaThreadsPerBlock, - (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD")); - - CUstream Stream = getStream(DeviceId, AsyncInfo); - Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1, - /* gridDimZ */ 1, CudaThreadsPerBlock, - /* blockDimY */ 1, /* blockDimZ */ 1, - DynamicMemorySize, Stream, &Args[0], nullptr); - if (!checkResult(Err, "Error returned from cuLaunchKernel\n")) - return OFFLOAD_FAIL; - - DP("Launch of entry point at " DPxMOD " successful!\n", - DPxPTR(TgtEntryPtr)); - - return OFFLOAD_SUCCESS; - } - - int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const { - CUstream Stream = reinterpret_cast(AsyncInfo->Queue); - CUresult Err = cuStreamSynchronize(Stream); - - // Once the stream is synchronized, return it to stream pool and reset - // AsyncInfo. This is to make sure the synchronization only works for its - // own tasks. - StreamPool[DeviceId]->release(reinterpret_cast(AsyncInfo->Queue)); - AsyncInfo->Queue = nullptr; - - if (Err != CUDA_SUCCESS) { - DP("Error when synchronizing stream. stream = " DPxMOD - ", async info ptr = " DPxMOD "\n", - DPxPTR(Stream), DPxPTR(AsyncInfo)); - CUDA_ERR_STRING(Err); - } - return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL; - } - - int queryAsync(const int DeviceId, __tgt_async_info *AsyncInfo) const { - CUstream Stream = reinterpret_cast(AsyncInfo->Queue); - CUresult Err = cuStreamQuery(Stream); - - // Not ready streams must be considered as successful operations. - if (Err == CUDA_ERROR_NOT_READY) - return OFFLOAD_SUCCESS; - - // Once the stream is synchronized or an error occurs, return it to the - // stream pool and reset AsyncInfo. This is to make sure the - // synchronization only works for its own tasks. - StreamPool[DeviceId]->release(Stream); - AsyncInfo->Queue = nullptr; - - if (Err != CUDA_SUCCESS) { - DP("Error when querying for stream progress. stream = " DPxMOD - ", async info ptr = " DPxMOD "\n", - DPxPTR(Stream), DPxPTR(AsyncInfo)); - CUDA_ERR_STRING(Err); - } - return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL; - } - - void printDeviceInfo(int32_t DeviceId) { - char TmpChar[1000]; - std::string TmpStr; - size_t TmpSt; - int TmpInt, TmpInt2, TmpInt3; - - CUdevice Device; - checkResult(cuDeviceGet(&Device, DeviceId), - "Error returned from cuCtxGetDevice\n"); - - cuDriverGetVersion(&TmpInt); - printf(" CUDA Driver Version: \t\t%d \n", TmpInt); - printf(" CUDA Device Number: \t\t%d \n", DeviceId); - checkResult(cuDeviceGetName(TmpChar, 1000, Device), - "Error returned from cuDeviceGetName\n"); - printf(" Device Name: \t\t\t%s \n", TmpChar); - checkResult(cuDeviceTotalMem(&TmpSt, Device), - "Error returned from cuDeviceTotalMem\n"); - printf(" Global Memory Size: \t\t%zu bytes \n", TmpSt); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Number of Multiprocessors: \t\t%d \n", TmpInt); - checkResult( - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt)); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Total Constant Memory: \t\t%d bytes\n", TmpInt); - checkResult( - cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Max Shared Memory per Block: \t%d bytes \n", TmpInt); - checkResult( - cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Registers per Block: \t\t%d \n", TmpInt); - checkResult( - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Warp Size: \t\t\t\t%d Threads \n", TmpInt); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Maximum Threads per Block: \t\t%d \n", TmpInt); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device), - "Error returned from cuDeviceGetAttribute\n"); - checkResult(cuDeviceGetAttribute( - &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device), - "Error returned from cuDeviceGetAttribute\n"); - checkResult(cuDeviceGetAttribute( - &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2, - TmpInt3); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device), - "Error returned from cuDeviceGetAttribute\n"); - checkResult(cuDeviceGetAttribute( - &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device), - "Error returned from cuDeviceGetAttribute\n"); - checkResult(cuDeviceGetAttribute( - &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2, - TmpInt3); - checkResult( - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Maximum Memory Pitch: \t\t%d bytes \n", TmpInt); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Texture Alignment: \t\t\t%d bytes \n", TmpInt); - checkResult( - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Clock Rate: \t\t\t%d kHz\n", TmpInt); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt)); - checkResult( - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt)); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt)); - checkResult( - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device), - "Error returned from cuDeviceGetAttribute\n"); - if (TmpInt == CU_COMPUTEMODE_DEFAULT) - TmpStr = "DEFAULT"; - else if (TmpInt == CU_COMPUTEMODE_PROHIBITED) - TmpStr = "PROHIBITED"; - else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS) - TmpStr = "EXCLUSIVE PROCESS"; - else - TmpStr = "unknown"; - printf(" Compute Mode: \t\t\t%s \n", TmpStr.c_str()); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt)); - checkResult( - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt)); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Memory Clock Rate: \t\t\t%d kHz\n", TmpInt); - checkResult( - cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Memory Bus Width: \t\t\t%d bits\n", TmpInt); - checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, - Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" L2 Cache Size: \t\t\t%d bytes \n", TmpInt); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, - Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Max Threads Per SMP: \t\t%d \n", TmpInt); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt)); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt)); - checkResult( - cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt)); - checkResult( - cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt)); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt)); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Multi-Device Boars: \t\t%s \n", BOOL2TEXT(TmpInt)); - checkResult( - cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device), - "Error returned from cuDeviceGetAttribute\n"); - checkResult( - cuDeviceGetAttribute( - &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2); - } - - int createEvent(int DeviceId, void **P) { - CUevent Event = nullptr; - if (EventPool[DeviceId]->acquire(Event) != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - *P = Event; - return OFFLOAD_SUCCESS; - } - - int destroyEvent(int DeviceId, void *EventPtr) { - EventPool[DeviceId]->release(reinterpret_cast(EventPtr)); - return OFFLOAD_SUCCESS; - } - - int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo, - void *EventPtr) const { - CUstream Stream = getStream(DeviceId, AsyncInfo); - CUevent Event = reinterpret_cast(EventPtr); - - // We don't use CU_EVENT_WAIT_DEFAULT here as it is only available from - // specific CUDA version, and defined as 0x0. In previous version, per CUDA - // API document, that argument has to be 0x0. - CUresult Err = cuStreamWaitEvent(Stream, Event, 0); - if (Err != CUDA_SUCCESS) { - DP("Error when waiting event. stream = " DPxMOD ", event = " DPxMOD "\n", - DPxPTR(Stream), DPxPTR(Event)); - CUDA_ERR_STRING(Err); - return OFFLOAD_FAIL; - } - - return OFFLOAD_SUCCESS; - } - - int releaseAsyncInfo(int DeviceId, __tgt_async_info *AsyncInfo) const { - if (AsyncInfo->Queue) { - StreamPool[DeviceId]->release( - reinterpret_cast(AsyncInfo->Queue)); - AsyncInfo->Queue = nullptr; - } - - return OFFLOAD_SUCCESS; - } - - int initAsyncInfo(int DeviceId, __tgt_async_info **AsyncInfo) const { - *AsyncInfo = new __tgt_async_info; - getStream(DeviceId, *AsyncInfo); - return OFFLOAD_SUCCESS; - } - - int initDeviceInfo(int DeviceId, __tgt_device_info *DeviceInfo, - const char **ErrStr) const { - assert(DeviceInfo && "DeviceInfo is nullptr"); - - if (!DeviceInfo->Context) - DeviceInfo->Context = DeviceData[DeviceId].Context; - if (!DeviceInfo->Device) { - CUdevice Dev; - CUresult Err = cuDeviceGet(&Dev, DeviceId); - if (Err == CUDA_SUCCESS) { - DeviceInfo->Device = reinterpret_cast(Dev); - } else { - cuGetErrorString(Err, ErrStr); - return OFFLOAD_FAIL; - } - } - return OFFLOAD_SUCCESS; - } - - int setContext(int DeviceId) { - assert(InitializedFlags[DeviceId] && "Device is not initialized"); - - CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); - if (!checkResult(Err, "error returned from cuCtxSetCurrent")) - return OFFLOAD_FAIL; - - return OFFLOAD_SUCCESS; - } -}; - -DeviceRTLTy DeviceRTL; -} // namespace - -// Exposed library API function -#ifdef __cplusplus -extern "C" { -#endif - -int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) { - return elf_check_machine(Image, /* EM_CUDA */ 190); -} - -int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *Image, - __tgt_image_info *Info) { - if (!__tgt_rtl_is_valid_binary(Image)) - return false; - - // A subarchitecture was not specified. Assume it is compatible. - if (!Info || !Info->Arch) - return true; - - int32_t NumberOfDevices = 0; - if (cuDeviceGetCount(&NumberOfDevices) != CUDA_SUCCESS) - return false; - - StringRef ArchStr = StringRef(Info->Arch).drop_front(sizeof("sm_") - 1); - for (int32_t DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) { - CUdevice Device; - if (cuDeviceGet(&Device, DeviceId) != CUDA_SUCCESS) - return false; - - int32_t Major, Minor; - if (cuDeviceGetAttribute(&Major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - Device) != CUDA_SUCCESS) - return false; - if (cuDeviceGetAttribute(&Minor, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, - Device) != CUDA_SUCCESS) - return false; - - // A cubin generated for a certain compute capability is supported to run on - // any GPU with the same major revision and same or higher minor revision. - int32_t ImageMajor = ArchStr[0] - '0'; - int32_t ImageMinor = ArchStr[1] - '0'; - if (Major != ImageMajor || Minor < ImageMinor) - return false; - } - - DP("Image has compatible compute capability: %s\n", Info->Arch); - return true; -} - -int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); } - -int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { - DP("Init requires flags to %" PRId64 "\n", RequiresFlags); - DeviceRTL.setRequiresFlag(RequiresFlags); - return RequiresFlags; -} - -int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDevId, int DstDevId) { - if (DeviceRTL.isValidDeviceId(SrcDevId) && - DeviceRTL.isValidDeviceId(DstDevId)) - return 1; - - return 0; -} - -int32_t __tgt_rtl_init_device(int32_t DeviceId) { - assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid"); - // Context is set when init the device. - - return DeviceRTL.initDevice(DeviceId); -} - -int32_t __tgt_rtl_deinit_device(int32_t DeviceId) { - assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid"); - // Context is set when deinit the device. - - return DeviceRTL.deinitDevice(DeviceId); -} - -__tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId, - __tgt_device_image *Image) { - assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid"); - - if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS) - return nullptr; - - return DeviceRTL.loadBinary(DeviceId, Image); -} - -void *__tgt_rtl_data_alloc(int32_t DeviceId, int64_t Size, void *, - int32_t Kind) { - assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid"); - - if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS) - return nullptr; - - return DeviceRTL.dataAlloc(DeviceId, Size, (TargetAllocTy)Kind); -} - -int32_t __tgt_rtl_data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr, - int64_t Size) { - assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid"); - // Context is set in __tgt_rtl_data_submit_async. - - __tgt_async_info AsyncInfo; - const int32_t Rc = - __tgt_rtl_data_submit_async(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo); - if (Rc != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - - return __tgt_rtl_synchronize(DeviceId, &AsyncInfo); -} - -int32_t __tgt_rtl_data_submit_async(int32_t DeviceId, void *TgtPtr, - void *HstPtr, int64_t Size, - __tgt_async_info *AsyncInfoPtr) { - assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid"); - assert(AsyncInfoPtr && "async_info_ptr is nullptr"); - - if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - - return DeviceRTL.dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfoPtr); -} - -int32_t __tgt_rtl_data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, - int64_t Size) { - assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid"); - // Context is set in __tgt_rtl_data_retrieve_async. - - __tgt_async_info AsyncInfo; - const int32_t Rc = - __tgt_rtl_data_retrieve_async(DeviceId, HstPtr, TgtPtr, Size, &AsyncInfo); - if (Rc != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - - return __tgt_rtl_synchronize(DeviceId, &AsyncInfo); -} - -int32_t __tgt_rtl_data_retrieve_async(int32_t DeviceId, void *HstPtr, - void *TgtPtr, int64_t Size, - __tgt_async_info *AsyncInfoPtr) { - assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid"); - assert(AsyncInfoPtr && "async_info_ptr is nullptr"); - - if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - - return DeviceRTL.dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfoPtr); -} - -int32_t __tgt_rtl_data_exchange_async(int32_t SrcDevId, void *SrcPtr, - int DstDevId, void *DstPtr, int64_t Size, - __tgt_async_info *AsyncInfo) { - assert(DeviceRTL.isValidDeviceId(SrcDevId) && "src_dev_id is invalid"); - assert(DeviceRTL.isValidDeviceId(DstDevId) && "dst_dev_id is invalid"); - assert(AsyncInfo && "AsyncInfo is nullptr"); - - if (DeviceRTL.setContext(SrcDevId) != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - - return DeviceRTL.dataExchange(SrcDevId, SrcPtr, DstDevId, DstPtr, Size, - AsyncInfo); -} - -int32_t __tgt_rtl_data_exchange(int32_t SrcDevId, void *SrcPtr, - int32_t DstDevId, void *DstPtr, int64_t Size) { - assert(DeviceRTL.isValidDeviceId(SrcDevId) && "src_dev_id is invalid"); - assert(DeviceRTL.isValidDeviceId(DstDevId) && "dst_dev_id is invalid"); - // Context is set in __tgt_rtl_data_exchange_async. - - __tgt_async_info AsyncInfo; - const int32_t Rc = __tgt_rtl_data_exchange_async(SrcDevId, SrcPtr, DstDevId, - DstPtr, Size, &AsyncInfo); - if (Rc != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - - return __tgt_rtl_synchronize(SrcDevId, &AsyncInfo); -} - -int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr, int32_t Kind) { - assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid"); - - if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - - return DeviceRTL.dataDelete(DeviceId, TgtPtr, (TargetAllocTy)Kind); -} - -int32_t __tgt_rtl_run_target_team_region(int32_t DeviceId, void *TgtEntryPtr, - void **TgtArgs, ptrdiff_t *TgtOffsets, - int32_t ArgNum, int32_t TeamNum, - int32_t ThreadLimit, - uint64_t LoopTripcount) { - assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid"); - // Context is set in __tgt_rtl_run_target_team_region_async. - - __tgt_async_info AsyncInfo; - const int32_t Rc = __tgt_rtl_run_target_team_region_async( - DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum, TeamNum, ThreadLimit, - LoopTripcount, &AsyncInfo); - if (Rc != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - - return __tgt_rtl_synchronize(DeviceId, &AsyncInfo); -} - -int32_t __tgt_rtl_run_target_team_region_async( - int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets, - int32_t ArgNum, int32_t TeamNum, int32_t ThreadLimit, - uint64_t LoopTripcount, __tgt_async_info *AsyncInfoPtr) { - assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid"); - - if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - - return DeviceRTL.runTargetTeamRegion(DeviceId, TgtEntryPtr, TgtArgs, - TgtOffsets, ArgNum, TeamNum, ThreadLimit, - LoopTripcount, AsyncInfoPtr); -} - -int32_t __tgt_rtl_run_target_region(int32_t DeviceId, void *TgtEntryPtr, - void **TgtArgs, ptrdiff_t *TgtOffsets, - int32_t ArgNum) { - assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid"); - // Context is set in __tgt_rtl_run_target_region_async. - - __tgt_async_info AsyncInfo; - const int32_t Rc = __tgt_rtl_run_target_region_async( - DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum, &AsyncInfo); - if (Rc != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - - return __tgt_rtl_synchronize(DeviceId, &AsyncInfo); -} - -int32_t __tgt_rtl_run_target_region_async(int32_t DeviceId, void *TgtEntryPtr, - void **TgtArgs, ptrdiff_t *TgtOffsets, - int32_t ArgNum, - __tgt_async_info *AsyncInfoPtr) { - assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid"); - // Context is set in __tgt_rtl_run_target_team_region_async. - return __tgt_rtl_run_target_team_region_async( - DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum, - /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0, - AsyncInfoPtr); -} - -int32_t __tgt_rtl_synchronize(int32_t DeviceId, - __tgt_async_info *AsyncInfoPtr) { - assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid"); - assert(AsyncInfoPtr && "async_info_ptr is nullptr"); - assert(AsyncInfoPtr->Queue && "async_info_ptr->Queue is nullptr"); - // NOTE: We don't need to set context for stream sync. - return DeviceRTL.synchronize(DeviceId, AsyncInfoPtr); -} - -int32_t __tgt_rtl_query_async(int32_t DeviceId, - __tgt_async_info *AsyncInfoPtr) { - assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid"); - assert(AsyncInfoPtr && "async_info_ptr is nullptr"); - assert(AsyncInfoPtr->Queue && "async_info_ptr->Queue is nullptr"); - // NOTE: We don't need to set context for stream query. - return DeviceRTL.queryAsync(DeviceId, AsyncInfoPtr); -} - -void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) { - std::atomic &InfoLevel = getInfoLevelInternal(); - InfoLevel.store(NewInfoLevel); -} - -void __tgt_rtl_print_device_info(int32_t DeviceId) { - assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid"); - // NOTE: We don't need to set context for print device info. - DeviceRTL.printDeviceInfo(DeviceId); -} - -int32_t __tgt_rtl_create_event(int32_t DeviceId, void **Event) { - assert(Event && "event is nullptr"); - - if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - - return DeviceRTL.createEvent(DeviceId, Event); -} - -int32_t __tgt_rtl_record_event(int32_t DeviceId, void *EventPtr, - __tgt_async_info *AsyncInfoPtr) { - assert(AsyncInfoPtr && "async_info_ptr is nullptr"); - assert(AsyncInfoPtr->Queue && "async_info_ptr->Queue is nullptr"); - assert(EventPtr && "event_ptr is nullptr"); - // NOTE: We might not need to set context for event record. - return recordEvent(EventPtr, AsyncInfoPtr); -} - -int32_t __tgt_rtl_wait_event(int32_t DeviceId, void *EventPtr, - __tgt_async_info *AsyncInfoPtr) { - assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid"); - assert(AsyncInfoPtr && "async_info_ptr is nullptr"); - assert(EventPtr && "event is nullptr"); - // If we don't have a queue we need to set the context. - if (!AsyncInfoPtr->Queue && DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - return DeviceRTL.waitEvent(DeviceId, AsyncInfoPtr, EventPtr); -} - -int32_t __tgt_rtl_sync_event(int32_t DeviceId, void *EventPtr) { - assert(EventPtr && "event is nullptr"); - // NOTE: We might not need to set context for event sync. - return syncEvent(EventPtr); -} - -int32_t __tgt_rtl_destroy_event(int32_t DeviceId, void *EventPtr) { - assert(EventPtr && "event is nullptr"); - - if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - - return DeviceRTL.destroyEvent(DeviceId, EventPtr); -} - -int32_t __tgt_rtl_release_async_info(int32_t DeviceId, - __tgt_async_info *AsyncInfo) { - assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid"); - assert(AsyncInfo && "async_info is nullptr"); - - if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - - return DeviceRTL.releaseAsyncInfo(DeviceId, AsyncInfo); -} - -int32_t __tgt_rtl_init_async_info(int32_t DeviceId, - __tgt_async_info **AsyncInfo) { - assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid"); - assert(AsyncInfo && "async_info is nullptr"); - - if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - - return DeviceRTL.initAsyncInfo(DeviceId, AsyncInfo); -} - -int32_t __tgt_rtl_init_device_info(int32_t DeviceId, - __tgt_device_info *DeviceInfoPtr, - const char **ErrStr) { - assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid"); - assert(DeviceInfoPtr && "device_info_ptr is nullptr"); - - if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - - return DeviceRTL.initDeviceInfo(DeviceId, DeviceInfoPtr, ErrStr); -} - -#ifdef __cplusplus -} -#endif diff --git a/openmp/libomptarget/plugins/exports b/openmp/libomptarget/plugins/exports deleted file mode 100644 --- a/openmp/libomptarget/plugins/exports +++ /dev/null @@ -1,6 +0,0 @@ -VERS1.0 { - global: - __tgt_rtl*; - local: - *; -}; diff --git a/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp deleted file mode 100644 --- a/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp +++ /dev/null @@ -1,280 +0,0 @@ -//===-RTLs/generic-64bit/src/rtl.cpp - Target RTLs Implementation - C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// RTL for generic 64-bit machine -// -//===----------------------------------------------------------------------===// - -#include "llvm/ADT/SmallVector.h" -#include "llvm/Support/DynamicLibrary.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "Debug.h" -#include "omptargetplugin.h" - -using namespace llvm; -using namespace llvm::sys; - -#ifndef TARGET_NAME -#define TARGET_NAME Generic ELF - 64bit -#endif -#define DEBUG_PREFIX "TARGET " GETNAME(TARGET_NAME) " RTL" - -#ifndef TARGET_ELF_ID -#define TARGET_ELF_ID 0 -#endif - -#include "elf_common.h" - -#define NUMBER_OF_DEVICES 4 -#define OFFLOAD_SECTION_NAME "omp_offloading_entries" - -/// Array of Dynamic libraries loaded for this target. -struct DynLibTy { - std::string FileName; - std::unique_ptr DynLib; -}; - -/// Keep entries table per device. -struct FuncOrGblEntryTy { - __tgt_target_table Table; - SmallVector<__tgt_offload_entry> Entries; -}; - -/// Class containing all the device information. -class RTLDeviceInfoTy { - std::vector> FuncGblEntries; - -public: - std::list DynLibs; - - // Record entry point associated with device. - void createOffloadTable(int32_t DeviceId, - SmallVector<__tgt_offload_entry> &&Entries) { - assert(DeviceId < (int32_t)FuncGblEntries.size() && - "Unexpected device id!"); - FuncGblEntries[DeviceId].emplace_back(); - FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); - - E.Entries = Entries; - E.Table.EntriesBegin = E.Entries.begin(); - E.Table.EntriesEnd = E.Entries.end(); - } - - // Return true if the entry is associated with device. - bool findOffloadEntry(int32_t DeviceId, void *Addr) { - assert(DeviceId < (int32_t)FuncGblEntries.size() && - "Unexpected device id!"); - FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); - - for (__tgt_offload_entry *I = E.Table.EntriesBegin, - *End = E.Table.EntriesEnd; - I < End; ++I) { - if (I->addr == Addr) - return true; - } - - return false; - } - - // Return the pointer to the target entries table. - __tgt_target_table *getOffloadEntriesTable(int32_t DeviceId) { - assert(DeviceId < (int32_t)FuncGblEntries.size() && - "Unexpected device id!"); - FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); - - return &E.Table; - } - - RTLDeviceInfoTy(int32_t NumDevices) { FuncGblEntries.resize(NumDevices); } - - ~RTLDeviceInfoTy() { - // Close dynamic libraries - for (auto &Lib : DynLibs) { - if (Lib.DynLib->isValid()) - remove(Lib.FileName.c_str()); - } - } -}; - -static RTLDeviceInfoTy DeviceInfo(NUMBER_OF_DEVICES); - -#ifdef __cplusplus -extern "C" { -#endif - -int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) { -// If we don't have a valid ELF ID we can just fail. -#if TARGET_ELF_ID < 1 - return 0; -#else - return elf_check_machine(Image, TARGET_ELF_ID); -#endif -} - -int32_t __tgt_rtl_number_of_devices() { return NUMBER_OF_DEVICES; } - -int32_t __tgt_rtl_init_device(int32_t DeviceId) { return OFFLOAD_SUCCESS; } - -__tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId, - __tgt_device_image *Image) { - - DP("Dev %d: load binary from " DPxMOD " image\n", DeviceId, - DPxPTR(Image->ImageStart)); - - assert(DeviceId >= 0 && DeviceId < NUMBER_OF_DEVICES && "bad dev id"); - - size_t ImageSize = (size_t)Image->ImageEnd - (size_t)Image->ImageStart; - - // load dynamic library and get the entry points. We use the dl library - // to do the loading of the library, but we could do it directly to avoid the - // dump to the temporary file. - // - // 1) Create tmp file with the library contents. - // 2) Use dlopen to load the file and dlsym to retrieve the symbols. - char TmpName[] = "/tmp/tmpfile_XXXXXX"; - int TmpFd = mkstemp(TmpName); - - if (TmpFd == -1) - return nullptr; - - FILE *Ftmp = fdopen(TmpFd, "wb"); - - if (!Ftmp) - return nullptr; - - fwrite(Image->ImageStart, ImageSize, 1, Ftmp); - fclose(Ftmp); - - std::string ErrMsg; - auto DynLib = std::make_unique( - sys::DynamicLibrary::getPermanentLibrary(TmpName, &ErrMsg)); - DynLibTy Lib = {TmpName, std::move(DynLib)}; - - if (!Lib.DynLib->isValid()) { - DP("Target library loading error: %s\n", ErrMsg.c_str()); - return NULL; - } - - __tgt_offload_entry *HostBegin = Image->EntriesBegin; - __tgt_offload_entry *HostEnd = Image->EntriesEnd; - - // Create a new offloading entry list using the device symbol address. - SmallVector<__tgt_offload_entry> Entries; - for (__tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) { - if (!E->addr) - return nullptr; - - __tgt_offload_entry Entry = *E; - - void *DevAddr = Lib.DynLib->getAddressOfSymbol(E->name); - Entry.addr = DevAddr; - - DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", - DPxPTR(E - HostBegin), E->name, DPxPTR(DevAddr)); - - Entries.emplace_back(Entry); - } - - DeviceInfo.createOffloadTable(DeviceId, std::move(Entries)); - DeviceInfo.DynLibs.emplace_back(std::move(Lib)); - - return DeviceInfo.getOffloadEntriesTable(DeviceId); -} - -void __tgt_rtl_print_device_info(int32_t DeviceId) { - printf(" This is a generic-elf-64bit device\n"); -} - -// Sample implementation of explicit memory allocator. For this plugin all kinds -// are equivalent to each other. -void *__tgt_rtl_data_alloc(int32_t DeviceId, int64_t Size, void *HstPtr, - int32_t Kind) { - void *Ptr = NULL; - - switch (Kind) { - case TARGET_ALLOC_DEVICE: - case TARGET_ALLOC_HOST: - case TARGET_ALLOC_SHARED: - case TARGET_ALLOC_DEFAULT: - Ptr = malloc(Size); - break; - default: - REPORT("Invalid target data allocation kind"); - } - - return Ptr; -} - -int32_t __tgt_rtl_data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr, - int64_t Size) { - memcpy(TgtPtr, HstPtr, Size); - return OFFLOAD_SUCCESS; -} - -int32_t __tgt_rtl_data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, - int64_t Size) { - memcpy(HstPtr, TgtPtr, Size); - return OFFLOAD_SUCCESS; -} - -int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr, int32_t) { - free(TgtPtr); - return OFFLOAD_SUCCESS; -} - -int32_t __tgt_rtl_launch_kernel(int32_t DeviceId, void *TgtEntryPtr, - void **TgtArgs, ptrdiff_t *TgtOffsets, - KernelArgsTy *KernelArgs, - __tgt_async_info *AsyncInfoPtr) { - assert(!KernelArgs->NumTeams[1] && !KernelArgs->NumTeams[2] && - !KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] && - "Only one dimensional kernels supported."); - // ignore team num and thread limit. - - // Use libffi to launch execution. - ffi_cif Cif; - - // All args are references. - std::vector ArgsTypes(KernelArgs->NumArgs, &ffi_type_pointer); - std::vector Args(KernelArgs->NumArgs); - std::vector Ptrs(KernelArgs->NumArgs); - - for (uint32_t I = 0; I < KernelArgs->NumArgs; ++I) { - Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]); - Args[I] = &Ptrs[I]; - } - - ffi_status Status = ffi_prep_cif(&Cif, FFI_DEFAULT_ABI, KernelArgs->NumArgs, - &ffi_type_void, &ArgsTypes[0]); - - assert(Status == FFI_OK && "Unable to prepare target launch!"); - - if (Status != FFI_OK) - return OFFLOAD_FAIL; - - DP("Running entry point at " DPxMOD "...\n", DPxPTR(TgtEntryPtr)); - - void (*Entry)(void); - *((void **)&Entry) = TgtEntryPtr; - ffi_call(&Cif, Entry, NULL, &Args[0]); - return OFFLOAD_SUCCESS; -} - -#ifdef __cplusplus -} -#endif diff --git a/openmp/libomptarget/plugins/ppc64/CMakeLists.txt b/openmp/libomptarget/plugins/ppc64/CMakeLists.txt deleted file mode 100644 --- a/openmp/libomptarget/plugins/ppc64/CMakeLists.txt +++ /dev/null @@ -1,17 +0,0 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Build a plugin for a ppc64 machine if available. -# -##===----------------------------------------------------------------------===## - -if(CMAKE_SYSTEM_NAME MATCHES "Linux") - build_generic_elf64("ppc64" "PPC64" "ppc64" "powerpc64-ibm-linux-gnu" "21") -else() - libomptarget_say("Not building ppc64 offloading plugin: machine not found in the system.") -endif() diff --git a/openmp/libomptarget/plugins/ppc64le/CMakeLists.txt b/openmp/libomptarget/plugins/ppc64le/CMakeLists.txt deleted file mode 100644 --- a/openmp/libomptarget/plugins/ppc64le/CMakeLists.txt +++ /dev/null @@ -1,17 +0,0 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Build a plugin for a ppc64le machine if available. -# -##===----------------------------------------------------------------------===## - -if(CMAKE_SYSTEM_NAME MATCHES "Linux") - build_generic_elf64("ppc64le" "PPC64le" "ppc64" "powerpc64le-ibm-linux-gnu" "21") -else() - libomptarget_say("Not building ppc64le offloading plugin: machine not found in the system.") -endif() diff --git a/openmp/libomptarget/plugins/remote/CMakeLists.txt b/openmp/libomptarget/plugins/remote/CMakeLists.txt deleted file mode 100644 --- a/openmp/libomptarget/plugins/remote/CMakeLists.txt +++ /dev/null @@ -1,54 +0,0 @@ -##===----------------------------------------------------------------------===## -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Build a plugin (client) and server for remote offloading. -# -##===----------------------------------------------------------------------===# - -if (NOT(CMAKE_SYSTEM_NAME MATCHES "Linux")) - libomptarget_say("Not building remote offloading plugin: only support Linux hosts.") - return() -endif() - -if (NOT(LIBOMPTARGET_ENABLE_EXPERIMENTAL_REMOTE_PLUGIN)) - return() -endif() - -find_package(Protobuf) -find_package(gRPC CONFIG) - -find_program(PROTOC protoc) -find_program(GRPC_CPP_PLUGIN grpc_cpp_plugin) - -if (Protobuf_FOUND AND gRPC_FOUND AND PROTOC AND GRPC_CPP_PLUGIN) - libomptarget_say("Building remote offloading plugin.") - set(directory "${CMAKE_BINARY_DIR}/include/openmp/libomptarget/plugins/remote/") - file(MAKE_DIRECTORY ${directory}) - execute_process(COMMAND ${CMAKE_COMMAND} -E make_directory ${directory}) - execute_process( - COMMAND protoc --cpp_out=${directory} -I ${CMAKE_CURRENT_SOURCE_DIR}/include ${CMAKE_CURRENT_SOURCE_DIR}/include/openmp.proto - COMMAND protoc --grpc_out=${directory} -I ${CMAKE_CURRENT_SOURCE_DIR}/include ${CMAKE_CURRENT_SOURCE_DIR}/include/openmp.proto --plugin=protoc-gen-grpc=${GRPC_CPP_PLUGIN} - ) - - set(GRPC_SRC_FILES - ${directory}/openmp.grpc.pb.cc - ${directory}/openmp.pb.cc - ) - - set(GRPC_INCLUDE_DIR - ${directory} - ) - - set(RPC_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include/) - set(RPC_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/lib/) - - add_subdirectory(src) - add_subdirectory(server) -else() - libomptarget_say("Not building remote offloading plugin: required libraries were not found.") -endif() - diff --git a/openmp/libomptarget/plugins/remote/include/Utils.h b/openmp/libomptarget/plugins/remote/include/Utils.h deleted file mode 100644 --- a/openmp/libomptarget/plugins/remote/include/Utils.h +++ /dev/null @@ -1,133 +0,0 @@ -//===----------------- Utils.h - Utilities for Remote RTL -----------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Utilities for data transfer through protobuf and debugging. -// -//===----------------------------------------------------------------------===// - -#ifndef UTILS_H -#define UTILS_H - -#include "Debug.h" -#include "omptarget.h" -#include "openmp.grpc.pb.h" -#include "openmp.pb.h" -#include "rtl.h" -#include - -#define CLIENT_DBG(...) \ - { \ - if (DebugLevel > 0) { \ - fprintf(stderr, "[[Client]] --> "); \ - fprintf(stderr, __VA_ARGS__); \ - fprintf(stderr, "\n"); \ - } \ - } - -#define SERVER_DBG(...) \ - { \ - if (DebugLevel > 0) { \ - fprintf(stderr, "[[Server]] --> "); \ - fprintf(stderr, __VA_ARGS__); \ - fprintf(stderr, "\n"); \ - } \ - } - -namespace RemoteOffloading { - -using namespace openmp::libomptarget::remote; - -using openmp::libomptarget::remote::DeviceOffloadEntry; -using openmp::libomptarget::remote::TargetBinaryDescription; -using openmp::libomptarget::remote::TargetOffloadEntry; -using openmp::libomptarget::remote::TargetTable; - -struct ClientManagerConfigTy { - std::vector ServerAddresses; - uint64_t MaxSize; - uint64_t BlockSize; - int Timeout; - - ClientManagerConfigTy() - : ServerAddresses({"0.0.0.0:50051"}), MaxSize(1 << 30), - BlockSize(1 << 20), Timeout(5) { - // TODO: Error handle for incorrect inputs - if (const char *Env = std::getenv("LIBOMPTARGET_RPC_ADDRESS")) { - ServerAddresses.clear(); - std::string AddressString = Env; - const std::string Delimiter = ","; - - size_t Pos; - std::string Token; - while ((Pos = AddressString.find(Delimiter)) != std::string::npos) { - Token = AddressString.substr(0, Pos); - ServerAddresses.push_back(Token); - AddressString.erase(0, Pos + Delimiter.length()); - } - ServerAddresses.push_back(AddressString); - } - if (const char *Env = std::getenv("LIBOMPTARGET_RPC_ALLOCATOR_MAX")) - MaxSize = std::stoi(Env); - if (const char *Env = std::getenv("LIBOMPTARGET_RPC_BLOCK_SIZE")) - BlockSize = std::stoi(Env); - if (const char *Env1 = std::getenv("LIBOMPTARGET_RPC_LATENCY")) - Timeout = std::stoi(Env1); - } -}; - -/// Loads a target binary description into protobuf. -void loadTargetBinaryDescription(const __tgt_bin_desc *Desc, - TargetBinaryDescription &Request); - -/// Unload a target binary description from protobuf. The map is used to keep -/// track of already copied device images. -void unloadTargetBinaryDescription( - const TargetBinaryDescription *Request, __tgt_bin_desc *Desc, - std::unordered_map - &HostToRemoteDeviceImage); - -/// Frees argument as constructed by loadTargetBinaryDescription -void freeTargetBinaryDescription(__tgt_bin_desc *Desc); - -/// Copies from TargetOffloadEntry protobuf to a tgt_bin_desc during unloading. -void copyOffloadEntry(const TargetOffloadEntry &EntryResponse, - __tgt_offload_entry *Entry); - -/// Copies from tgt_bin_desc into TargetOffloadEntry protobuf during loading. -void copyOffloadEntry(const __tgt_offload_entry *Entry, - TargetOffloadEntry *EntryResponse); - -/// Shallow copy of offload entry from tgt_bin_desc to TargetOffloadEntry -/// during loading. -void shallowCopyOffloadEntry(const __tgt_offload_entry *Entry, - TargetOffloadEntry *EntryResponse); - -/// Copies DeviceOffloadEntries into table during unloading. -void copyOffloadEntry(const DeviceOffloadEntry &EntryResponse, - __tgt_offload_entry *Entry); - -/// Loads tgt_target_table into a TargetTable protobuf message. -void loadTargetTable(__tgt_target_table *Table, TargetTable &TableResponse, - __tgt_device_image *Image); - -/// Unloads from a target_table from protobuf. -void unloadTargetTable( - TargetTable &TableResponse, __tgt_target_table *Table, - std::unordered_map &HostToRemoteTargetTableMap); - -/// Frees argument as constructed by unloadTargetTable -void freeTargetTable(__tgt_target_table *Table); - -void dump(const void *Start, const void *End); -void dump(__tgt_offload_entry *Entry); -void dump(TargetOffloadEntry Entry); -void dump(__tgt_target_table *Table); -void dump(__tgt_device_image *Image); -} // namespace RemoteOffloading - -#endif diff --git a/openmp/libomptarget/plugins/remote/include/openmp.proto b/openmp/libomptarget/plugins/remote/include/openmp.proto deleted file mode 100644 --- a/openmp/libomptarget/plugins/remote/include/openmp.proto +++ /dev/null @@ -1,153 +0,0 @@ -syntax = "proto3"; - -package openmp.libomptarget.remote; -option cc_enable_arenas = true; - -service RemoteOffload { - rpc Shutdown(Null) returns (I32) {} - - rpc RegisterLib(TargetBinaryDescription) returns (I32) {} - rpc UnregisterLib(Pointer) returns (I32) {} - - rpc IsValidBinary(TargetDeviceImagePtr) returns (I32) {} - rpc GetNumberOfDevices(Null) returns (I32) {} - - rpc InitDevice(I32) returns (I32) {} - rpc InitRequires(I64) returns (I32) {} - - rpc LoadBinary(Binary) returns (TargetTable) {} - - rpc DataAlloc(AllocData) returns (Pointer) {} - rpc DataDelete(DeleteData) returns (I32) {} - - rpc DataSubmit(stream SubmitData) returns (I32) {} - rpc DataRetrieve(RetrieveData) returns (stream Data) {} - - rpc IsDataExchangeable(DevicePair) returns (I32) {} - rpc DataExchange(ExchangeData) returns (I32) {} - - rpc RunTargetRegion(TargetRegion) returns (I32) {} - rpc RunTargetTeamRegion(TargetTeamRegion) returns (I32) {} -} - -message Null {} - -message Pointer { uint64 number = 1; } - -message I32 { int32 number = 1; } - -message I64 { int64 number = 1; } - -message DevicePair { - int32 src_dev_id = 1; - int32 dst_dev_id = 2; -} - -message Binary { - uint64 image_ptr = 1; - int32 device_id = 2; -} - -message TargetOffloadEntry { - bytes data = 1; - string name = 2; - int32 flags = 3; - int32 reserved = 4; -} - -message DeviceOffloadEntry { - string name = 1; - uint64 addr = 2; - int32 flags = 3; - int32 reserved = 4; - int32 size = 5; -} - -message TargetTable { - repeated DeviceOffloadEntry entries = 1; - repeated uint64 entry_ptrs = 2; -} - -message TargetDeviceImagePtr { - uint64 image_ptr = 1; - repeated uint64 entry_ptrs = 2; -} - -message TargetDeviceImage { - bytes binary = 1; - repeated TargetOffloadEntry entries = 2; -} - -message ImagePtrs { - uint64 img_ptr = 1; - repeated uint64 entry_ptrs = 2; -} - -message TargetBinaryDescription { - repeated ImagePtrs image_ptrs = 1; - repeated TargetOffloadEntry entries = 2; - repeated TargetDeviceImage images = 3; - repeated uint64 entry_ptrs = 4; - uint64 bin_ptr = 5; -} - -message AllocData { - uint64 size = 1; - uint64 hst_ptr = 2; - int32 device_id = 3; -} - -message SubmitData { - bytes data = 1; - uint64 hst_ptr = 2; - uint64 tgt_ptr = 3; - uint64 start = 5; - uint64 size = 6; - int32 device_id = 7; -} - -message RetrieveData { - uint64 hst_ptr = 1; - uint64 tgt_ptr = 2; - uint64 size = 3; - int32 device_id = 5; -} - -message Data { - bytes data = 1; - uint64 start = 2; - uint64 size = 3; - int32 ret = 4; -} - -message ExchangeData { - uint64 src_dev_id = 1; - uint64 src_ptr = 2; - uint64 dst_dev_id = 3; - uint64 dst_ptr = 4; - uint64 size = 6; -} - -message DeleteData { - uint64 tgt_ptr = 1; - int32 device_id = 2; -} - -message TargetRegion { - repeated uint64 tgt_args = 1; - repeated int64 tgt_offsets = 2; - uint64 tgt_entry_ptr = 3; - int32 device_id = 4; - int32 arg_num = 5; -} - -message TargetTeamRegion { - repeated uint64 tgt_args = 1; - repeated int64 tgt_offsets = 2; - uint64 tgt_entry_ptr = 3; - uint64 loop_tripcount = 4; - int32 device_id = 5; - int32 arg_num = 6; - int32 team_num = 7; - int32 thread_limit = 8; -} diff --git a/openmp/libomptarget/plugins/remote/lib/Utils.cpp b/openmp/libomptarget/plugins/remote/lib/Utils.cpp deleted file mode 100644 --- a/openmp/libomptarget/plugins/remote/lib/Utils.cpp +++ /dev/null @@ -1,295 +0,0 @@ -//===---------------- Utils.cpp - Utilities for Remote RTL ----------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Utilities for data movement and debugging. -// -//===----------------------------------------------------------------------===// - -#include "Utils.h" -#include "omptarget.h" - -namespace RemoteOffloading { - -void loadTargetBinaryDescription(const __tgt_bin_desc *Desc, - TargetBinaryDescription &Request) { - // Keeps track of entries which have already been deep copied. - std::vector DeepCopiedEntryAddrs; - - // Copy Global Offload Entries - for (auto *CurEntry = Desc->HostEntriesBegin; - CurEntry != Desc->HostEntriesEnd; CurEntry++) { - auto *NewEntry = Request.add_entries(); - copyOffloadEntry(CurEntry, NewEntry); - - // Copy the pointer of the offload entry of the image into the Request - Request.add_entry_ptrs((uint64_t)CurEntry); - DeepCopiedEntryAddrs.push_back(CurEntry); - } - - // Copy Device Images and Device Offload Entries - __tgt_device_image *CurImage = Desc->DeviceImages; - for (auto I = 0; I < Desc->NumDeviceImages; I++, CurImage++) { - auto *Image = Request.add_images(); - auto Size = (char *)CurImage->ImageEnd - (char *)CurImage->ImageStart; - Image->set_binary(CurImage->ImageStart, Size); - - // Copy the pointer of the image into the Request - auto *NewImagePtr = Request.add_image_ptrs(); - NewImagePtr->set_img_ptr((uint64_t)CurImage->ImageStart); - - // Copy Device Offload Entries - for (auto *CurEntry = CurImage->EntriesBegin; - CurEntry != CurImage->EntriesEnd; CurEntry++) { - auto *NewEntry = Image->add_entries(); - - auto Entry = std::find(DeepCopiedEntryAddrs.begin(), - DeepCopiedEntryAddrs.end(), CurEntry); - if (Entry != DeepCopiedEntryAddrs.end()) { - // Offload entry has already been loaded - shallowCopyOffloadEntry(CurEntry, NewEntry); - } else { // Offload Entry has not been loaded into the Request - copyOffloadEntry(CurEntry, NewEntry); - DeepCopiedEntryAddrs.push_back(CurEntry); - } - - // Copy the pointer of the offload entry of the image into the Request - NewImagePtr->add_entry_ptrs((uint64_t)CurEntry); - } - } -} - -void unloadTargetBinaryDescription( - const TargetBinaryDescription *Request, __tgt_bin_desc *Desc, - std::unordered_map - &HostToRemoteDeviceImage) { - std::unordered_map CopiedOffloadEntries; - Desc->NumDeviceImages = Request->images_size(); - Desc->DeviceImages = new __tgt_device_image[Desc->NumDeviceImages]; - - if (Request->entries_size()) - Desc->HostEntriesBegin = new __tgt_offload_entry[Request->entries_size()]; - else { - Desc->HostEntriesBegin = nullptr; - Desc->HostEntriesEnd = nullptr; - } - - // Copy Global Offload Entries - __tgt_offload_entry *CurEntry = Desc->HostEntriesBegin; - size_t I = 0; - for (auto &Entry : Request->entries()) { - copyOffloadEntry(Entry, CurEntry); - CopiedOffloadEntries[(void *)Request->entry_ptrs()[I]] = CurEntry; - CurEntry++; - I++; - } - Desc->HostEntriesEnd = CurEntry; - - // Copy Device Images and Device Offload Entries - __tgt_device_image *CurImage = Desc->DeviceImages; - auto ImageItr = Request->image_ptrs().begin(); - for (auto Image : Request->images()) { - // Copy Device Offload Entries - CurEntry = Desc->HostEntriesBegin; - bool Found = false; - - if (!Desc->HostEntriesBegin) { - CurImage->EntriesBegin = nullptr; - CurImage->EntriesEnd = nullptr; - } - - for (size_t I = 0; I < Image.entries_size(); I++) { - auto TgtEntry = - CopiedOffloadEntries.find((void *)Request->entry_ptrs()[I]); - if (TgtEntry != CopiedOffloadEntries.end()) { - if (!Found) - CurImage->EntriesBegin = CurEntry; - - CurImage->EntriesEnd = CurEntry + 1; - Found = true; - } else { - Found = false; - copyOffloadEntry(Image.entries()[I], CurEntry); - CopiedOffloadEntries[(void *)(Request->entry_ptrs()[I])] = CurEntry; - } - CurEntry++; - } - - // Copy Device Image - CurImage->ImageStart = new uint8_t[Image.binary().size()]; - memcpy(CurImage->ImageStart, - static_cast(Image.binary().data()), - Image.binary().size()); - CurImage->ImageEnd = - (void *)((char *)CurImage->ImageStart + Image.binary().size()); - - HostToRemoteDeviceImage[(void *)ImageItr->img_ptr()] = CurImage; - CurImage++; - ImageItr++; - } -} - -void freeTargetBinaryDescription(__tgt_bin_desc *Desc) { - __tgt_device_image *CurImage = Desc->DeviceImages; - for (auto I = 0; I < Desc->NumDeviceImages; I++, CurImage++) - delete[](uint64_t *) CurImage->ImageStart; - - delete[] Desc->DeviceImages; - - for (auto *Entry = Desc->HostEntriesBegin; Entry != Desc->HostEntriesEnd; - Entry++) { - free(Entry->name); - free(Entry->addr); - } - - delete[] Desc->HostEntriesBegin; -} - -void freeTargetTable(__tgt_target_table *Table) { - for (auto *Entry = Table->EntriesBegin; Entry != Table->EntriesEnd; Entry++) - free(Entry->name); - - delete[] Table->EntriesBegin; -} - -void loadTargetTable(__tgt_target_table *Table, TargetTable &TableResponse, - __tgt_device_image *Image) { - auto *ImageEntry = Image->EntriesBegin; - for (__tgt_offload_entry *CurEntry = Table->EntriesBegin; - CurEntry != Table->EntriesEnd; CurEntry++, ImageEntry++) { - // TODO: This can probably be trimmed substantially. - auto *NewEntry = TableResponse.add_entries(); - NewEntry->set_name(CurEntry->name); - NewEntry->set_addr((uint64_t)CurEntry->addr); - NewEntry->set_flags(CurEntry->flags); - NewEntry->set_reserved(CurEntry->reserved); - NewEntry->set_size(CurEntry->size); - TableResponse.add_entry_ptrs((int64_t)CurEntry); - } -} - -void unloadTargetTable( - TargetTable &TableResponse, __tgt_target_table *Table, - std::unordered_map &HostToRemoteTargetTableMap) { - Table->EntriesBegin = new __tgt_offload_entry[TableResponse.entries_size()]; - - auto *CurEntry = Table->EntriesBegin; - for (size_t I = 0; I < TableResponse.entries_size(); I++) { - copyOffloadEntry(TableResponse.entries()[I], CurEntry); - HostToRemoteTargetTableMap[CurEntry->addr] = - (void *)TableResponse.entry_ptrs()[I]; - CurEntry++; - } - Table->EntriesEnd = CurEntry; -} - -void copyOffloadEntry(const TargetOffloadEntry &EntryResponse, - __tgt_offload_entry *Entry) { - Entry->name = strdup(EntryResponse.name().c_str()); - Entry->reserved = EntryResponse.reserved(); - Entry->flags = EntryResponse.flags(); - Entry->addr = strdup(EntryResponse.data().c_str()); - Entry->size = EntryResponse.data().size(); -} - -void copyOffloadEntry(const DeviceOffloadEntry &EntryResponse, - __tgt_offload_entry *Entry) { - Entry->name = strdup(EntryResponse.name().c_str()); - Entry->reserved = EntryResponse.reserved(); - Entry->flags = EntryResponse.flags(); - Entry->addr = (void *)EntryResponse.addr(); - Entry->size = EntryResponse.size(); -} - -/// We shallow copy with just the name because it is a convenient identifier, we -/// do actually just match off of the address. -void shallowCopyOffloadEntry(const __tgt_offload_entry *Entry, - TargetOffloadEntry *EntryResponse) { - EntryResponse->set_name(Entry->name); -} - -void copyOffloadEntry(const __tgt_offload_entry *Entry, - TargetOffloadEntry *EntryResponse) { - shallowCopyOffloadEntry(Entry, EntryResponse); - EntryResponse->set_reserved(Entry->reserved); - EntryResponse->set_flags(Entry->flags); - EntryResponse->set_data(Entry->addr, Entry->size); -} - -/// Dumps the memory region from Start to End in order to debug memory transfer -/// errors within the plugin -void dump(const void *Start, const void *End) { - unsigned char Line[17]; - const unsigned char *PrintCharacter = (const unsigned char *)Start; - - unsigned int I = 0; - for (; I < ((const int *)End - (const int *)Start); I++) { - if ((I % 16) == 0) { - if (I != 0) - printf(" %s\n", Line); - - printf(" %04x ", I); - } - - printf(" %02x", PrintCharacter[I]); - - if ((PrintCharacter[I] < 0x20) || (PrintCharacter[I] > 0x7e)) - Line[I % 16] = '.'; - else - Line[I % 16] = PrintCharacter[I]; - - Line[(I % 16) + 1] = '\0'; - } - - while ((I % 16) != 0) { - printf(" "); - I++; - } - - printf(" %s\n", Line); -} - -void dump(__tgt_offload_entry *Entry) { - fprintf(stderr, "Entry (%p):\n", (void *)Entry); - fprintf(stderr, " Name: %s (%p)\n", Entry->name, (void *)&Entry->name); - fprintf(stderr, " Reserved: %d (%p)\n", Entry->reserved, - (void *)&Entry->reserved); - fprintf(stderr, " Flags: %d (%p)\n", Entry->flags, (void *)&Entry->flags); - fprintf(stderr, " Addr: %p\n", Entry->addr); - fprintf(stderr, " Size: %lu\n", Entry->size); -} - -void dump(__tgt_target_table *Table) { - for (auto *CurEntry = Table->EntriesBegin; CurEntry != Table->EntriesEnd; - CurEntry++) - dump(CurEntry); -} - -void dump(TargetOffloadEntry Entry) { - fprintf(stderr, "Entry: "); - fprintf(stderr, " Name: %s\n", Entry.name().c_str()); - fprintf(stderr, " Reserved: %d\n", Entry.reserved()); - fprintf(stderr, " Flags: %d\n", Entry.flags()); - fprintf(stderr, " Size: %ld\n", Entry.data().size()); - dump(static_cast(Entry.data().data()), - static_cast((Entry.data().c_str() + Entry.data().size()))); -} - -void dump(__tgt_device_image *Image) { - dump(Image->ImageStart, Image->ImageEnd); - __tgt_offload_entry *EntryItr = Image->EntriesBegin; - for (; EntryItr != Image->EntriesEnd; EntryItr++) - dump(EntryItr); -} - -void dump(std::unordered_map &Map) { - fprintf(stderr, "Host to Remote Entry Map:\n"); - for (auto Entry : Map) - fprintf(stderr, " Host (%p) -> Tgt (%p): Addr((%p))\n", Entry.first, - (void *)Entry.second, (void *)Entry.second->addr); -} -} // namespace RemoteOffloading diff --git a/openmp/libomptarget/plugins/remote/server/CMakeLists.txt b/openmp/libomptarget/plugins/remote/server/CMakeLists.txt deleted file mode 100644 --- a/openmp/libomptarget/plugins/remote/server/CMakeLists.txt +++ /dev/null @@ -1,32 +0,0 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Build server for remote offloading. -# -##===----------------------------------------------------------------------===## - -include_directories(${LIBOMPTARGET_SRC_DIR}) -include_directories(${LIBOMPTARGET_INCLUDE_DIR}) -include_directories(${GRPC_INCLUDE_DIR}) -include_directories(${RPC_INCLUDE_DIR}) - -add_executable(openmp-offloading-server - ${LIBOMPTARGET_SRC_FILES} - ${GRPC_SRC_FILES} - ${RPC_SRC_DIR}/Utils.cpp - Server.cpp - OffloadingServer.cpp -) - -target_link_libraries(openmp-offloading-server - grpc++ - protobuf - absl::synchronization - ${OPENMP_PTHREAD_LIB} - omp - "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../../exports") diff --git a/openmp/libomptarget/plugins/remote/server/OffloadingServer.cpp b/openmp/libomptarget/plugins/remote/server/OffloadingServer.cpp deleted file mode 100644 --- a/openmp/libomptarget/plugins/remote/server/OffloadingServer.cpp +++ /dev/null @@ -1,51 +0,0 @@ -//===------------- OffloadingServer.cpp - Server Application --------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Offloading server for remote host. -// -//===----------------------------------------------------------------------===// - -#include -#include -#include -#include -#include - -#include "Server.h" - -using grpc::Server; -using grpc::ServerBuilder; - -std::promise ShutdownPromise; - -int main() { - ClientManagerConfigTy Config; - - RemoteOffloadImpl Service(Config.MaxSize, Config.BlockSize); - - ServerBuilder Builder; - Builder.AddListeningPort(Config.ServerAddresses[0], - grpc::InsecureServerCredentials()); - Builder.RegisterService(&Service); - Builder.SetMaxMessageSize(INT_MAX); - std::unique_ptr Server(Builder.BuildAndStart()); - if (getDebugLevel()) - std::cerr << "Server listening on " << Config.ServerAddresses[0] - << std::endl; - - auto WaitForServer = [&]() { Server->Wait(); }; - - std::thread ServerThread(WaitForServer); - - auto ShutdownFuture = ShutdownPromise.get_future(); - ShutdownFuture.wait(); - Server->Shutdown(); - ServerThread.join(); - - return 0; -} diff --git a/openmp/libomptarget/plugins/remote/server/Server.h b/openmp/libomptarget/plugins/remote/server/Server.h deleted file mode 100644 --- a/openmp/libomptarget/plugins/remote/server/Server.h +++ /dev/null @@ -1,106 +0,0 @@ -//===-------------------------- Server.h - Server -------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Offloading gRPC server for remote host. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_REMOTE_SERVER_SERVER_H -#define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_REMOTE_SERVER_SERVER_H - -#include - -#include "Utils.h" -#include "device.h" -#include "omptarget.h" -#include "openmp.grpc.pb.h" -#include "openmp.pb.h" -#include "rtl.h" - -using grpc::ServerContext; -using grpc::ServerReader; -using grpc::ServerWriter; -using grpc::Status; - -using namespace openmp::libomptarget::remote; -using namespace RemoteOffloading; - -using namespace google; - -extern PluginManager *PM; - -class RemoteOffloadImpl final : public RemoteOffload::Service { -private: - int32_t mapHostRTLDeviceId(int32_t RTLDeviceID); - - std::unordered_map - HostToRemoteDeviceImage; - std::unordered_map> - Descriptions; - __tgt_target_table *Table = nullptr; - - int DebugLevel; - uint64_t MaxSize; - uint64_t BlockSize; - std::unique_ptr Arena; - -public: - RemoteOffloadImpl(uint64_t MaxSize, uint64_t BlockSize) - : MaxSize(MaxSize), BlockSize(BlockSize) { - DebugLevel = getDebugLevel(); - Arena = std::make_unique(); - } - - Status Shutdown(ServerContext *Context, const Null *Request, - I32 *Reply) override; - - Status RegisterLib(ServerContext *Context, - const TargetBinaryDescription *Description, - I32 *Reply) override; - Status UnregisterLib(ServerContext *Context, const Pointer *Request, - I32 *Reply) override; - - Status IsValidBinary(ServerContext *Context, - const TargetDeviceImagePtr *Image, - I32 *IsValid) override; - Status GetNumberOfDevices(ServerContext *Context, const Null *Null, - I32 *NumberOfDevices) override; - - Status InitDevice(ServerContext *Context, const I32 *DeviceNum, - I32 *Reply) override; - Status InitRequires(ServerContext *Context, const I64 *RequiresFlag, - I32 *Reply) override; - - Status LoadBinary(ServerContext *Context, const Binary *Binary, - TargetTable *Reply) override; - Status IsDataExchangeable(ServerContext *Context, const DevicePair *Request, - I32 *Reply) override; - - Status DataAlloc(ServerContext *Context, const AllocData *Request, - Pointer *Reply) override; - - Status DataSubmit(ServerContext *Context, ServerReader *Reader, - I32 *Reply) override; - Status DataRetrieve(ServerContext *Context, const RetrieveData *Request, - ServerWriter *Writer) override; - - Status DataExchange(ServerContext *Context, const ExchangeData *Request, - I32 *Reply) override; - - Status DataDelete(ServerContext *Context, const DeleteData *Request, - I32 *Reply) override; - - Status RunTargetRegion(ServerContext *Context, const TargetRegion *Request, - I32 *Reply) override; - - Status RunTargetTeamRegion(ServerContext *Context, - const TargetTeamRegion *Request, - I32 *Reply) override; -}; - -#endif diff --git a/openmp/libomptarget/plugins/remote/server/Server.cpp b/openmp/libomptarget/plugins/remote/server/Server.cpp deleted file mode 100644 --- a/openmp/libomptarget/plugins/remote/server/Server.cpp +++ /dev/null @@ -1,352 +0,0 @@ -//===----------------- Server.cpp - Server Implementation -----------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Offloading gRPC server for remote host. -// -//===----------------------------------------------------------------------===// - -#include -#include - -#include "Server.h" -#include "omptarget.h" -#include "openmp.grpc.pb.h" -#include "openmp.pb.h" - -using grpc::WriteOptions; - -extern std::promise ShutdownPromise; - -Status RemoteOffloadImpl::Shutdown(ServerContext *Context, const Null *Request, - I32 *Reply) { - SERVER_DBG("Shutting down the server") - - Reply->set_number(0); - ShutdownPromise.set_value(); - return Status::OK; -} - -Status -RemoteOffloadImpl::RegisterLib(ServerContext *Context, - const TargetBinaryDescription *Description, - I32 *Reply) { - auto Desc = std::make_unique<__tgt_bin_desc>(); - - unloadTargetBinaryDescription(Description, Desc.get(), - HostToRemoteDeviceImage); - PM->RTLs.RegisterLib(Desc.get()); - - if (Descriptions.find((void *)Description->bin_ptr()) != Descriptions.end()) - freeTargetBinaryDescription( - Descriptions[(void *)Description->bin_ptr()].get()); - else - Descriptions[(void *)Description->bin_ptr()] = std::move(Desc); - - SERVER_DBG("Registered library") - Reply->set_number(0); - return Status::OK; -} - -Status RemoteOffloadImpl::UnregisterLib(ServerContext *Context, - const Pointer *Request, I32 *Reply) { - if (Descriptions.find((void *)Request->number()) == Descriptions.end()) { - Reply->set_number(1); - return Status::OK; - } - - PM->RTLs.UnregisterLib(Descriptions[(void *)Request->number()].get()); - freeTargetBinaryDescription(Descriptions[(void *)Request->number()].get()); - Descriptions.erase((void *)Request->number()); - - SERVER_DBG("Unregistered library") - Reply->set_number(0); - return Status::OK; -} - -Status RemoteOffloadImpl::IsValidBinary(ServerContext *Context, - const TargetDeviceImagePtr *DeviceImage, - I32 *IsValid) { - __tgt_device_image *Image = - HostToRemoteDeviceImage[(void *)DeviceImage->image_ptr()]; - - IsValid->set_number(0); - - for (auto &RTL : PM->RTLs.AllRTLs) - if (auto Ret = RTL.is_valid_binary(Image)) { - IsValid->set_number(Ret); - break; - } - - SERVER_DBG("Checked if binary (%p) is valid", - (void *)(DeviceImage->image_ptr())) - return Status::OK; -} - -Status RemoteOffloadImpl::GetNumberOfDevices(ServerContext *Context, - const Null *Null, - I32 *NumberOfDevices) { - int32_t Devices = 0; - PM->RTLsMtx.lock(); - for (auto &RTL : PM->RTLs.AllRTLs) - Devices += RTL.NumberOfDevices; - PM->RTLsMtx.unlock(); - - NumberOfDevices->set_number(Devices); - - SERVER_DBG("Got number of devices") - return Status::OK; -} - -Status RemoteOffloadImpl::InitDevice(ServerContext *Context, - const I32 *DeviceNum, I32 *Reply) { - Reply->set_number(PM->Devices[DeviceNum->number()]->RTL->init_device( - mapHostRTLDeviceId(DeviceNum->number()))); - - SERVER_DBG("Initialized device %d", DeviceNum->number()) - return Status::OK; -} - -Status RemoteOffloadImpl::InitRequires(ServerContext *Context, - const I64 *RequiresFlag, I32 *Reply) { - for (auto &Device : PM->Devices) - if (Device->RTL->init_requires) - Device->RTL->init_requires(RequiresFlag->number()); - Reply->set_number(RequiresFlag->number()); - - SERVER_DBG("Initialized requires for devices") - return Status::OK; -} - -Status RemoteOffloadImpl::LoadBinary(ServerContext *Context, - const Binary *Binary, TargetTable *Reply) { - __tgt_device_image *Image = - HostToRemoteDeviceImage[(void *)Binary->image_ptr()]; - - Table = PM->Devices[Binary->device_id()]->RTL->load_binary( - mapHostRTLDeviceId(Binary->device_id()), Image); - if (Table) - loadTargetTable(Table, *Reply, Image); - - SERVER_DBG("Loaded binary (%p) to device %d", (void *)Binary->image_ptr(), - Binary->device_id()) - return Status::OK; -} - -Status RemoteOffloadImpl::IsDataExchangeable(ServerContext *Context, - const DevicePair *Request, - I32 *Reply) { - Reply->set_number(-1); - if (PM->Devices[mapHostRTLDeviceId(Request->src_dev_id())] - ->RTL->is_data_exchangable) - Reply->set_number(PM->Devices[mapHostRTLDeviceId(Request->src_dev_id())] - ->RTL->is_data_exchangable(Request->src_dev_id(), - Request->dst_dev_id())); - - SERVER_DBG("Checked if data exchangeable between device %d and device %d", - Request->src_dev_id(), Request->dst_dev_id()) - return Status::OK; -} - -Status RemoteOffloadImpl::DataAlloc(ServerContext *Context, - const AllocData *Request, Pointer *Reply) { - uint64_t TgtPtr = - (uint64_t)PM->Devices[Request->device_id()]->RTL->data_alloc( - mapHostRTLDeviceId(Request->device_id()), Request->size(), - (void *)Request->hst_ptr(), TARGET_ALLOC_DEFAULT); - Reply->set_number(TgtPtr); - - SERVER_DBG("Allocated at " DPxMOD "", DPxPTR((void *)TgtPtr)) - - return Status::OK; -} - -Status RemoteOffloadImpl::DataSubmit(ServerContext *Context, - ServerReader *Reader, - I32 *Reply) { - SubmitData Request; - uint8_t *HostCopy = nullptr; - while (Reader->Read(&Request)) { - if (Request.start() == 0 && Request.size() == Request.data().size()) { - Reader->SendInitialMetadata(); - - Reply->set_number(PM->Devices[Request.device_id()]->RTL->data_submit( - mapHostRTLDeviceId(Request.device_id()), (void *)Request.tgt_ptr(), - (void *)Request.data().data(), Request.data().size())); - - SERVER_DBG("Submitted %lu bytes async to (%p) on device %d", - Request.data().size(), (void *)Request.tgt_ptr(), - Request.device_id()) - - return Status::OK; - } - if (!HostCopy) { - HostCopy = new uint8_t[Request.size()]; - Reader->SendInitialMetadata(); - } - - memcpy((void *)((char *)HostCopy + Request.start()), Request.data().data(), - Request.data().size()); - } - - Reply->set_number(PM->Devices[Request.device_id()]->RTL->data_submit( - mapHostRTLDeviceId(Request.device_id()), (void *)Request.tgt_ptr(), - HostCopy, Request.size())); - - delete[] HostCopy; - - SERVER_DBG("Submitted %lu bytes to (%p) on device %d", Request.data().size(), - (void *)Request.tgt_ptr(), Request.device_id()) - - return Status::OK; -} - -Status RemoteOffloadImpl::DataRetrieve(ServerContext *Context, - const RetrieveData *Request, - ServerWriter *Writer) { - auto HstPtr = std::make_unique(Request->size()); - - auto Ret = PM->Devices[Request->device_id()]->RTL->data_retrieve( - mapHostRTLDeviceId(Request->device_id()), HstPtr.get(), - (void *)Request->tgt_ptr(), Request->size()); - - if (Arena->SpaceAllocated() >= MaxSize) - Arena->Reset(); - - if (Request->size() > BlockSize) { - uint64_t Start = 0, End = BlockSize; - for (auto I = 0; I < ceil((float)Request->size() / BlockSize); I++) { - auto *Reply = protobuf::Arena::CreateMessage(Arena.get()); - - Reply->set_start(Start); - Reply->set_size(Request->size()); - Reply->set_data((char *)HstPtr.get() + Start, End - Start); - Reply->set_ret(Ret); - - if (!Writer->Write(*Reply)) { - CLIENT_DBG("Broken stream when submitting data") - } - - SERVER_DBG("Retrieved %lu-%lu/%lu bytes from (%p) on device %d", Start, - End, Request->size(), (void *)Request->tgt_ptr(), - mapHostRTLDeviceId(Request->device_id())) - - Start += BlockSize; - End += BlockSize; - if (End >= Request->size()) - End = Request->size(); - } - } else { - auto *Reply = protobuf::Arena::CreateMessage(Arena.get()); - - Reply->set_start(0); - Reply->set_size(Request->size()); - Reply->set_data((char *)HstPtr.get(), Request->size()); - Reply->set_ret(Ret); - - SERVER_DBG("Retrieved %lu bytes from (%p) on device %d", Request->size(), - (void *)Request->tgt_ptr(), - mapHostRTLDeviceId(Request->device_id())) - - Writer->WriteLast(*Reply, WriteOptions()); - } - - return Status::OK; -} - -Status RemoteOffloadImpl::DataExchange(ServerContext *Context, - const ExchangeData *Request, - I32 *Reply) { - if (PM->Devices[Request->src_dev_id()]->RTL->data_exchange) { - int32_t Ret = PM->Devices[Request->src_dev_id()]->RTL->data_exchange( - mapHostRTLDeviceId(Request->src_dev_id()), (void *)Request->src_ptr(), - mapHostRTLDeviceId(Request->dst_dev_id()), (void *)Request->dst_ptr(), - Request->size()); - Reply->set_number(Ret); - } else - Reply->set_number(-1); - - SERVER_DBG( - "Exchanged data asynchronously from device %d (%p) to device %d (%p) of " - "size %lu", - mapHostRTLDeviceId(Request->src_dev_id()), (void *)Request->src_ptr(), - mapHostRTLDeviceId(Request->dst_dev_id()), (void *)Request->dst_ptr(), - Request->size()) - return Status::OK; -} - -Status RemoteOffloadImpl::DataDelete(ServerContext *Context, - const DeleteData *Request, I32 *Reply) { - auto Ret = PM->Devices[Request->device_id()]->RTL->data_delete( - mapHostRTLDeviceId(Request->device_id()), (void *)Request->tgt_ptr()); - Reply->set_number(Ret); - - SERVER_DBG("Deleted data from (%p) on device %d", (void *)Request->tgt_ptr(), - mapHostRTLDeviceId(Request->device_id())) - return Status::OK; -} - -Status RemoteOffloadImpl::RunTargetRegion(ServerContext *Context, - const TargetRegion *Request, - I32 *Reply) { - std::vector TgtArgs(Request->arg_num()); - for (auto I = 0; I < Request->arg_num(); I++) - TgtArgs[I] = (uint64_t)Request->tgt_args()[I]; - - std::vector TgtOffsets(Request->arg_num()); - const auto *TgtOffsetItr = Request->tgt_offsets().begin(); - for (auto I = 0; I < Request->arg_num(); I++, TgtOffsetItr++) - TgtOffsets[I] = (ptrdiff_t)*TgtOffsetItr; - - void *TgtEntryPtr = ((__tgt_offload_entry *)Request->tgt_entry_ptr())->addr; - - int32_t Ret = PM->Devices[Request->device_id()]->RTL->run_region( - mapHostRTLDeviceId(Request->device_id()), TgtEntryPtr, - (void **)TgtArgs.data(), TgtOffsets.data(), Request->arg_num()); - - Reply->set_number(Ret); - - SERVER_DBG("Ran TargetRegion on device %d with %d args", - mapHostRTLDeviceId(Request->device_id()), Request->arg_num()) - return Status::OK; -} - -Status RemoteOffloadImpl::RunTargetTeamRegion(ServerContext *Context, - const TargetTeamRegion *Request, - I32 *Reply) { - std::vector TgtArgs(Request->arg_num()); - for (auto I = 0; I < Request->arg_num(); I++) - TgtArgs[I] = (uint64_t)Request->tgt_args()[I]; - - std::vector TgtOffsets(Request->arg_num()); - const auto *TgtOffsetItr = Request->tgt_offsets().begin(); - for (auto I = 0; I < Request->arg_num(); I++, TgtOffsetItr++) - TgtOffsets[I] = (ptrdiff_t)*TgtOffsetItr; - - void *TgtEntryPtr = ((__tgt_offload_entry *)Request->tgt_entry_ptr())->addr; - - int32_t Ret = PM->Devices[Request->device_id()]->RTL->run_team_region( - mapHostRTLDeviceId(Request->device_id()), TgtEntryPtr, - (void **)TgtArgs.data(), TgtOffsets.data(), Request->arg_num(), - Request->team_num(), Request->thread_limit(), Request->loop_tripcount()); - - Reply->set_number(Ret); - - SERVER_DBG("Ran TargetTeamRegion on device %d with %d args", - mapHostRTLDeviceId(Request->device_id()), Request->arg_num()) - return Status::OK; -} - -int32_t RemoteOffloadImpl::mapHostRTLDeviceId(int32_t RTLDeviceID) { - for (auto &RTL : PM->RTLs.UsedRTLs) { - if (RTLDeviceID - RTL->NumberOfDevices >= 0) - RTLDeviceID -= RTL->NumberOfDevices; - else - break; - } - return RTLDeviceID; -} diff --git a/openmp/libomptarget/plugins/remote/src/CMakeLists.txt b/openmp/libomptarget/plugins/remote/src/CMakeLists.txt deleted file mode 100644 --- a/openmp/libomptarget/plugins/remote/src/CMakeLists.txt +++ /dev/null @@ -1,43 +0,0 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Build a plugin for remote offloading. -# -##===----------------------------------------------------------------------===## - -cmake_minimum_required(VERSION 3.13.4) - -# Define the suffix for the runtime messaging dumps. -add_definitions(-DTARGET_NAME=RPC) - -include_directories(${LIBOMPTARGET_SRC_DIR}) -include_directories(${LIBOMPTARGET_INCLUDE_DIR}) -include_directories(${GRPC_INCLUDE_DIR}) -include_directories(${RPC_INCLUDE_DIR}) - -add_library(omptarget.rtl.rpc SHARED - ${LIBOMPTARGET_SRC_FILES} - ${GRPC_SRC_FILES} - ${RPC_SRC_DIR}/Utils.cpp - Client.cpp - rtl.cpp -) - -# Install plugin under the lib destination folder. -install(TARGETS omptarget.rtl.rpc LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") - -target_link_libraries(omptarget.rtl.rpc - grpc++ - protobuf - absl::synchronization - ${OPENMP_PTHREAD_LIB} - omp - "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../../exports") - -# Report to the parent scope that we are building a plugin for RPC. -set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} rpc" PARENT_SCOPE) diff --git a/openmp/libomptarget/plugins/remote/src/Client.h b/openmp/libomptarget/plugins/remote/src/Client.h deleted file mode 100644 --- a/openmp/libomptarget/plugins/remote/src/Client.h +++ /dev/null @@ -1,153 +0,0 @@ -//===------------------ Client.h - Client Implementation ------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// gRPC Client for the remote plugin. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_REMOTE_SRC_CLIENT_H -#define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_REMOTE_SRC_CLIENT_H - -#include "Utils.h" -#include "omptarget.h" -#include -#include -#include -#include -#include -#include -#include - -using grpc::Channel; -using openmp::libomptarget::remote::RemoteOffload; -using namespace RemoteOffloading; - -using namespace google; - -class RemoteOffloadClient { - int DebugLevel; - const int Timeout; - const uint64_t MaxSize; - const int64_t BlockSize; - - std::unique_ptr Stub; - std::unique_ptr Arena; - - std::unique_ptr ArenaAllocatorLock; - - std::map> RemoteEntries; - std::map> DevicesToTables; - - template - auto remoteCall(Fn1 Preprocessor, Fn2 Postprocessor, TReturn ErrorValue, - bool CanTimeOut = true); - -public: - RemoteOffloadClient(std::shared_ptr Channel, int Timeout, - uint64_t MaxSize, int64_t BlockSize) - : Timeout(Timeout), MaxSize(MaxSize), BlockSize(BlockSize), - Stub(RemoteOffload::NewStub(Channel)) { - DebugLevel = getDebugLevel(); - Arena = std::make_unique(); - ArenaAllocatorLock = std::make_unique(); - } - - RemoteOffloadClient(RemoteOffloadClient &&C) = default; - - ~RemoteOffloadClient() { - for (auto &TableIt : DevicesToTables) - freeTargetTable(TableIt.second.get()); - } - - int32_t shutdown(void); - - int32_t registerLib(__tgt_bin_desc *Desc); - int32_t unregisterLib(__tgt_bin_desc *Desc); - - int32_t isValidBinary(__tgt_device_image *Image); - int32_t getNumberOfDevices(); - - int32_t initDevice(int32_t DeviceId); - int32_t initRequires(int64_t RequiresFlags); - - __tgt_target_table *loadBinary(int32_t DeviceId, __tgt_device_image *Image); - - void *dataAlloc(int32_t DeviceId, int64_t Size, void *HstPtr); - int32_t dataDelete(int32_t DeviceId, void *TgtPtr); - - int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, - int64_t Size); - int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, - int64_t Size); - - int32_t isDataExchangeable(int32_t SrcDevId, int32_t DstDevId); - int32_t dataExchange(int32_t SrcDevId, void *SrcPtr, int32_t DstDevId, - void *DstPtr, int64_t Size); - - int32_t runTargetRegion(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, - ptrdiff_t *TgtOffsets, int32_t ArgNum); - int32_t runTargetTeamRegion(int32_t DeviceId, void *TgtEntryPtr, - void **TgtArgs, ptrdiff_t *TgtOffsets, - int32_t ArgNum, int32_t TeamNum, - int32_t ThreadLimit, uint64_t LoopTripCount); -}; - -class RemoteClientManager { -private: - std::vector Clients; - std::vector Devices; - - std::pair mapDeviceId(int32_t DeviceId); - int DebugLevel; - -public: - RemoteClientManager() { - ClientManagerConfigTy Config; - - grpc::ChannelArguments ChArgs; - ChArgs.SetMaxReceiveMessageSize(-1); - DebugLevel = getDebugLevel(); - for (auto Address : Config.ServerAddresses) { - Clients.push_back(RemoteOffloadClient( - grpc::CreateChannel(Address, grpc::InsecureChannelCredentials()), - Config.Timeout, Config.MaxSize, Config.BlockSize)); - } - } - - int32_t shutdown(void); - - int32_t registerLib(__tgt_bin_desc *Desc); - int32_t unregisterLib(__tgt_bin_desc *Desc); - - int32_t isValidBinary(__tgt_device_image *Image); - int32_t getNumberOfDevices(); - - int32_t initDevice(int32_t DeviceId); - int32_t initRequires(int64_t RequiresFlags); - - __tgt_target_table *loadBinary(int32_t DeviceId, __tgt_device_image *Image); - - void *dataAlloc(int32_t DeviceId, int64_t Size, void *HstPtr); - int32_t dataDelete(int32_t DeviceId, void *TgtPtr); - - int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, - int64_t Size); - int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, - int64_t Size); - - int32_t isDataExchangeable(int32_t SrcDevId, int32_t DstDevId); - int32_t dataExchange(int32_t SrcDevId, void *SrcPtr, int32_t DstDevId, - void *DstPtr, int64_t Size); - - int32_t runTargetTeamRegion(int32_t DeviceId, void *TgtEntryPtr, - void **TgtArgs, ptrdiff_t *TgtOffsets, - int32_t ArgNum, int32_t TeamNum, - int32_t ThreadLimit, uint64_t LoopTripCount); -}; - -#endif diff --git a/openmp/libomptarget/plugins/remote/src/Client.cpp b/openmp/libomptarget/plugins/remote/src/Client.cpp deleted file mode 100644 --- a/openmp/libomptarget/plugins/remote/src/Client.cpp +++ /dev/null @@ -1,711 +0,0 @@ -//===----------------- Client.cpp - Client Implementation -----------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// gRPC (Client) for the remote plugin. -// -//===----------------------------------------------------------------------===// - -#include - -#include "Client.h" -#include "omptarget.h" -#include "openmp.pb.h" - -using namespace std::chrono; - -using grpc::ClientContext; -using grpc::ClientReader; -using grpc::ClientWriter; -using grpc::Status; - -template -auto RemoteOffloadClient::remoteCall(Fn1 Preprocessor, Fn2 Postprocessor, - TReturn ErrorValue, bool CanTimeOut) { - ArenaAllocatorLock->lock(); - if (Arena->SpaceAllocated() >= MaxSize) - Arena->Reset(); - ArenaAllocatorLock->unlock(); - - ClientContext Context; - if (CanTimeOut) { - auto Deadline = - std::chrono::system_clock::now() + std::chrono::seconds(Timeout); - Context.set_deadline(Deadline); - } - - Status RPCStatus; - auto Reply = Preprocessor(RPCStatus, Context); - - if (!RPCStatus.ok()) { - CLIENT_DBG("%s", RPCStatus.error_message().c_str()) - } else { - return Postprocessor(Reply); - } - - CLIENT_DBG("Failed") - return ErrorValue; -} - -int32_t RemoteOffloadClient::shutdown(void) { - ClientContext Context; - Null Request; - I32 Reply; - CLIENT_DBG("Shutting down server.") - auto Status = Stub->Shutdown(&Context, Request, &Reply); - if (Status.ok()) - return Reply.number(); - return 1; -} - -int32_t RemoteOffloadClient::registerLib(__tgt_bin_desc *Desc) { - return remoteCall( - /* Preprocessor */ - [&](auto &RPCStatus, auto &Context) { - auto *Request = protobuf::Arena::CreateMessage( - Arena.get()); - auto *Reply = protobuf::Arena::CreateMessage(Arena.get()); - loadTargetBinaryDescription(Desc, *Request); - Request->set_bin_ptr((uint64_t)Desc); - - RPCStatus = Stub->RegisterLib(&Context, *Request, Reply); - return Reply; - }, - /* Postprocessor */ - [&](const auto &Reply) { - if (Reply->number() == 0) { - CLIENT_DBG("Registered library") - return 0; - } - return 1; - }, - /* Error Value */ 1); -} - -int32_t RemoteOffloadClient::unregisterLib(__tgt_bin_desc *Desc) { - return remoteCall( - /* Preprocessor */ - [&](auto &RPCStatus, auto &Context) { - auto *Request = protobuf::Arena::CreateMessage(Arena.get()); - auto *Reply = protobuf::Arena::CreateMessage(Arena.get()); - - Request->set_number((uint64_t)Desc); - - RPCStatus = Stub->UnregisterLib(&Context, *Request, Reply); - return Reply; - }, - /* Postprocessor */ - [&](const auto &Reply) { - if (Reply->number() == 0) { - CLIENT_DBG("Unregistered library") - return 0; - } - CLIENT_DBG("Failed to unregister library") - return 1; - }, - /* Error Value */ 1); -} - -int32_t RemoteOffloadClient::isValidBinary(__tgt_device_image *Image) { - return remoteCall( - /* Preprocessor */ - [&](auto &RPCStatus, auto &Context) { - auto *Request = - protobuf::Arena::CreateMessage(Arena.get()); - auto *Reply = protobuf::Arena::CreateMessage(Arena.get()); - - Request->set_image_ptr((uint64_t)Image->ImageStart); - - auto *EntryItr = Image->EntriesBegin; - while (EntryItr != Image->EntriesEnd) - Request->add_entry_ptrs((uint64_t)EntryItr++); - - RPCStatus = Stub->IsValidBinary(&Context, *Request, Reply); - return Reply; - }, - /* Postprocessor */ - [&](const auto &Reply) { - if (Reply->number()) { - CLIENT_DBG("Validated binary") - } else { - CLIENT_DBG("Could not validate binary") - } - return Reply->number(); - }, - /* Error Value */ 0); -} - -int32_t RemoteOffloadClient::getNumberOfDevices() { - return remoteCall( - /* Preprocessor */ - [&](Status &RPCStatus, ClientContext &Context) { - auto *Request = protobuf::Arena::CreateMessage(Arena.get()); - auto *Reply = protobuf::Arena::CreateMessage(Arena.get()); - - RPCStatus = Stub->GetNumberOfDevices(&Context, *Request, Reply); - - return Reply; - }, - /* Postprocessor */ - [&](const auto &Reply) { - if (Reply->number()) { - CLIENT_DBG("Found %d devices", Reply->number()) - } else { - CLIENT_DBG("Could not get the number of devices") - } - return Reply->number(); - }, - /*Error Value*/ -1); -} - -int32_t RemoteOffloadClient::initDevice(int32_t DeviceId) { - return remoteCall( - /* Preprocessor */ - [&](auto &RPCStatus, auto &Context) { - auto *Request = protobuf::Arena::CreateMessage(Arena.get()); - auto *Reply = protobuf::Arena::CreateMessage(Arena.get()); - - Request->set_number(DeviceId); - - RPCStatus = Stub->InitDevice(&Context, *Request, Reply); - - return Reply; - }, - /* Postprocessor */ - [&](const auto &Reply) { - if (!Reply->number()) { - CLIENT_DBG("Initialized device %d", DeviceId) - } else { - CLIENT_DBG("Could not initialize device %d", DeviceId) - } - return Reply->number(); - }, - /* Error Value */ -1); -} - -int32_t RemoteOffloadClient::initRequires(int64_t RequiresFlags) { - return remoteCall( - /* Preprocessor */ - [&](auto &RPCStatus, auto &Context) { - auto *Request = protobuf::Arena::CreateMessage(Arena.get()); - auto *Reply = protobuf::Arena::CreateMessage(Arena.get()); - Request->set_number(RequiresFlags); - RPCStatus = Stub->InitRequires(&Context, *Request, Reply); - return Reply; - }, - /* Postprocessor */ - [&](const auto &Reply) { - if (Reply->number()) { - CLIENT_DBG("Initialized requires") - } else { - CLIENT_DBG("Could not initialize requires") - } - return Reply->number(); - }, - /* Error Value */ -1); -} - -__tgt_target_table *RemoteOffloadClient::loadBinary(int32_t DeviceId, - __tgt_device_image *Image) { - return remoteCall( - /* Preprocessor */ - [&](auto &RPCStatus, auto &Context) { - auto *ImageMessage = - protobuf::Arena::CreateMessage(Arena.get()); - auto *Reply = protobuf::Arena::CreateMessage(Arena.get()); - ImageMessage->set_image_ptr((uint64_t)Image->ImageStart); - ImageMessage->set_device_id(DeviceId); - - RPCStatus = Stub->LoadBinary(&Context, *ImageMessage, Reply); - return Reply; - }, - /* Postprocessor */ - [&](auto &Reply) { - if (Reply->entries_size() == 0) { - CLIENT_DBG("Could not load image %p onto device %d", Image, DeviceId) - return (__tgt_target_table *)nullptr; - } - DevicesToTables[DeviceId] = std::make_unique<__tgt_target_table>(); - unloadTargetTable(*Reply, DevicesToTables[DeviceId].get(), - RemoteEntries[DeviceId]); - - CLIENT_DBG("Loaded Image %p to device %d with %d entries", Image, - DeviceId, Reply->entries_size()) - - return DevicesToTables[DeviceId].get(); - }, - /* Error Value */ (__tgt_target_table *)nullptr, - /* CanTimeOut */ false); -} - -int32_t RemoteOffloadClient::isDataExchangeable(int32_t SrcDevId, - int32_t DstDevId) { - return remoteCall( - /* Preprocessor */ - [&](auto &RPCStatus, auto &Context) { - auto *Request = protobuf::Arena::CreateMessage(Arena.get()); - auto *Reply = protobuf::Arena::CreateMessage(Arena.get()); - - Request->set_src_dev_id(SrcDevId); - Request->set_dst_dev_id(DstDevId); - - RPCStatus = Stub->IsDataExchangeable(&Context, *Request, Reply); - return Reply; - }, - /* Postprocessor */ - [&](auto &Reply) { - if (Reply->number()) { - CLIENT_DBG("Data is exchangeable between %d, %d", SrcDevId, DstDevId) - } else { - CLIENT_DBG("Data is not exchangeable between %d, %d", SrcDevId, - DstDevId) - } - return Reply->number(); - }, - /* Error Value */ -1); -} - -void *RemoteOffloadClient::dataAlloc(int32_t DeviceId, int64_t Size, - void *HstPtr) { - return remoteCall( - /* Preprocessor */ - [&](auto &RPCStatus, auto &Context) { - auto *Reply = protobuf::Arena::CreateMessage(Arena.get()); - auto *Request = protobuf::Arena::CreateMessage(Arena.get()); - - Request->set_device_id(DeviceId); - Request->set_size(Size); - Request->set_hst_ptr((uint64_t)HstPtr); - - RPCStatus = Stub->DataAlloc(&Context, *Request, Reply); - return Reply; - }, - /* Postprocessor */ - [&](auto &Reply) { - if (Reply->number()) { - CLIENT_DBG("Allocated %ld bytes on device %d at %p", Size, DeviceId, - (void *)Reply->number()) - } else { - CLIENT_DBG("Could not allocate %ld bytes on device %d at %p", Size, - DeviceId, (void *)Reply->number()) - } - return (void *)Reply->number(); - }, - /* Error Value */ (void *)nullptr); -} - -int32_t RemoteOffloadClient::dataSubmit(int32_t DeviceId, void *TgtPtr, - void *HstPtr, int64_t Size) { - - return remoteCall( - /* Preprocessor */ - [&](auto &RPCStatus, auto &Context) { - auto *Reply = protobuf::Arena::CreateMessage(Arena.get()); - std::unique_ptr> Writer( - Stub->DataSubmit(&Context, Reply)); - - if (Size > BlockSize) { - int64_t Start = 0, End = BlockSize; - for (auto I = 0; I < ceil((float)Size / BlockSize); I++) { - auto *Request = - protobuf::Arena::CreateMessage(Arena.get()); - - Request->set_device_id(DeviceId); - Request->set_data((char *)HstPtr + Start, End - Start); - Request->set_hst_ptr((uint64_t)HstPtr); - Request->set_tgt_ptr((uint64_t)TgtPtr); - Request->set_start(Start); - Request->set_size(Size); - - if (!Writer->Write(*Request)) { - CLIENT_DBG("Broken stream when submitting data") - Reply->set_number(0); - return Reply; - } - - Start += BlockSize; - End += BlockSize; - if (End >= Size) - End = Size; - } - } else { - auto *Request = - protobuf::Arena::CreateMessage(Arena.get()); - - Request->set_device_id(DeviceId); - Request->set_data(HstPtr, Size); - Request->set_hst_ptr((uint64_t)HstPtr); - Request->set_tgt_ptr((uint64_t)TgtPtr); - Request->set_start(0); - Request->set_size(Size); - - if (!Writer->Write(*Request)) { - CLIENT_DBG("Broken stream when submitting data") - Reply->set_number(0); - return Reply; - } - } - - Writer->WritesDone(); - RPCStatus = Writer->Finish(); - - return Reply; - }, - /* Postprocessor */ - [&](auto &Reply) { - if (!Reply->number()) { - CLIENT_DBG(" submitted %ld bytes on device %d at %p", Size, DeviceId, - TgtPtr) - } else { - CLIENT_DBG("Could not async submit %ld bytes on device %d at %p", - Size, DeviceId, TgtPtr) - } - return Reply->number(); - }, - /* Error Value */ -1, - /* CanTimeOut */ false); -} - -int32_t RemoteOffloadClient::dataRetrieve(int32_t DeviceId, void *HstPtr, - void *TgtPtr, int64_t Size) { - return remoteCall( - /* Preprocessor */ - [&](auto &RPCStatus, auto &Context) { - auto *Request = - protobuf::Arena::CreateMessage(Arena.get()); - - Request->set_device_id(DeviceId); - Request->set_size(Size); - Request->set_hst_ptr((int64_t)HstPtr); - Request->set_tgt_ptr((int64_t)TgtPtr); - - auto *Reply = protobuf::Arena::CreateMessage(Arena.get()); - std::unique_ptr> Reader( - Stub->DataRetrieve(&Context, *Request)); - Reader->WaitForInitialMetadata(); - while (Reader->Read(Reply)) { - if (Reply->ret()) { - CLIENT_DBG("Could not async retrieve %ld bytes on device %d at %p " - "for %p", - Size, DeviceId, TgtPtr, HstPtr) - return Reply; - } - - if (Reply->start() == 0 && Reply->size() == Reply->data().size()) { - memcpy(HstPtr, Reply->data().data(), Reply->data().size()); - - return Reply; - } - - memcpy((void *)((char *)HstPtr + Reply->start()), - Reply->data().data(), Reply->data().size()); - } - RPCStatus = Reader->Finish(); - - return Reply; - }, - /* Postprocessor */ - [&](auto &Reply) { - if (!Reply->ret()) { - CLIENT_DBG("Retrieved %ld bytes on Device %d", Size, DeviceId) - } else { - CLIENT_DBG("Could not async retrieve %ld bytes on Device %d", Size, - DeviceId) - } - return Reply->ret(); - }, - /* Error Value */ -1, - /* CanTimeOut */ false); -} - -int32_t RemoteOffloadClient::dataExchange(int32_t SrcDevId, void *SrcPtr, - int32_t DstDevId, void *DstPtr, - int64_t Size) { - return remoteCall( - /* Preprocessor */ - [&](auto &RPCStatus, auto &Context) { - auto *Reply = protobuf::Arena::CreateMessage(Arena.get()); - auto *Request = - protobuf::Arena::CreateMessage(Arena.get()); - - Request->set_src_dev_id(SrcDevId); - Request->set_src_ptr((uint64_t)SrcPtr); - Request->set_dst_dev_id(DstDevId); - Request->set_dst_ptr((uint64_t)DstPtr); - Request->set_size(Size); - - RPCStatus = Stub->DataExchange(&Context, *Request, Reply); - return Reply; - }, - /* Postprocessor */ - [&](auto &Reply) { - if (Reply->number()) { - CLIENT_DBG( - "Exchanged %ld bytes on device %d at %p for %p on device %d", - Size, SrcDevId, SrcPtr, DstPtr, DstDevId) - } else { - CLIENT_DBG("Could not exchange %ld bytes on device %d at %p for %p " - "on device %d", - Size, SrcDevId, SrcPtr, DstPtr, DstDevId) - } - return Reply->number(); - }, - /* Error Value */ -1); -} - -int32_t RemoteOffloadClient::dataDelete(int32_t DeviceId, void *TgtPtr) { - return remoteCall( - /* Preprocessor */ - [&](auto &RPCStatus, auto &Context) { - auto *Reply = protobuf::Arena::CreateMessage(Arena.get()); - auto *Request = protobuf::Arena::CreateMessage(Arena.get()); - - Request->set_device_id(DeviceId); - Request->set_tgt_ptr((uint64_t)TgtPtr); - - RPCStatus = Stub->DataDelete(&Context, *Request, Reply); - return Reply; - }, - /* Postprocessor */ - [&](auto &Reply) { - if (!Reply->number()) { - CLIENT_DBG("Deleted data at %p on device %d", TgtPtr, DeviceId) - } else { - CLIENT_DBG("Could not delete data at %p on device %d", TgtPtr, - DeviceId) - } - return Reply->number(); - }, - /* Error Value */ -1); -} - -int32_t RemoteOffloadClient::runTargetRegion(int32_t DeviceId, - void *TgtEntryPtr, void **TgtArgs, - ptrdiff_t *TgtOffsets, - int32_t ArgNum) { - return remoteCall( - /* Preprocessor */ - [&](auto &RPCStatus, auto &Context) { - auto *Reply = protobuf::Arena::CreateMessage(Arena.get()); - auto *Request = - protobuf::Arena::CreateMessage(Arena.get()); - - Request->set_device_id(DeviceId); - - Request->set_tgt_entry_ptr( - (uint64_t)RemoteEntries[DeviceId][TgtEntryPtr]); - - char **ArgPtr = (char **)TgtArgs; - for (auto I = 0; I < ArgNum; I++, ArgPtr++) - Request->add_tgt_args((uint64_t)*ArgPtr); - - char *OffsetPtr = (char *)TgtOffsets; - for (auto I = 0; I < ArgNum; I++, OffsetPtr++) - Request->add_tgt_offsets((uint64_t)*OffsetPtr); - - Request->set_arg_num(ArgNum); - - RPCStatus = Stub->RunTargetRegion(&Context, *Request, Reply); - return Reply; - }, - /* Postprocessor */ - [&](auto &Reply) { - if (!Reply->number()) { - CLIENT_DBG("Ran target region async on device %d", DeviceId) - } else { - CLIENT_DBG("Could not run target region async on device %d", DeviceId) - } - return Reply->number(); - }, - /* Error Value */ -1, - /* CanTimeOut */ false); -} - -int32_t RemoteOffloadClient::runTargetTeamRegion( - int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets, - int32_t ArgNum, int32_t TeamNum, int32_t ThreadLimit, - uint64_t LoopTripcount) { - return remoteCall( - /* Preprocessor */ - [&](auto &RPCStatus, auto &Context) { - auto *Reply = protobuf::Arena::CreateMessage(Arena.get()); - auto *Request = - protobuf::Arena::CreateMessage(Arena.get()); - - Request->set_device_id(DeviceId); - - Request->set_tgt_entry_ptr( - (uint64_t)RemoteEntries[DeviceId][TgtEntryPtr]); - - char **ArgPtr = (char **)TgtArgs; - for (auto I = 0; I < ArgNum; I++, ArgPtr++) { - Request->add_tgt_args((uint64_t)*ArgPtr); - } - - char *OffsetPtr = (char *)TgtOffsets; - for (auto I = 0; I < ArgNum; I++, OffsetPtr++) - Request->add_tgt_offsets((uint64_t)*OffsetPtr); - - Request->set_arg_num(ArgNum); - Request->set_team_num(TeamNum); - Request->set_thread_limit(ThreadLimit); - Request->set_loop_tripcount(LoopTripcount); - - RPCStatus = Stub->RunTargetTeamRegion(&Context, *Request, Reply); - return Reply; - }, - /* Postprocessor */ - [&](auto &Reply) { - if (!Reply->number()) { - CLIENT_DBG("Ran target team region async on device %d", DeviceId) - } else { - CLIENT_DBG("Could not run target team region async on device %d", - DeviceId) - } - return Reply->number(); - }, - /* Error Value */ -1, - /* CanTimeOut */ false); -} - -int32_t RemoteClientManager::shutdown(void) { - int32_t Ret = 0; - for (auto &Client : Clients) - Ret &= Client.shutdown(); - return Ret; -} - -int32_t RemoteClientManager::registerLib(__tgt_bin_desc *Desc) { - int32_t Ret = 0; - for (auto &Client : Clients) - Ret &= Client.registerLib(Desc); - return Ret; -} - -int32_t RemoteClientManager::unregisterLib(__tgt_bin_desc *Desc) { - int32_t Ret = 0; - for (auto &Client : Clients) - Ret &= Client.unregisterLib(Desc); - return Ret; -} - -int32_t RemoteClientManager::isValidBinary(__tgt_device_image *Image) { - int32_t ClientIdx = 0; - for (auto &Client : Clients) { - if (auto Ret = Client.isValidBinary(Image)) - return Ret; - ClientIdx++; - } - return 0; -} - -int32_t RemoteClientManager::getNumberOfDevices() { - auto ClientIdx = 0; - for (auto &Client : Clients) { - if (auto NumDevices = Client.getNumberOfDevices()) { - Devices.push_back(NumDevices); - } - ClientIdx++; - } - - return std::accumulate(Devices.begin(), Devices.end(), 0); -} - -std::pair RemoteClientManager::mapDeviceId(int32_t DeviceId) { - for (size_t ClientIdx = 0; ClientIdx < Devices.size(); ClientIdx++) { - if (DeviceId < Devices[ClientIdx]) - return {ClientIdx, DeviceId}; - DeviceId -= Devices[ClientIdx]; - } - return {-1, -1}; -} - -int32_t RemoteClientManager::initDevice(int32_t DeviceId) { - int32_t ClientIdx, DeviceIdx; - std::tie(ClientIdx, DeviceIdx) = mapDeviceId(DeviceId); - return Clients[ClientIdx].initDevice(DeviceIdx); -} - -int32_t RemoteClientManager::initRequires(int64_t RequiresFlags) { - for (auto &Client : Clients) - Client.initRequires(RequiresFlags); - - return RequiresFlags; -} - -__tgt_target_table *RemoteClientManager::loadBinary(int32_t DeviceId, - __tgt_device_image *Image) { - int32_t ClientIdx, DeviceIdx; - std::tie(ClientIdx, DeviceIdx) = mapDeviceId(DeviceId); - return Clients[ClientIdx].loadBinary(DeviceIdx, Image); -} - -int32_t RemoteClientManager::isDataExchangeable(int32_t SrcDevId, - int32_t DstDevId) { - int32_t SrcClientIdx, SrcDeviceIdx, DstClientIdx, DstDeviceIdx; - std::tie(SrcClientIdx, SrcDeviceIdx) = mapDeviceId(SrcDevId); - std::tie(DstClientIdx, DstDeviceIdx) = mapDeviceId(DstDevId); - return Clients[SrcClientIdx].isDataExchangeable(SrcDeviceIdx, DstDeviceIdx); -} - -void *RemoteClientManager::dataAlloc(int32_t DeviceId, int64_t Size, - void *HstPtr) { - int32_t ClientIdx, DeviceIdx; - std::tie(ClientIdx, DeviceIdx) = mapDeviceId(DeviceId); - return Clients[ClientIdx].dataAlloc(DeviceIdx, Size, HstPtr); -} - -int32_t RemoteClientManager::dataDelete(int32_t DeviceId, void *TgtPtr) { - int32_t ClientIdx, DeviceIdx; - std::tie(ClientIdx, DeviceIdx) = mapDeviceId(DeviceId); - return Clients[ClientIdx].dataDelete(DeviceIdx, TgtPtr); -} - -int32_t RemoteClientManager::dataSubmit(int32_t DeviceId, void *TgtPtr, - void *HstPtr, int64_t Size) { - int32_t ClientIdx, DeviceIdx; - std::tie(ClientIdx, DeviceIdx) = mapDeviceId(DeviceId); - return Clients[ClientIdx].dataSubmit(DeviceIdx, TgtPtr, HstPtr, Size); -} - -int32_t RemoteClientManager::dataRetrieve(int32_t DeviceId, void *HstPtr, - void *TgtPtr, int64_t Size) { - int32_t ClientIdx, DeviceIdx; - std::tie(ClientIdx, DeviceIdx) = mapDeviceId(DeviceId); - return Clients[ClientIdx].dataRetrieve(DeviceIdx, HstPtr, TgtPtr, Size); -} - -int32_t RemoteClientManager::dataExchange(int32_t SrcDevId, void *SrcPtr, - int32_t DstDevId, void *DstPtr, - int64_t Size) { - int32_t SrcClientIdx, SrcDeviceIdx, DstClientIdx, DstDeviceIdx; - std::tie(SrcClientIdx, SrcDeviceIdx) = mapDeviceId(SrcDevId); - std::tie(DstClientIdx, DstDeviceIdx) = mapDeviceId(DstDevId); - return Clients[SrcClientIdx].dataExchange(SrcDeviceIdx, SrcPtr, DstDeviceIdx, - DstPtr, Size); -} - -int32_t RemoteClientManager::runTargetRegion(int32_t DeviceId, - void *TgtEntryPtr, void **TgtArgs, - ptrdiff_t *TgtOffsets, - int32_t ArgNum) { - int32_t ClientIdx, DeviceIdx; - std::tie(ClientIdx, DeviceIdx) = mapDeviceId(DeviceId); - return Clients[ClientIdx].runTargetRegion(DeviceIdx, TgtEntryPtr, TgtArgs, - TgtOffsets, ArgNum); -} - -int32_t RemoteClientManager::runTargetTeamRegion( - int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets, - int32_t ArgNum, int32_t TeamNum, int32_t ThreadLimit, - uint64_t LoopTripCount) { - int32_t ClientIdx, DeviceIdx; - std::tie(ClientIdx, DeviceIdx) = mapDeviceId(DeviceId); - return Clients[ClientIdx].runTargetTeamRegion(DeviceIdx, TgtEntryPtr, TgtArgs, - TgtOffsets, ArgNum, TeamNum, - ThreadLimit, LoopTripCount); -} diff --git a/openmp/libomptarget/plugins/remote/src/rtl.cpp b/openmp/libomptarget/plugins/remote/src/rtl.cpp deleted file mode 100644 --- a/openmp/libomptarget/plugins/remote/src/rtl.cpp +++ /dev/null @@ -1,121 +0,0 @@ -//===--------------------- rtl.cpp - Remote RTL Plugin --------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// RTL for Host. -// -//===----------------------------------------------------------------------===// - -#include -#include -#include - -#include "Client.h" -#include "Utils.h" -#include "omptarget.h" -#include "omptargetplugin.h" - -#define TARGET_NAME RPC -#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL" - -RemoteClientManager *Manager; - -__attribute__((constructor(101))) void initRPC() { - DP("Init RPC library!\n"); - - Manager = new RemoteClientManager(); -} - -__attribute__((destructor(101))) void deinitRPC() { - Manager->shutdown(); // TODO: Error handle shutting down - DP("Deinit RPC library!\n"); - delete Manager; -} - -// Exposed library API function -#ifdef __cplusplus -extern "C" { -#endif - -int32_t __tgt_rtl_register_lib(__tgt_bin_desc *Desc) { - return Manager->registerLib(Desc); -} - -int32_t __tgt_rtl_unregister_lib(__tgt_bin_desc *Desc) { - return Manager->unregisterLib(Desc); -} - -int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) { - return Manager->isValidBinary(Image); -} - -int32_t __tgt_rtl_number_of_devices() { return Manager->getNumberOfDevices(); } - -int32_t __tgt_rtl_init_device(int32_t DeviceId) { - return Manager->initDevice(DeviceId); -} - -int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { - return Manager->initRequires(RequiresFlags); -} - -__tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId, - __tgt_device_image *Image) { - return Manager->loadBinary(DeviceId, (__tgt_device_image *)Image); -} - -int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDevId, int32_t DstDevId) { - return Manager->isDataExchangeable(SrcDevId, DstDevId); -} - -void *__tgt_rtl_data_alloc(int32_t DeviceId, int64_t Size, void *HstPtr, - int32_t Kind) { - if (Kind != TARGET_ALLOC_DEFAULT) { - REPORT("Invalid target data allocation kind or requested allocator not " - "implemented yet\n"); - return NULL; - } - - return Manager->dataAlloc(DeviceId, Size, HstPtr); -} - -int32_t __tgt_rtl_data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr, - int64_t Size) { - return Manager->dataSubmit(DeviceId, TgtPtr, HstPtr, Size); -} - -int32_t __tgt_rtl_data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, - int64_t Size) { - return Manager->dataRetrieve(DeviceId, HstPtr, TgtPtr, Size); -} - -int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr, int32_t) { - return Manager->dataDelete(DeviceId, TgtPtr); -} - -int32_t __tgt_rtl_data_exchange(int32_t SrcDevId, void *SrcPtr, - int32_t DstDevId, void *DstPtr, int64_t Size) { - return Manager->dataExchange(SrcDevId, SrcPtr, DstDevId, DstPtr, Size); -} - -int32_t __tgt_rtl_launch_kernel(int32_t DeviceId, void *TgtEntryPtr, - void **TgtArgs, ptrdiff_t *TgtOffsets, - KernelArgsTy *KernelArgs, - __tgt_async_info *AsyncInfoPtr) { - assert(!KernelArgs->NumTeams[1] && !KernelArgs->NumTeams[2] && - !KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] && - "Only one dimensional kernels supported."); - return Manager->runTargetTeamRegion( - DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, KernelArgs->NumArgs, - KernelArgs->NumTeams[0], KernelArgs->ThreadLimit[0], - KernelArgs->Tripcount); -} - -// Exposed library API function -#ifdef __cplusplus -} -#endif diff --git a/openmp/libomptarget/plugins/ve/CMakeLists.txt b/openmp/libomptarget/plugins/ve/CMakeLists.txt deleted file mode 100644 --- a/openmp/libomptarget/plugins/ve/CMakeLists.txt +++ /dev/null @@ -1,66 +0,0 @@ -##===----------------------------------------------------------------------===## -# -# Build a plugin for a NEC Aurora machine if available. (Can also run on host) -# -##===----------------------------------------------------------------------===## - - -if(${LIBOMPTARGET_DEP_VEO_FOUND}) - libomptarget_say("Building SX-Aurora VE offloading plugin.") - set(additional_libs "") - set(additional_libs ${LIBOMPTARGET_DEP_VEO_LIBRARIES} - ${LIBOMPTARGET_DEP_VEOSINFO_LIBRARIES} - ${additional_libs}) - - set(tmachine_name "ve") - set(tmachine_libname "ve") - set(tmachine_triple "ve-unknown-linux-unknown") - set(elf_machine_id 251) - - # Define macro to be used as prefix of the runtime messages for this target. - add_definitions("-DTARGET_NAME=${tmachine_name}") - - # Define macro with the ELF ID for this target. - add_definitions("-DTARGET_ELF_ID=${elf_machine_id}") - - add_llvm_library("omptarget.rtl.${tmachine_libname}" - SHARED - ${CMAKE_CURRENT_SOURCE_DIR}/src/rtl.cpp - - ADDITIONAL_HEADER_DIRS - ${LIBOMPTARGET_INCLUDE_DIR} - ${LIBOMPTARGET_DEP_VEO_INCLUDE_DIR} - - LINK_LIBS - PRIVATE - elf_common - ${LIBOMPTARGET_DEP_LIBFFI_LIBRARIES} - ${additional_libs} - "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports -Wl,-z,defs" - - NO_INSTALL_RPATH - ) - - # Install plugin under the lib destination folder. - install(TARGETS "omptarget.rtl.${tmachine_libname}" LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") - set_target_properties("omptarget.rtl.${tmachine_libname}" PROPERTIES - INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.." - CXX_VISIBILITY_PRESET protected) - - target_include_directories("omptarget.rtl.${tmachine_libname}" PRIVATE - ${LIBOMPTARGET_INCLUDE_DIR} - ${LIBOMPTARGET_DEP_VEO_INCLUDE_DIR}) - - target_link_libraries( - "omptarget.rtl.${tmachine_libname}" - elf_common - ${LIBOMPTARGET_DEP_LIBFFI_LIBRARIES} - ${additional_libs} - "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports -Wl,-z,defs") - - # Report to the parent scope that we are building a plugin. - set(LIBOMPTARGET_SYSTEM_TARGETS - "${LIBOMPTARGET_SYSTEM_TARGETS} ${tmachine_triple}" PARENT_SCOPE) -else() - libomptarget_say("Not building nec-aurora plugin: libveo or libveosinfo not found.") -endif() diff --git a/openmp/libomptarget/plugins/ve/src/rtl.cpp b/openmp/libomptarget/plugins/ve/src/rtl.cpp deleted file mode 100644 --- a/openmp/libomptarget/plugins/ve/src/rtl.cpp +++ /dev/null @@ -1,453 +0,0 @@ -//===-RTLs/nec-aurora/src/rtl.cpp - Target RTLs Implementation - C++ -*-======// -// -// The LLVM Compiler Infrastructure -// -// This file is dual licensed under the MIT and the University of Illinois Open -// Source Licenses. See LICENSE.txt for details. -// -//===----------------------------------------------------------------------===// -// -// RTL for NEC Aurora TSUBASA machines -// -//===----------------------------------------------------------------------===// - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "Debug.h" -#include "omptargetplugin.h" - -#ifndef TARGET_NAME -#define TARGET_NAME VE -#endif - -#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL" - -#ifndef TARGET_ELF_ID -#define TARGET_ELF_ID 0 -#endif - -#include "elf_common.h" - -struct DynLibTy { - char *FileName; - uint64_t VeoLibHandle; -}; - -/// Keep entries table per device. -struct FuncOrGblEntryTy { - __tgt_target_table Table; - std::vector<__tgt_offload_entry> Entries; -}; - -class RTLDeviceInfoTy { - std::vector> FuncOrGblEntry; - -public: - std::vector ProcHandles; - std::vector Contexts; - std::vector LibraryHandles; - std::list DynLibs; - // Maps OpenMP device Ids to Ve nodeids - std::vector NodeIds; - - void buildOffloadTableFromHost(int32_t device_id, uint64_t VeoLibHandle, - __tgt_offload_entry *HostBegin, - __tgt_offload_entry *HostEnd) { - FuncOrGblEntry[device_id].emplace_back(); - std::vector<__tgt_offload_entry> &T = - FuncOrGblEntry[device_id].back().Entries; - T.clear(); - for (__tgt_offload_entry *i = HostBegin; i != HostEnd; ++i) { - char *SymbolName = i->name; - // we have not enough access to the target memory to conveniently parse - // the offload table there so we need to lookup every symbol with the host - // table - DP("Looking up symbol: %s\n", SymbolName); - uint64_t SymbolTargetAddr = - veo_get_sym(ProcHandles[device_id], VeoLibHandle, SymbolName); - __tgt_offload_entry Entry; - - if (!SymbolTargetAddr) { - DP("Symbol %s not found in target image\n", SymbolName); - Entry = {NULL, NULL, 0, 0, 0}; - } else { - DP("Found symbol %s successfully in target image (addr: %p)\n", - SymbolName, reinterpret_cast(SymbolTargetAddr)); - Entry = {reinterpret_cast(SymbolTargetAddr), i->name, i->size, - i->flags, 0}; - } - - T.push_back(Entry); - } - - FuncOrGblEntry[device_id].back().Table.EntriesBegin = &T.front(); - FuncOrGblEntry[device_id].back().Table.EntriesEnd = &T.back() + 1; - } - - __tgt_target_table *getOffloadTable(int32_t device_id) { - return &FuncOrGblEntry[device_id].back().Table; - } - - RTLDeviceInfoTy() { - - struct ve_nodeinfo node_info; - ve_node_info(&node_info); - - // Build a predictable mapping between VE node ids and OpenMP device ids. - // This is necessary, because nodes can be missing or offline and (active) - // node ids are thus not consecutive. The entries in ve_nodeinfo may also - // not be in the order of their node ids. - for (int i = 0; i < node_info.total_node_count; ++i) { - if (node_info.status[i] == 0) { - NodeIds.push_back(node_info.nodeid[i]); - } - } - - // Because the entries in ve_nodeinfo may not be in the order of their node - // ids, we sort NodeIds to get a predictable mapping. - std::sort(NodeIds.begin(), NodeIds.end()); - - int NumDevices = NodeIds.size(); - DP("Found %i VE devices\n", NumDevices); - ProcHandles.resize(NumDevices, NULL); - Contexts.resize(NumDevices, NULL); - FuncOrGblEntry.resize(NumDevices); - LibraryHandles.resize(NumDevices); - } - - ~RTLDeviceInfoTy() { - for (auto &ctx : Contexts) { - if (ctx != NULL) { - if (veo_context_close(ctx) != 0) { - DP("Failed to close VEO context.\n"); - } - } - } - - for (auto &hdl : ProcHandles) { - if (hdl != NULL) { - veo_proc_destroy(hdl); - } - } - - for (auto &lib : DynLibs) { - if (lib.FileName) { - remove(lib.FileName); - } - } - } -}; - -static RTLDeviceInfoTy DeviceInfo; - -static int target_run_function_wait(uint32_t DeviceID, uint64_t FuncAddr, - struct veo_args *args, uint64_t *RetVal) { - DP("Running function with entry point %p\n", - reinterpret_cast(FuncAddr)); - uint64_t RequestHandle = - veo_call_async(DeviceInfo.Contexts[DeviceID], FuncAddr, args); - if (RequestHandle == VEO_REQUEST_ID_INVALID) { - DP("Execution of entry point %p failed\n", - reinterpret_cast(FuncAddr)); - return OFFLOAD_FAIL; - } - - DP("Function at address %p called (VEO request ID: %" PRIu64 ")\n", - reinterpret_cast(FuncAddr), RequestHandle); - - int ret = veo_call_wait_result(DeviceInfo.Contexts[DeviceID], RequestHandle, - RetVal); - if (ret != 0) { - DP("Waiting for entry point %p failed (Error code %d)\n", - reinterpret_cast(FuncAddr), ret); - return OFFLOAD_FAIL; - } - return OFFLOAD_SUCCESS; -} - -// Return the number of available devices of the type supported by the -// target RTL. -int32_t __tgt_rtl_number_of_devices(void) { return DeviceInfo.NodeIds.size(); } - -// Return an integer different from zero if the provided device image can be -// supported by the runtime. The functionality is similar to comparing the -// result of __tgt__rtl__load__binary to NULL. However, this is meant to be a -// lightweight query to determine if the RTL is suitable for an image without -// having to load the library, which can be expensive. -int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) { -#if TARGET_ELF_ID < 1 - return 0; -#else - return elf_check_machine(Image, TARGET_ELF_ID); -#endif -} - -// Initialize the specified device. In case of success return 0; otherwise -// return an error code. -int32_t __tgt_rtl_init_device(int32_t ID) { - DP("Available VEO version: %i\n", veo_api_version()); - - // At the moment we do not really initialize (i.e. create a process or - // context on) the device here, but in "__tgt_rtl_load_binary". - // The reason for this is, that, when we create a process for a statically - // linked binary, the VEO api needs us to already supply the binary (but we - // can load a dynamically linked binary later, after we create the process). - // At this stage, we cannot check if we have a dynamically or statically - // linked binary so we defer process creation until we know. - return OFFLOAD_SUCCESS; -} - -// Pass an executable image section described by image to the specified -// device and prepare an address table of target entities. In case of error, -// return NULL. Otherwise, return a pointer to the built address table. -// Individual entries in the table may also be NULL, when the corresponding -// offload region is not supported on the target device. -__tgt_target_table *__tgt_rtl_load_binary(int32_t ID, - __tgt_device_image *Image) { - DP("Dev %d: load binary from " DPxMOD " image\n", ID, - DPxPTR(Image->ImageStart)); - - assert(ID >= 0 && "bad dev id"); - - size_t ImageSize = (size_t)Image->ImageEnd - (size_t)Image->ImageStart; - size_t NumEntries = (size_t)(Image->EntriesEnd - Image->EntriesBegin); - DP("Expecting to have %zd entries defined.\n", NumEntries); - - // load dynamic library and get the entry points. We use the dl library - // to do the loading of the library, but we could do it directly to avoid the - // dump to the temporary file. - // - // 1) Create tmp file with the library contents. - // 2) Use dlopen to load the file and dlsym to retrieve the symbols. - char tmp_name[] = "/tmp/tmpfile_XXXXXX"; - int tmp_fd = mkstemp(tmp_name); - - if (tmp_fd == -1) { - return NULL; - } - - FILE *ftmp = fdopen(tmp_fd, "wb"); - - if (!ftmp) { - DP("fdopen() for %s failed. Could not write target image\n", tmp_name); - return NULL; - } - - fwrite(Image->ImageStart, ImageSize, 1, ftmp); - - // at least for the static case we need to change the permissions - chmod(tmp_name, 0700); - - DP("Wrote target image to %s. ImageSize=%zu\n", tmp_name, ImageSize); - - fclose(ftmp); - - // See comment in "__tgt_rtl_init_device" - bool is_dyn = true; - if (DeviceInfo.ProcHandles[ID] == NULL) { - struct veo_proc_handle *proc_handle; - is_dyn = elf_is_dynamic(Image); - // If we have a dynamically linked image, we create the process handle, then - // the thread, and then load the image. - // If we have a statically linked image, we need to create the process - // handle and load the image at the same time with veo_proc_create_static(). - if (is_dyn) { - proc_handle = veo_proc_create(DeviceInfo.NodeIds[ID]); - if (!proc_handle) { - DP("veo_proc_create() failed for device %d\n", ID); - return NULL; - } - } else { - proc_handle = veo_proc_create_static(DeviceInfo.NodeIds[ID], tmp_name); - if (!proc_handle) { - DP("veo_proc_create_static() failed for device %d, image=%s\n", ID, - tmp_name); - return NULL; - } - } - DeviceInfo.ProcHandles[ID] = proc_handle; - } - - if (DeviceInfo.Contexts[ID] == NULL) { - struct veo_thr_ctxt *ctx = veo_context_open(DeviceInfo.ProcHandles[ID]); - - if (!ctx) { - DP("veo_context_open() failed: %s\n", std::strerror(errno)); - return NULL; - } - - DeviceInfo.Contexts[ID] = ctx; - } - - DP("Aurora device successfully initialized with loaded binary: " - "proc_handle=%p, ctx=%p\n", - DeviceInfo.ProcHandles[ID], DeviceInfo.Contexts[ID]); - - uint64_t LibHandle = 0UL; - if (is_dyn) { - LibHandle = veo_load_library(DeviceInfo.ProcHandles[ID], tmp_name); - - if (!LibHandle) { - DP("veo_load_library() failed: LibHandle=%" PRIu64 - " Name=%s. Set env VEORUN_BIN for static linked target code.\n", - LibHandle, tmp_name); - return NULL; - } - - DP("Successfully loaded library dynamically\n"); - } else { - DP("Symbol table is expected to have been created by " - "veo_create_proc_static()\n"); - } - - DynLibTy Lib = {tmp_name, LibHandle}; - DeviceInfo.DynLibs.push_back(Lib); - DeviceInfo.LibraryHandles[ID] = LibHandle; - - DeviceInfo.buildOffloadTableFromHost(ID, LibHandle, Image->EntriesBegin, - Image->EntriesEnd); - - return DeviceInfo.getOffloadTable(ID); -} - -// Allocate data on the particular target device, of the specified size. -// HostPtr is a address of the host data the allocated target data -// will be associated with (HostPtr may be NULL if it is not known at -// allocation time, like for example it would be for target data that -// is allocated by omp_target_alloc() API). Return address of the -// allocated data on the target that will be used by libomptarget.so to -// initialize the target data mapping structures. These addresses are -// used to generate a table of target variables to pass to -// __tgt_rtl_run_region(). The __tgt_rtl_data_alloc() returns NULL in -// case an error occurred on the target device. -void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr, - int32_t kind) { - int ret; - uint64_t addr; - - if (kind != TARGET_ALLOC_DEFAULT) { - REPORT("Invalid target data allocation kind or requested allocator not " - "implemented yet\n"); - return NULL; - } - - if (DeviceInfo.ProcHandles[ID] == NULL) { - struct veo_proc_handle *proc_handle; - proc_handle = veo_proc_create(DeviceInfo.NodeIds[ID]); - if (!proc_handle) { - DP("veo_proc_create() failed for device %d\n", ID); - return NULL; - } - DeviceInfo.ProcHandles[ID] = proc_handle; - DP("Aurora device successfully initialized: proc_handle=%p", proc_handle); - } - - ret = veo_alloc_mem(DeviceInfo.ProcHandles[ID], &addr, Size); - DP("Allocate target memory: device=%d, target addr=%p, size=%" PRIu64 "\n", - ID, reinterpret_cast(addr), Size); - if (ret != 0) { - DP("veo_alloc_mem(%d, %p, %" PRIu64 ") failed with error code %d\n", ID, - reinterpret_cast(addr), Size, ret); - return NULL; - } - - return reinterpret_cast(addr); -} - -// Pass the data content to the target device using the target address. -// In case of success, return zero. Otherwise, return an error code. -int32_t __tgt_rtl_data_submit(int32_t ID, void *TargetPtr, void *HostPtr, - int64_t Size) { - int ret = veo_write_mem(DeviceInfo.ProcHandles[ID], (uint64_t)TargetPtr, - HostPtr, (size_t)Size); - if (ret != 0) { - DP("veo_write_mem() failed with error code %d\n", ret); - return OFFLOAD_FAIL; - } - return OFFLOAD_SUCCESS; -} - -// Retrieve the data content from the target device using its address. -// In case of success, return zero. Otherwise, return an error code. -int32_t __tgt_rtl_data_retrieve(int32_t ID, void *HostPtr, void *TargetPtr, - int64_t Size) { - int ret = veo_read_mem(DeviceInfo.ProcHandles[ID], HostPtr, - (uint64_t)TargetPtr, Size); - if (ret != 0) { - DP("veo_read_mem() failed with error code %d\n", ret); - return OFFLOAD_FAIL; - } - return OFFLOAD_SUCCESS; -} - -// De-allocate the data referenced by target ptr on the device. In case of -// success, return zero. Otherwise, return an error code. -int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr, int32_t) { - int ret = veo_free_mem(DeviceInfo.ProcHandles[ID], (uint64_t)TargetPtr); - - if (ret != 0) { - DP("veo_free_mem() failed with error code %d\n", ret); - return OFFLOAD_FAIL; - } - return OFFLOAD_SUCCESS; -} - -// Transfer control to the offloaded entry Entry on the target device. -// Args and Offsets are arrays of NumArgs size of target addresses and -// offsets. An offset should be added to the target address before passing it -// to the outlined function on device side. In case of success, return zero. -// Otherwise, return an error code. -int32_t __tgt_rtl_launch_kernel(int32_t DeviceId, void *TgtEntryPtr, - void **TgtArgs, ptrdiff_t *TgtOffsets, - KernelArgsTy *KernelArgs, - __tgt_async_info *AsyncInfoPtr) { - assert(!KernelArgs->NumTeams[1] && !KernelArgs->NumTeams[2] && - !KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] && - "Only one dimensional kernels supported."); - int ret; - - // ignore team num and thread limit. - std::vector ptrs(KernelArgs->NumArgs); - - struct veo_args *TargetArgs; - TargetArgs = veo_args_alloc(); - - if (TargetArgs == NULL) { - DP("Could not allocate VEO args\n"); - return OFFLOAD_FAIL; - } - - for (int i = 0; i < KernelArgs->NumArgs; ++i) { - ret = veo_args_set_u64(TargetArgs, i, (intptr_t)Args[i]); - - if (ret != 0) { - DP("veo_args_set_u64() has returned %d for argnum=%d and value %p\n", ret, - i, Args[i]); - return OFFLOAD_FAIL; - } - } - - uint64_t RetVal; - if (target_run_function_wait(ID, reinterpret_cast(Entry), - TargetArgs, &RetVal) != OFFLOAD_SUCCESS) { - veo_args_free(TargetArgs); - return OFFLOAD_FAIL; - } - veo_args_free(TargetArgs); - return OFFLOAD_SUCCESS; -} - -int32_t __tgt_rtl_supports_empty_images() { return 1; } - -// VEC plugin's internal InfoLevel. -std::atomic InfoLevel; diff --git a/openmp/libomptarget/plugins/x86_64/CMakeLists.txt b/openmp/libomptarget/plugins/x86_64/CMakeLists.txt deleted file mode 100644 --- a/openmp/libomptarget/plugins/x86_64/CMakeLists.txt +++ /dev/null @@ -1,17 +0,0 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Build a plugin for a x86_64 machine if available. -# -##===----------------------------------------------------------------------===## - -if(CMAKE_SYSTEM_NAME MATCHES "Linux") - build_generic_elf64("x86_64" "x86_64" "x86_64" "x86_64-pc-linux-gnu" "62") -else() - libomptarget_say("Not building x86_64 offloading plugin: machine not found in the system.") -endif() diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp --- a/openmp/libomptarget/src/rtl.cpp +++ b/openmp/libomptarget/src/rtl.cpp @@ -92,8 +92,6 @@ DP("Loading RTLs...\n"); - BoolEnvar NextGenPlugins("LIBOMPTARGET_NEXTGEN_PLUGINS", true); - // Attempt to open all the plugins and, if they exist, check if the interface // is correct and if they are supporting any devices. for (const char *Name : RTLNames) { @@ -102,14 +100,7 @@ RTLInfoTy &RTL = AllRTLs.back(); const std::string BaseRTLName(Name); - if (NextGenPlugins) { - if (attemptLoadRTL(BaseRTLName + ".nextgen.so", RTL)) - continue; - - DP("Falling back to original plugin...\n"); - } - - if (!attemptLoadRTL(BaseRTLName + ".so", RTL)) + if (!attemptLoadRTL(BaseRTLName + ".nextgen.so", RTL)) AllRTLs.pop_back(); } diff --git a/openmp/libomptarget/test/api/omp_dynamic_shared_memory_mixed_amdgpu.c b/openmp/libomptarget/test/api/omp_dynamic_shared_memory_mixed_amdgpu.c --- a/openmp/libomptarget/test/api/omp_dynamic_shared_memory_mixed_amdgpu.c +++ b/openmp/libomptarget/test/api/omp_dynamic_shared_memory_mixed_amdgpu.c @@ -1,6 +1,5 @@ // RUN: %libomptarget-compile-amdgcn-amd-amdhsa -O1 -mllvm -openmp-opt-inline-device -I %S -// RUN: env LIBOMPTARGET_NEXTGEN_PLUGINS=1 \ -// RUN: %libomptarget-run-amdgcn-amd-amdhsa | %fcheck-amdgcn-amd-amdhsa +// RUN: %libomptarget-run-amdgcn-amd-amdhsa | %fcheck-amdgcn-amd-amdhsa // REQUIRES: amdgcn-amd-amdhsa #include "omp_dynamic_shared_memory_mixed.inc" diff --git a/openmp/libomptarget/test/api/omp_dynamic_shared_memory_mixed_nvptx.c b/openmp/libomptarget/test/api/omp_dynamic_shared_memory_mixed_nvptx.c --- a/openmp/libomptarget/test/api/omp_dynamic_shared_memory_mixed_nvptx.c +++ b/openmp/libomptarget/test/api/omp_dynamic_shared_memory_mixed_nvptx.c @@ -1,7 +1,5 @@ // RUN: %libomptarget-compile-nvptx64-nvidia-cuda -I %S -// RUN: env LIBOMPTARGET_NEXTGEN_PLUGINS=1 \ -// RUN: %libomptarget-run-nvptx64-nvidia-cuda | %fcheck-nvptx64-nvidia-cuda -// REQUIRES: nvptx64-nvidia-cuda +// RUN: %libomptarget-run-nvptx64-nvidia-cuda | %fcheck-nvptx64-nvidia-cuda #include "omp_dynamic_shared_memory_mixed.inc" // CHECK: PASS diff --git a/openmp/libomptarget/test/lit.cfg b/openmp/libomptarget/test/lit.cfg --- a/openmp/libomptarget/test/lit.cfg +++ b/openmp/libomptarget/test/lit.cfg @@ -21,10 +21,6 @@ if 'LIBOMPTARGET_DEBUG' in os.environ: config.environment['LIBOMPTARGET_DEBUG'] = os.environ['LIBOMPTARGET_DEBUG'] -# Allow running the tests with nextgen plugins when available -if 'LIBOMPTARGET_NEXTGEN_PLUGINS' in os.environ: - config.environment['LIBOMPTARGET_NEXTGEN_PLUGINS'] = os.environ['LIBOMPTARGET_NEXTGEN_PLUGINS'] - if 'OMP_TARGET_OFFLOAD' in os.environ: config.environment['OMP_TARGET_OFFLOAD'] = os.environ['OMP_TARGET_OFFLOAD'] @@ -118,9 +114,7 @@ config.test_flags += " --libomptarget-nvptx-bc-path=" + config.library_dir if config.libomptarget_current_target.endswith('-LTO'): config.test_flags += " -foffload-lto" - if config.libomptarget_current_target.endswith('-JIT-LTO') and evaluate_bool_env( - config.environment['LIBOMPTARGET_NEXTGEN_PLUGINS'] - ): + if config.libomptarget_current_target.endswith('-JIT-LTO'): config.test_flags += " -foffload-lto" config.test_flags += " -Wl,--embed-bitcode" diff --git a/openmp/libomptarget/test/mapping/prelock.cpp b/openmp/libomptarget/test/mapping/prelock.cpp --- a/openmp/libomptarget/test/mapping/prelock.cpp +++ b/openmp/libomptarget/test/mapping/prelock.cpp @@ -1,5 +1,4 @@ -// RUN: %libomptarget-compilexx-generic -// RUN: env LIBOMPTARGET_NEXTGEN_PLUGINS=0 %libomptarget-run-generic %fcheck-generic +// RUN: %libomptarget-compile-run-and-check-generic // UNSUPPORTED: aarch64-unknown-linux-gnu // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO