diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst --- a/openmp/docs/design/Runtimes.rst +++ b/openmp/docs/design/Runtimes.rst @@ -1355,7 +1355,7 @@ .. code-block:: console $ clang++ -fopenmp --offload-arch=gfx90a -O3 shared.c - $ env LIBOMPTARGET_NEXTGEN_PLUGINS=1 ./shared + $ env ./shared .. _libomptarget_device_debugging: diff --git a/openmp/libomptarget/CMakeLists.txt b/openmp/libomptarget/CMakeLists.txt --- a/openmp/libomptarget/CMakeLists.txt +++ b/openmp/libomptarget/CMakeLists.txt @@ -105,7 +105,6 @@ "Path to folder containing llvm library libomptarget.so") # Build offloading plugins and device RTLs if they are available. -add_subdirectory(plugins) add_subdirectory(plugins-nextgen) add_subdirectory(DeviceRTL) add_subdirectory(tools) diff --git a/openmp/libomptarget/plugins-nextgen/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/CMakeLists.txt --- a/openmp/libomptarget/plugins-nextgen/CMakeLists.txt +++ b/openmp/libomptarget/plugins-nextgen/CMakeLists.txt @@ -12,12 +12,12 @@ add_subdirectory(common) -# void build_generic_elf64_nextgen(string tmachine, string tmachine_name, string tmachine_libname, string elf_machine_id); +# void build_generic_elf64(string tmachine, string tmachine_name, string tmachine_libname, string elf_machine_id); # - build a plugin for an ELF based generic 64-bit target based on libffi. # - tmachine: name of the machine processor as used in the cmake build system. # - tmachine_name: name of the machine to be printed with the debug messages. # - tmachine_libname: machine name to be appended to the plugin library name. -macro(build_generic_elf64_nextgen tmachine tmachine_name tmachine_libname tmachine_triple elf_machine_id) +macro(build_generic_elf64 tmachine tmachine_name tmachine_libname tmachine_triple elf_machine_id) if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$") if(LIBOMPTARGET_DEP_LIBFFI_FOUND) @@ -36,7 +36,7 @@ # Define target regiple add_definitions("-DLIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE=${tmachine}") - add_llvm_library("omptarget.rtl.${tmachine_libname}.nextgen" + add_llvm_library("omptarget.rtl.${tmachine_libname}" SHARED ${CMAKE_CURRENT_SOURCE_DIR}/../generic-elf-64bit/src/rtl.cpp @@ -58,18 +58,18 @@ ) # Install plugin under the lib destination folder. - install(TARGETS "omptarget.rtl.${tmachine_libname}.nextgen" + install(TARGETS "omptarget.rtl.${tmachine_libname}" LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") - set_target_properties("omptarget.rtl.${tmachine_libname}.nextgen" PROPERTIES + set_target_properties("omptarget.rtl.${tmachine_libname}" PROPERTIES INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.." CXX_VISIBILITY_PRESET protected) - target_include_directories( "omptarget.rtl.${tmachine_libname}.nextgen" PRIVATE + target_include_directories( "omptarget.rtl.${tmachine_libname}" PRIVATE ${LIBOMPTARGET_INCLUDE_DIR} ${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR}) list(APPEND LIBOMPTARGET_TESTED_PLUGINS - "omptarget.rtl.${tmachine_libname}.nextgen") + "omptarget.rtl.${tmachine_libname}") else(LIBOMPTARGET_DEP_LIBFFI_FOUND) libomptarget_say("Not building ${tmachine_name} NextGen offloading plugin: libffi dependency not found.") @@ -85,6 +85,8 @@ add_subdirectory(ppc64) add_subdirectory(ppc64le) add_subdirectory(x86_64) +add_subdirectory(ve) +add_subdirectory(remote) # Make sure the parent scope can see the plugins that will be created. 
set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE) diff --git a/openmp/libomptarget/plugins-nextgen/aarch64/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/aarch64/CMakeLists.txt --- a/openmp/libomptarget/plugins-nextgen/aarch64/CMakeLists.txt +++ b/openmp/libomptarget/plugins-nextgen/aarch64/CMakeLists.txt @@ -11,7 +11,7 @@ ##===----------------------------------------------------------------------===## if(CMAKE_SYSTEM_NAME MATCHES "Linux") - build_generic_elf64_nextgen("aarch64" "aarch64" "aarch64" "aarch64-unknown-linux-gnu" "183") + build_generic_elf64("aarch64" "aarch64" "aarch64" "aarch64-unknown-linux-gnu" "183") else() libomptarget_say("Not building aarch64 NextGen offloading plugin: machine not found in the system.") endif() diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/amdgpu/CMakeLists.txt --- a/openmp/libomptarget/plugins-nextgen/amdgpu/CMakeLists.txt +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/CMakeLists.txt @@ -52,8 +52,8 @@ set(LIBOMPTARGET_DEP_LIBRARIES hsa-runtime64::hsa-runtime64) else() libomptarget_say("Building AMDGPU NextGen plugin for dlopened libhsa") - include_directories(../../plugins/amdgpu/dynamic_hsa) - set(LIBOMPTARGET_EXTRA_SOURCE ../../plugins/amdgpu/dynamic_hsa/hsa.cpp) + include_directories(dynamic_hsa) + set(LIBOMPTARGET_EXTRA_SOURCE dynamic_hsa/hsa.cpp) set(LIBOMPTARGET_DEP_LIBRARIES) endif() @@ -66,7 +66,7 @@ set(LDFLAGS_UNDEFINED "-Wl,-z,defs") endif() -add_llvm_library(omptarget.rtl.amdgpu.nextgen SHARED +add_llvm_library(omptarget.rtl.amdgpu SHARED src/rtl.cpp ${LIBOMPTARGET_EXTRA_SOURCE} @@ -92,7 +92,7 @@ ) target_include_directories( - omptarget.rtl.amdgpu.nextgen + omptarget.rtl.amdgpu PRIVATE ${LIBOMPTARGET_INCLUDE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/utils @@ -100,7 +100,7 @@ # Install plugin under the lib destination folder. -install(TARGETS omptarget.rtl.amdgpu.nextgen LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") -set_target_properties(omptarget.rtl.amdgpu.nextgen PROPERTIES +install(TARGETS omptarget.rtl.amdgpu LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") +set_target_properties(omptarget.rtl.amdgpu PROPERTIES INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.." 
CXX_VISIBILITY_PRESET protected) diff --git a/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.h b/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h rename from openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.h rename to openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h diff --git a/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp rename from openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.cpp rename to openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp diff --git a/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa_ext_amd.h b/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h rename from openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa_ext_amd.h rename to openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h diff --git a/openmp/libomptarget/plugins-nextgen/common/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/common/CMakeLists.txt --- a/openmp/libomptarget/plugins-nextgen/common/CMakeLists.txt +++ b/openmp/libomptarget/plugins-nextgen/common/CMakeLists.txt @@ -11,3 +11,5 @@ ##===----------------------------------------------------------------------===## add_subdirectory(PluginInterface) +add_subdirectory(MemoryManager) +add_subdirectory(elf_common) diff --git a/openmp/libomptarget/plugins/common/MemoryManager/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/common/MemoryManager/CMakeLists.txt rename from openmp/libomptarget/plugins/common/MemoryManager/CMakeLists.txt rename to openmp/libomptarget/plugins-nextgen/common/MemoryManager/CMakeLists.txt diff --git a/openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h b/openmp/libomptarget/plugins-nextgen/common/MemoryManager/MemoryManager.h rename from openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h rename to openmp/libomptarget/plugins-nextgen/common/MemoryManager/MemoryManager.h diff --git a/openmp/libomptarget/plugins/common/elf_common/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/common/elf_common/CMakeLists.txt rename from openmp/libomptarget/plugins/common/elf_common/CMakeLists.txt rename to openmp/libomptarget/plugins-nextgen/common/elf_common/CMakeLists.txt --- a/openmp/libomptarget/plugins/common/elf_common/CMakeLists.txt +++ b/openmp/libomptarget/plugins-nextgen/common/elf_common/CMakeLists.txt @@ -11,7 +11,7 @@ ##===----------------------------------------------------------------------===## # NOTE: Don't try to build `elf_common` using `add_llvm_library`. -# See openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt +# See openmp/libomptarget/plugins/common/PluginInterface/CMakeLists.txt # for more explanation. 
add_library(elf_common OBJECT elf_common.cpp ELFSymbols.cpp) diff --git a/openmp/libomptarget/plugins/common/elf_common/ELFSymbols.h b/openmp/libomptarget/plugins-nextgen/common/elf_common/ELFSymbols.h rename from openmp/libomptarget/plugins/common/elf_common/ELFSymbols.h rename to openmp/libomptarget/plugins-nextgen/common/elf_common/ELFSymbols.h diff --git a/openmp/libomptarget/plugins/common/elf_common/ELFSymbols.cpp b/openmp/libomptarget/plugins-nextgen/common/elf_common/ELFSymbols.cpp rename from openmp/libomptarget/plugins/common/elf_common/ELFSymbols.cpp rename to openmp/libomptarget/plugins-nextgen/common/elf_common/ELFSymbols.cpp diff --git a/openmp/libomptarget/plugins/common/elf_common/elf_common.h b/openmp/libomptarget/plugins-nextgen/common/elf_common/elf_common.h rename from openmp/libomptarget/plugins/common/elf_common/elf_common.h rename to openmp/libomptarget/plugins-nextgen/common/elf_common/elf_common.h diff --git a/openmp/libomptarget/plugins/common/elf_common/elf_common.cpp b/openmp/libomptarget/plugins-nextgen/common/elf_common/elf_common.cpp rename from openmp/libomptarget/plugins/common/elf_common/elf_common.cpp rename to openmp/libomptarget/plugins-nextgen/common/elf_common/elf_common.cpp diff --git a/openmp/libomptarget/plugins-nextgen/cuda/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/cuda/CMakeLists.txt --- a/openmp/libomptarget/plugins-nextgen/cuda/CMakeLists.txt +++ b/openmp/libomptarget/plugins-nextgen/cuda/CMakeLists.txt @@ -26,7 +26,7 @@ set(LIBOMPTARGET_DLOPEN_LIBCUDA OFF) option(LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA "Build with dlopened libcuda" ${LIBOMPTARGET_DLOPEN_LIBCUDA}) -add_llvm_library(omptarget.rtl.cuda.nextgen SHARED +add_llvm_library(omptarget.rtl.cuda SHARED src/rtl.cpp LINK_COMPONENTS @@ -45,22 +45,22 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND AND NOT LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA) libomptarget_say("Building CUDA plugin linked against libcuda") - target_link_libraries(omptarget.rtl.cuda.nextgen PRIVATE CUDA::cuda_driver) + target_link_libraries(omptarget.rtl.cuda PRIVATE CUDA::cuda_driver) else() libomptarget_say("Building CUDA plugin for dlopened libcuda") - target_include_directories(omptarget.rtl.cuda.nextgen PRIVATE ../../plugins/cuda/dynamic_cuda) - target_sources(omptarget.rtl.cuda.nextgen PRIVATE ../../plugins/cuda/dynamic_cuda/cuda.cpp) + target_include_directories(omptarget.rtl.cuda PRIVATE dynamic_cuda) + target_sources(omptarget.rtl.cuda PRIVATE dynamic_cuda/cuda.cpp) endif() # Define debug prefix. TODO: This should be automatized in the Debug.h but it # requires changing the original plugins. -target_compile_definitions(omptarget.rtl.cuda.nextgen PRIVATE TARGET_NAME="CUDA") -target_compile_definitions(omptarget.rtl.cuda.nextgen PRIVATE DEBUG_PREFIX="TARGET CUDA RTL") +target_compile_definitions(omptarget.rtl.cuda PRIVATE TARGET_NAME="CUDA") +target_compile_definitions(omptarget.rtl.cuda PRIVATE DEBUG_PREFIX="TARGET CUDA RTL") -target_include_directories(omptarget.rtl.cuda.nextgen PRIVATE ${LIBOMPTARGET_INCLUDE_DIR}) +target_include_directories(omptarget.rtl.cuda PRIVATE ${LIBOMPTARGET_INCLUDE_DIR}) # Install plugin under the lib destination folder. -install(TARGETS omptarget.rtl.cuda.nextgen LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") -set_target_properties(omptarget.rtl.cuda.nextgen PROPERTIES +install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") +set_target_properties(omptarget.rtl.cuda PROPERTIES INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.." 
CXX_VISIBILITY_PRESET protected) diff --git a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h b/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h rename from openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h rename to openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h diff --git a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp b/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp rename from openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp rename to openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp diff --git a/openmp/libomptarget/plugins-nextgen/ppc64/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/ppc64/CMakeLists.txt --- a/openmp/libomptarget/plugins-nextgen/ppc64/CMakeLists.txt +++ b/openmp/libomptarget/plugins-nextgen/ppc64/CMakeLists.txt @@ -11,7 +11,7 @@ ##===----------------------------------------------------------------------===## if(CMAKE_SYSTEM_NAME MATCHES "Linux") - build_generic_elf64_nextgen("ppc64" "PPC64" "ppc64" "powerpc64-ibm-linux-gnu" "21") + build_generic_elf64("ppc64" "PPC64" "ppc64" "powerpc64-ibm-linux-gnu" "21") else() libomptarget_say("Not building ppc64 NextGen offloading plugin: machine not found in the system.") endif() diff --git a/openmp/libomptarget/plugins-nextgen/ppc64le/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/ppc64le/CMakeLists.txt --- a/openmp/libomptarget/plugins-nextgen/ppc64le/CMakeLists.txt +++ b/openmp/libomptarget/plugins-nextgen/ppc64le/CMakeLists.txt @@ -11,7 +11,7 @@ ##===----------------------------------------------------------------------===## if(CMAKE_SYSTEM_NAME MATCHES "Linux") - build_generic_elf64_nextgen("ppc64le" "PPC64le" "ppc64" "powerpc64le-ibm-linux-gnu" "21") + build_generic_elf64("ppc64le" "PPC64le" "ppc64" "powerpc64le-ibm-linux-gnu" "21") else() libomptarget_say("Not building ppc64le NextGen offloading plugin: machine not found in the system.") endif() diff --git a/openmp/libomptarget/plugins/remote/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/remote/CMakeLists.txt rename from openmp/libomptarget/plugins/remote/CMakeLists.txt rename to openmp/libomptarget/plugins-nextgen/remote/CMakeLists.txt diff --git a/openmp/libomptarget/plugins/remote/include/Utils.h b/openmp/libomptarget/plugins-nextgen/remote/include/Utils.h rename from openmp/libomptarget/plugins/remote/include/Utils.h rename to openmp/libomptarget/plugins-nextgen/remote/include/Utils.h diff --git a/openmp/libomptarget/plugins/remote/include/openmp.proto b/openmp/libomptarget/plugins-nextgen/remote/include/openmp.proto rename from openmp/libomptarget/plugins/remote/include/openmp.proto rename to openmp/libomptarget/plugins-nextgen/remote/include/openmp.proto diff --git a/openmp/libomptarget/plugins/remote/lib/Utils.cpp b/openmp/libomptarget/plugins-nextgen/remote/lib/Utils.cpp rename from openmp/libomptarget/plugins/remote/lib/Utils.cpp rename to openmp/libomptarget/plugins-nextgen/remote/lib/Utils.cpp diff --git a/openmp/libomptarget/plugins/remote/server/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/remote/server/CMakeLists.txt rename from openmp/libomptarget/plugins/remote/server/CMakeLists.txt rename to openmp/libomptarget/plugins-nextgen/remote/server/CMakeLists.txt diff --git a/openmp/libomptarget/plugins/remote/server/OffloadingServer.cpp b/openmp/libomptarget/plugins-nextgen/remote/server/OffloadingServer.cpp rename from openmp/libomptarget/plugins/remote/server/OffloadingServer.cpp rename to openmp/libomptarget/plugins-nextgen/remote/server/OffloadingServer.cpp 
diff --git a/openmp/libomptarget/plugins/remote/server/Server.h b/openmp/libomptarget/plugins-nextgen/remote/server/Server.h rename from openmp/libomptarget/plugins/remote/server/Server.h rename to openmp/libomptarget/plugins-nextgen/remote/server/Server.h diff --git a/openmp/libomptarget/plugins/remote/server/Server.cpp b/openmp/libomptarget/plugins-nextgen/remote/server/Server.cpp rename from openmp/libomptarget/plugins/remote/server/Server.cpp rename to openmp/libomptarget/plugins-nextgen/remote/server/Server.cpp diff --git a/openmp/libomptarget/plugins/remote/src/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/remote/src/CMakeLists.txt rename from openmp/libomptarget/plugins/remote/src/CMakeLists.txt rename to openmp/libomptarget/plugins-nextgen/remote/src/CMakeLists.txt diff --git a/openmp/libomptarget/plugins/remote/src/Client.h b/openmp/libomptarget/plugins-nextgen/remote/src/Client.h rename from openmp/libomptarget/plugins/remote/src/Client.h rename to openmp/libomptarget/plugins-nextgen/remote/src/Client.h diff --git a/openmp/libomptarget/plugins/remote/src/Client.cpp b/openmp/libomptarget/plugins-nextgen/remote/src/Client.cpp rename from openmp/libomptarget/plugins/remote/src/Client.cpp rename to openmp/libomptarget/plugins-nextgen/remote/src/Client.cpp diff --git a/openmp/libomptarget/plugins/remote/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/remote/src/rtl.cpp rename from openmp/libomptarget/plugins/remote/src/rtl.cpp rename to openmp/libomptarget/plugins-nextgen/remote/src/rtl.cpp diff --git a/openmp/libomptarget/plugins/ve/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/ve/CMakeLists.txt rename from openmp/libomptarget/plugins/ve/CMakeLists.txt rename to openmp/libomptarget/plugins-nextgen/ve/CMakeLists.txt diff --git a/openmp/libomptarget/plugins/ve/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/ve/src/rtl.cpp rename from openmp/libomptarget/plugins/ve/src/rtl.cpp rename to openmp/libomptarget/plugins-nextgen/ve/src/rtl.cpp diff --git a/openmp/libomptarget/plugins-nextgen/x86_64/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/x86_64/CMakeLists.txt --- a/openmp/libomptarget/plugins-nextgen/x86_64/CMakeLists.txt +++ b/openmp/libomptarget/plugins-nextgen/x86_64/CMakeLists.txt @@ -11,7 +11,7 @@ ##===----------------------------------------------------------------------===## if(CMAKE_SYSTEM_NAME MATCHES "Linux") - build_generic_elf64_nextgen("x86_64" "x86_64" "x86_64" "x86_64-pc-linux-gnu" "62") + build_generic_elf64("x86_64" "x86_64" "x86_64" "x86_64-pc-linux-gnu" "62") else() libomptarget_say("Not building x86_64 NextGen offloading plugin: machine not found in the system.") endif() diff --git a/openmp/libomptarget/plugins/CMakeLists.txt b/openmp/libomptarget/plugins/CMakeLists.txt deleted file mode 100644 --- a/openmp/libomptarget/plugins/CMakeLists.txt +++ /dev/null @@ -1,91 +0,0 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Build plugins for the user system if available. 
-# -##===----------------------------------------------------------------------===## - -add_subdirectory(common) - -# void build_generic_elf64(string tmachine, string tmachine_name, string tmachine_libname, string elf_machine_id); -# - build a plugin for an ELF based generic 64-bit target based on libffi. -# - tmachine: name of the machine processor as used in the cmake build system. -# - tmachine_name: name of the machine to be printed with the debug messages. -# - tmachine_libname: machine name to be appended to the plugin library name. -macro(build_generic_elf64 tmachine tmachine_name tmachine_libname tmachine_triple elf_machine_id) -if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$") - if(LIBOMPTARGET_DEP_LIBFFI_FOUND) - - libomptarget_say("Building ${tmachine_name} offloading plugin.") - - # Define macro to be used as prefix of the runtime messages for this target. - add_definitions("-DTARGET_NAME=${tmachine_name}") - - # Define macro with the ELF ID for this target. - add_definitions("-DTARGET_ELF_ID=${elf_machine_id}") - - add_llvm_library("omptarget.rtl.${tmachine_libname}" - SHARED - - ${CMAKE_CURRENT_SOURCE_DIR}/../generic-elf-64bit/src/rtl.cpp - - ADDITIONAL_HEADER_DIRS - ${LIBOMPTARGET_INCLUDE_DIR} - ${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR} - - LINK_LIBS - PRIVATE - elf_common - ${LIBOMPTARGET_DEP_LIBFFI_LIBRARIES} - ${OPENMP_PTHREAD_LIB} - "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports" - - NO_INSTALL_RPATH - ) - - # Install plugin under the lib destination folder. - install(TARGETS "omptarget.rtl.${tmachine_libname}" - LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") - set_target_properties("omptarget.rtl.${tmachine_libname}" PROPERTIES - INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.." - CXX_VISIBILITY_PRESET protected) - - target_include_directories( "omptarget.rtl.${tmachine_libname}" PRIVATE - ${LIBOMPTARGET_INCLUDE_DIR} - ${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR}) - - list(APPEND LIBOMPTARGET_TESTED_PLUGINS - "omptarget.rtl.${tmachine_libname}") - - # Report to the parent scope that we are building a plugin. - set(LIBOMPTARGET_SYSTEM_TARGETS - "${LIBOMPTARGET_SYSTEM_TARGETS} ${tmachine_triple} ${tmachine_triple}-LTO" PARENT_SCOPE) - set(LIBOMPTARGET_TESTED_PLUGINS - "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE) - - else(LIBOMPTARGET_DEP_LIBFFI_FOUND) - libomptarget_say("Not building ${tmachine_name} offloading plugin: libffi dependency not found.") - endif(LIBOMPTARGET_DEP_LIBFFI_FOUND) -else() - libomptarget_say("Not building ${tmachine_name} offloading plugin: machine not found in the system.") -endif() -endmacro() - -add_subdirectory(aarch64) -add_subdirectory(amdgpu) -add_subdirectory(cuda) -add_subdirectory(ppc64) -add_subdirectory(ppc64le) -add_subdirectory(ve) -add_subdirectory(x86_64) -add_subdirectory(remote) - -# Make sure the parent scope can see the plugins that will be created. -set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE) -set(LIBOMPTARGET_TESTED_PLUGINS "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE) - diff --git a/openmp/libomptarget/plugins/aarch64/CMakeLists.txt b/openmp/libomptarget/plugins/aarch64/CMakeLists.txt deleted file mode 100644 --- a/openmp/libomptarget/plugins/aarch64/CMakeLists.txt +++ /dev/null @@ -1,17 +0,0 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. 
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Build a plugin for an aarch64 machine if available. -# -##===----------------------------------------------------------------------===## - -if(CMAKE_SYSTEM_NAME MATCHES "Linux") - build_generic_elf64("aarch64" "aarch64" "aarch64" "aarch64-unknown-linux-gnu" "183") -else() - libomptarget_say("Not building aarch64 offloading plugin: machine not found in the system.") -endif() diff --git a/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt b/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt +++ /dev/null @@ -1,123 +0,0 @@ -##===----------------------------------------------------------------------===## -# -# The LLVM Compiler Infrastructure -# -# This file is dual licensed under the MIT and the University of Illinois Open -# Source Licenses. See LICENSE.txt for details. -# -##===----------------------------------------------------------------------===## -# -# Build a plugin for an AMDGPU machine if available. -# -##===----------------------------------------------------------------------===## - -################################################################################ -set(LIBOMPTARGET_BUILD_AMDGPU_PLUGIN TRUE CACHE BOOL - "Whether to build AMDGPU plugin") -if (NOT LIBOMPTARGET_BUILD_AMDGPU_PLUGIN) - libomptarget_say("Not building AMDGPU offloading plugin: LIBOMPTARGET_BUILD_AMDGPU_PLUGIN is false") - return() -endif() - -# as of rocm-3.7, hsa is installed with cmake packages and kmt is found via hsa -find_package(hsa-runtime64 QUIET 1.2.0 HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm) - -if(NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux")) - libomptarget_say("Not building AMDGPU plugin: only support AMDGPU in Linux x86_64, ppc64le, or aarch64 hosts") - return() -endif() - -################################################################################ -# Define the suffix for the runtime messaging dumps. -add_definitions(-DTARGET_NAME=AMDGPU) -if(CMAKE_SYSTEM_PROCESSOR MATCHES "(ppc64le)|(aarch64)$") - add_definitions(-DLITTLEENDIAN_CPU=1) -endif() - -if(CMAKE_BUILD_TYPE MATCHES Debug) - add_definitions(-DDEBUG) -endif() - -set(LIBOMPTARGET_DLOPEN_LIBHSA OFF) -option(LIBOMPTARGET_FORCE_DLOPEN_LIBHSA "Build with dlopened libhsa" ${LIBOMPTARGET_DLOPEN_LIBHSA}) - -if (${hsa-runtime64_FOUND} AND NOT LIBOMPTARGET_FORCE_DLOPEN_LIBHSA) - libomptarget_say("Building AMDGPU plugin linked against libhsa") - set(LIBOMPTARGET_EXTRA_SOURCE) - set(LIBOMPTARGET_DEP_LIBRARIES hsa-runtime64::hsa-runtime64) -else() - libomptarget_say("Building AMDGPU plugin for dlopened libhsa") - include_directories(dynamic_hsa) - set(LIBOMPTARGET_EXTRA_SOURCE dynamic_hsa/hsa.cpp) - set(LIBOMPTARGET_DEP_LIBRARIES) -endif() - -if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") - # On FreeBSD, the 'environ' symbol is undefined at link time, but resolved by - # the dynamic linker at runtime. Therefore, allow the symbol to be undefined - # when creating a shared library. 
- set(LDFLAGS_UNDEFINED "-Wl,--allow-shlib-undefined") -else() - set(LDFLAGS_UNDEFINED "-Wl,-z,defs") -endif() - -add_llvm_library(omptarget.rtl.amdgpu SHARED - impl/impl.cpp - impl/interop_hsa.cpp - impl/data.cpp - impl/get_elf_mach_gfx_name.cpp - impl/system.cpp - impl/msgpack.cpp - src/rtl.cpp - ${LIBOMPTARGET_EXTRA_SOURCE} - - ADDITIONAL_HEADER_DIRS - ${LIBOMPTARGET_INCLUDE_DIR} - ${CMAKE_CURRENT_SOURCE_DIR}/impl - ${CMAKE_CURRENT_SOURCE_DIR}/../../plugins-nextgen/amdgpu/utils - - LINK_COMPONENTS - Support - Object - - LINK_LIBS - PRIVATE - elf_common - ${LIBOMPTARGET_DEP_LIBRARIES} - ${OPENMP_PTHREAD_LIB} - "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports" - ${LDFLAGS_UNDEFINED} - - NO_INSTALL_RPATH -) - -target_include_directories( - omptarget.rtl.amdgpu - PRIVATE - ${LIBOMPTARGET_INCLUDE_DIR} - ${CMAKE_CURRENT_SOURCE_DIR}/impl - ${CMAKE_CURRENT_SOURCE_DIR}/../../plugins-nextgen/amdgpu/utils -) - - -# Install plugin under the lib destination folder. -install(TARGETS omptarget.rtl.amdgpu LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") -set_target_properties(omptarget.rtl.amdgpu PROPERTIES - INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.." - CXX_VISIBILITY_PRESET protected) - -# Report to the parent scope that we are building a plugin for hsa. -# This controls whether tests are run for the nvptx offloading target -# Run them if libhsa is available, or if the user explicitly asked for dlopen -# Otherwise this plugin is being built speculatively and there may be no hsa available -option(LIBOMPTARGET_FORCE_AMDGPU_TESTS "Build AMDGPU libomptarget tests" OFF) -if (LIBOMPTARGET_FOUND_AMDGPU_GPU OR LIBOMPTARGET_FORCE_AMDGPU_TESTS) - # Report to the parent scope that we are building a plugin for amdgpu - set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} amdgcn-amd-amdhsa" PARENT_SCOPE) - list(APPEND LIBOMPTARGET_TESTED_PLUGINS "omptarget.rtl.amdgpu") - set(LIBOMPTARGET_TESTED_PLUGINS "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE) -else() - libomptarget_say("Not generating amdgcn test targets as libhsa is not linkable") - return() -endif() - diff --git a/openmp/libomptarget/plugins/amdgpu/impl/data.cpp b/openmp/libomptarget/plugins/amdgpu/impl/data.cpp deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/data.cpp +++ /dev/null @@ -1,37 +0,0 @@ -//===--- amdgpu/impl/data.cpp ------------------------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#include "impl_runtime.h" -#include "hsa_api.h" -#include "internal.h" -#include "rt.h" -#include -#include -#include -#include - -using core::TaskImpl; - -namespace core { -namespace Runtime { -hsa_status_t HostMalloc(void **ptr, size_t size, - hsa_amd_memory_pool_t MemoryPool) { - hsa_status_t err = hsa_amd_memory_pool_allocate(MemoryPool, size, 0, ptr); - DP("Malloced %p\n", *ptr); - if (err == HSA_STATUS_SUCCESS) { - err = core::allow_access_to_all_gpu_agents(*ptr); - } - return err; -} - -hsa_status_t Memfree(void *ptr) { - hsa_status_t err = hsa_amd_memory_pool_free(ptr); - DP("Freed %p\n", ptr); - return err; -} -} // namespace Runtime -} // namespace core diff --git a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h +++ /dev/null @@ -1,15 +0,0 @@ -//===--- amdgpu/impl/get_elf_mach_gfx_name.h ---------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#ifndef GET_ELF_MACH_GFX_NAME_H_INCLUDED -#define GET_ELF_MACH_GFX_NAME_H_INCLUDED - -#include - -const char *get_elf_mach_gfx_name(uint32_t EFlags); - -#endif diff --git a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp +++ /dev/null @@ -1,80 +0,0 @@ -//===--- amdgpu/impl/get_elf_mach_gfx_name.cpp -------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#include "get_elf_mach_gfx_name.h" - -// This header conflicts with the system elf.h (macros vs enums of the same -// identifier) and contains more up to date values for the enum checked here. -// rtl.cpp uses the system elf.h. 
-#include "llvm/BinaryFormat/ELF.h" - -const char *get_elf_mach_gfx_name(uint32_t EFlags) { - using namespace llvm::ELF; - uint32_t Gfx = (EFlags & EF_AMDGPU_MACH); - switch (Gfx) { - case EF_AMDGPU_MACH_AMDGCN_GFX801: - return "gfx801"; - case EF_AMDGPU_MACH_AMDGCN_GFX802: - return "gfx802"; - case EF_AMDGPU_MACH_AMDGCN_GFX803: - return "gfx803"; - case EF_AMDGPU_MACH_AMDGCN_GFX805: - return "gfx805"; - case EF_AMDGPU_MACH_AMDGCN_GFX810: - return "gfx810"; - case EF_AMDGPU_MACH_AMDGCN_GFX900: - return "gfx900"; - case EF_AMDGPU_MACH_AMDGCN_GFX902: - return "gfx902"; - case EF_AMDGPU_MACH_AMDGCN_GFX904: - return "gfx904"; - case EF_AMDGPU_MACH_AMDGCN_GFX906: - return "gfx906"; - case EF_AMDGPU_MACH_AMDGCN_GFX908: - return "gfx908"; - case EF_AMDGPU_MACH_AMDGCN_GFX909: - return "gfx909"; - case EF_AMDGPU_MACH_AMDGCN_GFX90A: - return "gfx90a"; - case EF_AMDGPU_MACH_AMDGCN_GFX90C: - return "gfx90c"; - case EF_AMDGPU_MACH_AMDGCN_GFX940: - return "gfx940"; - case EF_AMDGPU_MACH_AMDGCN_GFX1010: - return "gfx1010"; - case EF_AMDGPU_MACH_AMDGCN_GFX1011: - return "gfx1011"; - case EF_AMDGPU_MACH_AMDGCN_GFX1012: - return "gfx1012"; - case EF_AMDGPU_MACH_AMDGCN_GFX1013: - return "gfx1013"; - case EF_AMDGPU_MACH_AMDGCN_GFX1030: - return "gfx1030"; - case EF_AMDGPU_MACH_AMDGCN_GFX1031: - return "gfx1031"; - case EF_AMDGPU_MACH_AMDGCN_GFX1032: - return "gfx1032"; - case EF_AMDGPU_MACH_AMDGCN_GFX1033: - return "gfx1033"; - case EF_AMDGPU_MACH_AMDGCN_GFX1034: - return "gfx1034"; - case EF_AMDGPU_MACH_AMDGCN_GFX1035: - return "gfx1035"; - case EF_AMDGPU_MACH_AMDGCN_GFX1036: - return "gfx1036"; - case EF_AMDGPU_MACH_AMDGCN_GFX1100: - return "gfx1100"; - case EF_AMDGPU_MACH_AMDGCN_GFX1101: - return "gfx1101"; - case EF_AMDGPU_MACH_AMDGCN_GFX1102: - return "gfx1102"; - case EF_AMDGPU_MACH_AMDGCN_GFX1103: - return "gfx1103"; - default: - return "--unknown gfx"; - } -} diff --git a/openmp/libomptarget/plugins/amdgpu/impl/hsa_api.h b/openmp/libomptarget/plugins/amdgpu/impl/hsa_api.h deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/hsa_api.h +++ /dev/null @@ -1,26 +0,0 @@ -//===--- amdgpu/impl/hsa_api.h ------------------------------------ C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#ifndef AMDGPU_HSA_API_H_INCLUDED -#define AMDGPU_HSA_API_H_INCLUDED - -#if defined(__has_include) -#if __has_include("hsa/hsa.h") -#include "hsa/hsa.h" -#include "hsa/hsa_ext_amd.h" -#elif __has_include("hsa.h") -#include "hsa.h" -#include "hsa_ext_amd.h" -#endif -#else -#include "hsa/hsa.h" -#include "hsa_ext_amd.h" -#endif - - - -#endif diff --git a/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp b/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp +++ /dev/null @@ -1,182 +0,0 @@ -//===--- amdgpu/impl/impl.cpp ------------------------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#include "rt.h" -#include - -/* - * Data - */ - -hsa_status_t is_locked(void *ptr, void **agentBaseAddress) { - hsa_status_t err = HSA_STATUS_SUCCESS; - hsa_amd_pointer_info_t info; - info.size = sizeof(hsa_amd_pointer_info_t); - err = hsa_amd_pointer_info(ptr, &info, /*alloc=*/nullptr, - /*num_agents_accessible=*/nullptr, - /*accessible=*/nullptr); - if (err != HSA_STATUS_SUCCESS) { - DP("Error when getting pointer info\n"); - return err; - } - - if (info.type == HSA_EXT_POINTER_TYPE_LOCKED) { - // When user passes in a basePtr+offset we need to fix the - // locked pointer to include the offset: ROCr always returns - // the base locked address, not the shifted one. - if ((char *)info.hostBaseAddress <= (char *)ptr && - (char *)ptr < (char *)info.hostBaseAddress + info.sizeInBytes) - *agentBaseAddress = - (void *)((uint64_t)info.agentBaseAddress + (uint64_t)ptr - - (uint64_t)info.hostBaseAddress); - else // address is already device-agent accessible, no need to compute - // offset - *agentBaseAddress = ptr; - } else - *agentBaseAddress = nullptr; - - return HSA_STATUS_SUCCESS; -} - -// host pointer (either src or dest) must be locked via hsa_amd_memory_lock -static hsa_status_t invoke_hsa_copy(hsa_signal_t signal, void *dest, - hsa_agent_t agent, const void *src, - size_t size) { - const hsa_signal_value_t init = 1; - const hsa_signal_value_t success = 0; - hsa_signal_store_screlease(signal, init); - - hsa_status_t err = hsa_amd_memory_async_copy(dest, agent, src, agent, size, 0, - nullptr, signal); - if (err != HSA_STATUS_SUCCESS) - return err; - - // async_copy reports success by decrementing and failure by setting to < 0 - hsa_signal_value_t got = init; - while (got == init) - got = hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_NE, init, - UINT64_MAX, HSA_WAIT_STATE_BLOCKED); - - if (got != success) - return HSA_STATUS_ERROR; - - return err; -} - -struct implFreePtrDeletor { - void operator()(void *p) { - core::Runtime::Memfree(p); // ignore failure to free - } -}; - -enum CopyDirection { H2D, D2H }; - -static hsa_status_t locking_async_memcpy(enum CopyDirection direction, - hsa_signal_t signal, void *dest, - hsa_agent_t agent, void *src, - void *lockingPtr, size_t size) { - void *lockedPtr = nullptr; - hsa_status_t err = is_locked(lockingPtr, &lockedPtr); - bool HostPtrIsLocked = true; - if (err != HSA_STATUS_SUCCESS) - return err; - if (!lockedPtr) { // not locked - HostPtrIsLocked = false; - hsa_agent_t agents[1] = {agent}; - err = hsa_amd_memory_lock(lockingPtr, size, agents, /*num_agent=*/1, - (void **)&lockedPtr); - if (err != HSA_STATUS_SUCCESS) - return err; - DP("locking_async_memcpy: lockingPtr=%p lockedPtr=%p Size = %lu\n", - lockingPtr, lockedPtr, size); - } - - switch (direction) { - case H2D: - err = invoke_hsa_copy(signal, dest, agent, lockedPtr, size); - break; - case D2H: - err = invoke_hsa_copy(signal, lockedPtr, agent, src, size); - break; - } - - if (err != HSA_STATUS_SUCCESS && !HostPtrIsLocked) { - // do not leak locked host pointers, but discard potential error message - // because the initial error was in the copy function - hsa_amd_memory_unlock(lockingPtr); - return err; - } - - // unlock only if not user locked - if (!HostPtrIsLocked) - err = hsa_amd_memory_unlock(lockingPtr); - if (err != HSA_STATUS_SUCCESS) - return err; - - return HSA_STATUS_SUCCESS; -} - -hsa_status_t impl_memcpy_h2d(hsa_signal_t 
signal, void *deviceDest, - void *hostSrc, size_t size, - hsa_agent_t device_agent, - hsa_amd_memory_pool_t MemoryPool) { - hsa_status_t err; - - err = locking_async_memcpy(CopyDirection::H2D, signal, deviceDest, - device_agent, hostSrc, hostSrc, size); - - if (err == HSA_STATUS_SUCCESS) - return err; - - // async memcpy sometimes fails in situations where - // allocate + copy succeeds. Looks like it might be related to - // locking part of a read only segment. Fall back for now. - void *tempHostPtr; - hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool); - if (ret != HSA_STATUS_SUCCESS) { - DP("HostMalloc: Unable to alloc %zu bytes for temp scratch\n", size); - return ret; - } - std::unique_ptr del(tempHostPtr); - memcpy(tempHostPtr, hostSrc, size); - - return locking_async_memcpy(CopyDirection::H2D, signal, deviceDest, - device_agent, tempHostPtr, tempHostPtr, size); -} - -hsa_status_t impl_memcpy_d2h(hsa_signal_t signal, void *hostDest, - void *deviceSrc, size_t size, - hsa_agent_t deviceAgent, - hsa_amd_memory_pool_t MemoryPool) { - hsa_status_t err; - - // device has always visibility over both pointers, so use that - err = locking_async_memcpy(CopyDirection::D2H, signal, hostDest, deviceAgent, - deviceSrc, hostDest, size); - - if (err == HSA_STATUS_SUCCESS) - return err; - - // hsa_memory_copy sometimes fails in situations where - // allocate + copy succeeds. Looks like it might be related to - // locking part of a read only segment. Fall back for now. - void *tempHostPtr; - hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool); - if (ret != HSA_STATUS_SUCCESS) { - DP("HostMalloc: Unable to alloc %zu bytes for temp scratch\n", size); - return ret; - } - std::unique_ptr del(tempHostPtr); - - err = locking_async_memcpy(CopyDirection::D2H, signal, tempHostPtr, - deviceAgent, deviceSrc, tempHostPtr, size); - if (err != HSA_STATUS_SUCCESS) - return HSA_STATUS_ERROR; - - memcpy(hostDest, tempHostPtr, size); - return HSA_STATUS_SUCCESS; -} diff --git a/openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h b/openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h +++ /dev/null @@ -1,34 +0,0 @@ -//===--- amdgpu/impl/impl_runtime.h ------------------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#ifndef INCLUDE_IMPL_RUNTIME_H_ -#define INCLUDE_IMPL_RUNTIME_H_ - -#include "hsa_api.h" - -extern "C" { - -// Check if pointer ptr is already locked -hsa_status_t is_locked(void *ptr, void **agentBaseAddress); - -hsa_status_t impl_module_register_from_memory_to_place( - void *module_bytes, size_t module_size, int DeviceId, - hsa_status_t (*on_deserialized_data)(void *data, size_t size, - void *cb_state), - void *cb_state); - -hsa_status_t impl_memcpy_h2d(hsa_signal_t signal, void *deviceDest, - void *hostSrc, size_t size, - hsa_agent_t device_agent, - hsa_amd_memory_pool_t MemoryPool); - -hsa_status_t impl_memcpy_d2h(hsa_signal_t sig, void *hostDest, void *deviceSrc, - size_t size, hsa_agent_t device_agent, - hsa_amd_memory_pool_t MemoryPool); -} - -#endif // INCLUDE_IMPL_RUNTIME_H_ diff --git a/openmp/libomptarget/plugins/amdgpu/impl/internal.h b/openmp/libomptarget/plugins/amdgpu/impl/internal.h deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/internal.h +++ /dev/null @@ -1,154 +0,0 @@ -//===--- amdgpu/impl/internal.h ----------------------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#ifndef SRC_RUNTIME_INCLUDE_INTERNAL_H_ -#define SRC_RUNTIME_INCLUDE_INTERNAL_H_ -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "hsa_api.h" - -#include "impl_runtime.h" - -#ifndef TARGET_NAME -#error "Missing TARGET_NAME macro" -#endif -#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL" -#include "Debug.h" - -#define MAX_NUM_KERNELS (1024 * 16) - -// ---------------------- Kernel Start ------------- -typedef struct atl_kernel_info_s { - uint64_t kernel_object; - uint32_t group_segment_size; - uint32_t private_segment_size; - uint32_t sgpr_count; - uint32_t vgpr_count; - uint32_t sgpr_spill_count; - uint32_t vgpr_spill_count; - uint32_t kernel_segment_size; - uint32_t explicit_argument_count; - uint32_t implicit_argument_count; -} atl_kernel_info_t; - -typedef struct atl_symbol_info_s { - uint64_t addr; - uint32_t size; -} atl_symbol_info_t; - -// ---------------------- Kernel End ------------- - -namespace core { -class TaskgroupImpl; -class TaskImpl; -class Kernel; -class KernelImpl; -} // namespace core - -struct SignalPoolT { - SignalPoolT() {} - SignalPoolT(const SignalPoolT &) = delete; - SignalPoolT(SignalPoolT &&) = delete; - ~SignalPoolT() { - size_t N = state.size(); - for (size_t i = 0; i < N; i++) { - hsa_signal_t signal = state.front(); - state.pop(); - hsa_status_t rc = hsa_signal_destroy(signal); - if (rc != HSA_STATUS_SUCCESS) { - DP("Signal pool destruction failed\n"); - } - } - } - size_t size() { - lock l(&mutex); - return state.size(); - } - void push(hsa_signal_t s) { - lock l(&mutex); - state.push(s); - } - hsa_signal_t pop(void) { - lock l(&mutex); - if (!state.empty()) { - hsa_signal_t res = state.front(); - state.pop(); - return res; - } - - // Pool empty, attempt to create another signal - hsa_signal_t new_signal; - hsa_status_t err = hsa_signal_create(0, 0, NULL, &new_signal); - if (err == HSA_STATUS_SUCCESS) { - return new_signal; - } - - // Fail - 
return {0}; - } - -private: - static pthread_mutex_t mutex; - std::queue state; - struct lock { - lock(pthread_mutex_t *m) : m(m) { pthread_mutex_lock(m); } - ~lock() { pthread_mutex_unlock(m); } - pthread_mutex_t *m; - }; -}; - -namespace core { -hsa_status_t atl_init_gpu_context(); - -hsa_status_t init_hsa(); -hsa_status_t finalize_hsa(); -/* - * Generic utils - */ -template inline T alignDown(T value, size_t alignment) { - return (T)(value & ~(alignment - 1)); -} - -template inline T *alignDown(T *value, size_t alignment) { - return reinterpret_cast(alignDown((intptr_t)value, alignment)); -} - -template inline T alignUp(T value, size_t alignment) { - return alignDown((T)(value + alignment - 1), alignment); -} - -template inline T *alignUp(T *value, size_t alignment) { - return reinterpret_cast( - alignDown((intptr_t)(value + alignment - 1), alignment)); -} - -extern bool atl_is_impl_initialized(); - -bool handle_group_signal(hsa_signal_value_t value, void *arg); - -hsa_status_t allow_access_to_all_gpu_agents(void *ptr); -} // namespace core - -inline const char *get_error_string(hsa_status_t err) { - const char *res; - hsa_status_t rc = hsa_status_string(err, &res); - return (rc == HSA_STATUS_SUCCESS) ? res : "HSA_STATUS UNKNOWN."; -} - -#endif // SRC_RUNTIME_INCLUDE_INTERNAL_H_ diff --git a/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.h b/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.h deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.h +++ /dev/null @@ -1,26 +0,0 @@ -//===--- amdgpu/impl/interop_hsa.h -------------------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#ifndef INCLUDE_INTEROP_HSA_H_ -#define INCLUDE_INTEROP_HSA_H_ - -#include "impl_runtime.h" -#include "hsa_api.h" -#include "internal.h" - -#include -#include - -extern "C" { - -hsa_status_t interop_hsa_get_symbol_info( - const std::map &SymbolInfoTable, - int DeviceId, const char *symbol, void **var_addr, unsigned int *var_size); - -} - -#endif // INCLUDE_INTEROP_HSA_H_ diff --git a/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.cpp b/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.cpp deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.cpp +++ /dev/null @@ -1,39 +0,0 @@ -//===--- amdgpu/impl/interop_hsa.cpp ------------------------------ C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#include "interop_hsa.h" -#include "internal.h" - -hsa_status_t interop_hsa_get_symbol_info( - const std::map &SymbolInfoTable, - int DeviceId, const char *symbol, void **var_addr, unsigned int *var_size) { - /* - // Typical usage: - void *var_addr; - size_t var_size; - interop_hsa_get_symbol_addr(gpu_place, "symbol_name", &var_addr, - &var_size); - impl_memcpy(signal, host_add, var_addr, var_size); - */ - - if (!symbol || !var_addr || !var_size) - return HSA_STATUS_ERROR; - - // get the symbol info - std::string symbolStr = std::string(symbol); - auto It = SymbolInfoTable.find(symbolStr); - if (It != SymbolInfoTable.end()) { - atl_symbol_info_t info = It->second; - *var_addr = reinterpret_cast(info.addr); - *var_size = info.size; - return HSA_STATUS_SUCCESS; - } else { - *var_addr = NULL; - *var_size = 0; - return HSA_STATUS_ERROR; - } -} diff --git a/openmp/libomptarget/plugins/amdgpu/impl/msgpack.h b/openmp/libomptarget/plugins/amdgpu/impl/msgpack.h deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/msgpack.h +++ /dev/null @@ -1,282 +0,0 @@ -//===--- amdgpu/impl/msgpack.h ------------------------------------ C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#ifndef MSGPACK_H -#define MSGPACK_H - -#include - -namespace msgpack { - -// The message pack format is dynamically typed, schema-less. Format is: -// message: [type][header][payload] -// where type is one byte, header length is a fixed length function of type -// payload is zero to N bytes, with the length encoded in [type][header] - -// Scalar fields include boolean, signed integer, float, string etc -// Composite types are sequences of messages -// Array field is [header][element][element]... -// Map field is [header][key][value][key][value]... - -// Multibyte integer fields are big endian encoded -// The map key can be any message type -// Maps may contain duplicate keys -// Data is not uniquely encoded, e.g. integer "8" may be stored as one byte or -// in as many as nine, as signed or unsigned. Implementation defined. -// Similarly "foo" may embed the length in the type field or in multiple bytes - -// This parser is structured as an iterator over a sequence of bytes. -// It calls a user provided function on each message in order to extract fields -// The default implementation for each scalar type is to do nothing. For map or -// arrays, the default implementation returns just after that message to support -// iterating to the next message, but otherwise has no effect. 
- -struct byte_range { - const unsigned char *start; - const unsigned char *end; -}; - -const unsigned char *skip_next_message(const unsigned char *start, - const unsigned char *end); - -template class functors_defaults { -public: - void cb_string(size_t N, const unsigned char *str) { - derived().handle_string(N, str); - } - void cb_boolean(bool x) { derived().handle_boolean(x); } - void cb_signed(int64_t x) { derived().handle_signed(x); } - void cb_unsigned(uint64_t x) { derived().handle_unsigned(x); } - void cb_array_elements(byte_range bytes) { - derived().handle_array_elements(bytes); - } - void cb_map_elements(byte_range key, byte_range value) { - derived().handle_map_elements(key, value); - } - const unsigned char *cb_array(uint64_t N, byte_range bytes) { - return derived().handle_array(N, bytes); - } - const unsigned char *cb_map(uint64_t N, byte_range bytes) { - return derived().handle_map(N, bytes); - } - -private: - Derived &derived() { return *static_cast(this); } - - // Default implementations for scalar ops are no-ops - void handle_string(size_t, const unsigned char *) {} - void handle_boolean(bool) {} - void handle_signed(int64_t) {} - void handle_unsigned(uint64_t) {} - void handle_array_elements(byte_range) {} - void handle_map_elements(byte_range, byte_range) {} - - // Default implementation for sequences is to skip over the messages - const unsigned char *handle_array(uint64_t N, byte_range bytes) { - for (uint64_t i = 0; i < N; i++) { - const unsigned char *next = skip_next_message(bytes.start, bytes.end); - if (!next) { - return nullptr; - } - cb_array_elements(bytes); - bytes.start = next; - } - return bytes.start; - } - const unsigned char *handle_map(uint64_t N, byte_range bytes) { - for (uint64_t i = 0; i < N; i++) { - const unsigned char *start_key = bytes.start; - const unsigned char *end_key = skip_next_message(start_key, bytes.end); - if (!end_key) { - return nullptr; - } - const unsigned char *start_value = end_key; - const unsigned char *end_value = - skip_next_message(start_value, bytes.end); - if (!end_value) { - return nullptr; - } - cb_map_elements({start_key, end_key}, {start_value, end_value}); - bytes.start = end_value; - } - return bytes.start; - } -}; - -typedef enum : uint8_t { -#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) NAME, -#include "msgpack.def" -#undef X -} type; - -[[noreturn]] void internal_error(); -type parse_type(unsigned char x); -unsigned bytes_used_fixed(type ty); - -typedef uint64_t (*payload_info_t)(const unsigned char *); -payload_info_t payload_info(msgpack::type ty); - -template R bitcast(T x); - -template -const unsigned char *handle_msgpack_given_type(byte_range bytes, F f) { - const unsigned char *start = bytes.start; - const unsigned char *end = bytes.end; - const uint64_t available = end - start; - assert(available != 0); - assert(ty == parse_type(*start)); - - const uint64_t bytes_used = bytes_used_fixed(ty); - if (available < bytes_used) { - return 0; - } - const uint64_t available_post_header = available - bytes_used; - - const payload_info_t info = payload_info(ty); - const uint64_t N = info(start); - - switch (ty) { - case msgpack::t: - case msgpack::f: { - // t is 0b11000010, f is 0b11000011, masked with 0x1 - f.cb_boolean(N); - return start + bytes_used; - } - - case msgpack::posfixint: - case msgpack::uint8: - case msgpack::uint16: - case msgpack::uint32: - case msgpack::uint64: { - f.cb_unsigned(N); - return start + bytes_used; - } - - case msgpack::negfixint: - case msgpack::int8: - case msgpack::int16: - case 
msgpack::int32: - case msgpack::int64: { - f.cb_signed(bitcast(N)); - return start + bytes_used; - } - - case msgpack::fixstr: - case msgpack::str8: - case msgpack::str16: - case msgpack::str32: { - if (available_post_header < N) { - return 0; - } else { - f.cb_string(N, start + bytes_used); - return start + bytes_used + N; - } - } - - case msgpack::fixarray: - case msgpack::array16: - case msgpack::array32: { - return f.cb_array(N, {start + bytes_used, end}); - } - - case msgpack::fixmap: - case msgpack::map16: - case msgpack::map32: { - return f.cb_map(N, {start + bytes_used, end}); - } - - case msgpack::nil: - case msgpack::bin8: - case msgpack::bin16: - case msgpack::bin32: - case msgpack::float32: - case msgpack::float64: - case msgpack::ext8: - case msgpack::ext16: - case msgpack::ext32: - case msgpack::fixext1: - case msgpack::fixext2: - case msgpack::fixext4: - case msgpack::fixext8: - case msgpack::fixext16: - case msgpack::never_used: { - if (available_post_header < N) { - return 0; - } - return start + bytes_used + N; - } - } - internal_error(); -} - -template -const unsigned char *handle_msgpack(byte_range bytes, F f) { - const unsigned char *start = bytes.start; - const unsigned char *end = bytes.end; - const uint64_t available = end - start; - if (available == 0) { - return 0; - } - const type ty = parse_type(*start); - - switch (ty) { -#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \ - case msgpack::NAME: \ - return handle_msgpack_given_type(bytes, f); -#include "msgpack.def" -#undef X - } - - internal_error(); -} - -bool message_is_string(byte_range bytes, const char *str); - -template void foronly_string(byte_range bytes, C callback) { - struct inner : functors_defaults { - inner(C &cb) : cb(cb) {} - C &cb; - void handle_string(size_t N, const unsigned char *str) { cb(N, str); } - }; - handle_msgpack(bytes, {callback}); -} - -template void foronly_unsigned(byte_range bytes, C callback) { - struct inner : functors_defaults { - inner(C &cb) : cb(cb) {} - C &cb; - void handle_unsigned(uint64_t x) { cb(x); } - }; - handle_msgpack(bytes, {callback}); -} - -template void foreach_array(byte_range bytes, C callback) { - struct inner : functors_defaults { - inner(C &cb) : cb(cb) {} - C &cb; - void handle_array_elements(byte_range element) { cb(element); } - }; - handle_msgpack(bytes, {callback}); -} - -template void foreach_map(byte_range bytes, C callback) { - struct inner : functors_defaults { - inner(C &cb) : cb(cb) {} - C &cb; - void handle_map_elements(byte_range key, byte_range value) { - cb(key, value); - } - }; - handle_msgpack(bytes, {callback}); -} - -// Crude approximation to json -void dump(byte_range); - -} // namespace msgpack - -#endif diff --git a/openmp/libomptarget/plugins/amdgpu/impl/msgpack.cpp b/openmp/libomptarget/plugins/amdgpu/impl/msgpack.cpp deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/msgpack.cpp +++ /dev/null @@ -1,271 +0,0 @@ -//===--- amdgpu/impl/msgpack.cpp ---------------------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#include -#include -#include -#include -#include - -#include "msgpack.h" - -namespace msgpack { - -[[noreturn]] void internal_error() { - printf("internal error\n"); - exit(1); -} - -const char *type_name(type ty) { - switch (ty) { -#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \ - case NAME: \ - return #NAME; -#include "msgpack.def" -#undef X - } - internal_error(); -} - -unsigned bytes_used_fixed(msgpack::type ty) { - using namespace msgpack; - switch (ty) { -#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \ - case NAME: \ - return WIDTH; -#include "msgpack.def" -#undef X - } - internal_error(); -} - -msgpack::type parse_type(unsigned char x) { - -#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \ - if (x >= LOWER && x <= UPPER) { \ - return NAME; \ - } else -#include "msgpack.def" -#undef X - { internal_error(); } -} - -template R bitcast(T x) { - static_assert(sizeof(T) == sizeof(R), ""); - R tmp; - memcpy(&tmp, &x, sizeof(T)); - return tmp; -} -template int64_t bitcast(uint64_t); -} // namespace msgpack - -// Helper functions for reading additional payload from the header -// Depending on the type, this can be a number of bytes, elements, -// key-value pairs or an embedded integer. -// Each takes a pointer to the start of the header and returns a uint64_t - -namespace { -namespace payload { -uint64_t read_zero(const unsigned char *) { return 0; } - -// Read the first byte and zero/sign extend it -uint64_t read_embedded_u8(const unsigned char *start) { return start[0]; } -uint64_t read_embedded_s8(const unsigned char *start) { - int64_t res = msgpack::bitcast(start[0]); - return msgpack::bitcast(res); -} - -// Read a masked part of the first byte -uint64_t read_via_mask_0x1(const unsigned char *start) { return *start & 0x1u; } -uint64_t read_via_mask_0xf(const unsigned char *start) { return *start & 0xfu; } -uint64_t read_via_mask_0x1f(const unsigned char *start) { - return *start & 0x1fu; -} - -// Read 1/2/4/8 bytes immediately following the type byte and zero/sign extend -// Big endian format. -uint64_t read_size_field_u8(const unsigned char *from) { - from++; - return from[0]; -} - -// TODO: detect whether host is little endian or not, and whether the intrinsic -// is available. 
And probably use the builtin to test the diy -const bool use_bswap = false; - -uint64_t read_size_field_u16(const unsigned char *from) { - from++; - if (use_bswap) { - uint16_t b; - memcpy(&b, from, 2); - return __builtin_bswap16(b); - } else { - return (from[0] << 8u) | from[1]; - } -} -uint64_t read_size_field_u32(const unsigned char *from) { - from++; - if (use_bswap) { - uint32_t b; - memcpy(&b, from, 4); - return __builtin_bswap32(b); - } else { - return (from[0] << 24u) | (from[1] << 16u) | (from[2] << 8u) | - (from[3] << 0u); - } -} -uint64_t read_size_field_u64(const unsigned char *from) { - from++; - if (use_bswap) { - uint64_t b; - memcpy(&b, from, 8); - return __builtin_bswap64(b); - } else { - return ((uint64_t)from[0] << 56u) | ((uint64_t)from[1] << 48u) | - ((uint64_t)from[2] << 40u) | ((uint64_t)from[3] << 32u) | - (from[4] << 24u) | (from[5] << 16u) | (from[6] << 8u) | - (from[7] << 0u); - } -} - -uint64_t read_size_field_s8(const unsigned char *from) { - uint8_t u = read_size_field_u8(from); - int64_t res = msgpack::bitcast(u); - return msgpack::bitcast(res); -} -uint64_t read_size_field_s16(const unsigned char *from) { - uint16_t u = read_size_field_u16(from); - int64_t res = msgpack::bitcast(u); - return msgpack::bitcast(res); -} -uint64_t read_size_field_s32(const unsigned char *from) { - uint32_t u = read_size_field_u32(from); - int64_t res = msgpack::bitcast(u); - return msgpack::bitcast(res); -} -uint64_t read_size_field_s64(const unsigned char *from) { - uint64_t u = read_size_field_u64(from); - int64_t res = msgpack::bitcast(u); - return msgpack::bitcast(res); -} -} // namespace payload -} // namespace - -namespace msgpack { - -payload_info_t payload_info(msgpack::type ty) { - using namespace msgpack; - switch (ty) { -#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \ - case NAME: \ - return payload::PAYLOAD; -#include "msgpack.def" -#undef X - } - internal_error(); -} - -} // namespace msgpack - -const unsigned char *msgpack::skip_next_message(const unsigned char *start, - const unsigned char *end) { - class f : public functors_defaults {}; - return handle_msgpack({start, end}, f()); -} - -namespace msgpack { -bool message_is_string(byte_range bytes, const char *needle) { - bool matched = false; - size_t needleN = strlen(needle); - - foronly_string(bytes, [=, &matched](size_t N, const unsigned char *str) { - if (N == needleN) { - if (memcmp(needle, str, N) == 0) { - matched = true; - } - } - }); - return matched; -} - -void dump(byte_range bytes) { - struct inner : functors_defaults { - inner(unsigned indent) : indent(indent) {} - const unsigned by = 2; - unsigned indent = 0; - - void handle_string(size_t N, const unsigned char *bytes) { - char *tmp = (char *)malloc(N + 1); - memcpy(tmp, bytes, N); - tmp[N] = '\0'; - printf("\"%s\"", tmp); - free(tmp); - } - - void handle_signed(int64_t x) { printf("%ld", x); } - void handle_unsigned(uint64_t x) { printf("%lu", x); } - - const unsigned char *handle_array(uint64_t N, byte_range bytes) { - printf("\n%*s[\n", indent, ""); - indent += by; - - for (uint64_t i = 0; i < N; i++) { - indent += by; - printf("%*s", indent, ""); - const unsigned char *next = handle_msgpack(bytes, {indent}); - printf(",\n"); - indent -= by; - bytes.start = next; - if (!next) { - break; - } - } - indent -= by; - printf("%*s]", indent, ""); - - return bytes.start; - } - - const unsigned char *handle_map(uint64_t N, byte_range bytes) { - printf("\n%*s{\n", indent, ""); - indent += by; - - for (uint64_t i = 0; i < 2 * N; i += 2) { - const unsigned 
char *start_key = bytes.start; - printf("%*s", indent, ""); - const unsigned char *end_key = - handle_msgpack({start_key, bytes.end}, {indent}); - if (!end_key) { - break; - } - - printf(" : "); - - const unsigned char *start_value = end_key; - const unsigned char *end_value = - handle_msgpack({start_value, bytes.end}, {indent}); - - if (!end_value) { - break; - } - - printf(",\n"); - bytes.start = end_value; - } - - indent -= by; - printf("%*s}", indent, ""); - - return bytes.start; - } - }; - - handle_msgpack(bytes, {0}); - printf("\n"); -} - -} // namespace msgpack diff --git a/openmp/libomptarget/plugins/amdgpu/impl/msgpack.def b/openmp/libomptarget/plugins/amdgpu/impl/msgpack.def deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/msgpack.def +++ /dev/null @@ -1,46 +0,0 @@ -//===--- amdgpu/impl/msgpack.def ---------------------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// name, header width, reader, [lower, upper] encoding -X(posfixint, 1, read_embedded_u8, 0x00, 0x7f) -X(negfixint, 1, read_embedded_s8, 0xe0, 0xff) -X(fixmap, 1, read_via_mask_0xf, 0x80, 0x8f) -X(fixarray, 1, read_via_mask_0xf, 0x90, 0x9f) -X(fixstr, 1, read_via_mask_0x1f, 0xa0, 0xbf) -X(nil, 1, read_zero, 0xc0, 0xc0) -X(never_used, 1, read_zero, 0xc1, 0xc1) -X(f, 1, read_via_mask_0x1, 0xc2, 0xc2) -X(t, 1, read_via_mask_0x1, 0xc3, 0xc3) -X(bin8, 2, read_size_field_u8, 0xc4, 0xc4) -X(bin16, 3, read_size_field_u16, 0xc5, 0xc5) -X(bin32, 5, read_size_field_u32, 0xc6, 0xc6) -X(ext8, 3, read_size_field_u8, 0xc7, 0xc7) -X(ext16, 4, read_size_field_u16, 0xc8, 0xc8) -X(ext32, 6, read_size_field_u32, 0xc9, 0xc9) -X(float32, 5, read_zero, 0xca, 0xca) -X(float64, 9, read_zero, 0xcb, 0xcb) -X(uint8, 2, read_size_field_u8, 0xcc, 0xcc) -X(uint16, 3, read_size_field_u16, 0xcd, 0xcd) -X(uint32, 5, read_size_field_u32, 0xce, 0xce) -X(uint64, 9, read_size_field_u64, 0xcf, 0xcf) -X(int8, 2, read_size_field_s8, 0xd0, 0xd0) -X(int16, 3, read_size_field_s16, 0xd1, 0xd1) -X(int32, 5, read_size_field_s32, 0xd2, 0xd2) -X(int64, 9, read_size_field_s64, 0xd3, 0xd3) -X(fixext1, 3, read_zero, 0xd4, 0xd4) -X(fixext2, 4, read_zero, 0xd5, 0xd5) -X(fixext4, 6, read_zero, 0xd6, 0xd6) -X(fixext8, 10, read_zero, 0xd7, 0xd7) -X(fixext16, 18, read_zero, 0xd8, 0xd8) -X(str8, 2, read_size_field_u8, 0xd9, 0xd9) -X(str16, 3, read_size_field_u16, 0xda, 0xda) -X(str32, 5, read_size_field_u32, 0xdb, 0xdb) -X(array16, 3, read_size_field_u16, 0xdc, 0xdc) -X(array32, 5, read_size_field_u32, 0xdd, 0xdd) -X(map16, 3, read_size_field_u16, 0xde, 0xde) -X(map32, 5, read_size_field_u32, 0xdf, 0xdf) diff --git a/openmp/libomptarget/plugins/amdgpu/impl/rt.h b/openmp/libomptarget/plugins/amdgpu/impl/rt.h deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/rt.h +++ /dev/null @@ -1,34 +0,0 @@ -//===--- amdgpu/impl/rt.h ----------------------------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
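One note on the msgpack.def table just above: it is an X-macro list, so a single table drives type_name, bytes_used_fixed, parse_type, and payload_info in msgpack.cpp without the four switches ever drifting out of sync. A self-contained sketch of the same pattern follows; the entries are folded into a local macro here (rather than a .def file) purely so the example stands alone:

.. code-block:: c++

   #include <cstdio>

   // Two illustrative entries in the same shape as the msgpack.def rows:
   // name, header width, payload reader, [lower, upper] encoding.
   #define TYPE_TABLE(X)                                                        \
     X(posfixint, 1, read_embedded_u8, 0x00, 0x7f)                              \
     X(str8, 2, read_size_field_u8, 0xd9, 0xd9)

   enum type {
   #define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) NAME,
     TYPE_TABLE(X)
   #undef X
   };

   // Each query re-expands the table with a different X definition.
   unsigned bytes_used_fixed(type ty) {
     switch (ty) {
   #define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER)                                \
     case NAME:                                                                 \
       return WIDTH;
       TYPE_TABLE(X)
   #undef X
     }
     return 0;
   }

   int main() { printf("%u\n", bytes_used_fixed(str8)); } // prints 2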
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#ifndef SRC_RUNTIME_INCLUDE_RT_H_
-#define SRC_RUNTIME_INCLUDE_RT_H_
-
-#include "hsa_api.h"
-#include "impl_runtime.h"
-#include "internal.h"
-
-#include
-
-namespace core {
-namespace Runtime {
-hsa_status_t Memfree(void *);
-hsa_status_t HostMalloc(void **ptr, size_t size,
-                        hsa_amd_memory_pool_t MemoryPool);
-
-} // namespace Runtime
-hsa_status_t RegisterModuleFromMemory(
-    std::map<std::string, atl_kernel_info_t> &KernelInfoTable,
-    std::map<std::string, atl_symbol_info_t> &SymbolInfoTable,
-    void *module_bytes, size_t module_size, hsa_agent_t agent,
-    hsa_status_t (*on_deserialized_data)(void *data, size_t size,
-                                         void *cb_state),
-    void *cb_state, std::vector<hsa_executable_t> &HSAExecutables);
-
-} // namespace core
-
-#endif // SRC_RUNTIME_INCLUDE_RT_H_
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
deleted file mode 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
+++ /dev/null
@@ -1,744 +0,0 @@
-//===--- amdgpu/impl/system.cpp ----------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/StringRef.h"
-#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/Object/ELF.h"
-#include "llvm/Object/ELFObjectFile.h"
-
-#include
-#include
-#include
-
-#include "internal.h"
-#include "rt.h"
-
-#include "msgpack.h"
-
-using namespace llvm;
-using namespace llvm::object;
-using namespace llvm::ELF;
-
-namespace hsa {
-// Wrap HSA iterate API in a shim that allows passing general callables
-template <typename C>
-hsa_status_t executable_iterate_symbols(hsa_executable_t executable, C cb) {
-  auto L = [](hsa_executable_t executable, hsa_executable_symbol_t symbol,
-              void *data) -> hsa_status_t {
-    C *unwrapped = static_cast<C *>(data);
-    return (*unwrapped)(executable, symbol);
-  };
-  return hsa_executable_iterate_symbols(executable, L,
-                                        static_cast<void *>(&cb));
-}
-} // namespace hsa
-
-typedef unsigned char *address;
-/*
- * Note descriptors.
- */
-// FreeBSD already declares Elf_Note (indirectly via )
-#if !defined(__FreeBSD__)
-typedef struct {
-  uint32_t n_namesz; /* Length of note's name. */
-  uint32_t n_descsz; /* Length of note's value. */
-  uint32_t n_type;   /* Type of note. 
*/ - // then name - // then padding, optional - // then desc, at 4 byte alignment (not 8, despite being elf64) -} Elf_Note; -#endif - -class KernelArgMD { -public: - enum class ValueKind { - HiddenGlobalOffsetX, - HiddenGlobalOffsetY, - HiddenGlobalOffsetZ, - HiddenNone, - HiddenPrintfBuffer, - HiddenDefaultQueue, - HiddenCompletionAction, - HiddenMultiGridSyncArg, - HiddenHostcallBuffer, - HiddenHeapV1, - Unknown - }; - - KernelArgMD() - : name_(std::string()), size_(0), offset_(0), - valueKind_(ValueKind::Unknown) {} - - // fields - std::string name_; - uint32_t size_; - uint32_t offset_; - ValueKind valueKind_; -}; - -static const std::map ArgValueKind = { - // v3 - // {"by_value", KernelArgMD::ValueKind::ByValue}, - // {"global_buffer", KernelArgMD::ValueKind::GlobalBuffer}, - // {"dynamic_shared_pointer", - // KernelArgMD::ValueKind::DynamicSharedPointer}, - // {"sampler", KernelArgMD::ValueKind::Sampler}, - // {"image", KernelArgMD::ValueKind::Image}, - // {"pipe", KernelArgMD::ValueKind::Pipe}, - // {"queue", KernelArgMD::ValueKind::Queue}, - {"hidden_global_offset_x", KernelArgMD::ValueKind::HiddenGlobalOffsetX}, - {"hidden_global_offset_y", KernelArgMD::ValueKind::HiddenGlobalOffsetY}, - {"hidden_global_offset_z", KernelArgMD::ValueKind::HiddenGlobalOffsetZ}, - {"hidden_none", KernelArgMD::ValueKind::HiddenNone}, - {"hidden_printf_buffer", KernelArgMD::ValueKind::HiddenPrintfBuffer}, - {"hidden_default_queue", KernelArgMD::ValueKind::HiddenDefaultQueue}, - {"hidden_completion_action", - KernelArgMD::ValueKind::HiddenCompletionAction}, - {"hidden_multigrid_sync_arg", - KernelArgMD::ValueKind::HiddenMultiGridSyncArg}, - {"hidden_hostcall_buffer", KernelArgMD::ValueKind::HiddenHostcallBuffer}, - {"hidden_heap_v1", KernelArgMD::ValueKind::HiddenHeapV1}}; - -namespace core { - -hsa_status_t callbackEvent(const hsa_amd_event_t *event, void *data) { - if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT) { - hsa_amd_gpu_memory_fault_info_t memory_fault = event->memory_fault; - // memory_fault.agent - // memory_fault.virtual_address - // memory_fault.fault_reason_mask - // fprintf("[GPU Error at %p: Reason is ", memory_fault.virtual_address); - std::stringstream stream; - stream << std::hex << (uintptr_t)memory_fault.virtual_address; - std::string addr("0x" + stream.str()); - - std::string err_string = "[GPU Memory Error] Addr: " + addr; - err_string += " Reason: "; - if (!(memory_fault.fault_reason_mask & 0x00111111)) { - err_string += "No Idea! "; - } else { - if (memory_fault.fault_reason_mask & 0x00000001) - err_string += "Page not present or supervisor privilege. "; - if (memory_fault.fault_reason_mask & 0x00000010) - err_string += "Write access to a read-only page. "; - if (memory_fault.fault_reason_mask & 0x00000100) - err_string += "Execute access to a page marked NX. "; - if (memory_fault.fault_reason_mask & 0x00001000) - err_string += "Host access only. "; - if (memory_fault.fault_reason_mask & 0x00010000) - err_string += "ECC failure (if supported by HW). "; - if (memory_fault.fault_reason_mask & 0x00100000) - err_string += "Can't determine the exact fault address. 
"; - } - fprintf(stderr, "%s\n", err_string.c_str()); - return HSA_STATUS_ERROR; - } - return HSA_STATUS_SUCCESS; -} - -hsa_status_t atl_init_gpu_context() { - hsa_status_t err = hsa_amd_register_system_event_handler(callbackEvent, NULL); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Registering the system for memory faults", get_error_string(err)); - return HSA_STATUS_ERROR; - } - - return HSA_STATUS_SUCCESS; -} - -static bool isImplicit(KernelArgMD::ValueKind value_kind) { - switch (value_kind) { - case KernelArgMD::ValueKind::HiddenGlobalOffsetX: - case KernelArgMD::ValueKind::HiddenGlobalOffsetY: - case KernelArgMD::ValueKind::HiddenGlobalOffsetZ: - case KernelArgMD::ValueKind::HiddenNone: - case KernelArgMD::ValueKind::HiddenPrintfBuffer: - case KernelArgMD::ValueKind::HiddenDefaultQueue: - case KernelArgMD::ValueKind::HiddenCompletionAction: - case KernelArgMD::ValueKind::HiddenMultiGridSyncArg: - case KernelArgMD::ValueKind::HiddenHostcallBuffer: - case KernelArgMD::ValueKind::HiddenHeapV1: - return true; - default: - return false; - } -} - -static std::pair -findMetadata(const ELFObjectFile &ELFObj) { - constexpr std::pair Failure = { - nullptr, nullptr}; - const auto &Elf = ELFObj.getELFFile(); - auto PhdrsOrErr = Elf.program_headers(); - if (!PhdrsOrErr) { - consumeError(PhdrsOrErr.takeError()); - return Failure; - } - - for (auto Phdr : *PhdrsOrErr) { - if (Phdr.p_type != PT_NOTE) - continue; - - Error Err = Error::success(); - for (auto Note : Elf.notes(Phdr, Err)) { - if (Note.getType() == 7 || Note.getType() == 8) - return Failure; - - // Code object v2 uses yaml metadata and is no longer supported. - if (Note.getType() == NT_AMD_HSA_METADATA && Note.getName() == "AMD") - return Failure; - // Code object v3 should have AMDGPU metadata. 
- if (Note.getType() == NT_AMDGPU_METADATA && Note.getName() != "AMDGPU") - return Failure; - - ArrayRef Desc = Note.getDesc(); - return {Desc.data(), Desc.data() + Desc.size()}; - } - - if (Err) { - consumeError(std::move(Err)); - return Failure; - } - } - - return Failure; -} - -static std::pair -find_metadata(void *binary, size_t binSize) { - constexpr std::pair Failure = { - nullptr, nullptr}; - - StringRef Buffer = StringRef(static_cast(binary), binSize); - auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""), - /*InitContent=*/false); - if (!ElfOrErr) { - consumeError(ElfOrErr.takeError()); - return Failure; - } - - if (const auto *ELFObj = dyn_cast(ElfOrErr->get())) - return findMetadata(*ELFObj); - return Failure; -} - -namespace { -int map_lookup_array(msgpack::byte_range message, const char *needle, - msgpack::byte_range *res, uint64_t *size) { - unsigned count = 0; - struct s : msgpack::functors_defaults { - s(unsigned &count, uint64_t *size) : count(count), size(size) {} - unsigned &count; - uint64_t *size; - const unsigned char *handle_array(uint64_t N, msgpack::byte_range bytes) { - count++; - *size = N; - return bytes.end; - } - }; - - msgpack::foreach_map(message, - [&](msgpack::byte_range key, msgpack::byte_range value) { - if (msgpack::message_is_string(key, needle)) { - // If the message is an array, record number of - // elements in *size - msgpack::handle_msgpack(value, {count, size}); - // return the whole array - *res = value; - } - }); - // Only claim success if exactly one key/array pair matched - return count != 1; -} - -int map_lookup_string(msgpack::byte_range message, const char *needle, - std::string *res) { - unsigned count = 0; - struct s : public msgpack::functors_defaults { - s(unsigned &count, std::string *res) : count(count), res(res) {} - unsigned &count; - std::string *res; - void handle_string(size_t N, const unsigned char *str) { - count++; - *res = std::string(str, str + N); - } - }; - msgpack::foreach_map(message, - [&](msgpack::byte_range key, msgpack::byte_range value) { - if (msgpack::message_is_string(key, needle)) { - msgpack::handle_msgpack(value, {count, res}); - } - }); - return count != 1; -} - -int map_lookup_uint64_t(msgpack::byte_range message, const char *needle, - uint64_t *res) { - unsigned count = 0; - msgpack::foreach_map(message, - [&](msgpack::byte_range key, msgpack::byte_range value) { - if (msgpack::message_is_string(key, needle)) { - msgpack::foronly_unsigned(value, [&](uint64_t x) { - count++; - *res = x; - }); - } - }); - return count != 1; -} - -int array_lookup_element(msgpack::byte_range message, uint64_t elt, - msgpack::byte_range *res) { - int rc = 1; - uint64_t i = 0; - msgpack::foreach_array(message, [&](msgpack::byte_range value) { - if (i == elt) { - *res = value; - rc = 0; - } - i++; - }); - return rc; -} - -int populate_kernelArgMD(msgpack::byte_range args_element, - KernelArgMD *kernelarg) { - using namespace msgpack; - int error = 0; - foreach_map(args_element, [&](byte_range key, byte_range value) -> void { - if (message_is_string(key, ".name")) { - foronly_string(value, [&](size_t N, const unsigned char *str) { - kernelarg->name_ = std::string(str, str + N); - }); - } else if (message_is_string(key, ".size")) { - foronly_unsigned(value, [&](uint64_t x) { kernelarg->size_ = x; }); - } else if (message_is_string(key, ".offset")) { - foronly_unsigned(value, [&](uint64_t x) { kernelarg->offset_ = x; }); - } else if (message_is_string(key, ".value_kind")) { - foronly_string(value, [&](size_t 
N, const unsigned char *str) { - std::string s = std::string(str, str + N); - auto itValueKind = ArgValueKind.find(s); - if (itValueKind != ArgValueKind.end()) { - kernelarg->valueKind_ = itValueKind->second; - } - }); - } - }); - return error; -} -} // namespace - -static hsa_status_t get_code_object_custom_metadata( - void *binary, size_t binSize, - std::map &KernelInfoTable) { - // parse code object with different keys from v2 - // also, the kernel name is not the same as the symbol name -- so a - // symbol->name map is needed - - std::pair metadata = - find_metadata(binary, binSize); - if (!metadata.first) { - return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; - } - - uint64_t kernelsSize = 0; - int msgpack_errors = 0; - msgpack::byte_range kernel_array; - msgpack_errors = - map_lookup_array({metadata.first, metadata.second}, "amdhsa.kernels", - &kernel_array, &kernelsSize); - if (msgpack_errors != 0) { - printf("[%s:%d] %s failed\n", __FILE__, __LINE__, - "kernels lookup in program metadata"); - return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; - } - - for (size_t i = 0; i < kernelsSize; i++) { - assert(msgpack_errors == 0); - std::string kernelName; - std::string symbolName; - - msgpack::byte_range element; - msgpack_errors += array_lookup_element(kernel_array, i, &element); - if (msgpack_errors != 0) { - printf("[%s:%d] %s failed\n", __FILE__, __LINE__, - "element lookup in kernel metadata"); - return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; - } - - msgpack_errors += map_lookup_string(element, ".name", &kernelName); - msgpack_errors += map_lookup_string(element, ".symbol", &symbolName); - if (msgpack_errors != 0) { - printf("[%s:%d] %s failed\n", __FILE__, __LINE__, - "strings lookup in kernel metadata"); - return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; - } - - // Make sure that kernelName + ".kd" == symbolName - if ((kernelName + ".kd") != symbolName) { - printf("[%s:%d] Kernel name mismatching symbol: %s != %s + .kd\n", - __FILE__, __LINE__, symbolName.c_str(), kernelName.c_str()); - return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; - } - - atl_kernel_info_t info = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - - uint64_t sgpr_count, vgpr_count, sgpr_spill_count, vgpr_spill_count; - msgpack_errors += map_lookup_uint64_t(element, ".sgpr_count", &sgpr_count); - if (msgpack_errors != 0) { - printf("[%s:%d] %s failed\n", __FILE__, __LINE__, - "sgpr count metadata lookup in kernel metadata"); - return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; - } - - info.sgpr_count = sgpr_count; - - msgpack_errors += map_lookup_uint64_t(element, ".vgpr_count", &vgpr_count); - if (msgpack_errors != 0) { - printf("[%s:%d] %s failed\n", __FILE__, __LINE__, - "vgpr count metadata lookup in kernel metadata"); - return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; - } - - info.vgpr_count = vgpr_count; - - msgpack_errors += - map_lookup_uint64_t(element, ".sgpr_spill_count", &sgpr_spill_count); - if (msgpack_errors != 0) { - printf("[%s:%d] %s failed\n", __FILE__, __LINE__, - "sgpr spill count metadata lookup in kernel metadata"); - return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; - } - - info.sgpr_spill_count = sgpr_spill_count; - - msgpack_errors += - map_lookup_uint64_t(element, ".vgpr_spill_count", &vgpr_spill_count); - if (msgpack_errors != 0) { - printf("[%s:%d] %s failed\n", __FILE__, __LINE__, - "vgpr spill count metadata lookup in kernel metadata"); - return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; - } - - info.vgpr_spill_count = vgpr_spill_count; - - size_t kernel_explicit_args_size = 0; - uint64_t kernel_segment_size; - msgpack_errors += 
map_lookup_uint64_t(element, ".kernarg_segment_size", - &kernel_segment_size); - if (msgpack_errors != 0) { - printf("[%s:%d] %s failed\n", __FILE__, __LINE__, - "kernarg segment size metadata lookup in kernel metadata"); - return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; - } - - bool hasHiddenArgs = false; - if (kernel_segment_size > 0) { - uint64_t argsSize; - size_t offset = 0; - - msgpack::byte_range args_array; - msgpack_errors += - map_lookup_array(element, ".args", &args_array, &argsSize); - if (msgpack_errors != 0) { - printf("[%s:%d] %s failed\n", __FILE__, __LINE__, - "kernel args metadata lookup in kernel metadata"); - return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; - } - - for (size_t i = 0; i < argsSize; ++i) { - KernelArgMD lcArg; - - msgpack::byte_range args_element; - msgpack_errors += array_lookup_element(args_array, i, &args_element); - if (msgpack_errors != 0) { - printf("[%s:%d] %s failed\n", __FILE__, __LINE__, - "iterate args map in kernel args metadata"); - return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; - } - - msgpack_errors += populate_kernelArgMD(args_element, &lcArg); - if (msgpack_errors != 0) { - printf("[%s:%d] %s failed\n", __FILE__, __LINE__, - "iterate args map in kernel args metadata"); - return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; - } - // v3 has offset field and not align field - size_t new_offset = lcArg.offset_; - size_t padding = new_offset - offset; - offset = new_offset; - DP("Arg[%lu] \"%s\" (%u, %u)\n", i, lcArg.name_.c_str(), lcArg.size_, - lcArg.offset_); - offset += lcArg.size_; - - // check if the arg is a hidden/implicit arg - // this logic assumes that all hidden args are 8-byte aligned - if (!isImplicit(lcArg.valueKind_)) { - info.explicit_argument_count++; - kernel_explicit_args_size += lcArg.size_; - } else { - info.implicit_argument_count++; - hasHiddenArgs = true; - } - kernel_explicit_args_size += padding; - } - } - - // TODO: Probably don't want this arithmetic - info.kernel_segment_size = - (hasHiddenArgs ? kernel_explicit_args_size : kernel_segment_size); - DP("[%s: kernarg seg size] (%lu --> %u)\n", kernelName.c_str(), - kernel_segment_size, info.kernel_segment_size); - - // kernel received, now add it to the kernel info table - KernelInfoTable[kernelName] = info; - } - - return HSA_STATUS_SUCCESS; -} - -static hsa_status_t -populate_InfoTables(hsa_executable_symbol_t symbol, - std::map &KernelInfoTable, - std::map &SymbolInfoTable) { - hsa_symbol_kind_t type; - - uint32_t name_length; - hsa_status_t err; - err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, - &type); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Symbol info extraction", get_error_string(err)); - return err; - } - DP("Exec Symbol type: %d\n", type); - if (type == HSA_SYMBOL_KIND_KERNEL) { - err = hsa_executable_symbol_get_info( - symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &name_length); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Symbol info extraction", get_error_string(err)); - return err; - } - char *name = reinterpret_cast(malloc(name_length + 1)); - err = hsa_executable_symbol_get_info(symbol, - HSA_EXECUTABLE_SYMBOL_INFO_NAME, name); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Symbol info extraction", get_error_string(err)); - return err; - } - // remove the suffix .kd from symbol name. 
- name[name_length - 3] = 0; - - atl_kernel_info_t info; - std::string kernelName(name); - // by now, the kernel info table should already have an entry - // because the non-ROCr custom code object parsing is called before - // iterating over the code object symbols using ROCr - if (KernelInfoTable.find(kernelName) == KernelInfoTable.end()) { - DP("amdgpu internal consistency error\n"); - return HSA_STATUS_ERROR; - } - // found, so assign and update - info = KernelInfoTable[kernelName]; - - /* Extract dispatch information from the symbol */ - err = hsa_executable_symbol_get_info( - symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, - &(info.kernel_object)); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Extracting the symbol from the executable", - get_error_string(err)); - return err; - } - err = hsa_executable_symbol_get_info( - symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, - &(info.group_segment_size)); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Extracting the group segment size from the executable", - get_error_string(err)); - return err; - } - err = hsa_executable_symbol_get_info( - symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, - &(info.private_segment_size)); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Extracting the private segment from the executable", - get_error_string(err)); - return err; - } - - DP("Kernel %s --> %lx symbol %u group segsize %u pvt segsize %u bytes " - "kernarg\n", - kernelName.c_str(), info.kernel_object, info.group_segment_size, - info.private_segment_size, info.kernel_segment_size); - - // assign it back to the kernel info table - KernelInfoTable[kernelName] = info; - free(name); - } else if (type == HSA_SYMBOL_KIND_VARIABLE) { - err = hsa_executable_symbol_get_info( - symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &name_length); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Symbol info extraction", get_error_string(err)); - return err; - } - char *name = reinterpret_cast(malloc(name_length + 1)); - err = hsa_executable_symbol_get_info(symbol, - HSA_EXECUTABLE_SYMBOL_INFO_NAME, name); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Symbol info extraction", get_error_string(err)); - return err; - } - name[name_length] = 0; - - atl_symbol_info_t info; - - err = hsa_executable_symbol_get_info( - symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &(info.addr)); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Symbol info address extraction", get_error_string(err)); - return err; - } - - err = hsa_executable_symbol_get_info( - symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &(info.size)); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Symbol info size extraction", get_error_string(err)); - return err; - } - - DP("Symbol %s = %p (%u bytes)\n", name, (void *)info.addr, info.size); - SymbolInfoTable[std::string(name)] = info; - free(name); - } else { - DP("Symbol is an indirect function\n"); - } - return HSA_STATUS_SUCCESS; -} - -hsa_status_t RegisterModuleFromMemory( - std::map &KernelInfoTable, - std::map &SymbolInfoTable, - void *module_bytes, size_t module_size, hsa_agent_t agent, - hsa_status_t (*on_deserialized_data)(void *data, size_t size, - void *cb_state), - void *cb_state, std::vector 
&HSAExecutables) { - hsa_status_t err; - hsa_executable_t executable = {0}; - hsa_profile_t agent_profile; - - err = hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_profile); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Query the agent profile", get_error_string(err)); - return HSA_STATUS_ERROR; - } - // FIXME: Assume that every profile is FULL until we understand how to build - // GCN with base profile - agent_profile = HSA_PROFILE_FULL; - /* Create the empty executable. */ - err = hsa_executable_create(agent_profile, HSA_EXECUTABLE_STATE_UNFROZEN, "", - &executable); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Create the executable", get_error_string(err)); - return HSA_STATUS_ERROR; - } - - bool module_load_success = false; - do // Existing control flow used continue, preserve that for this patch - { - { - // Some metadata info is not available through ROCr API, so use custom - // code object metadata parsing to collect such metadata info - - err = get_code_object_custom_metadata(module_bytes, module_size, - KernelInfoTable); - if (err != HSA_STATUS_SUCCESS) { - DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Getting custom code object metadata", get_error_string(err)); - continue; - } - - // Deserialize code object. - hsa_code_object_t code_object = {0}; - err = hsa_code_object_deserialize(module_bytes, module_size, NULL, - &code_object); - if (err != HSA_STATUS_SUCCESS) { - DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Code Object Deserialization", get_error_string(err)); - continue; - } - assert(0 != code_object.handle); - - // Mutating the device image here avoids another allocation & memcpy - void *code_object_alloc_data = - reinterpret_cast(code_object.handle); - hsa_status_t impl_err = - on_deserialized_data(code_object_alloc_data, module_size, cb_state); - if (impl_err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Error in deserialized_data callback", - get_error_string(impl_err)); - return impl_err; - } - - /* Load the code object. */ - err = - hsa_executable_load_code_object(executable, agent, code_object, NULL); - if (err != HSA_STATUS_SUCCESS) { - DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Loading the code object", get_error_string(err)); - continue; - } - - // cannot iterate over symbols until executable is frozen - } - module_load_success = true; - } while (0); - DP("Modules loaded successful? %d\n", module_load_success); - if (module_load_success) { - /* Freeze the executable; it can now be queried for symbols. 
*/ - err = hsa_executable_freeze(executable, ""); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Freeze the executable", get_error_string(err)); - return HSA_STATUS_ERROR; - } - - err = hsa::executable_iterate_symbols( - executable, - [&](hsa_executable_t, hsa_executable_symbol_t symbol) -> hsa_status_t { - return populate_InfoTables(symbol, KernelInfoTable, SymbolInfoTable); - }); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Iterating over symbols for execuatable", get_error_string(err)); - return HSA_STATUS_ERROR; - } - - // save the executable and destroy during finalize - HSAExecutables.push_back(executable); - return HSA_STATUS_SUCCESS; - } else { - return HSA_STATUS_ERROR; - } -} - -} // namespace core diff --git a/openmp/libomptarget/plugins/amdgpu/src/print_tracing.h b/openmp/libomptarget/plugins/amdgpu/src/print_tracing.h deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/src/print_tracing.h +++ /dev/null @@ -1,20 +0,0 @@ -//===--- amdgpu/src/print_tracing.h ------------------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#ifndef LIBOMPTARGET_PLUGINS_AMGGPU_SRC_PRINT_TRACING_H_INCLUDED -#define LIBOMPTARGET_PLUGINS_AMGGPU_SRC_PRINT_TRACING_H_INCLUDED - -enum PrintTraceControlBits { - LAUNCH = 1, // print a message to stderr for each kernel launch - RTL_TIMING = 2, // Print timing info around each RTL step - STARTUP_DETAILS = 4, // Details around loading up kernel - RTL_TO_STDOUT = 8 // Redirect RTL tracing to stdout -}; - -extern int print_kernel_trace; // set by environment variable - -#endif diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ /dev/null @@ -1,2615 +0,0 @@ -//===--- amdgpu/src/rtl.cpp --------------------------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
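Before moving on to rtl.cpp: the load path above (custom metadata parse, deserialize, image callback, load, freeze, symbol walk) is driven entirely through the RegisterModuleFromMemory entry point declared in rt.h. A hedged sketch of a caller, with a no-op deserialization callback; loadImage and onDeserialized are names invented here:

.. code-block:: c++

   #include <map>
   #include <string>
   #include <vector>
   #include "rt.h"

   // Stub callback: a real caller could patch the device image in place here.
   static hsa_status_t onDeserialized(void *, size_t, void *) {
     return HSA_STATUS_SUCCESS;
   }

   // Hand a code object image to the loader; on success the kernel and
   // symbol tables are populated and the frozen executable is appended.
   static hsa_status_t loadImage(void *Image, size_t Size, hsa_agent_t Agent,
                                 std::map<std::string, atl_kernel_info_t> &KIT,
                                 std::map<std::string, atl_symbol_info_t> &SIT,
                                 std::vector<hsa_executable_t> &Execs) {
     return core::RegisterModuleFromMemory(KIT, SIT, Image, Size, Agent,
                                           onDeserialized, /*cb_state=*/nullptr,
                                           Execs);
   }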
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// RTL for AMD hsa machine -// -//===----------------------------------------------------------------------===// - -#include "llvm/ADT/StringRef.h" -#include "llvm/Frontend/OpenMP/OMPConstants.h" -#include "llvm/Frontend/OpenMP/OMPGridValues.h" -#include "llvm/Object/ELF.h" -#include "llvm/Object/ELFObjectFile.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "ELFSymbols.h" -#include "impl_runtime.h" -#include "interop_hsa.h" - -#include "UtilitiesRTL.h" -#include "internal.h" -#include "rt.h" - -#include "DeviceEnvironment.h" -#include "get_elf_mach_gfx_name.h" -#include "omptargetplugin.h" -#include "print_tracing.h" - -using namespace llvm; -using namespace llvm::object; -using namespace llvm::ELF; -using namespace llvm::omp::target::plugin::utils; - -// hostrpc interface, FIXME: consider moving to its own include these are -// statically linked into amdgpu/plugin if present from hostrpc_services.a, -// linked as --whole-archive to override the weak symbols that are used to -// implement a fallback for toolchains that do not yet have a hostrpc library. -extern "C" { -uint64_t hostrpc_assign_buffer(hsa_agent_t Agent, hsa_queue_t *ThisQ, - uint32_t DeviceId); -hsa_status_t hostrpc_init(); -hsa_status_t hostrpc_terminate(); - -__attribute__((weak)) hsa_status_t hostrpc_init() { return HSA_STATUS_SUCCESS; } -__attribute__((weak)) hsa_status_t hostrpc_terminate() { - return HSA_STATUS_SUCCESS; -} -__attribute__((weak)) uint64_t hostrpc_assign_buffer(hsa_agent_t, hsa_queue_t *, - uint32_t DeviceId) { - DP("Warning: Attempting to assign hostrpc to device %u, but hostrpc library " - "missing\n", - DeviceId); - return 0; -} -} - -// Heuristic parameters used for kernel launch -// Number of teams per CU to allow scheduling flexibility -static const unsigned DefaultTeamsPerCU = 4; - -int print_kernel_trace; - -#ifdef OMPTARGET_DEBUG -#define check(msg, status) \ - if (status != HSA_STATUS_SUCCESS) { \ - DP(#msg " failed\n"); \ - } else { \ - DP(#msg " succeeded\n"); \ - } -#else -#define check(msg, status) \ - {} -#endif - -#include "elf_common.h" - -namespace hsa { -template hsa_status_t iterate_agents(C Cb) { - auto L = [](hsa_agent_t Agent, void *Data) -> hsa_status_t { - C *Unwrapped = static_cast(Data); - return (*Unwrapped)(Agent); - }; - return hsa_iterate_agents(L, static_cast(&Cb)); -} - -template -hsa_status_t amd_agent_iterate_memory_pools(hsa_agent_t Agent, C Cb) { - auto L = [](hsa_amd_memory_pool_t MemoryPool, void *Data) -> hsa_status_t { - C *Unwrapped = static_cast(Data); - return (*Unwrapped)(MemoryPool); - }; - - return hsa_amd_agent_iterate_memory_pools(Agent, L, static_cast(&Cb)); -} - -} // namespace hsa - -/// Keep entries table per device -struct FuncOrGblEntryTy { - __tgt_target_table Table; - std::vector<__tgt_offload_entry> Entries; -}; - -struct KernelArgPool { -private: - static pthread_mutex_t Mutex; - -public: - uint32_t KernargSegmentSize; - void *KernargRegion = nullptr; - std::queue FreeKernargSegments; - - uint32_t kernargSizeIncludingImplicit() { - return KernargSegmentSize + sizeof(AMDGPUImplicitArgsTy); - } - - ~KernelArgPool() { - if (KernargRegion) { - auto R = hsa_amd_memory_pool_free(KernargRegion); - if (R != HSA_STATUS_SUCCESS) { - DP("hsa_amd_memory_pool_free failed: %s\n", get_error_string(R)); - } - } - } - - // Can't 
really copy or move a mutex - KernelArgPool() = default; - KernelArgPool(const KernelArgPool &) = delete; - KernelArgPool(KernelArgPool &&) = delete; - - KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool) - : KernargSegmentSize(KernargSegmentSize) { - - // impl uses one pool per kernel for all gpus, with a fixed upper size - // preserving that exact scheme here, including the queue - - hsa_status_t Err = hsa_amd_memory_pool_allocate( - MemoryPool, kernargSizeIncludingImplicit() * MAX_NUM_KERNELS, 0, - &KernargRegion); - - if (Err != HSA_STATUS_SUCCESS) { - DP("hsa_amd_memory_pool_allocate failed: %s\n", get_error_string(Err)); - KernargRegion = nullptr; // paranoid - return; - } - - Err = core::allow_access_to_all_gpu_agents(KernargRegion); - if (Err != HSA_STATUS_SUCCESS) { - DP("hsa allow_access_to_all_gpu_agents failed: %s\n", - get_error_string(Err)); - auto R = hsa_amd_memory_pool_free(KernargRegion); - if (R != HSA_STATUS_SUCCESS) { - // if free failed, can't do anything more to resolve it - DP("hsa memory poll free failed: %s\n", get_error_string(Err)); - } - KernargRegion = nullptr; - return; - } - - for (int I = 0; I < MAX_NUM_KERNELS; I++) { - FreeKernargSegments.push(I); - } - } - - void *allocate(uint64_t ArgNum) { - assert((ArgNum * sizeof(void *)) == KernargSegmentSize); - Lock L(&Mutex); - void *Res = nullptr; - if (!FreeKernargSegments.empty()) { - - int FreeIdx = FreeKernargSegments.front(); - Res = static_cast(static_cast(KernargRegion) + - (FreeIdx * kernargSizeIncludingImplicit())); - assert(FreeIdx == pointerToIndex(Res)); - FreeKernargSegments.pop(); - } - return Res; - } - - void deallocate(void *Ptr) { - Lock L(&Mutex); - int Idx = pointerToIndex(Ptr); - FreeKernargSegments.push(Idx); - } - -private: - int pointerToIndex(void *Ptr) { - ptrdiff_t Bytes = - static_cast(Ptr) - static_cast(KernargRegion); - assert(Bytes >= 0); - assert(Bytes % kernargSizeIncludingImplicit() == 0); - return Bytes / kernargSizeIncludingImplicit(); - } - struct Lock { - Lock(pthread_mutex_t *M) : M(M) { pthread_mutex_lock(M); } - ~Lock() { pthread_mutex_unlock(M); } - pthread_mutex_t *M; - }; -}; -pthread_mutex_t KernelArgPool::Mutex = PTHREAD_MUTEX_INITIALIZER; - -std::unordered_map> - KernelArgPoolMap; - -/// Use a single entity to encode a kernel and a set of flags -struct KernelTy { - llvm::omp::OMPTgtExecModeFlags ExecutionMode; - int16_t ConstWGSize; - int32_t DeviceId; - void *CallStackAddr = nullptr; - const char *Name; - - KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize, - int32_t DeviceId, void *CallStackAddr, const char *Name, - uint32_t KernargSegmentSize, - hsa_amd_memory_pool_t &KernArgMemoryPool) - : ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize), - DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) { - DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode); - - std::string N(Name); - if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) { - KernelArgPoolMap.insert( - std::make_pair(N, std::unique_ptr(new KernelArgPool( - KernargSegmentSize, KernArgMemoryPool)))); - } - } -}; - -/// List that contains all the kernels. -/// FIXME: we may need this to be per device and per library. 
-std::list KernelsList; - -template static hsa_status_t findAgents(Callback CB) { - - hsa_status_t Err = - hsa::iterate_agents([&](hsa_agent_t Agent) -> hsa_status_t { - hsa_device_type_t DeviceType; - // get_info fails iff HSA runtime not yet initialized - hsa_status_t Err = - hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType); - - if (Err != HSA_STATUS_SUCCESS) { - if (print_kernel_trace > 0) - DP("rtl.cpp: err %s\n", get_error_string(Err)); - - return Err; - } - - CB(DeviceType, Agent); - return HSA_STATUS_SUCCESS; - }); - - // iterate_agents fails iff HSA runtime not yet initialized - if (print_kernel_trace > 0 && Err != HSA_STATUS_SUCCESS) { - DP("rtl.cpp: err %s\n", get_error_string(Err)); - } - - return Err; -} - -static void callbackQueue(hsa_status_t Status, hsa_queue_t *Source, - void *Data) { - if (Status != HSA_STATUS_SUCCESS) { - const char *StatusString; - if (hsa_status_string(Status, &StatusString) != HSA_STATUS_SUCCESS) { - StatusString = "unavailable"; - } - DP("[%s:%d] GPU error in queue %p %d (%s)\n", __FILE__, __LINE__, Source, - Status, StatusString); - abort(); - } -} - -namespace core { -namespace { - -bool checkResult(hsa_status_t Err, const char *ErrMsg) { - if (Err == HSA_STATUS_SUCCESS) - return true; - - REPORT("%s", ErrMsg); - REPORT("%s", get_error_string(Err)); - return false; -} - -void packetStoreRelease(uint32_t *Packet, uint16_t Header, uint16_t Rest) { - __atomic_store_n(Packet, Header | (Rest << 16), __ATOMIC_RELEASE); -} - -uint16_t createHeader() { - uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; - Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; - Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; - return Header; -} - -hsa_status_t isValidMemoryPool(hsa_amd_memory_pool_t MemoryPool) { - bool AllocAllowed = false; - hsa_status_t Err = hsa_amd_memory_pool_get_info( - MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, - &AllocAllowed); - if (Err != HSA_STATUS_SUCCESS) { - DP("Alloc allowed in memory pool check failed: %s\n", - get_error_string(Err)); - return Err; - } - - size_t Size = 0; - Err = hsa_amd_memory_pool_get_info(MemoryPool, HSA_AMD_MEMORY_POOL_INFO_SIZE, - &Size); - if (Err != HSA_STATUS_SUCCESS) { - DP("Get memory pool size failed: %s\n", get_error_string(Err)); - return Err; - } - - return (AllocAllowed && Size > 0) ? 
HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; -} - -hsa_status_t addMemoryPool(hsa_amd_memory_pool_t MemoryPool, void *Data) { - std::vector *Result = - static_cast *>(Data); - - hsa_status_t Err; - if ((Err = isValidMemoryPool(MemoryPool)) != HSA_STATUS_SUCCESS) { - return Err; - } - - Result->push_back(MemoryPool); - return HSA_STATUS_SUCCESS; -} - -} // namespace -} // namespace core - -struct EnvironmentVariables { - int NumTeams; - int TeamLimit; - int TeamThreadLimit; - int MaxTeamsDefault; - int DynamicMemSize; -}; - -template -static constexpr const llvm::omp::GV &getGridValue() { - return llvm::omp::getAMDGPUGridValues(); -} - -struct HSALifetime { - // Wrapper around HSA used to ensure it is constructed before other types - // and destructed after, which means said other types can use raii for - // cleanup without risking running outside of the lifetime of HSA - const hsa_status_t S; - - bool HSAInitSuccess() { return S == HSA_STATUS_SUCCESS; } - HSALifetime() : S(hsa_init()) {} - - ~HSALifetime() { - if (S == HSA_STATUS_SUCCESS) { - hsa_status_t Err = hsa_shut_down(); - if (Err != HSA_STATUS_SUCCESS) { - // Can't call into HSA to get a string from the integer - DP("Shutting down HSA failed: %d\n", Err); - } - } - } -}; - -// Handle scheduling of multiple hsa_queue's per device to -// multiple threads (one scheduler per device) -class HSAQueueScheduler { -public: - HSAQueueScheduler() : Current(0) {} - - HSAQueueScheduler(const HSAQueueScheduler &) = delete; - - HSAQueueScheduler(HSAQueueScheduler &&Q) { - Current = Q.Current.load(); - for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) { - HSAQueues[I] = Q.HSAQueues[I]; - Q.HSAQueues[I] = nullptr; - } - } - - // \return false if any HSA queue creation fails - bool createQueues(hsa_agent_t HSAAgent, uint32_t QueueSize) { - for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) { - hsa_queue_t *Q = nullptr; - hsa_status_t Rc = - hsa_queue_create(HSAAgent, QueueSize, HSA_QUEUE_TYPE_MULTI, - callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &Q); - if (Rc != HSA_STATUS_SUCCESS) { - DP("Failed to create HSA queue %d\n", I); - return false; - } - HSAQueues[I] = Q; - } - return true; - } - - ~HSAQueueScheduler() { - for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) { - if (HSAQueues[I]) { - hsa_status_t Err = hsa_queue_destroy(HSAQueues[I]); - if (Err != HSA_STATUS_SUCCESS) - DP("Error destroying HSA queue"); - } - } - } - - // \return next queue to use for device - hsa_queue_t *next() { - return HSAQueues[(Current.fetch_add(1, std::memory_order_relaxed)) % - NUM_QUEUES_PER_DEVICE]; - } - -private: - // Number of queues per device - enum : uint8_t { NUM_QUEUES_PER_DEVICE = 4 }; - hsa_queue_t *HSAQueues[NUM_QUEUES_PER_DEVICE] = {}; - std::atomic Current; -}; - -/// Class containing all the device information -class RTLDeviceInfoTy : HSALifetime { - std::vector> FuncGblEntries; - - struct QueueDeleter { - void operator()(hsa_queue_t *Q) { - if (Q) { - hsa_status_t Err = hsa_queue_destroy(Q); - if (Err != HSA_STATUS_SUCCESS) { - DP("Error destroying hsa queue: %s\n", get_error_string(Err)); - } - } - } - }; - -public: - bool ConstructionSucceeded = false; - - // load binary populates symbol tables and mutates various global state - // run uses those symbol tables - std::shared_timed_mutex LoadRunLock; - - int NumberOfDevices = 0; - - // GPU devices - std::vector HSAAgents; - std::vector HSAQueueSchedulers; // one per gpu - - // CPUs - std::vector CPUAgents; - - // Device properties - std::vector ComputeUnits; - std::vector GroupsPerDevice; - 
std::vector ThreadsPerGroup; - std::vector WarpSize; - std::vector GPUName; - std::vector TargetID; - - // OpenMP properties - std::vector NumTeams; - std::vector NumThreads; - - // OpenMP Environment properties - EnvironmentVariables Env; - - // OpenMP Requires Flags - int64_t RequiresFlags; - - // Resource pools - SignalPoolT FreeSignalPool; - - bool HostcallRequired = false; - - std::vector HSAExecutables; - - std::vector> KernelInfoTable; - std::vector> SymbolInfoTable; - - hsa_amd_memory_pool_t KernArgPool; - - // fine grained memory pool for host allocations - hsa_amd_memory_pool_t HostFineGrainedMemoryPool; - - // fine and coarse-grained memory pools per offloading device - std::vector DeviceFineGrainedMemoryPools; - std::vector DeviceCoarseGrainedMemoryPools; - - struct ImplFreePtrDeletor { - void operator()(void *P) { - core::Runtime::Memfree(P); // ignore failure to free - } - }; - - // device_State shared across loaded binaries, error if inconsistent size - std::vector, uint64_t>> - DeviceStateStore; - - static const unsigned HardTeamLimit = - (1 << 16) - 1; // 64K needed to fit in uint16 - static const int DefaultNumTeams = 128; - - // These need to be per-device since different devices can have different - // wave sizes, but are currently the same number for each so that refactor - // can be postponed. - static_assert(getGridValue<32>().GV_Max_Teams == - getGridValue<64>().GV_Max_Teams, - ""); - static const int MaxTeams = getGridValue<64>().GV_Max_Teams; - - static_assert(getGridValue<32>().GV_Max_WG_Size == - getGridValue<64>().GV_Max_WG_Size, - ""); - static const int MaxWgSize = getGridValue<64>().GV_Max_WG_Size; - - static_assert(getGridValue<32>().GV_Default_WG_Size == - getGridValue<64>().GV_Default_WG_Size, - ""); - static const int DefaultWgSize = getGridValue<64>().GV_Default_WG_Size; - - using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, void *, size_t Size, - hsa_agent_t, hsa_amd_memory_pool_t); - hsa_status_t freesignalpoolMemcpy(void *Dest, void *Src, size_t Size, - MemcpyFunc Func, int32_t DeviceId) { - hsa_agent_t Agent = HSAAgents[DeviceId]; - hsa_signal_t S = FreeSignalPool.pop(); - if (S.handle == 0) { - return HSA_STATUS_ERROR; - } - hsa_status_t R = Func(S, Dest, Src, Size, Agent, HostFineGrainedMemoryPool); - FreeSignalPool.push(S); - return R; - } - - hsa_status_t freesignalpoolMemcpyD2H(void *Dest, void *Src, size_t Size, - int32_t DeviceId) { - return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_d2h, DeviceId); - } - - hsa_status_t freesignalpoolMemcpyH2D(void *Dest, void *Src, size_t Size, - int32_t DeviceId) { - return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_h2d, DeviceId); - } - - static void printDeviceInfo(int32_t DeviceId, hsa_agent_t Agent) { - char TmpChar[1000]; - uint16_t Major, Minor; - uint32_t TmpUInt; - uint32_t TmpUInt2; - uint32_t CacheSize[4]; - bool TmpBool; - uint16_t WorkgroupMaxDim[3]; - hsa_dim3_t GridMaxDim; - - // Getting basic information about HSA and Device - core::checkResult( - hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major), - "Error from hsa_system_get_info when obtaining " - "HSA_SYSTEM_INFO_VERSION_MAJOR\n"); - core::checkResult( - hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor), - "Error from hsa_system_get_info when obtaining " - "HSA_SYSTEM_INFO_VERSION_MINOR\n"); - printf(" HSA Runtime Version: \t\t%u.%u \n", Major, Minor); - printf(" HSA OpenMP Device Number: \t\t%d \n", DeviceId); - core::checkResult( - hsa_agent_get_info( - Agent, 
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AMD_AGENT_INFO_PRODUCT_NAME\n"); - printf(" Product Name: \t\t\t%s \n", TmpChar); - core::checkResult(hsa_agent_get_info(Agent, HSA_AGENT_INFO_NAME, TmpChar), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_NAME\n"); - printf(" Device Name: \t\t\t%s \n", TmpChar); - core::checkResult( - hsa_agent_get_info(Agent, HSA_AGENT_INFO_VENDOR_NAME, TmpChar), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_NAME\n"); - printf(" Vendor Name: \t\t\t%s \n", TmpChar); - hsa_device_type_t DevType; - core::checkResult( - hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DevType), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_DEVICE\n"); - printf(" Device Type: \t\t\t%s \n", - DevType == HSA_DEVICE_TYPE_CPU - ? "CPU" - : (DevType == HSA_DEVICE_TYPE_GPU - ? "GPU" - : (DevType == HSA_DEVICE_TYPE_DSP ? "DSP" : "UNKNOWN"))); - core::checkResult( - hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUES_MAX, &TmpUInt), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_QUEUES_MAX\n"); - printf(" Max Queues: \t\t\t%u \n", TmpUInt); - core::checkResult( - hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &TmpUInt), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_QUEUE_MIN_SIZE\n"); - printf(" Queue Min Size: \t\t\t%u \n", TmpUInt); - core::checkResult( - hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &TmpUInt), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_QUEUE_MAX_SIZE\n"); - printf(" Queue Max Size: \t\t\t%u \n", TmpUInt); - - // Getting cache information - printf(" Cache:\n"); - - // FIXME: This is deprecated according to HSA documentation. But using - // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during - // runtime. 
- core::checkResult( - hsa_agent_get_info(Agent, HSA_AGENT_INFO_CACHE_SIZE, CacheSize), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_CACHE_SIZE\n"); - - for (int I = 0; I < 4; I++) { - if (CacheSize[I]) { - printf(" L%u: \t\t\t\t%u bytes\n", I, CacheSize[I]); - } - } - - core::checkResult( - hsa_agent_get_info(Agent, - (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CACHELINE_SIZE, - &TmpUInt), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AMD_AGENT_INFO_CACHELINE_SIZE\n"); - printf(" Cacheline Size: \t\t\t%u \n", TmpUInt); - core::checkResult( - hsa_agent_get_info( - Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, - &TmpUInt), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY\n"); - printf(" Max Clock Freq(MHz): \t\t%u \n", TmpUInt); - core::checkResult( - hsa_agent_get_info( - Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, - &TmpUInt), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT\n"); - printf(" Compute Units: \t\t\t%u \n", TmpUInt); - core::checkResult(hsa_agent_get_info( - Agent, - (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, - &TmpUInt), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n"); - printf(" SIMD per CU: \t\t\t%u \n", TmpUInt); - core::checkResult( - hsa_agent_get_info(Agent, HSA_AGENT_INFO_FAST_F16_OPERATION, &TmpBool), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n"); - printf(" Fast F16 Operation: \t\t%s \n", (TmpBool ? "TRUE" : "FALSE")); - core::checkResult( - hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &TmpUInt2), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_WAVEFRONT_SIZE\n"); - printf(" Wavefront Size: \t\t\t%u \n", TmpUInt2); - core::checkResult( - hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &TmpUInt), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_WORKGROUP_MAX_SIZE\n"); - printf(" Workgroup Max Size: \t\t%u \n", TmpUInt); - core::checkResult(hsa_agent_get_info(Agent, - HSA_AGENT_INFO_WORKGROUP_MAX_DIM, - WorkgroupMaxDim), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_WORKGROUP_MAX_DIM\n"); - printf(" Workgroup Max Size per Dimension:\n"); - printf(" x: \t\t\t\t%u\n", WorkgroupMaxDim[0]); - printf(" y: \t\t\t\t%u\n", WorkgroupMaxDim[1]); - printf(" z: \t\t\t\t%u\n", WorkgroupMaxDim[2]); - core::checkResult(hsa_agent_get_info( - Agent, - (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, - &TmpUInt), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU\n"); - printf(" Max Waves Per CU: \t\t\t%u \n", TmpUInt); - printf(" Max Work-item Per CU: \t\t%u \n", TmpUInt * TmpUInt2); - core::checkResult( - hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_SIZE, &TmpUInt), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_GRID_MAX_SIZE\n"); - printf(" Grid Max Size: \t\t\t%u \n", TmpUInt); - core::checkResult( - hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_GRID_MAX_DIM\n"); - printf(" Grid Max Size per Dimension: \t\t\n"); - printf(" x: \t\t\t\t%u\n", GridMaxDim.x); - printf(" y: \t\t\t\t%u\n", GridMaxDim.y); - printf(" z: \t\t\t\t%u\n", GridMaxDim.z); - core::checkResult( - 
hsa_agent_get_info(Agent, HSA_AGENT_INFO_FBARRIER_MAX_SIZE, &TmpUInt), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_FBARRIER_MAX_SIZE\n"); - printf(" Max fbarriers/Workgrp: \t\t%u\n", TmpUInt); - - printf(" Memory Pools:\n"); - auto CbMem = [](hsa_amd_memory_pool_t Region, void *Data) -> hsa_status_t { - std::string TmpStr; - size_t Size; - bool Alloc, Access; - hsa_amd_segment_t Segment; - hsa_amd_memory_pool_global_flag_t GlobalFlags; - core::checkResult( - hsa_amd_memory_pool_get_info( - Region, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags), - "Error returned from hsa_amd_memory_pool_get_info when obtaining " - "HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS\n"); - core::checkResult(hsa_amd_memory_pool_get_info( - Region, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &Segment), - "Error returned from hsa_amd_memory_pool_get_info when " - "obtaining HSA_AMD_MEMORY_POOL_INFO_SEGMENT\n"); - - switch (Segment) { - case HSA_AMD_SEGMENT_GLOBAL: - TmpStr = "GLOBAL; FLAGS: "; - if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & GlobalFlags) - TmpStr += "KERNARG, "; - if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & GlobalFlags) - TmpStr += "FINE GRAINED, "; - if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED & GlobalFlags) - TmpStr += "COARSE GRAINED, "; - break; - case HSA_AMD_SEGMENT_READONLY: - TmpStr = "READONLY"; - break; - case HSA_AMD_SEGMENT_PRIVATE: - TmpStr = "PRIVATE"; - break; - case HSA_AMD_SEGMENT_GROUP: - TmpStr = "GROUP"; - break; - } - printf(" Pool %s: \n", TmpStr.c_str()); - - core::checkResult(hsa_amd_memory_pool_get_info( - Region, HSA_AMD_MEMORY_POOL_INFO_SIZE, &Size), - "Error returned from hsa_amd_memory_pool_get_info when " - "obtaining HSA_AMD_MEMORY_POOL_INFO_SIZE\n"); - printf(" Size: \t\t\t\t %zu bytes\n", Size); - core::checkResult( - hsa_amd_memory_pool_get_info( - Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &Alloc), - "Error returned from hsa_amd_memory_pool_get_info when obtaining " - "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED\n"); - printf(" Allocatable: \t\t\t %s\n", (Alloc ? "TRUE" : "FALSE")); - core::checkResult( - hsa_amd_memory_pool_get_info( - Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &Size), - "Error returned from hsa_amd_memory_pool_get_info when obtaining " - "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE\n"); - printf(" Runtime Alloc Granule: \t\t %zu bytes\n", Size); - core::checkResult( - hsa_amd_memory_pool_get_info( - Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, &Size), - "Error returned from hsa_amd_memory_pool_get_info when obtaining " - "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT\n"); - printf(" Runtime Alloc alignment: \t %zu bytes\n", Size); - core::checkResult( - hsa_amd_memory_pool_get_info( - Region, HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, &Access), - "Error returned from hsa_amd_memory_pool_get_info when obtaining " - "HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL\n"); - printf(" Accessable by all: \t\t %s\n", - (Access ? "TRUE" : "FALSE")); - - return HSA_STATUS_SUCCESS; - }; - // Iterate over all the memory regions for this agent. 
Get the memory region - // type and size - hsa_amd_agent_iterate_memory_pools(Agent, CbMem, nullptr); - - printf(" ISAs:\n"); - auto CBIsas = [](hsa_isa_t Isa, void *Data) -> hsa_status_t { - char TmpChar[1000]; - core::checkResult(hsa_isa_get_info_alt(Isa, HSA_ISA_INFO_NAME, TmpChar), - "Error returned from hsa_isa_get_info_alt when " - "obtaining HSA_ISA_INFO_NAME\n"); - printf(" Name: \t\t\t\t %s\n", TmpChar); - - return HSA_STATUS_SUCCESS; - }; - // Iterate over all the memory regions for this agent. Get the memory region - // type and size - hsa_agent_iterate_isas(Agent, CBIsas, nullptr); - } - - // Record entry point associated with device - void addOffloadEntry(int32_t DeviceId, __tgt_offload_entry Entry) { - assert(DeviceId < (int32_t)FuncGblEntries.size() && - "Unexpected device id!"); - FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); - - E.Entries.push_back(Entry); - } - - // Return true if the entry is associated with device - bool findOffloadEntry(int32_t DeviceId, void *Addr) { - assert(DeviceId < (int32_t)FuncGblEntries.size() && - "Unexpected device id!"); - FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); - - for (auto &It : E.Entries) { - if (It.addr == Addr) - return true; - } - - return false; - } - - // Return the pointer to the target entries table - __tgt_target_table *getOffloadEntriesTable(int32_t DeviceId) { - assert(DeviceId < (int32_t)FuncGblEntries.size() && - "Unexpected device id!"); - FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); - - int32_t Size = E.Entries.size(); - - // Table is empty - if (!Size) - return 0; - - __tgt_offload_entry *Begin = &E.Entries[0]; - __tgt_offload_entry *End = &E.Entries[Size - 1]; - - // Update table info according to the entries and return the pointer - E.Table.EntriesBegin = Begin; - E.Table.EntriesEnd = ++End; - - return &E.Table; - } - - // Clear entries table for a device - void clearOffloadEntriesTable(int DeviceId) { - assert(DeviceId < (int32_t)FuncGblEntries.size() && - "Unexpected device id!"); - FuncGblEntries[DeviceId].emplace_back(); - FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); - // KernelArgPoolMap.clear(); - E.Entries.clear(); - E.Table.EntriesBegin = E.Table.EntriesEnd = 0; - } - - hsa_status_t addDeviceMemoryPool(hsa_amd_memory_pool_t MemoryPool, - unsigned int DeviceId) { - assert(DeviceId < DeviceFineGrainedMemoryPools.size() && "Error here."); - uint32_t GlobalFlags = 0; - hsa_status_t Err = hsa_amd_memory_pool_get_info( - MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags); - - if (Err != HSA_STATUS_SUCCESS) { - return Err; - } - - if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) { - DeviceFineGrainedMemoryPools[DeviceId] = MemoryPool; - } else if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) { - DeviceCoarseGrainedMemoryPools[DeviceId] = MemoryPool; - } - - return HSA_STATUS_SUCCESS; - } - - hsa_status_t setupDevicePools(const std::vector &Agents) { - for (unsigned int DeviceId = 0; DeviceId < Agents.size(); DeviceId++) { - hsa_status_t Err = hsa::amd_agent_iterate_memory_pools( - Agents[DeviceId], [&](hsa_amd_memory_pool_t MemoryPool) { - hsa_status_t ValidStatus = core::isValidMemoryPool(MemoryPool); - if (ValidStatus != HSA_STATUS_SUCCESS) { - DP("Alloc allowed in memory pool check failed: %s\n", - get_error_string(ValidStatus)); - return HSA_STATUS_SUCCESS; - } - return addDeviceMemoryPool(MemoryPool, DeviceId); - }); - - if (Err != HSA_STATUS_SUCCESS) { - DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Iterate all memory 
pools", get_error_string(Err)); - return Err; - } - } - return HSA_STATUS_SUCCESS; - } - - hsa_status_t setupHostMemoryPools(std::vector &Agents) { - std::vector HostPools; - - // collect all the "valid" pools for all the given agents. - for (const auto &Agent : Agents) { - hsa_status_t Err = hsa_amd_agent_iterate_memory_pools( - Agent, core::addMemoryPool, static_cast(&HostPools)); - if (Err != HSA_STATUS_SUCCESS) { - DP("addMemoryPool returned %s, continuing\n", get_error_string(Err)); - } - } - - // We need two fine-grained pools. - // 1. One with kernarg flag set for storing kernel arguments - // 2. Second for host allocations - bool FineGrainedMemoryPoolSet = false; - bool KernArgPoolSet = false; - for (const auto &MemoryPool : HostPools) { - hsa_status_t Err = HSA_STATUS_SUCCESS; - uint32_t GlobalFlags = 0; - Err = hsa_amd_memory_pool_get_info( - MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags); - if (Err != HSA_STATUS_SUCCESS) { - DP("Get memory pool info failed: %s\n", get_error_string(Err)); - return Err; - } - - if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) { - if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) { - KernArgPool = MemoryPool; - KernArgPoolSet = true; - } else { - HostFineGrainedMemoryPool = MemoryPool; - FineGrainedMemoryPoolSet = true; - } - } - } - - if (FineGrainedMemoryPoolSet && KernArgPoolSet) - return HSA_STATUS_SUCCESS; - - return HSA_STATUS_ERROR; - } - - hsa_amd_memory_pool_t getDeviceMemoryPool(unsigned int DeviceId) { - assert(DeviceId >= 0 && DeviceId < DeviceCoarseGrainedMemoryPools.size() && - "Invalid device Id"); - return DeviceCoarseGrainedMemoryPools[DeviceId]; - } - - hsa_amd_memory_pool_t getHostMemoryPool() { - return HostFineGrainedMemoryPool; - } - - static int readEnv(const char *Env, int Default = -1) { - const char *EnvStr = getenv(Env); - int Res = Default; - if (EnvStr) { - Res = std::stoi(EnvStr); - DP("Parsed %s=%d\n", Env, Res); - } - return Res; - } - - RTLDeviceInfoTy() { - DP("Start initializing " GETNAME(TARGET_NAME) "\n"); - - // LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr - // anytime. You do not need a debug library build. 
- // 0 => no tracing - // 1 => tracing dispatch only - // >1 => verbosity increase - - if (!HSAInitSuccess()) { - DP("Error when initializing HSA in " GETNAME(TARGET_NAME) "\n"); - return; - } - - if (char *EnvStr = getenv("LIBOMPTARGET_KERNEL_TRACE")) - print_kernel_trace = atoi(EnvStr); - else - print_kernel_trace = 0; - - hsa_status_t Err = core::atl_init_gpu_context(); - if (Err != HSA_STATUS_SUCCESS) { - DP("Error when initializing " GETNAME(TARGET_NAME) "\n"); - return; - } - - // Init hostcall soon after initializing hsa - hostrpc_init(); - - Err = findAgents([&](hsa_device_type_t DeviceType, hsa_agent_t Agent) { - if (DeviceType == HSA_DEVICE_TYPE_CPU) { - CPUAgents.push_back(Agent); - } else { - HSAAgents.push_back(Agent); - } - }); - if (Err != HSA_STATUS_SUCCESS) - return; - - NumberOfDevices = (int)HSAAgents.size(); - - if (NumberOfDevices == 0) { - DP("There are no devices supporting HSA.\n"); - return; - } - DP("There are %d devices supporting HSA.\n", NumberOfDevices); - - // Init the device info - HSAQueueSchedulers.reserve(NumberOfDevices); - FuncGblEntries.resize(NumberOfDevices); - ThreadsPerGroup.resize(NumberOfDevices); - ComputeUnits.resize(NumberOfDevices); - GPUName.resize(NumberOfDevices); - GroupsPerDevice.resize(NumberOfDevices); - WarpSize.resize(NumberOfDevices); - NumTeams.resize(NumberOfDevices); - NumThreads.resize(NumberOfDevices); - DeviceStateStore.resize(NumberOfDevices); - KernelInfoTable.resize(NumberOfDevices); - SymbolInfoTable.resize(NumberOfDevices); - DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices); - DeviceFineGrainedMemoryPools.resize(NumberOfDevices); - - Err = setupDevicePools(HSAAgents); - if (Err != HSA_STATUS_SUCCESS) { - DP("Setup for Device Memory Pools failed\n"); - return; - } - - Err = setupHostMemoryPools(CPUAgents); - if (Err != HSA_STATUS_SUCCESS) { - DP("Setup for Host Memory Pools failed\n"); - return; - } - - for (int I = 0; I < NumberOfDevices; I++) { - uint32_t QueueSize = 0; - { - hsa_status_t Err = hsa_agent_get_info( - HSAAgents[I], HSA_AGENT_INFO_QUEUE_MAX_SIZE, &QueueSize); - if (Err != HSA_STATUS_SUCCESS) { - DP("HSA query QUEUE_MAX_SIZE failed for agent %d\n", I); - return; - } - enum { MaxQueueSize = 4096 }; - if (QueueSize > MaxQueueSize) { - QueueSize = MaxQueueSize; - } - } - - { - HSAQueueScheduler QSched; - if (!QSched.createQueues(HSAAgents[I], QueueSize)) - return; - HSAQueueSchedulers.emplace_back(std::move(QSched)); - } - - DeviceStateStore[I] = {nullptr, 0}; - } - - for (int I = 0; I < NumberOfDevices; I++) { - ThreadsPerGroup[I] = RTLDeviceInfoTy::DefaultWgSize; - GroupsPerDevice[I] = RTLDeviceInfoTy::DefaultNumTeams; - ComputeUnits[I] = 1; - DP("Device %d: Initial groupsPerDevice %d & threadsPerGroup %d\n", I, - GroupsPerDevice[I], ThreadsPerGroup[I]); - } - - // Get environment variables regarding teams - Env.TeamLimit = readEnv("OMP_TEAM_LIMIT"); - Env.NumTeams = readEnv("OMP_NUM_TEAMS"); - Env.MaxTeamsDefault = readEnv("OMP_MAX_TEAMS_DEFAULT"); - Env.TeamThreadLimit = readEnv("OMP_TEAMS_THREAD_LIMIT"); - Env.DynamicMemSize = readEnv("LIBOMPTARGET_SHARED_MEMORY_SIZE", 0); - - // Default state. 
- RequiresFlags = OMP_REQ_UNDEFINED; - - ConstructionSucceeded = true; - } - - ~RTLDeviceInfoTy() { - DP("Finalizing the " GETNAME(TARGET_NAME) " DeviceInfo.\n"); - if (!HSAInitSuccess()) { - // Then none of these can have been set up and they can't be torn down - return; - } - // Run destructors on types that use HSA before - // impl_finalize removes access to it - DeviceStateStore.clear(); - KernelArgPoolMap.clear(); - // Terminate hostrpc before finalizing hsa - hostrpc_terminate(); - - hsa_status_t Err; - for (uint32_t I = 0; I < HSAExecutables.size(); I++) { - Err = hsa_executable_destroy(HSAExecutables[I]); - if (Err != HSA_STATUS_SUCCESS) { - DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Destroying executable", get_error_string(Err)); - } - } - } -}; - -pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER; - -// Putting accesses to DeviceInfo global behind a function call prior -// to changing to use init_plugin/deinit_plugin calls -static RTLDeviceInfoTy DeviceInfoState; -static RTLDeviceInfoTy &DeviceInfo() { return DeviceInfoState; } - -namespace { - -int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size, - __tgt_async_info *AsyncInfo) { - assert(AsyncInfo && "AsyncInfo is nullptr"); - assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); - // Return success if we are not copying back to host from target. - if (!HstPtr) - return OFFLOAD_SUCCESS; - hsa_status_t Err; - DP("Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size, - (long long unsigned)(Elf64_Addr)TgtPtr, - (long long unsigned)(Elf64_Addr)HstPtr); - - Err = DeviceInfo().freesignalpoolMemcpyD2H(HstPtr, TgtPtr, (size_t)Size, - DeviceId); - - if (Err != HSA_STATUS_SUCCESS) { - DP("Error when copying data from device to host. Pointers: " - "host = 0x%016lx, device = 0x%016lx, size = %lld\n", - (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size); - return OFFLOAD_FAIL; - } - DP("DONE Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size, - (long long unsigned)(Elf64_Addr)TgtPtr, - (long long unsigned)(Elf64_Addr)HstPtr); - return OFFLOAD_SUCCESS; -} - -int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size, - __tgt_async_info *AsyncInfo) { - assert(AsyncInfo && "AsyncInfo is nullptr"); - hsa_status_t Err; - assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); - // Return success if we are not doing host to target. - if (!HstPtr) - return OFFLOAD_SUCCESS; - - DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size, - (long long unsigned)(Elf64_Addr)HstPtr, - (long long unsigned)(Elf64_Addr)TgtPtr); - Err = DeviceInfo().freesignalpoolMemcpyH2D(TgtPtr, HstPtr, (size_t)Size, - DeviceId); - if (Err != HSA_STATUS_SUCCESS) { - DP("Error when copying data from host to device. Pointers: " - "host = 0x%016lx, device = 0x%016lx, size = %lld\n", - (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size); - return OFFLOAD_FAIL; - } - return OFFLOAD_SUCCESS; -} - -// Async. -// The implementation was written with cuda streams in mind. The semantics of -// that are to execute kernels on a queue in order of insertion. A synchronise -// call then makes writes visible between host and device. This means a series -// of N data_submit_async calls are expected to execute serially. HSA offers -// various options to run the data copies concurrently. This may require changes -// to libomptarget. - -// __tgt_async_info* contains a void * Queue. 
Queue = 0 is used to indicate that -// there are no outstanding kernels that need to be synchronized. Any async call -// may be passed a Queue==0, at which point the cuda implementation will set it -// to non-null (see getStream). The cuda streams are per-device. Upstream may -// change this interface to explicitly initialize the AsyncInfo_pointer, but -// until then hsa lazily initializes it as well. - -void initAsyncInfo(__tgt_async_info *AsyncInfo) { - // set non-null while using async calls, return to null to indicate completion - assert(AsyncInfo); - if (!AsyncInfo->Queue) { - AsyncInfo->Queue = reinterpret_cast(UINT64_MAX); - } -} -void finiAsyncInfo(__tgt_async_info *AsyncInfo) { - assert(AsyncInfo); - assert(AsyncInfo->Queue); - AsyncInfo->Queue = 0; -} - -// Determine launch values for kernel. -struct LaunchVals { - int WorkgroupSize; - int GridSize; -}; -LaunchVals getLaunchVals(int WarpSize, EnvironmentVariables Env, - int ConstWGSize, - llvm::omp::OMPTgtExecModeFlags ExecutionMode, - int NumTeams, int ThreadLimit, uint64_t LoopTripcount, - int DeviceNumTeams) { - - int ThreadsPerGroup = RTLDeviceInfoTy::DefaultWgSize; - int NumGroups = 0; - - int MaxTeams = Env.MaxTeamsDefault > 0 ? Env.MaxTeamsDefault : DeviceNumTeams; - if (MaxTeams > static_cast(RTLDeviceInfoTy::HardTeamLimit)) - MaxTeams = RTLDeviceInfoTy::HardTeamLimit; - - if (print_kernel_trace & STARTUP_DETAILS) { - DP("RTLDeviceInfoTy::Max_Teams: %d\n", RTLDeviceInfoTy::MaxTeams); - DP("Max_Teams: %d\n", MaxTeams); - DP("RTLDeviceInfoTy::Warp_Size: %d\n", WarpSize); - DP("RTLDeviceInfoTy::Max_WG_Size: %d\n", RTLDeviceInfoTy::MaxWgSize); - DP("RTLDeviceInfoTy::Default_WG_Size: %d\n", - RTLDeviceInfoTy::DefaultWgSize); - DP("thread_limit: %d\n", ThreadLimit); - DP("threadsPerGroup: %d\n", ThreadsPerGroup); - DP("ConstWGSize: %d\n", ConstWGSize); - } - // check for thread_limit() clause - if (ThreadLimit > 0) { - ThreadsPerGroup = ThreadLimit; - DP("Setting threads per block to requested %d\n", ThreadLimit); - // Add master warp for GENERIC - if (ExecutionMode == - llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) { - ThreadsPerGroup += WarpSize; - DP("Adding master wavefront: +%d threads\n", WarpSize); - } - if (ThreadsPerGroup > RTLDeviceInfoTy::MaxWgSize) { // limit to max - ThreadsPerGroup = RTLDeviceInfoTy::MaxWgSize; - DP("Setting threads per block to maximum %d\n", ThreadsPerGroup); - } - } - // check flat_max_work_group_size attr here - if (ThreadsPerGroup > ConstWGSize) { - ThreadsPerGroup = ConstWGSize; - DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n", - ThreadsPerGroup); - } - if (print_kernel_trace & STARTUP_DETAILS) - DP("threadsPerGroup: %d\n", ThreadsPerGroup); - DP("Preparing %d threads\n", ThreadsPerGroup); - - // Set default num_groups (teams) - if (Env.TeamLimit > 0) - NumGroups = (MaxTeams < Env.TeamLimit) ? MaxTeams : Env.TeamLimit; - else - NumGroups = MaxTeams; - DP("Set default num of groups %d\n", NumGroups); - - if (print_kernel_trace & STARTUP_DETAILS) { - DP("num_groups: %d\n", NumGroups); - DP("num_teams: %d\n", NumTeams); - } - - // Reduce num_groups if threadsPerGroup exceeds RTLDeviceInfoTy::Max_WG_Size - // This reduction is typical for default case (no thread_limit clause). - // or when user goes crazy with num_teams clause. - // FIXME: We cant distinguish between a constant or variable thread limit. - // So we only handle constant thread_limits. 
- if (ThreadsPerGroup > - RTLDeviceInfoTy::DefaultWgSize) // 256 < threadsPerGroup <= 1024 - // Should we round threadsPerGroup up to nearest WarpSize - // here? - NumGroups = (MaxTeams * RTLDeviceInfoTy::MaxWgSize) / ThreadsPerGroup; - - // check for num_teams() clause - if (NumTeams > 0) { - NumGroups = (NumTeams < NumGroups) ? NumTeams : NumGroups; - } - if (print_kernel_trace & STARTUP_DETAILS) { - DP("num_groups: %d\n", NumGroups); - DP("Env.NumTeams %d\n", Env.NumTeams); - DP("Env.TeamLimit %d\n", Env.TeamLimit); - } - - if (Env.NumTeams > 0) { - NumGroups = (Env.NumTeams < NumGroups) ? Env.NumTeams : NumGroups; - DP("Modifying teams based on Env.NumTeams %d\n", Env.NumTeams); - } else if (Env.TeamLimit > 0) { - NumGroups = (Env.TeamLimit < NumGroups) ? Env.TeamLimit : NumGroups; - DP("Modifying teams based on Env.TeamLimit%d\n", Env.TeamLimit); - } else { - if (NumTeams <= 0) { - if (LoopTripcount > 0) { - if (ExecutionMode == - llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD) { - // round up to the nearest integer - NumGroups = ((LoopTripcount - 1) / ThreadsPerGroup) + 1; - } else if (ExecutionMode == - llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) { - NumGroups = LoopTripcount; - } else /* OMP_TGT_EXEC_MODE_GENERIC_SPMD */ { - // This is a generic kernel that was transformed to use SPMD-mode - // execution but uses Generic-mode semantics for scheduling. - NumGroups = LoopTripcount; - } - DP("Using %d teams due to loop trip count %" PRIu64 " and number of " - "threads per block %d\n", - NumGroups, LoopTripcount, ThreadsPerGroup); - } - } else { - NumGroups = NumTeams; - } - if (NumGroups > MaxTeams) { - NumGroups = MaxTeams; - if (print_kernel_trace & STARTUP_DETAILS) - DP("Limiting num_groups %d to Max_Teams %d \n", NumGroups, MaxTeams); - } - if (NumGroups > NumTeams && NumTeams > 0) { - NumGroups = NumTeams; - if (print_kernel_trace & STARTUP_DETAILS) - DP("Limiting num_groups %d to clause num_teams %d \n", NumGroups, - NumTeams); - } - } - - // num_teams clause always honored, no matter what, unless DEFAULT is active. - if (NumTeams > 0) { - NumGroups = NumTeams; - // Cap num_groups to EnvMaxTeamsDefault if set. - if (Env.MaxTeamsDefault > 0 && NumGroups > Env.MaxTeamsDefault) - NumGroups = Env.MaxTeamsDefault; - } - if (print_kernel_trace & STARTUP_DETAILS) { - DP("threadsPerGroup: %d\n", ThreadsPerGroup); - DP("num_groups: %d\n", NumGroups); - DP("loop_tripcount: %ld\n", LoopTripcount); - } - DP("Final %d num_groups and %d threadsPerGroup\n", NumGroups, - ThreadsPerGroup); - - LaunchVals Res; - Res.WorkgroupSize = ThreadsPerGroup; - Res.GridSize = ThreadsPerGroup * NumGroups; - return Res; -} - -static uint64_t acquireAvailablePacketId(hsa_queue_t *Queue) { - uint64_t PacketId = hsa_queue_add_write_index_relaxed(Queue, 1); - bool Full = true; - while (Full) { - Full = - PacketId >= (Queue->size + hsa_queue_load_read_index_scacquire(Queue)); - } - return PacketId; -} - -int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, - ptrdiff_t *TgtOffsets, int32_t ArgNum, int32_t NumTeams, - int32_t ThreadLimit, uint64_t LoopTripcount) { - // Set the context we are using - // update thread limit content in gpu memory if un-initialized or specified - // from host - - DP("Run target team region thread_limit %d\n", ThreadLimit); - - // All args are references. 
- std::vector Args(ArgNum); - std::vector Ptrs(ArgNum); - - DP("Arg_num: %d\n", ArgNum); - for (int32_t I = 0; I < ArgNum; ++I) { - Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]); - Args[I] = &Ptrs[I]; - DP("Offseted base: arg[%d]:" DPxMOD "\n", I, DPxPTR(Ptrs[I])); - } - - KernelTy *KernelInfo = (KernelTy *)TgtEntryPtr; - - std::string KernelName = std::string(KernelInfo->Name); - auto &KernelInfoTable = DeviceInfo().KernelInfoTable; - if (KernelInfoTable[DeviceId].find(KernelName) == - KernelInfoTable[DeviceId].end()) { - DP("Kernel %s not found\n", KernelName.c_str()); - return OFFLOAD_FAIL; - } - - const atl_kernel_info_t KernelInfoEntry = - KernelInfoTable[DeviceId][KernelName]; - const uint32_t GroupSegmentSize = - KernelInfoEntry.group_segment_size + DeviceInfo().Env.DynamicMemSize; - const uint32_t SgprCount = KernelInfoEntry.sgpr_count; - const uint32_t VgprCount = KernelInfoEntry.vgpr_count; - const uint32_t SgprSpillCount = KernelInfoEntry.sgpr_spill_count; - const uint32_t VgprSpillCount = KernelInfoEntry.vgpr_spill_count; - - assert(ArgNum == (int)KernelInfoEntry.explicit_argument_count); - - /* - * Set limit based on ThreadsPerGroup and GroupsPerDevice - */ - LaunchVals LV = - getLaunchVals(DeviceInfo().WarpSize[DeviceId], DeviceInfo().Env, - KernelInfo->ConstWGSize, KernelInfo->ExecutionMode, - NumTeams, // From run_region arg - ThreadLimit, // From run_region arg - LoopTripcount, // From run_region arg - DeviceInfo().NumTeams[KernelInfo->DeviceId]); - const int GridSize = LV.GridSize; - const int WorkgroupSize = LV.WorkgroupSize; - - if (print_kernel_trace >= LAUNCH) { - int NumGroups = GridSize / WorkgroupSize; - // enum modes are SPMD, GENERIC, NONE 0,1,2 - // if doing rtl timing, print to stderr, unless stdout requested. - bool TraceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING); - fprintf(TraceToStdout ? stdout : stderr, - "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) " - "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u " - "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu n:%s\n", - DeviceId, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize, - ArgNum, NumGroups, WorkgroupSize, NumTeams, ThreadLimit, - GroupSegmentSize, SgprCount, VgprCount, SgprSpillCount, - VgprSpillCount, LoopTripcount, KernelInfo->Name); - } - - // Run on the device. 
- { - hsa_queue_t *Queue = DeviceInfo().HSAQueueSchedulers[DeviceId].next(); - if (!Queue) { - return OFFLOAD_FAIL; - } - uint64_t PacketId = acquireAvailablePacketId(Queue); - - const uint32_t Mask = Queue->size - 1; // size is a power of 2 - hsa_kernel_dispatch_packet_t *Packet = - (hsa_kernel_dispatch_packet_t *)Queue->base_address + (PacketId & Mask); - - // packet->header is written last - Packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; - Packet->workgroup_size_x = WorkgroupSize; - Packet->workgroup_size_y = 1; - Packet->workgroup_size_z = 1; - Packet->reserved0 = 0; - Packet->grid_size_x = GridSize; - Packet->grid_size_y = 1; - Packet->grid_size_z = 1; - Packet->private_segment_size = KernelInfoEntry.private_segment_size; - Packet->group_segment_size = GroupSegmentSize; - Packet->kernel_object = KernelInfoEntry.kernel_object; - Packet->kernarg_address = 0; // use the block allocator - Packet->reserved2 = 0; // impl writes id_ here - Packet->completion_signal = {0}; // may want a pool of signals - - KernelArgPool *ArgPool = nullptr; - void *KernArg = nullptr; - { - auto It = KernelArgPoolMap.find(std::string(KernelInfo->Name)); - if (It != KernelArgPoolMap.end()) { - ArgPool = (It->second).get(); - } - } - if (!ArgPool) { - DP("Warning: No ArgPool for %s on device %d\n", KernelInfo->Name, - DeviceId); - } - { - if (ArgPool) { - assert(ArgPool->KernargSegmentSize == (ArgNum * sizeof(void *))); - KernArg = ArgPool->allocate(ArgNum); - } - if (!KernArg) { - DP("Allocate kernarg failed\n"); - return OFFLOAD_FAIL; - } - - // Copy explicit arguments - for (int I = 0; I < ArgNum; I++) { - memcpy((char *)KernArg + sizeof(void *) * I, Args[I], sizeof(void *)); - } - - // Initialize implicit arguments. TODO: Which of these can be dropped - AMDGPUImplicitArgsTy *ImplArgs = reinterpret_cast( - static_cast(KernArg) + ArgPool->KernargSegmentSize); - memset(ImplArgs, 0, - sizeof(AMDGPUImplicitArgsTy)); // may not be necessary - ImplArgs->OffsetX = 0; - ImplArgs->OffsetY = 0; - ImplArgs->OffsetZ = 0; - - // assign a hostcall buffer for the selected Q - if (__atomic_load_n(&DeviceInfo().HostcallRequired, __ATOMIC_ACQUIRE)) { - // hostrpc_assign_buffer is not thread safe, and this function is - // under a multiple reader lock, not a writer lock. - static pthread_mutex_t HostcallInitLock = PTHREAD_MUTEX_INITIALIZER; - pthread_mutex_lock(&HostcallInitLock); - uint64_t Buffer = hostrpc_assign_buffer( - DeviceInfo().HSAAgents[DeviceId], Queue, DeviceId); - pthread_mutex_unlock(&HostcallInitLock); - if (!Buffer) { - DP("hostrpc_assign_buffer failed, gpu would dereference null and " - "error\n"); - return OFFLOAD_FAIL; - } - - DP("Implicit argument count: %d\n", - KernelInfoEntry.implicit_argument_count); - if (KernelInfoEntry.implicit_argument_count >= 4) { - // Initialise pointer for implicit_argument_count != 0 ABI - // Guess that the right implicit argument is at offset 24 after - // the explicit arguments. In the future, should be able to read - // the offset from msgpack. Clang is not annotating it at present. 
- uint64_t Offset = - sizeof(void *) * (KernelInfoEntry.explicit_argument_count + 3); - if ((Offset + 8) > ArgPool->kernargSizeIncludingImplicit()) { - DP("Bad offset of hostcall: %lu, exceeds kernarg size w/ implicit " - "args: %d\n", - Offset + 8, ArgPool->kernargSizeIncludingImplicit()); - } else { - memcpy(static_cast(KernArg) + Offset, &Buffer, 8); - } - } - - // initialise pointer for implicit_argument_count == 0 ABI - ImplArgs->HostcallPtr = Buffer; - } - - Packet->kernarg_address = KernArg; - } - - hsa_signal_t S = DeviceInfo().FreeSignalPool.pop(); - if (S.handle == 0) { - DP("Failed to get signal instance\n"); - return OFFLOAD_FAIL; - } - Packet->completion_signal = S; - hsa_signal_store_relaxed(Packet->completion_signal, 1); - - // Publish the packet indicating it is ready to be processed - core::packetStoreRelease(reinterpret_cast(Packet), - core::createHeader(), Packet->setup); - - // Since the packet is already published, its contents must not be - // accessed any more - hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId); - - while (hsa_signal_wait_scacquire(S, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, - HSA_WAIT_STATE_BLOCKED) != 0) - ; - - assert(ArgPool); - ArgPool->deallocate(KernArg); - DeviceInfo().FreeSignalPool.push(S); - } - - DP("Kernel completed\n"); - return OFFLOAD_SUCCESS; -} - -bool elfMachineIdIsAmdgcn(__tgt_device_image *Image) { - const uint16_t AmdgcnMachineID = EM_AMDGPU; - const int32_t R = elf_check_machine(Image, AmdgcnMachineID); - if (!R) { - DP("Supported machine ID not found\n"); - } - return R; -} - -uint32_t elfEFlags(__tgt_device_image *Image) { - const char *ImgBegin = (char *)Image->ImageStart; - size_t ImgSize = (char *)Image->ImageEnd - ImgBegin; - - StringRef Buffer = StringRef(ImgBegin, ImgSize); - auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""), - /*InitContent=*/false); - if (!ElfOrErr) { - consumeError(ElfOrErr.takeError()); - return 0; - } - - if (const auto *ELFObj = dyn_cast(ElfOrErr->get())) - return ELFObj->getPlatformFlags(); - return 0; -} - -template bool enforceUpperBound(T *Value, T Upper) { - bool Changed = *Value > Upper; - if (Changed) { - *Value = Upper; - } - return Changed; -} - -struct SymbolInfo { - const void *Addr = nullptr; - uint32_t Size = UINT32_MAX; - uint32_t ShType = SHT_NULL; -}; - -int getSymbolInfoWithoutLoading(const ELFObjectFile &ELFObj, - StringRef SymName, SymbolInfo *Res) { - auto SymOrErr = getELFSymbol(ELFObj, SymName); - if (!SymOrErr) { - std::string ErrorString = toString(SymOrErr.takeError()); - DP("Failed ELF lookup: %s\n", ErrorString.c_str()); - return 1; - } - if (!*SymOrErr) - return 1; - - auto SymSecOrErr = ELFObj.getELFFile().getSection((*SymOrErr)->st_shndx); - if (!SymSecOrErr) { - std::string ErrorString = toString(SymOrErr.takeError()); - DP("Failed ELF lookup: %s\n", ErrorString.c_str()); - return 1; - } - - Res->Addr = (*SymOrErr)->st_value + ELFObj.getELFFile().base(); - Res->Size = static_cast((*SymOrErr)->st_size); - Res->ShType = static_cast((*SymSecOrErr)->sh_type); - return 0; -} - -int getSymbolInfoWithoutLoading(char *Base, size_t ImgSize, const char *SymName, - SymbolInfo *Res) { - StringRef Buffer = StringRef(Base, ImgSize); - auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""), - /*InitContent=*/false); - if (!ElfOrErr) { - REPORT("Failed to load ELF: %s\n", toString(ElfOrErr.takeError()).c_str()); - return 1; - } - - if (const auto *ELFObj = dyn_cast(ElfOrErr->get())) - return getSymbolInfoWithoutLoading(*ELFObj, 
SymName, Res); - return 1; -} - -hsa_status_t interopGetSymbolInfo(char *Base, size_t ImgSize, - const char *SymName, const void **VarAddr, - uint32_t *VarSize) { - SymbolInfo SI; - int Rc = getSymbolInfoWithoutLoading(Base, ImgSize, SymName, &SI); - if (Rc == 0) { - *VarAddr = SI.Addr; - *VarSize = SI.Size; - return HSA_STATUS_SUCCESS; - } - return HSA_STATUS_ERROR; -} - -template -hsa_status_t moduleRegisterFromMemoryToPlace( - std::map &KernelInfoTable, - std::map &SymbolInfoTable, - void *ModuleBytes, size_t ModuleSize, int DeviceId, C Cb, - std::vector &HSAExecutables) { - auto L = [](void *Data, size_t Size, void *CbState) -> hsa_status_t { - C *Unwrapped = static_cast(CbState); - return (*Unwrapped)(Data, Size); - }; - return core::RegisterModuleFromMemory( - KernelInfoTable, SymbolInfoTable, ModuleBytes, ModuleSize, - DeviceInfo().HSAAgents[DeviceId], L, static_cast(&Cb), - HSAExecutables); -} - -uint64_t getDeviceStateBytes(char *ImageStart, size_t ImgSize) { - uint64_t DeviceStateBytes = 0; - { - // If this is the deviceRTL, get the state variable size - SymbolInfo SizeSi; - int Rc = getSymbolInfoWithoutLoading( - ImageStart, ImgSize, "omptarget_nvptx_device_State_size", &SizeSi); - - if (Rc == 0) { - if (SizeSi.Size != sizeof(uint64_t)) { - DP("Found device_State_size variable with wrong size\n"); - return 0; - } - - // Read number of bytes directly from the elf - memcpy(&DeviceStateBytes, SizeSi.Addr, sizeof(uint64_t)); - } - } - return DeviceStateBytes; -} - -struct DeviceEnvironment { - // initialise an DeviceEnvironmentTy in the deviceRTL - // patches around differences in the deviceRTL between trunk, aomp, - // rocmcc. Over time these differences will tend to zero and this class - // simplified. - // Symbol may be in .data or .bss, and may be missing fields, todo: - // review aomp/trunk/rocm and simplify the following - - // The symbol may also have been deadstripped because the device side - // accessors were unused. - - // If the symbol is in .data (aomp, rocm) it can be written directly. - // If it is in .bss, we must wait for it to be allocated space on the - // gpu (trunk) and initialize after loading. 
- const char *sym() { return "__omp_rtl_device_environment"; } - - DeviceEnvironmentTy HostDeviceEnv; - SymbolInfo SI; - bool Valid = false; - - __tgt_device_image *Image; - const size_t ImgSize; - - DeviceEnvironment(int DeviceId, int NumberDevices, int DynamicMemSize, - __tgt_device_image *Image, const size_t ImgSize) - : Image(Image), ImgSize(ImgSize) { - - HostDeviceEnv.NumDevices = NumberDevices; - HostDeviceEnv.DeviceNum = DeviceId; - HostDeviceEnv.DebugKind = 0; - HostDeviceEnv.DynamicMemSize = DynamicMemSize; - if (char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) - HostDeviceEnv.DebugKind = std::stoi(EnvStr); - - int Rc = getSymbolInfoWithoutLoading((char *)Image->ImageStart, ImgSize, - sym(), &SI); - if (Rc != 0) { - DP("Finding global device environment '%s' - symbol missing.\n", sym()); - return; - } - - if (SI.Size > sizeof(HostDeviceEnv)) { - DP("Symbol '%s' has size %u, expected at most %zu.\n", sym(), SI.Size, - sizeof(HostDeviceEnv)); - return; - } - - Valid = true; - } - - bool inImage() { return SI.ShType != SHT_NOBITS; } - - hsa_status_t beforeLoading(void *Data, size_t Size) { - if (Valid) { - if (inImage()) { - DP("Setting global device environment before load (%u bytes)\n", - SI.Size); - uint64_t Offset = reinterpret_cast(SI.Addr) - - reinterpret_cast(Image->ImageStart); - void *Pos = reinterpret_cast(Data) + Offset; - memcpy(Pos, &HostDeviceEnv, SI.Size); - } - } - return HSA_STATUS_SUCCESS; - } - - hsa_status_t afterLoading() { - if (Valid) { - if (!inImage()) { - DP("Setting global device environment after load (%u bytes)\n", - SI.Size); - int DeviceId = HostDeviceEnv.DeviceNum; - auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId]; - void *StatePtr; - uint32_t StatePtrSize; - hsa_status_t Err = interop_hsa_get_symbol_info( - SymbolInfo, DeviceId, sym(), &StatePtr, &StatePtrSize); - if (Err != HSA_STATUS_SUCCESS) { - DP("failed to find %s in loaded image\n", sym()); - return Err; - } - - if (StatePtrSize != SI.Size) { - DP("Symbol had size %u before loading, %u after\n", StatePtrSize, - SI.Size); - return HSA_STATUS_ERROR; - } - - return DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &HostDeviceEnv, - StatePtrSize, DeviceId); - } - } - return HSA_STATUS_SUCCESS; - } -}; - -hsa_status_t implCalloc(void **RetPtr, size_t Size, int DeviceId) { - uint64_t Rounded = 4 * ((Size + 3) / 4); - void *Ptr; - hsa_amd_memory_pool_t MemoryPool = DeviceInfo().getDeviceMemoryPool(DeviceId); - hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Rounded, 0, &Ptr); - if (Err != HSA_STATUS_SUCCESS) { - return Err; - } - - hsa_status_t Rc = hsa_amd_memory_fill(Ptr, 0, Rounded / 4); - if (Rc != HSA_STATUS_SUCCESS) { - DP("zero fill device_state failed with %u\n", Rc); - core::Runtime::Memfree(Ptr); - return HSA_STATUS_ERROR; - } - - *RetPtr = Ptr; - return HSA_STATUS_SUCCESS; -} - -bool imageContainsSymbol(void *Data, size_t Size, const char *Sym) { - SymbolInfo SI; - int Rc = getSymbolInfoWithoutLoading((char *)Data, Size, Sym, &SI); - return (Rc == 0) && (SI.Addr != nullptr); -} - -hsa_status_t lock_memory(void *HostPtr, size_t Size, hsa_agent_t Agent, - void **LockedHostPtr) { - hsa_status_t err = is_locked(HostPtr, LockedHostPtr); - if (err != HSA_STATUS_SUCCESS) - return err; - - // HostPtr is already locked, just return it - if (*LockedHostPtr) - return HSA_STATUS_SUCCESS; - - hsa_agent_t Agents[1] = {Agent}; - return hsa_amd_memory_lock(HostPtr, Size, Agents, /*num_agent=*/1, - LockedHostPtr); -} - -hsa_status_t unlock_memory(void *HostPtr) { - void 
*LockedHostPtr = nullptr; - hsa_status_t err = is_locked(HostPtr, &LockedHostPtr); - if (err != HSA_STATUS_SUCCESS) - return err; - - // if LockedHostPtr is nullptr, then HostPtr was not locked - if (!LockedHostPtr) - return HSA_STATUS_SUCCESS; - - err = hsa_amd_memory_unlock(HostPtr); - return err; -} - -} // namespace - -namespace core { -hsa_status_t allow_access_to_all_gpu_agents(void *Ptr) { - return hsa_amd_agents_allow_access(DeviceInfo().HSAAgents.size(), - &DeviceInfo().HSAAgents[0], NULL, Ptr); -} -} // namespace core - -static hsa_status_t GetIsaInfo(hsa_isa_t isa, void *data) { - hsa_status_t err; - uint32_t name_len; - err = hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME_LENGTH, &name_len); - if (err != HSA_STATUS_SUCCESS) { - DP("Error getting ISA info length\n"); - return err; - } - - char TargetID[name_len]; - err = hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME, TargetID); - if (err != HSA_STATUS_SUCCESS) { - DP("Error getting ISA info name\n"); - return err; - } - - auto TripleTargetID = llvm::StringRef(TargetID); - if (TripleTargetID.consume_front("amdgcn-amd-amdhsa")) { - DeviceInfo().TargetID.push_back(TripleTargetID.ltrim('-').str()); - } - return HSA_STATUS_SUCCESS; -} - -extern "C" { -int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) { - return elfMachineIdIsAmdgcn(Image); -} - -int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *image, - __tgt_image_info *info) { - if (!__tgt_rtl_is_valid_binary(image)) - return false; - - // A subarchitecture was not specified. Assume it is compatible. - if (!info->Arch) - return true; - - int32_t NumberOfDevices = __tgt_rtl_number_of_devices(); - - for (int32_t DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) { - __tgt_rtl_init_device(DeviceId); - hsa_agent_t agent = DeviceInfo().HSAAgents[DeviceId]; - hsa_status_t err = hsa_agent_iterate_isas(agent, GetIsaInfo, &DeviceId); - if (err != HSA_STATUS_SUCCESS) { - DP("Error iterating ISAs\n"); - return false; - } - if (!isImageCompatibleWithEnv(info, DeviceInfo().TargetID[DeviceId])) - return false; - } - DP("Image has Target ID compatible with the current environment: %s\n", - info->Arch); - return true; -} - -int32_t __tgt_rtl_init_plugin() { return OFFLOAD_SUCCESS; } -int32_t __tgt_rtl_deinit_plugin() { return OFFLOAD_SUCCESS; } - -int __tgt_rtl_number_of_devices() { - // If the construction failed, no methods are safe to call - if (DeviceInfo().ConstructionSucceeded) { - return DeviceInfo().NumberOfDevices; - } - DP("AMDGPU plugin construction failed. 
Zero devices available\n"); - return 0; -} - -int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { - DP("Init requires flags to %ld\n", RequiresFlags); - DeviceInfo().RequiresFlags = RequiresFlags; - return RequiresFlags; -} - -int32_t __tgt_rtl_init_device(int DeviceId) { - hsa_status_t Err = hsa_init(); - if (Err != HSA_STATUS_SUCCESS) { - DP("HSA Initialization Failed.\n"); - return HSA_STATUS_ERROR; - } - // this is per device id init - DP("Initialize the device id: %d\n", DeviceId); - - hsa_agent_t Agent = DeviceInfo().HSAAgents[DeviceId]; - - // Get number of Compute Unit - uint32_t ComputeUnits = 0; - Err = hsa_agent_get_info( - Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, - &ComputeUnits); - if (Err != HSA_STATUS_SUCCESS) { - DeviceInfo().ComputeUnits[DeviceId] = 1; - DP("Error getting compute units : settiing to 1\n"); - } else { - DeviceInfo().ComputeUnits[DeviceId] = ComputeUnits; - DP("Using %d compute unis per grid\n", DeviceInfo().ComputeUnits[DeviceId]); - } - - char GetInfoName[64]; // 64 max size returned by get info - Err = hsa_agent_get_info(Agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME, - (void *)GetInfoName); - if (Err) - DeviceInfo().GPUName[DeviceId] = "--unknown gpu--"; - else { - DeviceInfo().GPUName[DeviceId] = GetInfoName; - } - - if (print_kernel_trace & STARTUP_DETAILS) - DP("Device#%-2d CU's: %2d %s\n", DeviceId, - DeviceInfo().ComputeUnits[DeviceId], - DeviceInfo().GPUName[DeviceId].c_str()); - - // Query attributes to determine number of threads/block and blocks/grid. - uint16_t WorkgroupMaxDim[3]; - Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM, - &WorkgroupMaxDim); - if (Err != HSA_STATUS_SUCCESS) { - DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::DefaultNumTeams; - DP("Error getting grid dims: num groups : %d\n", - RTLDeviceInfoTy::DefaultNumTeams); - } else if (WorkgroupMaxDim[0] <= RTLDeviceInfoTy::HardTeamLimit) { - DeviceInfo().GroupsPerDevice[DeviceId] = WorkgroupMaxDim[0]; - DP("Using %d ROCm blocks per grid\n", - DeviceInfo().GroupsPerDevice[DeviceId]); - } else { - DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::HardTeamLimit; - DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping " - "at the hard limit\n", - WorkgroupMaxDim[0], RTLDeviceInfoTy::HardTeamLimit); - } - - // Get thread limit - hsa_dim3_t GridMaxDim; - Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim); - if (Err == HSA_STATUS_SUCCESS) { - DeviceInfo().ThreadsPerGroup[DeviceId] = - reinterpret_cast(&GridMaxDim)[0] / - DeviceInfo().GroupsPerDevice[DeviceId]; - - if (DeviceInfo().ThreadsPerGroup[DeviceId] == 0) { - DeviceInfo().ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize; - DP("Default thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize); - } else if (enforceUpperBound(&DeviceInfo().ThreadsPerGroup[DeviceId], - RTLDeviceInfoTy::MaxWgSize)) { - DP("Capped thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize); - } else { - DP("Using ROCm Queried thread limit: %d\n", - DeviceInfo().ThreadsPerGroup[DeviceId]); - } - } else { - DeviceInfo().ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize; - DP("Error getting max block dimension, use default:%d \n", - RTLDeviceInfoTy::MaxWgSize); - } - - // Get wavefront size - uint32_t WavefrontSize = 0; - Err = - hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &WavefrontSize); - if (Err == HSA_STATUS_SUCCESS) { - DP("Queried wavefront size: %d\n", WavefrontSize); - DeviceInfo().WarpSize[DeviceId] = WavefrontSize; - } else { - // TODO: 
Burn the wavefront size into the code object - DP("Warning: Unknown wavefront size, assuming 64\n"); - DeviceInfo().WarpSize[DeviceId] = 64; - } - - // Adjust teams to the env variables - - if (DeviceInfo().Env.TeamLimit > 0 && - (enforceUpperBound(&DeviceInfo().GroupsPerDevice[DeviceId], - DeviceInfo().Env.TeamLimit))) { - DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n", - DeviceInfo().Env.TeamLimit); - } - - // Set default number of teams - if (DeviceInfo().Env.NumTeams > 0) { - DeviceInfo().NumTeams[DeviceId] = DeviceInfo().Env.NumTeams; - DP("Default number of teams set according to environment %d\n", - DeviceInfo().Env.NumTeams); - } else { - char *TeamsPerCUEnvStr = getenv("OMP_TARGET_TEAMS_PER_PROC"); - int TeamsPerCU = DefaultTeamsPerCU; - if (TeamsPerCUEnvStr) { - TeamsPerCU = std::stoi(TeamsPerCUEnvStr); - } - - DeviceInfo().NumTeams[DeviceId] = - TeamsPerCU * DeviceInfo().ComputeUnits[DeviceId]; - DP("Default number of teams = %d * number of compute units %d\n", - TeamsPerCU, DeviceInfo().ComputeUnits[DeviceId]); - } - - if (enforceUpperBound(&DeviceInfo().NumTeams[DeviceId], - DeviceInfo().GroupsPerDevice[DeviceId])) { - DP("Default number of teams exceeds device limit, capping at %d\n", - DeviceInfo().GroupsPerDevice[DeviceId]); - } - - // Adjust threads to the env variables - if (DeviceInfo().Env.TeamThreadLimit > 0 && - (enforceUpperBound(&DeviceInfo().NumThreads[DeviceId], - DeviceInfo().Env.TeamThreadLimit))) { - DP("Capping max number of threads to OMP_TEAMS_THREAD_LIMIT=%d\n", - DeviceInfo().Env.TeamThreadLimit); - } - - // Set default number of threads - DeviceInfo().NumThreads[DeviceId] = RTLDeviceInfoTy::DefaultWgSize; - DP("Default number of threads set according to library's default %d\n", - RTLDeviceInfoTy::DefaultWgSize); - if (enforceUpperBound(&DeviceInfo().NumThreads[DeviceId], - DeviceInfo().ThreadsPerGroup[DeviceId])) { - DP("Default number of threads exceeds device limit, capping at %d\n", - DeviceInfo().ThreadsPerGroup[DeviceId]); - } - - DP("Device %d: default limit for groupsPerDevice %d & threadsPerGroup %d\n", - DeviceId, DeviceInfo().GroupsPerDevice[DeviceId], - DeviceInfo().ThreadsPerGroup[DeviceId]); - - DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", DeviceId, - DeviceInfo().WarpSize[DeviceId], DeviceInfo().ThreadsPerGroup[DeviceId], - DeviceInfo().GroupsPerDevice[DeviceId], - DeviceInfo().GroupsPerDevice[DeviceId] * - DeviceInfo().ThreadsPerGroup[DeviceId]); - - return OFFLOAD_SUCCESS; -} - -static __tgt_target_table * -__tgt_rtl_load_binary_locked(int32_t DeviceId, __tgt_device_image *Image); - -__tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId, - __tgt_device_image *Image) { - DeviceInfo().LoadRunLock.lock(); - __tgt_target_table *Res = __tgt_rtl_load_binary_locked(DeviceId, Image); - DeviceInfo().LoadRunLock.unlock(); - return Res; -} - -__tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId, - __tgt_device_image *Image) { - // This function loads the device image onto gpu[DeviceId] and does other - // per-image initialization work. Specifically: - // - // - Initialize an DeviceEnvironmentTy instance embedded in the - // image at the symbol "__omp_rtl_device_environment" - // Fields DebugKind, DeviceNum, NumDevices. Used by the deviceRTL. 
- // - // - Allocate a large array per-gpu (could be moved to init_device) - // - Read a uint64_t at symbol omptarget_nvptx_device_State_size - // - Allocate at least that many bytes of gpu memory - // - Zero initialize it - // - Write the pointer to the symbol omptarget_nvptx_device_State - // - // - Pulls some per-kernel information together from various sources and - // records it in the KernelsList for quicker access later - // - // The initialization can be done before or after loading the image onto the - // gpu. This function presently does a mixture. Using the hsa api to get/set - // the information is simpler to implement, in exchange for more complicated - // runtime behaviour. E.g. launching a kernel or using dma to get eight bytes - // back from the gpu vs a hashtable lookup on the host. - - const size_t ImgSize = (char *)Image->ImageEnd - (char *)Image->ImageStart; - - DeviceInfo().clearOffloadEntriesTable(DeviceId); - - // We do not need to set the ELF version because the caller of this function - // had to do that to decide the right runtime to use - - if (!elfMachineIdIsAmdgcn(Image)) - return NULL; - - { - auto Env = - DeviceEnvironment(DeviceId, DeviceInfo().NumberOfDevices, - DeviceInfo().Env.DynamicMemSize, Image, ImgSize); - - auto &KernelInfo = DeviceInfo().KernelInfoTable[DeviceId]; - auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId]; - hsa_status_t Err = moduleRegisterFromMemoryToPlace( - KernelInfo, SymbolInfo, (void *)Image->ImageStart, ImgSize, DeviceId, - [&](void *Data, size_t Size) { - if (imageContainsSymbol(Data, Size, "needs_hostcall_buffer")) { - __atomic_store_n(&DeviceInfo().HostcallRequired, true, - __ATOMIC_RELEASE); - } - return Env.beforeLoading(Data, Size); - }, - DeviceInfo().HSAExecutables); - - check("Module registering", Err); - if (Err != HSA_STATUS_SUCCESS) { - const char *DeviceName = DeviceInfo().GPUName[DeviceId].c_str(); - const char *ElfName = get_elf_mach_gfx_name(elfEFlags(Image)); - - if (strcmp(DeviceName, ElfName) != 0) { - DP("Possible gpu arch mismatch: device:%s, image:%s please check" - " compiler flag: -march=\n", - DeviceName, ElfName); - } else { - DP("Error loading image onto GPU: %s\n", get_error_string(Err)); - } - - return NULL; - } - - Err = Env.afterLoading(); - if (Err != HSA_STATUS_SUCCESS) { - return NULL; - } - } - - DP("AMDGPU module successfully loaded!\n"); - - { - // the device_State array is either large value in bss or a void* that - // needs to be assigned to a pointer to an array of size device_state_bytes - // If absent, it has been deadstripped and needs no setup. - - void *StatePtr; - uint32_t StatePtrSize; - auto &SymbolInfoMap = DeviceInfo().SymbolInfoTable[DeviceId]; - hsa_status_t Err = interop_hsa_get_symbol_info( - SymbolInfoMap, DeviceId, "omptarget_nvptx_device_State", &StatePtr, - &StatePtrSize); - - if (Err != HSA_STATUS_SUCCESS) { - DP("No device_state symbol found, skipping initialization\n"); - } else { - if (StatePtrSize < sizeof(void *)) { - DP("unexpected size of state_ptr %u != %zu\n", StatePtrSize, - sizeof(void *)); - return NULL; - } - - // if it's larger than a void*, assume it's a bss array and no further - // initialization is required. 
Only try to set up a pointer for - // sizeof(void*) - if (StatePtrSize == sizeof(void *)) { - uint64_t DeviceStateBytes = - getDeviceStateBytes((char *)Image->ImageStart, ImgSize); - if (DeviceStateBytes == 0) { - DP("Can't initialize device_State, missing size information\n"); - return NULL; - } - - auto &DSS = DeviceInfo().DeviceStateStore[DeviceId]; - if (DSS.first.get() == nullptr) { - assert(DSS.second == 0); - void *Ptr = NULL; - hsa_status_t Err = implCalloc(&Ptr, DeviceStateBytes, DeviceId); - if (Err != HSA_STATUS_SUCCESS) { - DP("Failed to allocate device_state array\n"); - return NULL; - } - DSS = { - std::unique_ptr{Ptr}, - DeviceStateBytes, - }; - } - - void *Ptr = DSS.first.get(); - if (DeviceStateBytes != DSS.second) { - DP("Inconsistent sizes of device_State unsupported\n"); - return NULL; - } - - // write ptr to device memory so it can be used by later kernels - Err = DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &Ptr, - sizeof(void *), DeviceId); - if (Err != HSA_STATUS_SUCCESS) { - DP("memcpy install of state_ptr failed\n"); - return NULL; - } - } - } - } - - // Here, we take advantage of the data that is appended after img_end to get - // the symbols' name we need to load. This data consist of the host entries - // begin and end as well as the target name (see the offloading linker script - // creation in clang compiler). - - // Find the symbols in the module by name. The name can be obtain by - // concatenating the host entry name with the target name - - __tgt_offload_entry *HostBegin = Image->EntriesBegin; - __tgt_offload_entry *HostEnd = Image->EntriesEnd; - - for (__tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) { - - if (!E->addr) { - // The host should have always something in the address to - // uniquely identify the target region. - DP("Analyzing host entry '' (size = %lld)...\n", - (unsigned long long)E->size); - return NULL; - } - - if (E->size) { - __tgt_offload_entry Entry = *E; - - void *Varptr; - uint32_t Varsize; - - auto &SymbolInfoMap = DeviceInfo().SymbolInfoTable[DeviceId]; - hsa_status_t Err = interop_hsa_get_symbol_info( - SymbolInfoMap, DeviceId, E->name, &Varptr, &Varsize); - - if (Err != HSA_STATUS_SUCCESS) { - // Inform the user what symbol prevented offloading - DP("Loading global '%s' (Failed)\n", E->name); - return NULL; - } - - if (Varsize != E->size) { - DP("Loading global '%s' - size mismatch (%u != %lu)\n", E->name, - Varsize, E->size); - return NULL; - } - - DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", - DPxPTR(E - HostBegin), E->name, DPxPTR(Varptr)); - Entry.addr = (void *)Varptr; - - DeviceInfo().addOffloadEntry(DeviceId, Entry); - - if (DeviceInfo().RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && - E->flags & OMP_DECLARE_TARGET_LINK) { - // If unified memory is present any target link variables - // can access host addresses directly. There is no longer a - // need for device copies. 
- Err = DeviceInfo().freesignalpoolMemcpyH2D(Varptr, E->addr, - sizeof(void *), DeviceId); - if (Err != HSA_STATUS_SUCCESS) - DP("Error when copying USM\n"); - DP("Copy linked variable host address (" DPxMOD ")" - "to device address (" DPxMOD ")\n", - DPxPTR(*((void **)E->addr)), DPxPTR(Varptr)); - } - - continue; - } - - DP("to find the kernel name: %s size: %lu\n", E->name, strlen(E->name)); - - // errors in kernarg_segment_size previously treated as = 0 (or as undef) - uint32_t KernargSegmentSize = 0; - auto &KernelInfoMap = DeviceInfo().KernelInfoTable[DeviceId]; - hsa_status_t Err = HSA_STATUS_SUCCESS; - if (!E->name) { - Err = HSA_STATUS_ERROR; - } else { - std::string KernelStr = std::string(E->name); - auto It = KernelInfoMap.find(KernelStr); - if (It != KernelInfoMap.end()) { - atl_kernel_info_t Info = It->second; - KernargSegmentSize = Info.kernel_segment_size; - } else { - Err = HSA_STATUS_ERROR; - } - } - - // default value GENERIC (in case symbol is missing from cubin file) - llvm::omp::OMPTgtExecModeFlags ExecModeVal = - llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC; - - // get flat group size if present, else Default_WG_Size - int16_t WGSizeVal = RTLDeviceInfoTy::DefaultWgSize; - - // get Kernel Descriptor if present. - // Keep struct in sync wih getTgtAttributeStructQTy in CGOpenMPRuntime.cpp - struct KernDescValType { - uint16_t Version; - uint16_t TSize; - uint16_t WGSize; - }; - struct KernDescValType KernDescVal; - std::string KernDescNameStr(E->name); - KernDescNameStr += "_kern_desc"; - const char *KernDescName = KernDescNameStr.c_str(); - - const void *KernDescPtr; - uint32_t KernDescSize; - void *CallStackAddr = nullptr; - Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, KernDescName, - &KernDescPtr, &KernDescSize); - - if (Err == HSA_STATUS_SUCCESS) { - if ((size_t)KernDescSize != sizeof(KernDescVal)) - DP("Loading global computation properties '%s' - size mismatch (%u != " - "%lu)\n", - KernDescName, KernDescSize, sizeof(KernDescVal)); - - memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize); - - // Check structure size against recorded size. 
- if ((size_t)KernDescSize != KernDescVal.TSize) - DP("KernDescVal size %lu does not match advertized size %d for '%s'\n", - sizeof(KernDescVal), KernDescVal.TSize, KernDescName); - - DP("After loading global for %s KernDesc \n", KernDescName); - DP("KernDesc: Version: %d\n", KernDescVal.Version); - DP("KernDesc: TSize: %d\n", KernDescVal.TSize); - DP("KernDesc: WG_Size: %d\n", KernDescVal.WGSize); - - if (KernDescVal.WGSize == 0) { - KernDescVal.WGSize = RTLDeviceInfoTy::DefaultWgSize; - DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WGSize); - } - WGSizeVal = KernDescVal.WGSize; - DP("WGSizeVal %d\n", WGSizeVal); - check("Loading KernDesc computation property", Err); - } else { - DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName); - - // Flat group size - std::string WGSizeNameStr(E->name); - WGSizeNameStr += "_wg_size"; - const char *WGSizeName = WGSizeNameStr.c_str(); - - const void *WGSizePtr; - uint32_t WGSize; - Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, WGSizeName, - &WGSizePtr, &WGSize); - - if (Err == HSA_STATUS_SUCCESS) { - if ((size_t)WGSize != sizeof(int16_t)) { - DP("Loading global computation properties '%s' - size mismatch (%u " - "!= " - "%lu)\n", - WGSizeName, WGSize, sizeof(int16_t)); - return NULL; - } - - memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize); - - DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal); - - if (WGSizeVal < RTLDeviceInfoTy::DefaultWgSize || - WGSizeVal > RTLDeviceInfoTy::MaxWgSize) { - DP("Error wrong WGSize value specified in HSA code object file: " - "%d\n", - WGSizeVal); - WGSizeVal = RTLDeviceInfoTy::DefaultWgSize; - } - } else { - DP("Warning: Loading WGSize '%s' - symbol not found, " - "using default value %d\n", - WGSizeName, WGSizeVal); - } - - check("Loading WGSize computation property", Err); - } - - // Read execution mode from global in binary - std::string ExecModeNameStr(E->name); - ExecModeNameStr += "_exec_mode"; - const char *ExecModeName = ExecModeNameStr.c_str(); - - const void *ExecModePtr; - uint32_t VarSize; - Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, ExecModeName, - &ExecModePtr, &VarSize); - - if (Err == HSA_STATUS_SUCCESS) { - if ((size_t)VarSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) { - DP("Loading global computation properties '%s' - size mismatch(%u != " - "%lu)\n", - ExecModeName, VarSize, sizeof(llvm::omp::OMPTgtExecModeFlags)); - return NULL; - } - - memcpy(&ExecModeVal, ExecModePtr, (size_t)VarSize); - - DP("After loading global for %s ExecMode = %d\n", ExecModeName, - ExecModeVal); - - if (ExecModeVal < 0 || - ExecModeVal > llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD) { - DP("Error wrong exec_mode value specified in HSA code object file: " - "%d\n", - ExecModeVal); - return NULL; - } - } else { - DP("Loading global exec_mode '%s' - symbol missing, using default " - "value " - "GENERIC (1)\n", - ExecModeName); - } - check("Loading computation property", Err); - - KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId, - CallStackAddr, E->name, KernargSegmentSize, - DeviceInfo().KernArgPool)); - __tgt_offload_entry Entry = *E; - Entry.addr = (void *)&KernelsList.back(); - DeviceInfo().addOffloadEntry(DeviceId, Entry); - DP("Entry point %ld maps to %s\n", E - HostBegin, E->name); - } - - return DeviceInfo().getOffloadEntriesTable(DeviceId); -} - -void *__tgt_rtl_data_alloc(int DeviceId, int64_t Size, void *, int32_t Kind) { - void *Ptr = NULL; - assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too 
large"); - - hsa_amd_memory_pool_t MemoryPool; - switch (Kind) { - case TARGET_ALLOC_DEFAULT: - case TARGET_ALLOC_DEVICE: - // GPU memory - MemoryPool = DeviceInfo().getDeviceMemoryPool(DeviceId); - break; - case TARGET_ALLOC_HOST: - // non-migratable memory accessible by host and device(s) - MemoryPool = DeviceInfo().getHostMemoryPool(); - break; - default: - REPORT("Invalid target data allocation kind or requested allocator not " - "implemented yet\n"); - return NULL; - } - - hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, &Ptr); - DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", Size, - (long long unsigned)(Elf64_Addr)Ptr); - Ptr = (Err == HSA_STATUS_SUCCESS) ? Ptr : NULL; - return Ptr; -} - -int32_t __tgt_rtl_data_submit(int DeviceId, void *TgtPtr, void *HstPtr, - int64_t Size) { - assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); - __tgt_async_info AsyncInfo; - int32_t Rc = dataSubmit(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo); - if (Rc != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - - return __tgt_rtl_synchronize(DeviceId, &AsyncInfo); -} - -int32_t __tgt_rtl_data_submit_async(int DeviceId, void *TgtPtr, void *HstPtr, - int64_t Size, __tgt_async_info *AsyncInfo) { - assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); - if (AsyncInfo) { - initAsyncInfo(AsyncInfo); - return dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfo); - } - return __tgt_rtl_data_submit(DeviceId, TgtPtr, HstPtr, Size); -} - -int32_t __tgt_rtl_data_retrieve(int DeviceId, void *HstPtr, void *TgtPtr, - int64_t Size) { - assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); - __tgt_async_info AsyncInfo; - int32_t Rc = dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, &AsyncInfo); - if (Rc != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - - return __tgt_rtl_synchronize(DeviceId, &AsyncInfo); -} - -int32_t __tgt_rtl_data_retrieve_async(int DeviceId, void *HstPtr, void *TgtPtr, - int64_t Size, - __tgt_async_info *AsyncInfo) { - assert(AsyncInfo && "AsyncInfo is nullptr"); - assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); - initAsyncInfo(AsyncInfo); - return dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfo); -} - -int32_t __tgt_rtl_data_delete(int DeviceId, void *TgtPtr, int32_t) { - assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); - // HSA can free pointers allocated from different types of memory pool. 
- hsa_status_t Err; - DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)TgtPtr); - Err = core::Runtime::Memfree(TgtPtr); - if (Err != HSA_STATUS_SUCCESS) { - DP("Error when freeing CUDA memory\n"); - return OFFLOAD_FAIL; - } - return OFFLOAD_SUCCESS; -} - -int32_t __tgt_rtl_launch_kernel(int32_t DeviceId, void *TgtEntryPtr, - void **TgtArgs, ptrdiff_t *TgtOffsets, - KernelArgsTy *KernelArgs, - __tgt_async_info *AsyncInfo) { - assert(!KernelArgs->NumTeams[1] && !KernelArgs->NumTeams[2] && - !KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] && - "Only one dimensional kernels supported."); - assert(AsyncInfo && "AsyncInfo is nullptr"); - initAsyncInfo(AsyncInfo); - - DeviceInfo().LoadRunLock.lock_shared(); - int32_t Res = - runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, - KernelArgs->NumArgs, KernelArgs->NumTeams[0], - KernelArgs->ThreadLimit[0], KernelArgs->Tripcount); - - DeviceInfo().LoadRunLock.unlock_shared(); - return Res; -} - -int32_t __tgt_rtl_synchronize(int32_t DeviceId, __tgt_async_info *AsyncInfo) { - assert(AsyncInfo && "AsyncInfo is nullptr"); - - // Cuda asserts that AsyncInfo->Queue is non-null, but this invariant - // is not ensured by devices.cpp for amdgcn - // assert(AsyncInfo->Queue && "AsyncInfo->Queue is nullptr"); - if (AsyncInfo->Queue) { - finiAsyncInfo(AsyncInfo); - } - return OFFLOAD_SUCCESS; -} - -void __tgt_rtl_print_device_info(int32_t DeviceId) { - // TODO: Assertion to see if DeviceId is correct - // NOTE: We don't need to set context for print device info. - - DeviceInfo().printDeviceInfo(DeviceId, DeviceInfo().HSAAgents[DeviceId]); -} - -int32_t __tgt_rtl_data_lock(int32_t DeviceId, void *HostPtr, int64_t Size, - void **LockedHostPtr) { - assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); - - hsa_agent_t Agent = DeviceInfo().HSAAgents[DeviceId]; - hsa_status_t err = lock_memory(HostPtr, Size, Agent, LockedHostPtr); - if (err != HSA_STATUS_SUCCESS) { - DP("Error in tgt_rtl_data_lock\n"); - return OFFLOAD_FAIL; - } - DP("Tgt lock host data %ld bytes, (HostPtr:%016llx).\n", Size, - (long long unsigned)(Elf64_Addr)*LockedHostPtr); - return OFFLOAD_SUCCESS; -} - -int32_t __tgt_rtl_data_unlock(int DeviceId, void *HostPtr) { - assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); - hsa_status_t err = unlock_memory(HostPtr); - if (err != HSA_STATUS_SUCCESS) { - DP("Error in tgt_rtl_data_unlock\n"); - return OFFLOAD_FAIL; - } - - DP("Tgt unlock data (tgt:%016llx).\n", - (long long unsigned)(Elf64_Addr)HostPtr); - return OFFLOAD_SUCCESS; -} - -} // extern "C" diff --git a/openmp/libomptarget/plugins/common/CMakeLists.txt b/openmp/libomptarget/plugins/common/CMakeLists.txt deleted file mode 100644 --- a/openmp/libomptarget/plugins/common/CMakeLists.txt +++ /dev/null @@ -1,14 +0,0 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. 
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Common parts which can be used by all plugins -# -##===----------------------------------------------------------------------===## - -add_subdirectory(elf_common) -add_subdirectory(MemoryManager) diff --git a/openmp/libomptarget/plugins/cuda/CMakeLists.txt b/openmp/libomptarget/plugins/cuda/CMakeLists.txt deleted file mode 100644 --- a/openmp/libomptarget/plugins/cuda/CMakeLists.txt +++ /dev/null @@ -1,76 +0,0 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Build a plugin for a CUDA machine if available. -# -##===----------------------------------------------------------------------===## -set(LIBOMPTARGET_BUILD_CUDA_PLUGIN TRUE CACHE BOOL - "Whether to build CUDA plugin") -if (NOT LIBOMPTARGET_BUILD_CUDA_PLUGIN) - libomptarget_say("Not building CUDA offloading plugin: LIBOMPTARGET_BUILD_CUDA_PLUGIN is false") - return() -endif() - -if (NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux")) - libomptarget_say("Not building CUDA offloading plugin: only support CUDA in Linux x86_64, ppc64le, or aarch64 hosts.") - return() -endif() - -libomptarget_say("Building CUDA offloading plugin.") - -set(LIBOMPTARGET_DLOPEN_LIBCUDA OFF) -option(LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA "Build with dlopened libcuda" ${LIBOMPTARGET_DLOPEN_LIBCUDA}) - -add_llvm_library(omptarget.rtl.cuda SHARED - src/rtl.cpp - - LINK_COMPONENTS - Support - Object - - LINK_LIBS PRIVATE - elf_common - MemoryManager - ${OPENMP_PTHREAD_LIB} - "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports,-z,defs" - - NO_INSTALL_RPATH -) - -if(LIBOMPTARGET_DEP_CUDA_FOUND AND NOT LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA) - libomptarget_say("Building CUDA plugin linked against libcuda") - target_link_libraries(omptarget.rtl.cuda PRIVATE CUDA::cuda_driver) -else() - libomptarget_say("Building CUDA plugin for dlopened libcuda") - target_include_directories(omptarget.rtl.cuda PRIVATE dynamic_cuda) - target_sources(omptarget.rtl.cuda PRIVATE dynamic_cuda/cuda.cpp) -endif() - -# Define the suffix for the runtime messaging dumps. -target_compile_definitions(omptarget.rtl.cuda PRIVATE TARGET_NAME="CUDA") -target_include_directories(omptarget.rtl.cuda PRIVATE ${LIBOMPTARGET_INCLUDE_DIR}) - -# Install plugin under the lib destination folder. -install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") -set_target_properties(omptarget.rtl.cuda PROPERTIES - INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.." - CXX_VISIBILITY_PRESET protected) - -# Report to the parent scope that we are building a plugin for CUDA. 
-# This controls whether tests are run for the nvptx offloading target
-# Run them if libcuda is available, or if the user explicitly asked for dlopen
-# Otherwise this plugin is being built speculatively and there may be no cuda available
-option(LIBOMPTARGET_FORCE_NVIDIA_TESTS "Build NVIDIA libomptarget tests" OFF)
-if (LIBOMPTARGET_FOUND_NVIDIA_GPU OR LIBOMPTARGET_FORCE_NVIDIA_TESTS)
-  libomptarget_say("Enable tests using CUDA plugin")
-  set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda nvptx64-nvidia-cuda-LTO" PARENT_SCOPE)
-  list(APPEND LIBOMPTARGET_TESTED_PLUGINS "omptarget.rtl.cuda")
-  set(LIBOMPTARGET_TESTED_PLUGINS "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE)
-else()
-  libomptarget_say("Disabling tests using CUDA plugin as cuda may not be available")
-endif()
diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
deleted file mode 100644
--- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ /dev/null
@@ -1,1925 +0,0 @@
-//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// RTL for CUDA machine
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/StringRef.h"
-
-#include <cassert>
-#include <cstddef>
-#include <cuda.h>
-#include <list>
-#include <memory>
-#include <mutex>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "Debug.h"
-#include "DeviceEnvironment.h"
-#include "omptarget.h"
-#include "omptargetplugin.h"
-
-#ifndef TARGET_NAME
-#define TARGET_NAME CUDA
-#endif
-#ifndef DEBUG_PREFIX
-#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
-#endif
-
-#include "MemoryManager.h"
-
-#include "llvm/Frontend/OpenMP/OMPConstants.h"
-
-using namespace llvm;
-
-// Utility for retrieving and printing CUDA error string.
-#ifdef OMPTARGET_DEBUG
-#define CUDA_ERR_STRING(err)                                                   \
-  do {                                                                         \
-    if (getDebugLevel() > 0) {                                                 \
-      const char *errStr = nullptr;                                            \
-      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
-      if (errStr_status == CUDA_ERROR_INVALID_VALUE)                           \
-        REPORT("Unrecognized CUDA error code: %d\n", err);                     \
-      else if (errStr_status == CUDA_SUCCESS)                                  \
-        REPORT("CUDA error is: %s\n", errStr);                                 \
-      else {                                                                   \
-        REPORT("Unresolved CUDA error code: %d\n", err);                       \
-        REPORT("Unsuccessful cuGetErrorString return status: %d\n",            \
-               errStr_status);                                                 \
-      }                                                                        \
-    } else {                                                                   \
-      const char *errStr = nullptr;                                            \
-      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
-      if (errStr_status == CUDA_SUCCESS)                                       \
-        REPORT("%s \n", errStr);                                               \
-    }                                                                          \
-  } while (false)
-#else // OMPTARGET_DEBUG
-#define CUDA_ERR_STRING(err)                                                   \
-  do {                                                                         \
-    const char *errStr = nullptr;                                              \
-    CUresult errStr_status = cuGetErrorString(err, &errStr);                   \
-    if (errStr_status == CUDA_SUCCESS)                                         \
-      REPORT("%s \n", errStr);                                                 \
-  } while (false)
-#endif // OMPTARGET_DEBUG
-
-#define BOOL2TEXT(b) ((b) ? "Yes" : "No")
-
-#include "elf_common.h"
-
-/// Keep entries table per device.
-struct FuncOrGblEntryTy {
-  __tgt_target_table Table;
-  std::vector<__tgt_offload_entry> Entries;
-};
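The loadBinary logic further down (and the AMDGPU equivalent above) both rely on a naming convention: each kernel entry "foo" is paired with a global "foo_exec_mode" in the device image, holding an OMPTgtExecModeFlags value, with a missing symbol meaning GENERIC (1). A minimal, self-contained sketch of that convention; lookupGlobal is a hypothetical stand-in for cuModuleGetGlobal / interopGetSymbolInfo, and the kernel name is made up:

    #include <cstdint>
    #include <optional>
    #include <string>

    enum class ExecMode : int8_t { Generic = 1, SPMD = 2, GenericSPMD = 3 };

    // Stub: pretend the symbol is absent from the image.
    static std::optional<int8_t> lookupGlobal(const std::string &) {
      return std::nullopt;
    }

    ExecMode readExecMode(const std::string &KernelName) {
      if (std::optional<int8_t> Val = lookupGlobal(KernelName + "_exec_mode"))
        return static_cast<ExecMode>(*Val);
      return ExecMode::Generic; // default when the image lacks the symbol
    }

    int main() {
      return readExecMode("__omp_offloading_foo") == ExecMode::Generic ? 0 : 1;
    }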
-/// Use a single entity to encode a kernel and a set of flags.
-struct KernelTy {
-  CUfunction Func;
-
-  // execution mode of kernel
-  llvm::omp::OMPTgtExecModeFlags ExecutionMode;
-
-  /// Maximal number of threads per block for this kernel.
-  int MaxThreadsPerBlock = 0;
-
-  KernelTy(CUfunction Func, llvm::omp::OMPTgtExecModeFlags ExecutionMode)
-      : Func(Func), ExecutionMode(ExecutionMode) {}
-};
-
-namespace {
-bool checkResult(CUresult Err, const char *ErrMsg) {
-  if (Err == CUDA_SUCCESS)
-    return true;
-
-  REPORT("%s", ErrMsg);
-  CUDA_ERR_STRING(Err);
-  return false;
-}
-
-int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
-               CUstream Stream) {
-  CUresult Err =
-      cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);
-
-  if (Err != CUDA_SUCCESS) {
-    DP("Error when copying data from device to device. Pointers: src "
-       "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
-       DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
-    CUDA_ERR_STRING(Err);
-    return OFFLOAD_FAIL;
-  }
-
-  return OFFLOAD_SUCCESS;
-}
-
-int recordEvent(void *EventPtr, __tgt_async_info *AsyncInfo) {
-  CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
-  CUevent Event = reinterpret_cast<CUevent>(EventPtr);
-
-  CUresult Err = cuEventRecord(Event, Stream);
-  if (Err != CUDA_SUCCESS) {
-    DP("Error when recording event. stream = " DPxMOD ", event = " DPxMOD "\n",
-       DPxPTR(Stream), DPxPTR(Event));
-    CUDA_ERR_STRING(Err);
-    return OFFLOAD_FAIL;
-  }
-
-  return OFFLOAD_SUCCESS;
-}
-
-int syncEvent(void *EventPtr) {
-  CUevent Event = reinterpret_cast<CUevent>(EventPtr);
-
-  CUresult Err = cuEventSynchronize(Event);
-  if (Err != CUDA_SUCCESS) {
-    DP("Error when syncing event = " DPxMOD "\n", DPxPTR(Event));
-    CUDA_ERR_STRING(Err);
-    return OFFLOAD_FAIL;
-  }
-
-  return OFFLOAD_SUCCESS;
-}
-
-namespace {
-
-// Structure contains per-device data
-struct DeviceDataTy {
-  /// List that contains all the kernels.
-  std::list<KernelTy> KernelsList;
-
-  std::list<FuncOrGblEntryTy> FuncGblEntries;
-
-  CUcontext Context = nullptr;
-  // Device properties
-  unsigned int ThreadsPerBlock = 0;
-  unsigned int BlocksPerGrid = 0;
-  unsigned int WarpSize = 0;
-  // OpenMP properties
-  unsigned int NumTeams = 0;
-  unsigned int NumThreads = 0;
-};
-
-/// Resource allocator where \p T is the resource type.
-/// Functions \p create and \p destroy return OFFLOAD_SUCCESS and OFFLOAD_FAIL
-/// accordingly. The implementation should not raise any exception.
-template <typename T> struct AllocatorTy {
-  using ElementTy = T;
-  virtual ~AllocatorTy() {}
-
-  /// Create a resource and assign to R.
-  virtual int create(T &R) noexcept = 0;
-  /// Destroy the resource.
-  virtual int destroy(T) noexcept = 0;
-};
-
-/// Allocator for CUstream.
-struct StreamAllocatorTy final : public AllocatorTy<CUstream> {
-  /// See AllocatorTy::create.
-  int create(CUstream &Stream) noexcept override {
-    if (!checkResult(cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING),
-                     "Error returned from cuStreamCreate\n"))
-      return OFFLOAD_FAIL;
-
-    return OFFLOAD_SUCCESS;
-  }
-
-  /// See AllocatorTy::destroy.
-  int destroy(CUstream Stream) noexcept override {
-    if (!checkResult(cuStreamDestroy(Stream),
-                     "Error returned from cuStreamDestroy\n"))
-      return OFFLOAD_FAIL;
-
-    return OFFLOAD_SUCCESS;
-  }
-};
-/// Allocator for CUevent.
-struct EventAllocatorTy final : public AllocatorTy<CUevent> {
-  /// See AllocatorTy::create.
-  int create(CUevent &Event) noexcept override {
-    if (!checkResult(cuEventCreate(&Event, CU_EVENT_DEFAULT),
-                     "Error returned from cuEventCreate\n"))
-      return OFFLOAD_FAIL;
-
-    return OFFLOAD_SUCCESS;
-  }
-
-  /// See AllocatorTy::destroy.
-  int destroy(CUevent Event) noexcept override {
-    if (!checkResult(cuEventDestroy(Event),
-                     "Error returned from cuEventDestroy\n"))
-      return OFFLOAD_FAIL;
-
-    return OFFLOAD_SUCCESS;
-  }
-};
-
-/// A generic pool of resources where \p T is the resource type.
-/// \p T should be copyable as the object is stored in \p std::vector .
-template <typename AllocTy> class ResourcePoolTy {
-  using ElementTy = typename AllocTy::ElementTy;
-  /// Index of the next available resource.
-  size_t Next = 0;
-  /// Mutex to guard the pool.
-  std::mutex Mutex;
-  /// Pool of resources. The difference between \p Resources and \p Pool is,
-  /// when a resource is acquired and released, it is all on \p Resources. When
-  /// a batch of new resources are needed, they are both added to \p Resources
-  /// and \p Pool. The reason for this setting is, \p Resources could contain
-  /// redundant elements because resources are not released, which can cause
-  /// double free. This setting makes sure that \p Pool always has every
-  /// resource allocated from the device.
-  std::vector<ElementTy> Resources;
-  std::vector<ElementTy> Pool;
-  /// A reference to the corresponding allocator.
-  AllocTy Allocator;
-
-  /// If `Resources` is used up, we will fill in more resources. It assumes that
-  /// the new size `Size` should be always larger than the current size.
-  bool resize(size_t Size) {
-    assert(Resources.size() == Pool.size() && "size mismatch");
-    auto CurSize = Resources.size();
-    assert(Size > CurSize && "Unexpected smaller size");
-    Pool.reserve(Size);
-    Resources.reserve(Size);
-    for (auto I = CurSize; I < Size; ++I) {
-      ElementTy NewItem;
-      int Ret = Allocator.create(NewItem);
-      if (Ret != OFFLOAD_SUCCESS)
-        return false;
-      Pool.push_back(NewItem);
-      Resources.push_back(NewItem);
-    }
-    return true;
-  }
-
-public:
-  ResourcePoolTy(AllocTy &&A, size_t Size = 0) noexcept
-      : Allocator(std::move(A)) {
-    if (Size)
-      (void)resize(Size);
-  }
-
-  ~ResourcePoolTy() noexcept { clear(); }
-
-  /// Get a resource from pool. `Next` always points to the next available
-  /// resource. That means, `[0, next-1]` have been assigned, and `[id,]` are
-  /// still available. If there is no resource left, we will ask for more. Each
-  /// time a resource is assigned, the id will increase one.
-  /// xxxxxs+++++++++
-  ///      ^
-  ///      Next
-  /// After assignment, the pool becomes the following and s is assigned.
-  /// xxxxxs+++++++++
-  ///       ^
-  ///       Next
-  int acquire(ElementTy &R) noexcept {
-    std::lock_guard<std::mutex> LG(Mutex);
-    if (Next == Resources.size()) {
-      auto NewSize = Resources.size() ? Resources.size() * 2 : 1;
-      if (!resize(NewSize))
-        return OFFLOAD_FAIL;
-    }
-
-    assert(Next < Resources.size());
-
-    R = Resources[Next++];
-
-    return OFFLOAD_SUCCESS;
-  }
-
-  /// Return the resource back to the pool. When we return a resource, we need
-  /// to first decrease `Next`, and then copy the resource back. It is worth
-  /// noting that, the order of resources return might be different from that
-  /// they're assigned, that saying, at some point, there might be two identical
-  /// resources.
-  /// xxax+a+++++
-  ///     ^
-  ///     Next
-  /// However, it doesn't matter, because they're always on the two sides of
-  /// `Next`. The left one will in the end be overwritten by another resource.
-  /// Therefore, after several execution, the order of pool might be different
-  /// from its initial state.
-  void release(ElementTy R) noexcept {
-    std::lock_guard<std::mutex> LG(Mutex);
-    Resources[--Next] = R;
-  }
-
-  /// Released all stored resources and clear the pool.
-  /// Note: This function is not thread safe. Be sure to guard it if necessary.
-  void clear() noexcept {
-    for (auto &R : Pool)
-      (void)Allocator.destroy(R);
-    Pool.clear();
-    Resources.clear();
-  }
-};
-
-} // namespace
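The pool above hands out resources by sliding an index rather than erasing elements: [0, Next) are handed out, [Next, size) are free, and a release swaps the resource back just below Next. A standalone sketch of the same discipline, trimmed to integers so it runs without CUDA; MiniPool is illustrative, not the deleted class (which also doubles capacity on growth):

    #include <cstdio>
    #include <mutex>
    #include <vector>

    template <typename T> class MiniPool {
      std::mutex M;
      std::vector<T> Items;
      size_t Next = 0;
      T Fresh{};

    public:
      T acquire() {
        std::lock_guard<std::mutex> LG(M);
        if (Next == Items.size())
          Items.push_back(Fresh++); // grow on demand, like resize()
        return Items[Next++];
      }
      void release(T R) {
        std::lock_guard<std::mutex> LG(M);
        Items[--Next] = R;
      }
    };

    int main() {
      MiniPool<int> Pool;
      int A = Pool.acquire(), B = Pool.acquire();
      Pool.release(A); // out-of-order release is fine: ordering may change,
      Pool.release(B); // but every slot stays on the correct side of Next.
      std::printf("recycled %d and %d\n", Pool.acquire(), Pool.acquire());
    }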
-class DeviceRTLTy {
-  int NumberOfDevices;
-  // OpenMP environment properties
-  int EnvNumTeams;
-  unsigned int EnvTeamLimit;
-  unsigned int EnvTeamThreadLimit;
-  // OpenMP requires flags
-  int64_t RequiresFlags;
-  // Amount of dynamic shared memory to use at launch.
-  uint64_t DynamicMemorySize;
-
-  /// Number of initial streams for each device.
-  int NumInitialStreams = 32;
-
-  /// Number of initial events for each device.
-  int NumInitialEvents = 8;
-
-  static constexpr const int32_t HardThreadLimit = 1024;
-  static constexpr const int32_t DefaultNumTeams = 128;
-  static constexpr const int32_t DefaultNumThreads = 128;
-
-  using StreamPoolTy = ResourcePoolTy<StreamAllocatorTy>;
-  std::vector<std::unique_ptr<StreamPoolTy>> StreamPool;
-
-  using EventPoolTy = ResourcePoolTy<EventAllocatorTy>;
-  std::vector<std::unique_ptr<EventPoolTy>> EventPool;
-
-  std::vector<DeviceDataTy> DeviceData;
-  std::vector<std::vector<CUmodule>> Modules;
-
-  /// Vector of flags indicating the initialization status of all associated
-  /// devices.
-  std::vector<bool> InitializedFlags;
-
-  enum class PeerAccessState : uint8_t { Unknown, Yes, No };
-  std::vector<std::vector<PeerAccessState>> PeerAccessMatrix;
-  std::mutex PeerAccessMatrixLock;
-
-  /// A class responsible for interacting with device native runtime library to
-  /// allocate and free memory.
-  class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
-  public:
-    void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
-      if (Size == 0)
-        return nullptr;
-
-      void *MemAlloc = nullptr;
-      CUresult Err;
-      switch (Kind) {
-      case TARGET_ALLOC_DEFAULT:
-      case TARGET_ALLOC_DEVICE:
-        CUdeviceptr DevicePtr;
-        Err = cuMemAlloc(&DevicePtr, Size);
-        MemAlloc = (void *)DevicePtr;
-        if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
-          return nullptr;
-        break;
-      case TARGET_ALLOC_HOST:
-        void *HostPtr;
-        Err = cuMemAllocHost(&HostPtr, Size);
-        MemAlloc = HostPtr;
-        if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
-          return nullptr;
-        break;
-      case TARGET_ALLOC_SHARED:
-        CUdeviceptr SharedPtr;
-        Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
-        MemAlloc = (void *)SharedPtr;
-        if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
-          return nullptr;
-        break;
-      }
-
-      return MemAlloc;
-    }
-
-    int free(void *TgtPtr, TargetAllocTy Kind) override {
-      CUresult Err;
-      // Host pinned memory must be freed differently.
-      switch (Kind) {
-      case TARGET_ALLOC_DEFAULT:
-      case TARGET_ALLOC_DEVICE:
-      case TARGET_ALLOC_SHARED:
-        Err = cuMemFree((CUdeviceptr)TgtPtr);
-        if (!checkResult(Err, "Error returned from cuMemFree\n"))
-          return OFFLOAD_FAIL;
-        break;
-      case TARGET_ALLOC_HOST:
-        Err = cuMemFreeHost(TgtPtr);
-        if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
-          return OFFLOAD_FAIL;
-        break;
-      }
-
-      return OFFLOAD_SUCCESS;
-    }
-  };
-
-  /// A vector of device allocators
-  std::vector<CUDADeviceAllocatorTy> DeviceAllocators;
-  /// A vector of memory managers. Since the memory manager is non-copyable and
-  // non-removable, we wrap them into std::unique_ptr.
-  std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;
-
-  /// Whether use memory manager
-  bool UseMemoryManager = true;
-
-  // Record entry point associated with device
-  void addOffloadEntry(const int DeviceId, const __tgt_offload_entry Entry) {
-    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
-    E.Entries.push_back(Entry);
-  }
-
-  // Return a pointer to the entry associated with the pointer
-  const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
-                                             const void *Addr) const {
-    for (const __tgt_offload_entry &Itr :
-         DeviceData[DeviceId].FuncGblEntries.back().Entries)
-      if (Itr.addr == Addr)
-        return &Itr;
-
-    return nullptr;
-  }
-
-  // Return the pointer to the target entries table
-  __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
-    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
-
-    if (E.Entries.empty())
-      return nullptr;
-
-    // Update table info according to the entries and return the pointer
-    E.Table.EntriesBegin = E.Entries.data();
-    E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();
-
-    return &E.Table;
-  }
-
-  // Clear entries table for a device
-  void clearOffloadEntriesTable(const int DeviceId) {
-    DeviceData[DeviceId].FuncGblEntries.emplace_back();
-    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
-    E.Entries.clear();
-    E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
-  }
-
-public:
-  CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
-    assert(AsyncInfo && "AsyncInfo is nullptr");
-
-    if (!AsyncInfo->Queue) {
-      CUstream S;
-      if (StreamPool[DeviceId]->acquire(S) != OFFLOAD_SUCCESS)
-        return nullptr;
-
-      AsyncInfo->Queue = S;
-    }
-
-    return reinterpret_cast<CUstream>(AsyncInfo->Queue);
-  }
-
-  // This class should not be copied
-  DeviceRTLTy(const DeviceRTLTy &) = delete;
-  DeviceRTLTy(DeviceRTLTy &&) = delete;
-
-  DeviceRTLTy()
-      : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
-        EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED),
-        DynamicMemorySize(0) {
-
-    DP("Start initializing CUDA\n");
-
-    CUresult Err = cuInit(0);
-    if (Err == CUDA_ERROR_INVALID_HANDLE) {
-      // Can't call cuGetErrorString if dlsym failed
-      DP("Failed to load CUDA shared library\n");
-      return;
-    }
-    if (Err == CUDA_ERROR_NO_DEVICE) {
-      DP("There are no devices supporting CUDA.\n");
-      return;
-    }
-    if (!checkResult(Err, "Error returned from cuInit\n")) {
-      return;
-    }
-
-    Err = cuDeviceGetCount(&NumberOfDevices);
-    if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
-      return;
-
-    if (NumberOfDevices == 0) {
-      DP("There are no devices supporting CUDA.\n");
-      return;
-    }
-
-    DeviceData.resize(NumberOfDevices);
-    Modules.resize(NumberOfDevices);
-    StreamPool.resize(NumberOfDevices);
-    EventPool.resize(NumberOfDevices);
-    PeerAccessMatrix.resize(NumberOfDevices);
-    for (auto &V : PeerAccessMatrix)
-      V.resize(NumberOfDevices, PeerAccessState::Unknown);
getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) { - // LIBOMPTARGET_SHARED_MEMORY_SIZE has been set - DynamicMemorySize = std::stoi(EnvStr); - DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE = %" PRIu64 "\n", - DynamicMemorySize); - } - if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS")) { - // LIBOMPTARGET_NUM_INITIAL_STREAMS has been set - NumInitialStreams = std::stoi(EnvStr); - DP("Parsed LIBOMPTARGET_NUM_INITIAL_STREAMS=%d\n", NumInitialStreams); - } - - for (int I = 0; I < NumberOfDevices; ++I) - DeviceAllocators.emplace_back(); - - // Get the size threshold from environment variable - std::pair Res = MemoryManagerTy::getSizeThresholdFromEnv(); - UseMemoryManager = Res.second; - size_t MemoryManagerThreshold = Res.first; - - if (UseMemoryManager) - for (int I = 0; I < NumberOfDevices; ++I) - MemoryManagers.emplace_back(std::make_unique( - DeviceAllocators[I], MemoryManagerThreshold)); - - // We lazily initialize all devices later. - InitializedFlags.assign(NumberOfDevices, false); - } - - ~DeviceRTLTy() { - for (int DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) - deinitDevice(DeviceId); - } - - // Check whether a given DeviceId is valid - bool isValidDeviceId(const int DeviceId) const { - return DeviceId >= 0 && DeviceId < NumberOfDevices; - } - - int getNumOfDevices() const { return NumberOfDevices; } - - void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; } - - int initDevice(const int DeviceId) { - CUdevice Device; - - DP("Getting device %d\n", DeviceId); - CUresult Err = cuDeviceGet(&Device, DeviceId); - if (!checkResult(Err, "Error returned from cuDeviceGet\n")) - return OFFLOAD_FAIL; - - assert(InitializedFlags[DeviceId] == false && "Reinitializing device!"); - InitializedFlags[DeviceId] = true; - - // Query the current flags of the primary context and set its flags if - // it is inactive - unsigned int FormerPrimaryCtxFlags = 0; - int FormerPrimaryCtxIsActive = 0; - Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags, - &FormerPrimaryCtxIsActive); - if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n")) - return OFFLOAD_FAIL; - - if (FormerPrimaryCtxIsActive) { - DP("The primary context is active, no change to its flags\n"); - if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) != - CU_CTX_SCHED_BLOCKING_SYNC) - DP("Warning the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n"); - } else { - DP("The primary context is inactive, set its flags to " - "CU_CTX_SCHED_BLOCKING_SYNC\n"); - Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC); - if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n")) - return OFFLOAD_FAIL; - } - - // Retain the per device primary context and save it to use whenever this - // device is selected. - Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device); - if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n")) - return OFFLOAD_FAIL; - - Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); - if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) - return OFFLOAD_FAIL; - - // Initialize the stream pool. - if (!StreamPool[DeviceId]) - StreamPool[DeviceId] = std::make_unique(StreamAllocatorTy(), - NumInitialStreams); - - // Initialize the event pool. - if (!EventPool[DeviceId]) - EventPool[DeviceId] = - std::make_unique(EventAllocatorTy(), NumInitialEvents); - - // Query attributes to determine number of threads/block and blocks/grid. 
- int MaxGridDimX; - Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, - Device); - if (Err != CUDA_SUCCESS) { - DP("Error getting max grid dimension, use default value %d\n", - DeviceRTLTy::DefaultNumTeams); - DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams; - } else { - DP("Using %d CUDA blocks per grid\n", MaxGridDimX); - DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX; - } - - // We are only exploiting threads along the x axis. - int MaxBlockDimX; - Err = cuDeviceGetAttribute(&MaxBlockDimX, - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device); - if (Err != CUDA_SUCCESS) { - DP("Error getting max block dimension, use default value %d\n", - DeviceRTLTy::DefaultNumThreads); - DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads; - } else { - DP("Using %d CUDA threads per block\n", MaxBlockDimX); - DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX; - - if (EnvTeamThreadLimit > 0 && - DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) { - DP("Max CUDA threads per block %d exceeds the thread limit %d set by " - "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n", - DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit); - DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit; - } - if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) { - DP("Max CUDA threads per block %d exceeds the hard thread limit %d, " - "capping at the hard limit\n", - DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit); - DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit; - } - } - - // Get and set warp size - int WarpSize; - Err = - cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device); - if (Err != CUDA_SUCCESS) { - DP("Error getting warp size, assume default value 32\n"); - DeviceData[DeviceId].WarpSize = 32; - } else { - DP("Using warp size %d\n", WarpSize); - DeviceData[DeviceId].WarpSize = WarpSize; - } - - // Adjust teams to the env variables - if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) { - DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n", - EnvTeamLimit); - DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit; - } - - size_t StackLimit; - size_t HeapLimit; - if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) { - StackLimit = std::stol(EnvStr); - if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS) - return OFFLOAD_FAIL; - } else { - if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS) - return OFFLOAD_FAIL; - } - if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) { - HeapLimit = std::stol(EnvStr); - if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS) - return OFFLOAD_FAIL; - } else { - if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS) - return OFFLOAD_FAIL; - } - - INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, - "Device supports up to %d CUDA blocks and %d threads with a " - "warp size of %d\n", - DeviceData[DeviceId].BlocksPerGrid, - DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize); - INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, - "Device heap size is %d Bytes, device stack size is %d Bytes per " - "thread\n", - (int)HeapLimit, (int)StackLimit); - - // Set default number of teams - if (EnvNumTeams > 0) { - DP("Default number of teams set according to environment %d\n", - EnvNumTeams); - DeviceData[DeviceId].NumTeams = EnvNumTeams; - } else { - DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams; - DP("Default number 
of teams set according to library's default %d\n",
-         DeviceRTLTy::DefaultNumTeams);
-    }
-
-    if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
-      DP("Default number of teams exceeds device limit, capping at %d\n",
-         DeviceData[DeviceId].BlocksPerGrid);
-      DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
-    }
-
-    // Set default number of threads
-    DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
-    DP("Default number of threads set according to library's default %d\n",
-       DeviceRTLTy::DefaultNumThreads);
-    if (DeviceData[DeviceId].NumThreads >
-        DeviceData[DeviceId].ThreadsPerBlock) {
-      DP("Default number of threads exceeds device limit, capping at %d\n",
-         DeviceData[DeviceId].ThreadsPerBlock);
-      DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
-    }
-
-    return OFFLOAD_SUCCESS;
-  }
-
-  int deinitDevice(const int DeviceId) {
-    auto IsInitialized = InitializedFlags[DeviceId];
-    if (!IsInitialized)
-      return OFFLOAD_SUCCESS;
-    InitializedFlags[DeviceId] = false;
-
-    if (UseMemoryManager)
-      MemoryManagers[DeviceId].release();
-
-    StreamPool[DeviceId].reset();
-    EventPool[DeviceId].reset();
-
-    DeviceDataTy &D = DeviceData[DeviceId];
-    if (!checkResult(cuCtxSetCurrent(D.Context),
-                     "Error returned from cuCtxSetCurrent\n"))
-      return OFFLOAD_FAIL;
-
-    // Unload all modules.
-    for (auto &M : Modules[DeviceId])
-      if (!checkResult(cuModuleUnload(M),
-                       "Error returned from cuModuleUnload\n"))
-        return OFFLOAD_FAIL;
-
-    // Destroy context.
-    CUdevice Device;
-    if (!checkResult(cuCtxGetDevice(&Device),
-                     "Error returned from cuCtxGetDevice\n"))
-      return OFFLOAD_FAIL;
-
-    if (!checkResult(cuDevicePrimaryCtxRelease(Device),
-                     "Error returned from cuDevicePrimaryCtxRelease\n"))
-      return OFFLOAD_FAIL;
-
-    return OFFLOAD_SUCCESS;
-  }
-
-  __tgt_target_table *loadBinary(const int DeviceId,
-                                 const __tgt_device_image *Image) {
-    // Clear the offload table as we are going to create a new one.
-    clearOffloadEntriesTable(DeviceId);
-
-    // Create the module and extract the function pointers.
-    CUmodule Module;
-    DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
-    CUresult Err =
-        cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
-    if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
-      return nullptr;
-
-    DP("CUDA module successfully loaded!\n");
-
-    Modules[DeviceId].push_back(Module);
-
-    // Find the symbols in the module by name.
-    const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
-    const __tgt_offload_entry *HostEnd = Image->EntriesEnd;
-
-    std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
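The loop that follows discriminates host entries by their size field: nonzero entries are declare-target globals, zero-sized entries are kernels. A trimmed sketch of that rule; the struct below is a cut-down stand-in for __tgt_offload_entry, and the entry names are made up:

    #include <cstddef>
    #include <cstdio>

    struct EntrySketch {
      void *Addr;       // host address identifying the region/variable
      const char *Name; // symbol name to look up in the device image
      size_t Size;      // nonzero for globals, zero for kernels
    };

    void classify(const EntrySketch &E) {
      if (!E.Addr)
        std::printf("%s: invalid entry\n", E.Name);
      else if (E.Size)
        std::printf("%s: global of %zu bytes\n", E.Name, E.Size);
      else
        std::printf("%s: kernel entry point\n", E.Name);
    }

    int main() {
      int Dummy = 0;
      classify({&Dummy, "my_global", sizeof(Dummy)});
      classify({&Dummy, "__omp_offloading_main_l10", 0});
    }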
- DP("Invalid binary: host entry '' (size = %zd)...\n", E->size); - return nullptr; - } - - if (E->size) { - __tgt_offload_entry Entry = *E; - CUdeviceptr CUPtr; - size_t CUSize; - Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name); - // We keep this style here because we need the name - if (Err != CUDA_SUCCESS) { - REPORT("Loading global '%s' Failed\n", E->name); - CUDA_ERR_STRING(Err); - return nullptr; - } - - if (CUSize != E->size) { - DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name, - CUSize, E->size); - return nullptr; - } - - DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", - DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr)); - - Entry.addr = (void *)(CUPtr); - - // Note: In the current implementation declare target variables - // can either be link or to. This means that once unified - // memory is activated via the requires directive, the variable - // can be used directly from the host in both cases. - // TODO: when variables types other than to or link are added, - // the below condition should be changed to explicitly - // check for to and link variables types: - // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags & - // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO)) - if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) { - // If unified memory is present any target link or to variables - // can access host addresses directly. There is no longer a - // need for device copies. - cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *)); - DP("Copy linked variable host address (" DPxMOD - ") to device address (" DPxMOD ")\n", - DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr)); - } - - addOffloadEntry(DeviceId, Entry); - - continue; - } - - CUfunction Func; - Err = cuModuleGetFunction(&Func, Module, E->name); - // We keep this style here because we need the name - if (Err != CUDA_SUCCESS) { - REPORT("Loading '%s' Failed\n", E->name); - CUDA_ERR_STRING(Err); - return nullptr; - } - - DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", - DPxPTR(E - HostBegin), E->name, DPxPTR(Func)); - - // default value GENERIC (in case symbol is missing from cubin file) - llvm::omp::OMPTgtExecModeFlags ExecModeVal; - std::string ExecModeNameStr(E->name); - ExecModeNameStr += "_exec_mode"; - const char *ExecModeName = ExecModeNameStr.c_str(); - - CUdeviceptr ExecModePtr; - size_t CUSize; - Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName); - if (Err == CUDA_SUCCESS) { - if (CUSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) { - DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n", - ExecModeName, CUSize, sizeof(llvm::omp::OMPTgtExecModeFlags)); - return nullptr; - } - - Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize); - if (Err != CUDA_SUCCESS) { - REPORT("Error when copying data from device to host. Pointers: " - "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n", - DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize); - CUDA_ERR_STRING(Err); - return nullptr; - } - } else { - DP("Loading global exec_mode '%s' - symbol missing, using default " - "value GENERIC (1)\n", - ExecModeName); - } - - KernelsList.emplace_back(Func, ExecModeVal); - - __tgt_offload_entry Entry = *E; - Entry.addr = &KernelsList.back(); - addOffloadEntry(DeviceId, Entry); - } - - // send device environment data to the device - { - // TODO: The device ID used here is not the real device ID used by OpenMP. 
-    // send device environment data to the device
-    {
-      // TODO: The device ID used here is not the real device ID used by OpenMP.
-      DeviceEnvironmentTy DeviceEnv{0, static_cast<uint32_t>(NumberOfDevices),
-                                    static_cast<uint32_t>(DeviceId),
-                                    static_cast<uint32_t>(DynamicMemorySize)};
-
-      if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
-        DeviceEnv.DebugKind = std::stoi(EnvStr);
-
-      const char *DeviceEnvName = "__omp_rtl_device_environment";
-      CUdeviceptr DeviceEnvPtr;
-      size_t CUSize;
-
-      Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
-      if (Err == CUDA_SUCCESS) {
-        if (CUSize != sizeof(DeviceEnv)) {
-          REPORT(
-              "Global device_environment '%s' - size mismatch (%zu != %zu)\n",
-              DeviceEnvName, CUSize, sizeof(int32_t));
-          CUDA_ERR_STRING(Err);
-          return nullptr;
-        }
-
-        Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
-        if (Err != CUDA_SUCCESS) {
-          REPORT("Error when copying data from host to device. Pointers: "
-                 "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
-                 DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
-          CUDA_ERR_STRING(Err);
-          return nullptr;
-        }
-
-        DP("Sending global device environment data %zu bytes\n", CUSize);
-      } else {
-        DP("Finding global device environment '%s' - symbol missing.\n",
-           DeviceEnvName);
-        DP("Continue, considering this is a device RTL which does not accept "
-           "environment setting.\n");
-      }
-    }
-
-    return getOffloadEntriesTable(DeviceId);
-  }
-
-  void *dataAlloc(const int DeviceId, const int64_t Size,
-                  const TargetAllocTy Kind) {
-    switch (Kind) {
-    case TARGET_ALLOC_DEFAULT:
-    case TARGET_ALLOC_DEVICE:
-      if (UseMemoryManager)
-        return MemoryManagers[DeviceId]->allocate(Size, nullptr);
-      else
-        return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
-    case TARGET_ALLOC_HOST:
-    case TARGET_ALLOC_SHARED:
-      return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
-    }
-
-    REPORT("Invalid target data allocation kind or requested allocator not "
-           "implemented yet\n");
-
-    return nullptr;
-  }
-
-  int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
-                 const int64_t Size, __tgt_async_info *AsyncInfo) const {
-    assert(AsyncInfo && "AsyncInfo is nullptr");
-
-    CUstream Stream = getStream(DeviceId, AsyncInfo);
-    CUresult Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
-    if (Err != CUDA_SUCCESS) {
-      DP("Error when copying data from host to device. Pointers: host "
-         "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
-         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
-      CUDA_ERR_STRING(Err);
-      return OFFLOAD_FAIL;
-    }
-
-    return OFFLOAD_SUCCESS;
-  }
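Synchronous entry points wrap these async members: every operation tied to one __tgt_async_info is enqueued on the same pooled stream, and synchronization both drains and recycles it. A condensed lifecycle sketch using the raw CUDA driver API; it assumes a current context and an already-allocated device buffer, and error handling is elided:

    #include <cuda.h>

    void asyncRoundTrip(CUdeviceptr Dst, void *Src, size_t Bytes) {
      CUstream Stream;
      cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING); // pool acquire() in the RTL
      cuMemcpyHtoDAsync(Dst, Src, Bytes, Stream);      // dataSubmit
      cuMemcpyDtoHAsync(Src, Dst, Bytes, Stream);      // dataRetrieve, same stream
      cuStreamSynchronize(Stream);                     // synchronize(): wait...
      cuStreamDestroy(Stream);                         // ...then return to the pool
    }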
-  int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
-                   const int64_t Size, __tgt_async_info *AsyncInfo) const {
-    assert(AsyncInfo && "AsyncInfo is nullptr");
-
-    CUstream Stream = getStream(DeviceId, AsyncInfo);
-    CUresult Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
-    if (Err != CUDA_SUCCESS) {
-      DP("Error when copying data from device to host. Pointers: host "
-         "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
-         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
-      CUDA_ERR_STRING(Err);
-      return OFFLOAD_FAIL;
-    }
-
-    return OFFLOAD_SUCCESS;
-  }
-
-  int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
-                   int64_t Size, __tgt_async_info *AsyncInfo) {
-    assert(AsyncInfo && "AsyncInfo is nullptr");
-
-    CUresult Err;
-    CUstream Stream = getStream(SrcDevId, AsyncInfo);
-
-    // If they are two devices, we try peer to peer copy first
-    if (SrcDevId != DstDevId) {
-      std::lock_guard<std::mutex> LG(PeerAccessMatrixLock);
-
-      switch (PeerAccessMatrix[SrcDevId][DstDevId]) {
-      case PeerAccessState::No: {
-        REPORT("Peer access from %" PRId32 " to %" PRId32
-               " is not supported. Fall back to D2D memcpy.\n",
-               SrcDevId, DstDevId);
-        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
-      }
-      case PeerAccessState::Unknown: {
-        int CanAccessPeer = 0;
-        Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
-        if (Err != CUDA_SUCCESS) {
-          REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
-                 ", dst = %" PRId32 ". Fall back to D2D memcpy.\n",
-                 SrcDevId, DstDevId);
-          CUDA_ERR_STRING(Err);
-          PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
-          return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
-        }
-
-        if (!CanAccessPeer) {
-          REPORT("P2P access from %d to %d is not supported. Fall back to D2D "
-                 "memcpy.\n",
-                 SrcDevId, DstDevId);
-          PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
-          return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
-        }
-
-        Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
-        if (Err != CUDA_SUCCESS) {
-          REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
-                 ", dst = %" PRId32 ". Fall back to D2D memcpy.\n",
-                 SrcDevId, DstDevId);
-          CUDA_ERR_STRING(Err);
-          PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
-          return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
-        }
-
-        PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::Yes;
-
-        [[fallthrough]];
-      }
-      case PeerAccessState::Yes: {
-        Err = cuMemcpyPeerAsync(
-            (CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
-            (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context, Size, Stream);
-        if (Err == CUDA_SUCCESS)
-          return OFFLOAD_SUCCESS;
-
-        DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
-           ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32
-           ". Fall back to D2D memcpy.\n",
-           DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
-        CUDA_ERR_STRING(Err);
-
-        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
-      }
-      }
-    }
-
-    return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
-  }
-
-  int dataDelete(const int DeviceId, void *TgtPtr, TargetAllocTy Kind) {
-    switch (Kind) {
-    case TARGET_ALLOC_DEFAULT:
-    case TARGET_ALLOC_DEVICE:
-      if (UseMemoryManager)
-        return MemoryManagers[DeviceId]->free(TgtPtr);
-      else
-        return DeviceAllocators[DeviceId].free(TgtPtr, Kind);
-    case TARGET_ALLOC_HOST:
-    case TARGET_ALLOC_SHARED:
-      return DeviceAllocators[DeviceId].free(TgtPtr, Kind);
-    }
-
-    REPORT("Invalid target data allocation kind or requested allocator not "
-           "implemented yet\n");
-
-    return OFFLOAD_FAIL;
-  }
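dataExchange above memoizes the peer-access probe in a tri-state matrix, so cuDeviceCanAccessPeer / cuCtxEnablePeerAccess run at most once per device pair and failures are remembered as firmly as successes. The same idea, condensed; assumes a current context on the source device and error handling trimmed to the boolean outcome:

    #include <cuda.h>
    #include <vector>

    enum class Peer { Unknown, Yes, No };

    bool canUsePeerCopy(std::vector<std::vector<Peer>> &Cache, int Src, int Dst,
                        CUcontext DstCtx) {
      if (Cache[Src][Dst] == Peer::Unknown) {
        int Ok = 0;
        if (cuDeviceCanAccessPeer(&Ok, Src, Dst) != CUDA_SUCCESS || !Ok ||
            cuCtxEnablePeerAccess(DstCtx, 0) != CUDA_SUCCESS)
          Cache[Src][Dst] = Peer::No; // remember failures, too
        else
          Cache[Src][Dst] = Peer::Yes;
      }
      return Cache[Src][Dst] == Peer::Yes;
    }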
-  int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
-                          ptrdiff_t *TgtOffsets, const int ArgNum,
-                          const int TeamNum, const int ThreadLimit,
-                          const unsigned int LoopTripCount,
-                          __tgt_async_info *AsyncInfo) const {
-    // All args are references.
-    std::vector<void *> Args(ArgNum);
-    std::vector<void *> Ptrs(ArgNum);
-
-    for (int I = 0; I < ArgNum; ++I) {
-      Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
-      Args[I] = &Ptrs[I];
-    }
-
-    KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
-
-    const bool IsSPMDGenericMode =
-        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
-    const bool IsSPMDMode =
-        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
-    const bool IsGenericMode =
-        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
-
-    int CudaThreadsPerBlock;
-    if (ThreadLimit > 0) {
-      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
-      CudaThreadsPerBlock = ThreadLimit;
-      // Add master warp if necessary
-      if (IsGenericMode) {
-        DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
-        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
-      }
-    } else {
-      DP("Setting CUDA threads per block to default %d\n",
-         DeviceData[DeviceId].NumThreads);
-      CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
-    }
-
-    if ((unsigned)CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
-      DP("Threads per block capped at device limit %d\n",
-         DeviceData[DeviceId].ThreadsPerBlock);
-      CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
-    }
-
-    CUresult Err;
-    if (!KernelInfo->MaxThreadsPerBlock) {
-      Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
-                               CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
-                               KernelInfo->Func);
-      if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
-        return OFFLOAD_FAIL;
-    }
-
-    if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
-      DP("Threads per block capped at kernel limit %d\n",
-         KernelInfo->MaxThreadsPerBlock);
-      CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
-    }
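The block-count selection just below depends on the execution mode; for plain SPMD kernels it is a ceiling division of the loop trip count by the block size, so each thread gets one iteration. A small worked example of that arithmetic:

    #include <cassert>

    // One loop iteration per thread, rounded up to whole blocks.
    unsigned spmdBlocks(unsigned LoopTripCount, unsigned ThreadsPerBlock) {
      return ((LoopTripCount - 1) / ThreadsPerBlock) + 1; // ceil-divide
    }

    int main() {
      // 10000 iterations at 128 threads/block -> 79 blocks (78 * 128 = 9984).
      assert(spmdBlocks(10000, 128) == 79);
      // Generic and Generic-SPMD kernels instead launch one block per
      // `distribute` iteration: blocks = LoopTripCount.
      return 0;
    }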
-    unsigned int CudaBlocksPerGrid;
-    if (TeamNum <= 0) {
-      if (LoopTripCount > 0 && EnvNumTeams < 0) {
-        if (IsSPMDGenericMode) {
-          // If we reach this point, then we are executing a kernel that was
-          // transformed from Generic-mode to SPMD-mode. This kernel has
-          // SPMD-mode execution, but needs its blocks to be scheduled
-          // differently because the current loop trip count only applies to the
-          // `teams distribute` region and will create far too few blocks using
-          // the regular SPMD-mode method.
-          CudaBlocksPerGrid = LoopTripCount;
-        } else if (IsSPMDMode) {
-          // We have a combined construct, i.e. `target teams distribute
-          // parallel for [simd]`. We launch so many teams so that each thread
-          // will execute one iteration of the loop. round up to the nearest
-          // integer
-          CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
-        } else if (IsGenericMode) {
-          // If we reach this point, then we have a non-combined construct, i.e.
-          // `teams distribute` with a nested `parallel for` and each team is
-          // assigned one iteration of the `distribute` loop. E.g.:
-          //
-          // #pragma omp target teams distribute
-          // for(...loop_tripcount...) {
-          //   #pragma omp parallel for
-          //   for(...) {}
-          // }
-          //
-          // Threads within a team will execute the iterations of the `parallel`
-          // loop.
-          CudaBlocksPerGrid = LoopTripCount;
-        } else {
-          REPORT("Unknown execution mode: %d\n",
-                 static_cast<int8_t>(KernelInfo->ExecutionMode));
-          return OFFLOAD_FAIL;
-        }
-        DP("Using %d teams due to loop trip count %" PRIu32
-           " and number of threads per block %d\n",
-           CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
-      } else {
-        DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
-        CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
-      }
-    } else {
-      DP("Using requested number of teams %d\n", TeamNum);
-      CudaBlocksPerGrid = TeamNum;
-    }
-
-    if (CudaBlocksPerGrid > DeviceData[DeviceId].BlocksPerGrid) {
-      DP("Capping number of teams to team limit %d\n",
-         DeviceData[DeviceId].BlocksPerGrid);
-      CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
-    }
-
-    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
-         "Launching kernel %s with %d blocks and %d threads in %s mode\n",
-         (getOffloadEntry(DeviceId, TgtEntryPtr))
-             ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
-             : "(null)",
-         CudaBlocksPerGrid, CudaThreadsPerBlock,
-         (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD"));
-
-    CUstream Stream = getStream(DeviceId, AsyncInfo);
-    Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
-                         /* gridDimZ */ 1, CudaThreadsPerBlock,
-                         /* blockDimY */ 1, /* blockDimZ */ 1,
-                         DynamicMemorySize, Stream, &Args[0], nullptr);
-    if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
-      return OFFLOAD_FAIL;
-
-    DP("Launch of entry point at " DPxMOD " successful!\n",
-       DPxPTR(TgtEntryPtr));
-
-    return OFFLOAD_SUCCESS;
-  }
-
-  int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const {
-    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
-    CUresult Err = cuStreamSynchronize(Stream);
-
-    // Once the stream is synchronized, return it to stream pool and reset
-    // AsyncInfo. This is to make sure the synchronization only works for its
-    // own tasks.
-    StreamPool[DeviceId]->release(reinterpret_cast<CUstream>(AsyncInfo->Queue));
-    AsyncInfo->Queue = nullptr;
-
-    if (Err != CUDA_SUCCESS) {
-      DP("Error when synchronizing stream. stream = " DPxMOD
-         ", async info ptr = " DPxMOD "\n",
-         DPxPTR(Stream), DPxPTR(AsyncInfo));
-      CUDA_ERR_STRING(Err);
-    }
-    return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL;
-  }
-
-  int queryAsync(const int DeviceId, __tgt_async_info *AsyncInfo) const {
-    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
-    CUresult Err = cuStreamQuery(Stream);
-
-    // Not ready streams must be considered as successful operations.
-    if (Err == CUDA_ERROR_NOT_READY)
-      return OFFLOAD_SUCCESS;
-
-    // Once the stream is synchronized or an error occurs, return it to the
-    // stream pool and reset AsyncInfo. This is to make sure the
-    // synchronization only works for its own tasks.
-    StreamPool[DeviceId]->release(Stream);
-    AsyncInfo->Queue = nullptr;
-
-    if (Err != CUDA_SUCCESS) {
-      DP("Error when querying for stream progress. stream = " DPxMOD
-         ", async info ptr = " DPxMOD "\n",
-         DPxPTR(Stream), DPxPTR(AsyncInfo));
-      CUDA_ERR_STRING(Err);
-    }
-    return (Err == CUDA_SUCCESS) ?
OFFLOAD_SUCCESS : OFFLOAD_FAIL; - } - - void printDeviceInfo(int32_t DeviceId) { - char TmpChar[1000]; - std::string TmpStr; - size_t TmpSt; - int TmpInt, TmpInt2, TmpInt3; - - CUdevice Device; - checkResult(cuDeviceGet(&Device, DeviceId), - "Error returned from cuCtxGetDevice\n"); - - cuDriverGetVersion(&TmpInt); - printf(" CUDA Driver Version: \t\t%d \n", TmpInt); - printf(" CUDA Device Number: \t\t%d \n", DeviceId); - checkResult(cuDeviceGetName(TmpChar, 1000, Device), - "Error returned from cuDeviceGetName\n"); - printf(" Device Name: \t\t\t%s \n", TmpChar); - checkResult(cuDeviceTotalMem(&TmpSt, Device), - "Error returned from cuDeviceTotalMem\n"); - printf(" Global Memory Size: \t\t%zu bytes \n", TmpSt); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Number of Multiprocessors: \t\t%d \n", TmpInt); - checkResult( - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt)); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Total Constant Memory: \t\t%d bytes\n", TmpInt); - checkResult( - cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Max Shared Memory per Block: \t%d bytes \n", TmpInt); - checkResult( - cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Registers per Block: \t\t%d \n", TmpInt); - checkResult( - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Warp Size: \t\t\t\t%d Threads \n", TmpInt); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Maximum Threads per Block: \t\t%d \n", TmpInt); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device), - "Error returned from cuDeviceGetAttribute\n"); - checkResult(cuDeviceGetAttribute( - &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device), - "Error returned from cuDeviceGetAttribute\n"); - checkResult(cuDeviceGetAttribute( - &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2, - TmpInt3); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device), - "Error returned from cuDeviceGetAttribute\n"); - checkResult(cuDeviceGetAttribute( - &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device), - "Error returned from cuDeviceGetAttribute\n"); - checkResult(cuDeviceGetAttribute( - &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2, - TmpInt3); - checkResult( - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Maximum Memory Pitch: \t\t%d bytes \n", TmpInt); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Texture 
Alignment: \t\t\t%d bytes \n", TmpInt); - checkResult( - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Clock Rate: \t\t\t%d kHz\n", TmpInt); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt)); - checkResult( - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt)); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt)); - checkResult( - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device), - "Error returned from cuDeviceGetAttribute\n"); - if (TmpInt == CU_COMPUTEMODE_DEFAULT) - TmpStr = "DEFAULT"; - else if (TmpInt == CU_COMPUTEMODE_PROHIBITED) - TmpStr = "PROHIBITED"; - else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS) - TmpStr = "EXCLUSIVE PROCESS"; - else - TmpStr = "unknown"; - printf(" Compute Mode: \t\t\t%s \n", TmpStr.c_str()); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt)); - checkResult( - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt)); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Memory Clock Rate: \t\t\t%d kHz\n", TmpInt); - checkResult( - cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Memory Bus Width: \t\t\t%d bits\n", TmpInt); - checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, - Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" L2 Cache Size: \t\t\t%d bytes \n", TmpInt); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, - Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Max Threads Per SMP: \t\t%d \n", TmpInt); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt)); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt)); - checkResult( - cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt)); - checkResult( - cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Preemption 
Supported: \t\t%s \n", BOOL2TEXT(TmpInt));
-    checkResult(cuDeviceGetAttribute(
-                    &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device),
-                "Error returned from cuDeviceGetAttribute\n");
-    printf(" Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt));
-    checkResult(cuDeviceGetAttribute(
-                    &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device),
-                "Error returned from cuDeviceGetAttribute\n");
-    printf(" Multi-Device Boards: \t\t%s \n", BOOL2TEXT(TmpInt));
-    checkResult(
-        cuDeviceGetAttribute(
-            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device),
-        "Error returned from cuDeviceGetAttribute\n");
-    checkResult(
-        cuDeviceGetAttribute(
-            &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device),
-        "Error returned from cuDeviceGetAttribute\n");
-    printf(" Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2);
-  }
-
-  int createEvent(int DeviceId, void **P) {
-    CUevent Event = nullptr;
-    if (EventPool[DeviceId]->acquire(Event) != OFFLOAD_SUCCESS)
-      return OFFLOAD_FAIL;
-    *P = Event;
-    return OFFLOAD_SUCCESS;
-  }
-
-  int destroyEvent(int DeviceId, void *EventPtr) {
-    EventPool[DeviceId]->release(reinterpret_cast<CUevent>(EventPtr));
-    return OFFLOAD_SUCCESS;
-  }
-
-  int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo,
-                void *EventPtr) const {
-    CUstream Stream = getStream(DeviceId, AsyncInfo);
-    CUevent Event = reinterpret_cast<CUevent>(EventPtr);
-
-    // We don't use CU_EVENT_WAIT_DEFAULT here as it is only available from
-    // specific CUDA version, and defined as 0x0. In previous version, per CUDA
-    // API document, that argument has to be 0x0.
-    CUresult Err = cuStreamWaitEvent(Stream, Event, 0);
-    if (Err != CUDA_SUCCESS) {
-      DP("Error when waiting event. stream = " DPxMOD ", event = " DPxMOD "\n",
-         DPxPTR(Stream), DPxPTR(Event));
-      CUDA_ERR_STRING(Err);
-      return OFFLOAD_FAIL;
-    }
-
-    return OFFLOAD_SUCCESS;
-  }
-
-  int releaseAsyncInfo(int DeviceId, __tgt_async_info *AsyncInfo) const {
-    if (AsyncInfo->Queue) {
-      StreamPool[DeviceId]->release(
-          reinterpret_cast<CUstream>(AsyncInfo->Queue));
-      AsyncInfo->Queue = nullptr;
-    }
-
-    return OFFLOAD_SUCCESS;
-  }
-
-  int initAsyncInfo(int DeviceId, __tgt_async_info **AsyncInfo) const {
-    *AsyncInfo = new __tgt_async_info;
-    getStream(DeviceId, *AsyncInfo);
-    return OFFLOAD_SUCCESS;
-  }
-
-  int initDeviceInfo(int DeviceId, __tgt_device_info *DeviceInfo,
-                     const char **ErrStr) const {
-    assert(DeviceInfo && "DeviceInfo is nullptr");
-
-    if (!DeviceInfo->Context)
-      DeviceInfo->Context = DeviceData[DeviceId].Context;
-    if (!DeviceInfo->Device) {
-      CUdevice Dev;
-      CUresult Err = cuDeviceGet(&Dev, DeviceId);
-      if (Err == CUDA_SUCCESS) {
-        DeviceInfo->Device = reinterpret_cast<void *>(Dev);
-      } else {
-        cuGetErrorString(Err, ErrStr);
-        return OFFLOAD_FAIL;
-      }
-    }
-    return OFFLOAD_SUCCESS;
-  }
-
-  int setContext(int DeviceId) {
-    assert(InitializedFlags[DeviceId] && "Device is not initialized");
-
-    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
-    if (!checkResult(Err, "error returned from cuCtxSetCurrent"))
-      return OFFLOAD_FAIL;
-
-    return OFFLOAD_SUCCESS;
-  }
-};
-
-DeviceRTLTy DeviceRTL;
-} // namespace
-
-// Exposed library API function
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
-  return elf_check_machine(Image, /* EM_CUDA */ 190);
-}
-
-int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *Image,
-                                       __tgt_image_info *Info) {
-  if (!__tgt_rtl_is_valid_binary(Image))
-    return false;
-
-  // A subarchitecture was not specified. Assume it is compatible.
-
-int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *Image,
-                                       __tgt_image_info *Info) {
-  if (!__tgt_rtl_is_valid_binary(Image))
-    return false;
-
-  // A subarchitecture was not specified. Assume it is compatible.
-  if (!Info || !Info->Arch)
-    return true;
-
-  int32_t NumberOfDevices = 0;
-  if (cuDeviceGetCount(&NumberOfDevices) != CUDA_SUCCESS)
-    return false;
-
-  StringRef ArchStr = StringRef(Info->Arch).drop_front(sizeof("sm_") - 1);
-  for (int32_t DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) {
-    CUdevice Device;
-    if (cuDeviceGet(&Device, DeviceId) != CUDA_SUCCESS)
-      return false;
-
-    int32_t Major, Minor;
-    if (cuDeviceGetAttribute(&Major,
-                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
-                             Device) != CUDA_SUCCESS)
-      return false;
-    if (cuDeviceGetAttribute(&Minor,
-                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
-                             Device) != CUDA_SUCCESS)
-      return false;
-
-    // A cubin generated for a certain compute capability is supported to run on
-    // any GPU with the same major revision and same or higher minor revision.
-    int32_t ImageMajor = ArchStr[0] - '0';
-    int32_t ImageMinor = ArchStr[1] - '0';
-    if (Major != ImageMajor || Minor < ImageMinor)
-      return false;
-  }
-
-  DP("Image has compatible compute capability: %s\n", Info->Arch);
-  return true;
-}
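The compatibility rule above deserves a worked example: the image's major revision must match the device's exactly, while the device's minor revision must be at least the image's. Note also that the digit-by-digit parse assumes single-digit major and minor revisions. A self-contained illustration, with `cubinRunsOn` as a hypothetical restatement of the check rather than real plugin code:

.. code-block:: c++

   #include <cassert>

   // Hypothetical restatement of the deleted check: exact major match,
   // device minor at least the image minor.
   static bool cubinRunsOn(int ImgMajor, int ImgMinor, int DevMajor,
                           int DevMinor) {
     return DevMajor == ImgMajor && DevMinor >= ImgMinor;
   }

   int main() {
     assert(cubinRunsOn(7, 0, 7, 5));  // sm_70 image on an sm_75 device: OK
     assert(!cubinRunsOn(7, 5, 7, 0)); // sm_75 image on an sm_70 device: rejected
     assert(!cubinRunsOn(7, 0, 8, 0)); // different major revision: rejected
   }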
-
-int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }
-
-int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
-  DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
-  DeviceRTL.setRequiresFlag(RequiresFlags);
-  return RequiresFlags;
-}
-
-int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDevId, int DstDevId) {
-  if (DeviceRTL.isValidDeviceId(SrcDevId) &&
-      DeviceRTL.isValidDeviceId(DstDevId))
-    return 1;
-
-  return 0;
-}
-
-int32_t __tgt_rtl_init_device(int32_t DeviceId) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  // Context is set when init the device.
-
-  return DeviceRTL.initDevice(DeviceId);
-}
-
-int32_t __tgt_rtl_deinit_device(int32_t DeviceId) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  // Context is set when deinit the device.
-
-  return DeviceRTL.deinitDevice(DeviceId);
-}
-
-__tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
-                                          __tgt_device_image *Image) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-
-  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
-    return nullptr;
-
-  return DeviceRTL.loadBinary(DeviceId, Image);
-}
-
-void *__tgt_rtl_data_alloc(int32_t DeviceId, int64_t Size, void *,
-                           int32_t Kind) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-
-  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
-    return nullptr;
-
-  return DeviceRTL.dataAlloc(DeviceId, Size, (TargetAllocTy)Kind);
-}
-
-int32_t __tgt_rtl_data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr,
-                              int64_t Size) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  // Context is set in __tgt_rtl_data_submit_async.
-
-  __tgt_async_info AsyncInfo;
-  const int32_t Rc =
-      __tgt_rtl_data_submit_async(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo);
-  if (Rc != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
-}
-
-int32_t __tgt_rtl_data_submit_async(int32_t DeviceId, void *TgtPtr,
-                                    void *HstPtr, int64_t Size,
-                                    __tgt_async_info *AsyncInfoPtr) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  assert(AsyncInfoPtr && "async_info_ptr is nullptr");
-
-  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return DeviceRTL.dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfoPtr);
-}
-
-int32_t __tgt_rtl_data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr,
-                                int64_t Size) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  // Context is set in __tgt_rtl_data_retrieve_async.
-
-  __tgt_async_info AsyncInfo;
-  const int32_t Rc =
-      __tgt_rtl_data_retrieve_async(DeviceId, HstPtr, TgtPtr, Size, &AsyncInfo);
-  if (Rc != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
-}
-
-int32_t __tgt_rtl_data_retrieve_async(int32_t DeviceId, void *HstPtr,
-                                      void *TgtPtr, int64_t Size,
-                                      __tgt_async_info *AsyncInfoPtr) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  assert(AsyncInfoPtr && "async_info_ptr is nullptr");
-
-  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return DeviceRTL.dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfoPtr);
-}
-
-int32_t __tgt_rtl_data_exchange_async(int32_t SrcDevId, void *SrcPtr,
-                                      int DstDevId, void *DstPtr, int64_t Size,
-                                      __tgt_async_info *AsyncInfo) {
-  assert(DeviceRTL.isValidDeviceId(SrcDevId) && "src_dev_id is invalid");
-  assert(DeviceRTL.isValidDeviceId(DstDevId) && "dst_dev_id is invalid");
-  assert(AsyncInfo && "AsyncInfo is nullptr");
-
-  if (DeviceRTL.setContext(SrcDevId) != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return DeviceRTL.dataExchange(SrcDevId, SrcPtr, DstDevId, DstPtr, Size,
-                                AsyncInfo);
-}
-
-int32_t __tgt_rtl_data_exchange(int32_t SrcDevId, void *SrcPtr,
-                                int32_t DstDevId, void *DstPtr, int64_t Size) {
-  assert(DeviceRTL.isValidDeviceId(SrcDevId) && "src_dev_id is invalid");
-  assert(DeviceRTL.isValidDeviceId(DstDevId) && "dst_dev_id is invalid");
-  // Context is set in __tgt_rtl_data_exchange_async.
-
-  __tgt_async_info AsyncInfo;
-  const int32_t Rc = __tgt_rtl_data_exchange_async(SrcDevId, SrcPtr, DstDevId,
-                                                   DstPtr, Size, &AsyncInfo);
-  if (Rc != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return __tgt_rtl_synchronize(SrcDevId, &AsyncInfo);
-}
-
-int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr, int32_t Kind) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-
-  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return DeviceRTL.dataDelete(DeviceId, TgtPtr, (TargetAllocTy)Kind);
-}
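Every synchronous entry point in this file follows the same shape: allocate a stack-local `__tgt_async_info`, forward to the `_async` sibling (which lazily creates the queue), then block in `__tgt_rtl_synchronize`. A generic sketch of that wrapper pattern, assuming the plugin's `omptargetplugin.h` declarations are in scope; `runAndWait` is a hypothetical name, not part of the interface:

.. code-block:: c++

   // Generic shape of the synchronous wrappers in this plugin: enqueue the
   // work on a fresh queue, then block until it drains.
   template <typename AsyncOp>
   static int32_t runAndWait(int32_t DeviceId, AsyncOp Op) {
     __tgt_async_info AsyncInfo;            // stack-local queue handle
     if (Op(&AsyncInfo) != OFFLOAD_SUCCESS) // the _async variant fills Queue
       return OFFLOAD_FAIL;
     return __tgt_rtl_synchronize(DeviceId, &AsyncInfo); // block until done
   }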
-
-int32_t __tgt_rtl_run_target_team_region(int32_t DeviceId, void *TgtEntryPtr,
-                                         void **TgtArgs, ptrdiff_t *TgtOffsets,
-                                         int32_t ArgNum, int32_t TeamNum,
-                                         int32_t ThreadLimit,
-                                         uint64_t LoopTripcount) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  // Context is set in __tgt_rtl_run_target_team_region_async.
-
-  __tgt_async_info AsyncInfo;
-  const int32_t Rc = __tgt_rtl_run_target_team_region_async(
-      DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum, TeamNum, ThreadLimit,
-      LoopTripcount, &AsyncInfo);
-  if (Rc != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
-}
-
-int32_t __tgt_rtl_run_target_team_region_async(
-    int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets,
-    int32_t ArgNum, int32_t TeamNum, int32_t ThreadLimit,
-    uint64_t LoopTripcount, __tgt_async_info *AsyncInfoPtr) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-
-  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return DeviceRTL.runTargetTeamRegion(DeviceId, TgtEntryPtr, TgtArgs,
-                                       TgtOffsets, ArgNum, TeamNum, ThreadLimit,
-                                       LoopTripcount, AsyncInfoPtr);
-}
-
-int32_t __tgt_rtl_run_target_region(int32_t DeviceId, void *TgtEntryPtr,
-                                    void **TgtArgs, ptrdiff_t *TgtOffsets,
-                                    int32_t ArgNum) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  // Context is set in __tgt_rtl_run_target_region_async.
-
-  __tgt_async_info AsyncInfo;
-  const int32_t Rc = __tgt_rtl_run_target_region_async(
-      DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum, &AsyncInfo);
-  if (Rc != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
-}
-
-int32_t __tgt_rtl_run_target_region_async(int32_t DeviceId, void *TgtEntryPtr,
-                                          void **TgtArgs, ptrdiff_t *TgtOffsets,
-                                          int32_t ArgNum,
-                                          __tgt_async_info *AsyncInfoPtr) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  // Context is set in __tgt_rtl_run_target_team_region_async.
-  return __tgt_rtl_run_target_team_region_async(
-      DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum,
-      /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
-      AsyncInfoPtr);
-}
-
-int32_t __tgt_rtl_synchronize(int32_t DeviceId,
-                              __tgt_async_info *AsyncInfoPtr) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  assert(AsyncInfoPtr && "async_info_ptr is nullptr");
-  assert(AsyncInfoPtr->Queue && "async_info_ptr->Queue is nullptr");
-  // NOTE: We don't need to set context for stream sync.
-  return DeviceRTL.synchronize(DeviceId, AsyncInfoPtr);
-}
-
-int32_t __tgt_rtl_query_async(int32_t DeviceId,
-                              __tgt_async_info *AsyncInfoPtr) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  assert(AsyncInfoPtr && "async_info_ptr is nullptr");
-  assert(AsyncInfoPtr->Queue && "async_info_ptr->Queue is nullptr");
-  // NOTE: We don't need to set context for stream query.
-  return DeviceRTL.queryAsync(DeviceId, AsyncInfoPtr);
-}
-
-void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) {
-  std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
-  InfoLevel.store(NewInfoLevel);
-}
-
-void __tgt_rtl_print_device_info(int32_t DeviceId) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  // NOTE: We don't need to set context for print device info.
-  DeviceRTL.printDeviceInfo(DeviceId);
-}
-
-int32_t __tgt_rtl_create_event(int32_t DeviceId, void **Event) {
-  assert(Event && "event is nullptr");
-
-  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return DeviceRTL.createEvent(DeviceId, Event);
-}
-
-int32_t __tgt_rtl_record_event(int32_t DeviceId, void *EventPtr,
-                               __tgt_async_info *AsyncInfoPtr) {
-  assert(AsyncInfoPtr && "async_info_ptr is nullptr");
-  assert(AsyncInfoPtr->Queue && "async_info_ptr->Queue is nullptr");
-  assert(EventPtr && "event_ptr is nullptr");
-  // NOTE: We might not need to set context for event record.
-  return recordEvent(EventPtr, AsyncInfoPtr);
-}
-
-int32_t __tgt_rtl_wait_event(int32_t DeviceId, void *EventPtr,
-                             __tgt_async_info *AsyncInfoPtr) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  assert(AsyncInfoPtr && "async_info_ptr is nullptr");
-  assert(EventPtr && "event is nullptr");
-  // If we don't have a queue we need to set the context.
-  if (!AsyncInfoPtr->Queue && DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-  return DeviceRTL.waitEvent(DeviceId, AsyncInfoPtr, EventPtr);
-}
-
-int32_t __tgt_rtl_sync_event(int32_t DeviceId, void *EventPtr) {
-  assert(EventPtr && "event is nullptr");
-  // NOTE: We might not need to set context for event sync.
-  return syncEvent(EventPtr);
-}
-
-int32_t __tgt_rtl_destroy_event(int32_t DeviceId, void *EventPtr) {
-  assert(EventPtr && "event is nullptr");
-
-  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return DeviceRTL.destroyEvent(DeviceId, EventPtr);
-}
-
-int32_t __tgt_rtl_release_async_info(int32_t DeviceId,
-                                     __tgt_async_info *AsyncInfo) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  assert(AsyncInfo && "async_info is nullptr");
-
-  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return DeviceRTL.releaseAsyncInfo(DeviceId, AsyncInfo);
-}
-
-int32_t __tgt_rtl_init_async_info(int32_t DeviceId,
-                                  __tgt_async_info **AsyncInfo) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  assert(AsyncInfo && "async_info is nullptr");
-
-  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return DeviceRTL.initAsyncInfo(DeviceId, AsyncInfo);
-}
-
-int32_t __tgt_rtl_init_device_info(int32_t DeviceId,
-                                   __tgt_device_info *DeviceInfoPtr,
-                                   const char **ErrStr) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  assert(DeviceInfoPtr && "device_info_ptr is nullptr");
-
-  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return DeviceRTL.initDeviceInfo(DeviceId, DeviceInfoPtr, ErrStr);
-}
-
-int32_t __tgt_rtl_launch_kernel(int32_t DeviceId, void *TgtEntryPtr,
-                                void **TgtArgs, ptrdiff_t *TgtOffsets,
-                                KernelArgsTy *KernelArgs,
-                                __tgt_async_info *AsyncInfo) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-
-  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return DeviceRTL.runTargetTeamRegion(
-      DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, KernelArgs->NumArgs,
-      KernelArgs->NumTeams[0], KernelArgs->ThreadLimit[0],
-      KernelArgs->Tripcount, AsyncInfo);
-}
-
-#ifdef __cplusplus
-}
-#endif
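The event entry points deleted above follow a fixed lifecycle: create on a device, record into one queue, wait from another, destroy. A sketch of how a caller might order work across two queues device-side with these hooks, assuming the plugin declarations are in scope; `chainQueues` is a hypothetical helper and most error handling is elided:

.. code-block:: c++

   // Order work on QueueB after work already recorded on QueueA, without
   // blocking the host: record an event on A, make B wait on it.
   static int32_t chainQueues(int32_t DeviceId, __tgt_async_info *QueueA,
                              __tgt_async_info *QueueB) {
     void *Event = nullptr;
     if (__tgt_rtl_create_event(DeviceId, &Event) != OFFLOAD_SUCCESS)
       return OFFLOAD_FAIL;
     __tgt_rtl_record_event(DeviceId, Event, QueueA); // snapshot A's progress
     __tgt_rtl_wait_event(DeviceId, Event, QueueB);   // B waits device-side
     return __tgt_rtl_destroy_event(DeviceId, Event);
   }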
diff --git a/openmp/libomptarget/plugins/exports b/openmp/libomptarget/plugins/exports
deleted file mode 100644
--- a/openmp/libomptarget/plugins/exports
+++ /dev/null
@@ -1,6 +0,0 @@
-VERS1.0 {
-  global:
-    __tgt_rtl*;
-  local:
-    *;
-};
diff --git a/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp
deleted file mode 100644
--- a/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp
+++ /dev/null
@@ -1,280 +0,0 @@
-//===-RTLs/generic-64bit/src/rtl.cpp - Target RTLs Implementation - C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// RTL for generic 64-bit machine
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/DynamicLibrary.h"
-
-#include <cassert>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <ffi.h>
-#include <link.h>
-#include <list>
-#include <string>
-#include <vector>
-
-#include "Debug.h"
-#include "omptargetplugin.h"
-
-using namespace llvm;
-using namespace llvm::sys;
-
-#ifndef TARGET_NAME
-#define TARGET_NAME Generic ELF - 64bit
-#endif
-#define DEBUG_PREFIX "TARGET " GETNAME(TARGET_NAME) " RTL"
-
-#ifndef TARGET_ELF_ID
-#define TARGET_ELF_ID 0
-#endif
-
-#include "elf_common.h"
-
-#define NUMBER_OF_DEVICES 4
-#define OFFLOAD_SECTION_NAME "omp_offloading_entries"
-
-/// Array of Dynamic libraries loaded for this target.
-struct DynLibTy {
-  std::string FileName;
-  std::unique_ptr<sys::DynamicLibrary> DynLib;
-};
-
-/// Keep entries table per device.
-struct FuncOrGblEntryTy {
-  __tgt_target_table Table;
-  SmallVector<__tgt_offload_entry> Entries;
-};
-
-/// Class containing all the device information.
-class RTLDeviceInfoTy {
-  std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
-
-public:
-  std::list<DynLibTy> DynLibs;
-
-  // Record entry point associated with device.
-  void createOffloadTable(int32_t DeviceId,
-                          SmallVector<__tgt_offload_entry> &&Entries) {
-    assert(DeviceId < (int32_t)FuncGblEntries.size() &&
-           "Unexpected device id!");
-    FuncGblEntries[DeviceId].emplace_back();
-    FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
-
-    E.Entries = Entries;
-    E.Table.EntriesBegin = E.Entries.begin();
-    E.Table.EntriesEnd = E.Entries.end();
-  }
-
-  // Return true if the entry is associated with device.
-  bool findOffloadEntry(int32_t DeviceId, void *Addr) {
-    assert(DeviceId < (int32_t)FuncGblEntries.size() &&
-           "Unexpected device id!");
-    FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
-
-    for (__tgt_offload_entry *I = E.Table.EntriesBegin,
-                             *End = E.Table.EntriesEnd;
-         I < End; ++I) {
-      if (I->addr == Addr)
-        return true;
-    }
-
-    return false;
-  }
-
-  // Return the pointer to the target entries table.
-  __tgt_target_table *getOffloadEntriesTable(int32_t DeviceId) {
-    assert(DeviceId < (int32_t)FuncGblEntries.size() &&
-           "Unexpected device id!");
-    FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
-
-    return &E.Table;
-  }
-
-  RTLDeviceInfoTy(int32_t NumDevices) { FuncGblEntries.resize(NumDevices); }
-
-  ~RTLDeviceInfoTy() {
-    // Close dynamic libraries
-    for (auto &Lib : DynLibs) {
-      if (Lib.DynLib->isValid())
-        remove(Lib.FileName.c_str());
-    }
-  }
-};
-
-static RTLDeviceInfoTy DeviceInfo(NUMBER_OF_DEVICES);
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
-// If we don't have a valid ELF ID we can just fail.
-#if TARGET_ELF_ID < 1
-  return 0;
-#else
-  return elf_check_machine(Image, TARGET_ELF_ID);
-#endif
-}
-
-int32_t __tgt_rtl_number_of_devices() { return NUMBER_OF_DEVICES; }
-
-int32_t __tgt_rtl_init_device(int32_t DeviceId) { return OFFLOAD_SUCCESS; }
-
-__tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
-                                          __tgt_device_image *Image) {
-
-  DP("Dev %d: load binary from " DPxMOD " image\n", DeviceId,
-     DPxPTR(Image->ImageStart));
-
-  assert(DeviceId >= 0 && DeviceId < NUMBER_OF_DEVICES && "bad dev id");
-
-  size_t ImageSize = (size_t)Image->ImageEnd - (size_t)Image->ImageStart;
-
-  // load dynamic library and get the entry points. We use the dl library
-  // to do the loading of the library, but we could do it directly to avoid the
-  // dump to the temporary file.
-  //
-  // 1) Create tmp file with the library contents.
-  // 2) Use dlopen to load the file and dlsym to retrieve the symbols.
-  char TmpName[] = "/tmp/tmpfile_XXXXXX";
-  int TmpFd = mkstemp(TmpName);
-
-  if (TmpFd == -1)
-    return nullptr;
-
-  FILE *Ftmp = fdopen(TmpFd, "wb");
-
-  if (!Ftmp)
-    return nullptr;
-
-  fwrite(Image->ImageStart, ImageSize, 1, Ftmp);
-  fclose(Ftmp);
-
-  std::string ErrMsg;
-  auto DynLib = std::make_unique<sys::DynamicLibrary>(
-      sys::DynamicLibrary::getPermanentLibrary(TmpName, &ErrMsg));
-  DynLibTy Lib = {TmpName, std::move(DynLib)};
-
-  if (!Lib.DynLib->isValid()) {
-    DP("Target library loading error: %s\n", ErrMsg.c_str());
-    return NULL;
-  }
-
-  __tgt_offload_entry *HostBegin = Image->EntriesBegin;
-  __tgt_offload_entry *HostEnd = Image->EntriesEnd;
-
-  // Create a new offloading entry list using the device symbol address.
-  SmallVector<__tgt_offload_entry> Entries;
-  for (__tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
-    if (!E->addr)
-      return nullptr;
-
-    __tgt_offload_entry Entry = *E;
-
-    void *DevAddr = Lib.DynLib->getAddressOfSymbol(E->name);
-    Entry.addr = DevAddr;
-
-    DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
-       DPxPTR(E - HostBegin), E->name, DPxPTR(DevAddr));
-
-    Entries.emplace_back(Entry);
-  }
-
-  DeviceInfo.createOffloadTable(DeviceId, std::move(Entries));
-  DeviceInfo.DynLibs.emplace_back(std::move(Lib));
-
-  return DeviceInfo.getOffloadEntriesTable(DeviceId);
-}
-
-void __tgt_rtl_print_device_info(int32_t DeviceId) {
-  printf("    This is a generic-elf-64bit device\n");
-}
-
-// Sample implementation of explicit memory allocator. For this plugin all
-// kinds are equivalent to each other.
-void *__tgt_rtl_data_alloc(int32_t DeviceId, int64_t Size, void *HstPtr,
-                           int32_t Kind) {
-  void *Ptr = NULL;
-
-  switch (Kind) {
-  case TARGET_ALLOC_DEVICE:
-  case TARGET_ALLOC_HOST:
-  case TARGET_ALLOC_SHARED:
-  case TARGET_ALLOC_DEFAULT:
-    Ptr = malloc(Size);
-    break;
-  default:
-    REPORT("Invalid target data allocation kind");
-  }
-
-  return Ptr;
-}
-
-int32_t __tgt_rtl_data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr,
-                              int64_t Size) {
-  memcpy(TgtPtr, HstPtr, Size);
-  return OFFLOAD_SUCCESS;
-}
-
-int32_t __tgt_rtl_data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr,
-                                int64_t Size) {
-  memcpy(HstPtr, TgtPtr, Size);
-  return OFFLOAD_SUCCESS;
-}
-
-int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr, int32_t) {
-  free(TgtPtr);
-  return OFFLOAD_SUCCESS;
-}
-
-int32_t __tgt_rtl_launch_kernel(int32_t DeviceId, void *TgtEntryPtr,
-                                void **TgtArgs, ptrdiff_t *TgtOffsets,
-                                KernelArgsTy *KernelArgs,
-                                __tgt_async_info *AsyncInfoPtr) {
-  assert(!KernelArgs->NumTeams[1] && !KernelArgs->NumTeams[2] &&
-         !KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] &&
-         "Only one dimensional kernels supported.");
-  // ignore team num and thread limit.
-
-  // Use libffi to launch execution.
-  ffi_cif Cif;
-
-  // All args are references.
-  std::vector<ffi_type *> ArgsTypes(KernelArgs->NumArgs, &ffi_type_pointer);
-  std::vector<void *> Args(KernelArgs->NumArgs);
-  std::vector<void *> Ptrs(KernelArgs->NumArgs);
-
-  for (uint32_t I = 0; I < KernelArgs->NumArgs; ++I) {
-    Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
-    Args[I] = &Ptrs[I];
-  }
-
-  ffi_status Status = ffi_prep_cif(&Cif, FFI_DEFAULT_ABI, KernelArgs->NumArgs,
-                                   &ffi_type_void, &ArgsTypes[0]);
-
-  assert(Status == FFI_OK && "Unable to prepare target launch!");
-
-  if (Status != FFI_OK)
-    return OFFLOAD_FAIL;
-
-  DP("Running entry point at " DPxMOD "...\n", DPxPTR(TgtEntryPtr));
-
-  void (*Entry)(void);
-  *((void **)&Entry) = TgtEntryPtr;
-  ffi_call(&Cif, Entry, NULL, &Args[0]);
-  return OFFLOAD_SUCCESS;
-}
-
-#ifdef __cplusplus
-}
-#endif
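The deleted host plugin's only real trick is its launch path: since every kernel argument is passed by reference, each entry point can be invoked through libffi as a call taking `NumArgs` pointers and returning `void`. A stripped-down, standalone example of the same technique against a local stand-in kernel; this is illustrative, not plugin code, and assumes libffi is available (compile with `-lffi`):

.. code-block:: c++

   #include <cstdio>
   #include <ffi.h>
   #include <vector>

   static void fakeKernel(int *A) { printf("A = %d\n", *A); } // stand-in entry

   int main() {
     int Value = 42;
     int *Arg = &Value;    // the "device" pointer the kernel receives
     void *ArgSlot = &Arg; // libffi wants a pointer to each argument value

     std::vector<ffi_type *> Types = {&ffi_type_pointer};
     std::vector<void *> Args = {ArgSlot};

     ffi_cif Cif;
     if (ffi_prep_cif(&Cif, FFI_DEFAULT_ABI, 1, &ffi_type_void, Types.data()) !=
         FFI_OK)
       return 1;
     ffi_call(&Cif, reinterpret_cast<void (*)()>(fakeKernel), nullptr,
              Args.data());
     return 0;
   }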
diff --git a/openmp/libomptarget/plugins/ppc64/CMakeLists.txt b/openmp/libomptarget/plugins/ppc64/CMakeLists.txt
deleted file mode 100644
--- a/openmp/libomptarget/plugins/ppc64/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Build a plugin for a ppc64 machine if available.
-#
-##===----------------------------------------------------------------------===##
-
-if(CMAKE_SYSTEM_NAME MATCHES "Linux")
-  build_generic_elf64("ppc64" "PPC64" "ppc64" "powerpc64-ibm-linux-gnu" "21")
-else()
-  libomptarget_say("Not building ppc64 offloading plugin: machine not found in the system.")
-endif()
diff --git a/openmp/libomptarget/plugins/ppc64le/CMakeLists.txt b/openmp/libomptarget/plugins/ppc64le/CMakeLists.txt
deleted file mode 100644
--- a/openmp/libomptarget/plugins/ppc64le/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Build a plugin for a ppc64le machine if available.
-#
-##===----------------------------------------------------------------------===##
-
-if(CMAKE_SYSTEM_NAME MATCHES "Linux")
-  build_generic_elf64("ppc64le" "PPC64le" "ppc64" "powerpc64le-ibm-linux-gnu" "21")
-else()
-  libomptarget_say("Not building ppc64le offloading plugin: machine not found in the system.")
-endif()
diff --git a/openmp/libomptarget/plugins/x86_64/CMakeLists.txt b/openmp/libomptarget/plugins/x86_64/CMakeLists.txt
deleted file mode 100644
--- a/openmp/libomptarget/plugins/x86_64/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Build a plugin for a x86_64 machine if available.
-#
-##===----------------------------------------------------------------------===##
-
-if(CMAKE_SYSTEM_NAME MATCHES "Linux")
-  build_generic_elf64("x86_64" "x86_64" "x86_64" "x86_64-pc-linux-gnu" "62")
-else()
-  libomptarget_say("Not building x86_64 offloading plugin: machine not found in the system.")
-endif()
diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp
--- a/openmp/libomptarget/src/rtl.cpp
+++ b/openmp/libomptarget/src/rtl.cpp
@@ -92,8 +92,6 @@
 
   DP("Loading RTLs...\n");
 
-  BoolEnvar NextGenPlugins("LIBOMPTARGET_NEXTGEN_PLUGINS", true);
-
   // Attempt to open all the plugins and, if they exist, check if the interface
   // is correct and if they are supporting any devices.
   for (const char *Name : RTLNames) {
@@ -102,13 +100,6 @@
     RTLInfoTy &RTL = AllRTLs.back();
     const std::string BaseRTLName(Name);
 
-    if (NextGenPlugins) {
-      if (attemptLoadRTL(BaseRTLName + ".nextgen.so", RTL))
-        continue;
-
-      DP("Falling back to original plugin...\n");
-    }
-
     if (!attemptLoadRTL(BaseRTLName + ".so", RTL))
       AllRTLs.pop_back();
   }
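After this change the loader probes exactly one shared object per target name; the `.nextgen.so` probe and its fallback path are gone along with the `LIBOMPTARGET_NEXTGEN_PLUGINS` switch. A sketch of the resulting behavior in terms of the POSIX `dlopen` that the loader ultimately relies on; the names below are illustrative, not the actual libomptarget internals:

.. code-block:: c++

   #include <dlfcn.h>
   #include <cstdio>
   #include <string>
   #include <vector>

   int main() {
     // One candidate per target; no more ".nextgen.so" probe and fallback.
     std::vector<std::string> RTLNames = {"libomptarget.rtl.x86_64",
                                          "libomptarget.rtl.cuda",
                                          "libomptarget.rtl.amdgpu"};
     for (const std::string &Name : RTLNames) {
       void *Handle = dlopen((Name + ".so").c_str(), RTLD_NOW);
       if (!Handle) {
         printf("skipping %s: %s\n", Name.c_str(), dlerror());
         continue; // the real loader pops the RTLInfoTy it speculatively added
       }
       printf("loaded %s\n", Name.c_str());
     }
   }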