diff --git a/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.h b/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.h deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.h +++ /dev/null @@ -1,86 +0,0 @@ -/*===-------------------------------------------------------------------------- - * ATMI (Asynchronous Task and Memory Interface) - * - * This file is distributed under the MIT License. See LICENSE.txt for details. - *===------------------------------------------------------------------------*/ -#ifndef INCLUDE_ATMI_INTEROP_HSA_H_ -#define INCLUDE_ATMI_INTEROP_HSA_H_ - -#include "atmi_runtime.h" -#include "hsa.h" -#include "hsa_ext_amd.h" - -#ifdef __cplusplus -extern "C" { -#endif -/** \defgroup interop_hsa_functions ATMI-HSA Interop - * @{ - */ - -/** - * @brief Get the device address and size of an HSA global symbol - * - * @detail Use this function to query the device address and size of an HSA - * global symbol. - * The symbol can be set at by the compiler or by the application writer in a - * language-specific manner. This function is meaningful only after calling one - * of the @p atmi_module_register functions. - * - * @param[in] place The ATMI memory place - * - * @param[in] symbol Pointer to a non-NULL global symbol name - * - * @param[in] var_addr Pointer to a non-NULL @p void* variable that will - * hold the device address of the global symbol object. - * - * @param[in] var_size Pointer to a non-NULL @p uint variable that will - * hold the size of the global symbol object. - * - * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully. - * - * @retval ::ATMI_STATUS_ERROR If @p symbol, @p var_addr or @p var_size are - * invalid - * location in the current node, or if ATMI is not initialized. - * - * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors. - */ -atmi_status_t atmi_interop_hsa_get_symbol_info(atmi_mem_place_t place, - const char *symbol, - void **var_addr, - unsigned int *var_size); - -/** - * @brief Get the HSA-specific kernel info from a kernel name - * - * @detail Use this function to query the HSA-specific kernel info from the - * kernel name. - * This function is meaningful only after calling one - * of the @p atmi_module_register functions. - * - * @param[in] place The ATMI memory place - * - * @param[in] kernel_name Pointer to a char array with the kernel name - * - * @param[in] info The different possible kernel properties - * - * @param[in] value Pointer to a non-NULL @p uint variable that will - * hold the return value of the kernel property. - * - * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully. - * - * @retval ::ATMI_STATUS_ERROR If @p symbol, @p var_addr or @p var_size are - * invalid - * location in the current node, or if ATMI is not initialized. - * - * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors. - */ -atmi_status_t atmi_interop_hsa_get_kernel_info( - atmi_mem_place_t place, const char *kernel_name, - hsa_executable_symbol_info_t info, uint32_t *value); -/** @} */ - -#ifdef __cplusplus -} -#endif - -#endif // INCLUDE_ATMI_INTEROP_HSA_H_ diff --git a/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp b/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp --- a/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp @@ -3,15 +3,14 @@ * * This file is distributed under the MIT License. See LICENSE.txt for details. *===------------------------------------------------------------------------*/ -#include "atmi_interop_hsa.h" #include "internal.h" using core::atl_is_atmi_initialized; -atmi_status_t atmi_interop_hsa_get_symbol_info(atmi_mem_place_t place, - const char *symbol, - void **var_addr, - unsigned int *var_size) { +atmi_status_t atmi_interop_hsa_get_symbol_info( + std::vector> &SymbolInfoTable, + atmi_mem_place_t place, const char *symbol, void **var_addr, + unsigned int *var_size) { /* // Typical usage: void *var_addr; @@ -46,6 +45,7 @@ } atmi_status_t atmi_interop_hsa_get_kernel_info( + std::vector> &KernelInfoTable, atmi_mem_place_t place, const char *kernel_name, hsa_executable_symbol_info_t kernel_info, uint32_t *value) { /* diff --git a/openmp/libomptarget/plugins/amdgpu/impl/internal.h b/openmp/libomptarget/plugins/amdgpu/impl/internal.h --- a/openmp/libomptarget/plugins/amdgpu/impl/internal.h +++ b/openmp/libomptarget/plugins/amdgpu/impl/internal.h @@ -112,9 +112,6 @@ uint32_t size; } atl_symbol_info_t; -extern std::vector> KernelInfoTable; -extern std::vector> SymbolInfoTable; - // ---------------------- Kernel End ------------- namespace core { diff --git a/openmp/libomptarget/plugins/amdgpu/impl/rt.h b/openmp/libomptarget/plugins/amdgpu/impl/rt.h --- a/openmp/libomptarget/plugins/amdgpu/impl/rt.h +++ b/openmp/libomptarget/plugins/amdgpu/impl/rt.h @@ -54,12 +54,6 @@ static atmi_status_t Finalize(); // machine info static atmi_machine_t *GetMachineInfo(); - // modules - static atmi_status_t RegisterModuleFromMemory( - void *, size_t, atmi_place_t, - atmi_status_t (*on_deserialized_data)(void *data, size_t size, - void *cb_state), - void *cb_state, std::vector &HSAExecutables); // data static atmi_status_t Memcpy(hsa_signal_t, void *, const void *, size_t); diff --git a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp --- a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp @@ -132,9 +132,6 @@ std::vector atl_gpu_kernarg_pools; -std::vector> KernelInfoTable; -std::vector> SymbolInfoTable; - bool g_atmi_initialized = false; bool g_atmi_hostcall_required = false; @@ -201,15 +198,6 @@ atmi_status_t Runtime::Finalize() { hsa_status_t err; - for (uint32_t i = 0; i < SymbolInfoTable.size(); i++) { - SymbolInfoTable[i].clear(); - } - SymbolInfoTable.clear(); - for (uint32_t i = 0; i < KernelInfoTable.size(); i++) { - KernelInfoTable[i].clear(); - } - KernelInfoTable.clear(); - atl_reset_atmi_initialized(); err = hsa_shut_down(); if (err != HSA_STATUS_SUCCESS) { @@ -552,14 +540,6 @@ "After initializing compute and memory", get_error_string(err)); exit(1); } - - int gpu_count = g_atl_machine.processorCount(); - KernelInfoTable.resize(gpu_count); - SymbolInfoTable.resize(gpu_count); - for (uint32_t i = 0; i < SymbolInfoTable.size(); i++) - SymbolInfoTable[i].clear(); - for (uint32_t i = 0; i < KernelInfoTable.size(); i++) - KernelInfoTable[i].clear(); atlc.g_hsa_initialized = true; DEBUG_PRINT("done\n"); } @@ -833,6 +813,7 @@ } // namespace static hsa_status_t get_code_object_custom_metadata( + std::vector> &KernelInfoTable, std::map &KernelNameMap, void *binary, size_t binSize, int gpu) { // parse code object with different keys from v2 @@ -1006,15 +987,21 @@ } struct PopulateInfoTablesContext { - PopulateInfoTablesContext(std::map &KernelNameMap, - int Device) - : KernelNameMap(KernelNameMap), DeviceId(Device) {} + PopulateInfoTablesContext( + std::map &KernelNameMap, + std::vector> &KernelInfo, + std::vector> &SymbolInfo, + int Device) + : KernelNameMap(KernelNameMap), KernelInfoTable(KernelInfo), + SymbolInfoTable(SymbolInfo), DeviceId(Device) {} PopulateInfoTablesContext() = delete; PopulateInfoTablesContext(const PopulateInfoTablesContext &) = delete; ~PopulateInfoTablesContext() = default; std::map &KernelNameMap; + std::vector> &KernelInfoTable; + std::vector> &SymbolInfoTable; int DeviceId; }; @@ -1023,8 +1010,11 @@ void *data) { PopulateInfoTablesContext *Ctx = static_cast(data); - int gpu = Ctx->DeviceId; auto &KernelNameMap = Ctx->KernelNameMap; + auto &KernelInfoTable = Ctx->KernelInfoTable; + auto &SymbolInfoTable = Ctx->SymbolInfoTable; + int gpu = Ctx->DeviceId; + hsa_symbol_kind_t type; uint32_t name_length; @@ -1171,7 +1161,9 @@ return HSA_STATUS_SUCCESS; } -atmi_status_t Runtime::RegisterModuleFromMemory( +atmi_status_t RegisterModuleFromMemory( + std::vector> &KernelInfoTable, + std::vector> &SymbolInfoTable, void *module_bytes, size_t module_size, atmi_place_t place, atmi_status_t (*on_deserialized_data)(void *data, size_t size, void *cb_state), @@ -1212,8 +1204,8 @@ // Some metadata info is not available through ROCr API, so use custom // code object metadata parsing to collect such metadata info - err = get_code_object_custom_metadata(KernelNameMap, module_bytes, - module_size, gpu); + err = core::get_code_object_custom_metadata( + KernelInfoTable, KernelNameMap, module_bytes, module_size, gpu); if (err != HSA_STATUS_SUCCESS) { DEBUG_PRINT("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Getting custom code object metadata", @@ -1267,7 +1259,8 @@ exit(1); } - PopulateInfoTablesContext Ctx(KernelNameMap, gpu); + PopulateInfoTablesContext Ctx(KernelNameMap, KernelInfoTable, + SymbolInfoTable, gpu); err = hsa_executable_iterate_symbols(executable, PopulateInfoTables, static_cast(&Ctx)); if (err != HSA_STATUS_SUCCESS) { diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -30,7 +30,6 @@ #include // Header from ATMI interface -#include "atmi_interop_hsa.h" #include "atmi_runtime.h" #include "internal.h" @@ -87,6 +86,25 @@ #include "elf_common.h" +atmi_status_t atmi_interop_hsa_get_symbol_info( + std::vector> &SymbolInfoTable, + atmi_mem_place_t place, const char *symbol, void **var_addr, + unsigned int *var_size); +atmi_status_t atmi_interop_hsa_get_kernel_info( + std::vector> &KernelInfoTable, + atmi_mem_place_t place, const char *kernel_name, + hsa_executable_symbol_info_t info, uint32_t *value); + +namespace core { +atmi_status_t RegisterModuleFromMemory( + std::vector> &KernelInfoTable, + std::vector> &SymbolInfoTable, + void *, size_t, atmi_place_t, + atmi_status_t (*on_deserialized_data)(void *data, size_t size, + void *cb_state), + void *cb_state, std::vector &HSAExecutables); +} + /// Keep entries table per device struct FuncOrGblEntryTy { __tgt_target_table Table; @@ -330,6 +348,9 @@ std::vector HSAExecutables; + std::vector> KernelInfoTable; + std::vector> SymbolInfoTable; + struct atmiFreePtrDeletor { void operator()(void *p) { atmi_free(p); // ignore failure to free @@ -474,6 +495,13 @@ NumThreads.resize(NumberOfDevices); deviceStateStore.resize(NumberOfDevices); + KernelInfoTable.resize(NumberOfDevices); + SymbolInfoTable.resize(NumberOfDevices); + for (uint32_t i = 0; i < SymbolInfoTable.size(); i++) + SymbolInfoTable[i].clear(); + for (uint32_t i = 0; i < KernelInfoTable.size(); i++) + KernelInfoTable[i].clear(); + for (int i = 0; i < NumberOfDevices; i++) { uint32_t queue_size = 0; { @@ -558,6 +586,8 @@ // atmi_finalize removes access to it deviceStateStore.clear(); KernelArgPoolMap.clear(); + KernelInfoTable.clear(); + SymbolInfoTable.clear(); // Terminate hostrpc before finalizing ATMI hostrpc_terminate(); @@ -988,15 +1018,17 @@ template atmi_status_t module_register_from_memory_to_place( + std::vector> &KernelInfoTable, + std::vector> &SymbolInfoTable, void *module_bytes, size_t module_size, atmi_place_t place, C cb, std::vector &HSAExecutables) { auto L = [](void *data, size_t size, void *cb_state) -> atmi_status_t { C *unwrapped = static_cast(cb_state); return (*unwrapped)(data, size); }; - return core::Runtime::RegisterModuleFromMemory( - module_bytes, module_size, place, L, static_cast(&cb), - HSAExecutables); + return core::RegisterModuleFromMemory( + KernelInfoTable, SymbolInfoTable, module_bytes, module_size, place, L, + static_cast(&cb), HSAExecutables); } } // namespace @@ -1116,7 +1148,8 @@ void *state_ptr; uint32_t state_ptr_size; atmi_status_t err = atmi_interop_hsa_get_symbol_info( - get_gpu_mem_place(device_id), sym(), &state_ptr, &state_ptr_size); + DeviceInfo.SymbolInfoTable, get_gpu_mem_place(device_id), sym(), + &state_ptr, &state_ptr_size); if (err != ATMI_STATUS_SUCCESS) { DP("failed to find %s in loaded image\n", sym()); return err; @@ -1196,6 +1229,7 @@ img_size); atmi_status_t err = module_register_from_memory_to_place( + DeviceInfo.KernelInfoTable, DeviceInfo.SymbolInfoTable, (void *)image->ImageStart, img_size, get_gpu_place(device_id), [&](void *data, size_t size) { return env.before_loading(data, size); }, DeviceInfo.HSAExecutables); @@ -1226,8 +1260,8 @@ void *state_ptr; uint32_t state_ptr_size; atmi_status_t err = atmi_interop_hsa_get_symbol_info( - get_gpu_mem_place(device_id), "omptarget_nvptx_device_State", - &state_ptr, &state_ptr_size); + DeviceInfo.SymbolInfoTable, get_gpu_mem_place(device_id), + "omptarget_nvptx_device_State", &state_ptr, &state_ptr_size); if (err != ATMI_STATUS_SUCCESS) { DP("No device_state symbol found, skipping initialization\n"); @@ -1310,7 +1344,8 @@ uint32_t varsize; atmi_status_t err = atmi_interop_hsa_get_symbol_info( - get_gpu_mem_place(device_id), e->name, &varptr, &varsize); + DeviceInfo.SymbolInfoTable, get_gpu_mem_place(device_id), e->name, + &varptr, &varsize); if (err != ATMI_STATUS_SUCCESS) { // Inform the user what symbol prevented offloading @@ -1352,7 +1387,8 @@ atmi_mem_place_t place = get_gpu_mem_place(device_id); uint32_t kernarg_segment_size; atmi_status_t err = atmi_interop_hsa_get_kernel_info( - place, e->name, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, + DeviceInfo.KernelInfoTable, place, e->name, + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &kernarg_segment_size); // each arg is a void * in this openmp implementation @@ -1780,6 +1816,7 @@ KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr; std::string kernel_name = std::string(KernelInfo->Name); + auto &KernelInfoTable = DeviceInfo.KernelInfoTable; if (KernelInfoTable[device_id].find(kernel_name) == KernelInfoTable[device_id].end()) { DP("Kernel %s not found\n", kernel_name.c_str());