diff --git a/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.h b/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.h deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.h +++ /dev/null @@ -1,86 +0,0 @@ -/*===-------------------------------------------------------------------------- - * ATMI (Asynchronous Task and Memory Interface) - * - * This file is distributed under the MIT License. See LICENSE.txt for details. - *===------------------------------------------------------------------------*/ -#ifndef INCLUDE_ATMI_INTEROP_HSA_H_ -#define INCLUDE_ATMI_INTEROP_HSA_H_ - -#include "atmi_runtime.h" -#include "hsa.h" -#include "hsa_ext_amd.h" - -#ifdef __cplusplus -extern "C" { -#endif -/** \defgroup interop_hsa_functions ATMI-HSA Interop - * @{ - */ - -/** - * @brief Get the device address and size of an HSA global symbol - * - * @detail Use this function to query the device address and size of an HSA - * global symbol. - * The symbol can be set at by the compiler or by the application writer in a - * language-specific manner. This function is meaningful only after calling one - * of the @p atmi_module_register functions. - * - * @param[in] place The ATMI memory place - * - * @param[in] symbol Pointer to a non-NULL global symbol name - * - * @param[in] var_addr Pointer to a non-NULL @p void* variable that will - * hold the device address of the global symbol object. - * - * @param[in] var_size Pointer to a non-NULL @p uint variable that will - * hold the size of the global symbol object. - * - * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully. - * - * @retval ::ATMI_STATUS_ERROR If @p symbol, @p var_addr or @p var_size are - * invalid - * location in the current node, or if ATMI is not initialized. - * - * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors. - */ -atmi_status_t atmi_interop_hsa_get_symbol_info(atmi_mem_place_t place, - const char *symbol, - void **var_addr, - unsigned int *var_size); - -/** - * @brief Get the HSA-specific kernel info from a kernel name - * - * @detail Use this function to query the HSA-specific kernel info from the - * kernel name. - * This function is meaningful only after calling one - * of the @p atmi_module_register functions. - * - * @param[in] place The ATMI memory place - * - * @param[in] kernel_name Pointer to a char array with the kernel name - * - * @param[in] info The different possible kernel properties - * - * @param[in] value Pointer to a non-NULL @p uint variable that will - * hold the return value of the kernel property. - * - * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully. - * - * @retval ::ATMI_STATUS_ERROR If @p symbol, @p var_addr or @p var_size are - * invalid - * location in the current node, or if ATMI is not initialized. - * - * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors. - */ -atmi_status_t atmi_interop_hsa_get_kernel_info( - atmi_mem_place_t place, const char *kernel_name, - hsa_executable_symbol_info_t info, uint32_t *value); -/** @} */ - -#ifdef __cplusplus -} -#endif - -#endif // INCLUDE_ATMI_INTEROP_HSA_H_ diff --git a/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp b/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp --- a/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp @@ -3,15 +3,14 @@ * * This file is distributed under the MIT License. See LICENSE.txt for details. *===------------------------------------------------------------------------*/ -#include "atmi_interop_hsa.h" #include "internal.h" using core::atl_is_atmi_initialized; -atmi_status_t atmi_interop_hsa_get_symbol_info(atmi_mem_place_t place, - const char *symbol, - void **var_addr, - unsigned int *var_size) { +atmi_status_t atmi_interop_hsa_get_symbol_info( + std::map &SymbolInfoTable, + atmi_mem_place_t place, const char *symbol, void **var_addr, + unsigned int *var_size) { /* // Typical usage: void *var_addr; @@ -32,9 +31,8 @@ // get the symbol info std::string symbolStr = std::string(symbol); - if (SymbolInfoTable[place.dev_id].find(symbolStr) != - SymbolInfoTable[place.dev_id].end()) { - atl_symbol_info_t info = SymbolInfoTable[place.dev_id][symbolStr]; + if (SymbolInfoTable.find(symbolStr) != SymbolInfoTable.end()) { + atl_symbol_info_t info = SymbolInfoTable[symbolStr]; *var_addr = reinterpret_cast(info.addr); *var_size = info.size; return ATMI_STATUS_SUCCESS; @@ -46,6 +44,7 @@ } atmi_status_t atmi_interop_hsa_get_kernel_info( + std::map &KernelInfoTable, atmi_mem_place_t place, const char *kernel_name, hsa_executable_symbol_info_t kernel_info, uint32_t *value) { /* @@ -68,9 +67,8 @@ atmi_status_t status = ATMI_STATUS_SUCCESS; // get the kernel info std::string kernelStr = std::string(kernel_name); - if (KernelInfoTable[place.dev_id].find(kernelStr) != - KernelInfoTable[place.dev_id].end()) { - atl_kernel_info_t info = KernelInfoTable[place.dev_id][kernelStr]; + if (KernelInfoTable.find(kernelStr) != KernelInfoTable.end()) { + atl_kernel_info_t info = KernelInfoTable[kernelStr]; switch (kernel_info) { case HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE: *value = info.group_segment_size; diff --git a/openmp/libomptarget/plugins/amdgpu/impl/internal.h b/openmp/libomptarget/plugins/amdgpu/impl/internal.h --- a/openmp/libomptarget/plugins/amdgpu/impl/internal.h +++ b/openmp/libomptarget/plugins/amdgpu/impl/internal.h @@ -112,9 +112,6 @@ uint32_t size; } atl_symbol_info_t; -extern std::vector> KernelInfoTable; -extern std::vector> SymbolInfoTable; - // ---------------------- Kernel End ------------- namespace core { diff --git a/openmp/libomptarget/plugins/amdgpu/impl/rt.h b/openmp/libomptarget/plugins/amdgpu/impl/rt.h --- a/openmp/libomptarget/plugins/amdgpu/impl/rt.h +++ b/openmp/libomptarget/plugins/amdgpu/impl/rt.h @@ -54,12 +54,6 @@ static atmi_status_t Finalize(); // machine info static atmi_machine_t *GetMachineInfo(); - // modules - static atmi_status_t RegisterModuleFromMemory( - void *, size_t, atmi_place_t, - atmi_status_t (*on_deserialized_data)(void *data, size_t size, - void *cb_state), - void *cb_state, std::vector &HSAExecutables); // data static atmi_status_t Memcpy(hsa_signal_t, void *, const void *, size_t); diff --git a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp --- a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp @@ -132,9 +132,6 @@ std::vector atl_gpu_kernarg_pools; -std::vector> KernelInfoTable; -std::vector> SymbolInfoTable; - bool g_atmi_initialized = false; bool g_atmi_hostcall_required = false; @@ -201,15 +198,6 @@ atmi_status_t Runtime::Finalize() { hsa_status_t err; - for (uint32_t i = 0; i < SymbolInfoTable.size(); i++) { - SymbolInfoTable[i].clear(); - } - SymbolInfoTable.clear(); - for (uint32_t i = 0; i < KernelInfoTable.size(); i++) { - KernelInfoTable[i].clear(); - } - KernelInfoTable.clear(); - atl_reset_atmi_initialized(); err = hsa_shut_down(); if (err != HSA_STATUS_SUCCESS) { @@ -553,13 +541,6 @@ exit(1); } - int gpu_count = g_atl_machine.processorCount(); - KernelInfoTable.resize(gpu_count); - SymbolInfoTable.resize(gpu_count); - for (uint32_t i = 0; i < SymbolInfoTable.size(); i++) - SymbolInfoTable[i].clear(); - for (uint32_t i = 0; i < KernelInfoTable.size(); i++) - KernelInfoTable[i].clear(); atlc.g_hsa_initialized = true; DEBUG_PRINT("done\n"); } @@ -832,8 +813,9 @@ } } // namespace -static hsa_status_t get_code_object_custom_metadata(void *binary, - size_t binSize, int gpu) { +static hsa_status_t get_code_object_custom_metadata( + std::map &KernelInfoTable, void *binary, + size_t binSize, int gpu) { // parse code object with different keys from v2 // also, the kernel name is not the same as the symbol name -- so a // symbol->name map is needed @@ -997,16 +979,36 @@ kernel_segment_size, info.kernel_segment_size); // kernel received, now add it to the kernel info table - KernelInfoTable[gpu][kernelName] = info; + KernelInfoTable[kernelName] = info; } return HSA_STATUS_SUCCESS; } +struct PopulateInfoTablesContext { + PopulateInfoTablesContext( + std::map &KernelInfo, + std::map &SymbolInfo, int Device) + : KernelInfoTable(KernelInfo), SymbolInfoTable(SymbolInfo), + DeviceId(Device) {} + + PopulateInfoTablesContext() = delete; + PopulateInfoTablesContext(const PopulateInfoTablesContext &) = delete; + ~PopulateInfoTablesContext() = default; + + std::map &KernelInfoTable; + std::map &SymbolInfoTable; + int DeviceId; +}; + static hsa_status_t populate_InfoTables(hsa_executable_t executable, hsa_executable_symbol_t symbol, void *data) { - int gpu = *static_cast(data); + PopulateInfoTablesContext *Ctx = + static_cast(data); + auto &KernelInfoTable = Ctx->KernelInfoTable; + auto &SymbolInfoTable = Ctx->SymbolInfoTable; + int gpu = Ctx->DeviceId; hsa_symbol_kind_t type; uint32_t name_length; @@ -1046,7 +1048,7 @@ // by now, the kernel info table should already have an entry // because the non-ROCr custom code object parsing is called before // iterating over the code object symbols using ROCr - if (KernelInfoTable[gpu].find(kernelName) == KernelInfoTable[gpu].end()) { + if (KernelInfoTable.find(kernelName) == KernelInfoTable.end()) { if (HSA_STATUS_ERROR_INVALID_CODE_OBJECT != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Finding the entry kernel info table", @@ -1055,7 +1057,7 @@ } } // found, so assign and update - info = KernelInfoTable[gpu][kernelName]; + info = KernelInfoTable[kernelName]; /* Extract dispatch information from the symbol */ err = hsa_executable_symbol_get_info( @@ -1093,7 +1095,7 @@ info.private_segment_size, info.kernel_segment_size); // assign it back to the kernel info table - KernelInfoTable[gpu][kernelName] = info; + KernelInfoTable[kernelName] = info; free(name); } else if (type == HSA_SYMBOL_KIND_VARIABLE) { err = hsa_executable_symbol_get_info( @@ -1136,7 +1138,7 @@ info.size); register_allocation(reinterpret_cast(info.addr), (size_t)info.size, place); - SymbolInfoTable[gpu][std::string(name)] = info; + SymbolInfoTable[std::string(name)] = info; if (strcmp(name, "needs_hostcall_buffer") == 0) g_atmi_hostcall_required = true; free(name); @@ -1146,7 +1148,9 @@ return HSA_STATUS_SUCCESS; } -atmi_status_t Runtime::RegisterModuleFromMemory( +atmi_status_t RegisterModuleFromMemory( + std::map &KernelInfoTable, + std::map &SymbolInfoTable, void *module_bytes, size_t module_size, atmi_place_t place, atmi_status_t (*on_deserialized_data)(void *data, size_t size, void *cb_state), @@ -1186,7 +1190,8 @@ // Some metadata info is not available through ROCr API, so use custom // code object metadata parsing to collect such metadata info - err = get_code_object_custom_metadata(module_bytes, module_size, gpu); + err = get_code_object_custom_metadata(KernelInfoTable, module_bytes, + module_size, gpu); if (err != HSA_STATUS_SUCCESS) { DEBUG_PRINT("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Getting custom code object metadata", @@ -1240,8 +1245,9 @@ exit(1); } + PopulateInfoTablesContext Ctx(KernelInfoTable, SymbolInfoTable, gpu); err = hsa_executable_iterate_symbols(executable, populate_InfoTables, - static_cast(&gpu)); + static_cast(&Ctx)); if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Iterating over symbols for execuatable", get_error_string(err)); diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -30,7 +30,6 @@ #include // Header from ATMI interface -#include "atmi_interop_hsa.h" #include "atmi_runtime.h" #include "internal.h" @@ -87,6 +86,25 @@ #include "elf_common.h" +atmi_status_t atmi_interop_hsa_get_symbol_info( + std::map &SymbolInfoTable, + atmi_mem_place_t place, const char *symbol, void **var_addr, + unsigned int *var_size); +atmi_status_t atmi_interop_hsa_get_kernel_info( + std::map &KernelInfoTable, + atmi_mem_place_t place, const char *kernel_name, + hsa_executable_symbol_info_t info, uint32_t *value); + +namespace core { +atmi_status_t RegisterModuleFromMemory( + std::map &KernelInfo, + std::map &SymbolInfoTable, void *, size_t, + atmi_place_t, + atmi_status_t (*on_deserialized_data)(void *data, size_t size, + void *cb_state), + void *cb_state, std::vector &HSAExecutables); +} + /// Keep entries table per device struct FuncOrGblEntryTy { __tgt_target_table Table; @@ -330,6 +348,9 @@ std::vector HSAExecutables; + std::vector> KernelInfoTable; + std::vector> SymbolInfoTable; + struct atmiFreePtrDeletor { void operator()(void *p) { atmi_free(p); // ignore failure to free @@ -473,6 +494,13 @@ NumTeams.resize(NumberOfDevices); NumThreads.resize(NumberOfDevices); deviceStateStore.resize(NumberOfDevices); + KernelInfoTable.resize(NumberOfDevices); + SymbolInfoTable.resize(NumberOfDevices); + + for (uint32_t i = 0; i < SymbolInfoTable.size(); i++) + SymbolInfoTable[i].clear(); + for (uint32_t i = 0; i < KernelInfoTable.size(); i++) + KernelInfoTable[i].clear(); for (int i = 0; i < NumberOfDevices; i++) { uint32_t queue_size = 0; @@ -558,6 +586,8 @@ // atmi_finalize removes access to it deviceStateStore.clear(); KernelArgPoolMap.clear(); + KernelInfoTable.clear(); + SymbolInfoTable.clear(); // Terminate hostrpc before finalizing ATMI hostrpc_terminate(); @@ -988,15 +1018,17 @@ template atmi_status_t module_register_from_memory_to_place( + std::map &KernelInfoTable, + std::map &SymbolInfoTable, void *module_bytes, size_t module_size, atmi_place_t place, C cb, std::vector &HSAExecutables) { auto L = [](void *data, size_t size, void *cb_state) -> atmi_status_t { C *unwrapped = static_cast(cb_state); return (*unwrapped)(data, size); }; - return core::Runtime::RegisterModuleFromMemory( - module_bytes, module_size, place, L, static_cast(&cb), - HSAExecutables); + return core::RegisterModuleFromMemory( + KernelInfoTable, SymbolInfoTable, module_bytes, module_size, place, L, + static_cast(&cb), HSAExecutables); } } // namespace @@ -1112,11 +1144,12 @@ DP("Setting global device environment after load (%u bytes)\n", si.size); int device_id = host_device_env.device_num; - + auto &SymbolInfo = DeviceInfo.SymbolInfoTable[device_id]; void *state_ptr; uint32_t state_ptr_size; atmi_status_t err = atmi_interop_hsa_get_symbol_info( - get_gpu_mem_place(device_id), sym(), &state_ptr, &state_ptr_size); + SymbolInfo, get_gpu_mem_place(device_id), sym(), &state_ptr, + &state_ptr_size); if (err != ATMI_STATUS_SUCCESS) { DP("failed to find %s in loaded image\n", sym()); return err; @@ -1195,8 +1228,11 @@ auto env = device_environment(device_id, DeviceInfo.NumberOfDevices, image, img_size); + auto &KernelInfo = DeviceInfo.KernelInfoTable[device_id]; + auto &SymbolInfo = DeviceInfo.SymbolInfoTable[device_id]; atmi_status_t err = module_register_from_memory_to_place( - (void *)image->ImageStart, img_size, get_gpu_place(device_id), + KernelInfo, SymbolInfo, (void *)image->ImageStart, img_size, + get_gpu_place(device_id), [&](void *data, size_t size) { return env.before_loading(data, size); }, DeviceInfo.HSAExecutables); @@ -1225,9 +1261,10 @@ void *state_ptr; uint32_t state_ptr_size; + auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[device_id]; atmi_status_t err = atmi_interop_hsa_get_symbol_info( - get_gpu_mem_place(device_id), "omptarget_nvptx_device_State", - &state_ptr, &state_ptr_size); + SymbolInfoMap, get_gpu_mem_place(device_id), + "omptarget_nvptx_device_State", &state_ptr, &state_ptr_size); if (err != ATMI_STATUS_SUCCESS) { DP("No device_state symbol found, skipping initialization\n"); @@ -1309,8 +1346,10 @@ void *varptr; uint32_t varsize; + auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[device_id]; atmi_status_t err = atmi_interop_hsa_get_symbol_info( - get_gpu_mem_place(device_id), e->name, &varptr, &varsize); + SymbolInfoMap, get_gpu_mem_place(device_id), e->name, &varptr, + &varsize); if (err != ATMI_STATUS_SUCCESS) { // Inform the user what symbol prevented offloading @@ -1351,8 +1390,10 @@ atmi_mem_place_t place = get_gpu_mem_place(device_id); uint32_t kernarg_segment_size; + auto &KernelInfoMap = DeviceInfo.KernelInfoTable[device_id]; atmi_status_t err = atmi_interop_hsa_get_kernel_info( - place, e->name, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, + KernelInfoMap, place, e->name, + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &kernarg_segment_size); // each arg is a void * in this openmp implementation @@ -1780,6 +1821,7 @@ KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr; std::string kernel_name = std::string(KernelInfo->Name); + auto &KernelInfoTable = DeviceInfo.KernelInfoTable; if (KernelInfoTable[device_id].find(kernel_name) == KernelInfoTable[device_id].end()) { DP("Kernel %s not found\n", kernel_name.c_str());