diff --git a/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp b/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp --- a/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp @@ -25,9 +25,6 @@ if (!symbol || !var_addr || !var_size) return HSA_STATUS_ERROR; - if (DeviceId < 0 || - DeviceId >= g_atl_machine.processors().size()) - return HSA_STATUS_ERROR; // get the symbol info std::string symbolStr = std::string(symbol); @@ -58,9 +55,6 @@ if (!kernel_name || !value) return HSA_STATUS_ERROR; - if (DeviceId < 0 || - DeviceId >= g_atl_machine.processors().size()) - return HSA_STATUS_ERROR; hsa_status_t status = HSA_STATUS_SUCCESS; // get the kernel info diff --git a/openmp/libomptarget/plugins/amdgpu/impl/machine.h b/openmp/libomptarget/plugins/amdgpu/impl/machine.h --- a/openmp/libomptarget/plugins/amdgpu/impl/machine.h +++ b/openmp/libomptarget/plugins/amdgpu/impl/machine.h @@ -66,9 +66,6 @@ } template void addProcessor(const T &p); template std::vector &processors(); - template size_t processorCount() { - return processors().size(); - } private: std::vector cpu_processors_; @@ -78,14 +75,4 @@ hsa_amd_memory_pool_t get_memory_pool(const ATLProcessor &proc, const int mem_id); -extern ATLMachine g_atl_machine; -template T &get_processor(int dev_id) { - if (dev_id == -1) { - // user is asking runtime to pick a device - // best device of this type? pick 0 for now - dev_id = 0; - } - return g_atl_machine.processors()[dev_id]; -} - #endif // SRC_RUNTIME_INCLUDE_MACHINE_H_ diff --git a/openmp/libomptarget/plugins/amdgpu/impl/rt.h b/openmp/libomptarget/plugins/amdgpu/impl/rt.h --- a/openmp/libomptarget/plugins/amdgpu/impl/rt.h +++ b/openmp/libomptarget/plugins/amdgpu/impl/rt.h @@ -51,7 +51,7 @@ // modules static hsa_status_t RegisterModuleFromMemory( - void *, size_t, int DeviceId, + void *, size_t, hsa_agent_t agent, hsa_status_t (*on_deserialized_data)(void *data, size_t size, void *cb_state), void *cb_state, std::vector &HSAExecutables); diff --git a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp --- a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp @@ -703,7 +703,7 @@ } // namespace static hsa_status_t get_code_object_custom_metadata( - void *binary, size_t binSize, int gpu, + void *binary, size_t binSize, std::map &KernelInfoTable) { // parse code object with different keys from v2 // also, the kernel name is not the same as the symbol name -- so a @@ -878,7 +878,7 @@ } static hsa_status_t -populate_InfoTables(hsa_executable_symbol_t symbol, int gpu, +populate_InfoTables(hsa_executable_symbol_t symbol, std::map &KernelInfoTable, std::map &SymbolInfoTable) { hsa_symbol_kind_t type; @@ -1020,16 +1020,11 @@ hsa_status_t RegisterModuleFromMemory( std::map &KernelInfoTable, std::map &SymbolInfoTable, - void *module_bytes, size_t module_size, int gpu, + void *module_bytes, size_t module_size, hsa_agent_t agent, hsa_status_t (*on_deserialized_data)(void *data, size_t size, void *cb_state), void *cb_state, std::vector &HSAExecutables) { hsa_status_t err; - assert(gpu >= 0); - - DEBUG_PRINT("Trying to load module to GPU-%d\n", gpu); - ATLGPUProcessor &proc = get_processor(gpu); - hsa_agent_t agent = proc.agent(); hsa_executable_t executable = {0}; hsa_profile_t agent_profile; @@ -1058,7 +1053,7 @@ // Some metadata info is not available through ROCr API, so use custom // code object metadata parsing to collect such metadata info - err = get_code_object_custom_metadata(module_bytes, module_size, gpu, + err = get_code_object_custom_metadata(module_bytes, module_size, KernelInfoTable); if (err != HSA_STATUS_SUCCESS) { DEBUG_PRINT("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, @@ -1116,8 +1111,7 @@ err = hsa::executable_iterate_symbols( executable, [&](hsa_executable_t, hsa_executable_symbol_t symbol) -> hsa_status_t { - return populate_InfoTables(symbol, gpu, KernelInfoTable, - SymbolInfoTable); + return populate_InfoTables(symbol, KernelInfoTable, SymbolInfoTable); }); if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -41,6 +41,8 @@ #include "omptargetplugin.h" #include "print_tracing.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" #include "llvm/Frontend/OpenMP/OMPGridValues.h" #ifndef TARGET_NAME @@ -91,7 +93,7 @@ hsa_status_t RegisterModuleFromMemory( std::map &KernelInfo, std::map &SymbolInfoTable, void *, size_t, - int DeviceId, + hsa_agent_t agent, hsa_status_t (*on_deserialized_data)(void *data, size_t size, void *cb_state), void *cb_state, std::vector &HSAExecutables); @@ -244,13 +246,14 @@ /// FIXME: we may need this to be per device and per library. std::list KernelsList; -static std::vector find_gpu_agents() { - std::vector res; +typedef std::pair DeviceTypeAgentPair; + +static llvm::Optional> FindAgents() { + std::vector res; hsa_status_t err = hsa_iterate_agents( [](hsa_agent_t agent, void *data) -> hsa_status_t { - std::vector *res = - static_cast *>(data); + auto *res = static_cast *>(data); hsa_device_type_t device_type; // get_info fails iff HSA runtime not yet initialized @@ -261,16 +264,19 @@ assert(err == HSA_STATUS_SUCCESS); if (device_type == HSA_DEVICE_TYPE_GPU) { - res->push_back(agent); + res->push_back({HSA_DEVICE_TYPE_GPU, agent}); + } else if (device_type == HSA_DEVICE_TYPE_CPU) { + res->push_back({HSA_DEVICE_TYPE_CPU, agent}); } return HSA_STATUS_SUCCESS; }, &res); // iterate_agents fails iff HSA runtime not yet initialized - if (print_kernel_trace > 0 && err != HSA_STATUS_SUCCESS) + if (print_kernel_trace > 0 && err != HSA_STATUS_SUCCESS) { printf("rtl.cpp: err %d\n", err); - assert(err == HSA_STATUS_SUCCESS); + return llvm::None; + } return res; } @@ -347,8 +353,7 @@ std::pair FindKernargPool(const std::vector &HSAAgents) { std::vector KernArgPools; - for (const auto &processor : g_atl_machine.processors()) { - hsa_agent_t Agent = processor.agent(); + for (const auto &Agent : HSAAgents) { hsa_status_t err = HSA_STATUS_SUCCESS; err = hsa_amd_agent_iterate_memory_pools( Agent, addKernArgPool, static_cast(&KernArgPools)); @@ -385,6 +390,9 @@ std::vector HSAAgents; std::vector HSAQueues; // one per gpu + // CPUs + std::vector CPUAgents; + // Device properties std::vector ComputeUnits; std::vector GroupsPerDevice; @@ -539,7 +547,20 @@ // Init hostcall soon after initializing ATMI hostrpc_init(); - HSAAgents = find_gpu_agents(); + llvm::Optional> TypeAgentPairs = + FindAgents(); + if (!TypeAgentPairs) + return; + + std::vector AllAgents; + for (const auto &TypeAgentPair : *TypeAgentPairs) { + if (TypeAgentPair.first == HSA_DEVICE_TYPE_GPU) { + HSAAgents.push_back(TypeAgentPair.second); + } else { + CPUAgents.push_back(TypeAgentPair.second); + } + AllAgents.push_back(TypeAgentPair.second); + } NumberOfDevices = (int)HSAAgents.size(); if (NumberOfDevices == 0) { @@ -548,8 +569,7 @@ } else { DP("There are %d devices supporting HSA.\n", NumberOfDevices); } - - std::tie(err, KernArgPool) = core::FindKernargPool(HSAAgents); + std::tie(err, KernArgPool) = core::FindKernargPool(AllAgents); if (err != HSA_STATUS_SUCCESS) { DP("Error when reading memory pools\n"); return; @@ -1105,8 +1125,9 @@ return (*unwrapped)(data, size); }; return core::RegisterModuleFromMemory( - KernelInfoTable, SymbolInfoTable, module_bytes, module_size, DeviceId, L, - static_cast(&cb), HSAExecutables); + KernelInfoTable, SymbolInfoTable, module_bytes, module_size, + DeviceInfo.HSAAgents[DeviceId], L, static_cast(&cb), + HSAExecutables); } } // namespace