diff --git a/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp b/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp --- a/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp @@ -25,9 +25,6 @@ if (!symbol || !var_addr || !var_size) return HSA_STATUS_ERROR; - if (DeviceId < 0 || - DeviceId >= g_atl_machine.processors().size()) - return HSA_STATUS_ERROR; // get the symbol info std::string symbolStr = std::string(symbol); @@ -58,9 +55,6 @@ if (!kernel_name || !value) return HSA_STATUS_ERROR; - if (DeviceId < 0 || - DeviceId >= g_atl_machine.processors().size()) - return HSA_STATUS_ERROR; hsa_status_t status = HSA_STATUS_SUCCESS; // get the kernel info diff --git a/openmp/libomptarget/plugins/amdgpu/impl/machine.h b/openmp/libomptarget/plugins/amdgpu/impl/machine.h --- a/openmp/libomptarget/plugins/amdgpu/impl/machine.h +++ b/openmp/libomptarget/plugins/amdgpu/impl/machine.h @@ -66,9 +66,6 @@ } template void addProcessor(const T &p); template std::vector &processors(); - template size_t processorCount() { - return processors().size(); - } private: std::vector cpu_processors_; @@ -78,14 +75,4 @@ hsa_amd_memory_pool_t get_memory_pool(const ATLProcessor &proc, const int mem_id); -extern ATLMachine g_atl_machine; -template T &get_processor(int dev_id) { - if (dev_id == -1) { - // user is asking runtime to pick a device - // best device of this type? pick 0 for now - dev_id = 0; - } - return g_atl_machine.processors()[dev_id]; -} - #endif // SRC_RUNTIME_INCLUDE_MACHINE_H_ diff --git a/openmp/libomptarget/plugins/amdgpu/impl/rt.h b/openmp/libomptarget/plugins/amdgpu/impl/rt.h --- a/openmp/libomptarget/plugins/amdgpu/impl/rt.h +++ b/openmp/libomptarget/plugins/amdgpu/impl/rt.h @@ -51,7 +51,7 @@ // modules static hsa_status_t RegisterModuleFromMemory( - void *, size_t, int DeviceId, + void *, size_t, hsa_agent_t agent, hsa_status_t (*on_deserialized_data)(void *data, size_t size, void *cb_state), void *cb_state, std::vector &HSAExecutables); diff --git a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp --- a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp @@ -703,7 +703,7 @@ } // namespace static hsa_status_t get_code_object_custom_metadata( - void *binary, size_t binSize, int gpu, + void *binary, size_t binSize, std::map &KernelInfoTable) { // parse code object with different keys from v2 // also, the kernel name is not the same as the symbol name -- so a @@ -878,7 +878,7 @@ } static hsa_status_t -populate_InfoTables(hsa_executable_symbol_t symbol, int gpu, +populate_InfoTables(hsa_executable_symbol_t symbol, std::map &KernelInfoTable, std::map &SymbolInfoTable) { hsa_symbol_kind_t type; @@ -1020,16 +1020,11 @@ hsa_status_t RegisterModuleFromMemory( std::map &KernelInfoTable, std::map &SymbolInfoTable, - void *module_bytes, size_t module_size, int gpu, + void *module_bytes, size_t module_size, hsa_agent_t agent, hsa_status_t (*on_deserialized_data)(void *data, size_t size, void *cb_state), void *cb_state, std::vector &HSAExecutables) { hsa_status_t err; - assert(gpu >= 0); - - DEBUG_PRINT("Trying to load module to GPU-%d\n", gpu); - ATLGPUProcessor &proc = get_processor(gpu); - hsa_agent_t agent = proc.agent(); hsa_executable_t executable = {0}; hsa_profile_t agent_profile; @@ -1058,7 +1053,7 @@ // Some metadata info is not available through ROCr API, so use custom // code object metadata parsing to collect such metadata info - err = get_code_object_custom_metadata(module_bytes, module_size, gpu, + err = get_code_object_custom_metadata(module_bytes, module_size, KernelInfoTable); if (err != HSA_STATUS_SUCCESS) { DEBUG_PRINT("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, @@ -1116,8 +1111,7 @@ err = hsa::executable_iterate_symbols( executable, [&](hsa_executable_t, hsa_executable_symbol_t symbol) -> hsa_status_t { - return populate_InfoTables(symbol, gpu, KernelInfoTable, - SymbolInfoTable); + return populate_InfoTables(symbol, KernelInfoTable, SymbolInfoTable); }); if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -91,12 +91,23 @@ hsa_status_t RegisterModuleFromMemory( std::map &KernelInfo, std::map &SymbolInfoTable, void *, size_t, - int DeviceId, + hsa_agent_t agent, hsa_status_t (*on_deserialized_data)(void *data, size_t size, void *cb_state), void *cb_state, std::vector &HSAExecutables); } +namespace hsa { +template hsa_status_t iterate_agents(C cb) { + auto L = [](hsa_agent_t agent, void *data) -> hsa_status_t { + C *unwrapped = static_cast(data); + return (*unwrapped)(agent); + }; + return hsa_iterate_agents(L, static_cast(&cb)); +} + +} // namespace hsa + /// Keep entries table per device struct FuncOrGblEntryTy { __tgt_target_table Table; @@ -244,14 +255,10 @@ /// FIXME: we may need this to be per device and per library. std::list KernelsList; -static std::vector find_gpu_agents() { - std::vector res; - - hsa_status_t err = hsa_iterate_agents( - [](hsa_agent_t agent, void *data) -> hsa_status_t { - std::vector *res = - static_cast *>(data); +template static hsa_status_t FindAgents(Callback CB) { + hsa_status_t err = + hsa::iterate_agents([&](hsa_agent_t agent) -> hsa_status_t { hsa_device_type_t device_type; // get_info fails iff HSA runtime not yet initialized hsa_status_t err = @@ -260,18 +267,16 @@ printf("rtl.cpp: err %d\n", err); assert(err == HSA_STATUS_SUCCESS); - if (device_type == HSA_DEVICE_TYPE_GPU) { - res->push_back(agent); - } + CB(device_type, agent); return HSA_STATUS_SUCCESS; - }, - &res); + }); // iterate_agents fails iff HSA runtime not yet initialized - if (print_kernel_trace > 0 && err != HSA_STATUS_SUCCESS) + if (print_kernel_trace > 0 && err != HSA_STATUS_SUCCESS) { printf("rtl.cpp: err %d\n", err); - assert(err == HSA_STATUS_SUCCESS); - return res; + } + + return err; } static void callbackQueue(hsa_status_t status, hsa_queue_t *source, @@ -346,8 +351,7 @@ std::pair FindKernargPool(const std::vector &HSAAgents) { std::vector KernArgPools; - for (const auto &processor : g_atl_machine.processors()) { - hsa_agent_t Agent = processor.agent(); + for (const auto &Agent : HSAAgents) { hsa_status_t err = HSA_STATUS_SUCCESS; err = hsa_amd_agent_iterate_memory_pools( Agent, addKernArgPool, static_cast(&KernArgPools)); @@ -384,6 +388,9 @@ std::vector HSAAgents; std::vector HSAQueues; // one per gpu + // CPUs + std::vector CPUAgents; + // Device properties std::vector ComputeUnits; std::vector GroupsPerDevice; @@ -538,7 +545,16 @@ // Init hostcall soon after initializing ATMI hostrpc_init(); - HSAAgents = find_gpu_agents(); + err = FindAgents([&](hsa_device_type_t DeviceType, hsa_agent_t Agent) { + if (DeviceType == HSA_DEVICE_TYPE_CPU) { + CPUAgents.push_back(Agent); + } else { + HSAAgents.push_back(Agent); + } + }); + if (err != HSA_STATUS_SUCCESS) + return; + NumberOfDevices = (int)HSAAgents.size(); if (NumberOfDevices == 0) { @@ -547,8 +563,7 @@ } else { DP("There are %d devices supporting HSA.\n", NumberOfDevices); } - - std::tie(err, KernArgPool) = core::FindKernargPool(HSAAgents); + std::tie(err, KernArgPool) = core::FindKernargPool(CPUAgents); if (err != HSA_STATUS_SUCCESS) { DP("Error when reading memory pools\n"); return; @@ -1104,8 +1119,9 @@ return (*unwrapped)(data, size); }; return core::RegisterModuleFromMemory( - KernelInfoTable, SymbolInfoTable, module_bytes, module_size, DeviceId, L, - static_cast(&cb), HSAExecutables); + KernelInfoTable, SymbolInfoTable, module_bytes, module_size, + DeviceInfo.HSAAgents[DeviceId], L, static_cast(&cb), + HSAExecutables); } } // namespace