diff --git a/openmp/libomptarget/plugins/amdgpu/impl/internal.h b/openmp/libomptarget/plugins/amdgpu/impl/internal.h
--- a/openmp/libomptarget/plugins/amdgpu/impl/internal.h
+++ b/openmp/libomptarget/plugins/amdgpu/impl/internal.h
@@ -181,8 +181,6 @@
   };
 };
 
-extern std::vector<hsa_amd_memory_pool_t> atl_gpu_kernarg_pools;
-
 namespace core {
 hsa_status_t atl_init_gpu_context();
 
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
--- a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
@@ -142,8 +142,6 @@
 
 ATLMachine g_atl_machine;
 
-std::vector<hsa_amd_memory_pool_t> atl_gpu_kernarg_pools;
-
 /* atlc is all internal global values.
    The structure atl_context_t is defined in atl_internal.h
@@ -198,10 +196,6 @@
     if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & global_flag) {
       ATLMemory new_mem(memory_pool, *proc, ATMI_MEMTYPE_FINE_GRAINED);
       proc->addMemory(new_mem);
-      if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & global_flag) {
-        DEBUG_PRINT("GPU kernel args pool handle: %lu\n", memory_pool.handle);
-        atl_gpu_kernarg_pools.push_back(memory_pool);
-      }
     } else {
       ATLMemory new_mem(memory_pool, *proc, ATMI_MEMTYPE_COARSE_GRAINED);
       proc->addMemory(new_mem);
diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -37,6 +37,7 @@
 
 #include "Debug.h"
 #include "get_elf_mach_gfx_name.h"
+#include "machine.h"
 #include "omptargetplugin.h"
 #include "print_tracing.h"
 
@@ -136,15 +137,15 @@
   KernelArgPool(const KernelArgPool &) = delete;
   KernelArgPool(KernelArgPool &&) = delete;
 
-  KernelArgPool(uint32_t kernarg_segment_size)
+  KernelArgPool(uint32_t kernarg_segment_size,
+                hsa_amd_memory_pool_t &memory_pool)
       : kernarg_segment_size(kernarg_segment_size) {
 
     // atmi uses one pool per kernel for all gpus, with a fixed upper size
     // preserving that exact scheme here, including the queue
     hsa_status_t err = hsa_amd_memory_pool_allocate(
-        atl_gpu_kernarg_pools[0],
-        kernarg_size_including_implicit() * MAX_NUM_KERNELS, 0,
+        memory_pool, kernarg_size_including_implicit() * MAX_NUM_KERNELS, 0,
         &kernarg_region);
 
     if (err != HSA_STATUS_SUCCESS) {
@@ -224,7 +225,8 @@
 
   KernelTy(int8_t _ExecutionMode, int16_t _ConstWGSize, int32_t _device_id,
            void *_CallStackAddr, const char *_Name,
-           uint32_t _kernarg_segment_size)
+           uint32_t _kernarg_segment_size,
+           hsa_amd_memory_pool_t &KernArgMemoryPool)
       : ExecutionMode(_ExecutionMode), ConstWGSize(_ConstWGSize),
         device_id(_device_id), CallStackAddr(_CallStackAddr), Name(_Name) {
     DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);
@@ -232,8 +234,8 @@
     std::string N(_Name);
     if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) {
       KernelArgPoolMap.insert(
-          std::make_pair(N, std::unique_ptr<KernelArgPool>(
-                                new KernelArgPool(_kernarg_segment_size))));
+          std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
+                                _kernarg_segment_size, KernArgMemoryPool))));
     }
   }
 };
@@ -297,6 +299,74 @@
   header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
   return header;
 }
+
+hsa_status_t addKernArgPool(hsa_amd_memory_pool_t MemoryPool, void *Data) {
+  std::vector<hsa_amd_memory_pool_t> *Result =
+      static_cast<std::vector<hsa_amd_memory_pool_t> *>(Data);
+  bool AllocAllowed = false;
+  hsa_status_t err = hsa_amd_memory_pool_get_info(
+      MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
+      &AllocAllowed);
+  if (err != HSA_STATUS_SUCCESS) {
+    fprintf(stderr, "Alloc allowed in memory pool check failed: %s\n",
+            get_error_string(err));
+    return err;
+  }
+
+  if (!AllocAllowed) {
+    // Nothing needs to be done here.
+    return HSA_STATUS_SUCCESS;
+  }
+
+  uint32_t GlobalFlags = 0;
+  err = hsa_amd_memory_pool_get_info(
+      MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
+  if (err != HSA_STATUS_SUCCESS) {
+    fprintf(stderr, "Get memory pool info failed: %s\n", get_error_string(err));
+    return err;
+  }
+
+  fprintf(stderr, "Flags : %u\n", GlobalFlags);
+  if ((GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) &&
+      (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT)) {
+    size_t size = 0;
+    err = hsa_amd_memory_pool_get_info(MemoryPool,
+                                       HSA_AMD_MEMORY_POOL_INFO_SIZE, &size);
+    if (err != HSA_STATUS_SUCCESS) {
+      fprintf(stderr, "Get memory pool size failed: %s\n",
+              get_error_string(err));
+      return err;
+    }
+    if (size > 0)
+      Result->push_back(MemoryPool);
+  }
+
+  return HSA_STATUS_SUCCESS;
+}
+
+std::pair<hsa_status_t, hsa_amd_memory_pool_t>
+FindKernargPool(const std::vector<hsa_agent_t> &HSAAgents) {
+  std::vector<hsa_amd_memory_pool_t> KernArgPools;
+  for (const auto &processor : g_atl_machine.processors<ATLGPUProcessor>()) {
+    hsa_agent_t Agent = processor.agent();
+    hsa_status_t err = HSA_STATUS_SUCCESS;
+    err = hsa_amd_agent_iterate_memory_pools(
+        Agent, addKernArgPool, static_cast<void *>(&KernArgPools));
+    if (err != HSA_STATUS_SUCCESS) {
+      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
+             "Iterate all memory pools", get_error_string(err));
+      return {err, hsa_amd_memory_pool_t{}};
+    }
+  }
+
+  if (KernArgPools.empty()) {
+    fprintf(stderr, "Unable to find any valid kernarg pool\n");
+    return {HSA_STATUS_ERROR, hsa_amd_memory_pool_t{}};
+  }
+
+  return {HSA_STATUS_SUCCESS, KernArgPools[0]};
+}
+
 } // namespace
 } // namespace core
 
@@ -344,6 +414,8 @@
   std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable;
   std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable;
 
+  hsa_amd_memory_pool_t KernArgPool;
+
   struct atmiFreePtrDeletor {
     void operator()(void *p) {
       core::Runtime::Memfree(p); // ignore failure to free
@@ -477,6 +549,12 @@
       DP("There are %d devices supporting HSA.\n", NumberOfDevices);
     }
 
+    std::tie(err, KernArgPool) = core::FindKernargPool(HSAAgents);
+    if (err != HSA_STATUS_SUCCESS) {
+      DP("Error when reading memory pools\n");
+      return;
+    }
+
     // Init the device info
     HSAQueues.resize(NumberOfDevices);
     FuncGblEntries.resize(NumberOfDevices);
@@ -1543,8 +1621,8 @@
     }
 
     KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, device_id,
-                                   CallStackAddr, e->name,
-                                   kernarg_segment_size));
+                                   CallStackAddr, e->name, kernarg_segment_size,
+                                   DeviceInfo.KernArgPool));
     __tgt_offload_entry entry = *e;
     entry.addr = (void *)&KernelsList.back();
     DeviceInfo.addOffloadEntry(device_id, entry);
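
Standalone sketch of the discovery pattern above, for anyone who wants to poke at pool selection outside the plugin. It is not part of the patch: the helper names (collectKernargPools, perAgent) are invented here, error handling is abbreviated, and the includes assume a stock ROCm install providing hsa.h and hsa_ext_amd.h. The callback applies the same RUNTIME_ALLOC_ALLOWED / FINE_GRAINED / KERNARG_INIT checks as addKernArgPool:

    #include <hsa.h>
    #include <hsa_ext_amd.h>

    #include <cstdio>
    #include <vector>

    namespace {
    // Invoked once per memory pool of an agent; collects pools that allow
    // runtime allocation and carry both the fine-grained and kernarg flags.
    hsa_status_t collectKernargPools(hsa_amd_memory_pool_t Pool, void *Data) {
      auto *Out = static_cast<std::vector<hsa_amd_memory_pool_t> *>(Data);

      bool AllocAllowed = false;
      hsa_status_t Err = hsa_amd_memory_pool_get_info(
          Pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &AllocAllowed);
      if (Err != HSA_STATUS_SUCCESS)
        return Err;
      if (!AllocAllowed)
        return HSA_STATUS_SUCCESS; // skip pools we cannot allocate from

      uint32_t Flags = 0;
      Err = hsa_amd_memory_pool_get_info(
          Pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &Flags);
      if (Err != HSA_STATUS_SUCCESS)
        return Err;

      if ((Flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) &&
          (Flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT))
        Out->push_back(Pool);
      return HSA_STATUS_SUCCESS;
    }

    // Walk every pool of one agent with the collector above.
    hsa_status_t perAgent(hsa_agent_t Agent, void *Data) {
      return hsa_amd_agent_iterate_memory_pools(Agent, collectKernargPools,
                                                Data);
    }
    } // namespace

    int main() {
      if (hsa_init() != HSA_STATUS_SUCCESS)
        return 1;

      std::vector<hsa_amd_memory_pool_t> Pools;
      if (hsa_iterate_agents(perAgent, &Pools) == HSA_STATUS_SUCCESS)
        printf("found %zu kernarg-capable pool(s)\n", Pools.size());

      hsa_shut_down();
      return 0;
    }

Build against the ROCm runtime, e.g. g++ pools.cpp -I/opt/rocm/include/hsa -L/opt/rocm/lib -lhsa-runtime64 (paths vary by install).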
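
On the consumer side, once FindKernargPool has handed a pool back, the new KernelArgPool constructor allocates a single slab sized for MAX_NUM_KERNELS dispatches. The hypothetical helper below mirrors that calculation as a sketch only: the constant values stand in for the plugin's MAX_NUM_KERNELS and sizeof(impl_implicit_args_t), and allocateKernargSlab is not a name from the patch.

    #include <hsa.h>
    #include <hsa_ext_amd.h>

    #include <cstdint>
    #include <cstdio>

    // Illustrative stand-ins; the plugin derives both from its own headers.
    constexpr size_t MaxNumKernels = 1024;
    constexpr uint32_t ImplicitArgBytes = 56; // assumed implicit-args size

    // One slab per kernel, big enough for MaxNumKernels concurrent
    // dispatches, carved out of whichever pool discovery returned.
    void *allocateKernargSlab(hsa_amd_memory_pool_t Pool,
                              uint32_t ExplicitArgBytes) {
      const size_t Total =
          static_cast<size_t>(ExplicitArgBytes + ImplicitArgBytes) *
          MaxNumKernels;

      void *Slab = nullptr;
      hsa_status_t Err =
          hsa_amd_memory_pool_allocate(Pool, Total, /*flags=*/0, &Slab);
      if (Err != HSA_STATUS_SUCCESS) {
        fprintf(stderr, "kernarg slab allocation of %zu bytes failed\n", Total);
        return nullptr;
      }
      // Callers hand out per-dispatch segments from this slab and eventually
      // release it with hsa_amd_memory_pool_free(Slab).
      return Slab;
    }

Threading the pool through the KernelTy and KernelArgPool constructors, rather than reading a global vector, is what lets the patch delete atl_gpu_kernarg_pools from the impl layer entirely.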