diff --git a/openmp/libomptarget/plugins/amdgpu/impl/internal.h b/openmp/libomptarget/plugins/amdgpu/impl/internal.h --- a/openmp/libomptarget/plugins/amdgpu/impl/internal.h +++ b/openmp/libomptarget/plugins/amdgpu/impl/internal.h @@ -181,8 +181,6 @@ }; }; -extern std::vector<hsa_amd_memory_pool_t> atl_gpu_kernarg_pools; - namespace core { hsa_status_t atl_init_gpu_context(); diff --git a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp --- a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp @@ -142,8 +142,6 @@ ATLMachine g_atl_machine; -std::vector<hsa_amd_memory_pool_t> atl_gpu_kernarg_pools; - /* atlc is all internal global values. The structure atl_context_t is defined in atl_internal.h @@ -198,10 +196,6 @@ if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & global_flag) { ATLMemory new_mem(memory_pool, *proc, ATMI_MEMTYPE_FINE_GRAINED); proc->addMemory(new_mem); - if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & global_flag) { - DEBUG_PRINT("GPU kernel args pool handle: %lu\n", memory_pool.handle); - atl_gpu_kernarg_pools.push_back(memory_pool); - } } else { ATLMemory new_mem(memory_pool, *proc, ATMI_MEMTYPE_COARSE_GRAINED); proc->addMemory(new_mem); diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -37,6 +37,7 @@ #include "Debug.h" #include "get_elf_mach_gfx_name.h" +#include "machine.h" #include "omptargetplugin.h" #include "print_tracing.h" @@ -71,6 +72,7 @@ } int print_kernel_trace; +extern ATLMachine g_atl_machine; #ifdef OMPTARGET_DEBUG #define check(msg, status) \ @@ -136,15 +138,15 @@ KernelArgPool(const KernelArgPool &) = delete; KernelArgPool(KernelArgPool &&) = delete; - KernelArgPool(uint32_t kernarg_segment_size) + KernelArgPool(uint32_t kernarg_segment_size, + hsa_amd_memory_pool_t &memory_pool) : 
kernarg_segment_size(kernarg_segment_size) { // atmi uses one pool per kernel for all gpus, with a fixed upper size // preserving that exact scheme here, including the queue hsa_status_t err = hsa_amd_memory_pool_allocate( - atl_gpu_kernarg_pools[0], - kernarg_size_including_implicit() * MAX_NUM_KERNELS, 0, + memory_pool, kernarg_size_including_implicit() * MAX_NUM_KERNELS, 0, &kernarg_region); if (err != HSA_STATUS_SUCCESS) { @@ -224,7 +226,8 @@ KernelTy(int8_t _ExecutionMode, int16_t _ConstWGSize, int32_t _device_id, void *_CallStackAddr, const char *_Name, - uint32_t _kernarg_segment_size) + uint32_t _kernarg_segment_size, + hsa_amd_memory_pool_t &KernArgMemoryPool) : ExecutionMode(_ExecutionMode), ConstWGSize(_ConstWGSize), device_id(_device_id), CallStackAddr(_CallStackAddr), Name(_Name) { DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode); @@ -232,8 +235,8 @@ std::string N(_Name); if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) { KernelArgPoolMap.insert( - std::make_pair(N, std::unique_ptr<KernelArgPool>( - new KernelArgPool(_kernarg_segment_size)))); + std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool( + _kernarg_segment_size, KernArgMemoryPool)))); } } }; @@ -297,6 +300,61 @@ header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; return header; } + +hsa_status_t addKernArgPool(hsa_amd_memory_pool_t MemoryPool, void *Data) { + std::vector<hsa_amd_memory_pool_t> *Result = + static_cast<std::vector<hsa_amd_memory_pool_t> *>(Data); + bool AllocAllowed = false; + hsa_status_t err = hsa_amd_memory_pool_get_info( + MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, + &AllocAllowed); + if (err != HSA_STATUS_SUCCESS) { + fprintf(stderr, "Alloc allowed in memory pool check failed: %s\n", + get_error_string(err)); + return err; + } + + if (!AllocAllowed) { + // nothing needs to be done here. 
+ return HSA_STATUS_SUCCESS; + } + + uint32_t GlobalFlags = 0; + err = hsa_amd_memory_pool_get_info( + MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags); + if (err != HSA_STATUS_SUCCESS) { + fprintf(stderr, "Get memory pool info failed: %s\n", get_error_string(err)); + return err; + } + + fprintf(stderr, "Flags : %d\n", GlobalFlags); + if ((GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) && + (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT)) { + Result->push_back(MemoryPool); + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t +CollectKernArgPools(const std::vector<hsa_agent_t> &HSAAgents, + std::vector<hsa_amd_memory_pool_t> &KernArgPools) { + + for (const auto &processor : g_atl_machine.processors<ATLGPUProcessor>()) { + hsa_agent_t Agent = processor.agent(); + hsa_status_t err = HSA_STATUS_SUCCESS; + err = hsa_amd_agent_iterate_memory_pools( + Agent, addKernArgPool, static_cast<void *>(&KernArgPools)); + if (err != HSA_STATUS_SUCCESS) { + printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, + "Iterate all memory pools", get_error_string(err)); + return err; + } + } + + return HSA_STATUS_SUCCESS; +} + } // namespace } // namespace core @@ -344,6 +402,8 @@ std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable; std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable; + std::vector<hsa_amd_memory_pool_t> KernArgPools; + struct atmiFreePtrDeletor { void operator()(void *p) { core::Runtime::Memfree(p); // ignore failure to free @@ -390,6 +450,30 @@ return freesignalpool_memcpy(dest, src, size, atmi_memcpy_h2d, deviceId); } + // selects the first non-zero sized kernarg pool. + // TODO: should this take care of the device topology? 
+ std::pair<hsa_status_t, hsa_amd_memory_pool_t> getKernArgPool() { + hsa_status_t err; + hsa_amd_memory_pool_t result; + for (int i = 0; i < KernArgPools.size(); i++) { + size_t size = 0; + hsa_amd_memory_pool_t &pool = KernArgPools[i]; + err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SIZE, + &size); + if (err != HSA_STATUS_SUCCESS) { + fprintf(stderr, "Failed to get memory pool size: %s\n", get_error_string(err)); + return {err, pool}; + } + if (size > 0) { + DP("Using %d kernarg pool (size = %lu bytes) out of %d pools\n", i, + size, KernArgPools.size()); + return {HSA_STATUS_SUCCESS, pool}; + } + } + + return {HSA_STATUS_ERROR, result}; + } + // Record entry point associated with device void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) { assert(device_id < (int32_t)FuncGblEntries.size() && @@ -477,6 +561,13 @@ DP("There are %d devices supporting HSA.\n", NumberOfDevices); } + // collect memory pools with kernarg_init flag set + err = core::CollectKernArgPools(HSAAgents, KernArgPools); + if (err != HSA_STATUS_SUCCESS) { + DP("Error when reading memory pools\n"); + return; + } + // Init the device info HSAQueues.resize(NumberOfDevices); FuncGblEntries.resize(NumberOfDevices); @@ -1261,6 +1352,14 @@ } DP("ATMI module successfully loaded!\n"); + hsa_amd_memory_pool_t KernArgPool; + hsa_status_t Err; + std::tie(Err, KernArgPool) = DeviceInfo.getKernArgPool(); + if (Err != HSA_STATUS_SUCCESS) { + fprintf(stderr, "Unable to find any kernarg pool: %s\n", + get_error_string(Err)); + return NULL; + } { // the device_State array is either large value in bss or a void* that @@ -1543,8 +1642,8 @@ } KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, device_id, - CallStackAddr, e->name, - kernarg_segment_size)); + CallStackAddr, e->name, kernarg_segment_size, + KernArgPool)); __tgt_offload_entry entry = *e; entry.addr = (void *)&KernelsList.back(); DeviceInfo.addOffloadEntry(device_id, entry);