diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -106,6 +106,16 @@
   return hsa_iterate_agents(L, static_cast<void *>(&cb));
 }
 
+// Wrap a C++ callable so it can be passed through the C-style
+// hsa_amd_agent_iterate_memory_pools callback interface.
+template <typename C>
+hsa_status_t amd_agent_iterate_memory_pools(hsa_agent_t Agent, C cb) {
+  auto L = [](hsa_amd_memory_pool_t MemoryPool, void *data) -> hsa_status_t {
+    C *unwrapped = static_cast<C *>(data);
+    return (*unwrapped)(MemoryPool);
+  };
+
+  return hsa_amd_agent_iterate_memory_pools(Agent, L, static_cast<void *>(&cb));
+}
+
 } // namespace hsa
 
 /// Keep entries table per device
@@ -331,18 +341,60 @@
     return err;
   }
 
+  size_t size = 0;
+  err = hsa_amd_memory_pool_get_info(MemoryPool, HSA_AMD_MEMORY_POOL_INFO_SIZE,
+                                     &size);
+  if (err != HSA_STATUS_SUCCESS) {
+    fprintf(stderr, "Get memory pool size failed: %s\n", get_error_string(err));
+    return err;
+  }
+
   if ((GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) &&
-      (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT)) {
-    size_t size = 0;
-    err = hsa_amd_memory_pool_get_info(MemoryPool,
-                                       HSA_AMD_MEMORY_POOL_INFO_SIZE, &size);
-    if (err != HSA_STATUS_SUCCESS) {
-      fprintf(stderr, "Get memory pool size failed: %s\n",
-              get_error_string(err));
-      return err;
+      (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) &&
+      size > 0) {
+    Result->push_back(MemoryPool);
+  }
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Returns {status, alloc-allowed}; a pool is usable only when the runtime
+// reports HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED for it.
+std::pair<hsa_status_t, bool>
+isValidMemoryPool(hsa_amd_memory_pool_t MemoryPool) {
+  bool AllocAllowed = false;
+  hsa_status_t Err = hsa_amd_memory_pool_get_info(
+      MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
+      &AllocAllowed);
+  if (Err != HSA_STATUS_SUCCESS) {
+    fprintf(stderr, "Alloc allowed in memory pool check failed: %s\n",
+            get_error_string(Err));
+    return {Err, false};
+  }
+
+  return {HSA_STATUS_SUCCESS, AllocAllowed};
+}
+
+// Visit every alloc-capable memory pool of every agent, invoking
+// Func(pool, agent-index) for each; stops at the first HSA error.
+template <typename AccumulatorFunc>
+hsa_status_t CollectMemoryPools(const std::vector<hsa_agent_t> &Agents,
+                                AccumulatorFunc Func) {
+  for (int DeviceId = 0; DeviceId < Agents.size(); DeviceId++) {
+    hsa_status_t Err = hsa::amd_agent_iterate_memory_pools(
+        Agents[DeviceId], [&](hsa_amd_memory_pool_t MemoryPool) {
+          hsa_status_t Err;
+          bool Valid = false;
+          std::tie(Err, Valid) = isValidMemoryPool(MemoryPool);
+          if (Err != HSA_STATUS_SUCCESS) {
+            return Err;
+          }
+          if (Valid)
+            Func(MemoryPool, DeviceId);
+          return HSA_STATUS_SUCCESS;
+        });
+
+    if (Err != HSA_STATUS_SUCCESS) {
+      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
+             "Iterate all memory pools", get_error_string(Err));
+      return Err;
     }
-    if (size > 0)
-      Result->push_back(MemoryPool);
   }
   return HSA_STATUS_SUCCESS;
 }
@@ -422,6 +474,14 @@
 
   hsa_amd_memory_pool_t KernArgPool;
 
+  // fine and coarse-grained memory pools per CPU
+  std::vector<hsa_amd_memory_pool_t> HostFineGrainedMemoryPools;
+  std::vector<hsa_amd_memory_pool_t> HostCoarseGrainedMemoryPools;
+
+  // fine and coarse-grained memory pools per offloading device
+  std::vector<hsa_amd_memory_pool_t> DeviceFineGrainedMemoryPools;
+  std::vector<hsa_amd_memory_pool_t> DeviceCoarseGrainedMemoryPools;
+
   struct atmiFreePtrDeletor {
     void operator()(void *p) {
       core::Runtime::Memfree(p); // ignore failure to free
@@ -524,6 +584,82 @@
     E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
   }
 
+  // Record a device-side pool in the fine- or coarse-grained slot for
+  // DeviceId, chosen by the pool's global flags.
+  hsa_status_t AddDeviceMemoryPool(hsa_amd_memory_pool_t MemoryPool,
+                                   int DeviceId) {
+    assert(DeviceId < DeviceFineGrainedMemoryPools.size() && "Error here.");
+    uint32_t GlobalFlags = 0;
+    hsa_status_t Err = hsa_amd_memory_pool_get_info(
+        MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
+
+    if (Err != HSA_STATUS_SUCCESS) {
+      fprintf(stderr, "Get memory pool info failed: %s\n",
+              get_error_string(Err));
+      return Err;
+    }
+
+    if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
+      DeviceFineGrainedMemoryPools[DeviceId] = MemoryPool;
+    } else if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
+      DeviceCoarseGrainedMemoryPools[DeviceId] = MemoryPool;
+    }
+
+    return HSA_STATUS_SUCCESS;
+  }
+
+  // Record a host (CPU agent) pool in the fine- or coarse-grained slot for
+  // DeviceId, chosen by the pool's global flags.
+  hsa_status_t AddHostMemoryPool(hsa_amd_memory_pool_t MemoryPool,
+                                 int DeviceId) {
+    assert(DeviceId < HostFineGrainedMemoryPools.size() && "Error in host");
+    uint32_t GlobalFlags = 0;
+    hsa_status_t Err = hsa_amd_memory_pool_get_info(
+        MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
+
+    if (Err != HSA_STATUS_SUCCESS) {
+      fprintf(stderr, "Get memory pool info failed: %s\n",
+              get_error_string(Err));
+      return Err;
+    }
+
+    if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
+      HostFineGrainedMemoryPools[DeviceId] = MemoryPool;
+    } else if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
+      HostCoarseGrainedMemoryPools[DeviceId] = MemoryPool;
+    }
+    return HSA_STATUS_SUCCESS;
+  }
+
+  // Populate the per-agent pool tables; errors are reported but non-fatal
+  // (the vectors were pre-sized in the constructor).
+  void setupMemoryPools() {
+    hsa_status_t Err;
+    Err = core::CollectMemoryPools(
+        CPUAgents, [&](hsa_amd_memory_pool_t MemoryPool, int DeviceId) {
+          return AddHostMemoryPool(MemoryPool, DeviceId);
+        });
+    if (Err != HSA_STATUS_SUCCESS) {
+      fprintf(stderr, "HSA error in collecting memory pools for CPU: %s\n",
+              get_error_string(Err));
+      return;
+    }
+    Err = core::CollectMemoryPools(
+        HSAAgents, [&](hsa_amd_memory_pool_t MemoryPool, int DeviceId) {
+          return AddDeviceMemoryPool(MemoryPool, DeviceId);
+        });
+    if (Err != HSA_STATUS_SUCCESS) {
+      fprintf(stderr,
+              "HSA error in collecting memory pools for offload devices: %s\n",
+              get_error_string(Err));
+      return;
+    }
+  }
+
+  hsa_amd_memory_pool_t getDeviceMemoryPool(int DeviceId) {
+    assert(DeviceId >= 0 && DeviceId < DeviceCoarseGrainedMemoryPools.size() &&
+           "Invalid device Id");
+    return DeviceCoarseGrainedMemoryPools[DeviceId];
+  }
+
+  // NOTE(review): assumes at least one host fine-grained pool was found;
+  // verify setupMemoryPools() guarantees this before first use.
+  hsa_amd_memory_pool_t getHostMemoryPool() {
+    return HostFineGrainedMemoryPools[0];
+  }
+
   RTLDeviceInfoTy() {
     // LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr
     // anytime. You do not need a debug library build.
@@ -556,6 +692,7 @@
       return;
 
     NumberOfDevices = (int)HSAAgents.size();
+    int NumCPUs = CPUAgents.size();
 
     if (NumberOfDevices == 0) {
       DP("There are no devices supporting HSA.\n");
@@ -582,6 +719,12 @@
     deviceStateStore.resize(NumberOfDevices);
     KernelInfoTable.resize(NumberOfDevices);
     SymbolInfoTable.resize(NumberOfDevices);
+    HostCoarseGrainedMemoryPools.resize(NumCPUs);
+    HostFineGrainedMemoryPools.resize(NumCPUs);
+    DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices);
+    DeviceFineGrainedMemoryPools.resize(NumberOfDevices);
+
+    setupMemoryPools();
 
     for (int i = 0; i < NumberOfDevices; i++) {
       HSAQueues[i] = nullptr;