diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -17,6 +17,7 @@
 #include
 #include
 #include
+#include <functional>
 #include
 #include
 #include
@@ -104,6 +105,19 @@
   return hsa_iterate_agents(L, static_cast<void *>(&cb));
 }
 
+/// Adapter for hsa_amd_agent_iterate_memory_pools that accepts a C++ callable.
+/// \p cb is passed through the void* user-data argument and is called with
+/// each memory pool of \p Agent; its status is handed back to the runtime.
+template <typename C>
+hsa_status_t amd_agent_iterate_memory_pools(hsa_agent_t Agent, C cb) {
+  auto L = [](hsa_amd_memory_pool_t MemoryPool, void *data) -> hsa_status_t {
+    C *unwrapped = static_cast<C *>(data);
+    return (*unwrapped)(MemoryPool);
+  };
+
+  return hsa_amd_agent_iterate_memory_pools(Agent, L, static_cast<void *>(&cb));
+}
+
 } // namespace hsa
 
 /// Keep entries table per device
@@ -329,18 +343,70 @@
     return err;
   }
 
+  size_t size = 0;
+  err = hsa_amd_memory_pool_get_info(MemoryPool, HSA_AMD_MEMORY_POOL_INFO_SIZE,
+                                     &size);
+  if (err != HSA_STATUS_SUCCESS) {
+    fprintf(stderr, "Get memory pool size failed: %s\n", get_error_string(err));
+    return err;
+  }
+
   if ((GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) &&
-      (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT)) {
-    size_t size = 0;
-    err = hsa_amd_memory_pool_get_info(MemoryPool,
-                                       HSA_AMD_MEMORY_POOL_INFO_SIZE, &size);
-    if (err != HSA_STATUS_SUCCESS) {
-      fprintf(stderr, "Get memory pool size failed: %s\n",
-              get_error_string(err));
-      return err;
+      (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) &&
+      size > 0) {
+    Result->push_back(MemoryPool);
+  }
+
+  return HSA_STATUS_SUCCESS;
+}
+
+/// Query whether the runtime permits allocations from \p MemoryPool.
+/// \returns the query status and, on success, the value of
+/// HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED (false on failure).
+std::pair<hsa_status_t, bool>
+isValidMemoryPool(hsa_amd_memory_pool_t MemoryPool) {
+  bool AllocAllowed = false;
+  hsa_status_t Err = hsa_amd_memory_pool_get_info(
+      MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
+      &AllocAllowed);
+  if (Err != HSA_STATUS_SUCCESS) {
+    fprintf(stderr, "Alloc allowed in memory pool check failed: %s\n",
+            get_error_string(Err));
+    return {Err, false};
+  }
+
+  return {HSA_STATUS_SUCCESS, AllocAllowed};
+}
+
+/// Call \p Func(MemoryPool, DeviceId) for every pool of every agent in
+/// \p Agents that allows runtime allocation; DeviceId is the agent's index in
+/// \p Agents. A failing status from the pool query or from \p Func is handed
+/// back to the HSA iteration, and the first failure the iteration reports is
+/// logged and returned.
+template <typename AccumulatorFunc>
+hsa_status_t collectMemoryPools(const std::vector<hsa_agent_t> &Agents,
+                                AccumulatorFunc Func) {
+  for (int DeviceId = 0; DeviceId < Agents.size(); DeviceId++) {
+    hsa_status_t Err = hsa::amd_agent_iterate_memory_pools(
+        Agents[DeviceId], [&](hsa_amd_memory_pool_t MemoryPool) {
+          hsa_status_t Err;
+          bool Valid = false;
+          std::tie(Err, Valid) = isValidMemoryPool(MemoryPool);
+          if (Err != HSA_STATUS_SUCCESS) {
+            return Err;
+          }
+          // Pools that cannot be allocated from are skipped, not errors.
+          if (!Valid)
+            return HSA_STATUS_SUCCESS;
+          // Propagate the accumulator's status instead of discarding it.
+          return Func(MemoryPool, DeviceId);
+        });
+
+    if (Err != HSA_STATUS_SUCCESS) {
+      fprintf(stderr, "[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
+              "Iterate all memory pools", get_error_string(Err));
+      return Err;
     }
-    if (size > 0)
-      Result->push_back(MemoryPool);
   }
 
   return HSA_STATUS_SUCCESS;
@@ -421,6 +487,13 @@
 
   hsa_amd_memory_pool_t KernArgPool;
 
+  // fine grained memory pool for host allocations
+  hsa_amd_memory_pool_t HostFineGrainedMemoryPool;
+
+  // fine and coarse-grained memory pools per offloading device
+  std::vector<hsa_amd_memory_pool_t> DeviceFineGrainedMemoryPools;
+  std::vector<hsa_amd_memory_pool_t> DeviceCoarseGrainedMemoryPools;
+
   struct atmiFreePtrDeletor {
     void operator()(void *p) {
       core::Runtime::Memfree(p); // ignore failure to free
@@ -523,6 +596,95 @@
     E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
   }
 
+  /// Record \p MemoryPool as the fine- or coarse-grained pool of device
+  /// \p DeviceId, depending on its global flags; a later pool of the same
+  /// granularity overwrites an earlier one. Pools with neither flag are
+  /// ignored.
+  hsa_status_t addDeviceMemoryPool(hsa_amd_memory_pool_t MemoryPool,
+                                   int DeviceId) {
+    assert(DeviceId < DeviceFineGrainedMemoryPools.size() && "Error here.");
+    uint32_t GlobalFlags = 0;
+    hsa_status_t Err = hsa_amd_memory_pool_get_info(
+        MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
+
+    if (Err != HSA_STATUS_SUCCESS) {
+      return Err;
+    }
+
+    if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
+      DeviceFineGrainedMemoryPools[DeviceId] = MemoryPool;
+    } else if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
+      DeviceCoarseGrainedMemoryPools[DeviceId] = MemoryPool;
+    }
+
+    return HSA_STATUS_SUCCESS;
+  }
+
+  /// Remember \p MemoryPool as the host pool if it is fine-grained and
+  /// non-empty. \p DeviceId is unused; it is present so the signature matches
+  /// what core::collectMemoryPools calls.
+  hsa_status_t addHostMemoryPool(hsa_amd_memory_pool_t MemoryPool,
+                                 int DeviceId) {
+    uint32_t GlobalFlags = 0;
+    hsa_status_t Err = hsa_amd_memory_pool_get_info(
+        MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
+
+    if (Err != HSA_STATUS_SUCCESS) {
+      return Err;
+    }
+
+    // HSA_AMD_MEMORY_POOL_INFO_SIZE writes a size_t; a narrower destination
+    // would be overrun.
+    size_t Size = 0;
+    Err = hsa_amd_memory_pool_get_info(MemoryPool,
+                                       HSA_AMD_MEMORY_POOL_INFO_SIZE, &Size);
+    if (Err != HSA_STATUS_SUCCESS) {
+      return Err;
+    }
+
+    if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED &&
+        Size > 0) {
+      HostFineGrainedMemoryPool = MemoryPool;
+    }
+
+    return HSA_STATUS_SUCCESS;
+  }
+
+  /// Populate the host and per-device memory pool members by walking the
+  /// pools of CPUAgents and HSAAgents. \returns the first failing status.
+  hsa_status_t setupMemoryPools() {
+    using namespace std::placeholders;
+    hsa_status_t Err;
+    Err = core::collectMemoryPools(
+        CPUAgents, std::bind(&RTLDeviceInfoTy::addHostMemoryPool, this, _1, _2));
+    if (Err != HSA_STATUS_SUCCESS) {
+      fprintf(stderr, "HSA error in collecting memory pools for CPU: %s\n",
+              get_error_string(Err));
+      return Err;
+    }
+    Err = core::collectMemoryPools(
+        HSAAgents, std::bind(&RTLDeviceInfoTy::addDeviceMemoryPool, this, _1, _2));
+    if (Err != HSA_STATUS_SUCCESS) {
+      fprintf(stderr,
+              "HSA error in collecting memory pools for offload devices: %s\n",
+              get_error_string(Err));
+      return Err;
+    }
+    return HSA_STATUS_SUCCESS;
+  }
+
+  /// \returns the coarse-grained pool recorded for device \p DeviceId.
+  hsa_amd_memory_pool_t getDeviceMemoryPool(int DeviceId) {
+    assert(DeviceId >= 0 && DeviceId < DeviceCoarseGrainedMemoryPools.size() &&
+           "Invalid device Id");
+    return DeviceCoarseGrainedMemoryPools[DeviceId];
+  }
+
+  /// \returns the fine-grained host pool recorded by setupMemoryPools().
+  hsa_amd_memory_pool_t getHostMemoryPool() {
+    return HostFineGrainedMemoryPool;
+  }
+
   RTLDeviceInfoTy() {
     // LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr
     // anytime. You do not need a debug library build.
@@ -581,6 +743,14 @@
     deviceStateStore.resize(NumberOfDevices);
     KernelInfoTable.resize(NumberOfDevices);
     SymbolInfoTable.resize(NumberOfDevices);
+    DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices);
+    DeviceFineGrainedMemoryPools.resize(NumberOfDevices);
+
+    err = setupMemoryPools();
+    if (err != HSA_STATUS_SUCCESS) {
+      DP("Error when setting up memory pools");
+      return;
+    }
 
     for (int i = 0; i < NumberOfDevices; i++) {
       HSAQueues[i] = nullptr;