diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -210,38 +210,6 @@ }; pthread_mutex_t KernelArgPool::Mutex = PTHREAD_MUTEX_INITIALIZER; -std::unordered_map> - KernelArgPoolMap; - -/// Use a single entity to encode a kernel and a set of flags -struct KernelTy { - llvm::omp::OMPTgtExecModeFlags ExecutionMode; - int16_t ConstWGSize; - int32_t DeviceId; - void *CallStackAddr = nullptr; - const char *Name; - - KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize, - int32_t DeviceId, void *CallStackAddr, const char *Name, - uint32_t KernargSegmentSize, - hsa_amd_memory_pool_t &KernArgMemoryPool) - : ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize), - DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) { - DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode); - - std::string N(Name); - if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) { - KernelArgPoolMap.insert( - std::make_pair(N, std::unique_ptr(new KernelArgPool( - KernargSegmentSize, KernArgMemoryPool)))); - } - } -}; - -/// List that contains all the kernels. -/// FIXME: we may need this to be per device and per library. 
-std::list KernelsList; - template static hsa_status_t findAgents(Callback CB) { hsa_status_t Err = @@ -432,6 +400,33 @@ std::atomic Current; }; +/// Use a single entity to encode a kernel and a set of flags +struct KernelTy { + llvm::omp::OMPTgtExecModeFlags ExecutionMode; + int16_t ConstWGSize; + int32_t DeviceId; + void *CallStackAddr = nullptr; + const char *Name; // pointer is stored as-is (not copied); its string is the KernelArgPoolMap key + + KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize, + int32_t DeviceId, void *CallStackAddr, const char *Name, + uint32_t KernargSegmentSize, + hsa_amd_memory_pool_t &KernArgMemoryPool, + std::unordered_map> + &KernelArgPoolMap) + : ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize), + DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) { + DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode); + + std::string N(Name); // map key: at most one KernelArgPool per kernel name + if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) { // only the first kernel with this name creates the pool + KernelArgPoolMap.insert( + std::make_pair(N, std::unique_ptr(new KernelArgPool( + KernargSegmentSize, KernArgMemoryPool)))); + } + } +}; + /// Class containing all the device information class RTLDeviceInfoTy : HSALifetime { std::vector> FuncGblEntries; @@ -456,6 +451,12 @@ int NumberOfDevices = 0; + /// List that contains all the kernels. + /// FIXME: we may need this to be per device and per library. 
+ std::list KernelsList; + std::unordered_map> + KernelArgPoolMap; + // GPU devices std::vector HSAAgents; std::vector HSAQueueSchedulers; // one per gpu @@ -857,7 +858,6 @@ "Unexpected device id!"); FuncGblEntries[DeviceId].emplace_back(); FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); - // KernelArgPoolMap.clear(); E.Entries.clear(); E.Table.EntriesBegin = E.Table.EntriesEnd = 0; } @@ -1111,23 +1111,10 @@ } }; -pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER; - static RTLDeviceInfoTy *DeviceInfoState = nullptr; static RTLDeviceInfoTy &DeviceInfo() { return *DeviceInfoState; } -int32_t __tgt_rtl_init_plugin() { - DeviceInfoState = new RTLDeviceInfoTy; - return (DeviceInfoState && DeviceInfoState->ConstructionSucceeded) - ? OFFLOAD_SUCCESS - : OFFLOAD_FAIL; -} - -int32_t __tgt_rtl_deinit_plugin() { - if (DeviceInfoState) - delete DeviceInfoState; - return OFFLOAD_SUCCESS; -} +pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER; namespace { @@ -1144,7 +1131,7 @@ (long long unsigned)(Elf64_Addr)HstPtr); Err = DeviceInfo().freesignalpoolMemcpyD2H(HstPtr, TgtPtr, (size_t)Size, - DeviceId); + DeviceId); if (Err != HSA_STATUS_SUCCESS) { DP("Error when copying data from device to host. Pointers: " @@ -1171,7 +1158,7 @@ (long long unsigned)(Elf64_Addr)HstPtr, (long long unsigned)(Elf64_Addr)TgtPtr); Err = DeviceInfo().freesignalpoolMemcpyH2D(TgtPtr, HstPtr, (size_t)Size, - DeviceId); + DeviceId); if (Err != HSA_STATUS_SUCCESS) { DP("Error when copying data from host to device. 
Pointers: " "host = 0x%016lx, device = 0x%016lx, size = %lld\n", @@ -1468,8 +1455,9 @@ KernelArgPool *ArgPool = nullptr; void *KernArg = nullptr; { - auto It = KernelArgPoolMap.find(std::string(KernelInfo->Name)); - if (It != KernelArgPoolMap.end()) { + auto It = + DeviceInfo().KernelArgPoolMap.find(std::string(KernelInfo->Name)); + if (It != DeviceInfo().KernelArgPoolMap.end()) { ArgPool = (It->second).get(); } } @@ -1507,8 +1495,8 @@ // under a multiple reader lock, not a writer lock. static pthread_mutex_t HostcallInitLock = PTHREAD_MUTEX_INITIALIZER; pthread_mutex_lock(&HostcallInitLock); - uint64_t Buffer = hostrpc_assign_buffer(DeviceInfo().HSAAgents[DeviceId], - Queue, DeviceId); + uint64_t Buffer = hostrpc_assign_buffer( + DeviceInfo().HSAAgents[DeviceId], Queue, DeviceId); pthread_mutex_unlock(&HostcallInitLock); if (!Buffer) { DP("hostrpc_assign_buffer failed, gpu would dereference null and " @@ -1870,7 +1858,7 @@ } return DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &HostDeviceEnv, - StatePtrSize, DeviceId); + StatePtrSize, DeviceId); } } return HSA_STATUS_SUCCESS; @@ -2031,6 +2019,20 @@ } extern "C" { + +int32_t __tgt_rtl_init_plugin() { + DeviceInfoState = new RTLDeviceInfoTy; + return (DeviceInfoState && DeviceInfoState->ConstructionSucceeded) + ? OFFLOAD_SUCCESS + : OFFLOAD_FAIL; +} + +int32_t __tgt_rtl_deinit_plugin() { + delete DeviceInfoState; // deleting a null pointer is a no-op, no check needed + DeviceInfoState = nullptr; // do not leave a dangling pointer: a repeated deinit must not double-delete + return OFFLOAD_SUCCESS; +} + int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) { return elfMachineIdIsAmdgcn(Image); } @@ -2112,7 +2114,8 @@ if (print_kernel_trace & STARTUP_DETAILS) DP("Device#%-2d CU's: %2d %s\n", DeviceId, - DeviceInfo().ComputeUnits[DeviceId], DeviceInfo().GPUName[DeviceId].c_str()); + DeviceInfo().ComputeUnits[DeviceId], + DeviceInfo().GPUName[DeviceId].c_str()); // Query attributes to determine number of threads/block and blocks/grid. 
uint16_t WorkgroupMaxDim[3]; @@ -2124,7 +2127,8 @@ RTLDeviceInfoTy::DefaultNumTeams); } else if (WorkgroupMaxDim[0] <= RTLDeviceInfoTy::HardTeamLimit) { DeviceInfo().GroupsPerDevice[DeviceId] = WorkgroupMaxDim[0]; - DP("Using %d ROCm blocks per grid\n", DeviceInfo().GroupsPerDevice[DeviceId]); + DP("Using %d ROCm blocks per grid\n", + DeviceInfo().GroupsPerDevice[DeviceId]); } else { DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::HardTeamLimit; DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping " @@ -2279,8 +2283,9 @@ return NULL; { - auto Env = DeviceEnvironment(DeviceId, DeviceInfo().NumberOfDevices, - DeviceInfo().Env.DynamicMemSize, Image, ImgSize); + auto Env = + DeviceEnvironment(DeviceId, DeviceInfo().NumberOfDevices, + DeviceInfo().Env.DynamicMemSize, Image, ImgSize); auto &KernelInfo = DeviceInfo().KernelInfoTable[DeviceId]; auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId]; @@ -2373,8 +2378,8 @@ } // write ptr to device memory so it can be used by later kernels - Err = DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &Ptr, sizeof(void *), - DeviceId); + Err = DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &Ptr, + sizeof(void *), DeviceId); if (Err != HSA_STATUS_SUCCESS) { DP("memcpy install of state_ptr failed\n"); return NULL; @@ -2438,7 +2443,7 @@ // can access host addresses directly. There is no longer a // need for device copies. 
Err = DeviceInfo().freesignalpoolMemcpyH2D(Varptr, E->addr, - sizeof(void *), DeviceId); + sizeof(void *), DeviceId); if (Err != HSA_STATUS_SUCCESS) DP("Error when copying USM\n"); DP("Copy linked variable host address (" DPxMOD ")" @@ -2598,11 +2603,12 @@ } check("Loading computation property", Err); - KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId, - CallStackAddr, E->name, KernargSegmentSize, - DeviceInfo().KernArgPool)); + DeviceInfo().KernelsList.push_back( + KernelTy(ExecModeVal, WGSizeVal, DeviceId, CallStackAddr, E->name, + KernargSegmentSize, DeviceInfo().KernArgPool, + DeviceInfo().KernelArgPoolMap)); __tgt_offload_entry Entry = *E; - Entry.addr = (void *)&KernelsList.back(); + Entry.addr = (void *)&DeviceInfo().KernelsList.back(); DeviceInfo().addOffloadEntry(DeviceId, Entry); DP("Entry point %ld maps to %s\n", E - HostBegin, E->name); }