diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -210,9 +210,6 @@ }; pthread_mutex_t KernelArgPool::Mutex = PTHREAD_MUTEX_INITIALIZER; -std::unordered_map> - KernelArgPoolMap; - /// Use a single entity to encode a kernel and a set of flags struct KernelTy { llvm::omp::OMPTgtExecModeFlags ExecutionMode; @@ -224,7 +221,9 @@ KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize, int32_t DeviceId, void *CallStackAddr, const char *Name, uint32_t KernargSegmentSize, - hsa_amd_memory_pool_t &KernArgMemoryPool) + hsa_amd_memory_pool_t &KernArgMemoryPool, + std::unordered_map> + &KernelArgPoolMap) : ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize), DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) { DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode); @@ -238,10 +237,6 @@ } }; -/// List that contains all the kernels. -/// FIXME: we may need this to be per device and per library. -std::list KernelsList; - template static hsa_status_t findAgents(Callback CB) { hsa_status_t Err = @@ -456,6 +451,12 @@ int NumberOfDevices = 0; + /// List that contains all the kernels. + /// FIXME: we may need this to be per device and per library. + std::list KernelsList; + std::unordered_map> + KernelArgPoolMap; + // GPU devices std::vector HSAAgents; std::vector HSAQueueSchedulers; // one per gpu @@ -857,7 +858,6 @@ "Unexpected device id!"); FuncGblEntries[DeviceId].emplace_back(); FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); - // KernelArgPoolMap.clear(); E.Entries.clear(); E.Table.EntriesBegin = E.Table.EntriesEnd = 0; } @@ -1116,19 +1116,6 @@ static RTLDeviceInfoTy *DeviceInfoState = nullptr; static RTLDeviceInfoTy &DeviceInfo() { return *DeviceInfoState; } -int32_t __tgt_rtl_init_plugin() { - DeviceInfoState = new RTLDeviceInfoTy; - return (DeviceInfoState && DeviceInfoState->ConstructionSucceeded) - ? OFFLOAD_SUCCESS - : OFFLOAD_FAIL; -} - -int32_t __tgt_rtl_deinit_plugin() { - if (DeviceInfoState) - delete DeviceInfoState; - return OFFLOAD_SUCCESS; -} - namespace { int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size, @@ -1171,7 +1158,7 @@ (long long unsigned)(Elf64_Addr)HstPtr, (long long unsigned)(Elf64_Addr)TgtPtr); Err = DeviceInfo().freesignalpoolMemcpyH2D(TgtPtr, HstPtr, (size_t)Size, - DeviceId); + DeviceId); if (Err != HSA_STATUS_SUCCESS) { DP("Error when copying data from host to device. Pointers: " "host = 0x%016lx, device = 0x%016lx, size = %lld\n", @@ -1468,8 +1455,9 @@ KernelArgPool *ArgPool = nullptr; void *KernArg = nullptr; { - auto It = KernelArgPoolMap.find(std::string(KernelInfo->Name)); - if (It != KernelArgPoolMap.end()) { + auto It = + DeviceInfo().KernelArgPoolMap.find(std::string(KernelInfo->Name)); + if (It != DeviceInfo().KernelArgPoolMap.end()) { ArgPool = (It->second).get(); } } @@ -2031,6 +2019,20 @@ } extern "C" { + +int32_t __tgt_rtl_init_plugin() { + DeviceInfoState = new RTLDeviceInfoTy; + return (DeviceInfoState && DeviceInfoState->ConstructionSucceeded) + ? OFFLOAD_SUCCESS + : OFFLOAD_FAIL; +} + +int32_t __tgt_rtl_deinit_plugin() { + if (DeviceInfoState) + delete DeviceInfoState; + return OFFLOAD_SUCCESS; +} + int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) { return elfMachineIdIsAmdgcn(Image); } @@ -2373,8 +2375,8 @@ } // write ptr to device memory so it can be used by later kernels - Err = DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &Ptr, sizeof(void *), - DeviceId); + Err = DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &Ptr, + sizeof(void *), DeviceId); if (Err != HSA_STATUS_SUCCESS) { DP("memcpy install of state_ptr failed\n"); return NULL; @@ -2437,8 +2439,8 @@ // If unified memory is present any target link variables // can access host addresses directly. There is no longer a // need for device copies. - Err = DeviceInfo().freesignalpoolMemcpyH2D(Varptr, E->addr, - sizeof(void *), DeviceId); + Err = DeviceInfo().freesignalpoolMemcpyH2D(Varptr, E->addr, sizeof(void *), + DeviceId); if (Err != HSA_STATUS_SUCCESS) DP("Error when copying USM\n"); DP("Copy linked variable host address (" DPxMOD ")" @@ -2598,11 +2600,12 @@ } check("Loading computation property", Err); - KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId, - CallStackAddr, E->name, KernargSegmentSize, - DeviceInfo().KernArgPool)); + DeviceInfo().KernelsList.push_back( + KernelTy(ExecModeVal, WGSizeVal, DeviceId, CallStackAddr, E->name, + KernargSegmentSize, DeviceInfo().KernArgPool, + DeviceInfo().KernelArgPoolMap)); __tgt_offload_entry Entry = *E; - Entry.addr = (void *)&KernelsList.back(); + Entry.addr = (void *)&DeviceInfo().KernelsList.back(); DeviceInfo().addOffloadEntry(DeviceId, Entry); DP("Entry point %ld maps to %s\n", E - HostBegin, E->name); }