diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -125,6 +125,7 @@
   uint32_t KernargSegmentSize;
  void *KernargRegion = nullptr;
  std::queue<int> FreeKernargSegments;
+  uint16_t CodeObjectVersion;
 
   uint32_t kernargSizeIncludingImplicit() {
     return KernargSegmentSize + sizeof(AMDGPUImplicitArgsTy);
@@ -144,8 +145,10 @@
   KernelArgPool(const KernelArgPool &) = delete;
  KernelArgPool(KernelArgPool &&) = delete;
 
-  KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool)
-      : KernargSegmentSize(KernargSegmentSize) {
+  KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool,
+                uint16_t CodeObjectVersion)
+      : KernargSegmentSize(KernargSegmentSize),
+        CodeObjectVersion(CodeObjectVersion) {
 
     // impl uses one pool per kernel for all gpus, with a fixed upper size
    // preserving that exact scheme here, including the queue<uint64_t>
@@ -229,16 +232,16 @@
   KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize,
           int32_t DeviceId, void *CallStackAddr, const char *Name,
           uint32_t KernargSegmentSize,
-           hsa_amd_memory_pool_t &KernArgMemoryPool)
+           hsa_amd_memory_pool_t &KernArgMemoryPool, uint16_t CodeObjectVersion)
       : ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize),
        DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) {
     DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);
 
     std::string N(Name);
    if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) {
-      KernelArgPoolMap.insert(
-          std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
-                                KernargSegmentSize, KernArgMemoryPool))));
+      KernelArgPoolMap.insert(std::make_pair(
+          N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
+                 KernargSegmentSize, KernArgMemoryPool, CodeObjectVersion))));
     }
   }
 };
@@ -475,6 +478,7 @@
   std::vector<int> WarpSize;
  std::vector<std::string> GPUName;
  std::vector<std::string> TargetID;
+  uint16_t CodeObjectVersion;
 
   // OpenMP properties
  std::vector<int> NumTeams;
@@ -1363,6 +1367,27 @@
   return PacketId;
 }
 
+const uint16_t getCodeObjectVersionFromELF(__tgt_device_image *Image) {
+  char *ImageBegin = (char *)Image->ImageStart;
+  size_t ImageSize = (char *)Image->ImageEnd - ImageBegin;
+
+  StringRef Buffer = StringRef(ImageBegin, ImageSize);
+  auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""),
+                                                  /*InitContent=*/false);
+  if (!ElfOrErr) {
+    REPORT("Failed to load ELF: %s\n", toString(ElfOrErr.takeError()).c_str());
+    return 1;
+  }
+
+  if (const auto *ELFObj = dyn_cast<ELF64LEObjectFile>(ElfOrErr->get())) {
+    auto Header = ELFObj->getELFFile().getHeader();
+    uint16_t Version = (uint8_t)(Header.e_ident[EI_ABIVERSION]);
+    DP("ELFABIVERSION Version: %u\n", Version);
+    return Version;
+  }
+  return 0;
+}
+
 int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
                         ptrdiff_t *TgtOffsets, int32_t ArgNum, int32_t NumTeams,
                         int32_t ThreadLimit, uint64_t LoopTripcount) {
@@ -1440,6 +1465,7 @@
   }
 
   uint64_t PacketId = acquireAvailablePacketId(Queue);
+  uint16_t CodeObjectVersion = DeviceInfo().CodeObjectVersion;
   const uint32_t Mask = Queue->size - 1; // size is a power of 2
  hsa_kernel_dispatch_packet_t *Packet =
      (hsa_kernel_dispatch_packet_t *)Queue->base_address + (PacketId & Mask);
@@ -1515,9 +1541,15 @@
       DP("Implicit argument count: %d\n",
         KernelInfoEntry.implicit_argument_count);
 
-      DP("Setting Hostcall buffer for COV4\n");
-      memcpy(&ImplArgs[IMPLICITARGS::COV4_HOSTCALL_PTR_OFFSET], &Buffer,
-             IMPLICITARGS::HOSTCALL_PTR_SIZE);
+
+      if (CodeObjectVersion < llvm::ELF::ELFABIVERSION_AMDGPU_HSA_V5) {
+        DP("Setting Hostcall buffer for COV4\n");
+        memcpy(&ImplArgs[IMPLICITARGS::COV4_HOSTCALL_PTR_OFFSET], &Buffer,
+               IMPLICITARGS::HOSTCALL_PTR_SIZE);
+      } else {
+        DP("Code object version 5 is not yet supported\n");
+        return OFFLOAD_FAIL;
+      }
 
       Packet->kernarg_address = KernArg;
     }
@@ -2087,6 +2119,8 @@
   if (!elfMachineIdIsAmdgcn(Image))
     return NULL;
 
+  DeviceInfo().CodeObjectVersion = getCodeObjectVersionFromELF(Image);
+
   {
     auto Env =
        DeviceEnvironment(DeviceId, DeviceInfo().NumberOfDevices,
@@ -2410,7 +2444,8 @@
     KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId,
                                    CallStackAddr, E->name, KernargSegmentSize,
-                                   DeviceInfo().KernArgPool));
+                                   DeviceInfo().KernArgPool,
+                                   DeviceInfo().CodeObjectVersion));
 
     __tgt_offload_entry Entry = *E;
    Entry.addr = (void *)&KernelsList.back();
    DeviceInfo().addOffloadEntry(DeviceId, Entry);