diff --git a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h
--- a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h
+++ b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h
@@ -57,4 +57,6 @@
   COV5_HEAPV1_PTR_SIZE = 8
 };
 
+const uint16_t getImplicitArgsSize(uint16_t Version);
+
 #endif
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp
--- a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp
@@ -79,3 +79,8 @@
     return "--unknown gfx";
   }
 }
+
+const uint16_t getImplicitArgsSize(uint16_t Version) {
+  return Version < ELFABIVERSION_AMDGPU_HSA_V5 ? IMPLICITARGS::COV4_SIZE
+                                               : IMPLICITARGS::COV5_SIZE;
+}
diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -125,9 +125,10 @@
   uint32_t KernargSegmentSize;
   void *KernargRegion = nullptr;
   std::queue<int> FreeKernargSegments;
+  uint16_t CodeObjectVersion;
 
   uint32_t kernargSizeIncludingImplicit() {
-    return KernargSegmentSize + sizeof(AMDGPUImplicitArgsTy);
+    return KernargSegmentSize + getImplicitArgsSize(CodeObjectVersion);
   }
 
   ~KernelArgPool() {
@@ -144,8 +145,10 @@
   KernelArgPool(const KernelArgPool &) = delete;
   KernelArgPool(KernelArgPool &&) = delete;
 
-  KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool)
-      : KernargSegmentSize(KernargSegmentSize) {
+  KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool,
+                uint16_t CodeObjectVersion)
+      : KernargSegmentSize(KernargSegmentSize),
+        CodeObjectVersion(CodeObjectVersion) {
 
     // impl uses one pool per kernel for all gpus, with a fixed upper size
     // preserving that exact scheme here, including the queue<int>
@@ -229,16 +232,16 @@
   KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize,
            int32_t DeviceId, void *CallStackAddr, const char *Name,
            uint32_t KernargSegmentSize,
-           hsa_amd_memory_pool_t &KernArgMemoryPool)
+           hsa_amd_memory_pool_t &KernArgMemoryPool, uint16_t CodeObjectVersion)
       : ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize),
         DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) {
     DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);
 
     std::string N(Name);
     if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) {
-      KernelArgPoolMap.insert(
-          std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
-                                KernargSegmentSize, KernArgMemoryPool))));
+      KernelArgPoolMap.insert(std::make_pair(
+          N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
+                 KernargSegmentSize, KernArgMemoryPool, CodeObjectVersion))));
     }
   }
 };
@@ -475,6 +478,7 @@
   std::vector<int> WarpSize;
   std::vector<std::string> GPUName;
   std::vector<std::string> TargetID;
+  uint16_t CodeObjectVersion;
 
   // OpenMP properties
   std::vector<int> NumTeams;
@@ -1363,6 +1367,27 @@
   return PacketId;
 }
 
+const uint16_t getCodeObjectVersionFromELF(__tgt_device_image *Image) {
+  char *ImageBegin = (char *)Image->ImageStart;
+  size_t ImageSize = (char *)Image->ImageEnd - ImageBegin;
+
+  StringRef Buffer = StringRef(ImageBegin, ImageSize);
+  auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""),
+                                                  /*InitContent=*/false);
+  if (!ElfOrErr) {
+    REPORT("Failed to load ELF: %s\n", toString(ElfOrErr.takeError()).c_str());
+    return 1;
+  }
+
+  if (const auto *ELFObj = dyn_cast<ELF64LEObjectFile>(ElfOrErr->get())) {
+    auto Header = ELFObj->getELFFile().getHeader();
+    uint16_t Version = (uint8_t)(Header.e_ident[EI_ABIVERSION]);
+    DP("ELFABIVERSION Version: %u\n", Version);
+    return Version;
+  }
+  return 0;
+}
+
 int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
                         ptrdiff_t *TgtOffsets, int32_t ArgNum, int32_t NumTeams,
                         int32_t ThreadLimit, uint64_t LoopTripcount) {
@@ -1440,6 +1465,7 @@
   }
 
   uint64_t PacketId = acquireAvailablePacketId(Queue);
+  uint16_t CodeObjectVersion = DeviceInfo().CodeObjectVersion;
   const uint32_t Mask = Queue->size - 1; // size is a power of 2
   hsa_kernel_dispatch_packet_t *Packet =
       (hsa_kernel_dispatch_packet_t *)Queue->base_address + (PacketId & Mask);
@@ -1488,13 +1514,9 @@
    }
 
     // Initialize implicit arguments. TODO: Which of these can be dropped
-    AMDGPUImplicitArgsTy *ImplArgs = reinterpret_cast<AMDGPUImplicitArgsTy *>(
-        static_cast<char *>(KernArg) + ArgPool->KernargSegmentSize);
-    memset(ImplArgs, 0,
-           sizeof(AMDGPUImplicitArgsTy)); // may not be necessary
-    ImplArgs->OffsetX = 0;
-    ImplArgs->OffsetY = 0;
-    ImplArgs->OffsetZ = 0;
+    uint8_t *ImplArgs =
+        static_cast<uint8_t *>(KernArg) + sizeof(void *) * ArgNum;
+    memset(ImplArgs, 0, getImplicitArgsSize(CodeObjectVersion));
 
     uint64_t Buffer = 0;
     // assign a hostcall buffer for the selected Q
@@ -1515,9 +1537,15 @@
 
     DP("Implicit argument count: %d\n",
        KernelInfoEntry.implicit_argument_count);
-    DP("Setting Hostcall buffer for COV4\n");
-    memcpy(&ImplArgs[IMPLICITARGS::COV4_HOSTCALL_PTR_OFFSET], &Buffer,
-           IMPLICITARGS::HOSTCALL_PTR_SIZE);
+
+    if (CodeObjectVersion < llvm::ELF::ELFABIVERSION_AMDGPU_HSA_V5) {
+      DP("Setting Hostcall buffer for COV4\n");
+      memcpy(&ImplArgs[IMPLICITARGS::COV4_HOSTCALL_PTR_OFFSET], &Buffer,
+             IMPLICITARGS::HOSTCALL_PTR_SIZE);
+    } else {
+      DP("Code object version 5 is not yet supported\n");
+      return OFFLOAD_FAIL;
+    }
 
     Packet->kernarg_address = KernArg;
   }
@@ -2087,6 +2115,8 @@
   if (!elfMachineIdIsAmdgcn(Image))
     return NULL;
 
+  DeviceInfo().CodeObjectVersion = getCodeObjectVersionFromELF(Image);
+
   {
     auto Env =
         DeviceEnvironment(DeviceId, DeviceInfo().NumberOfDevices,
@@ -2410,7 +2440,8 @@
     KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId,
                                    CallStackAddr, E->name, KernargSegmentSize,
-                                   DeviceInfo().KernArgPool));
+                                   DeviceInfo().KernArgPool,
+                                   DeviceInfo().CodeObjectVersion));
 
     __tgt_offload_entry Entry = *E;
     Entry.addr = (void *)&KernelsList.back();
     DeviceInfo().addOffloadEntry(DeviceId, Entry);