diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -27,6 +27,7 @@
 #include "clang/AST/OSLog.h"
 #include "clang/Basic/TargetBuiltins.h"
 #include "clang/Basic/TargetInfo.h"
+#include "clang/Basic/TargetOptions.h"
 #include "clang/CodeGen/CGFunctionInfo.h"
 #include "clang/Frontend/FrontendDiagnostic.h"
 #include "llvm/ADT/APFloat.h"
@@ -17130,23 +17131,80 @@
 // \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.
 Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) {
-  bool IsCOV_5 = CGF.getTarget().getTargetOpts().CodeObjectVersion ==
-                 clang::TargetOptions::COV_5;
-  Constant *Offset;
-  Value *DP;
-  if (IsCOV_5) {
+  llvm::LoadInst *LD;
+  Constant *Offset, *Offset1;
+  Value *DP, *DP1;
+
+  auto Cov = CGF.getTarget().getTargetOpts().CodeObjectVersion;
+
+  if (Cov == clang::TargetOptions::COV_None) {
+    auto *ABIVersionC = CGF.CGM.GetOrCreateLLVMGlobal(
+        "llvm.amdgcn.abi.version", CGF.Int32Ty, LangAS::Default, nullptr,
+        CodeGen::NotForDefinition);
+
+    // GetOrCreateLLVMGlobal never returns null here, so the load can be
+    // emitted unconditionally.
+    Value *ABIVersion = CGF.Builder.CreateAlignedLoad(CGF.Int32Ty, ABIVersionC,
+                                                      CGF.CGM.getIntAlign());
+
+    Value *IsCOV5 = CGF.Builder.CreateICmpSGE(
+        ABIVersion,
+        llvm::ConstantInt::get(CGF.Int32Ty, clang::TargetOptions::COV_5));
+
+    Function *TheFunction = CGF.Builder.GetInsertBlock()->getParent();
+
+    BasicBlock *NewABI = CGF.createBasicBlock("amdgcn.abi.cov5", TheFunction);
+    BasicBlock *OldABI = CGF.createBasicBlock("amdgcn.abi.cov4", nullptr);
+    BasicBlock *End = CGF.createBasicBlock("amdgcn.abi.end", nullptr);
+
+    CGF.Builder.CreateCondBr(IsCOV5, NewABI, OldABI);
+    CGF.Builder.SetInsertPoint(NewABI);
     // Indexing the implicit kernarg segment.
     Offset = llvm::ConstantInt::get(CGF.Int32Ty, 12 + Index * 2);
     DP = EmitAMDGPUImplicitArgPtr(CGF);
-  } else {
+    auto *GEPNew = CGF.Builder.CreateGEP(CGF.Int8Ty, DP, Offset);
+    CGF.Builder.CreateBr(End);
+    NewABI = CGF.Builder.GetInsertBlock();
+
+    TheFunction->insert(TheFunction->end(), OldABI);
+    CGF.Builder.SetInsertPoint(OldABI);
     // Indexing the HSA kernel_dispatch_packet struct.
-    Offset = llvm::ConstantInt::get(CGF.Int32Ty, 4 + Index * 2);
-    DP = EmitAMDGPUDispatchPtr(CGF);
+    Offset1 = llvm::ConstantInt::get(CGF.Int32Ty, 4 + Index * 2);
+    DP1 = EmitAMDGPUDispatchPtr(CGF);
+    auto *GEPOld = CGF.Builder.CreateGEP(CGF.Int8Ty, DP1, Offset1);
+    CGF.Builder.CreateBr(End);
+    OldABI = CGF.Builder.GetInsertBlock();
+
+    TheFunction->insert(TheFunction->end(), End);
+    CGF.Builder.SetInsertPoint(End);
+    PHINode *Result =
+        CGF.Builder.CreatePHI(GEPNew->getType(), 2, "abi_gep_result");
+    Result->addIncoming(GEPNew, NewABI);
+    Result->addIncoming(GEPOld, OldABI);
+    LD = CGF.Builder.CreateLoad(
+        Address(Result, CGF.Int16Ty, CharUnits::fromQuantity(2)));
+  } else {
+    if (Cov == clang::TargetOptions::COV_5) {
+      // Indexing the implicit kernarg segment.
+      Offset = llvm::ConstantInt::get(CGF.Int32Ty, 12 + Index * 2);
+      DP = EmitAMDGPUImplicitArgPtr(CGF);
+    } else {
+      // Indexing the HSA kernel_dispatch_packet struct.
+      Offset = llvm::ConstantInt::get(CGF.Int32Ty, 4 + Index * 2);
+      DP = EmitAMDGPUDispatchPtr(CGF);
+    }
+    auto *GEP = CGF.Builder.CreateGEP(CGF.Int8Ty, DP, Offset);
+    LD = CGF.Builder.CreateLoad(
+        Address(GEP, CGF.Int16Ty, CharUnits::fromQuantity(2)));
   }
-  auto *GEP = CGF.Builder.CreateGEP(CGF.Int8Ty, DP, Offset);
-  auto *LD = CGF.Builder.CreateLoad(
-      Address(GEP, CGF.Int16Ty, CharUnits::fromQuantity(2)));
   llvm::MDBuilder MDHelper(CGF.getLLVMContext());
   llvm::MDNode *RNode = MDHelper.createRange(APInt(16, 1),
       APInt(16, CGF.getTarget().getMaxOpenCLWorkGroupSize() + 1));
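For reference, the two layouts the builtin now selects between at run time can be summarized in plain C++. This is a sketch only: the pointer parameters stand in for the values produced by EmitAMDGPUImplicitArgPtr and EmitAMDGPUDispatchPtr, and 500 is the numeric value of clang::TargetOptions::COV_5.

```cpp
#include <cstdint>
#include <cstring>

// Sketch of the logic EmitAMDGPUWorkGroupSize lowers to; Index is 0, 1, or 2.
uint16_t workGroupSizeDim(unsigned Index, const uint8_t *ImplicitArgs,
                          const uint8_t *DispatchPacket, int32_t ABIVersion) {
  uint16_t Size;
  if (ABIVersion >= 500) // clang::TargetOptions::COV_5
    // COV5: sizes live in the implicit kernarg segment at offsets 12/14/16.
    std::memcpy(&Size, ImplicitArgs + 12 + Index * 2, sizeof(Size));
  else
    // COV4: hsa_kernel_dispatch_packet_t keeps workgroup_size_{x,y,z}
    // at offsets 4/6/8.
    std::memcpy(&Size, DispatchPacket + 4 + Index * 2, sizeof(Size));
  return Size;
}
```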
diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h
--- a/clang/lib/CodeGen/CodeGenModule.h
+++ b/clang/lib/CodeGen/CodeGenModule.h
@@ -1567,6 +1567,11 @@
   void handleAMDGPUWavesPerEUAttr(llvm::Function *F,
                                   const AMDGPUWavesPerEUAttr *A);
 
+  llvm::Constant *
+  GetOrCreateLLVMGlobal(StringRef MangledName, llvm::Type *Ty,
+                        LangAS AddrSpace, const VarDecl *D,
+                        ForDefinition_t IsForDefinition = NotForDefinition);
+
 private:
   llvm::Constant *GetOrCreateLLVMFunction(
       StringRef MangledName, llvm::Type *Ty, GlobalDecl D, bool ForVTable,
@@ -1589,11 +1594,6 @@
   void UpdateMultiVersionNames(GlobalDecl GD, const FunctionDecl *FD,
                                StringRef &CurName);
 
-  llvm::Constant *
-  GetOrCreateLLVMGlobal(StringRef MangledName, llvm::Type *Ty,
-                        LangAS AddrSpace, const VarDecl *D,
-                        ForDefinition_t IsForDefinition = NotForDefinition);
-
   bool GetCPUAndFeaturesAttributes(GlobalDecl GD,
                                    llvm::AttrBuilder &AttrBuilder,
                                    bool SetTargetFeatures = true);
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -1201,6 +1201,8 @@
     getModule().addModuleFlag(llvm::Module::Error, "MaxTLSAlign",
                               getContext().getTargetInfo().getMaxTLSAlign());
 
+  getTargetCodeGenInfo().emitTargetGlobals(*this);
+
   getTargetCodeGenInfo().emitTargetMetadata(*this, MangledDeclNames);
 
   EmitBackendOptionsMetadata(getCodeGenOpts());
diff --git a/clang/lib/CodeGen/TargetInfo.h b/clang/lib/CodeGen/TargetInfo.h
--- a/clang/lib/CodeGen/TargetInfo.h
+++ b/clang/lib/CodeGen/TargetInfo.h
@@ -81,6 +81,9 @@
       CodeGen::CodeGenModule &CGM,
       const llvm::MapVector<GlobalDecl, StringRef> &MangledDeclNames) const {}
 
+  /// Provides a convenient hook to handle extra target-specific globals.
+  virtual void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const {}
+
   /// Any further codegen related checks that need to be done on a function
   /// call in a target specific manner.
   virtual void checkFunctionCallABI(CodeGenModule &CGM, SourceLocation CallLoc,
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -8,6 +8,7 @@
 
 #include "ABIInfoImpl.h"
 #include "TargetInfo.h"
+#include "clang/Basic/TargetOptions.h"
 
 using namespace clang;
 using namespace clang::CodeGen;
@@ -268,6 +269,8 @@
   void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
                                  CodeGenModule &CGM) const;
 
+  void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const override;
+
   void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
                            CodeGen::CodeGenModule &M) const override;
   unsigned getOpenCLKernelCallingConv() const override;
@@ -348,6 +351,38 @@
   }
 }
 
+/// Emits control constants used to change per-architecture behaviour in the
+/// AMDGPU ROCm device libraries.
+void AMDGPUTargetCodeGenInfo::emitTargetGlobals(
+    CodeGen::CodeGenModule &CGM) const {
+  if (!CGM.getTriple().isAMDGCN())
+    return;
+
+  auto AddGlobal = [&](StringRef Name,
+                       clang::TargetOptions::CodeObjectVersionKind Value,
+                       unsigned Size,
+                       llvm::GlobalValue::LinkageTypes Linkage =
+                           llvm::GlobalValue::WeakODRLinkage) {
+    if (CGM.getModule().getNamedGlobal(Name))
+      return;
+
+    auto *Type =
+        llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), Size);
+    auto *GV = new llvm::GlobalVariable(
+        CGM.getModule(), Type, true, Linkage,
+        llvm::ConstantInt::get(Type, Value), Name, nullptr,
+        llvm::GlobalValue::ThreadLocalMode::NotThreadLocal,
+        CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant));
+    GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
+    GV->setVisibility(llvm::GlobalValue::VisibilityTypes::HiddenVisibility);
+    GV->setAlignment(CGM.getDataLayout().getABITypeAlign(Type));
+  };
+
+  AddGlobal("llvm.amdgcn.abi.version",
+            CGM.getTarget().getTargetOpts().CodeObjectVersion, /*Size=*/32,
+            llvm::GlobalValue::WeakODRLinkage);
+}
+
 void AMDGPUTargetCodeGenInfo::setTargetAttributes(
     const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
   if (requiresAMDGPUProtectedVisibility(D, GV)) {
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -1354,6 +1354,11 @@
 
   // Handle -Xopenmp-target flags
   for (auto *A : Args) {
+    // Pass the code object version to the device toolchain so that the
+    // metadata in intermediate files is set correctly.
+    if (A->getOption().matches(options::OPT_mcode_object_version_EQ))
+      DAL->append(A);
+
     // Exclude flags which may only apply to the host toolchain.
     // Do not exclude flags when the host triple (AuxTriple)
    // matches the current toolchain triple. If it is not present
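The emitted variable is meant to be read in the same spirit as the ROCm device-libs `__oclc_*` controls. A minimal consumer-side sketch, assuming device code built with `-mcode-object-version=none`; the asm-label binding and the plain (non-annotated) address space here are illustrative, not part of the patch:

```cpp
#include <cstdint>

// Bind a C++ name to the compiler-emitted weak_odr global (illustrative).
extern const uint32_t AMDGCNABIVersion asm("llvm.amdgcn.abi.version");

// clang::TargetOptions::COV_5 == 500; anything >= 500 selects the v5 ABI.
inline bool hasCOV5ImplicitArgs() { return AMDGCNABIVersion >= 500; }
```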
diff --git a/openmp/libomptarget/DeviceRTL/CMakeLists.txt b/openmp/libomptarget/DeviceRTL/CMakeLists.txt
--- a/openmp/libomptarget/DeviceRTL/CMakeLists.txt
+++ b/openmp/libomptarget/DeviceRTL/CMakeLists.txt
@@ -282,7 +282,7 @@
 add_custom_target(omptarget.devicertl.amdgpu)
 foreach(gpu_arch ${LIBOMPTARGET_DEVICE_ARCHITECTURES})
   if("${gpu_arch}" IN_LIST all_amdgpu_architectures)
-    compileDeviceRTLLibrary(${gpu_arch} amdgpu amdgcn-amd-amdhsa)
+    compileDeviceRTLLibrary(${gpu_arch} amdgpu amdgcn-amd-amdhsa -Xclang -mcode-object-version=none)
   elseif("${gpu_arch}" IN_LIST all_nvptx_architectures)
     compileDeviceRTLLibrary(${gpu_arch} nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx61)
   else()
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -253,6 +253,13 @@
     return Plugin::check(Status, "Error in hsa_amd_agents_allow_access: %s");
   }
 
+  Error zeroInitializeMemory(void *Ptr, size_t Size) {
+    // hsa_amd_memory_fill fills in 32-bit words, so round the byte count up
+    // to a whole number of words.
+    uint64_t Rounded = sizeof(uint32_t) * ((Size + 3) / sizeof(uint32_t));
+    hsa_status_t Status =
+        hsa_amd_memory_fill(Ptr, 0, Rounded / sizeof(uint32_t));
+    return Plugin::check(Status, "Error in hsa_amd_memory_fill: %s");
+  }
+
   /// Get attribute from the memory pool.
   template <typename Ty>
   Error getAttr(hsa_amd_memory_pool_info_t Kind, Ty &Value) const {
@@ -381,6 +388,9 @@
   /// Get the executable.
   hsa_executable_t getExecutable() const { return Executable; }
 
+  /// Get the code object version of the ELF image.
+  uint16_t getELFABIVersion() const { return ELFABIVersion; }
+
   /// Find an HSA device symbol by its name on the executable.
   Expected<hsa_executable_symbol_t> findDeviceSymbol(GenericDeviceTy &Device,
                                                      StringRef SymbolName) const;
@@ -401,6 +411,7 @@
   hsa_executable_t Executable;
   hsa_code_object_t CodeObject;
   StringMap<utils::KernelMetaDataTy> KernelInfoMap;
+  uint16_t ELFABIVersion;
 };
 
 /// Class implementing the AMDGPU kernel functionalities which derives from the
@@ -408,8 +419,7 @@
 struct AMDGPUKernelTy : public GenericKernelTy {
   /// Create an AMDGPU kernel with a name and an execution mode.
   AMDGPUKernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode)
-      : GenericKernelTy(Name, ExecutionMode),
-        ImplicitArgsSize(sizeof(utils::AMDGPUImplicitArgsTy)) {}
+      : GenericKernelTy(Name, ExecutionMode) {}
 
   /// Initialize the AMDGPU kernel.
   Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override {
@@ -450,6 +460,12 @@
     // TODO: Read the kernel descriptor for the max threads per block. May be
     // read from the image.
 
+    ImplicitArgsSize =
+        (AMDImage.getELFABIVersion() < llvm::ELF::ELFABIVERSION_AMDGPU_HSA_V5)
+            ? utils::COV4_SIZE
+            : utils::COV5_SIZE;
+    DP("ELFABIVersion: %d\n", AMDImage.getELFABIVersion());
+
     // Get additional kernel info read from image
     KernelInfo = AMDImage.getKernelInfo(getName());
     if (!KernelInfo.has_value())
@@ -476,6 +492,10 @@
   /// Get the HSA kernel object representing the kernel function.
   uint64_t getKernelObject() const { return KernelObject; }
 
+  /// Get the size of the implicit arguments based on the code object version.
+  /// Returns 56 (utils::COV4_SIZE) for COV4 and 256 (utils::COV5_SIZE) for COV5.
+  uint32_t getImplicitArgsSize() const { return ImplicitArgsSize; }
+
 private:
   /// The kernel object to execute.
   uint64_t KernelObject;
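The selection in initImpl() is a pure function of the ELF ABI version. A compact restatement, as a sketch rather than plugin API, using only the constants from llvm/BinaryFormat/ELF.h (the HSA ABI versions are ordered, so V4 compares below V5):

```cpp
#include "llvm/BinaryFormat/ELF.h"
#include <cstdint>

// Mirrors the ImplicitArgsSize selection in AMDGPUKernelTy::initImpl().
constexpr uint32_t implicitArgsSizeFor(uint16_t ELFABIVersion) {
  return ELFABIVersion < llvm::ELF::ELFABIVERSION_AMDGPU_HSA_V5
             ? 56   // utils::COV4_SIZE
             : 256; // utils::COV5_SIZE
}
static_assert(implicitArgsSizeFor(llvm::ELF::ELFABIVERSION_AMDGPU_HSA_V4) == 56);
static_assert(implicitArgsSizeFor(llvm::ELF::ELFABIVERSION_AMDGPU_HSA_V5) == 256);
```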
@@ -486,7 +506,7 @@
   uint32_t PrivateSize;
 
   /// The size of implicit kernel arguments.
-  const uint32_t ImplicitArgsSize;
+  uint32_t ImplicitArgsSize;
 
   /// Additional Info for the AMD GPU Kernel
   std::optional<utils::KernelMetaDataTy> KernelInfo;
@@ -1728,6 +1748,9 @@
     if (auto Err = initMemoryPools())
       return Err;
 
+    // Preallocation is not enabled yet; see the commented-out heap-v1 memcpy
+    // in the kernel launch path.
+    // if (auto Err = preAllocateDeviceMemoryPool())
+    //   return Err;
+
     char GPUName[64];
     if (auto Err = getDeviceAttr(HSA_AGENT_INFO_NAME, GPUName))
       return Err;
@@ -2515,6 +2538,43 @@
     });
   }
 
+  /// Get the address of the pointer to the preallocated device memory pool.
+  void **getPreAllocatedDeviceMemoryPool() {
+    return &PreAllocatedDeviceMemoryPool;
+  }
+
+  /// Allocate and zero-initialize a small memory pool from the coarse-grained
+  /// device memory of each device.
+  Error preAllocateDeviceMemoryPool() {
+    if (Error Err = retrieveAllMemoryPools())
+      return Plugin::error("Unable to retrieve all memory pools: %s",
+                           toString(std::move(Err)).data());
+
+    void *DevPtr;
+    for (AMDGPUMemoryPoolTy *MemoryPool : AllMemoryPools) {
+      if (MemoryPool->isCoarseGrained()) {
+        DevPtr = nullptr;
+        size_t PreAllocSize = utils::PER_DEVICE_PREALLOC_SIZE;
+
+        if (Error Err = MemoryPool->allocate(PreAllocSize, &DevPtr))
+          return Plugin::error("Device memory pool preallocation failed: %s",
+                               toString(std::move(Err)).data());
+
+        if (Error Err =
+                MemoryPool->enableAccess(DevPtr, PreAllocSize, {getAgent()}))
+          return Plugin::error(
+              "Preallocated device memory pool inaccessible: %s",
+              toString(std::move(Err)).data());
+
+        if (Error Err = MemoryPool->zeroInitializeMemory(DevPtr, PreAllocSize))
+          return Plugin::error("Zero initialization of preallocated device "
+                               "memory pool failed: %s",
+                               toString(std::move(Err)).data());
+
+        PreAllocatedDeviceMemoryPool = DevPtr;
+      }
+    }
+    return Plugin::success();
+  }
+
 private:
   using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>;
   using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>;
@@ -2572,6 +2632,9 @@
 
   /// Reference to the host device.
   AMDHostDeviceTy &HostDevice;
+
+  /// Pointer to the preallocated device memory pool.
+  void *PreAllocatedDeviceMemoryPool = nullptr;
 };
 
 Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
@@ -2605,8 +2668,8 @@
   if (Result)
     return Plugin::error("Loaded HSA executable does not validate");
 
-  if (auto Err =
-          utils::readAMDGPUMetaDataFromImage(getMemoryBuffer(), KernelInfoMap))
+  if (auto Err = utils::readAMDGPUMetaDataFromImage(
+          getMemoryBuffer(), KernelInfoMap, ELFABIVersion))
     return Err;
 
   return Plugin::success();
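zeroInitializeMemory(), used by the preallocation path above, rounds byte sizes up to whole 32-bit words because hsa_amd_memory_fill() takes a word count rather than a byte count. A standalone illustration of the arithmetic:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // Same rounding as zeroInitializeMemory(): up to a multiple of 4 bytes.
  for (uint64_t Size : {1u, 4u, 10u, 131072u}) {
    uint64_t Rounded = sizeof(uint32_t) * ((Size + 3) / sizeof(uint32_t));
    std::printf("Size=%llu -> %llu words (%llu bytes filled)\n",
                (unsigned long long)Size,
                (unsigned long long)(Rounded / sizeof(uint32_t)),
                (unsigned long long)Rounded);
  }
  return 0;
}
```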
@@ -2947,9 +3010,8 @@
   }
 
   // Initialize implicit arguments.
-  utils::AMDGPUImplicitArgsTy *ImplArgs =
-      reinterpret_cast<utils::AMDGPUImplicitArgsTy *>(
-          advanceVoidPtr(AllArgs, KernelArgsSize));
+  uint8_t *ImplArgs =
+      static_cast<uint8_t *>(advanceVoidPtr(AllArgs, KernelArgsSize));
 
   // Initialize the implicit arguments to zero.
   std::memset(ImplArgs, 0, ImplicitArgsSize);
@@ -2971,6 +3033,43 @@
   if (GenericDevice.getRPCServer())
     Stream->setRPCServer(GenericDevice.getRPCServer());
 
+  if (getImplicitArgsSize() < utils::COV5_SIZE) {
+    DP("Setting fields of ImplicitArgs for COV4\n");
+  } else {
+    DP("Setting fields of ImplicitArgs for COV5\n");
+    uint16_t Remainder = 0;
+    uint16_t GridDims = 1;
+    // The 2-byte copies below take the low halves of these values
+    // (little-endian layout); the block-count fields are 4 bytes wide, so
+    // NumBlocksYZ must be a 32-bit variable.
+    uint32_t NumThreadsYZ = 1;
+    uint32_t NumBlocksYZ = 0;
+    memcpy(&ImplArgs[utils::COV5_BLOCK_COUNT_X_OFFSET], &NumBlocks,
+           utils::COV5_BLOCK_COUNT_X_SIZE);
+    memcpy(&ImplArgs[utils::COV5_BLOCK_COUNT_Y_OFFSET], &NumBlocksYZ,
+           utils::COV5_BLOCK_COUNT_Y_SIZE);
+    memcpy(&ImplArgs[utils::COV5_BLOCK_COUNT_Z_OFFSET], &NumBlocksYZ,
+           utils::COV5_BLOCK_COUNT_Z_SIZE);
+
+    memcpy(&ImplArgs[utils::COV5_GROUP_SIZE_X_OFFSET], &NumThreads,
+           utils::COV5_GROUP_SIZE_X_SIZE);
+    memcpy(&ImplArgs[utils::COV5_GROUP_SIZE_Y_OFFSET], &NumThreadsYZ,
+           utils::COV5_GROUP_SIZE_Y_SIZE);
+    memcpy(&ImplArgs[utils::COV5_GROUP_SIZE_Z_OFFSET], &NumThreadsYZ,
+           utils::COV5_GROUP_SIZE_Z_SIZE);
+
+    memcpy(&ImplArgs[utils::COV5_REMAINDER_X_OFFSET], &Remainder,
+           utils::COV5_REMAINDER_X_SIZE);
+    memcpy(&ImplArgs[utils::COV5_REMAINDER_Y_OFFSET], &Remainder,
+           utils::COV5_REMAINDER_Y_SIZE);
+    memcpy(&ImplArgs[utils::COV5_REMAINDER_Z_OFFSET], &Remainder,
+           utils::COV5_REMAINDER_Z_SIZE);
+
+    memcpy(&ImplArgs[utils::COV5_GRID_DIMS_OFFSET], &GridDims,
+           utils::COV5_GRID_DIMS_SIZE);
+
+    // memcpy(&ImplArgs[utils::COV5_HEAPV1_PTR_OFFSET],
+    //        AMDGPUDevice.getPreAllocatedDeviceMemoryPool(),
+    //        utils::COV5_HEAPV1_PTR_SIZE);
+  }
+
   // Push the kernel launch into the stream.
   return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
                                   GroupSize, ArgsMemoryManager);
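The raw offsets used in the launch path above are defined in the UtilitiesRTL.h hunk below. As a cross-check, the leading 24 bytes of the COV5 implicit-argument block can be pictured as the following struct; this is illustrative only, since the plugin deliberately sticks to byte offsets:

```cpp
#include <cstddef>
#include <cstdint>

// Illustrative view of the first 24 bytes of the COV5 implicit arguments.
struct COV5ImplicitArgsPrefix {
  uint32_t BlockCountX, BlockCountY, BlockCountZ; // offsets 0, 4, 8
  uint16_t GroupSizeX, GroupSizeY, GroupSizeZ;    // offsets 12, 14, 16
  uint16_t RemainderX, RemainderY, RemainderZ;    // offsets 18, 20, 22
};
static_assert(offsetof(COV5ImplicitArgsPrefix, GroupSizeX) == 12,
              "matches utils::COV5_GROUP_SIZE_X_OFFSET");
static_assert(offsetof(COV5ImplicitArgsPrefix, RemainderZ) == 22,
              "matches utils::COV5_REMAINDER_Z_OFFSET");
```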
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
@@ -25,6 +25,7 @@
 #include "llvm/Support/MemoryBufferRef.h"
 #include "llvm/Support/YAMLTraits.h"
 
+using namespace llvm::ELF;
 
 namespace llvm {
 namespace omp {
@@ -32,19 +33,55 @@
 namespace plugin {
 namespace utils {
 
-// The implicit arguments of AMDGPU kernels.
-struct AMDGPUImplicitArgsTy {
-  uint64_t OffsetX;
-  uint64_t OffsetY;
-  uint64_t OffsetZ;
-  uint64_t HostcallPtr;
-  uint64_t Unused0;
-  uint64_t Unused1;
-  uint64_t Unused2;
+// Byte offsets and sizes of the implicit arguments of AMDGPU kernels for the
+// code object v4 and v5 layouts.
+enum IMPLICITARGS : uint32_t {
+  COV4_SIZE = 56,
+  COV4_HOSTCALL_PTR_OFFSET = 24,
+  HOSTCALL_PTR_SIZE = 8,
+
+  COV5_SIZE = 256,
+
+  COV5_BLOCK_COUNT_X_OFFSET = 0,
+  COV5_BLOCK_COUNT_X_SIZE = 4,
+
+  COV5_BLOCK_COUNT_Y_OFFSET = 4,
+  COV5_BLOCK_COUNT_Y_SIZE = 4,
+
+  COV5_BLOCK_COUNT_Z_OFFSET = 8,
+  COV5_BLOCK_COUNT_Z_SIZE = 4,
+
+  COV5_GROUP_SIZE_X_OFFSET = 12,
+  COV5_GROUP_SIZE_X_SIZE = 2,
+
+  COV5_GROUP_SIZE_Y_OFFSET = 14,
+  COV5_GROUP_SIZE_Y_SIZE = 2,
+
+  COV5_GROUP_SIZE_Z_OFFSET = 16,
+  COV5_GROUP_SIZE_Z_SIZE = 2,
+
+  COV5_REMAINDER_X_OFFSET = 18,
+  COV5_REMAINDER_X_SIZE = 2,
+
+  COV5_REMAINDER_Y_OFFSET = 20,
+  COV5_REMAINDER_Y_SIZE = 2,
+
+  COV5_REMAINDER_Z_OFFSET = 22,
+  COV5_REMAINDER_Z_SIZE = 2,
+
+  COV5_GRID_DIMS_OFFSET = 64,
+  COV5_GRID_DIMS_SIZE = 2,
+
+  COV5_HOSTCALL_PTR_OFFSET = 80,
+
+  COV5_HEAPV1_PTR_OFFSET = 96,
+  COV5_HEAPV1_PTR_SIZE = 8,
+
+  // 128 KB
+  PER_DEVICE_PREALLOC_SIZE = 131072
 };
 
-static_assert(sizeof(AMDGPUImplicitArgsTy) == 56,
-              "Unexpected size of implicit arguments");
+// Defined in a header, so it must be inline to avoid ODR violations.
+inline uint16_t getImplicitArgsSize(uint16_t Version) {
+  return Version < ELF::ELFABIVERSION_AMDGPU_HSA_V5 ? COV4_SIZE : COV5_SIZE;
+}
 
 /// Parse a TargetID to get processor arch and feature map.
 /// Returns processor subarch.
@@ -295,7 +332,8 @@
 /// Reads the AMDGPU specific metadata from the ELF file and propagates the
 /// KernelInfoMap
 Error readAMDGPUMetaDataFromImage(MemoryBufferRef MemBuffer,
-                                  StringMap<KernelMetaDataTy> &KernelInfoMap) {
+                                  StringMap<KernelMetaDataTy> &KernelInfoMap,
+                                  uint16_t &ELFABIVersion) {
   Error Err = Error::success(); // Used later as out-parameter
 
   auto ELFOrError = object::ELF64LEFile::create(MemBuffer.getBuffer());
@@ -305,6 +343,12 @@
   const object::ELF64LEFile ELFObj = ELFOrError.get();
   ArrayRef<object::ELF64LE::Shdr> Sections = cantFail(ELFObj.sections());
   KernelInfoReader Reader(KernelInfoMap);
+
+  // Read the code object version from the ELF image header.
+  auto Header = ELFObj.getHeader();
+  ELFABIVersion = Header.e_ident[ELF::EI_ABIVERSION];
+  DP("ELFABIVERSION Version: %u\n", ELFABIVersion);
+
   for (const auto &S : Sections) {
     if (S.sh_type != ELF::SHT_NOTE)
       continue;
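For completeness, the caller-side shape of the widened metadata interface, as used by AMDGPUDeviceImageTy::loadExecutable in the rtl.cpp hunk above. This is a sketch: KernelMetaDataTy, readAMDGPUMetaDataFromImage, and getImplicitArgsSize come from the patched UtilitiesRTL.h, and the wrapping function is hypothetical.

```cpp
#include "llvm/ADT/StringMap.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/MemoryBufferRef.h"

// Sketch of a caller of the widened interface (types from UtilitiesRTL.h).
llvm::Error loadImageMetadata(llvm::MemoryBufferRef Image) {
  llvm::StringMap<utils::KernelMetaDataTy> KernelInfoMap;
  uint16_t ELFABIVersion = 0;
  if (llvm::Error Err = utils::readAMDGPUMetaDataFromImage(
          Image, KernelInfoMap, ELFABIVersion))
    return Err;
  // The version byte now drives the implicit-argument layout everywhere.
  (void)utils::getImplicitArgsSize(ELFABIVersion);
  return llvm::Error::success();
}
```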