diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -27,6 +27,7 @@
 #include "clang/AST/OSLog.h"
 #include "clang/Basic/TargetBuiltins.h"
 #include "clang/Basic/TargetInfo.h"
+#include "clang/Basic/TargetOptions.h"
 #include "clang/CodeGen/CGFunctionInfo.h"
 #include "clang/Frontend/FrontendDiagnostic.h"
 #include "llvm/ADT/APFloat.h"
@@ -17130,23 +17131,80 @@
 
 // \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.
 Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) {
-  bool IsCOV_5 = CGF.getTarget().getTargetOpts().CodeObjectVersion ==
-                 clang::TargetOptions::COV_5;
-  Constant *Offset;
-  Value *DP;
-  if (IsCOV_5) {
+  llvm::LoadInst *LD;
+  Constant *Offset;
+  Value *DP;
+
+  auto Cov = CGF.getTarget().getTargetOpts().CodeObjectVersion;
+
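+  // When the code object version is unknown at compile time
+  // (-mcode-object-version=none), branch on the llvm.amdgcn.abi.version
+  // constant emitted by AMDGPUTargetCodeGenInfo::emitTargetGlobals below,
+  // selecting either the implicit-kernarg GEP (COV5) or the dispatch-packet
+  // GEP (COV4) through a PHI.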
+  if (Cov == clang::TargetOptions::COV_None) {
+    auto *ABIVersionC = CGF.CGM.GetOrCreateLLVMGlobal(
+        "llvm.amdgcn.abi.version", CGF.Int32Ty, LangAS::Default, nullptr,
+        CodeGen::NotForDefinition);
+
+    // GetOrCreateLLVMGlobal creates a declaration when none exists, so the
+    // returned constant is always non-null and can be loaded unconditionally.
+    Value *ABIVersion = CGF.Builder.CreateAlignedLoad(CGF.Int32Ty, ABIVersionC,
+                                                      CGF.CGM.getIntAlign());
+
+    Value *IsCov5 = CGF.Builder.CreateICmpSGE(
+        ABIVersion,
+        llvm::ConstantInt::get(CGF.Int32Ty, clang::TargetOptions::COV_5));
+
+    Function *TheFunction = CGF.Builder.GetInsertBlock()->getParent();
+
+    BasicBlock *NewABI = CGF.createBasicBlock("amdgcn.abi.cov5", TheFunction);
+    BasicBlock *OldABI = CGF.createBasicBlock("amdgcn.abi.cov4", nullptr);
+    BasicBlock *End = CGF.createBasicBlock("amdgcn.abi.end", nullptr);
+
+    CGF.Builder.CreateCondBr(IsCov5, NewABI, OldABI);
+    CGF.Builder.SetInsertPoint(NewABI);
     // Indexing the implicit kernarg segment.
     Offset = llvm::ConstantInt::get(CGF.Int32Ty, 12 + Index * 2);
     DP = EmitAMDGPUImplicitArgPtr(CGF);
-  } else {
+    auto *GEPNew = CGF.Builder.CreateGEP(CGF.Int8Ty, DP, Offset);
+    CGF.Builder.CreateBr(End);
+    NewABI = CGF.Builder.GetInsertBlock();
+
+    TheFunction->insert(TheFunction->end(), OldABI);
+    CGF.Builder.SetInsertPoint(OldABI);
     // Indexing the HSA kernel_dispatch_packet struct.
-    Offset = llvm::ConstantInt::get(CGF.Int32Ty, 4 + Index * 2);
-    DP = EmitAMDGPUDispatchPtr(CGF);
+    Constant *Offset1 = llvm::ConstantInt::get(CGF.Int32Ty, 4 + Index * 2);
+    Value *DP1 = EmitAMDGPUDispatchPtr(CGF);
+    auto *GEPOld = CGF.Builder.CreateGEP(CGF.Int8Ty, DP1, Offset1);
+    CGF.Builder.CreateBr(End);
+    OldABI = CGF.Builder.GetInsertBlock();
+
+    TheFunction->insert(TheFunction->end(), End);
+    CGF.Builder.SetInsertPoint(End);
+    PHINode *Result =
+        CGF.Builder.CreatePHI(GEPNew->getType(), 2, "abi_gep_result");
+    Result->addIncoming(GEPNew, NewABI);
+    Result->addIncoming(GEPOld, OldABI);
+    LD = CGF.Builder.CreateLoad(
+        Address(Result, CGF.Int16Ty, CharUnits::fromQuantity(2)));
+  } else {
+    if (Cov == clang::TargetOptions::COV_5) {
+      // Indexing the implicit kernarg segment.
+      Offset = llvm::ConstantInt::get(CGF.Int32Ty, 12 + Index * 2);
+      DP = EmitAMDGPUImplicitArgPtr(CGF);
+    } else {
+      // Indexing the HSA kernel_dispatch_packet struct.
+      Offset = llvm::ConstantInt::get(CGF.Int32Ty, 4 + Index * 2);
+      DP = EmitAMDGPUDispatchPtr(CGF);
+    }
+    auto *GEP = CGF.Builder.CreateGEP(CGF.Int8Ty, DP, Offset);
+    LD = CGF.Builder.CreateLoad(
+        Address(GEP, CGF.Int16Ty, CharUnits::fromQuantity(2)));
   }
 
-  auto *GEP = CGF.Builder.CreateGEP(CGF.Int8Ty, DP, Offset);
-  auto *LD = CGF.Builder.CreateLoad(
-      Address(GEP, CGF.Int16Ty, CharUnits::fromQuantity(2)));
   llvm::MDBuilder MDHelper(CGF.getLLVMContext());
   llvm::MDNode *RNode = MDHelper.createRange(APInt(16, 1),
       APInt(16, CGF.getTarget().getMaxOpenCLWorkGroupSize() + 1));
diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h
--- a/clang/lib/CodeGen/CodeGenModule.h
+++ b/clang/lib/CodeGen/CodeGenModule.h
@@ -1567,6 +1567,11 @@
   void handleAMDGPUWavesPerEUAttr(llvm::Function *F,
                                   const AMDGPUWavesPerEUAttr *A);
 
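+  /// Returns the module-level global with the given mangled name, creating a
+  /// declaration if none exists. Exposed so target-specific code generation
+  /// can reference compiler-emitted globals such as llvm.amdgcn.abi.version.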
+  llvm::Constant *
+  GetOrCreateLLVMGlobal(StringRef MangledName, llvm::Type *Ty, LangAS AddrSpace,
+                        const VarDecl *D,
+                        ForDefinition_t IsForDefinition = NotForDefinition);
+
 private:
   llvm::Constant *GetOrCreateLLVMFunction(
       StringRef MangledName, llvm::Type *Ty, GlobalDecl D, bool ForVTable,
@@ -1589,11 +1594,6 @@
   void UpdateMultiVersionNames(GlobalDecl GD, const FunctionDecl *FD,
                                StringRef &CurName);
 
-  llvm::Constant *
-  GetOrCreateLLVMGlobal(StringRef MangledName, llvm::Type *Ty, LangAS AddrSpace,
-                        const VarDecl *D,
-                        ForDefinition_t IsForDefinition = NotForDefinition);
-
   bool GetCPUAndFeaturesAttributes(GlobalDecl GD,
                                    llvm::AttrBuilder &AttrBuilder,
                                    bool SetTargetFeatures = true);
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -1201,6 +1201,8 @@
     getModule().addModuleFlag(llvm::Module::Error, "MaxTLSAlign",
                               getContext().getTargetInfo().getMaxTLSAlign());
 
+  getTargetCodeGenInfo().emitTargetGlobals(*this);
+
   getTargetCodeGenInfo().emitTargetMetadata(*this, MangledDeclNames);
 
   EmitBackendOptionsMetadata(getCodeGenOpts());
diff --git a/clang/lib/CodeGen/TargetInfo.h b/clang/lib/CodeGen/TargetInfo.h
--- a/clang/lib/CodeGen/TargetInfo.h
+++ b/clang/lib/CodeGen/TargetInfo.h
@@ -81,6 +81,9 @@
       CodeGen::CodeGenModule &CGM,
       const llvm::MapVector<GlobalDecl, StringRef> &MangledDeclNames) const {}
 
+  /// Provides a convenient hook to handle extra target-specific globals.
+  virtual void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const {}
+
   /// Any further codegen related checks that need to be done on a function call
   /// in a target specific manner.
   virtual void checkFunctionCallABI(CodeGenModule &CGM, SourceLocation CallLoc,
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -8,6 +8,7 @@
 
 #include "ABIInfoImpl.h"
 #include "TargetInfo.h"
+#include "clang/Basic/TargetOptions.h"
 
 using namespace clang;
 using namespace clang::CodeGen;
@@ -268,6 +269,8 @@
   void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
                                  CodeGenModule &CGM) const;
 
+  void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const override;
+
   void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
                            CodeGen::CodeGenModule &M) const override;
   unsigned getOpenCLKernelCallingConv() const override;
@@ -348,6 +351,38 @@
   }
 }
 
+/// Emits control constants used to change per-architecture behaviour in the
+/// AMDGPU ROCm device libraries.
+void AMDGPUTargetCodeGenInfo::emitTargetGlobals(
+    CodeGen::CodeGenModule &CGM) const {
+  if (!CGM.getTriple().isAMDGCN())
+    return;
+
+  auto AddGlobal = [&](StringRef Name,
+                       clang::TargetOptions::CodeObjectVersionKind Value,
+                       unsigned Size,
+                       llvm::GlobalValue::LinkageTypes Linkage =
+                           llvm::GlobalValue::WeakODRLinkage) {
+    if (CGM.getModule().getNamedGlobal(Name))
+      return;
+
+    auto *Type =
+        llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), Size);
+    auto *GV = new llvm::GlobalVariable(
+        CGM.getModule(), Type, true, Linkage,
+        llvm::ConstantInt::get(Type, Value), Name, nullptr,
+        llvm::GlobalValue::ThreadLocalMode::NotThreadLocal,
+        CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant));
+    GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
+    GV->setVisibility(llvm::GlobalValue::VisibilityTypes::HiddenVisibility);
+    GV->setAlignment(CGM.getDataLayout().getABITypeAlign(Type));
+  };
+
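+  // The name must stay in sync with EmitAMDGPUWorkGroupSize in CGBuiltin.cpp,
+  // which loads this global when the code object version is COV_None.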
+  AddGlobal("llvm.amdgcn.abi.version",
+            CGM.getTarget().getTargetOpts().CodeObjectVersion, /*Size=*/32,
+            llvm::GlobalValue::WeakODRLinkage);
+}
+
 void AMDGPUTargetCodeGenInfo::setTargetAttributes(
     const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
   if (requiresAMDGPUProtectedVisibility(D, GV)) {
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -1354,6 +1354,11 @@
 
   // Handle -Xopenmp-target flags
   for (auto *A : Args) {
+    // Pass the code object version to the device toolchain so that the
+    // metadata in intermediate files is set correctly.
+    if (A->getOption().matches(options::OPT_mcode_object_version_EQ))
+      DAL->append(A);
+
     // Exclude flags which may only apply to the host toolchain.
     // Do not exclude flags when the host triple (AuxTriple)
     // matches the current toolchain triple. If it is not present
diff --git a/openmp/libomptarget/DeviceRTL/CMakeLists.txt b/openmp/libomptarget/DeviceRTL/CMakeLists.txt
--- a/openmp/libomptarget/DeviceRTL/CMakeLists.txt
+++ b/openmp/libomptarget/DeviceRTL/CMakeLists.txt
@@ -282,7 +282,7 @@
 add_custom_target(omptarget.devicertl.amdgpu)
 foreach(gpu_arch ${LIBOMPTARGET_DEVICE_ARCHITECTURES})
   if("${gpu_arch}" IN_LIST all_amdgpu_architectures)
-    compileDeviceRTLLibrary(${gpu_arch} amdgpu amdgcn-amd-amdhsa)
+    compileDeviceRTLLibrary(${gpu_arch} amdgpu amdgcn-amd-amdhsa -Xclang -mcode-object-version=none)
   elseif("${gpu_arch}" IN_LIST all_nvptx_architectures)
     compileDeviceRTLLibrary(${gpu_arch} nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx61)
   else()
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -253,6 +253,13 @@
     return Plugin::check(Status, "Error in hsa_amd_agents_allow_access: %s");
   }
 
+  Error zeroInitializeMemory(void *Ptr, size_t Size) {
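+    // hsa_amd_memory_fill writes whole uint32_t words, so round the byte size
+    // up to a multiple of four and pass the count in words.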
+    uint64_t Rounded = sizeof(uint32_t) * ((Size + 3) / sizeof(uint32_t));
+    hsa_status_t Status =
+        hsa_amd_memory_fill(Ptr, 0, Rounded / sizeof(uint32_t));
+    return Plugin::check(Status, "Error in hsa_amd_memory_fill: %s");
+  }
+
   /// Get attribute from the memory pool.
   template <typename Ty>
   Error getAttr(hsa_amd_memory_pool_info_t Kind, Ty &Value) const {
@@ -381,6 +388,9 @@
   /// Get the executable.
   hsa_executable_t getExecutable() const { return Executable; }
 
+  /// Get the code object version of the ELF.
+  uint16_t getELFABIVersion() const { return ELFABIVersion; }
+
   /// Find an HSA device symbol by its name on the executable.
   Expected<hsa_executable_symbol_t>
   findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const;
@@ -401,6 +411,7 @@
   hsa_executable_t Executable;
   hsa_code_object_t CodeObject;
   StringMap<utils::KernelMetaDataTy> KernelInfoMap;
+  uint16_t ELFABIVersion;
 };
 
 /// Class implementing the AMDGPU kernel functionalities which derives from the
@@ -408,8 +419,7 @@
 struct AMDGPUKernelTy : public GenericKernelTy {
   /// Create an AMDGPU kernel with a name and an execution mode.
   AMDGPUKernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode)
-      : GenericKernelTy(Name, ExecutionMode),
-        ImplicitArgsSize(sizeof(utils::AMDGPUImplicitArgsTy)) {}
+      : GenericKernelTy(Name, ExecutionMode) {}
 
   /// Initialize the AMDGPU kernel.
   Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override {
@@ -450,6 +460,12 @@
     // TODO: Read the kernel descriptor for the max threads per block. May be
     // read from the image.
 
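+    // Select the implicit-argument layout from the ELF ABI version baked into
+    // the image: 56 bytes up to code object v4, 256 bytes for v5.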
+    ImplicitArgsSize =
+        (AMDImage.getELFABIVersion() < llvm::ELF::ELFABIVERSION_AMDGPU_HSA_V5)
+            ? utils::COV4_SIZE
+            : utils::COV5_SIZE;
+    DP("ELFABIVersion: %d\n", AMDImage.getELFABIVersion());
+
     // Get additional kernel info read from image
     KernelInfo = AMDImage.getKernelInfo(getName());
     if (!KernelInfo.has_value())
@@ -476,6 +492,10 @@
   /// Get the HSA kernel object representing the kernel function.
   uint64_t getKernelObject() const { return KernelObject; }
 
+  /// Get the size of the implicit arguments based on the code object version.
+  /// \return 56 for COV4 and 256 for COV5.
+  uint32_t getImplicitArgsSize() const { return ImplicitArgsSize; }
+
 private:
   /// The kernel object to execute.
   uint64_t KernelObject;
@@ -486,7 +506,7 @@
   uint32_t PrivateSize;
 
   /// The size of implicit kernel arguments.
-  const uint32_t ImplicitArgsSize;
+  uint32_t ImplicitArgsSize;
 
   /// Additional Info for the AMD GPU Kernel
   std::optional<utils::KernelMetaDataTy> KernelInfo;
@@ -1728,6 +1748,9 @@
     if (auto Err = initMemoryPools())
       return Err;
 
+    // Pre-allocation of the device memory pool is intentionally left disabled
+    // for now; see preAllocateDeviceMemoryPool below.
+    // if (auto Err = preAllocateDeviceMemoryPool())
+    //   return Err;
     char GPUName[64];
     if (auto Err = getDeviceAttr(HSA_AGENT_INFO_NAME, GPUName))
       return Err;
@@ -2515,6 +2538,43 @@
         });
   }
 
+  /// Get the address of pointer to the preallocated device memory pool.
+  void **getPreAllocatedDeviceMemoryPool() {
+    return &PreAllocatedDeviceMemoryPool;
+  }
+
+  /// Allocate and zero initialize a small memory pool from the coarse grained
+  /// device memory of each device.
+  Error preAllocateDeviceMemoryPool() {
+    if (auto Err = retrieveAllMemoryPools())
+      return Plugin::error("Unable to retrieve all memory pools: %s",
+                           toString(std::move(Err)).data());
+
+    for (AMDGPUMemoryPoolTy *MemoryPool : AllMemoryPools) {
+      if (!MemoryPool->isCoarseGrained())
+        continue;
+
+      void *DevPtr = nullptr;
+      size_t PreAllocSize = utils::PER_DEVICE_PREALLOC_SIZE;
+
+      if (auto Err = MemoryPool->allocate(PreAllocSize, &DevPtr))
+        return Plugin::error("Device memory pool preallocation failed: %s",
+                             toString(std::move(Err)).data());
+
+      if (auto Err = MemoryPool->enableAccess(DevPtr, PreAllocSize,
+                                              {getAgent()}))
+        return Plugin::error("Preallocated device memory pool inaccessible: %s",
+                             toString(std::move(Err)).data());
+
+      if (auto Err = MemoryPool->zeroInitializeMemory(DevPtr, PreAllocSize))
+        return Plugin::error(
+            "Zero initialization of preallocated device memory pool failed: %s",
+            toString(std::move(Err)).data());
+
+      PreAllocatedDeviceMemoryPool = DevPtr;
+    }
+    return Plugin::success();
+  }
+
 private:
   using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>;
   using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>;
@@ -2572,6 +2632,9 @@
 
   /// Reference to the host device.
   AMDHostDeviceTy &HostDevice;
+
+  /// Pointer to the preallocated device memory pool
+  void *PreAllocatedDeviceMemoryPool;
 };
 
 Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
@@ -2605,8 +2668,8 @@
   if (Result)
     return Plugin::error("Loaded HSA executable does not validate");
 
-  if (auto Err =
-          utils::readAMDGPUMetaDataFromImage(getMemoryBuffer(), KernelInfoMap))
+  if (auto Err = utils::readAMDGPUMetaDataFromImage(
+          getMemoryBuffer(), KernelInfoMap, ELFABIVersion))
     return Err;
 
   return Plugin::success();
@@ -2947,9 +3010,8 @@
   }
 
   // Initialize implicit arguments.
-  utils::AMDGPUImplicitArgsTy *ImplArgs =
-      reinterpret_cast<utils::AMDGPUImplicitArgsTy *>(
-          advanceVoidPtr(AllArgs, KernelArgsSize));
+  uint8_t *ImplArgs =
+      static_cast<uint8_t *>(advanceVoidPtr(AllArgs, KernelArgsSize));
 
   // Initialize the implicit arguments to zero.
   std::memset(ImplArgs, 0, ImplicitArgsSize);
@@ -2971,6 +3033,43 @@
   if (GenericDevice.getRPCServer())
     Stream->setRPCServer(GenericDevice.getRPCServer());
 
+  if (getImplicitArgsSize() < utils::COV5_SIZE) {
+    DP("Setting fields of ImplicitArgs for COV4\n");
+  } else {
+    DP("Setting fields of ImplicitArgs for COV5\n");
+    uint16_t Remainder = 0;
+    uint16_t GridDims = 1;
+    uint32_t NumThreadsYZ = 1;
+    uint16_t NumBlocksYZ = 0;
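+    // Under COV5 the plugin, rather than the hardware dispatch packet alone,
+    // publishes the launch geometry through the implicit arguments; device
+    // code reads the workgroup size from this segment.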
+    memcpy(&ImplArgs[utils::COV5_BLOCK_COUNT_X_OFFSET], &NumBlocks,
+           utils::COV5_BLOCK_COUNT_X_SIZE);
+    memcpy(&ImplArgs[utils::COV5_BLOCK_COUNT_Y_OFFSET], &NumBlocksYZ,
+           utils::COV5_BLOCK_COUNT_Y_SIZE);
+    memcpy(&ImplArgs[utils::COV5_BLOCK_COUNT_Z_OFFSET], &NumBlocksYZ,
+           utils::COV5_BLOCK_COUNT_Z_SIZE);
+
+    memcpy(&ImplArgs[utils::COV5_GROUP_SIZE_X_OFFSET], &NumThreads,
+           utils::COV5_GROUP_SIZE_X_SIZE);
+    memcpy(&ImplArgs[utils::COV5_GROUP_SIZE_Y_OFFSET], &NumThreadsYZ,
+           utils::COV5_GROUP_SIZE_Y_SIZE);
+    memcpy(&ImplArgs[utils::COV5_GROUP_SIZE_Z_OFFSET], &NumThreadsYZ,
+           utils::COV5_GROUP_SIZE_Z_SIZE);
+
+    memcpy(&ImplArgs[utils::COV5_REMAINDER_X_OFFSET], &Remainder,
+           utils::COV5_REMAINDER_X_SIZE);
+    memcpy(&ImplArgs[utils::COV5_REMAINDER_Y_OFFSET], &Remainder,
+           utils::COV5_REMAINDER_Y_SIZE);
+    memcpy(&ImplArgs[utils::COV5_REMAINDER_Z_OFFSET], &Remainder,
+           utils::COV5_REMAINDER_Z_SIZE);
+
+    memcpy(&ImplArgs[utils::COV5_GRID_DIMS_OFFSET], &GridDims,
+           utils::COV5_GRID_DIMS_SIZE);
+
+    // Disabled until preAllocateDeviceMemoryPool is re-enabled during device
+    // initialization:
+    // memcpy(&ImplArgs[utils::COV5_HEAPV1_PTR_OFFSET],
+    //        AMDGPUDevice.getPreAllocatedDeviceMemoryPool(),
+    //        utils::COV5_HEAPV1_PTR_SIZE);
+  }
+
   // Push the kernel launch into the stream.
   return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
                                   GroupSize, ArgsMemoryManager);
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
@@ -32,19 +33,55 @@
 namespace plugin {
 namespace utils {
 
-// The implicit arguments of AMDGPU kernels.
-struct AMDGPUImplicitArgsTy {
-  uint64_t OffsetX;
-  uint64_t OffsetY;
-  uint64_t OffsetZ;
-  uint64_t HostcallPtr;
-  uint64_t Unused0;
-  uint64_t Unused1;
-  uint64_t Unused2;
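+// Byte offsets and sizes of the implicit kernel arguments, following the
+// code object v4 and v5 kernarg layouts described in the AMDGPU backend's
+// code object ABI documentation.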
+enum IMPLICITARGS : uint32_t {
+  COV4_SIZE = 56,
+  COV4_HOSTCALL_PTR_OFFSET = 24,
+  HOSTCALL_PTR_SIZE = 8,
+
+  COV5_SIZE = 256,
+
+  COV5_BLOCK_COUNT_X_OFFSET = 0,
+  COV5_BLOCK_COUNT_X_SIZE = 4,
+
+  COV5_BLOCK_COUNT_Y_OFFSET = 4,
+  COV5_BLOCK_COUNT_Y_SIZE = 4,
+
+  COV5_BLOCK_COUNT_Z_OFFSET = 8,
+  COV5_BLOCK_COUNT_Z_SIZE = 4,
+
+  COV5_GROUP_SIZE_X_OFFSET = 12,
+  COV5_GROUP_SIZE_X_SIZE = 2,
+
+  COV5_GROUP_SIZE_Y_OFFSET = 14,
+  COV5_GROUP_SIZE_Y_SIZE = 2,
+
+  COV5_GROUP_SIZE_Z_OFFSET = 16,
+  COV5_GROUP_SIZE_Z_SIZE = 2,
+
+  COV5_REMAINDER_X_OFFSET = 18,
+  COV5_REMAINDER_X_SIZE = 2,
+
+  COV5_REMAINDER_Y_OFFSET = 20,
+  COV5_REMAINDER_Y_SIZE = 2,
+
+  COV5_REMAINDER_Z_OFFSET = 22,
+  COV5_REMAINDER_Z_SIZE = 2,
+
+  COV5_GRID_DIMS_OFFSET = 64,
+  COV5_GRID_DIMS_SIZE = 2,
+
+  COV5_HOSTCALL_PTR_OFFSET = 80,
+
+  COV5_HEAPV1_PTR_OFFSET = 96,
+  COV5_HEAPV1_PTR_SIZE = 8,
+
+  // 128 KB
+  PER_DEVICE_PREALLOC_SIZE = 131072
 };
 
-static_assert(sizeof(AMDGPUImplicitArgsTy) == 56,
-              "Unexpected size of implicit arguments");
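+/// Returns the implicit-argument segment size for the given ELF ABI version.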
+inline uint16_t getImplicitArgsSize(uint16_t Version) {
+  return Version < ELF::ELFABIVERSION_AMDGPU_HSA_V5 ? COV4_SIZE : COV5_SIZE;
+}
 
 /// Parse a TargetID to get processor arch and feature map.
 /// Returns processor subarch.
@@ -295,7 +332,8 @@
 /// Reads the AMDGPU specific metadata from the ELF file and propagates the
 /// KernelInfoMap
 Error readAMDGPUMetaDataFromImage(MemoryBufferRef MemBuffer,
-                                  StringMap<KernelMetaDataTy> &KernelInfoMap) {
+                                  StringMap<KernelMetaDataTy> &KernelInfoMap,
+                                  uint16_t &ELFABIVersion) {
   Error Err = Error::success(); // Used later as out-parameter
 
   auto ELFOrError = object::ELF64LEFile::create(MemBuffer.getBuffer());
@@ -305,6 +343,12 @@
   const object::ELF64LEFile ELFObj = ELFOrError.get();
   ArrayRef<object::ELF64LE::Shdr> Sections = cantFail(ELFObj.sections());
   KernelInfoReader Reader(KernelInfoMap);
+
+  // Read the code object version from the ELF image header.
+  auto Header = ELFObj.getHeader();
+  ELFABIVersion = Header.e_ident[ELF::EI_ABIVERSION];
+  DP("ELF ABI version: %u\n", ELFABIVersion);
+
   for (const auto &S : Sections) {
     if (S.sh_type != ELF::SHT_NOTE)
       continue;