diff --git a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h --- a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h +++ b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h @@ -12,4 +12,51 @@ const char *get_elf_mach_gfx_name(uint32_t EFlags); +/// Sizes and byte offsets of implicit-argument fields; for details see +/// https://llvm.org/docs/AMDGPUUsage.html#code-object-v5-metadata + +enum IMPLICITARGS : uint16_t { + COV4_SIZE = 56, // size of code object v4 metadata in bytes + COV4_HOSTCALL_PTR_OFFSET = 24, // offset of host call buffer pointer + HOSTCALL_PTR_SIZE = + 8, // size of hostcall buffer pointer, same for cov4 and cov5. + + COV5_SIZE = 256, // size of code object v5 metadata in bytes + + COV5_BLOCK_COUNT_X_OFFSET = 0, + COV5_BLOCK_COUNT_X_SIZE = 4, + + COV5_BLOCK_COUNT_Y_OFFSET = 4, + COV5_BLOCK_COUNT_Y_SIZE = 4, + + COV5_BLOCK_COUNT_Z_OFFSET = 8, + COV5_BLOCK_COUNT_Z_SIZE = 4, + + COV5_GROUP_SIZE_X_OFFSET = 12, + COV5_GROUP_SIZE_X_SIZE = 2, + + COV5_GROUP_SIZE_Y_OFFSET = 14, + COV5_GROUP_SIZE_Y_SIZE = 2, + + COV5_GROUP_SIZE_Z_OFFSET = 16, + COV5_GROUP_SIZE_Z_SIZE = 2, + + COV5_REMAINDER_X_OFFSET = 18, + COV5_REMAINDER_X_SIZE = 2, + + COV5_REMAINDER_Y_OFFSET = 20, + COV5_REMAINDER_Y_SIZE = 2, + + COV5_REMAINDER_Z_OFFSET = 22, + COV5_REMAINDER_Z_SIZE = 2, + + COV5_GRID_DIMS_OFFSET = 64, + COV5_GRID_DIMS_SIZE = 2, + + COV5_HOSTCALL_PTR_OFFSET = 80, + + COV5_HEAPV1_PTR_OFFSET = 96, + COV5_HEAPV1_PTR_SIZE = 8 +}; + #endif diff --git a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp --- a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp @@ -11,6 +11,7 @@ // identifier) and contains more up to date values for the enum checked here. // rtl.cpp uses the system elf.h. 
#include "llvm/BinaryFormat/ELF.h" +using namespace llvm::ELF; const char *get_elf_mach_gfx_name(uint32_t EFlags) { using namespace llvm::ELF; diff --git a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp --- a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp @@ -67,6 +67,17 @@ HiddenMultiGridSyncArg, HiddenHostcallBuffer, HiddenHeapV1, + HiddenBlockCountX, + HiddenBlockCountY, + HiddenBlockCountZ, + HiddenGroupSizeX, + HiddenGroupSizeY, + HiddenGroupSizeZ, + HiddenRemainderX, + HiddenRemainderY, + HiddenRemainderZ, + HiddenGridDims, + HiddenQueuePtr, Unknown }; @@ -102,7 +113,19 @@ {"hidden_multigrid_sync_arg", KernelArgMD::ValueKind::HiddenMultiGridSyncArg}, {"hidden_hostcall_buffer", KernelArgMD::ValueKind::HiddenHostcallBuffer}, - {"hidden_heap_v1", KernelArgMD::ValueKind::HiddenHeapV1}}; + {"hidden_heap_v1", KernelArgMD::ValueKind::HiddenHeapV1}, + {"hidden_block_count_x", KernelArgMD::ValueKind::HiddenBlockCountX}, + {"hidden_block_count_y", KernelArgMD::ValueKind::HiddenBlockCountY}, + {"hidden_block_count_z", KernelArgMD::ValueKind::HiddenBlockCountZ}, + {"hidden_group_size_x", KernelArgMD::ValueKind::HiddenGroupSizeX}, + {"hidden_group_size_y", KernelArgMD::ValueKind::HiddenGroupSizeY}, + {"hidden_group_size_z", KernelArgMD::ValueKind::HiddenGroupSizeZ}, + {"hidden_remainder_x", KernelArgMD::ValueKind::HiddenRemainderX}, + {"hidden_remainder_y", KernelArgMD::ValueKind::HiddenRemainderY}, + {"hidden_remainder_z", KernelArgMD::ValueKind::HiddenRemainderZ}, + {"hidden_grid_dims", KernelArgMD::ValueKind::HiddenGridDims}, + {"hidden_queue_ptr", KernelArgMD::ValueKind::HiddenQueuePtr}, +}; namespace core { @@ -164,6 +187,17 @@ case KernelArgMD::ValueKind::HiddenMultiGridSyncArg: case KernelArgMD::ValueKind::HiddenHostcallBuffer: case KernelArgMD::ValueKind::HiddenHeapV1: + case KernelArgMD::ValueKind::HiddenBlockCountX: + case 
KernelArgMD::ValueKind::HiddenBlockCountY: + case KernelArgMD::ValueKind::HiddenBlockCountZ: + case KernelArgMD::ValueKind::HiddenGroupSizeX: + case KernelArgMD::ValueKind::HiddenGroupSizeY: + case KernelArgMD::ValueKind::HiddenGroupSizeZ: + case KernelArgMD::ValueKind::HiddenRemainderX: + case KernelArgMD::ValueKind::HiddenRemainderY: + case KernelArgMD::ValueKind::HiddenRemainderZ: + case KernelArgMD::ValueKind::HiddenGridDims: + case KernelArgMD::ValueKind::HiddenQueuePtr: return true; default: return false; diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -1496,13 +1496,14 @@ ImplArgs->OffsetY = 0; ImplArgs->OffsetZ = 0; + uint64_t Buffer = 0; // assign a hostcall buffer for the selected Q if (__atomic_load_n(&DeviceInfo().HostcallRequired, __ATOMIC_ACQUIRE)) { // hostrpc_assign_buffer is not thread safe, and this function is // under a multiple reader lock, not a writer lock. static pthread_mutex_t HostcallInitLock = PTHREAD_MUTEX_INITIALIZER; pthread_mutex_lock(&HostcallInitLock); - uint64_t Buffer = hostrpc_assign_buffer( + Buffer = hostrpc_assign_buffer( DeviceInfo().HSAAgents[DeviceId], Queue, DeviceId); pthread_mutex_unlock(&HostcallInitLock); if (!Buffer) { @@ -1510,29 +1511,14 @@ "error\n"); return OFFLOAD_FAIL; } - - DP("Implicit argument count: %d\n", - KernelInfoEntry.implicit_argument_count); - if (KernelInfoEntry.implicit_argument_count >= 4) { - // Initialise pointer for implicit_argument_count != 0 ABI - // Guess that the right implicit argument is at offset 24 after - // the explicit arguments. In the future, should be able to read - // the offset from msgpack. Clang is not annotating it at present. 
- uint64_t Offset = - sizeof(void *) * (KernelInfoEntry.explicit_argument_count + 3); - if ((Offset + 8) > ArgPool->kernargSizeIncludingImplicit()) { - DP("Bad offset of hostcall: %lu, exceeds kernarg size w/ implicit " - "args: %d\n", - Offset + 8, ArgPool->kernargSizeIncludingImplicit()); - } else { - memcpy(static_cast<char *>(KernArg) + Offset, &Buffer, 8); - } - } - - // initialise pointer for implicit_argument_count == 0 ABI - ImplArgs->HostcallPtr = Buffer; } + DP("Implicit argument count: %d\n", + KernelInfoEntry.implicit_argument_count); + DP("Setting Hostcall buffer for COV4\n"); + memcpy(&reinterpret_cast<char *>(ImplArgs)[IMPLICITARGS::COV4_HOSTCALL_PTR_OFFSET], + &Buffer, IMPLICITARGS::HOSTCALL_PTR_SIZE); // offset is in bytes, so index a char view of ImplArgs rather than whole structs + Packet->kernarg_address = KernArg; }