diff --git a/openmp/libomptarget/plugins/amdgpu/impl/internal.h b/openmp/libomptarget/plugins/amdgpu/impl/internal.h --- a/openmp/libomptarget/plugins/amdgpu/impl/internal.h +++ b/openmp/libomptarget/plugins/amdgpu/impl/internal.h @@ -54,7 +54,8 @@ uint32_t sgpr_spill_count; uint32_t vgpr_spill_count; uint32_t kernel_segment_size; - uint32_t num_args; + uint32_t explicit_argument_count; + uint32_t implicit_argument_count; } atl_kernel_info_t; typedef struct atl_symbol_info_s { diff --git a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp --- a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp @@ -381,7 +381,7 @@ return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; } - atl_kernel_info_t info = {0, 0, 0, 0, 0, 0, 0, 0, 0}; + atl_kernel_info_t info = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; uint64_t sgpr_count, vgpr_count, sgpr_spill_count, vgpr_spill_count; msgpack_errors += map_lookup_uint64_t(element, ".sgpr_count", &sgpr_count); @@ -446,8 +446,6 @@ return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; } - info.num_args = argsSize; - for (size_t i = 0; i < argsSize; ++i) { KernelArgMD lcArg; @@ -476,8 +474,10 @@ // check if the arg is a hidden/implicit arg // this logic assumes that all hidden args are 8-byte aligned if (!isImplicit(lcArg.valueKind_)) { + info.explicit_argument_count++; kernel_explicit_args_size += lcArg.size_; } else { + info.implicit_argument_count++; hasHiddenArgs = true; } kernel_explicit_args_size += padding; diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -2071,7 +2071,7 @@ const uint32_t sgpr_spill_count = KernelInfoEntry.sgpr_spill_count; const uint32_t vgpr_spill_count = KernelInfoEntry.vgpr_spill_count; - assert(arg_num == (int)KernelInfoEntry.num_args); + assert(arg_num == 
(int)KernelInfoEntry.explicit_argument_count); /* * Set limit based on ThreadsPerGroup and GroupsPerDevice @@ -2173,14 +2173,31 @@ // under a multiple reader lock, not a writer lock. static pthread_mutex_t hostcall_init_lock = PTHREAD_MUTEX_INITIALIZER; pthread_mutex_lock(&hostcall_init_lock); - impl_args->hostcall_ptr = hostrpc_assign_buffer( + unsigned long buffer = hostrpc_assign_buffer( DeviceInfo.HSAAgents[device_id], queue, device_id); pthread_mutex_unlock(&hostcall_init_lock); - if (!impl_args->hostcall_ptr) { + if (!buffer) { DP("hostrpc_assign_buffer failed, gpu would dereference null and " "error\n"); return OFFLOAD_FAIL; } + + if (KernelInfoEntry.implicit_argument_count >= 4) { + // Initialise pointer for implicit_argument_count != 0 ABI + // Guess that the right implicit argument is at offset 24 after + // the explicit arguments. In the future, should be able to read + // the offset from msgpack. Clang is not annotating it at present. + uint64_t Offset = + sizeof(void *) * (KernelInfoEntry.explicit_argument_count + 3); + if ((Offset + 8) > (ArgPool->kernarg_segment_size)) { + DP("Bad offset of hostcall, exceeds kernarg segment size\n"); + } else { + memcpy(static_cast<char *>(kernarg) + Offset, &buffer, 8); + } + } + + // initialise pointer for implicit_argument_count == 0 ABI + impl_args->hostcall_ptr = buffer; } packet->kernarg_address = kernarg;