diff --git a/openmp/libomptarget/plugins/amdgpu/impl/internal.h b/openmp/libomptarget/plugins/amdgpu/impl/internal.h --- a/openmp/libomptarget/plugins/amdgpu/impl/internal.h +++ b/openmp/libomptarget/plugins/amdgpu/impl/internal.h @@ -54,7 +54,8 @@ uint32_t sgpr_spill_count; uint32_t vgpr_spill_count; uint32_t kernel_segment_size; - uint32_t num_args; + uint32_t explicit_argument_count; // excluding implicit arguments + uint32_t hostcall_pointer_offset; // UINT32_MAX if not used } atl_kernel_info_t; typedef struct atl_symbol_info_s { diff --git a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp --- a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp @@ -62,7 +62,7 @@ }; KernelArgMD() - : name_(std::string()), size_(0), offset_(0), + : name_(std::string()), size_(0), offset_(0), valueKind_(ValueKind::Unknown) {} // fields @@ -381,7 +381,7 @@ return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; } - atl_kernel_info_t info = {0, 0, 0, 0, 0, 0, 0, 0, 0}; + atl_kernel_info_t info = {0, 0, 0, 0, 0, 0, 0, 0, 0, UINT32_MAX}; uint64_t sgpr_count, vgpr_count, sgpr_spill_count, vgpr_spill_count; msgpack_errors += map_lookup_uint64_t(element, ".sgpr_count", &sgpr_count); @@ -446,8 +446,7 @@ return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; } - info.num_args = argsSize; - + uint32_t explicit_argument_count = 0; for (size_t i = 0; i < argsSize; ++i) { KernelArgMD lcArg; @@ -473,18 +472,27 @@ lcArg.offset_); offset += lcArg.size_; + if (lcArg.valueKind_ == KernelArgMD::ValueKind::HiddenHostcallBuffer) { + if (info.hostcall_pointer_offset != UINT32_MAX) { + DP("[%s: Multiple hostcall buffers detected\n", kernelName.c_str()); + } + info.hostcall_pointer_offset = lcArg.offset_; + } + // check if the arg is a hidden/implicit arg // this logic assumes that all hidden args are 8-byte aligned if (!isImplicit(lcArg.valueKind_)) { + explicit_argument_count++; kernel_explicit_args_size += 
lcArg.size_; } else { hasHiddenArgs = true; } kernel_explicit_args_size += padding; } + info.explicit_argument_count = explicit_argument_count; } - // TODO: Probably don't want this arithmetic + // TODO: Probably don't want this arithmetic info.kernel_segment_size = (hasHiddenArgs ? kernel_explicit_args_size : kernel_segment_size); DP("[%s: kernarg seg size] (%lu --> %u)\n", kernelName.c_str(), diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -2071,7 +2071,7 @@ const uint32_t sgpr_spill_count = KernelInfoEntry.sgpr_spill_count; const uint32_t vgpr_spill_count = KernelInfoEntry.vgpr_spill_count; - assert(arg_num == (int)KernelInfoEntry.num_args); + assert(arg_num == (int)KernelInfoEntry.explicit_argument_count); /* * Set limit based on ThreadsPerGroup and GroupsPerDevice @@ -2173,14 +2173,24 @@ // under a multiple reader lock, not a writer lock. static pthread_mutex_t hostcall_init_lock = PTHREAD_MUTEX_INITIALIZER; pthread_mutex_lock(&hostcall_init_lock); - impl_args->hostcall_ptr = hostrpc_assign_buffer( + unsigned long buffer = hostrpc_assign_buffer( DeviceInfo.HSAAgents[device_id], queue, device_id); pthread_mutex_unlock(&hostcall_init_lock); - if (!impl_args->hostcall_ptr) { + if (!buffer) { DP("hostrpc_assign_buffer failed, gpu would dereference null and " "error\n"); return OFFLOAD_FAIL; } + + uint32_t HostcallOffset = KernelInfoEntry.hostcall_pointer_offset; + if (HostcallOffset != UINT32_MAX) { + if (HostcallOffset > (ArgPool->kernarg_segment_size - 8)) { + DP("Bad offset of hostcall, exceeds kernarg segment size\n"); + } else { + memcpy(static_cast<char *>(kernarg) + HostcallOffset, &buffer, 8); + } + } + impl_args->hostcall_ptr = buffer; } packet->kernarg_address = kernarg;