Index: openmp/libomptarget/plugins/amdgpu/src/rtl.cpp =================================================================== --- openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -883,1355 +883,1351 @@ AsyncInfo->Queue = 0; } -bool elf_machine_id_is_amdgcn(__tgt_device_image *image) { - const uint16_t amdgcnMachineID = 224; // EM_AMDGPU may not be in system elf.h - int32_t r = elf_check_machine(image, amdgcnMachineID); - if (!r) { - DP("Supported machine ID not found\n"); - } - return r; -} +// Determine launch values for kernel. +struct launchVals { + int WorkgroupSize; + int GridSize; +}; +launchVals getLaunchVals(int WarpSize, EnvironmentVariables Env, + int ConstWGSize, + llvm::omp::OMPTgtExecModeFlags ExecutionMode, + int num_teams, int thread_limit, + uint64_t loop_tripcount, int DeviceNumTeams) { -uint32_t elf_e_flags(__tgt_device_image *image) { - char *img_begin = (char *)image->ImageStart; - size_t img_size = (char *)image->ImageEnd - img_begin; + int threadsPerGroup = RTLDeviceInfoTy::Default_WG_Size; + int num_groups = 0; - Elf *e = elf_memory(img_begin, img_size); - if (!e) { - DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1)); - return 0; + int Max_Teams = + Env.MaxTeamsDefault > 0 ? Env.MaxTeamsDefault : DeviceNumTeams; + if (Max_Teams > RTLDeviceInfoTy::HardTeamLimit) + Max_Teams = RTLDeviceInfoTy::HardTeamLimit; + + if (print_kernel_trace & STARTUP_DETAILS) { + DP("RTLDeviceInfoTy::Max_Teams: %d\n", RTLDeviceInfoTy::Max_Teams); + DP("Max_Teams: %d\n", Max_Teams); + DP("RTLDeviceInfoTy::Warp_Size: %d\n", WarpSize); + DP("RTLDeviceInfoTy::Max_WG_Size: %d\n", RTLDeviceInfoTy::Max_WG_Size); + DP("RTLDeviceInfoTy::Default_WG_Size: %d\n", + RTLDeviceInfoTy::Default_WG_Size); + DP("thread_limit: %d\n", thread_limit); + DP("threadsPerGroup: %d\n", threadsPerGroup); + DP("ConstWGSize: %d\n", ConstWGSize); + } + // check for thread_limit() clause + if (thread_limit > 0) { + threadsPerGroup = thread_limit; + DP("Setting threads per block to requested %d\n", thread_limit); + // Add master warp for GENERIC + if (ExecutionMode == + llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) { + threadsPerGroup += WarpSize; + DP("Adding master wavefront: +%d threads\n", WarpSize); + } + if (threadsPerGroup > RTLDeviceInfoTy::Max_WG_Size) { // limit to max + threadsPerGroup = RTLDeviceInfoTy::Max_WG_Size; + DP("Setting threads per block to maximum %d\n", threadsPerGroup); + } + } + // check flat_max_work_group_size attr here + if (threadsPerGroup > ConstWGSize) { + threadsPerGroup = ConstWGSize; + DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n", + threadsPerGroup); } + if (print_kernel_trace & STARTUP_DETAILS) + DP("threadsPerGroup: %d\n", threadsPerGroup); + DP("Preparing %d threads\n", threadsPerGroup); - Elf64_Ehdr *eh64 = elf64_getehdr(e); + // Set default num_groups (teams) + if (Env.TeamLimit > 0) + num_groups = (Max_Teams < Env.TeamLimit) ? 
Max_Teams : Env.TeamLimit; + else + num_groups = Max_Teams; + DP("Set default num of groups %d\n", num_groups); - if (!eh64) { - DP("Unable to get machine ID from ELF file!\n"); - elf_end(e); - return 0; + if (print_kernel_trace & STARTUP_DETAILS) { + DP("num_groups: %d\n", num_groups); + DP("num_teams: %d\n", num_teams); } - uint32_t Flags = eh64->e_flags; - - elf_end(e); - DP("ELF Flags: 0x%x\n", Flags); - return Flags; -} -} // namespace + // Reduce num_groups if threadsPerGroup exceeds RTLDeviceInfoTy::Max_WG_Size + // This reduction is typical for default case (no thread_limit clause). + // or when user goes crazy with num_teams clause. + // FIXME: We cant distinguish between a constant or variable thread limit. + // So we only handle constant thread_limits. + if (threadsPerGroup > + RTLDeviceInfoTy::Default_WG_Size) // 256 < threadsPerGroup <= 1024 + // Should we round threadsPerGroup up to nearest WarpSize + // here? + num_groups = (Max_Teams * RTLDeviceInfoTy::Max_WG_Size) / threadsPerGroup; -int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { - return elf_machine_id_is_amdgcn(image); -} + // check for num_teams() clause + if (num_teams > 0) { + num_groups = (num_teams < num_groups) ? num_teams : num_groups; + } + if (print_kernel_trace & STARTUP_DETAILS) { + DP("num_groups: %d\n", num_groups); + DP("Env.NumTeams %d\n", Env.NumTeams); + DP("Env.TeamLimit %d\n", Env.TeamLimit); + } -int __tgt_rtl_number_of_devices() { - // If the construction failed, no methods are safe to call - if (DeviceInfo.ConstructionSucceeded) { - return DeviceInfo.NumberOfDevices; + if (Env.NumTeams > 0) { + num_groups = (Env.NumTeams < num_groups) ? Env.NumTeams : num_groups; + DP("Modifying teams based on Env.NumTeams %d\n", Env.NumTeams); + } else if (Env.TeamLimit > 0) { + num_groups = (Env.TeamLimit < num_groups) ? Env.TeamLimit : num_groups; + DP("Modifying teams based on Env.TeamLimit%d\n", Env.TeamLimit); } else { - DP("AMDGPU plugin construction failed. Zero devices available\n"); - return 0; + if (num_teams <= 0) { + if (loop_tripcount > 0) { + if (ExecutionMode == + llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD) { + // round up to the nearest integer + num_groups = ((loop_tripcount - 1) / threadsPerGroup) + 1; + } else if (ExecutionMode == + llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) { + num_groups = loop_tripcount; + } else /* OMP_TGT_EXEC_MODE_GENERIC_SPMD */ { + // This is a generic kernel that was transformed to use SPMD-mode + // execution but uses Generic-mode semantics for scheduling. + num_groups = loop_tripcount; + } + DP("Using %d teams due to loop trip count %" PRIu64 " and number of " + "threads per block %d\n", + num_groups, loop_tripcount, threadsPerGroup); + } + } else { + num_groups = num_teams; + } + if (num_groups > Max_Teams) { + num_groups = Max_Teams; + if (print_kernel_trace & STARTUP_DETAILS) + DP("Limiting num_groups %d to Max_Teams %d \n", num_groups, Max_Teams); + } + if (num_groups > num_teams && num_teams > 0) { + num_groups = num_teams; + if (print_kernel_trace & STARTUP_DETAILS) + DP("Limiting num_groups %d to clause num_teams %d \n", num_groups, + num_teams); + } } -} -int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { - DP("Init requires flags to %ld\n", RequiresFlags); - DeviceInfo.RequiresFlags = RequiresFlags; - return RequiresFlags; + // num_teams clause always honored, no matter what, unless DEFAULT is active. + if (num_teams > 0) { + num_groups = num_teams; + // Cap num_groups to EnvMaxTeamsDefault if set. 
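// A minimal standalone sketch of the clause/cap interaction implemented
// above: an explicit num_teams clause always wins, but the result is still
// capped by Env.MaxTeamsDefault when that default is set. selectNumGroups is
// illustrative only, not part of the plugin.
static inline int selectNumGroups(int CurrentGroups, int NumTeamsClause,
                                  int MaxTeamsDefault) {
  int NumGroups = CurrentGroups;
  if (NumTeamsClause > 0)
    NumGroups = NumTeamsClause;                  // clause honored
  if (MaxTeamsDefault > 0 && NumGroups > MaxTeamsDefault)
    NumGroups = MaxTeamsDefault;                 // capped by the default
  return NumGroups;                              // e.g. (960, 4096, 1024) -> 1024
}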
+ if (Env.MaxTeamsDefault > 0 && num_groups > Env.MaxTeamsDefault) + num_groups = Env.MaxTeamsDefault; + } + if (print_kernel_trace & STARTUP_DETAILS) { + DP("threadsPerGroup: %d\n", threadsPerGroup); + DP("num_groups: %d\n", num_groups); + DP("loop_tripcount: %ld\n", loop_tripcount); + } + DP("Final %d num_groups and %d threadsPerGroup\n", num_groups, + threadsPerGroup); + + launchVals res; + res.WorkgroupSize = threadsPerGroup; + res.GridSize = threadsPerGroup * num_groups; + return res; } -namespace { -template bool enforce_upper_bound(T *value, T upper) { - bool changed = *value > upper; - if (changed) { - *value = upper; +static uint64_t acquire_available_packet_id(hsa_queue_t *queue) { + uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1); + bool full = true; + while (full) { + full = + packet_id >= (queue->size + hsa_queue_load_read_index_scacquire(queue)); } - return changed; + return packet_id; } -} // namespace - -int32_t __tgt_rtl_init_device(int device_id) { - hsa_status_t err; - // this is per device id init - DP("Initialize the device id: %d\n", device_id); +// Base kernel launch function used by synchronous and asynchronous versions +int32_t runRegionNowaitLocked(int32_t device_id, void *tgt_entry_ptr, + void **tgt_args, ptrdiff_t *tgt_offsets, + int32_t arg_num, int32_t num_teams, + int32_t thread_limit, uint64_t loop_tripcount) { + // Set the context we are using + // update thread limit content in gpu memory if un-initialized or specified + // from host - hsa_agent_t agent = DeviceInfo.HSAAgents[device_id]; + DP("Run target team region thread_limit %d\n", thread_limit); - // Get number of Compute Unit - uint32_t compute_units = 0; - err = hsa_agent_get_info( - agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, - &compute_units); - if (err != HSA_STATUS_SUCCESS) { - DeviceInfo.ComputeUnits[device_id] = 1; - DP("Error getting compute units : settiing to 1\n"); - } else { - DeviceInfo.ComputeUnits[device_id] = compute_units; - DP("Using %d compute unis per grid\n", DeviceInfo.ComputeUnits[device_id]); - } + // All args are references. + std::vector args(arg_num); + std::vector ptrs(arg_num); - char GetInfoName[64]; // 64 max size returned by get info - err = hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME, - (void *)GetInfoName); - if (err) - DeviceInfo.GPUName[device_id] = "--unknown gpu--"; - else { - DeviceInfo.GPUName[device_id] = GetInfoName; + DP("Arg_num: %d\n", arg_num); + for (int32_t i = 0; i < arg_num; ++i) { + ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]); + args[i] = &ptrs[i]; + DP("Offseted base: arg[%d]:" DPxMOD "\n", i, DPxPTR(ptrs[i])); } - if (print_kernel_trace & STARTUP_DETAILS) - DP("Device#%-2d CU's: %2d %s\n", device_id, - DeviceInfo.ComputeUnits[device_id], - DeviceInfo.GPUName[device_id].c_str()); + KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr; - // Query attributes to determine number of threads/block and blocks/grid. 
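// The acquire_available_packet_id helper added above reserves an AQL queue
// slot by bumping the write index and spinning until the index fits within
// (read index + queue size). A simplified, HSA-free model of that invariant;
// RingModel is illustrative and not part of the plugin.
#include <atomic>
#include <cstdint>
struct RingModel {
  std::atomic<uint64_t> WriteIndex{0};
  std::atomic<uint64_t> ReadIndex{0};
  uint64_t Size; // number of packet slots, a power of two
  uint64_t reserveSlot() {
    uint64_t Id = WriteIndex.fetch_add(1, std::memory_order_relaxed);
    // Wait until the consumer has retired enough packets for slot Id to fit.
    while (Id >= Size + ReadIndex.load(std::memory_order_acquire)) {
    }
    return Id; // the caller indexes the packet array with (Id & (Size - 1))
  }
};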
- uint16_t workgroup_max_dim[3]; - err = hsa_agent_get_info(agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM, - &workgroup_max_dim); - if (err != HSA_STATUS_SUCCESS) { - DeviceInfo.GroupsPerDevice[device_id] = RTLDeviceInfoTy::DefaultNumTeams; - DP("Error getting grid dims: num groups : %d\n", - RTLDeviceInfoTy::DefaultNumTeams); - } else if (workgroup_max_dim[0] <= RTLDeviceInfoTy::HardTeamLimit) { - DeviceInfo.GroupsPerDevice[device_id] = workgroup_max_dim[0]; - DP("Using %d ROCm blocks per grid\n", - DeviceInfo.GroupsPerDevice[device_id]); - } else { - DeviceInfo.GroupsPerDevice[device_id] = RTLDeviceInfoTy::HardTeamLimit; - DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping " - "at the hard limit\n", - workgroup_max_dim[0], RTLDeviceInfoTy::HardTeamLimit); + std::string kernel_name = std::string(KernelInfo->Name); + auto &KernelInfoTable = DeviceInfo.KernelInfoTable; + if (KernelInfoTable[device_id].find(kernel_name) == + KernelInfoTable[device_id].end()) { + DP("Kernel %s not found\n", kernel_name.c_str()); + return OFFLOAD_FAIL; } - // Get thread limit - hsa_dim3_t grid_max_dim; - err = hsa_agent_get_info(agent, HSA_AGENT_INFO_GRID_MAX_DIM, &grid_max_dim); - if (err == HSA_STATUS_SUCCESS) { - DeviceInfo.ThreadsPerGroup[device_id] = - reinterpret_cast(&grid_max_dim)[0] / - DeviceInfo.GroupsPerDevice[device_id]; - - if (DeviceInfo.ThreadsPerGroup[device_id] == 0) { - DeviceInfo.ThreadsPerGroup[device_id] = RTLDeviceInfoTy::Max_WG_Size; - DP("Default thread limit: %d\n", RTLDeviceInfoTy::Max_WG_Size); - } else if (enforce_upper_bound(&DeviceInfo.ThreadsPerGroup[device_id], - RTLDeviceInfoTy::Max_WG_Size)) { - DP("Capped thread limit: %d\n", RTLDeviceInfoTy::Max_WG_Size); - } else { - DP("Using ROCm Queried thread limit: %d\n", - DeviceInfo.ThreadsPerGroup[device_id]); - } - } else { - DeviceInfo.ThreadsPerGroup[device_id] = RTLDeviceInfoTy::Max_WG_Size; - DP("Error getting max block dimension, use default:%d \n", - RTLDeviceInfoTy::Max_WG_Size); - } + const atl_kernel_info_t KernelInfoEntry = + KernelInfoTable[device_id][kernel_name]; + const uint32_t group_segment_size = KernelInfoEntry.group_segment_size; + const uint32_t sgpr_count = KernelInfoEntry.sgpr_count; + const uint32_t vgpr_count = KernelInfoEntry.vgpr_count; + const uint32_t sgpr_spill_count = KernelInfoEntry.sgpr_spill_count; + const uint32_t vgpr_spill_count = KernelInfoEntry.vgpr_spill_count; - // Get wavefront size - uint32_t wavefront_size = 0; - err = - hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size); - if (err == HSA_STATUS_SUCCESS) { - DP("Queried wavefront size: %d\n", wavefront_size); - DeviceInfo.WarpSize[device_id] = wavefront_size; - } else { - // TODO: Burn the wavefront size into the code object - DP("Warning: Unknown wavefront size, assuming 64\n"); - DeviceInfo.WarpSize[device_id] = 64; - } + assert(arg_num == (int)KernelInfoEntry.explicit_argument_count); - // Adjust teams to the env variables + /* + * Set limit based on ThreadsPerGroup and GroupsPerDevice + */ + launchVals LV = + getLaunchVals(DeviceInfo.WarpSize[device_id], DeviceInfo.Env, + KernelInfo->ConstWGSize, KernelInfo->ExecutionMode, + num_teams, // From run_region arg + thread_limit, // From run_region arg + loop_tripcount, // From run_region arg + DeviceInfo.NumTeams[KernelInfo->device_id]); + const int GridSize = LV.GridSize; + const int WorkgroupSize = LV.WorkgroupSize; - if (DeviceInfo.Env.TeamLimit > 0 && - (enforce_upper_bound(&DeviceInfo.GroupsPerDevice[device_id], - DeviceInfo.Env.TeamLimit))) { - 
DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n", - DeviceInfo.Env.TeamLimit); + if (print_kernel_trace >= LAUNCH) { + int num_groups = GridSize / WorkgroupSize; + // enum modes are SPMD, GENERIC, NONE 0,1,2 + // if doing rtl timing, print to stderr, unless stdout requested. + bool traceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING); + fprintf(traceToStdout ? stdout : stderr, + "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) " + "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u " + "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu n:%s\n", + device_id, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize, + arg_num, num_groups, WorkgroupSize, num_teams, thread_limit, + group_segment_size, sgpr_count, vgpr_count, sgpr_spill_count, + vgpr_spill_count, loop_tripcount, KernelInfo->Name); } - // Set default number of teams - if (DeviceInfo.Env.NumTeams > 0) { - DeviceInfo.NumTeams[device_id] = DeviceInfo.Env.NumTeams; - DP("Default number of teams set according to environment %d\n", - DeviceInfo.Env.NumTeams); - } else { - char *TeamsPerCUEnvStr = getenv("OMP_TARGET_TEAMS_PER_PROC"); - int TeamsPerCU = DefaultTeamsPerCU; - if (TeamsPerCUEnvStr) { - TeamsPerCU = std::stoi(TeamsPerCUEnvStr); + // Run on the device. + { + hsa_queue_t *queue = DeviceInfo.HSAQueues[device_id].get(); + if (!queue) { + return OFFLOAD_FAIL; } + uint64_t packet_id = acquire_available_packet_id(queue); - DeviceInfo.NumTeams[device_id] = - TeamsPerCU * DeviceInfo.ComputeUnits[device_id]; - DP("Default number of teams = %d * number of compute units %d\n", - TeamsPerCU, DeviceInfo.ComputeUnits[device_id]); - } - - if (enforce_upper_bound(&DeviceInfo.NumTeams[device_id], - DeviceInfo.GroupsPerDevice[device_id])) { - DP("Default number of teams exceeds device limit, capping at %d\n", - DeviceInfo.GroupsPerDevice[device_id]); - } - - // Adjust threads to the env variables - if (DeviceInfo.Env.TeamThreadLimit > 0 && - (enforce_upper_bound(&DeviceInfo.NumThreads[device_id], - DeviceInfo.Env.TeamThreadLimit))) { - DP("Capping max number of threads to OMP_TEAMS_THREAD_LIMIT=%d\n", - DeviceInfo.Env.TeamThreadLimit); - } + const uint32_t mask = queue->size - 1; // size is a power of 2 + hsa_kernel_dispatch_packet_t *packet = + (hsa_kernel_dispatch_packet_t *)queue->base_address + + (packet_id & mask); - // Set default number of threads - DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::Default_WG_Size; - DP("Default number of threads set according to library's default %d\n", - RTLDeviceInfoTy::Default_WG_Size); - if (enforce_upper_bound(&DeviceInfo.NumThreads[device_id], - DeviceInfo.ThreadsPerGroup[device_id])) { - DP("Default number of threads exceeds device limit, capping at %d\n", - DeviceInfo.ThreadsPerGroup[device_id]); - } + // packet->header is written last + packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; + packet->workgroup_size_x = WorkgroupSize; + packet->workgroup_size_y = 1; + packet->workgroup_size_z = 1; + packet->reserved0 = 0; + packet->grid_size_x = GridSize; + packet->grid_size_y = 1; + packet->grid_size_z = 1; + packet->private_segment_size = KernelInfoEntry.private_segment_size; + packet->group_segment_size = KernelInfoEntry.group_segment_size; + packet->kernel_object = KernelInfoEntry.kernel_object; + packet->kernarg_address = 0; // use the block allocator + packet->reserved2 = 0; // impl writes id_ here + packet->completion_signal = {0}; // may want a pool of signals - DP("Device %d: default limit for groupsPerDevice %d & 
threadsPerGroup %d\n", - device_id, DeviceInfo.GroupsPerDevice[device_id], - DeviceInfo.ThreadsPerGroup[device_id]); + KernelArgPool *ArgPool = nullptr; + void *kernarg = nullptr; + { + auto it = KernelArgPoolMap.find(std::string(KernelInfo->Name)); + if (it != KernelArgPoolMap.end()) { + ArgPool = (it->second).get(); + } + } + if (!ArgPool) { + DP("Warning: No ArgPool for %s on device %d\n", KernelInfo->Name, + device_id); + } + { + if (ArgPool) { + assert(ArgPool->kernarg_segment_size == (arg_num * sizeof(void *))); + kernarg = ArgPool->allocate(arg_num); + } + if (!kernarg) { + DP("Allocate kernarg failed\n"); + return OFFLOAD_FAIL; + } - DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", device_id, - DeviceInfo.WarpSize[device_id], DeviceInfo.ThreadsPerGroup[device_id], - DeviceInfo.GroupsPerDevice[device_id], - DeviceInfo.GroupsPerDevice[device_id] * - DeviceInfo.ThreadsPerGroup[device_id]); + // Copy explicit arguments + for (int i = 0; i < arg_num; i++) { + memcpy((char *)kernarg + sizeof(void *) * i, args[i], sizeof(void *)); + } - return OFFLOAD_SUCCESS; -} + // Initialize implicit arguments. TODO: Which of these can be dropped + impl_implicit_args_t *impl_args = + reinterpret_cast( + static_cast(kernarg) + ArgPool->kernarg_segment_size); + memset(impl_args, 0, + sizeof(impl_implicit_args_t)); // may not be necessary + impl_args->offset_x = 0; + impl_args->offset_y = 0; + impl_args->offset_z = 0; -namespace { -Elf64_Shdr *find_only_SHT_HASH(Elf *elf) { - size_t N; - int rc = elf_getshdrnum(elf, &N); - if (rc != 0) { - return nullptr; - } + // assign a hostcall buffer for the selected Q + if (__atomic_load_n(&DeviceInfo.hostcall_required, __ATOMIC_ACQUIRE)) { + // hostrpc_assign_buffer is not thread safe, and this function is + // under a multiple reader lock, not a writer lock. + static pthread_mutex_t hostcall_init_lock = PTHREAD_MUTEX_INITIALIZER; + pthread_mutex_lock(&hostcall_init_lock); + unsigned long buffer = hostrpc_assign_buffer( + DeviceInfo.HSAAgents[device_id], queue, device_id); + pthread_mutex_unlock(&hostcall_init_lock); + if (!buffer) { + DP("hostrpc_assign_buffer failed, gpu would dereference null and " + "error\n"); + return OFFLOAD_FAIL; + } - Elf64_Shdr *result = nullptr; - for (size_t i = 0; i < N; i++) { - Elf_Scn *scn = elf_getscn(elf, i); - if (scn) { - Elf64_Shdr *shdr = elf64_getshdr(scn); - if (shdr) { - if (shdr->sh_type == SHT_HASH) { - if (result == nullptr) { - result = shdr; + if (KernelInfoEntry.implicit_argument_count >= 4) { + // Initialise pointer for implicit_argument_count != 0 ABI + // Guess that the right implicit argument is at offset 24 after + // the explicit arguments. In the future, should be able to read + // the offset from msgpack. Clang is not annotating it at present. 
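// The "offset 24" guess below is plain kernarg layout arithmetic: the
// hostcall slot is assumed to be the fourth pointer after the explicit
// arguments, i.e. three pointer slots (24 bytes) past them. Standalone
// illustration assuming 64-bit pointers; hostcallSlotOffset is not plugin
// code. The bounds check against kernarg_segment_size below guards the guess.
#include <cstdint>
constexpr uint64_t hostcallSlotOffset(uint32_t ExplicitArgCount) {
  return sizeof(void *) * (ExplicitArgCount + 3); // 24 bytes past explicit args
}
static_assert(hostcallSlotOffset(5) == 64, "5 explicit pointer args -> byte 64");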
+ uint64_t Offset = + sizeof(void *) * (KernelInfoEntry.explicit_argument_count + 3); + if ((Offset + 8) > (ArgPool->kernarg_segment_size)) { + DP("Bad offset of hostcall, exceeds kernarg segment size\n"); } else { - // multiple SHT_HASH sections not handled - return nullptr; + memcpy(static_cast(kernarg) + Offset, &buffer, 8); } } + + // initialise pointer for implicit_argument_count == 0 ABI + impl_args->hostcall_ptr = buffer; } - } - } - return result; -} -const Elf64_Sym *elf_lookup(Elf *elf, char *base, Elf64_Shdr *section_hash, - const char *symname) { + packet->kernarg_address = kernarg; + } - assert(section_hash); - size_t section_symtab_index = section_hash->sh_link; - Elf64_Shdr *section_symtab = - elf64_getshdr(elf_getscn(elf, section_symtab_index)); - size_t section_strtab_index = section_symtab->sh_link; + hsa_signal_t s = DeviceInfo.FreeSignalPool.pop(); + if (s.handle == 0) { + DP("Failed to get signal instance\n"); + return OFFLOAD_FAIL; + } + packet->completion_signal = s; + hsa_signal_store_relaxed(packet->completion_signal, 1); - const Elf64_Sym *symtab = - reinterpret_cast(base + section_symtab->sh_offset); + // Publish the packet indicating it is ready to be processed + core::packet_store_release(reinterpret_cast(packet), + core::create_header(), packet->setup); - const uint32_t *hashtab = - reinterpret_cast(base + section_hash->sh_offset); + // Since the packet is already published, its contents must not be + // accessed any more + hsa_signal_store_relaxed(queue->doorbell_signal, packet_id); - // Layout: - // nbucket - // nchain - // bucket[nbucket] - // chain[nchain] - uint32_t nbucket = hashtab[0]; - const uint32_t *bucket = &hashtab[2]; - const uint32_t *chain = &hashtab[nbucket + 2]; + while (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, + HSA_WAIT_STATE_BLOCKED) != 0) + ; - const size_t max = strlen(symname) + 1; - const uint32_t hash = elf_hash(symname); - for (uint32_t i = bucket[hash % nbucket]; i != 0; i = chain[i]) { - char *n = elf_strptr(elf, section_strtab_index, symtab[i].st_name); - if (strncmp(symname, n, max) == 0) { - return &symtab[i]; - } + assert(ArgPool); + ArgPool->deallocate(kernarg); + DeviceInfo.FreeSignalPool.push(s); } - return nullptr; + DP("Kernel completed\n"); + return OFFLOAD_SUCCESS; } -struct symbol_info { - void *addr = nullptr; - uint32_t size = UINT32_MAX; - uint32_t sh_type = SHT_NULL; -}; - -int get_symbol_info_without_loading(Elf *elf, char *base, const char *symname, - symbol_info *res) { - if (elf_kind(elf) != ELF_K_ELF) { - return 1; - } - - Elf64_Shdr *section_hash = find_only_SHT_HASH(elf); - if (!section_hash) { - return 1; +bool elf_machine_id_is_amdgcn(__tgt_device_image *image) { + const uint16_t amdgcnMachineID = 224; // EM_AMDGPU may not be in system elf.h + int32_t r = elf_check_machine(image, amdgcnMachineID); + if (!r) { + DP("Supported machine ID not found\n"); } + return r; +} - const Elf64_Sym *sym = elf_lookup(elf, base, section_hash, symname); - if (!sym) { - return 1; - } +uint32_t elf_e_flags(__tgt_device_image *image) { + char *img_begin = (char *)image->ImageStart; + size_t img_size = (char *)image->ImageEnd - img_begin; - if (sym->st_size > UINT32_MAX) { - return 1; + Elf *e = elf_memory(img_begin, img_size); + if (!e) { + DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1)); + return 0; } - if (sym->st_shndx == SHN_UNDEF) { - return 1; - } + Elf64_Ehdr *eh64 = elf64_getehdr(e); - Elf_Scn *section = elf_getscn(elf, sym->st_shndx); - if (!section) { - return 1; + if (!eh64) { + 
DP("Unable to get machine ID from ELF file!\n"); + elf_end(e); + return 0; } - Elf64_Shdr *header = elf64_getshdr(section); - if (!header) { - return 1; - } + uint32_t Flags = eh64->e_flags; - res->addr = sym->st_value + base; - res->size = static_cast(sym->st_size); - res->sh_type = header->sh_type; - return 0; + elf_end(e); + DP("ELF Flags: 0x%x\n", Flags); + return Flags; } +} // namespace -int get_symbol_info_without_loading(char *base, size_t img_size, - const char *symname, symbol_info *res) { - Elf *elf = elf_memory(base, img_size); - if (elf) { - int rc = get_symbol_info_without_loading(elf, base, symname, res); - elf_end(elf); - return rc; - } - return 1; +int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { + return elf_machine_id_is_amdgcn(image); } -hsa_status_t interop_get_symbol_info(char *base, size_t img_size, - const char *symname, void **var_addr, - uint32_t *var_size) { - symbol_info si; - int rc = get_symbol_info_without_loading(base, img_size, symname, &si); - if (rc == 0) { - *var_addr = si.addr; - *var_size = si.size; - return HSA_STATUS_SUCCESS; +int __tgt_rtl_number_of_devices() { + // If the construction failed, no methods are safe to call + if (DeviceInfo.ConstructionSucceeded) { + return DeviceInfo.NumberOfDevices; } else { - return HSA_STATUS_ERROR; + DP("AMDGPU plugin construction failed. Zero devices available\n"); + return 0; } } -template -hsa_status_t module_register_from_memory_to_place( - std::map &KernelInfoTable, - std::map &SymbolInfoTable, - void *module_bytes, size_t module_size, int DeviceId, C cb, - std::vector &HSAExecutables) { - auto L = [](void *data, size_t size, void *cb_state) -> hsa_status_t { - C *unwrapped = static_cast(cb_state); - return (*unwrapped)(data, size); - }; - return core::RegisterModuleFromMemory( - KernelInfoTable, SymbolInfoTable, module_bytes, module_size, - DeviceInfo.HSAAgents[DeviceId], L, static_cast(&cb), - HSAExecutables); +int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { + DP("Init requires flags to %ld\n", RequiresFlags); + DeviceInfo.RequiresFlags = RequiresFlags; + return RequiresFlags; } -} // namespace - -static uint64_t get_device_State_bytes(char *ImageStart, size_t img_size) { - uint64_t device_State_bytes = 0; - { - // If this is the deviceRTL, get the state variable size - symbol_info size_si; - int rc = get_symbol_info_without_loading( - ImageStart, img_size, "omptarget_nvptx_device_State_size", &size_si); - - if (rc == 0) { - if (size_si.size != sizeof(uint64_t)) { - DP("Found device_State_size variable with wrong size\n"); - return 0; - } - // Read number of bytes directly from the elf - memcpy(&device_State_bytes, size_si.addr, sizeof(uint64_t)); - } +namespace { +template bool enforce_upper_bound(T *value, T upper) { + bool changed = *value > upper; + if (changed) { + *value = upper; } - return device_State_bytes; + return changed; } +} // namespace -static __tgt_target_table * -__tgt_rtl_load_binary_locked(int32_t device_id, __tgt_device_image *image); - -static __tgt_target_table * -__tgt_rtl_load_binary_locked(int32_t device_id, __tgt_device_image *image); +int32_t __tgt_rtl_init_device(int device_id) { + hsa_status_t err; -__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, - __tgt_device_image *image) { - DeviceInfo.load_run_lock.lock(); - __tgt_target_table *res = __tgt_rtl_load_binary_locked(device_id, image); - DeviceInfo.load_run_lock.unlock(); - return res; -} + // this is per device id init + DP("Initialize the device id: %d\n", device_id); -struct 
device_environment { - // initialise an DeviceEnvironmentTy in the deviceRTL - // patches around differences in the deviceRTL between trunk, aomp, - // rocmcc. Over time these differences will tend to zero and this class - // simplified. - // Symbol may be in .data or .bss, and may be missing fields, todo: - // review aomp/trunk/rocm and simplify the following + hsa_agent_t agent = DeviceInfo.HSAAgents[device_id]; - // The symbol may also have been deadstripped because the device side - // accessors were unused. + // Get number of Compute Unit + uint32_t compute_units = 0; + err = hsa_agent_get_info( + agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, + &compute_units); + if (err != HSA_STATUS_SUCCESS) { + DeviceInfo.ComputeUnits[device_id] = 1; + DP("Error getting compute units : settiing to 1\n"); + } else { + DeviceInfo.ComputeUnits[device_id] = compute_units; + DP("Using %d compute unis per grid\n", DeviceInfo.ComputeUnits[device_id]); + } - // If the symbol is in .data (aomp, rocm) it can be written directly. - // If it is in .bss, we must wait for it to be allocated space on the - // gpu (trunk) and initialize after loading. - const char *sym() { return "omptarget_device_environment"; } + char GetInfoName[64]; // 64 max size returned by get info + err = hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME, + (void *)GetInfoName); + if (err) + DeviceInfo.GPUName[device_id] = "--unknown gpu--"; + else { + DeviceInfo.GPUName[device_id] = GetInfoName; + } - DeviceEnvironmentTy host_device_env; - symbol_info si; - bool valid = false; + if (print_kernel_trace & STARTUP_DETAILS) + DP("Device#%-2d CU's: %2d %s\n", device_id, + DeviceInfo.ComputeUnits[device_id], + DeviceInfo.GPUName[device_id].c_str()); - __tgt_device_image *image; - const size_t img_size; + // Query attributes to determine number of threads/block and blocks/grid. 
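// Further below in this function the compute-unit count queried here feeds
// the default team count: OMP_TARGET_TEAMS_PER_PROC (or DefaultTeamsPerCU)
// multiplied by the number of CUs, then capped to GroupsPerDevice. A
// standalone model of that arithmetic; defaultTeams is illustrative, not the
// plugin's code path.
#include <cstdlib>
#include <string>
static int defaultTeams(int ComputeUnits, int TeamsPerCUFallback,
                        int GroupsPerDeviceCap) {
  int TeamsPerCU = TeamsPerCUFallback;
  if (const char *Env = std::getenv("OMP_TARGET_TEAMS_PER_PROC"))
    TeamsPerCU = std::stoi(Env); // same environment variable the plugin reads
  int Teams = TeamsPerCU * ComputeUnits;
  return Teams > GroupsPerDeviceCap ? GroupsPerDeviceCap : Teams;
}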
+ uint16_t workgroup_max_dim[3]; + err = hsa_agent_get_info(agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM, + &workgroup_max_dim); + if (err != HSA_STATUS_SUCCESS) { + DeviceInfo.GroupsPerDevice[device_id] = RTLDeviceInfoTy::DefaultNumTeams; + DP("Error getting grid dims: num groups : %d\n", + RTLDeviceInfoTy::DefaultNumTeams); + } else if (workgroup_max_dim[0] <= RTLDeviceInfoTy::HardTeamLimit) { + DeviceInfo.GroupsPerDevice[device_id] = workgroup_max_dim[0]; + DP("Using %d ROCm blocks per grid\n", + DeviceInfo.GroupsPerDevice[device_id]); + } else { + DeviceInfo.GroupsPerDevice[device_id] = RTLDeviceInfoTy::HardTeamLimit; + DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping " + "at the hard limit\n", + workgroup_max_dim[0], RTLDeviceInfoTy::HardTeamLimit); + } - device_environment(int device_id, int number_devices, - __tgt_device_image *image, const size_t img_size) - : image(image), img_size(img_size) { + // Get thread limit + hsa_dim3_t grid_max_dim; + err = hsa_agent_get_info(agent, HSA_AGENT_INFO_GRID_MAX_DIM, &grid_max_dim); + if (err == HSA_STATUS_SUCCESS) { + DeviceInfo.ThreadsPerGroup[device_id] = + reinterpret_cast(&grid_max_dim)[0] / + DeviceInfo.GroupsPerDevice[device_id]; - host_device_env.NumDevices = number_devices; - host_device_env.DeviceNum = device_id; - host_device_env.DebugKind = 0; - host_device_env.DynamicMemSize = 0; - if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) { - host_device_env.DebugKind = std::stoi(envStr); + if (DeviceInfo.ThreadsPerGroup[device_id] == 0) { + DeviceInfo.ThreadsPerGroup[device_id] = RTLDeviceInfoTy::Max_WG_Size; + DP("Default thread limit: %d\n", RTLDeviceInfoTy::Max_WG_Size); + } else if (enforce_upper_bound(&DeviceInfo.ThreadsPerGroup[device_id], + RTLDeviceInfoTy::Max_WG_Size)) { + DP("Capped thread limit: %d\n", RTLDeviceInfoTy::Max_WG_Size); + } else { + DP("Using ROCm Queried thread limit: %d\n", + DeviceInfo.ThreadsPerGroup[device_id]); } - - int rc = get_symbol_info_without_loading((char *)image->ImageStart, - img_size, sym(), &si); - if (rc != 0) { - DP("Finding global device environment '%s' - symbol missing.\n", sym()); - return; - } - - if (si.size > sizeof(host_device_env)) { - DP("Symbol '%s' has size %u, expected at most %zu.\n", sym(), si.size, - sizeof(host_device_env)); - return; - } - - valid = true; + } else { + DeviceInfo.ThreadsPerGroup[device_id] = RTLDeviceInfoTy::Max_WG_Size; + DP("Error getting max block dimension, use default:%d \n", + RTLDeviceInfoTy::Max_WG_Size); } - bool in_image() { return si.sh_type != SHT_NOBITS; } - - hsa_status_t before_loading(void *data, size_t size) { - if (valid) { - if (in_image()) { - DP("Setting global device environment before load (%u bytes)\n", - si.size); - uint64_t offset = (char *)si.addr - (char *)image->ImageStart; - void *pos = (char *)data + offset; - memcpy(pos, &host_device_env, si.size); - } - } - return HSA_STATUS_SUCCESS; + // Get wavefront size + uint32_t wavefront_size = 0; + err = + hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size); + if (err == HSA_STATUS_SUCCESS) { + DP("Queried wavefront size: %d\n", wavefront_size); + DeviceInfo.WarpSize[device_id] = wavefront_size; + } else { + // TODO: Burn the wavefront size into the code object + DP("Warning: Unknown wavefront size, assuming 64\n"); + DeviceInfo.WarpSize[device_id] = 64; } - hsa_status_t after_loading() { - if (valid) { - if (!in_image()) { - DP("Setting global device environment after load (%u bytes)\n", - si.size); - int device_id = 
host_device_env.DeviceNum; - auto &SymbolInfo = DeviceInfo.SymbolInfoTable[device_id]; - void *state_ptr; - uint32_t state_ptr_size; - hsa_status_t err = interop_hsa_get_symbol_info( - SymbolInfo, device_id, sym(), &state_ptr, &state_ptr_size); - if (err != HSA_STATUS_SUCCESS) { - DP("failed to find %s in loaded image\n", sym()); - return err; - } + // Adjust teams to the env variables - if (state_ptr_size != si.size) { - DP("Symbol had size %u before loading, %u after\n", state_ptr_size, - si.size); - return HSA_STATUS_ERROR; - } + if (DeviceInfo.Env.TeamLimit > 0 && + (enforce_upper_bound(&DeviceInfo.GroupsPerDevice[device_id], + DeviceInfo.Env.TeamLimit))) { + DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n", + DeviceInfo.Env.TeamLimit); + } - return DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &host_device_env, - state_ptr_size, device_id); - } + // Set default number of teams + if (DeviceInfo.Env.NumTeams > 0) { + DeviceInfo.NumTeams[device_id] = DeviceInfo.Env.NumTeams; + DP("Default number of teams set according to environment %d\n", + DeviceInfo.Env.NumTeams); + } else { + char *TeamsPerCUEnvStr = getenv("OMP_TARGET_TEAMS_PER_PROC"); + int TeamsPerCU = DefaultTeamsPerCU; + if (TeamsPerCUEnvStr) { + TeamsPerCU = std::stoi(TeamsPerCUEnvStr); } - return HSA_STATUS_SUCCESS; - } -}; -static hsa_status_t impl_calloc(void **ret_ptr, size_t size, int DeviceId) { - uint64_t rounded = 4 * ((size + 3) / 4); - void *ptr; - hsa_amd_memory_pool_t MemoryPool = DeviceInfo.getDeviceMemoryPool(DeviceId); - hsa_status_t err = hsa_amd_memory_pool_allocate(MemoryPool, rounded, 0, &ptr); - if (err != HSA_STATUS_SUCCESS) { - return err; + DeviceInfo.NumTeams[device_id] = + TeamsPerCU * DeviceInfo.ComputeUnits[device_id]; + DP("Default number of teams = %d * number of compute units %d\n", + TeamsPerCU, DeviceInfo.ComputeUnits[device_id]); } - hsa_status_t rc = hsa_amd_memory_fill(ptr, 0, rounded / 4); - if (rc != HSA_STATUS_SUCCESS) { - DP("zero fill device_state failed with %u\n", rc); - core::Runtime::Memfree(ptr); - return HSA_STATUS_ERROR; + if (enforce_upper_bound(&DeviceInfo.NumTeams[device_id], + DeviceInfo.GroupsPerDevice[device_id])) { + DP("Default number of teams exceeds device limit, capping at %d\n", + DeviceInfo.GroupsPerDevice[device_id]); } - *ret_ptr = ptr; - return HSA_STATUS_SUCCESS; -} + // Adjust threads to the env variables + if (DeviceInfo.Env.TeamThreadLimit > 0 && + (enforce_upper_bound(&DeviceInfo.NumThreads[device_id], + DeviceInfo.Env.TeamThreadLimit))) { + DP("Capping max number of threads to OMP_TEAMS_THREAD_LIMIT=%d\n", + DeviceInfo.Env.TeamThreadLimit); + } -static bool image_contains_symbol(void *data, size_t size, const char *sym) { - symbol_info si; - int rc = get_symbol_info_without_loading((char *)data, size, sym, &si); - return (rc == 0) && (si.addr != nullptr); -} + // Set default number of threads + DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::Default_WG_Size; + DP("Default number of threads set according to library's default %d\n", + RTLDeviceInfoTy::Default_WG_Size); + if (enforce_upper_bound(&DeviceInfo.NumThreads[device_id], + DeviceInfo.ThreadsPerGroup[device_id])) { + DP("Default number of threads exceeds device limit, capping at %d\n", + DeviceInfo.ThreadsPerGroup[device_id]); + } -__tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id, - __tgt_device_image *image) { - // This function loads the device image onto gpu[device_id] and does other - // per-image initialization work. 
Specifically: - // - // - Initialize an DeviceEnvironmentTy instance embedded in the - // image at the symbol "omptarget_device_environment" - // Fields DebugKind, DeviceNum, NumDevices. Used by the deviceRTL. - // - // - Allocate a large array per-gpu (could be moved to init_device) - // - Read a uint64_t at symbol omptarget_nvptx_device_State_size - // - Allocate at least that many bytes of gpu memory - // - Zero initialize it - // - Write the pointer to the symbol omptarget_nvptx_device_State - // - // - Pulls some per-kernel information together from various sources and - // records it in the KernelsList for quicker access later - // - // The initialization can be done before or after loading the image onto the - // gpu. This function presently does a mixture. Using the hsa api to get/set - // the information is simpler to implement, in exchange for more complicated - // runtime behaviour. E.g. launching a kernel or using dma to get eight bytes - // back from the gpu vs a hashtable lookup on the host. + DP("Device %d: default limit for groupsPerDevice %d & threadsPerGroup %d\n", + device_id, DeviceInfo.GroupsPerDevice[device_id], + DeviceInfo.ThreadsPerGroup[device_id]); - const size_t img_size = (char *)image->ImageEnd - (char *)image->ImageStart; + DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", device_id, + DeviceInfo.WarpSize[device_id], DeviceInfo.ThreadsPerGroup[device_id], + DeviceInfo.GroupsPerDevice[device_id], + DeviceInfo.GroupsPerDevice[device_id] * + DeviceInfo.ThreadsPerGroup[device_id]); - DeviceInfo.clearOffloadEntriesTable(device_id); + return OFFLOAD_SUCCESS; +} - // We do not need to set the ELF version because the caller of this function - // had to do that to decide the right runtime to use +namespace { +Elf64_Shdr *find_only_SHT_HASH(Elf *elf) { + size_t N; + int rc = elf_getshdrnum(elf, &N); + if (rc != 0) { + return nullptr; + } - if (!elf_machine_id_is_amdgcn(image)) { - return NULL; + Elf64_Shdr *result = nullptr; + for (size_t i = 0; i < N; i++) { + Elf_Scn *scn = elf_getscn(elf, i); + if (scn) { + Elf64_Shdr *shdr = elf64_getshdr(scn); + if (shdr) { + if (shdr->sh_type == SHT_HASH) { + if (result == nullptr) { + result = shdr; + } else { + // multiple SHT_HASH sections not handled + return nullptr; + } + } + } + } } + return result; +} - { - auto env = device_environment(device_id, DeviceInfo.NumberOfDevices, image, - img_size); +const Elf64_Sym *elf_lookup(Elf *elf, char *base, Elf64_Shdr *section_hash, + const char *symname) { - auto &KernelInfo = DeviceInfo.KernelInfoTable[device_id]; - auto &SymbolInfo = DeviceInfo.SymbolInfoTable[device_id]; - hsa_status_t err = module_register_from_memory_to_place( - KernelInfo, SymbolInfo, (void *)image->ImageStart, img_size, device_id, - [&](void *data, size_t size) { - if (image_contains_symbol(data, size, "needs_hostcall_buffer")) { - __atomic_store_n(&DeviceInfo.hostcall_required, true, - __ATOMIC_RELEASE); - } - return env.before_loading(data, size); - }, - DeviceInfo.HSAExecutables); + assert(section_hash); + size_t section_symtab_index = section_hash->sh_link; + Elf64_Shdr *section_symtab = + elf64_getshdr(elf_getscn(elf, section_symtab_index)); + size_t section_strtab_index = section_symtab->sh_link; - check("Module registering", err); - if (err != HSA_STATUS_SUCCESS) { - const char *DeviceName = DeviceInfo.GPUName[device_id].c_str(); - const char *ElfName = get_elf_mach_gfx_name(elf_e_flags(image)); + const Elf64_Sym *symtab = + reinterpret_cast(base + section_symtab->sh_offset); - if 
(strcmp(DeviceName, ElfName) != 0) { - DP("Possible gpu arch mismatch: device:%s, image:%s please check" - " compiler flag: -march=\n", - DeviceName, ElfName); - } else { - DP("Error loading image onto GPU: %s\n", get_error_string(err)); - } + const uint32_t *hashtab = + reinterpret_cast(base + section_hash->sh_offset); - return NULL; - } + // Layout: + // nbucket + // nchain + // bucket[nbucket] + // chain[nchain] + uint32_t nbucket = hashtab[0]; + const uint32_t *bucket = &hashtab[2]; + const uint32_t *chain = &hashtab[nbucket + 2]; - err = env.after_loading(); - if (err != HSA_STATUS_SUCCESS) { - return NULL; + const size_t max = strlen(symname) + 1; + const uint32_t hash = elf_hash(symname); + for (uint32_t i = bucket[hash % nbucket]; i != 0; i = chain[i]) { + char *n = elf_strptr(elf, section_strtab_index, symtab[i].st_name); + if (strncmp(symname, n, max) == 0) { + return &symtab[i]; } } - DP("AMDGPU module successfully loaded!\n"); + return nullptr; +} - { - // the device_State array is either large value in bss or a void* that - // needs to be assigned to a pointer to an array of size device_state_bytes - // If absent, it has been deadstripped and needs no setup. - - void *state_ptr; - uint32_t state_ptr_size; - auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[device_id]; - hsa_status_t err = interop_hsa_get_symbol_info( - SymbolInfoMap, device_id, "omptarget_nvptx_device_State", &state_ptr, - &state_ptr_size); - - if (err != HSA_STATUS_SUCCESS) { - DP("No device_state symbol found, skipping initialization\n"); - } else { - if (state_ptr_size < sizeof(void *)) { - DP("unexpected size of state_ptr %u != %zu\n", state_ptr_size, - sizeof(void *)); - return NULL; - } +struct symbol_info { + void *addr = nullptr; + uint32_t size = UINT32_MAX; + uint32_t sh_type = SHT_NULL; +}; - // if it's larger than a void*, assume it's a bss array and no further - // initialization is required. Only try to set up a pointer for - // sizeof(void*) - if (state_ptr_size == sizeof(void *)) { - uint64_t device_State_bytes = - get_device_State_bytes((char *)image->ImageStart, img_size); - if (device_State_bytes == 0) { - DP("Can't initialize device_State, missing size information\n"); - return NULL; - } +int get_symbol_info_without_loading(Elf *elf, char *base, const char *symname, + symbol_info *res) { + if (elf_kind(elf) != ELF_K_ELF) { + return 1; + } - auto &dss = DeviceInfo.deviceStateStore[device_id]; - if (dss.first.get() == nullptr) { - assert(dss.second == 0); - void *ptr = NULL; - hsa_status_t err = impl_calloc(&ptr, device_State_bytes, device_id); - if (err != HSA_STATUS_SUCCESS) { - DP("Failed to allocate device_state array\n"); - return NULL; - } - dss = { - std::unique_ptr{ptr}, - device_State_bytes, - }; - } + Elf64_Shdr *section_hash = find_only_SHT_HASH(elf); + if (!section_hash) { + return 1; + } - void *ptr = dss.first.get(); - if (device_State_bytes != dss.second) { - DP("Inconsistent sizes of device_State unsupported\n"); - return NULL; - } + const Elf64_Sym *sym = elf_lookup(elf, base, section_hash, symname); + if (!sym) { + return 1; + } - // write ptr to device memory so it can be used by later kernels - err = DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &ptr, - sizeof(void *), device_id); - if (err != HSA_STATUS_SUCCESS) { - DP("memcpy install of state_ptr failed\n"); - return NULL; - } - } - } + if (sym->st_size > UINT32_MAX) { + return 1; } - // Here, we take advantage of the data that is appended after img_end to get - // the symbols' name we need to load. 
This data consist of the host entries - // begin and end as well as the target name (see the offloading linker script - // creation in clang compiler). + if (sym->st_shndx == SHN_UNDEF) { + return 1; + } - // Find the symbols in the module by name. The name can be obtain by - // concatenating the host entry name with the target name + Elf_Scn *section = elf_getscn(elf, sym->st_shndx); + if (!section) { + return 1; + } - __tgt_offload_entry *HostBegin = image->EntriesBegin; - __tgt_offload_entry *HostEnd = image->EntriesEnd; + Elf64_Shdr *header = elf64_getshdr(section); + if (!header) { + return 1; + } - for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) { + res->addr = sym->st_value + base; + res->size = static_cast(sym->st_size); + res->sh_type = header->sh_type; + return 0; +} - if (!e->addr) { - // The host should have always something in the address to - // uniquely identify the target region. - DP("Analyzing host entry '' (size = %lld)...\n", - (unsigned long long)e->size); - return NULL; - } +int get_symbol_info_without_loading(char *base, size_t img_size, + const char *symname, symbol_info *res) { + Elf *elf = elf_memory(base, img_size); + if (elf) { + int rc = get_symbol_info_without_loading(elf, base, symname, res); + elf_end(elf); + return rc; + } + return 1; +} - if (e->size) { - __tgt_offload_entry entry = *e; +hsa_status_t interop_get_symbol_info(char *base, size_t img_size, + const char *symname, void **var_addr, + uint32_t *var_size) { + symbol_info si; + int rc = get_symbol_info_without_loading(base, img_size, symname, &si); + if (rc == 0) { + *var_addr = si.addr; + *var_size = si.size; + return HSA_STATUS_SUCCESS; + } else { + return HSA_STATUS_ERROR; + } +} - void *varptr; - uint32_t varsize; +template +hsa_status_t module_register_from_memory_to_place( + std::map &KernelInfoTable, + std::map &SymbolInfoTable, + void *module_bytes, size_t module_size, int DeviceId, C cb, + std::vector &HSAExecutables) { + auto L = [](void *data, size_t size, void *cb_state) -> hsa_status_t { + C *unwrapped = static_cast(cb_state); + return (*unwrapped)(data, size); + }; + return core::RegisterModuleFromMemory( + KernelInfoTable, SymbolInfoTable, module_bytes, module_size, + DeviceInfo.HSAAgents[DeviceId], L, static_cast(&cb), + HSAExecutables); +} +} // namespace - auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[device_id]; - hsa_status_t err = interop_hsa_get_symbol_info( - SymbolInfoMap, device_id, e->name, &varptr, &varsize); +static uint64_t get_device_State_bytes(char *ImageStart, size_t img_size) { + uint64_t device_State_bytes = 0; + { + // If this is the deviceRTL, get the state variable size + symbol_info size_si; + int rc = get_symbol_info_without_loading( + ImageStart, img_size, "omptarget_nvptx_device_State_size", &size_si); - if (err != HSA_STATUS_SUCCESS) { - // Inform the user what symbol prevented offloading - DP("Loading global '%s' (Failed)\n", e->name); - return NULL; + if (rc == 0) { + if (size_si.size != sizeof(uint64_t)) { + DP("Found device_State_size variable with wrong size\n"); + return 0; } - if (varsize != e->size) { - DP("Loading global '%s' - size mismatch (%u != %lu)\n", e->name, - varsize, e->size); - return NULL; - } + // Read number of bytes directly from the elf + memcpy(&device_State_bytes, size_si.addr, sizeof(uint64_t)); + } + } + return device_State_bytes; +} - DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", - DPxPTR(e - HostBegin), e->name, DPxPTR(varptr)); - entry.addr = (void *)varptr; +static __tgt_target_table * 
+__tgt_rtl_load_binary_locked(int32_t device_id, __tgt_device_image *image); - DeviceInfo.addOffloadEntry(device_id, entry); +static __tgt_target_table * +__tgt_rtl_load_binary_locked(int32_t device_id, __tgt_device_image *image); - if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && - e->flags & OMP_DECLARE_TARGET_LINK) { - // If unified memory is present any target link variables - // can access host addresses directly. There is no longer a - // need for device copies. - err = DeviceInfo.freesignalpool_memcpy_h2d(varptr, e->addr, - sizeof(void *), device_id); - if (err != HSA_STATUS_SUCCESS) - DP("Error when copying USM\n"); - DP("Copy linked variable host address (" DPxMOD ")" - "to device address (" DPxMOD ")\n", - DPxPTR(*((void **)e->addr)), DPxPTR(varptr)); - } +__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, + __tgt_device_image *image) { + DeviceInfo.load_run_lock.lock(); + __tgt_target_table *res = __tgt_rtl_load_binary_locked(device_id, image); + DeviceInfo.load_run_lock.unlock(); + return res; +} - continue; - } +struct device_environment { + // initialise an DeviceEnvironmentTy in the deviceRTL + // patches around differences in the deviceRTL between trunk, aomp, + // rocmcc. Over time these differences will tend to zero and this class + // simplified. + // Symbol may be in .data or .bss, and may be missing fields, todo: + // review aomp/trunk/rocm and simplify the following - DP("to find the kernel name: %s size: %lu\n", e->name, strlen(e->name)); + // The symbol may also have been deadstripped because the device side + // accessors were unused. - // errors in kernarg_segment_size previously treated as = 0 (or as undef) - uint32_t kernarg_segment_size = 0; - auto &KernelInfoMap = DeviceInfo.KernelInfoTable[device_id]; - hsa_status_t err = HSA_STATUS_SUCCESS; - if (!e->name) { - err = HSA_STATUS_ERROR; - } else { - std::string kernelStr = std::string(e->name); - auto It = KernelInfoMap.find(kernelStr); - if (It != KernelInfoMap.end()) { - atl_kernel_info_t info = It->second; - kernarg_segment_size = info.kernel_segment_size; - } else { - err = HSA_STATUS_ERROR; - } - } + // If the symbol is in .data (aomp, rocm) it can be written directly. + // If it is in .bss, we must wait for it to be allocated space on the + // gpu (trunk) and initialize after loading. + const char *sym() { return "omptarget_device_environment"; } - // default value GENERIC (in case symbol is missing from cubin file) - llvm::omp::OMPTgtExecModeFlags ExecModeVal = - llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC; + DeviceEnvironmentTy host_device_env; + symbol_info si; + bool valid = false; - // get flat group size if present, else Default_WG_Size - int16_t WGSizeVal = RTLDeviceInfoTy::Default_WG_Size; + __tgt_device_image *image; + const size_t img_size; - // get Kernel Descriptor if present. 
- // Keep struct in sync wih getTgtAttributeStructQTy in CGOpenMPRuntime.cpp - struct KernDescValType { - uint16_t Version; - uint16_t TSize; - uint16_t WG_Size; - }; - struct KernDescValType KernDescVal; - std::string KernDescNameStr(e->name); - KernDescNameStr += "_kern_desc"; - const char *KernDescName = KernDescNameStr.c_str(); + device_environment(int device_id, int number_devices, + __tgt_device_image *image, const size_t img_size) + : image(image), img_size(img_size) { - void *KernDescPtr; - uint32_t KernDescSize; - void *CallStackAddr = nullptr; - err = interop_get_symbol_info((char *)image->ImageStart, img_size, - KernDescName, &KernDescPtr, &KernDescSize); + host_device_env.NumDevices = number_devices; + host_device_env.DeviceNum = device_id; + host_device_env.DebugKind = 0; + host_device_env.DynamicMemSize = 0; + if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) { + host_device_env.DebugKind = std::stoi(envStr); + } - if (err == HSA_STATUS_SUCCESS) { - if ((size_t)KernDescSize != sizeof(KernDescVal)) - DP("Loading global computation properties '%s' - size mismatch (%u != " - "%lu)\n", - KernDescName, KernDescSize, sizeof(KernDescVal)); + int rc = get_symbol_info_without_loading((char *)image->ImageStart, + img_size, sym(), &si); + if (rc != 0) { + DP("Finding global device environment '%s' - symbol missing.\n", sym()); + return; + } - memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize); + if (si.size > sizeof(host_device_env)) { + DP("Symbol '%s' has size %u, expected at most %zu.\n", sym(), si.size, + sizeof(host_device_env)); + return; + } - // Check structure size against recorded size. - if ((size_t)KernDescSize != KernDescVal.TSize) - DP("KernDescVal size %lu does not match advertized size %d for '%s'\n", - sizeof(KernDescVal), KernDescVal.TSize, KernDescName); + valid = true; + } - DP("After loading global for %s KernDesc \n", KernDescName); - DP("KernDesc: Version: %d\n", KernDescVal.Version); - DP("KernDesc: TSize: %d\n", KernDescVal.TSize); - DP("KernDesc: WG_Size: %d\n", KernDescVal.WG_Size); + bool in_image() { return si.sh_type != SHT_NOBITS; } - if (KernDescVal.WG_Size == 0) { - KernDescVal.WG_Size = RTLDeviceInfoTy::Default_WG_Size; - DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WG_Size); + hsa_status_t before_loading(void *data, size_t size) { + if (valid) { + if (in_image()) { + DP("Setting global device environment before load (%u bytes)\n", + si.size); + uint64_t offset = (char *)si.addr - (char *)image->ImageStart; + void *pos = (char *)data + offset; + memcpy(pos, &host_device_env, si.size); } - WGSizeVal = KernDescVal.WG_Size; - DP("WGSizeVal %d\n", WGSizeVal); - check("Loading KernDesc computation property", err); - } else { - DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName); - - // Flat group size - std::string WGSizeNameStr(e->name); - WGSizeNameStr += "_wg_size"; - const char *WGSizeName = WGSizeNameStr.c_str(); - - void *WGSizePtr; - uint32_t WGSize; - err = interop_get_symbol_info((char *)image->ImageStart, img_size, - WGSizeName, &WGSizePtr, &WGSize); + } + return HSA_STATUS_SUCCESS; + } - if (err == HSA_STATUS_SUCCESS) { - if ((size_t)WGSize != sizeof(int16_t)) { - DP("Loading global computation properties '%s' - size mismatch (%u " - "!= " - "%lu)\n", - WGSizeName, WGSize, sizeof(int16_t)); - return NULL; + hsa_status_t after_loading() { + if (valid) { + if (!in_image()) { + DP("Setting global device environment after load (%u bytes)\n", + si.size); + int device_id = host_device_env.DeviceNum; + 
auto &SymbolInfo = DeviceInfo.SymbolInfoTable[device_id]; + void *state_ptr; + uint32_t state_ptr_size; + hsa_status_t err = interop_hsa_get_symbol_info( + SymbolInfo, device_id, sym(), &state_ptr, &state_ptr_size); + if (err != HSA_STATUS_SUCCESS) { + DP("failed to find %s in loaded image\n", sym()); + return err; } - memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize); - - DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal); - - if (WGSizeVal < RTLDeviceInfoTy::Default_WG_Size || - WGSizeVal > RTLDeviceInfoTy::Max_WG_Size) { - DP("Error wrong WGSize value specified in HSA code object file: " - "%d\n", - WGSizeVal); - WGSizeVal = RTLDeviceInfoTy::Default_WG_Size; + if (state_ptr_size != si.size) { + DP("Symbol had size %u before loading, %u after\n", state_ptr_size, + si.size); + return HSA_STATUS_ERROR; } - } else { - DP("Warning: Loading WGSize '%s' - symbol not found, " - "using default value %d\n", - WGSizeName, WGSizeVal); - } - - check("Loading WGSize computation property", err); - } - // Read execution mode from global in binary - std::string ExecModeNameStr(e->name); - ExecModeNameStr += "_exec_mode"; - const char *ExecModeName = ExecModeNameStr.c_str(); - - void *ExecModePtr; - uint32_t varsize; - err = interop_get_symbol_info((char *)image->ImageStart, img_size, - ExecModeName, &ExecModePtr, &varsize); - - if (err == HSA_STATUS_SUCCESS) { - if ((size_t)varsize != sizeof(llvm::omp::OMPTgtExecModeFlags)) { - DP("Loading global computation properties '%s' - size mismatch(%u != " - "%lu)\n", - ExecModeName, varsize, sizeof(llvm::omp::OMPTgtExecModeFlags)); - return NULL; - } - - memcpy(&ExecModeVal, ExecModePtr, (size_t)varsize); - - DP("After loading global for %s ExecMode = %d\n", ExecModeName, - ExecModeVal); - - if (ExecModeVal < 0 || - ExecModeVal > llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD) { - DP("Error wrong exec_mode value specified in HSA code object file: " - "%d\n", - ExecModeVal); - return NULL; + return DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &host_device_env, + state_ptr_size, device_id); } - } else { - DP("Loading global exec_mode '%s' - symbol missing, using default " - "value " - "GENERIC (1)\n", - ExecModeName); } - check("Loading computation property", err); - - KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, device_id, - CallStackAddr, e->name, kernarg_segment_size, - DeviceInfo.KernArgPool)); - __tgt_offload_entry entry = *e; - entry.addr = (void *)&KernelsList.back(); - DeviceInfo.addOffloadEntry(device_id, entry); - DP("Entry point %ld maps to %s\n", e - HostBegin, e->name); + return HSA_STATUS_SUCCESS; } +}; - return DeviceInfo.getOffloadEntriesTable(device_id); -} - -void *__tgt_rtl_data_alloc(int device_id, int64_t size, void *, int32_t kind) { - void *ptr = NULL; - assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); +static hsa_status_t impl_calloc(void **ret_ptr, size_t size, int DeviceId) { + uint64_t rounded = 4 * ((size + 3) / 4); + void *ptr; + hsa_amd_memory_pool_t MemoryPool = DeviceInfo.getDeviceMemoryPool(DeviceId); + hsa_status_t err = hsa_amd_memory_pool_allocate(MemoryPool, rounded, 0, &ptr); + if (err != HSA_STATUS_SUCCESS) { + return err; + } - if (kind != TARGET_ALLOC_DEFAULT) { - REPORT("Invalid target data allocation kind or requested allocator not " - "implemented yet\n"); - return NULL; + hsa_status_t rc = hsa_amd_memory_fill(ptr, 0, rounded / 4); + if (rc != HSA_STATUS_SUCCESS) { + DP("zero fill device_state failed with %u\n", rc); + core::Runtime::Memfree(ptr); + return 
HSA_STATUS_ERROR; } - hsa_amd_memory_pool_t MemoryPool = DeviceInfo.getDeviceMemoryPool(device_id); - hsa_status_t err = hsa_amd_memory_pool_allocate(MemoryPool, size, 0, &ptr); - DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", size, - (long long unsigned)(Elf64_Addr)ptr); - ptr = (err == HSA_STATUS_SUCCESS) ? ptr : NULL; - return ptr; + *ret_ptr = ptr; + return HSA_STATUS_SUCCESS; } -int32_t __tgt_rtl_data_submit(int device_id, void *tgt_ptr, void *hst_ptr, - int64_t size) { - assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); - __tgt_async_info AsyncInfo; - int32_t rc = dataSubmit(device_id, tgt_ptr, hst_ptr, size, &AsyncInfo); - if (rc != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - - return __tgt_rtl_synchronize(device_id, &AsyncInfo); +static bool image_contains_symbol(void *data, size_t size, const char *sym) { + symbol_info si; + int rc = get_symbol_info_without_loading((char *)data, size, sym, &si); + return (rc == 0) && (si.addr != nullptr); } -int32_t __tgt_rtl_data_submit_async(int device_id, void *tgt_ptr, void *hst_ptr, - int64_t size, __tgt_async_info *AsyncInfo) { - assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); - if (AsyncInfo) { - initAsyncInfo(AsyncInfo); - return dataSubmit(device_id, tgt_ptr, hst_ptr, size, AsyncInfo); - } else { - return __tgt_rtl_data_submit(device_id, tgt_ptr, hst_ptr, size); - } -} +__tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id, + __tgt_device_image *image) { + // This function loads the device image onto gpu[device_id] and does other + // per-image initialization work. Specifically: + // + // - Initialize an DeviceEnvironmentTy instance embedded in the + // image at the symbol "omptarget_device_environment" + // Fields DebugKind, DeviceNum, NumDevices. Used by the deviceRTL. + // + // - Allocate a large array per-gpu (could be moved to init_device) + // - Read a uint64_t at symbol omptarget_nvptx_device_State_size + // - Allocate at least that many bytes of gpu memory + // - Zero initialize it + // - Write the pointer to the symbol omptarget_nvptx_device_State + // + // - Pulls some per-kernel information together from various sources and + // records it in the KernelsList for quicker access later + // + // The initialization can be done before or after loading the image onto the + // gpu. This function presently does a mixture. Using the hsa api to get/set + // the information is simpler to implement, in exchange for more complicated + // runtime behaviour. E.g. launching a kernel or using dma to get eight bytes + // back from the gpu vs a hashtable lookup on the host. 
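// The device_environment helper above encodes the pre/post-load choice this
// comment block describes: when the "omptarget_device_environment" symbol
// has image bytes (.data, sh_type != SHT_NOBITS) it is patched in the host
// copy before loading; when it is .bss it is written to GPU memory after
// loading via freesignalpool_memcpy_h2d. Minimal sketch of the predicate;
// canPatchBeforeLoading is illustrative only.
#include <cstdint>
#include <elf.h>
static bool canPatchBeforeLoading(uint32_t ShType) {
  // SHT_NOBITS (.bss) sections contribute no bytes to the image to patch.
  return ShType != SHT_NOBITS;
}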
-int32_t __tgt_rtl_data_retrieve(int device_id, void *hst_ptr, void *tgt_ptr, - int64_t size) { - assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); - __tgt_async_info AsyncInfo; - int32_t rc = dataRetrieve(device_id, hst_ptr, tgt_ptr, size, &AsyncInfo); - if (rc != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; + const size_t img_size = (char *)image->ImageEnd - (char *)image->ImageStart; - return __tgt_rtl_synchronize(device_id, &AsyncInfo); -} + DeviceInfo.clearOffloadEntriesTable(device_id); -int32_t __tgt_rtl_data_retrieve_async(int device_id, void *hst_ptr, - void *tgt_ptr, int64_t size, - __tgt_async_info *AsyncInfo) { - assert(AsyncInfo && "AsyncInfo is nullptr"); - assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); - initAsyncInfo(AsyncInfo); - return dataRetrieve(device_id, hst_ptr, tgt_ptr, size, AsyncInfo); -} + // We do not need to set the ELF version because the caller of this function + // had to do that to decide the right runtime to use -int32_t __tgt_rtl_data_delete(int device_id, void *tgt_ptr) { - assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); - hsa_status_t err; - DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)tgt_ptr); - err = core::Runtime::Memfree(tgt_ptr); - if (err != HSA_STATUS_SUCCESS) { - DP("Error when freeing CUDA memory\n"); - return OFFLOAD_FAIL; + if (!elf_machine_id_is_amdgcn(image)) { + return NULL; } - return OFFLOAD_SUCCESS; -} -// Determine launch values for kernel. -struct launchVals { - int WorkgroupSize; - int GridSize; -}; -launchVals getLaunchVals(int WarpSize, EnvironmentVariables Env, - int ConstWGSize, - llvm::omp::OMPTgtExecModeFlags ExecutionMode, - int num_teams, int thread_limit, - uint64_t loop_tripcount, int DeviceNumTeams) { + { + auto env = device_environment(device_id, DeviceInfo.NumberOfDevices, image, + img_size); - int threadsPerGroup = RTLDeviceInfoTy::Default_WG_Size; - int num_groups = 0; + auto &KernelInfo = DeviceInfo.KernelInfoTable[device_id]; + auto &SymbolInfo = DeviceInfo.SymbolInfoTable[device_id]; + hsa_status_t err = module_register_from_memory_to_place( + KernelInfo, SymbolInfo, (void *)image->ImageStart, img_size, device_id, + [&](void *data, size_t size) { + if (image_contains_symbol(data, size, "needs_hostcall_buffer")) { + __atomic_store_n(&DeviceInfo.hostcall_required, true, + __ATOMIC_RELEASE); + } + return env.before_loading(data, size); + }, + DeviceInfo.HSAExecutables); - int Max_Teams = - Env.MaxTeamsDefault > 0 ? 
Env.MaxTeamsDefault : DeviceNumTeams;
-  if (Max_Teams > RTLDeviceInfoTy::HardTeamLimit)
-    Max_Teams = RTLDeviceInfoTy::HardTeamLimit;
+    check("Module registering", err);
+    if (err != HSA_STATUS_SUCCESS) {
+      const char *DeviceName = DeviceInfo.GPUName[device_id].c_str();
+      const char *ElfName = get_elf_mach_gfx_name(elf_e_flags(image));
-  if (print_kernel_trace & STARTUP_DETAILS) {
-    DP("RTLDeviceInfoTy::Max_Teams: %d\n", RTLDeviceInfoTy::Max_Teams);
-    DP("Max_Teams: %d\n", Max_Teams);
-    DP("RTLDeviceInfoTy::Warp_Size: %d\n", WarpSize);
-    DP("RTLDeviceInfoTy::Max_WG_Size: %d\n", RTLDeviceInfoTy::Max_WG_Size);
-    DP("RTLDeviceInfoTy::Default_WG_Size: %d\n",
-       RTLDeviceInfoTy::Default_WG_Size);
-    DP("thread_limit: %d\n", thread_limit);
-    DP("threadsPerGroup: %d\n", threadsPerGroup);
-    DP("ConstWGSize: %d\n", ConstWGSize);
-  }
-  // check for thread_limit() clause
-  if (thread_limit > 0) {
-    threadsPerGroup = thread_limit;
-    DP("Setting threads per block to requested %d\n", thread_limit);
-    // Add master warp for GENERIC
-    if (ExecutionMode ==
-        llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) {
-      threadsPerGroup += WarpSize;
-      DP("Adding master wavefront: +%d threads\n", WarpSize);
+      if (strcmp(DeviceName, ElfName) != 0) {
+        DP("Possible gpu arch mismatch: device:%s, image:%s please check"
+           " compiler flag: -march=<gpu>\n",
+           DeviceName, ElfName);
+      } else {
+        DP("Error loading image onto GPU: %s\n", get_error_string(err));
+      }
+
+      return NULL;
     }
-    if (threadsPerGroup > RTLDeviceInfoTy::Max_WG_Size) { // limit to max
-      threadsPerGroup = RTLDeviceInfoTy::Max_WG_Size;
-      DP("Setting threads per block to maximum %d\n", threadsPerGroup);
+
+    err = env.after_loading();
+    if (err != HSA_STATUS_SUCCESS) {
+      return NULL;
     }
   }
-  // check flat_max_work_group_size attr here
-  if (threadsPerGroup > ConstWGSize) {
-    threadsPerGroup = ConstWGSize;
-    DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n",
-       threadsPerGroup);
-  }
-  if (print_kernel_trace & STARTUP_DETAILS)
-    DP("threadsPerGroup: %d\n", threadsPerGroup);
-  DP("Preparing %d threads\n", threadsPerGroup);
-  // Set default num_groups (teams)
-  if (Env.TeamLimit > 0)
-    num_groups = (Max_Teams < Env.TeamLimit) ? Max_Teams : Env.TeamLimit;
-  else
-    num_groups = Max_Teams;
-  DP("Set default num of groups %d\n", num_groups);
+  DP("AMDGPU module successfully loaded!\n");
-  if (print_kernel_trace & STARTUP_DETAILS) {
-    DP("num_groups: %d\n", num_groups);
-    DP("num_teams: %d\n", num_teams);
-  }
+  {
+    // the device_State array is either a large value in bss or a void* that
+    // needs to be assigned to a pointer to an array of size device_state_bytes
+    // If absent, it has been deadstripped and needs no setup.
-  // Reduce num_groups if threadsPerGroup exceeds RTLDeviceInfoTy::Max_WG_Size
-  // This reduction is typical for default case (no thread_limit clause).
-  // or when user goes crazy with num_teams clause.
-  // FIXME: We cant distinguish between a constant or variable thread limit.
-  // So we only handle constant thread_limits.
-  if (threadsPerGroup >
-      RTLDeviceInfoTy::Default_WG_Size) //  256 < threadsPerGroup <= 1024
-    // Should we round threadsPerGroup up to nearest WarpSize
-    // here?
- num_groups = (Max_Teams * RTLDeviceInfoTy::Max_WG_Size) / threadsPerGroup; + void *state_ptr; + uint32_t state_ptr_size; + auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[device_id]; + hsa_status_t err = interop_hsa_get_symbol_info( + SymbolInfoMap, device_id, "omptarget_nvptx_device_State", &state_ptr, + &state_ptr_size); - // check for num_teams() clause - if (num_teams > 0) { - num_groups = (num_teams < num_groups) ? num_teams : num_groups; - } - if (print_kernel_trace & STARTUP_DETAILS) { - DP("num_groups: %d\n", num_groups); - DP("Env.NumTeams %d\n", Env.NumTeams); - DP("Env.TeamLimit %d\n", Env.TeamLimit); - } + if (err != HSA_STATUS_SUCCESS) { + DP("No device_state symbol found, skipping initialization\n"); + } else { + if (state_ptr_size < sizeof(void *)) { + DP("unexpected size of state_ptr %u != %zu\n", state_ptr_size, + sizeof(void *)); + return NULL; + } - if (Env.NumTeams > 0) { - num_groups = (Env.NumTeams < num_groups) ? Env.NumTeams : num_groups; - DP("Modifying teams based on Env.NumTeams %d\n", Env.NumTeams); - } else if (Env.TeamLimit > 0) { - num_groups = (Env.TeamLimit < num_groups) ? Env.TeamLimit : num_groups; - DP("Modifying teams based on Env.TeamLimit%d\n", Env.TeamLimit); - } else { - if (num_teams <= 0) { - if (loop_tripcount > 0) { - if (ExecutionMode == - llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD) { - // round up to the nearest integer - num_groups = ((loop_tripcount - 1) / threadsPerGroup) + 1; - } else if (ExecutionMode == - llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) { - num_groups = loop_tripcount; - } else /* OMP_TGT_EXEC_MODE_GENERIC_SPMD */ { - // This is a generic kernel that was transformed to use SPMD-mode - // execution but uses Generic-mode semantics for scheduling. - num_groups = loop_tripcount; + // if it's larger than a void*, assume it's a bss array and no further + // initialization is required. 
Only try to set up a pointer for
+      // sizeof(void*)
+      if (state_ptr_size == sizeof(void *)) {
+        uint64_t device_State_bytes =
+            get_device_State_bytes((char *)image->ImageStart, img_size);
+        if (device_State_bytes == 0) {
+          DP("Can't initialize device_State, missing size information\n");
+          return NULL;
+        }
+
+        auto &dss = DeviceInfo.deviceStateStore[device_id];
+        if (dss.first.get() == nullptr) {
+          assert(dss.second == 0);
+          void *ptr = NULL;
+          hsa_status_t err = impl_calloc(&ptr, device_State_bytes, device_id);
+          if (err != HSA_STATUS_SUCCESS) {
+            DP("Failed to allocate device_state array\n");
+            return NULL;
+          }
+          dss = {
+              std::unique_ptr{ptr},
+              device_State_bytes,
+          };
+        }
+
+        void *ptr = dss.first.get();
+        if (device_State_bytes != dss.second) {
+          DP("Inconsistent sizes of device_State unsupported\n");
+          return NULL;
+        }
+
+        // write ptr to device memory so it can be used by later kernels
+        err = DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &ptr,
+                                                   sizeof(void *), device_id);
+        if (err != HSA_STATUS_SUCCESS) {
+          DP("memcpy install of state_ptr failed\n");
+          return NULL;
+        }
-        DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
-           "threads per block %d\n",
-           num_groups, loop_tripcount, threadsPerGroup);
      }
-    } else {
-      num_groups = num_teams;
-    }
-    if (num_groups > Max_Teams) {
-      num_groups = Max_Teams;
-      if (print_kernel_trace & STARTUP_DETAILS)
-        DP("Limiting num_groups %d to Max_Teams %d \n", num_groups, Max_Teams);
-    }
-    if (num_groups > num_teams && num_teams > 0) {
-      num_groups = num_teams;
-      if (print_kernel_trace & STARTUP_DETAILS)
-        DP("Limiting num_groups %d to clause num_teams %d \n", num_groups,
-           num_teams);
-    }
  }
-  // num_teams clause always honored, no matter what, unless DEFAULT is active.
-  if (num_teams > 0) {
-    num_groups = num_teams;
-    // Cap num_groups to EnvMaxTeamsDefault if set.
-    if (Env.MaxTeamsDefault > 0 && num_groups > Env.MaxTeamsDefault)
-      num_groups = Env.MaxTeamsDefault;
-  }
-  if (print_kernel_trace & STARTUP_DETAILS) {
-    DP("threadsPerGroup: %d\n", threadsPerGroup);
-    DP("num_groups: %d\n", num_groups);
-    DP("loop_tripcount: %ld\n", loop_tripcount);
-  }
-  DP("Final %d num_groups and %d threadsPerGroup\n", num_groups,
-     threadsPerGroup);
+  // Here, we take advantage of the data that is appended after img_end to get
+  // the symbol names we need to load. This data consists of the host entries
+  // begin and end as well as the target name (see the offloading linker script
+  // creation in clang compiler).
-  launchVals res;
-  res.WorkgroupSize = threadsPerGroup;
-  res.GridSize = threadsPerGroup * num_groups;
-  return res;
-}
+  // Find the symbols in the module by name. The name can be obtained by
+  // concatenating the host entry name with the target name
-static uint64_t acquire_available_packet_id(hsa_queue_t *queue) {
-  uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
-  bool full = true;
-  while (full) {
-    full =
-        packet_id >= (queue->size + hsa_queue_load_read_index_scacquire(queue));
-  }
-  return packet_id;
-}
+  __tgt_offload_entry *HostBegin = image->EntriesBegin;
+  __tgt_offload_entry *HostEnd = image->EntriesEnd;
-static int32_t __tgt_rtl_run_target_team_region_locked(
-    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
-    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t num_teams,
-    int32_t thread_limit, uint64_t loop_tripcount);
+  for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) {
-int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
-                                         void **tgt_args,
-                                         ptrdiff_t *tgt_offsets,
-                                         int32_t arg_num, int32_t num_teams,
-                                         int32_t thread_limit,
-                                         uint64_t loop_tripcount) {
+    if (!e->addr) {
+      // The host should always have something in the address to
+      // uniquely identify the target region.
+      DP("Analyzing host entry '<null>' (size = %lld)...\n",
+         (unsigned long long)e->size);
+      return NULL;
+    }
-  DeviceInfo.load_run_lock.lock_shared();
-  int32_t res = __tgt_rtl_run_target_team_region_locked(
-      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, num_teams,
-      thread_limit, loop_tripcount);
+    if (e->size) {
+      __tgt_offload_entry entry = *e;
-  DeviceInfo.load_run_lock.unlock_shared();
-  return res;
-}
+      void *varptr;
+      uint32_t varsize;
-int32_t __tgt_rtl_run_target_team_region_locked(
-    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
-    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t num_teams,
-    int32_t thread_limit, uint64_t loop_tripcount) {
-  // Set the context we are using
-  // update thread limit content in gpu memory if un-initialized or specified
-  // from host
+      auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[device_id];
+      hsa_status_t err = interop_hsa_get_symbol_info(
+          SymbolInfoMap, device_id, e->name, &varptr, &varsize);
-  DP("Run target team region thread_limit %d\n", thread_limit);
+      if (err != HSA_STATUS_SUCCESS) {
+        // Inform the user what symbol prevented offloading
+        DP("Loading global '%s' (Failed)\n", e->name);
+        return NULL;
+      }
-  // All args are references.
-  std::vector<void *> args(arg_num);
-  std::vector<void *> ptrs(arg_num);
+      if (varsize != e->size) {
+        DP("Loading global '%s' - size mismatch (%u != %lu)\n", e->name,
+           varsize, e->size);
+        return NULL;
+      }
-  DP("Arg_num: %d\n", arg_num);
-  for (int32_t i = 0; i < arg_num; ++i) {
-    ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
-    args[i] = &ptrs[i];
-    DP("Offseted base: arg[%d]:" DPxMOD "\n", i, DPxPTR(ptrs[i]));
-  }
+      DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
+         DPxPTR(e - HostBegin), e->name, DPxPTR(varptr));
+      entry.addr = (void *)varptr;
-  KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;
+      DeviceInfo.addOffloadEntry(device_id, entry);
-  std::string kernel_name = std::string(KernelInfo->Name);
-  auto &KernelInfoTable = DeviceInfo.KernelInfoTable;
-  if (KernelInfoTable[device_id].find(kernel_name) ==
-      KernelInfoTable[device_id].end()) {
-    DP("Kernel %s not found\n", kernel_name.c_str());
-    return OFFLOAD_FAIL;
-  }
+      if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
+          e->flags & OMP_DECLARE_TARGET_LINK) {
+        // If unified memory is present any target link variables
+        // can access host addresses directly. There is no longer a
+        // need for device copies.
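+        // In effect (illustrative only): the pointer value stored at the
+        // host entry, *(void **)e->addr, is written over the device copy at
+        // varptr via the sizeof(void *) host-to-device copy below, so the
+        // device-side global ends up holding the host address.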
+ err = DeviceInfo.freesignalpool_memcpy_h2d(varptr, e->addr, + sizeof(void *), device_id); + if (err != HSA_STATUS_SUCCESS) + DP("Error when copying USM\n"); + DP("Copy linked variable host address (" DPxMOD ")" + "to device address (" DPxMOD ")\n", + DPxPTR(*((void **)e->addr)), DPxPTR(varptr)); + } - const atl_kernel_info_t KernelInfoEntry = - KernelInfoTable[device_id][kernel_name]; - const uint32_t group_segment_size = KernelInfoEntry.group_segment_size; - const uint32_t sgpr_count = KernelInfoEntry.sgpr_count; - const uint32_t vgpr_count = KernelInfoEntry.vgpr_count; - const uint32_t sgpr_spill_count = KernelInfoEntry.sgpr_spill_count; - const uint32_t vgpr_spill_count = KernelInfoEntry.vgpr_spill_count; + continue; + } - assert(arg_num == (int)KernelInfoEntry.explicit_argument_count); + DP("to find the kernel name: %s size: %lu\n", e->name, strlen(e->name)); - /* - * Set limit based on ThreadsPerGroup and GroupsPerDevice - */ - launchVals LV = - getLaunchVals(DeviceInfo.WarpSize[device_id], DeviceInfo.Env, - KernelInfo->ConstWGSize, KernelInfo->ExecutionMode, - num_teams, // From run_region arg - thread_limit, // From run_region arg - loop_tripcount, // From run_region arg - DeviceInfo.NumTeams[KernelInfo->device_id]); - const int GridSize = LV.GridSize; - const int WorkgroupSize = LV.WorkgroupSize; + // errors in kernarg_segment_size previously treated as = 0 (or as undef) + uint32_t kernarg_segment_size = 0; + auto &KernelInfoMap = DeviceInfo.KernelInfoTable[device_id]; + hsa_status_t err = HSA_STATUS_SUCCESS; + if (!e->name) { + err = HSA_STATUS_ERROR; + } else { + std::string kernelStr = std::string(e->name); + auto It = KernelInfoMap.find(kernelStr); + if (It != KernelInfoMap.end()) { + atl_kernel_info_t info = It->second; + kernarg_segment_size = info.kernel_segment_size; + } else { + err = HSA_STATUS_ERROR; + } + } - if (print_kernel_trace >= LAUNCH) { - int num_groups = GridSize / WorkgroupSize; - // enum modes are SPMD, GENERIC, NONE 0,1,2 - // if doing rtl timing, print to stderr, unless stdout requested. - bool traceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING); - fprintf(traceToStdout ? stdout : stderr, - "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) " - "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u " - "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu n:%s\n", - device_id, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize, - arg_num, num_groups, WorkgroupSize, num_teams, thread_limit, - group_segment_size, sgpr_count, vgpr_count, sgpr_spill_count, - vgpr_spill_count, loop_tripcount, KernelInfo->Name); - } + // default value GENERIC (in case symbol is missing from cubin file) + llvm::omp::OMPTgtExecModeFlags ExecModeVal = + llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC; - // Run on the device. - { - hsa_queue_t *queue = DeviceInfo.HSAQueues[device_id].get(); - if (!queue) { - return OFFLOAD_FAIL; - } - uint64_t packet_id = acquire_available_packet_id(queue); + // get flat group size if present, else Default_WG_Size + int16_t WGSizeVal = RTLDeviceInfoTy::Default_WG_Size; + + // get Kernel Descriptor if present. 
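+      // Naming convention for the per-kernel globals probed below: the entry
+      // name plus a fixed suffix. For a (hypothetical) kernel entry named
+      // "__omp_offloading_fd00_1234_foo_l10", the image would be searched for
+      // "__omp_offloading_fd00_1234_foo_l10_kern_desc", "..._wg_size" and
+      // "..._exec_mode".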
+ // Keep struct in sync wih getTgtAttributeStructQTy in CGOpenMPRuntime.cpp + struct KernDescValType { + uint16_t Version; + uint16_t TSize; + uint16_t WG_Size; + }; + struct KernDescValType KernDescVal; + std::string KernDescNameStr(e->name); + KernDescNameStr += "_kern_desc"; + const char *KernDescName = KernDescNameStr.c_str(); + + void *KernDescPtr; + uint32_t KernDescSize; + void *CallStackAddr = nullptr; + err = interop_get_symbol_info((char *)image->ImageStart, img_size, + KernDescName, &KernDescPtr, &KernDescSize); + + if (err == HSA_STATUS_SUCCESS) { + if ((size_t)KernDescSize != sizeof(KernDescVal)) + DP("Loading global computation properties '%s' - size mismatch (%u != " + "%lu)\n", + KernDescName, KernDescSize, sizeof(KernDescVal)); + + memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize); + + // Check structure size against recorded size. + if ((size_t)KernDescSize != KernDescVal.TSize) + DP("KernDescVal size %lu does not match advertized size %d for '%s'\n", + sizeof(KernDescVal), KernDescVal.TSize, KernDescName); + + DP("After loading global for %s KernDesc \n", KernDescName); + DP("KernDesc: Version: %d\n", KernDescVal.Version); + DP("KernDesc: TSize: %d\n", KernDescVal.TSize); + DP("KernDesc: WG_Size: %d\n", KernDescVal.WG_Size); + + if (KernDescVal.WG_Size == 0) { + KernDescVal.WG_Size = RTLDeviceInfoTy::Default_WG_Size; + DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WG_Size); + } + WGSizeVal = KernDescVal.WG_Size; + DP("WGSizeVal %d\n", WGSizeVal); + check("Loading KernDesc computation property", err); + } else { + DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName); + + // Flat group size + std::string WGSizeNameStr(e->name); + WGSizeNameStr += "_wg_size"; + const char *WGSizeName = WGSizeNameStr.c_str(); + + void *WGSizePtr; + uint32_t WGSize; + err = interop_get_symbol_info((char *)image->ImageStart, img_size, + WGSizeName, &WGSizePtr, &WGSize); + + if (err == HSA_STATUS_SUCCESS) { + if ((size_t)WGSize != sizeof(int16_t)) { + DP("Loading global computation properties '%s' - size mismatch (%u " + "!= " + "%lu)\n", + WGSizeName, WGSize, sizeof(int16_t)); + return NULL; + } - const uint32_t mask = queue->size - 1; // size is a power of 2 - hsa_kernel_dispatch_packet_t *packet = - (hsa_kernel_dispatch_packet_t *)queue->base_address + - (packet_id & mask); + memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize); - // packet->header is written last - packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; - packet->workgroup_size_x = WorkgroupSize; - packet->workgroup_size_y = 1; - packet->workgroup_size_z = 1; - packet->reserved0 = 0; - packet->grid_size_x = GridSize; - packet->grid_size_y = 1; - packet->grid_size_z = 1; - packet->private_segment_size = KernelInfoEntry.private_segment_size; - packet->group_segment_size = KernelInfoEntry.group_segment_size; - packet->kernel_object = KernelInfoEntry.kernel_object; - packet->kernarg_address = 0; // use the block allocator - packet->reserved2 = 0; // impl writes id_ here - packet->completion_signal = {0}; // may want a pool of signals + DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal); - KernelArgPool *ArgPool = nullptr; - void *kernarg = nullptr; - { - auto it = KernelArgPoolMap.find(std::string(KernelInfo->Name)); - if (it != KernelArgPoolMap.end()) { - ArgPool = (it->second).get(); + if (WGSizeVal < RTLDeviceInfoTy::Default_WG_Size || + WGSizeVal > RTLDeviceInfoTy::Max_WG_Size) { + DP("Error wrong WGSize value specified in HSA code object file: " + 
"%d\n", + WGSizeVal); + WGSizeVal = RTLDeviceInfoTy::Default_WG_Size; + } + } else { + DP("Warning: Loading WGSize '%s' - symbol not found, " + "using default value %d\n", + WGSizeName, WGSizeVal); } + + check("Loading WGSize computation property", err); } - if (!ArgPool) { - DP("Warning: No ArgPool for %s on device %d\n", KernelInfo->Name, - device_id); - } - { - if (ArgPool) { - assert(ArgPool->kernarg_segment_size == (arg_num * sizeof(void *))); - kernarg = ArgPool->allocate(arg_num); - } - if (!kernarg) { - DP("Allocate kernarg failed\n"); - return OFFLOAD_FAIL; - } - // Copy explicit arguments - for (int i = 0; i < arg_num; i++) { - memcpy((char *)kernarg + sizeof(void *) * i, args[i], sizeof(void *)); - } + // Read execution mode from global in binary + std::string ExecModeNameStr(e->name); + ExecModeNameStr += "_exec_mode"; + const char *ExecModeName = ExecModeNameStr.c_str(); - // Initialize implicit arguments. TODO: Which of these can be dropped - impl_implicit_args_t *impl_args = - reinterpret_cast( - static_cast(kernarg) + ArgPool->kernarg_segment_size); - memset(impl_args, 0, - sizeof(impl_implicit_args_t)); // may not be necessary - impl_args->offset_x = 0; - impl_args->offset_y = 0; - impl_args->offset_z = 0; + void *ExecModePtr; + uint32_t varsize; + err = interop_get_symbol_info((char *)image->ImageStart, img_size, + ExecModeName, &ExecModePtr, &varsize); - // assign a hostcall buffer for the selected Q - if (__atomic_load_n(&DeviceInfo.hostcall_required, __ATOMIC_ACQUIRE)) { - // hostrpc_assign_buffer is not thread safe, and this function is - // under a multiple reader lock, not a writer lock. - static pthread_mutex_t hostcall_init_lock = PTHREAD_MUTEX_INITIALIZER; - pthread_mutex_lock(&hostcall_init_lock); - unsigned long buffer = hostrpc_assign_buffer( - DeviceInfo.HSAAgents[device_id], queue, device_id); - pthread_mutex_unlock(&hostcall_init_lock); - if (!buffer) { - DP("hostrpc_assign_buffer failed, gpu would dereference null and " - "error\n"); - return OFFLOAD_FAIL; - } + if (err == HSA_STATUS_SUCCESS) { + if ((size_t)varsize != sizeof(llvm::omp::OMPTgtExecModeFlags)) { + DP("Loading global computation properties '%s' - size mismatch(%u != " + "%lu)\n", + ExecModeName, varsize, sizeof(llvm::omp::OMPTgtExecModeFlags)); + return NULL; + } - if (KernelInfoEntry.implicit_argument_count >= 4) { - // Initialise pointer for implicit_argument_count != 0 ABI - // Guess that the right implicit argument is at offset 24 after - // the explicit arguments. In the future, should be able to read - // the offset from msgpack. Clang is not annotating it at present. 
- uint64_t Offset = - sizeof(void *) * (KernelInfoEntry.explicit_argument_count + 3); - if ((Offset + 8) > (ArgPool->kernarg_segment_size)) { - DP("Bad offset of hostcall, exceeds kernarg segment size\n"); - } else { - memcpy(static_cast(kernarg) + Offset, &buffer, 8); - } - } + memcpy(&ExecModeVal, ExecModePtr, (size_t)varsize); - // initialise pointer for implicit_argument_count == 0 ABI - impl_args->hostcall_ptr = buffer; - } + DP("After loading global for %s ExecMode = %d\n", ExecModeName, + ExecModeVal); - packet->kernarg_address = kernarg; + if (ExecModeVal < 0 || + ExecModeVal > llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD) { + DP("Error wrong exec_mode value specified in HSA code object file: " + "%d\n", + ExecModeVal); + return NULL; + } + } else { + DP("Loading global exec_mode '%s' - symbol missing, using default " + "value " + "GENERIC (1)\n", + ExecModeName); } + check("Loading computation property", err); - hsa_signal_t s = DeviceInfo.FreeSignalPool.pop(); - if (s.handle == 0) { - DP("Failed to get signal instance\n"); - return OFFLOAD_FAIL; - } - packet->completion_signal = s; - hsa_signal_store_relaxed(packet->completion_signal, 1); + KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, device_id, + CallStackAddr, e->name, kernarg_segment_size, + DeviceInfo.KernArgPool)); + __tgt_offload_entry entry = *e; + entry.addr = (void *)&KernelsList.back(); + DeviceInfo.addOffloadEntry(device_id, entry); + DP("Entry point %ld maps to %s\n", e - HostBegin, e->name); + } - // Publish the packet indicating it is ready to be processed - core::packet_store_release(reinterpret_cast(packet), - core::create_header(), packet->setup); + return DeviceInfo.getOffloadEntriesTable(device_id); +} - // Since the packet is already published, its contents must not be - // accessed any more - hsa_signal_store_relaxed(queue->doorbell_signal, packet_id); +void *__tgt_rtl_data_alloc(int device_id, int64_t size, void *, int32_t kind) { + void *ptr = NULL; + assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); - while (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, - HSA_WAIT_STATE_BLOCKED) != 0) - ; + if (kind != TARGET_ALLOC_DEFAULT) { + REPORT("Invalid target data allocation kind or requested allocator not " + "implemented yet\n"); + return NULL; + } - assert(ArgPool); - ArgPool->deallocate(kernarg); - DeviceInfo.FreeSignalPool.push(s); + hsa_amd_memory_pool_t MemoryPool = DeviceInfo.getDeviceMemoryPool(device_id); + hsa_status_t err = hsa_amd_memory_pool_allocate(MemoryPool, size, 0, &ptr); + DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", size, + (long long unsigned)(Elf64_Addr)ptr); + ptr = (err == HSA_STATUS_SUCCESS) ? 
ptr : NULL; + return ptr; +} + +int32_t __tgt_rtl_data_submit(int device_id, void *tgt_ptr, void *hst_ptr, + int64_t size) { + assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); + __tgt_async_info AsyncInfo; + int32_t rc = dataSubmit(device_id, tgt_ptr, hst_ptr, size, &AsyncInfo); + if (rc != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + + return __tgt_rtl_synchronize(device_id, &AsyncInfo); +} + +int32_t __tgt_rtl_data_submit_async(int device_id, void *tgt_ptr, void *hst_ptr, + int64_t size, __tgt_async_info *AsyncInfo) { + assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); + if (AsyncInfo) { + initAsyncInfo(AsyncInfo); + return dataSubmit(device_id, tgt_ptr, hst_ptr, size, AsyncInfo); + } else { + return __tgt_rtl_data_submit(device_id, tgt_ptr, hst_ptr, size); } +} - DP("Kernel completed\n"); +int32_t __tgt_rtl_data_retrieve(int device_id, void *hst_ptr, void *tgt_ptr, + int64_t size) { + assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); + __tgt_async_info AsyncInfo; + int32_t rc = dataRetrieve(device_id, hst_ptr, tgt_ptr, size, &AsyncInfo); + if (rc != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + + return __tgt_rtl_synchronize(device_id, &AsyncInfo); +} + +int32_t __tgt_rtl_data_retrieve_async(int device_id, void *hst_ptr, + void *tgt_ptr, int64_t size, + __tgt_async_info *AsyncInfo) { + assert(AsyncInfo && "AsyncInfo is nullptr"); + assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); + initAsyncInfo(AsyncInfo); + return dataRetrieve(device_id, hst_ptr, tgt_ptr, size, AsyncInfo); +} + +int32_t __tgt_rtl_data_delete(int device_id, void *tgt_ptr) { + assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); + hsa_status_t err; + DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)tgt_ptr); + err = core::Runtime::Memfree(tgt_ptr); + if (err != HSA_STATUS_SUCCESS) { + DP("Error when freeing CUDA memory\n"); + return OFFLOAD_FAIL; + } return OFFLOAD_SUCCESS; } +int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, + void **tgt_args, + ptrdiff_t *tgt_offsets, + int32_t arg_num, int32_t num_teams, + int32_t thread_limit, + uint64_t loop_tripcount) { + + DeviceInfo.load_run_lock.lock_shared(); + int32_t res = + runRegionNowaitLocked(device_id, tgt_entry_ptr, tgt_args, tgt_offsets, + arg_num, num_teams, thread_limit, loop_tripcount); + + DeviceInfo.load_run_lock.unlock_shared(); + return res; +} + int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) { @@ -2244,6 +2240,20 @@ thread_limit, 0); } +int32_t __tgt_rtl_run_target_team_region_async( + int32_t device_id, void *tgt_entry_ptr, void **tgt_args, + ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t num_teams, + int32_t thread_limit, uint64_t loop_tripcount) { + + DeviceInfo.load_run_lock.lock_shared(); + int32_t res = + runRegionNowaitLocked(device_id, tgt_entry_ptr, tgt_args, tgt_offsets, + arg_num, num_teams, thread_limit, loop_tripcount); + + DeviceInfo.load_run_lock.unlock_shared(); + return res; +} + int32_t __tgt_rtl_run_target_region_async(int32_t device_id, void *tgt_entry_ptr, void **tgt_args, ptrdiff_t *tgt_offsets, @@ -2256,9 +2266,9 @@ // fix thread num int32_t team_num = 1; int32_t thread_limit = 0; // use default - return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args, - tgt_offsets, arg_num, team_num, - thread_limit, 0); + return __tgt_rtl_run_target_team_region_async(device_id, 
tgt_entry_ptr, + tgt_args, tgt_offsets, arg_num, + team_num, thread_limit, 0); } int32_t __tgt_rtl_synchronize(int32_t device_id, __tgt_async_info *AsyncInfo) {