diff --git a/openmp/libomptarget/plugins/hsa/CMakeLists.txt b/openmp/libomptarget/plugins/hsa/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins/hsa/CMakeLists.txt @@ -0,0 +1,146 @@ +##===----------------------------------------------------------------------===## +# +# The LLVM Compiler Infrastructure +# +# This file is dual licensed under the MIT and the University of Illinois Open +# Source Licenses. See LICENSE.txt for details. +# +##===----------------------------------------------------------------------===## +# +# Build a plugin for an HSA machine if available. +# +##===----------------------------------------------------------------------===## + +################################################################################ +# Add check for required compiler + +find_package(LLVM QUIET CONFIG + PATHS + $ENV{AOMP} + $ENV{HOME}/rocm/aomp + /opt/rocm/aomp + /usr/lib/rocm/aomp + ${LIBOMPTARGET_NVPTX_CUDA_COMPILER_DIR} + ${LIBOMPTARGET_NVPTX_CUDA_LINKER_DIR} + ${CMAKE_CXX_COMPILER_DIR} + NO_DEFAULT_PATH + ) + +if (LLVM_DIR) + libomptarget_say("Found LLVM ${LLVM_PACKAGE_VERSION}. Configure: ${LLVM_DIR}/LLVMConfig.cmake") +else() + libomptarget_say("Not building HSA plugin: AOMP llvm not found") + return() +endif() + +set(AOMP_DIR_FOUND ${LLVM_DIR}) +set(AOMP_INSTALL_PREFIX ${LLVM_INSTALL_PREFIX}) +set(AOMP_MAIN_INCDIR ${LLVM_BUILD_MAIN_INCLUDE_DIR}) +set(AOMP_BINDIR ${AOMP_INSTALL_PREFIX}/bin) +set(AOMP_INCDIR ${AOMP_INSTALL_PREFIX}/include) +set(AOMP_LIBDIR ${AOMP_INSTALL_PREFIX}/lib) + +find_package(Clang QUIET CONFIG + PATHS + $ENV{AOMP} + $ENV{HOME}/rocm/aomp + /opt/rocm/aomp + /usr/lib/rocm/aomp + ${LIBOMPTARGET_NVPTX_CUDA_COMPILER_DIR} + ${LIBOMPTARGET_NVPTX_CUDA_LINKER_DIR} + ${CMAKE_CXX_COMPILER_DIR} + NO_DEFAULT_PATH + ) + +if (CLANG_CMAKE_DIR) + libomptarget_say("Found Clang ${LLVM_PACKAGE_VERSION}. Configure: ${CLANG_CMAKE_DIR}/ClangConfig.cmake") +else() + libomptarget_say("Not building HSA plugin: AOMP clang not found") + return() +endif() + +# Add check for required libraries + +if(NOT LIBOMPTARGET_DEP_LIBELF_FOUND) + libomptarget_say("Not building HSA plugin: LIBELF not found") + return() +endif() + +################################################################################ +# Looking for ROCM... 
+################################################################################
+pkg_check_modules(LIBOMPTARGET_SEARCH_LIBHSA QUIET libhsa-runtime64)
+
+set(LIBOMPTARGET_DEP_LIBHSA_INCLUDE_DIRS $ENV{HSA_RUNTIME_PATH}/include/hsa)
+
+# This is usually found on /opt/rocm/lib or /usr/lib/rocm/lib
+find_path (
+  LIBOMPTARGET_DEP_LIBHSA_LIBRARIES_DIRS
+  NAMES
+  libhsa-runtime64.so
+  PATHS
+  $ENV{HSA_RUNTIME_PATH}/lib
+  $ENV{HOME}/rocm/aomp/lib
+  /opt/rocm/aomp/lib
+  /opt/rocm/lib
+  /usr/lib/rocm/lib
+  )
+
+find_package_handle_standard_args(
+  LIBOMPTARGET_DEP_LIBHSA
+  DEFAULT_MSG
+  LIBOMPTARGET_DEP_LIBHSA_LIBRARIES_DIRS
+  LIBOMPTARGET_DEP_LIBHSA_INCLUDE_DIRS)
+
+mark_as_advanced(
+  LIBOMPTARGET_DEP_LIBHSA_INCLUDE_DIRS
+  LIBOMPTARGET_DEP_LIBHSA_LIBRARIES_DIRS)
+
+if(NOT LIBOMPTARGET_DEP_LIBHSA_FOUND)
+  libomptarget_say("Not building HSA plugin: LIBHSA not found")
+  return()
+endif()
+
+if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux")
+  libomptarget_say("Not building HSA plugin: only supported on Linux x86_64, ppc64le, or aarch64 hosts.")
+  return()
+endif()
+libomptarget_say("Building HSA offloading plugin")
+
+################################################################################
+# Define the suffix for the runtime messaging dumps.
+add_definitions(-DTARGET_NAME=HSA)
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "(ppc64le)|(aarch64)$")
+  add_definitions(-DLITTLEENDIAN_CPU=1)
+endif()
+
+if(CMAKE_BUILD_TYPE MATCHES Debug)
+  add_definitions(-DHSA_ERROR_REPORT)
+endif()
+
+include_directories(
+  ${LIBOMPTARGET_DEP_LIBHSA_INCLUDE_DIRS}
+  ${CLANG_INCLUDE_DIRS}
+)
+
+add_library(omptarget.rtl.hsa SHARED
+      src/rtl.cpp )
+
+# Install plugin under the lib destination folder.
+# When we build for debug, OPENMP_LIBDIR_SUFFIX gets set to -debug
+install(TARGETS omptarget.rtl.hsa LIBRARY DESTINATION "lib${OPENMP_LIBDIR_SUFFIX}")
+
+# We need the AOMP specific build of ATMI that has HSA_INTEROP turned on.
+# Also, the AOMP specific build of ATMI has separate release and debug builds.
+target_link_libraries(
+  omptarget.rtl.hsa
+  -L${AOMP_LIBDIR}${OPENMP_LIBDIR_SUFFIX} -latmi_runtime -Wl,-rpath,${AOMP_LIBDIR}${OPENMP_LIBDIR_SUFFIX}
+  -L${LIBOMPTARGET_DEP_LIBHSA_LIBRARIES_DIRS} -lhsa-runtime64 -lhsakmt -Wl,-rpath,${LIBOMPTARGET_DEP_LIBHSA_LIBRARIES_DIRS}
+  -L${AOMP_LIBDIR} -lLLVMAMDGPUDesc -lLLVMAMDGPUUtils -lLLVMMC -lLLVMCore -lLLVMSupport -Wl,-rpath,${AOMP_LIBDIR}
+  -lelf
+  "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
+  )
+
+# Report to the parent scope that we are building a plugin for hsa
+set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} amdgcn-amd-amdhsa" PARENT_SCOPE)
+
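For reference, a minimal offloading program this plugin is meant to serve could look like the sketch below. The file name is a placeholder; the offload target matches the `amdgcn-amd-amdhsa` value the CMake file reports to the parent scope, and the `-march=` value would correspond to the compiler flag mentioned in the module-registration error path of rtl.cpp.

// vecadd.cpp -- hypothetical smoke test, built with something like:
//   clang++ -O2 -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa vecadd.cpp
#include <cstdio>

int main() {
  const int N = 1024;
  double a[N], b[N], c[N];
  for (int i = 0; i < N; ++i) {
    a[i] = i;
    b[i] = 2 * i;
  }
  // The target region below is what ends up dispatched through
  // __tgt_rtl_run_target_team_region in this plugin.
#pragma omp target teams distribute parallel for map(to : a, b) map(from : c)
  for (int i = 0; i < N; ++i)
    c[i] = a[i] + b[i];
  printf("c[42] = %f\n", c[42]); // expect 126.0
  return 0;
}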
diff --git a/openmp/libomptarget/plugins/hsa/src/rtl.cpp b/openmp/libomptarget/plugins/hsa/src/rtl.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins/hsa/src/rtl.cpp @@ -0,0 +1,1271 @@ +//===----RTLs/hsa/src/rtl.cpp - Target RTLs Implementation -------- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// RTL for HSA machine
+//
+//===----------------------------------------------------------------------===//
+
+#include <assert.h>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <elf.h>
+#include <inttypes.h>
+#include <libelf.h>
+#include <list>
+#include <string>
+#include <vector>
+
+// Header from ATMI interface
+#include "atmi_interop_hsa.h"
+#include "atmi_runtime.h"
+
+#include "omptargetplugin.h"
+
+enum GVIDX {
+  /// The maximum number of threads in a worker warp.
+  GV_Warp_Size,
+  /// The maximum number of teams.
+  GV_Max_Teams,
+  // The absolute maximum team size for a working group
+  GV_Max_WG_Size,
+  // The default maximum team size for a working group
+  GV_Default_WG_Size,
+};
+static constexpr int AMDGPUGpuGridValues[] = {
+    64,   // GV_Warp_Size
+    128,  // GV_Max_Teams
+    1024, // GV_Max_WG_Size,
+    256,  // GV_Default_WG_Size
+};
+
+#ifndef TARGET_NAME
+#define TARGET_NAME AMDHSA
+#endif
+
+int print_kernel_trace;
+
+#ifdef OMPTARGET_DEBUG
+static int DebugLevel = 0;
+
+#define GETNAME2(name) #name
+#define GETNAME(name) GETNAME2(name)
+#define DP(...)                                                                \
+  do {                                                                         \
+    if (DebugLevel > 0) {                                                      \
+      DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__);              \
+    }                                                                          \
+  } while (false)
+#else // OMPTARGET_DEBUG
+#define DP(...)                                                                \
+  {}
+#endif // OMPTARGET_DEBUG
+
+#ifdef OMPTARGET_DEBUG
+#define check(msg, status)                                                     \
+  if (status != ATMI_STATUS_SUCCESS) {                                         \
+    /* fprintf(stderr, "[%s:%d] %s failed.\n", __FILE__, __LINE__, #msg); */   \
+    DP(#msg " failed\n");                                                      \
+    /* assert(0); */                                                           \
+  } else {                                                                     \
+    /* fprintf(stderr, "[%s:%d] %s succeeded.\n", __FILE__, __LINE__, #msg);*/ \
+    DP(#msg " succeeded\n");                                                   \
+  }
+#else
+#define check(msg, status)                                                     \
+  {}
+#endif
+
+/// Keep entries table per device
+struct FuncOrGblEntryTy {
+  __tgt_target_table Table;
+  std::vector<__tgt_offload_entry> Entries;
+};
+
+enum ExecutionModeType {
+  SPMD,    // constructors, destructors,
+           // combined constructs (`teams distribute parallel for [simd]`)
+  GENERIC, // everything else
+  NONE
+};
+
+typedef atmi_kernel_t ATMIfunction;
+/// Use a single entity to encode a kernel and a set of flags
+struct KernelTy {
+  ATMIfunction Func;
+
+  // execution mode of kernel
+  // 0 - SPMD mode (without master warp)
+  // 1 - Generic mode (with master warp)
+  int8_t ExecutionMode;
+  int16_t ConstWGSize;
+  const char *Name;
+
+  KernelTy(ATMIfunction _Func, int8_t _ExecutionMode, int16_t _ConstWGSize,
+           const char *_Name)
+      : Func(_Func), ExecutionMode(_ExecutionMode), ConstWGSize(_ConstWGSize),
+        Name(_Name) {
+    DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);
+  }
+};
+
+/// List that contains all the kernels.
+/// FIXME: we may need this to be per device and per library.
+std::list<KernelTy> KernelsList;
+
+/// Class containing all the device information
+class RTLDeviceInfoTy {
+  std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
+  int NumberOfiGPUs;
+  int NumberOfdGPUs;
+  int NumberOfCPUs;
+
+public:
+  int NumberOfDevices;
+
+  // GPU devices
+  atmi_machine_t *Machine;
+  std::vector<atmi_place_t> GPUPlaces;
+  std::vector<atmi_mem_place_t> GPUMEMPlaces;
+  std::vector<atmi_mem_place_t> CPUMEMPlaces;
+  std::vector<hsa_agent_t> HSAAgents;
+
+  // Device properties
+  std::vector<int> ComputeUnits;
+  std::vector<int> GroupsPerDevice;
+  std::vector<int> ThreadsPerGroup;
+  std::vector<int> WarpSize;
+
+  // OpenMP properties
+  std::vector<int> NumTeams;
+  std::vector<int> NumThreads;
+
+  // OpenMP Environment properties
+  int EnvNumTeams;
+  int EnvTeamLimit;
+  int EnvMaxTeamsDefault;
+
+  // OpenMP Requires Flags
+  int64_t RequiresFlags;
+
+  // static int EnvNumThreads;
+  static const int HardTeamLimit = 1 << 20; // 1 Meg
+  static const int DefaultNumTeams = 128;
+  static const int Max_Teams = AMDGPUGpuGridValues[GVIDX::GV_Max_Teams];
+  static const int Warp_Size = AMDGPUGpuGridValues[GVIDX::GV_Warp_Size];
+  static const int Max_WG_Size = AMDGPUGpuGridValues[GVIDX::GV_Max_WG_Size];
+  static const int Default_WG_Size =
+      AMDGPUGpuGridValues[GVIDX::GV_Default_WG_Size];
+
+  // Record entry point associated with device
+  void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+    E.Entries.push_back(entry);
+  }
+
+  // Return true if the entry is associated with device
+  bool findOffloadEntry(int32_t device_id, void *addr) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+    for (auto &it : E.Entries) {
+      if (it.addr == addr)
+        return true;
+    }
+
+    return false;
+  }
+
+  // Return the pointer to the target entries table
+  __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+    int32_t size = E.Entries.size();
+
+    // Table is empty
+    if (!size)
+      return 0;
+
+    __tgt_offload_entry *begin = &E.Entries[0];
+    __tgt_offload_entry *end = &E.Entries[size - 1];
+
+    // Update table info according to the entries and return the pointer
+    E.Table.EntriesBegin = begin;
+    E.Table.EntriesEnd = ++end;
+
+    return &E.Table;
+  }
+
+  // Clear entries table for a device
+  void clearOffloadEntriesTable(int device_id) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncGblEntries[device_id].emplace_back();
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+    for (std::vector<__tgt_offload_entry>::iterator it = E.Entries.begin();
+         it != E.Entries.end(); it++) {
+      KernelTy *kernel_info = (KernelTy *)it->addr;
+      if (kernel_info->Func.handle != 0ull)
+        atmi_kernel_release(kernel_info->Func);
+    }
+    E.Entries.clear();
+    E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
+  }
+
+  RTLDeviceInfoTy() {
+#ifdef OMPTARGET_DEBUG
+    if (char *envStr = getenv("LIBOMPTARGET_DEBUG"))
+      DebugLevel = std::stoi(envStr);
+#endif // OMPTARGET_DEBUG
+
+    // LIBOMPTARGET_KERNEL_TRACE prints a kernel launch trace to stderr
+    // whenever it is set; a debug library build is not required.
+ // 0 => no tracing + // 1 => tracing dispatch only + // >1 => verbosity increase + if (char *envStr = getenv("LIBOMPTARGET_KERNEL_TRACE")) + print_kernel_trace = atoi(envStr); + else + print_kernel_trace = 0; + + DP("Start initializing HSA-ATMI\n"); + + atmi_status_t err = atmi_init(ATMI_DEVTYPE_GPU); + if (err != ATMI_STATUS_SUCCESS) { + DP("Error when initializing HSA-ATMI\n"); + return; + } + atmi_machine_t *machine = atmi_machine_get_info(); + NumberOfiGPUs = machine->device_count_by_type[ATMI_DEVTYPE_iGPU]; + NumberOfdGPUs = machine->device_count_by_type[ATMI_DEVTYPE_dGPU]; + NumberOfDevices = machine->device_count_by_type[ATMI_DEVTYPE_GPU]; + NumberOfCPUs = machine->device_count_by_type[ATMI_DEVTYPE_CPU]; + + if (NumberOfDevices == 0) { + DP("There are no devices supporting HSA.\n"); + return; + } else { + DP("There are %d devices supporting HSA.\n", NumberOfDevices); + } + + Machine = machine; + + // Init the device info + FuncGblEntries.resize(NumberOfDevices); + GPUPlaces.resize(NumberOfDevices); + GPUMEMPlaces.resize(NumberOfDevices); + CPUMEMPlaces.resize(NumberOfCPUs); + HSAAgents.resize(NumberOfDevices); + ThreadsPerGroup.resize(NumberOfDevices); + ComputeUnits.resize(NumberOfDevices); + GroupsPerDevice.resize(NumberOfDevices); + WarpSize.resize(NumberOfDevices); + NumTeams.resize(NumberOfDevices); + NumThreads.resize(NumberOfDevices); + + for (int i = 0; i < NumberOfCPUs; i++) { + CPUMEMPlaces[i] = (atmi_mem_place_t)ATMI_MEM_PLACE_CPU_MEM(0, i, 0); + } + for (int i = 0; i < NumberOfDevices; i++) { + ThreadsPerGroup[i] = RTLDeviceInfoTy::Default_WG_Size; + GroupsPerDevice[i] = RTLDeviceInfoTy::DefaultNumTeams; + ComputeUnits[i] = 1; + + DP("Device %d: Initial groupsPerDevice %d & threadsPerGroup %d\n", i, + GroupsPerDevice[i], ThreadsPerGroup[i]); + + // ATMI API to get gpu and gpu memory place + GPUPlaces[i] = (atmi_place_t)ATMI_PLACE_GPU(0, i); + GPUMEMPlaces[i] = (atmi_mem_place_t)ATMI_MEM_PLACE_GPU_MEM(0, i, 0); + + // ATMI API to get HSA agent + err = atmi_interop_hsa_get_agent(GPUPlaces[i], &(HSAAgents[i])); + check("Get HSA agents", err); + } + + // Get environment variables regarding teams + char *envStr = getenv("OMP_TEAM_LIMIT"); + if (envStr) { + // OMP_TEAM_LIMIT has been set + EnvTeamLimit = std::stoi(envStr); + DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit); + } else { + EnvTeamLimit = -1; + } + envStr = getenv("OMP_NUM_TEAMS"); + if (envStr) { + // OMP_NUM_TEAMS has been set + EnvNumTeams = std::stoi(envStr); + DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams); + } else { + EnvNumTeams = -1; + } + // Get environment variables regarding expMaxTeams + envStr = getenv("OMP_MAX_TEAMS_DEFAULT"); + if (envStr) { + EnvMaxTeamsDefault = std::stoi(envStr); + DP("Parsed OMP_MAX_TEAMS_DEFAULT=%d\n", EnvMaxTeamsDefault); + } else { + EnvMaxTeamsDefault = -1; + } + + // Default state. + RequiresFlags = OMP_REQ_UNDEFINED; + } + + ~RTLDeviceInfoTy() { + DP("Finalizing the HSA-ATMI DeviceInfo.\n"); + atmi_finalize(); + } +}; + +#include "../../../src/device_env_struct.h" + +static RTLDeviceInfoTy DeviceInfo; + +static char GPUName[256] = "--unknown gpu--"; + +#ifdef __cplusplus +extern "C" { +#endif + +int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { + + // Is the library version incompatible with the header file? 
+  if (elf_version(EV_CURRENT) == EV_NONE) {
+    DP("Incompatible ELF library!\n");
+    return 0;
+  }
+
+  char *img_begin = (char *)image->ImageStart;
+  char *img_end = (char *)image->ImageEnd;
+  size_t img_size = img_end - img_begin;
+
+  // Obtain elf handler
+  Elf *e = elf_memory(img_begin, img_size);
+  if (!e) {
+    DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1));
+    return 0;
+  }
+
+  // Check if ELF is the right kind.
+  if (elf_kind(e) != ELF_K_ELF) {
+    DP("Unexpected ELF type!\n");
+    elf_end(e);
+    return 0;
+  }
+  Elf64_Ehdr *eh64 = elf64_getehdr(e);
+  Elf32_Ehdr *eh32 = elf32_getehdr(e);
+
+  if (!eh64 && !eh32) {
+    DP("Unable to get machine ID from ELF file!\n");
+    elf_end(e);
+    return 0;
+  }
+
+  uint16_t MachineID;
+  if (eh64 && !eh32)
+    MachineID = eh64->e_machine;
+  else if (eh32 && !eh64)
+    MachineID = eh32->e_machine;
+  else {
+    DP("Ambiguous ELF header!\n");
+    elf_end(e);
+    return 0;
+  }
+
+  elf_end(e);
+
+  switch (MachineID) {
+  // old brig file in HSA 1.0P
+  case 0:
+  // brig file in HSAIL path
+  case 44890:
+  case 44891:
+    break;
+  // amdgcn
+  case 224:
+    break;
+  default:
+    DP("Unsupported machine ID found: %d\n", MachineID);
+    return 0;
+  }
+
+  return 1;
+}
+
+int __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; }
+
+int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
+  DP("Init requires flags to %ld\n", RequiresFlags);
+  DeviceInfo.RequiresFlags = RequiresFlags;
+  return RequiresFlags;
+}
+
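The `__tgt_rtl_*` entry points above are resolved dynamically by libomptarget. A rough sketch of that probing flow, assuming only that the plugin exports the C symbols defined in this file (the library path is a placeholder):

// probe_plugin.cpp -- illustrative only; libomptarget does this internally.
#include <cstdio>
#include <dlfcn.h>

struct __tgt_device_image; // opaque here; fully defined in omptarget.h

int main() {
  void *h = dlopen("libomptarget.rtl.hsa.so", RTLD_NOW);
  if (!h) {
    fprintf(stderr, "dlopen failed: %s\n", dlerror());
    return 1;
  }
  auto num_devices =
      reinterpret_cast<int (*)()>(dlsym(h, "__tgt_rtl_number_of_devices"));
  auto is_valid = reinterpret_cast<int (*)(__tgt_device_image *)>(
      dlsym(h, "__tgt_rtl_is_valid_binary"));
  if (num_devices)
    printf("HSA devices visible: %d\n", num_devices());
  (void)is_valid; // would be called once per embedded device image
  dlclose(h);
  return 0;
}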
+int32_t __tgt_rtl_init_device(int device_id) {
+  hsa_status_t err;
+
+  // this is per device id init
+  DP("Initialize the device id: %d\n", device_id);
+
+  hsa_agent_t &agent = DeviceInfo.HSAAgents[device_id];
+
+  // Get number of Compute Units
+  uint32_t compute_units = 0;
+  err = hsa_agent_get_info(
+      agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
+      &compute_units);
+  if (err != HSA_STATUS_SUCCESS) {
+    DeviceInfo.ComputeUnits[device_id] = 1;
+    DP("Error getting compute units: setting to 1\n");
+  } else {
+    DeviceInfo.ComputeUnits[device_id] = compute_units;
+    DP("Using %d compute units per grid\n",
+       DeviceInfo.ComputeUnits[device_id]);
+  }
+  if (print_kernel_trace > 1)
+    fprintf(stderr, "Device#%-2d CU's: %2d\n", device_id,
+            DeviceInfo.ComputeUnits[device_id]);
+
+  // Query attributes to determine number of threads/block and blocks/grid.
+  uint16_t workgroup_max_dim[3];
+  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
+                           &workgroup_max_dim);
+  if (err != HSA_STATUS_SUCCESS) {
+    DeviceInfo.GroupsPerDevice[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
+    DP("Error getting grid dims: num groups : %d\n",
+       RTLDeviceInfoTy::DefaultNumTeams);
+  } else if (workgroup_max_dim[0] <= RTLDeviceInfoTy::HardTeamLimit) {
+    DeviceInfo.GroupsPerDevice[device_id] = workgroup_max_dim[0];
+    DP("Using %d ROCm blocks per grid\n",
+       DeviceInfo.GroupsPerDevice[device_id]);
+  } else {
+    DeviceInfo.GroupsPerDevice[device_id] = RTLDeviceInfoTy::HardTeamLimit;
+    DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping "
+       "at the hard limit\n",
+       workgroup_max_dim[0], RTLDeviceInfoTy::HardTeamLimit);
+  }
+
+  // Get thread limit
+  hsa_dim3_t grid_max_dim;
+  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_GRID_MAX_DIM, &grid_max_dim);
+  if (err == HSA_STATUS_SUCCESS) {
+    DeviceInfo.ThreadsPerGroup[device_id] =
+        reinterpret_cast<uint32_t *>(&grid_max_dim)[0] /
+        DeviceInfo.GroupsPerDevice[device_id];
+    if ((DeviceInfo.ThreadsPerGroup[device_id] >
+         RTLDeviceInfoTy::Max_WG_Size) ||
+        DeviceInfo.ThreadsPerGroup[device_id] == 0) {
+      DP("Capped thread limit: %d\n", RTLDeviceInfoTy::Max_WG_Size);
+      DeviceInfo.ThreadsPerGroup[device_id] = RTLDeviceInfoTy::Max_WG_Size;
+    } else {
+      DP("Using ROCm queried thread limit: %d\n",
+         DeviceInfo.ThreadsPerGroup[device_id]);
+    }
+  } else {
+    DeviceInfo.ThreadsPerGroup[device_id] = RTLDeviceInfoTy::Max_WG_Size;
+    DP("Error getting max block dimension, using default: %d\n",
+       RTLDeviceInfoTy::Max_WG_Size);
+  }
+
+  // Get wavefront size
+  uint32_t wavefront_size = 0;
+  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
+                           &wavefront_size);
+  if (err == HSA_STATUS_SUCCESS) {
+    DP("Queried wavefront size: %d\n", wavefront_size);
+    DeviceInfo.WarpSize[device_id] = wavefront_size;
+  } else {
+    DP("Default wavefront size: %d\n",
+       AMDGPUGpuGridValues[GVIDX::GV_Warp_Size]);
+    DeviceInfo.WarpSize[device_id] = AMDGPUGpuGridValues[GVIDX::GV_Warp_Size];
+  }
+
+  err = hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME,
+                           (void *)GPUName);
+  DP("Name of gpu:%s\n", GPUName);
+
+  // Adjust teams to the env variables
+  if (DeviceInfo.EnvTeamLimit > 0 &&
+      DeviceInfo.GroupsPerDevice[device_id] > DeviceInfo.EnvTeamLimit) {
+    DeviceInfo.GroupsPerDevice[device_id] = DeviceInfo.EnvTeamLimit;
+    DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n",
+       DeviceInfo.EnvTeamLimit);
+  }
+
+  // Set default number of teams
+  if (DeviceInfo.EnvNumTeams > 0) {
+    DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams;
+    DP("Default number of teams set according to environment %d\n",
+       DeviceInfo.EnvNumTeams);
+  } else {
+    DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
+    DP("Default number of teams set according to library's default %d\n",
+       RTLDeviceInfoTy::DefaultNumTeams);
+  }
+
+  if (DeviceInfo.NumTeams[device_id] > DeviceInfo.GroupsPerDevice[device_id]) {
+    DeviceInfo.NumTeams[device_id] = DeviceInfo.GroupsPerDevice[device_id];
+    DP("Default number of teams exceeds device limit, capping at %d\n",
+       DeviceInfo.GroupsPerDevice[device_id]);
+  }
+
+  // Set default number of threads
+  DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::Default_WG_Size;
+  DP("Default number of threads set according to library's default %d\n",
+     RTLDeviceInfoTy::Default_WG_Size);
+  if (DeviceInfo.NumThreads[device_id] >
+      DeviceInfo.ThreadsPerGroup[device_id]) {
+    DeviceInfo.NumThreads[device_id] = DeviceInfo.ThreadsPerGroup[device_id];
+    DP("Default number of threads exceeds device limit, capping at %d\n",
+       DeviceInfo.ThreadsPerGroup[device_id]);
+  }
+
+  DP("Device %d: default limit for groupsPerDevice %d & threadsPerGroup %d\n",
+     device_id, DeviceInfo.GroupsPerDevice[device_id],
+     DeviceInfo.ThreadsPerGroup[device_id]);
+
+  DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", device_id,
+     DeviceInfo.WarpSize[device_id], DeviceInfo.ThreadsPerGroup[device_id],
+     DeviceInfo.GroupsPerDevice[device_id],
+     DeviceInfo.GroupsPerDevice[device_id] *
+         DeviceInfo.ThreadsPerGroup[device_id]);
+
+  return OFFLOAD_SUCCESS;
+}
+
+__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
+                                          __tgt_device_image *image) {
+  size_t img_size = (char *)image->ImageEnd - (char *)image->ImageStart;
+
+  DeviceInfo.clearOffloadEntriesTable(device_id);
+  // TODO: is BRIG even required to be supported? Can we assume AMDGCN only?
+  int useBrig = 0;
+
+  // We do not need to set the ELF version because the caller of this function
+  // had to do that to decide the right runtime to use
+
+  // Obtain elf handler and do an extra check
+  {
+    Elf *elfP = elf_memory((char *)image->ImageStart, img_size);
+    if (!elfP) {
+      DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1));
+      return 0;
+    }
+
+    if (elf_kind(elfP) != ELF_K_ELF) {
+      DP("Invalid Elf kind!\n");
+      elf_end(elfP);
+      return 0;
+    }
+
+    uint16_t MachineID;
+    {
+      Elf64_Ehdr *eh64 = elf64_getehdr(elfP);
+      Elf32_Ehdr *eh32 = elf32_getehdr(elfP);
+      if (eh64 && !eh32)
+        MachineID = eh64->e_machine;
+      else if (eh32 && !eh64)
+        MachineID = eh32->e_machine;
+      else {
+        DP("Ambiguous ELF header!\n");
+        elf_end(elfP);
+        return 0;
+      }
+    }
+
+    switch (MachineID) {
+    // old brig file in HSA 1.0P
+    case 0:
+    // brig file in HSAIL path
+    case 44890:
+    case 44891:
+      useBrig = 1;
+      break;
+    case 224:
+      // do nothing, amdgcn
+      break;
+    default:
+      DP("Unsupported machine ID found: %d\n", MachineID);
+      elf_end(elfP);
+      return 0;
+    }
+
+    DP("Machine ID found: %d\n", MachineID);
+    // Close elf
+    elf_end(elfP);
+  }
+
+  atmi_status_t err;
+  // Temporarily adding a fix to get series 7 and 8 GPU cards working.
+  // This patch needs to be reverted once we have a fix as described in:
+  // https://github.com/RadeonOpenCompute/atmi-staging/issues/67
+  // https://github.com/RadeonOpenCompute/atmi-staging/issues/65
+
+  static int visited = 0;
+  if (!visited) {
+    atmi_platform_type_t platform = (useBrig ? BRIG : AMDGCN);
+    void *new_img = malloc(img_size);
+    memcpy(new_img, image->ImageStart, img_size);
+    err = atmi_module_register_from_memory((void **)&new_img, &img_size,
+                                           &platform, 1);
+
+    free(new_img);
+    check("Module registering", err);
+    if (err != ATMI_STATUS_SUCCESS) {
+      fprintf(stderr,
+              "Possible gpu arch mismatch: %s, please check"
+              " compiler: -march= flag\n",
+              GPUName);
+      return NULL;
+    }
+    new_img = NULL;
+    visited = 1;
+  }
+
+  DP("ATMI module successfully loaded!\n");
+
+  // TODO: Check with Guansong to understand the below comment more thoroughly.
+  // Here, we take advantage of the data that is appended after img_end to get
+  // the symbol names we need to load. This data consists of the host entries
+  // begin and end as well as the target name (see the offloading linker script
+  // creation in clang compiler).
+
+  // Find the symbols in the module by name. The name can be obtained by
+  // concatenating the host entry name with the target name.
+
+  __tgt_offload_entry *HostBegin = image->EntriesBegin;
+  __tgt_offload_entry *HostEnd = image->EntriesEnd;
+
+  for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) {
+
+    if (!e->addr) {
+      // FIXME: Probably we should fail when something like this happens, the
+      // host should have always something in the address to uniquely identify
+      // the target region.
+      DP("Analyzing host entry '<null>' (size = %lld)...\n",
+         (unsigned long long)e->size);
+
+      __tgt_offload_entry entry = *e;
+      DeviceInfo.addOffloadEntry(device_id, entry);
+      continue;
+    }
+
+    if (e->size) {
+      __tgt_offload_entry entry = *e;
+
+      void *varptr;
+      uint32_t varsize;
+      atmi_mem_place_t place = DeviceInfo.GPUMEMPlaces[device_id];
+      err = atmi_interop_hsa_get_symbol_info(place, e->name, &varptr, &varsize);
+
+      if (err != ATMI_STATUS_SUCCESS) {
+        DP("Loading global '%s' (Failed)\n", e->name);
+        return NULL;
+      }
+
+      if (varsize != e->size) {
+        DP("Loading global '%s' - size mismatch (%u != %lu)\n", e->name,
+           varsize, e->size);
+        return NULL;
+      }
+
+      DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
+         DPxPTR(e - HostBegin), e->name, DPxPTR(varptr));
+      entry.addr = (void *)varptr;
+
+      DeviceInfo.addOffloadEntry(device_id, entry);
+
+      if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
+          e->flags & OMP_DECLARE_TARGET_LINK) {
+        // If unified memory is present any target link variables
+        // can access host addresses directly. There is no longer a
+        // need for device copies.
+        err = atmi_memcpy(varptr, e->addr, sizeof(void *));
+        if (err != ATMI_STATUS_SUCCESS)
+          DP("Error when copying USM\n");
+        DP("Copy linked variable host address (" DPxMOD ")"
+           "to device address (" DPxMOD ")\n",
+           DPxPTR(*((void **)e->addr)), DPxPTR(varptr));
+      }
+
+      continue;
+    }
+
+    DP("Looking up kernel name: %s, size: %lu\n", e->name, strlen(e->name));
+
+    atmi_mem_place_t place = DeviceInfo.GPUMEMPlaces[device_id];
+    atmi_kernel_t kernel;
+    uint32_t kernel_segment_size;
+    err = atmi_interop_hsa_get_kernel_info(
+        place, e->name, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
+        &kernel_segment_size);
+
+    // each arg is a void * in this openmp implementation
+    uint32_t arg_num = kernel_segment_size / sizeof(void *);
+    std::vector<size_t> arg_sizes(arg_num);
+    for (std::vector<size_t>::iterator it = arg_sizes.begin();
+         it != arg_sizes.end(); it++) {
+      *it = sizeof(void *);
+    }
+
+    atmi_status_t stat = atmi_kernel_create(&kernel, arg_num, &arg_sizes[0], 1,
+                                            ATMI_DEVTYPE_GPU, e->name);
+
+    if (stat != ATMI_STATUS_SUCCESS) {
+      DP("atmi_kernel_create failed %d\n", stat);
+      return NULL;
+    }
+
+    // default value GENERIC (in case the symbol is missing from the device
+    // image)
+    int8_t ExecModeVal = ExecutionModeType::GENERIC;
+
+    // get flat group size if present, else Default_WG_Size
+    int16_t WGSizeVal = RTLDeviceInfoTy::Default_WG_Size;
+
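Because every kernel argument is lowered to a pointer in this OpenMP implementation, the kernarg segment size directly determines the argument count. A self-contained worked example under that assumption (the three-pointer kernel is hypothetical):

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical kernel: a target region that captured three pointers, so
  // the kernarg segment reported for it is 3 * sizeof(void *) bytes.
  uint32_t kernel_segment_size = 3 * sizeof(void *);
  uint32_t arg_num = kernel_segment_size / sizeof(void *);
  std::vector<size_t> arg_sizes(arg_num, sizeof(void *));
  // On a 64-bit host this prints: args: 3, each 8 bytes
  printf("args: %u, each %zu bytes\n", arg_num, arg_sizes[0]);
  return 0;
}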
+    // get Kernel Descriptor if present.
+    // Keep struct in sync with getTgtAttributeStructQTy in CGOpenMPRuntime.cpp
+    struct KernDescValType {
+      uint16_t Version;
+      uint16_t TSize;
+      uint16_t WG_Size;
+      uint8_t Mode;
+      uint8_t HostServices;
+    };
+    struct KernDescValType KernDescVal;
+    std::string KernDescNameStr(e->name);
+    KernDescNameStr += "_kern_desc";
+    const char *KernDescName = KernDescNameStr.c_str();
+
+    void *KernDescPtr;
+    uint32_t KernDescSize;
+    err = atmi_interop_hsa_get_symbol_info(place, KernDescName, &KernDescPtr,
+                                           &KernDescSize);
+    if (err == ATMI_STATUS_SUCCESS) {
+      if ((size_t)KernDescSize != sizeof(KernDescVal))
+        DP("Loading global computation properties '%s' - size mismatch (%u != "
+           "%lu)\n",
+           KernDescName, KernDescSize, sizeof(KernDescVal));
+
+      err = atmi_memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize);
+      if (err != ATMI_STATUS_SUCCESS) {
+        DP("Error when copying data from device to host. Pointers: "
+           "host = " DPxMOD ", device = " DPxMOD ", size = %u\n",
+           DPxPTR(&KernDescVal), DPxPTR(KernDescPtr), KernDescSize);
+        return NULL;
+      }
+      // Check structure size against recorded size.
+      if ((size_t)KernDescSize != KernDescVal.TSize)
+        DP("KernDescVal size %lu does not match advertised size %d for '%s'\n",
+           sizeof(KernDescVal), KernDescVal.TSize, KernDescName);
+
+      DP("After loading global for %s KernDesc \n", KernDescName);
+      DP("KernDesc: Version: %d\n", KernDescVal.Version);
+      DP("KernDesc: TSize: %d\n", KernDescVal.TSize);
+      DP("KernDesc: WG_Size: %d\n", KernDescVal.WG_Size);
+      DP("KernDesc: Mode: %d\n", KernDescVal.Mode);
+      DP("KernDesc: HostServices: %x\n", KernDescVal.HostServices);
+
+      ExecModeVal = KernDescVal.Mode;
+      DP("ExecModeVal %d\n", ExecModeVal);
+      if (KernDescVal.WG_Size == 0) {
+        KernDescVal.WG_Size = RTLDeviceInfoTy::Default_WG_Size;
+        DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WG_Size);
+      }
+      WGSizeVal = KernDescVal.WG_Size;
+      DP("WGSizeVal %d\n", WGSizeVal);
+      check("Loading KernDesc computation property", err);
+    } else {
+      DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName);
+
+      // Generic
+      std::string ExecModeNameStr(e->name);
+      ExecModeNameStr += "_exec_mode";
+      const char *ExecModeName = ExecModeNameStr.c_str();
+
+      void *ExecModePtr;
+      uint32_t varsize;
+      err = atmi_interop_hsa_get_symbol_info(place, ExecModeName, &ExecModePtr,
+                                             &varsize);
+      if (err == ATMI_STATUS_SUCCESS) {
+        if ((size_t)varsize != sizeof(int8_t)) {
+          DP("Loading global computation properties '%s' - size mismatch (%u "
+             "!= %lu)\n",
+             ExecModeName, varsize, sizeof(int8_t));
+          return NULL;
+        }
+
+        err = atmi_memcpy(&ExecModeVal, ExecModePtr, (size_t)varsize);
+        if (err != ATMI_STATUS_SUCCESS) {
+          DP("Error when copying data from device to host. Pointers: "
+             "host = " DPxMOD ", device = " DPxMOD ", size = %u\n",
+             DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), varsize);
+          return NULL;
+        }
+        DP("After loading global for %s ExecMode = %d\n", ExecModeName,
+           ExecModeVal);
+
+        if (ExecModeVal < 0 || ExecModeVal > 1) {
+          DP("Error: wrong exec_mode value specified in HSA code object file: "
+             "%d\n",
+             ExecModeVal);
+          return NULL;
+        }
+      } else {
+        DP("Loading global exec_mode '%s' - symbol missing, using default "
+           "value GENERIC (1)\n",
+           ExecModeName);
+      }
+      check("Loading computation property", err);
+
+      // Flat group size
+      std::string WGSizeNameStr(e->name);
+      WGSizeNameStr += "_wg_size";
+      const char *WGSizeName = WGSizeNameStr.c_str();
+
+      void *WGSizePtr;
+      uint32_t WGSize;
+      err = atmi_interop_hsa_get_symbol_info(place, WGSizeName, &WGSizePtr,
+                                             &WGSize);
+      if (err == ATMI_STATUS_SUCCESS) {
+        if ((size_t)WGSize != sizeof(int16_t)) {
+          DP("Loading global computation properties '%s' - size mismatch (%u "
+             "!= %lu)\n",
+             WGSizeName, WGSize, sizeof(int16_t));
+          return NULL;
+        }
+
+        err = atmi_memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize);
+        if (err != ATMI_STATUS_SUCCESS) {
+          DP("Error when copying data from device to host. Pointers: "
+             "host = " DPxMOD ", device = " DPxMOD ", size = %u\n",
+             DPxPTR(&WGSizeVal), DPxPTR(WGSizePtr), WGSize);
+          return NULL;
+        }
+        DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal);
+
+        if (WGSizeVal < RTLDeviceInfoTy::Default_WG_Size ||
+            WGSizeVal > RTLDeviceInfoTy::Max_WG_Size) {
+          DP("Error: wrong WGSize value specified in HSA code object file: "
+             "%d\n",
+             WGSizeVal);
+          WGSizeVal = RTLDeviceInfoTy::Default_WG_Size;
+        }
+      } else {
+        DP("Warning: Loading WGSize '%s' - symbol not found, "
+           "using default value %d\n",
+           WGSizeName, WGSizeVal);
+      }
+
+      check("Loading WGSize computation property", err);
+    }
+
+    KernelsList.push_back(KernelTy(kernel, ExecModeVal, WGSizeVal, e->name));
+    __tgt_offload_entry entry = *e;
+    entry.addr = (void *)&KernelsList.back();
+    DeviceInfo.addOffloadEntry(device_id, entry);
+    DP("Entry point %ld maps to %s\n", e - HostBegin, e->name);
+  }
+
+  { // send device environment here
+    omptarget_device_environmentTy host_device_env;
+    host_device_env.num_devices = DeviceInfo.NumberOfDevices;
+    host_device_env.device_num = device_id;
+    host_device_env.debug_level = 0;
+#ifdef OMPTARGET_DEBUG
+    if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) {
+      host_device_env.debug_level = std::stoi(envStr);
+    }
+#endif
+
+    const char *device_env_Name = "omptarget_device_environment";
+    void *device_env_Ptr;
+    uint32_t varsize;
+
+    atmi_mem_place_t place = DeviceInfo.GPUMEMPlaces[device_id];
+    err = atmi_interop_hsa_get_symbol_info(place, device_env_Name,
+                                           &device_env_Ptr, &varsize);
+
+    if (err == ATMI_STATUS_SUCCESS) {
+      if ((size_t)varsize != sizeof(host_device_env)) {
+        DP("Global device_environment '%s' - size mismatch (%u != %lu)\n",
+           device_env_Name, varsize, sizeof(host_device_env));
+        return NULL;
+      }
+
+      err = atmi_memcpy(device_env_Ptr, &host_device_env, (size_t)varsize);
+      if (err != ATMI_STATUS_SUCCESS) {
+        DP("Error when copying data from host to device. Pointers: "
+           "host = " DPxMOD ", device = " DPxMOD ", size = %u\n",
+           DPxPTR(&host_device_env), DPxPTR(device_env_Ptr), varsize);
+        return NULL;
+      }
+
+      DP("Sending global device environment %lu bytes\n", (size_t)varsize);
+    } else {
+      DP("Finding global device environment '%s' - symbol missing.\n",
+         device_env_Name);
+      // no need to return NULL; consider this is not a device debug build.
+      // return NULL;
+    }
+
+    check("Sending device environment", err);
+  }
+
+  return DeviceInfo.getOffloadEntriesTable(device_id);
+}
+
+void *__tgt_rtl_data_alloc(int device_id, int64_t size, void *) {
+  void *ptr = NULL;
+  assert(device_id <
+             (int)DeviceInfo.Machine->device_count_by_type[ATMI_DEVTYPE_GPU] &&
+         "Device ID too large");
+  atmi_mem_place_t place = DeviceInfo.GPUMEMPlaces[device_id];
+  atmi_status_t err = atmi_malloc(&ptr, size, place);
+  DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", size,
+     (long long unsigned)(Elf64_Addr)ptr);
+  ptr = (err == ATMI_STATUS_SUCCESS) ? ptr : NULL;
+  return ptr;
+}
+
+int32_t __tgt_rtl_data_submit(int device_id, void *tgt_ptr, void *hst_ptr,
+                              int64_t size) {
+  atmi_status_t err;
+  assert(device_id <
+             (int)DeviceInfo.Machine->device_count_by_type[ATMI_DEVTYPE_GPU] &&
+         "Device ID too large");
+  // Return success if we are not doing host to target.
+  if (!hst_ptr)
+    return OFFLOAD_SUCCESS;
+
+  DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", size,
+     (long long unsigned)(Elf64_Addr)hst_ptr,
+     (long long unsigned)(Elf64_Addr)tgt_ptr);
+  err = atmi_memcpy(tgt_ptr, hst_ptr, (size_t)size);
+  if (err != ATMI_STATUS_SUCCESS) {
+    DP("Error when copying data from host to device. Pointers: "
+       "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
+       (Elf64_Addr)hst_ptr, (Elf64_Addr)tgt_ptr, (unsigned long long)size);
+    return OFFLOAD_FAIL;
+  }
+  return OFFLOAD_SUCCESS;
+}
+
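Taken together, the data entry points follow the usual alloc/submit/retrieve/delete protocol. A hedged usage sketch (in reality libomptarget drives these calls; device 0 and the roundtrip itself are illustrative):

// Illustrative in-file sequence only, assuming the plugin initialized.
int32_t demo_roundtrip() {
  int64_t size = sizeof(double) * 16;
  double host_in[16] = {0}, host_out[16] = {0};
  void *tgt = __tgt_rtl_data_alloc(0, size, nullptr); // device buffer
  if (!tgt)
    return OFFLOAD_FAIL;
  if (__tgt_rtl_data_submit(0, tgt, host_in, size) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;
  // ... a kernel launched via __tgt_rtl_run_target_team_region would
  // read and write tgt here ...
  if (__tgt_rtl_data_retrieve(0, host_out, tgt, size) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;
  return __tgt_rtl_data_delete(0, tgt);
}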
+int32_t __tgt_rtl_data_retrieve(int device_id, void *hst_ptr, void *tgt_ptr,
+                                int64_t size) {
+  assert(device_id <
+             (int)DeviceInfo.Machine->device_count_by_type[ATMI_DEVTYPE_GPU] &&
+         "Device ID too large");
+  // Return success if we are not copying back to host from target.
+  if (!hst_ptr)
+    return OFFLOAD_SUCCESS;
+  atmi_status_t err;
+  DP("Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", size,
+     (long long unsigned)(Elf64_Addr)tgt_ptr,
+     (long long unsigned)(Elf64_Addr)hst_ptr);
+  err = atmi_memcpy(hst_ptr, tgt_ptr, (size_t)size);
+  if (err != ATMI_STATUS_SUCCESS) {
+    DP("Error when copying data from device to host. Pointers: "
+       "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
+       (Elf64_Addr)hst_ptr, (Elf64_Addr)tgt_ptr, (unsigned long long)size);
+    return OFFLOAD_FAIL;
+  }
+  DP("DONE Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", size,
+     (long long unsigned)(Elf64_Addr)tgt_ptr,
+     (long long unsigned)(Elf64_Addr)hst_ptr);
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_data_delete(int device_id, void *tgt_ptr) {
+  assert(device_id <
+             (int)DeviceInfo.Machine->device_count_by_type[ATMI_DEVTYPE_GPU] &&
+         "Device ID too large");
+  atmi_status_t err;
+  DP("Tgt free data (tgt:%016llx).\n",
+     (long long unsigned)(Elf64_Addr)tgt_ptr);
+  err = atmi_free(tgt_ptr);
+  if (err != ATMI_STATUS_SUCCESS) {
+    DP("Error when freeing device memory\n");
+    return OFFLOAD_FAIL;
+  }
+  return OFFLOAD_SUCCESS;
+}
+
+void retrieveDeviceEnv(int32_t device_id) {
+  int err;
+  const char *device_env_Name = "omptarget_device_environment";
+  omptarget_device_environmentTy host_device_env;
+  void *device_env_Ptr;
+  uint32_t varsize;
+
+  atmi_mem_place_t place = DeviceInfo.GPUMEMPlaces[device_id];
+  err = atmi_interop_hsa_get_symbol_info(place, device_env_Name,
+                                         &device_env_Ptr, &varsize);
+
+  if (err == ATMI_STATUS_SUCCESS) {
+    if ((size_t)varsize != sizeof(host_device_env)) {
+      DP("Global device_environment '%s' - size mismatch (%u != %lu)\n",
+         device_env_Name, varsize, sizeof(host_device_env));
+      return;
+    }
+
+    err = __tgt_rtl_data_retrieve(device_id, &host_device_env, device_env_Ptr,
+                                  varsize);
+    if (err != 0) {
+      DP("Error when copying data from device to host. Pointers: "
+         "host = " DPxMOD ", device = " DPxMOD ", size = %u\n",
+         DPxPTR(&host_device_env), DPxPTR(device_env_Ptr), varsize);
+      return;
+    }
+
+    DP("Retrieving device environment %lu bytes\n", (size_t)varsize);
+  } else {
+    DP("Retrieving device environment '%s' - symbol missing.\n",
+       device_env_Name);
+  }
+
+  check("Retrieving updated device environment", err);
+}
+
+// Determine launch values for threadsPerGroup and num_groups.
+// Outputs: threadsPerGroup, num_groups
+// Inputs: Max_Teams, Max_WG_Size, Warp_Size, ExecutionMode,
+//         EnvTeamLimit, EnvNumTeams, num_teams, thread_limit,
+//         loop_tripcount.
+void getLaunchVals(int &threadsPerGroup, unsigned &num_groups, int ConstWGSize,
+                   int ExecutionMode, int EnvTeamLimit, int EnvNumTeams,
+                   int num_teams, int thread_limit, uint64_t loop_tripcount) {
+
+  int Max_Teams = DeviceInfo.EnvMaxTeamsDefault > 0
+                      ? DeviceInfo.EnvMaxTeamsDefault
+                      : DeviceInfo.Max_Teams;
+  if (Max_Teams > DeviceInfo.HardTeamLimit)
+    Max_Teams = DeviceInfo.HardTeamLimit;
+
+  if (print_kernel_trace > 1) {
+    fprintf(stderr, "RTLDeviceInfoTy::Max_Teams: %d\n",
+            RTLDeviceInfoTy::Max_Teams);
+    fprintf(stderr, "Max_Teams: %d\n", Max_Teams);
+    fprintf(stderr, "RTLDeviceInfoTy::Warp_Size: %d\n",
+            RTLDeviceInfoTy::Warp_Size);
+    fprintf(stderr, "RTLDeviceInfoTy::Max_WG_Size: %d\n",
+            RTLDeviceInfoTy::Max_WG_Size);
+    fprintf(stderr, "RTLDeviceInfoTy::Default_WG_Size: %d\n",
+            RTLDeviceInfoTy::Default_WG_Size);
+    fprintf(stderr, "thread_limit: %d\n", thread_limit);
+    fprintf(stderr, "threadsPerGroup: %d\n", threadsPerGroup);
+    fprintf(stderr, "ConstWGSize: %d\n", ConstWGSize);
+  }
+  // check for thread_limit() clause
+  if (thread_limit > 0) {
+    threadsPerGroup = thread_limit;
+    DP("Setting threads per block to requested %d\n", thread_limit);
+    if (ExecutionMode == GENERIC) { // Add master warp for GENERIC
+      threadsPerGroup += RTLDeviceInfoTy::Warp_Size;
+      DP("Adding master wavefront: +%d threads\n", RTLDeviceInfoTy::Warp_Size);
+    }
+    if (threadsPerGroup > RTLDeviceInfoTy::Max_WG_Size) { // limit to max
+      threadsPerGroup = RTLDeviceInfoTy::Max_WG_Size;
+      DP("Setting threads per block to maximum %d\n", threadsPerGroup);
+    }
+  }
+  // check flat_max_work_group_size attr here
+  if (threadsPerGroup > ConstWGSize) {
+    threadsPerGroup = ConstWGSize;
+    DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n",
+       threadsPerGroup);
+  }
+  if (print_kernel_trace > 1)
+    fprintf(stderr, "threadsPerGroup: %d\n", threadsPerGroup);
+  DP("Preparing %d threads\n", threadsPerGroup);
+
+  // Set default num_groups (teams)
+  if (DeviceInfo.EnvTeamLimit > 0)
+    num_groups = (Max_Teams < DeviceInfo.EnvTeamLimit)
+                     ? Max_Teams
+                     : DeviceInfo.EnvTeamLimit;
+  else
+    num_groups = Max_Teams;
+  DP("Set default num of groups %d\n", num_groups);
+
+  if (print_kernel_trace > 1) {
+    fprintf(stderr, "num_groups: %d\n", num_groups);
+    fprintf(stderr, "num_teams: %d\n", num_teams);
+  }
+
+  // Reduce num_groups if threadsPerGroup exceeds RTLDeviceInfoTy::Max_WG_Size.
+  // This reduction is typical for the default case (no thread_limit clause),
+  // or when the user requests a very large num_teams clause.
+  // FIXME: We can't distinguish between a constant or variable thread limit,
+  // so we only handle constant thread_limits.
+  if (threadsPerGroup >
+      RTLDeviceInfoTy::Default_WG_Size) // 256 < threadsPerGroup <= 1024
+    // Should we round threadsPerGroup up to nearest RTLDeviceInfoTy::Warp_Size
+    // here?
+    num_groups = (Max_Teams * RTLDeviceInfoTy::Max_WG_Size) / threadsPerGroup;
+
+  // check for num_teams() clause
+  if (num_teams > 0) {
+    num_groups = (num_teams < num_groups) ? num_teams : num_groups;
+  }
+  if (print_kernel_trace > 1) {
+    fprintf(stderr, "num_groups: %d\n", num_groups);
+    fprintf(stderr, "DeviceInfo.EnvNumTeams %d\n", DeviceInfo.EnvNumTeams);
+    fprintf(stderr, "DeviceInfo.EnvTeamLimit %d\n", DeviceInfo.EnvTeamLimit);
+  }
+
+  if (DeviceInfo.EnvNumTeams > 0) {
+    num_groups = (DeviceInfo.EnvNumTeams < num_groups) ? DeviceInfo.EnvNumTeams
+                                                       : num_groups;
+    DP("Modifying teams based on EnvNumTeams %d\n", DeviceInfo.EnvNumTeams);
+  } else if (DeviceInfo.EnvTeamLimit > 0) {
+    num_groups = (DeviceInfo.EnvTeamLimit < num_groups)
+                     ? DeviceInfo.EnvTeamLimit
+                     : num_groups;
+    DP("Modifying teams based on EnvTeamLimit %d\n", DeviceInfo.EnvTeamLimit);
+  } else {
+    if (num_teams <= 0) {
+      if (loop_tripcount > 0) {
+        if (ExecutionMode == SPMD) {
+          // round up to the nearest integer
+          num_groups = ((loop_tripcount - 1) / threadsPerGroup) + 1;
+        } else {
+          num_groups = loop_tripcount;
+        }
+        DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
+           "threads per block %d\n",
+           num_groups, loop_tripcount, threadsPerGroup);
+      }
+    } else {
+      num_groups = num_teams;
+    }
+    if (num_groups > Max_Teams) {
+      num_groups = Max_Teams;
+      if (print_kernel_trace > 1)
+        fprintf(stderr, "Limiting num_groups %d to Max_Teams %d \n",
+                num_groups, Max_Teams);
+    }
+    if (num_groups > num_teams && num_teams > 0) {
+      num_groups = num_teams;
+      if (print_kernel_trace > 1)
+        fprintf(stderr, "Limiting num_groups %d to clause num_teams %d \n",
+                num_groups, num_teams);
+    }
+  }
+
+  // num_teams clause always honored, no matter what, unless DEFAULT is active.
+  if (num_teams > 0) {
+    num_groups = num_teams;
+    // Cap num_groups to EnvMaxTeamsDefault if set.
+    if (DeviceInfo.EnvMaxTeamsDefault > 0 &&
+        num_groups > DeviceInfo.EnvMaxTeamsDefault)
+      num_groups = DeviceInfo.EnvMaxTeamsDefault;
+  }
+  if (print_kernel_trace > 1) {
+    fprintf(stderr, "threadsPerGroup: %d\n", threadsPerGroup);
+    fprintf(stderr, "num_groups: %d\n", num_groups);
+    fprintf(stderr, "loop_tripcount: %" PRIu64 "\n", loop_tripcount);
+  }
+  DP("Final %d num_groups and %d threadsPerGroup\n", num_groups,
+     threadsPerGroup);
+}
+
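To make the selection concrete, here is a hedged walk-through of the function on sample inputs, using the defaults defined in this file (the probe function itself is hypothetical and not part of the patch):

// Hypothetical probe: exercises getLaunchVals for an SPMD kernel with
// no clauses, no env limits, and a trip count of 100000.
static void demoLaunchVals() {
  int threadsPerGroup = RTLDeviceInfoTy::Default_WG_Size; // 256
  unsigned num_groups = 0;
  getLaunchVals(threadsPerGroup, num_groups,
                /*ConstWGSize=*/RTLDeviceInfoTy::Default_WG_Size,
                /*ExecutionMode=*/SPMD,
                /*EnvTeamLimit=*/-1, /*EnvNumTeams=*/-1,
                /*num_teams=*/0, /*thread_limit=*/0,
                /*loop_tripcount=*/100000);
  // threadsPerGroup stays 256: there is no thread_limit clause, and it
  // does not exceed ConstWGSize or Max_WG_Size.
  // num_groups: the SPMD trip-count path computes ceil(100000/256) = 391,
  // which is then capped to Max_Teams = 128, so the final launch shape is
  // 128 groups x 256 threads.
}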
+int32_t __tgt_rtl_run_target_team_region(int32_t device_id,
+                                         void *tgt_entry_ptr, void **tgt_args,
+                                         ptrdiff_t *tgt_offsets,
+                                         int32_t arg_num, int32_t num_teams,
+                                         int32_t thread_limit,
+                                         uint64_t loop_tripcount) {
+
+  // Set the context we are using
+  // update thread limit content in gpu memory if un-initialized or specified
+  // from host
+
+  DP("Run target team region thread_limit %d\n", thread_limit);
+
+  // All args are references.
+  std::vector<void *> args(arg_num);
+  std::vector<void *> ptrs(arg_num);
+
+  DP("Arg_num: %d\n", arg_num);
+  for (int32_t i = 0; i < arg_num; ++i) {
+    ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
+    args[i] = &ptrs[i];
+    DP("Offset base: arg[%d]:" DPxMOD "\n", i, DPxPTR(ptrs[i]));
+  }
+
+  KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;
+
+  /*
+   * Set limit based on ThreadsPerGroup and GroupsPerDevice
+   */
+  unsigned num_groups = 0;
+
+  int threadsPerGroup = RTLDeviceInfoTy::Default_WG_Size;
+
+  getLaunchVals(threadsPerGroup, num_groups, KernelInfo->ConstWGSize,
+                KernelInfo->ExecutionMode, DeviceInfo.EnvTeamLimit,
+                DeviceInfo.EnvNumTeams,
+                num_teams,     // From run_region arg
+                thread_limit,  // From run_region arg
+                loop_tripcount // From run_region arg
+  );
+
+  if (print_kernel_trace > 0)
+    // enum modes are SPMD, GENERIC, NONE 0,1,2
+    fprintf(stderr,
+            "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) "
+            "reqd:(%4dX%4d) n:%s\n",
+            device_id, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize,
+            arg_num, num_groups, threadsPerGroup, num_teams, thread_limit,
+            KernelInfo->Name);
+
+  // Run on the device.
+  atmi_kernel_t kernel = KernelInfo->Func;
+  ATMI_LPARM_1D(lparm, num_groups * threadsPerGroup);
+  lparm->groupDim[0] = threadsPerGroup;
+  lparm->synchronous = ATMI_TRUE;
+  lparm->groupable = ATMI_FALSE;
+  lparm->place = DeviceInfo.GPUPlaces[device_id];
+  atmi_task_launch(lparm, kernel, &args[0]);
+
+  DP("Kernel completed\n");
+
+  retrieveDeviceEnv(device_id);
+
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
+                                    void **tgt_args, ptrdiff_t *tgt_offsets,
+                                    int32_t arg_num) {
+  // use one team and one thread
+  // fix thread num
+  int32_t team_num = 1;
+  int32_t thread_limit = 0; // use default
+  return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
+                                          tgt_offsets, arg_num, team_num,
+                                          thread_limit, 0);
+}
+
+#ifdef __cplusplus
+}
+#endif
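Finally, a note on observability: the kernel-trace fprintf above is gated by LIBOMPTARGET_KERNEL_TRACE, which is read when the plugin's static RTLDeviceInfoTy is constructed. A hedged way to exercise it from a test without touching the shell environment (assuming the plugin has not been loaded yet when setenv runs, since libomptarget loads plugins lazily):

// trace_demo.cpp -- hypothetical; assumes an offload-capable build.
#include <cstdio>
#include <cstdlib>

int main() {
  // Equivalent to running with LIBOMPTARGET_KERNEL_TRACE=1 in the shell;
  // must be set before the first target region triggers plugin init.
  setenv("LIBOMPTARGET_KERNEL_TRACE", "1", /*overwrite=*/1);
  int x = 0;
#pragma omp target map(tofrom : x)
  x = 42;
  printf("x = %d\n", x);
  return 0;
}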