Index: libomptarget/CMakeLists.txt
===================================================================
--- libomptarget/CMakeLists.txt
+++ libomptarget/CMakeLists.txt
@@ -107,6 +107,9 @@
   set(LIBOMPTARGET_LIBRARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
 endif()
 
+# Build offloading plugins and device RTLs if they are available.
+add_subdirectory(plugins)
+
 # Add tests.
 add_subdirectory(test)
 
Index: libomptarget/plugins/CMakeLists.txt
===================================================================
--- /dev/null
+++ libomptarget/plugins/CMakeLists.txt
@@ -0,0 +1,71 @@
+##===----------------------------------------------------------------------===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is dual licensed under the MIT and the University of Illinois Open
+# Source Licenses. See LICENSE.txt for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# Build plugins for the user system if available.
+#
+##===----------------------------------------------------------------------===##
+
+# void build_generic_elf64(string tmachine, string tmachine_name,
+#     string tmachine_libname, string tmachine_triple, string elf_machine_id);
+# - build a plugin for an ELF based generic 64-bit target based on libffi.
+# - tmachine: name of the machine processor as used in the cmake build system.
+# - tmachine_name: name of the machine to be printed with the debug messages.
+# - tmachine_libname: machine name to be appended to the plugin library name.
+# - tmachine_triple: target triple reported to the parent scope for this
+#   machine.
+# - elf_machine_id: ELF e_machine value that identifies this machine's images.
+macro(build_generic_elf64 tmachine tmachine_name tmachine_libname tmachine_triple elf_machine_id)
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$")
+  if(LIBOMPTARGET_DEP_LIBELF_FOUND)
+    if(LIBOMPTARGET_DEP_LIBFFI_FOUND)
+
+      libomptarget_say("Building ${tmachine_name} offloading plugin.")
+
+      include_directories(${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR})
+      include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR})
+
+      # Define macro to be used as prefix of the runtime messages for this
+      # target.
+      add_definitions("-DTARGET_NAME=${tmachine_name}")
+
+      # Define macro with the ELF ID for this target.
+      add_definitions("-DTARGET_ELF_ID=${elf_machine_id}")
+
+      add_library("omptarget.rtl.${tmachine_libname}" SHARED
+        ${CMAKE_CURRENT_SOURCE_DIR}/../generic-elf-64bit/src/rtl.cpp)
+
+      # Install plugin under the lib destination folder.
+      install(TARGETS "omptarget.rtl.${tmachine_libname}"
+        LIBRARY DESTINATION lib${LIBOMPTARGET_LIBDIR_SUFFIX})
+
+      target_link_libraries(
+        "omptarget.rtl.${tmachine_libname}"
+        ${LIBOMPTARGET_DEP_LIBFFI_LIBRARIES}
+        ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES}
+        dl
+        "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports")
+
+      # Report to the parent scope that we are building a plugin.
+      set(LIBOMPTARGET_SYSTEM_TARGETS
+        "${LIBOMPTARGET_SYSTEM_TARGETS} ${tmachine_triple}" PARENT_SCOPE)
+
+    else(LIBOMPTARGET_DEP_LIBFFI_FOUND)
+      libomptarget_say("Not building ${tmachine_name} offloading plugin: libffi dependency not found.")
+    endif(LIBOMPTARGET_DEP_LIBFFI_FOUND)
+  else(LIBOMPTARGET_DEP_LIBELF_FOUND)
+    libomptarget_say("Not building ${tmachine_name} offloading plugin: libelf dependency not found.")
+  endif(LIBOMPTARGET_DEP_LIBELF_FOUND)
+else()
+  libomptarget_say("Not building ${tmachine_name} offloading plugin: machine not found in the system.")
+endif()
+endmacro()
+
+add_subdirectory(cuda)
+add_subdirectory(ppc64)
+add_subdirectory(ppc64le)
+add_subdirectory(x86_64)
+
+# Make sure the parent scope can see the plugins that will be created.
+set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
Index: libomptarget/plugins/common/elf_common.c
===================================================================
--- /dev/null
+++ libomptarget/plugins/common/elf_common.c
@@ -0,0 +1,73 @@
+//===-- elf_common.c - Common ELF functionality -------------------*- C -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Common ELF functionality for target plugins.
+// Must be included in the plugin source file AFTER omptarget.h has been
+// included and macro DP(...) has been defined.
+//
+//===----------------------------------------------------------------------===//
+
+#if !(defined(_OMPTARGET_H_) && defined(DP))
+#error Include elf_common.c in the plugin source AFTER omptarget.h has been\
+ included and macro DP(...) has been defined.
+#endif
+
+#include <elf.h>
+#include <libelf.h>
+
+// Check whether an image is valid for execution on target_id.
+static inline int32_t elf_check_machine(__tgt_device_image *image,
+                                        uint16_t target_id) {
+
+  // Is the library version incompatible with the header file?
+  if (elf_version(EV_CURRENT) == EV_NONE) {
+    DP("Incompatible ELF library!\n");
+    return 0;
+  }
+
+  char *img_begin = (char *)image->ImageStart;
+  char *img_end = (char *)image->ImageEnd;
+  size_t img_size = img_end - img_begin;
+
+  // Obtain elf handler.
+  Elf *e = elf_memory(img_begin, img_size);
+  if (!e) {
+    DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1));
+    return 0;
+  }
+
+  // Check if ELF is the right kind.
+  if (elf_kind(e) != ELF_K_ELF) {
+    DP("Unexpected ELF type!\n");
+    elf_end(e);
+    return 0;
+  }
+  Elf64_Ehdr *eh64 = elf64_getehdr(e);
+  Elf32_Ehdr *eh32 = elf32_getehdr(e);
+
+  if (!eh64 && !eh32) {
+    DP("Unable to get machine ID from ELF file!\n");
+    elf_end(e);
+    return 0;
+  }
+
+  uint16_t MachineID;
+  if (eh64 && !eh32)
+    MachineID = eh64->e_machine;
+  else if (eh32 && !eh64)
+    MachineID = eh32->e_machine;
+  else {
+    DP("Ambiguous ELF header!\n");
+    elf_end(e);
+    return 0;
+  }
+
+  elf_end(e);
+  return MachineID == target_id;
+}
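Every plugin's __tgt_rtl_is_valid_binary reduces to a call to elf_check_machine
with the target's e_machine value. For instance, this is what the generic
plugin effectively compiles to when CMake sets -DTARGET_ELF_ID=62 (EM_X86_64)
for the x86_64 build; shown here only to illustrate the intended usage:

    // Accept only x86-64 ELF images (EM_X86_64 == 62 in <elf.h>).
    int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
      return elf_check_machine(image, 62);
    }
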
Index: libomptarget/plugins/cuda/CMakeLists.txt
===================================================================
--- /dev/null
+++ libomptarget/plugins/cuda/CMakeLists.txt
@@ -0,0 +1,49 @@
+##===----------------------------------------------------------------------===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is dual licensed under the MIT and the University of Illinois Open
+# Source Licenses. See LICENSE.txt for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# Build a plugin for a CUDA machine if available.
+#
+##===----------------------------------------------------------------------===##
+if(LIBOMPTARGET_DEP_LIBELF_FOUND)
+  if(LIBOMPTARGET_DEP_CUDA_FOUND)
+    if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux")
+
+      libomptarget_say("Building CUDA offloading plugin.")
+
+      # Define the suffix for the runtime messaging dumps.
+      add_definitions(-DTARGET_NAME=CUDA)
+
+      if(LIBOMPTARGET_CMAKE_BUILD_TYPE MATCHES debug)
+        add_definitions(-DCUDA_ERROR_REPORT)
+      endif()
+
+      include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS})
+
+      add_library(omptarget.rtl.cuda SHARED src/rtl.cpp)
+
+      # Install plugin under the lib destination folder.
+      install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION lib${LIBOMPTARGET_LIBDIR_SUFFIX})
+
+      target_link_libraries(omptarget.rtl.cuda
+        ${LIBOMPTARGET_DEP_CUDA_LIBRARIES}
+        cuda
+        ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES}
+        "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports")
+
+      # Report to the parent scope that we are building a plugin for CUDA.
+      set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda" PARENT_SCOPE)
+    else()
+      libomptarget_say("Not building CUDA offloading plugin: CUDA is only supported on Linux x86_64 or ppc64le hosts.")
+    endif()
+  else()
+    libomptarget_say("Not building CUDA offloading plugin: CUDA not found in system.")
+  endif()
+else(LIBOMPTARGET_DEP_LIBELF_FOUND)
+  libomptarget_say("Not building CUDA offloading plugin: libelf dependency not found.")
+endif(LIBOMPTARGET_DEP_LIBELF_FOUND)
\ No newline at end of file
Index: libomptarget/plugins/cuda/src/rtl.cpp
===================================================================
--- /dev/null
+++ libomptarget/plugins/cuda/src/rtl.cpp
@@ -0,0 +1,670 @@
+//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// RTL for CUDA machine
+//
+//===----------------------------------------------------------------------===//
+
+#include <cassert>
+#include <cstddef>
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include <list>
+#include <string>
+#include <vector>
+
+#include "omptarget.h"
+
+#ifndef TARGET_NAME
+#define TARGET_NAME CUDA
+#endif
+
+#define GETNAME2(name) #name
+#define GETNAME(name) GETNAME2(name)
+#define DP(...) DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__)
+
+#include "../../common/elf_common.c"
+
+// Utility for retrieving and printing CUDA error string.
+#ifdef CUDA_ERROR_REPORT
+#define CUDA_ERR_STRING(err)                                                   \
+  do {                                                                         \
+    const char *errStr;                                                        \
+    cuGetErrorString(err, &errStr);                                            \
+    DP("CUDA error is: %s\n", errStr);                                         \
+  } while (0)
+#else
+#define CUDA_ERR_STRING(err)                                                   \
+  {}
+#endif
+
+/// Keep entries table per device.
+struct FuncOrGblEntryTy {
+  __tgt_target_table Table;
+  std::vector<__tgt_offload_entry> Entries;
+};
+
+enum ExecutionModeType {
+  SPMD,
+  GENERIC,
+  NONE
+};
+
+/// Use a single entity to encode a kernel and a set of flags.
+struct KernelTy {
+  CUfunction Func;
+
+  // Execution mode of the kernel:
+  // 0 - SPMD mode (without master warp)
+  // 1 - Generic mode (with master warp)
+  int8_t ExecutionMode;
+
+  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
+      : Func(_Func), ExecutionMode(_ExecutionMode) {}
+};
+
+/// List that contains all the kernels.
+/// FIXME: we may need this to be per device and per library.
+std::list<KernelTy> KernelsList;
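The ExecutionMode flag mirrors a per-kernel global named `<kernel>_exec_mode`
that __tgt_rtl_load_binary reads back from the loaded module (see below). A
hedged sketch of what the device image is assumed to provide for a kernel
named `foo` — the symbol name and initializer are inferred from the loader
code in this patch, not spelled out elsewhere:

    // Illustrative device-side companion global for a kernel 'foo'.
    // 1 == Generic mode, which is also the loader's default when the
    // symbol is absent from the cubin.
    __device__ int8_t foo_exec_mode = 1;
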
+
+/// Class containing all the device information.
+class RTLDeviceInfoTy {
+  std::vector<FuncOrGblEntryTy> FuncGblEntries;
+
+public:
+  int NumberOfDevices;
+  std::vector<CUmodule> Modules;
+  std::vector<CUcontext> Contexts;
+
+  // Device properties
+  std::vector<int> ThreadsPerBlock;
+  std::vector<int> BlocksPerGrid;
+  std::vector<int> WarpSize;
+
+  // OpenMP properties
+  std::vector<int> NumTeams;
+  std::vector<int> NumThreads;
+
+  // OpenMP environment properties
+  int EnvNumTeams;
+  int EnvTeamLimit;
+
+  //static int EnvNumThreads;
+  static const int HardTeamLimit = 1<<16; // 64k
+  static const int HardThreadLimit = 1024;
+  static const int DefaultNumTeams = 128;
+  static const int DefaultNumThreads = 1024;
+
+  // Record entry point associated with device.
+  void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id];
+
+    E.Entries.push_back(entry);
+  }
+
+  // Return true if the entry is associated with the device.
+  bool findOffloadEntry(int32_t device_id, void *addr) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id];
+
+    for (auto &it : E.Entries) {
+      if (it.addr == addr)
+        return true;
+    }
+
+    return false;
+  }
+
+  // Return the pointer to the target entries table.
+  __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id];
+
+    int32_t size = E.Entries.size();
+
+    // Table is empty.
+    if (!size)
+      return 0;
+
+    __tgt_offload_entry *begin = &E.Entries[0];
+    __tgt_offload_entry *end = &E.Entries[size - 1];
+
+    // Update table info according to the entries and return the pointer.
+    E.Table.EntriesBegin = begin;
+    E.Table.EntriesEnd = ++end;
+
+    return &E.Table;
+  }
+
+  // Clear entries table for a device.
+  void clearOffloadEntriesTable(int32_t device_id) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id];
+    E.Entries.clear();
+    E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
+  }
+
+  RTLDeviceInfoTy() {
+    DP("Start initializing CUDA\n");
+
+    CUresult err = cuInit(0);
+    if (err != CUDA_SUCCESS) {
+      DP("Error when initializing CUDA\n");
+      CUDA_ERR_STRING(err);
+      return;
+    }
+
+    NumberOfDevices = 0;
+
+    err = cuDeviceGetCount(&NumberOfDevices);
+    if (err != CUDA_SUCCESS) {
+      DP("Error when getting CUDA device count\n");
+      CUDA_ERR_STRING(err);
+      return;
+    }
+
+    if (NumberOfDevices == 0) {
+      DP("There are no devices supporting CUDA.\n");
+      return;
+    }
+
+    FuncGblEntries.resize(NumberOfDevices);
+    Contexts.resize(NumberOfDevices);
+    ThreadsPerBlock.resize(NumberOfDevices);
+    BlocksPerGrid.resize(NumberOfDevices);
+    WarpSize.resize(NumberOfDevices);
+    NumTeams.resize(NumberOfDevices);
+    NumThreads.resize(NumberOfDevices);
+
+    // Get environment variables regarding teams.
+    char *envStr = getenv("OMP_TEAM_LIMIT");
+    if (envStr) {
+      // OMP_TEAM_LIMIT has been set.
+      EnvTeamLimit = std::stoi(envStr);
+      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
+    } else {
+      EnvTeamLimit = -1;
+    }
+    envStr = getenv("OMP_NUM_TEAMS");
+    if (envStr) {
+      // OMP_NUM_TEAMS has been set.
+      EnvNumTeams = std::stoi(envStr);
+      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
+    } else {
+      EnvNumTeams = -1;
+    }
+  }
+
+  ~RTLDeviceInfoTy() {
+    // Close modules.
+    for (auto &module : Modules)
+      if (module) {
+        CUresult err = cuModuleUnload(module);
+        if (err != CUDA_SUCCESS) {
+          DP("Error when unloading CUDA module\n");
+          CUDA_ERR_STRING(err);
+        }
+      }
+
+    // Destroy contexts.
+    for (auto &ctx : Contexts)
+      if (ctx) {
+        CUresult err = cuCtxDestroy(ctx);
+        if (err != CUDA_SUCCESS) {
+          DP("Error when destroying CUDA context\n");
+          CUDA_ERR_STRING(err);
+        }
+      }
+  }
+};
+
+static RTLDeviceInfoTy DeviceInfo;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
+  return elf_check_machine(image, 190); // EM_CUDA = 190.
+}
+
+int32_t __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; }
+
+int32_t __tgt_rtl_init_device(int32_t device_id) {
+
+  CUdevice cuDevice;
+  DP("Getting device %d\n", device_id);
+  CUresult err = cuDeviceGet(&cuDevice, device_id);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when getting CUDA device with id = %d\n", device_id);
+    CUDA_ERR_STRING(err);
+    return OFFLOAD_FAIL;
+  }
+
+  // Create the context and save it to use whenever this device is selected.
+  err = cuCtxCreate(&DeviceInfo.Contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC,
+                    cuDevice);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when creating a CUDA context\n");
+    CUDA_ERR_STRING(err);
+    return OFFLOAD_FAIL;
+  }
+
+  // Scan properties to determine number of threads/block and blocks/grid.
+  struct cudaDeviceProp Properties;
+  cudaError_t error = cudaGetDeviceProperties(&Properties, device_id);
+  if (error != cudaSuccess) {
+    DP("Error getting device properties, using defaults\n");
+    DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
+    DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
+    DeviceInfo.WarpSize[device_id] = 32;
+  } else {
+    // Get blocks per grid.
+    if (Properties.maxGridSize[0] <= RTLDeviceInfoTy::HardTeamLimit) {
+      DeviceInfo.BlocksPerGrid[device_id] = Properties.maxGridSize[0];
+      DP("Using %d CUDA blocks per grid\n", Properties.maxGridSize[0]);
+    } else {
+      DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit;
+      DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
+         "at the hard limit\n", Properties.maxGridSize[0],
+         RTLDeviceInfoTy::HardTeamLimit);
+    }
+
+    // Get threads per block; exploit threads only along the x axis.
+    if (Properties.maxThreadsDim[0] <= RTLDeviceInfoTy::HardThreadLimit) {
+      DeviceInfo.ThreadsPerBlock[device_id] = Properties.maxThreadsDim[0];
+      DP("Using %d CUDA threads per block\n", Properties.maxThreadsDim[0]);
+      if (Properties.maxThreadsDim[0] < Properties.maxThreadsPerBlock) {
+        DP("(fewer than max per block along all xyz dims %d)\n",
+           Properties.maxThreadsPerBlock);
+      }
+    } else {
+      DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit;
+      DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
+         "capping at the hard limit\n", Properties.maxThreadsDim[0],
+         RTLDeviceInfoTy::HardThreadLimit);
+    }
+
+    // Get warp size.
+    DeviceInfo.WarpSize[device_id] = Properties.warpSize;
+  }
+
+  // Adjust teams to the env variables.
+  if (DeviceInfo.EnvTeamLimit > 0 &&
+      DeviceInfo.BlocksPerGrid[device_id] > DeviceInfo.EnvTeamLimit) {
+    DeviceInfo.BlocksPerGrid[device_id] = DeviceInfo.EnvTeamLimit;
+    DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
+       DeviceInfo.EnvTeamLimit);
+  }
+
+  DP("Max number of CUDA blocks %d, threads %d & warp size %d\n",
+     DeviceInfo.BlocksPerGrid[device_id], DeviceInfo.ThreadsPerBlock[device_id],
+     DeviceInfo.WarpSize[device_id]);
+
+  // Set default number of teams.
+  if (DeviceInfo.EnvNumTeams > 0) {
+    DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams;
+    DP("Default number of teams set according to environment %d\n",
+       DeviceInfo.EnvNumTeams);
+  } else {
+    DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
+    DP("Default number of teams set according to library's default %d\n",
+       RTLDeviceInfoTy::DefaultNumTeams);
+  }
+  if (DeviceInfo.NumTeams[device_id] > DeviceInfo.BlocksPerGrid[device_id]) {
+    DeviceInfo.NumTeams[device_id] = DeviceInfo.BlocksPerGrid[device_id];
+    DP("Default number of teams exceeds device limit, capping at %d\n",
+       DeviceInfo.BlocksPerGrid[device_id]);
+  }
+
+  // Set default number of threads.
+  DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
+  DP("Default number of threads set according to library's default %d\n",
+     RTLDeviceInfoTy::DefaultNumThreads);
+  if (DeviceInfo.NumThreads[device_id] >
+      DeviceInfo.ThreadsPerBlock[device_id]) {
+    // Cap the number of threads (not teams) at the device's thread limit.
+    DeviceInfo.NumThreads[device_id] = DeviceInfo.ThreadsPerBlock[device_id];
+    DP("Default number of threads exceeds device limit, capping at %d\n",
+       DeviceInfo.ThreadsPerBlock[device_id]);
+  }
+
+  return OFFLOAD_SUCCESS;
+}
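__tgt_rtl_init_device resolves its launch defaults from three sources, in
decreasing priority: environment (OMP_NUM_TEAMS / OMP_TEAM_LIMIT, both -1 when
unset), device properties, and the library's hard limits. A condensed,
self-contained restatement of that clamping chain — the helper name and
signature are ours, chosen only to make the order explicit:

    #include <algorithm>

    // Illustrative: the per-device teams clamping applied above.
    static int resolveNumTeams(int envNumTeams, int envTeamLimit,
                               int deviceMaxBlocks, int hardTeamLimit,
                               int defaultNumTeams) {
      int blocksPerGrid = std::min(deviceMaxBlocks, hardTeamLimit);
      if (envTeamLimit > 0) // OMP_TEAM_LIMIT caps the grid.
        blocksPerGrid = std::min(blocksPerGrid, envTeamLimit);
      int numTeams = envNumTeams > 0 ? envNumTeams : defaultNumTeams;
      return std::min(numTeams, blocksPerGrid);
    }
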
+ DP("Invalid binary: host entry '' (size = %zd)...\n", e->size); + + return NULL; + } + + if (e->size) { + __tgt_offload_entry entry = *e; + + CUdeviceptr cuptr; + size_t cusize; + err = cuModuleGetGlobal(&cuptr, &cusize, cumod, e->name); + + if (err != CUDA_SUCCESS) { + DP("Loading global '%s' (Failed)\n", e->name); + CUDA_ERR_STRING(err); + return NULL; + } + + if (cusize != e->size) { + DP("Loading global '%s' - size mismatch (%zd != %zd)\n", e->name, + cusize, e->size); + CUDA_ERR_STRING(err); + return NULL; + } + + DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", + DPxPTR(e - HostBegin), e->name, DPxPTR(cuptr)); + entry.addr = (void *)cuptr; + + DeviceInfo.addOffloadEntry(device_id, entry); + + continue; + } + + CUfunction fun; + err = cuModuleGetFunction(&fun, cumod, e->name); + + if (err != CUDA_SUCCESS) { + DP("Loading '%s' (Failed)\n", e->name); + CUDA_ERR_STRING(err); + return NULL; + } + + DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", + DPxPTR(e - HostBegin), e->name, DPxPTR(fun)); + + // default value GENERIC (in case symbol is missing from cubin file) + int8_t ExecModeVal = ExecutionModeType::GENERIC; + std::string ExecModeNameStr (e->name); + ExecModeNameStr += "_exec_mode"; + const char *ExecModeName = ExecModeNameStr.c_str(); + + CUdeviceptr ExecModePtr; + size_t cusize; + err = cuModuleGetGlobal(&ExecModePtr, &cusize, cumod, ExecModeName); + if (err == CUDA_SUCCESS) { + if ((size_t)cusize != sizeof(int8_t)) { + DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n", + ExecModeName, cusize, sizeof(int8_t)); + CUDA_ERR_STRING(err); + return NULL; + } + + err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, cusize); + if (err != CUDA_SUCCESS) { + DP("Error when copying data from device to host. Pointers: " + "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n", + DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), cusize); + CUDA_ERR_STRING(err); + return NULL; + } + + if (ExecModeVal < 0 || ExecModeVal > 1) { + DP("Error wrong exec_mode value specified in cubin file: %d\n", + ExecModeVal); + return NULL; + } + } else { + DP("Loading global exec_mode '%s' - symbol missing, using default value " + "GENERIC (1)\n", ExecModeName); + CUDA_ERR_STRING(err); + } + + KernelsList.push_back(KernelTy(fun, ExecModeVal)); + + __tgt_offload_entry entry = *e; + entry.addr = (void *)&KernelsList.back(); + DeviceInfo.addOffloadEntry(device_id, entry); + } + + return DeviceInfo.getOffloadEntriesTable(device_id); +} + +void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size) { + if (size == 0) { + return NULL; + } + + // Set the context we are using. + CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); + if (err != CUDA_SUCCESS) { + DP("Error while trying to set CUDA current context\n"); + CUDA_ERR_STRING(err); + return NULL; + } + + CUdeviceptr ptr; + err = cuMemAlloc(&ptr, size); + if (err != CUDA_SUCCESS) { + DP("Error while trying to allocate %d\n", err); + CUDA_ERR_STRING(err); + return NULL; + } + + void *vptr = (void *)ptr; + return vptr; +} + +int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, + int64_t size) { + // Set the context we are using. + CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); + if (err != CUDA_SUCCESS) { + DP("Error when setting CUDA context\n"); + CUDA_ERR_STRING(err); + return OFFLOAD_FAIL; + } + + err = cuMemcpyHtoD((CUdeviceptr)tgt_ptr, hst_ptr, size); + if (err != CUDA_SUCCESS) { + DP("Error when copying data from host to device. 
Pointers: host = " DPxMOD + ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr), + DPxPTR(tgt_ptr), size); + CUDA_ERR_STRING(err); + return OFFLOAD_FAIL; + } + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, + int64_t size) { + // Set the context we are using. + CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); + if (err != CUDA_SUCCESS) { + DP("Error when setting CUDA context\n"); + CUDA_ERR_STRING(err); + return OFFLOAD_FAIL; + } + + err = cuMemcpyDtoH(hst_ptr, (CUdeviceptr)tgt_ptr, size); + if (err != CUDA_SUCCESS) { + DP("Error when copying data from device to host. Pointers: host = " DPxMOD + ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr), + DPxPTR(tgt_ptr), size); + CUDA_ERR_STRING(err); + return OFFLOAD_FAIL; + } + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { + // Set the context we are using. + CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); + if (err != CUDA_SUCCESS) { + DP("Error when setting CUDA context\n"); + CUDA_ERR_STRING(err); + return OFFLOAD_FAIL; + } + + err = cuMemFree((CUdeviceptr)tgt_ptr); + if (err != CUDA_SUCCESS) { + DP("Error when freeing CUDA memory\n"); + CUDA_ERR_STRING(err); + return OFFLOAD_FAIL; + } + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, + void **tgt_args, int32_t arg_num, int32_t team_num, int32_t thread_limit, + uint64_t loop_tripcount) { + // Set the context we are using. + CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); + if (err != CUDA_SUCCESS) { + DP("Error when setting CUDA context\n"); + CUDA_ERR_STRING(err); + return OFFLOAD_FAIL; + } + + // All args are references. 
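Taken together, the four data entry points implement the lifetime of a mapped
variable: allocate, copy in, copy out, free. A minimal host-side round trip,
assuming device 0 has already been initialized — illustrative only, since in
practice libomptarget drives these calls, not user code:

    // Round-trip a single int through device memory (device 0).
    int32_t x = 42, y = 0;
    void *d = __tgt_rtl_data_alloc(0, sizeof(x));  // cuMemAlloc
    __tgt_rtl_data_submit(0, d, &x, sizeof(x));    // cuMemcpyHtoD
    __tgt_rtl_data_retrieve(0, &y, d, sizeof(y));  // cuMemcpyDtoH
    __tgt_rtl_data_delete(0, d);                   // cuMemFree
    // On success, y == 42.
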
+
+int32_t __tgt_rtl_run_target_team_region(int32_t device_id,
+    void *tgt_entry_ptr, void **tgt_args, int32_t arg_num, int32_t team_num,
+    int32_t thread_limit, uint64_t loop_tripcount) {
+  // Set the context we are using.
+  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when setting CUDA context\n");
+    CUDA_ERR_STRING(err);
+    return OFFLOAD_FAIL;
+  }
+
+  // All args are references.
+  std::vector<void *> args(arg_num);
+
+  for (int32_t i = 0; i < arg_num; ++i)
+    args[i] = &tgt_args[i];
+
+  KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;
+
+  int cudaThreadsPerBlock;
+
+  if (thread_limit > 0) {
+    cudaThreadsPerBlock = thread_limit;
+    DP("Setting CUDA threads per block to requested %d\n", thread_limit);
+  } else {
+    cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id];
+    DP("Setting CUDA threads per block to default %d\n",
+       DeviceInfo.NumThreads[device_id]);
+  }
+
+  // Add master warp if necessary.
+  if (KernelInfo->ExecutionMode == GENERIC) {
+    cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id];
+    DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]);
+  }
+
+  if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) {
+    cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id];
+    DP("Threads per block capped at device limit %d\n",
+       DeviceInfo.ThreadsPerBlock[device_id]);
+  }
+
+  int kernel_limit;
+  err = cuFuncGetAttribute(&kernel_limit,
+      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, KernelInfo->Func);
+  if (err == CUDA_SUCCESS) {
+    if (kernel_limit < cudaThreadsPerBlock) {
+      cudaThreadsPerBlock = kernel_limit;
+      DP("Threads per block capped at kernel limit %d\n", kernel_limit);
+    }
+  }
+
+  int cudaBlocksPerGrid;
+  if (team_num <= 0) {
+    if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) {
+      // Round up to the nearest integer.
+      cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1;
+      DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
+         "threads per block %d\n", cudaBlocksPerGrid, loop_tripcount,
+         cudaThreadsPerBlock);
+    } else {
+      cudaBlocksPerGrid = DeviceInfo.NumTeams[device_id];
+      DP("Using default number of teams %d\n", DeviceInfo.NumTeams[device_id]);
+    }
+  } else if (team_num > DeviceInfo.BlocksPerGrid[device_id]) {
+    cudaBlocksPerGrid = DeviceInfo.BlocksPerGrid[device_id];
+    DP("Capping number of teams to team limit %d\n",
+       DeviceInfo.BlocksPerGrid[device_id]);
+  } else {
+    cudaBlocksPerGrid = team_num;
+    DP("Using requested number of teams %d\n", team_num);
+  }
+
+  // Run on the device.
+  DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid,
+     cudaThreadsPerBlock);
+
+  err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1,
+      cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/, 0, &args[0], 0);
+  if (err != CUDA_SUCCESS) {
+    DP("Device kernel launch failed!\n");
+    CUDA_ERR_STRING(err);
+    assert(err == CUDA_SUCCESS && "Unable to launch target execution!");
+    return OFFLOAD_FAIL;
+  }
+
+  DP("Launch of entry point at " DPxMOD " successful!\n",
+     DPxPTR(tgt_entry_ptr));
+
+  if (cudaDeviceSynchronize() != cudaSuccess) {
+    DP("Kernel execution error at " DPxMOD ".\n", DPxPTR(tgt_entry_ptr));
+    return OFFLOAD_FAIL;
+  } else {
+    DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr));
+  }
+
+  return OFFLOAD_SUCCESS;
+}
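The trip-count heuristic is plain ceiling division: with loop_tripcount =
10000 and the default 1024 threads per block, ((10000 - 1) / 1024) + 1 = 10
blocks, so each CUDA block covers one chunk of the distributed loop. A
self-contained check of the formula (all names are local to this sketch):

    #include <cassert>
    #include <cstdint>

    // Ceiling division as used for the teams heuristic above.
    static int blocksForTripCount(uint64_t tripCount, int threadsPerBlock) {
      return static_cast<int>(((tripCount - 1) / threadsPerBlock) + 1);
    }

    int main() {
      assert(blocksForTripCount(10000, 1024) == 10);
      assert(blocksForTripCount(1024, 1024) == 1); // exact multiple
      assert(blocksForTripCount(1025, 1024) == 2); // one element spills over
      return 0;
    }
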
+
+int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
+    void **tgt_args, int32_t arg_num) {
+  // Use one team and the default number of threads.
+  const int32_t team_num = 1;
+  const int32_t thread_limit = 0;
+  return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
+      arg_num, team_num, thread_limit, 0);
+}
+
+#ifdef __cplusplus
+}
+#endif
Index: libomptarget/plugins/exports
===================================================================
--- /dev/null
+++ libomptarget/plugins/exports
@@ -0,0 +1,15 @@
+VERS1.0 {
+  global:
+    __tgt_rtl_is_valid_binary;
+    __tgt_rtl_number_of_devices;
+    __tgt_rtl_init_device;
+    __tgt_rtl_load_binary;
+    __tgt_rtl_data_alloc;
+    __tgt_rtl_data_submit;
+    __tgt_rtl_data_retrieve;
+    __tgt_rtl_data_delete;
+    __tgt_rtl_run_target_team_region;
+    __tgt_rtl_run_target_region;
+  local:
+    *;
+};
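The version script is what lets the libomptarget host runtime discover a
plugin's interface at run time; every other symbol in the shared object stays
local. A rough sketch of the lookup a host runtime performs against one of
these plugins (the local names here are illustrative; the real libomptarget
loader keeps more bookkeeping per plugin):

    #include <cstdint>
    #include <cstdio>
    #include <dlfcn.h>

    typedef int32_t (*number_of_devices_fn)();

    int main() {
      // libomptarget.rtl.cuda.so is the library the CUDA CMake target builds.
      void *Handle = dlopen("libomptarget.rtl.cuda.so", RTLD_NOW);
      if (!Handle) {
        fprintf(stderr, "dlopen failed: %s\n", dlerror());
        return 1;
      }
      number_of_devices_fn NumDevices =
          (number_of_devices_fn)dlsym(Handle, "__tgt_rtl_number_of_devices");
      if (NumDevices)
        printf("devices: %d\n", NumDevices());
      dlclose(Handle);
      return 0;
    }
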
Index: libomptarget/plugins/generic-elf-64bit/src/rtl.cpp
===================================================================
--- /dev/null
+++ libomptarget/plugins/generic-elf-64bit/src/rtl.cpp
@@ -0,0 +1,314 @@
+//===-RTLs/generic-64bit/src/rtl.cpp - Target RTLs Implementation - C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// RTL for generic 64-bit machine
+//
+//===----------------------------------------------------------------------===//
+
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <dlfcn.h>
+#include <ffi.h>
+#include <gelf.h>
+#include <link.h>
+#include <list>
+#include <vector>
+
+#include "omptarget.h"
+
+#ifndef TARGET_NAME
+#define TARGET_NAME Generic ELF - 64bit
+#endif
+
+#ifndef TARGET_ELF_ID
+#define TARGET_ELF_ID 0
+#endif
+
+#define GETNAME2(name) #name
+#define GETNAME(name) GETNAME2(name)
+#define DP(...) DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__)
+
+#include "../../common/elf_common.c"
+
+#define NUMBER_OF_DEVICES 4
+#define OFFLOADSECTIONNAME ".omp_offloading.entries"
+
+/// Array of dynamic libraries loaded for this target.
+struct DynLibTy {
+  char *FileName;
+  void *Handle;
+};
+
+/// Keep entries table per device.
+struct FuncOrGblEntryTy {
+  __tgt_target_table Table;
+};
+
+/// Class containing all the device information.
+class RTLDeviceInfoTy {
+  std::vector<FuncOrGblEntryTy> FuncGblEntries;
+
+public:
+  std::list<DynLibTy> DynLibs;
+
+  // Record entry point associated with device.
+  void createOffloadTable(int32_t device_id, __tgt_offload_entry *begin,
+      __tgt_offload_entry *end) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id];
+
+    E.Table.EntriesBegin = begin;
+    E.Table.EntriesEnd = end;
+  }
+
+  // Return true if the entry is associated with the device.
+  bool findOffloadEntry(int32_t device_id, void *addr) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id];
+
+    for (__tgt_offload_entry *i = E.Table.EntriesBegin, *e = E.Table.EntriesEnd;
+         i < e; ++i) {
+      if (i->addr == addr)
+        return true;
+    }
+
+    return false;
+  }
+
+  // Return the pointer to the target entries table.
+  __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id];
+
+    return &E.Table;
+  }
+
+  RTLDeviceInfoTy(int32_t num_devices) { FuncGblEntries.resize(num_devices); }
+
+  ~RTLDeviceInfoTy() {
+    // Close dynamic libraries.
+    for (auto &lib : DynLibs) {
+      if (lib.Handle) {
+        dlclose(lib.Handle);
+        remove(lib.FileName);
+        free(lib.FileName);
+      }
+    }
+  }
+};
+
+static RTLDeviceInfoTy DeviceInfo(NUMBER_OF_DEVICES);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
+// If we don't have a valid ELF ID we can just fail.
+#if TARGET_ELF_ID < 1
+  return 0;
+#else
+  return elf_check_machine(image, TARGET_ELF_ID);
+#endif
+}
+
+int32_t __tgt_rtl_number_of_devices() { return NUMBER_OF_DEVICES; }
+
+int32_t __tgt_rtl_init_device(int32_t device_id) { return OFFLOAD_SUCCESS; }
+
+__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
+    __tgt_device_image *image) {
+
+  DP("Dev %d: load binary from " DPxMOD " image\n", device_id,
+     DPxPTR(image->ImageStart));
+
+  assert(device_id >= 0 && device_id < NUMBER_OF_DEVICES && "bad dev id");
+
+  size_t ImageSize = (size_t)image->ImageEnd - (size_t)image->ImageStart;
+  size_t NumEntries = (size_t)(image->EntriesEnd - image->EntriesBegin);
+  DP("Expecting to have %zd entries defined.\n", NumEntries);
+
+  // Is the library version incompatible with the header file?
+  if (elf_version(EV_CURRENT) == EV_NONE) {
+    DP("Incompatible ELF library!\n");
+    return NULL;
+  }
+
+  // Obtain elf handler.
+  Elf *e = elf_memory((char *)image->ImageStart, ImageSize);
+  if (!e) {
+    DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1));
+    return NULL;
+  }
+
+  if (elf_kind(e) != ELF_K_ELF) {
+    DP("Invalid ELF kind!\n");
+    elf_end(e);
+    return NULL;
+  }
+
+  // Find the entries section offset.
+  Elf_Scn *section = 0;
+  Elf64_Off entries_offset = 0;
+
+  size_t shstrndx;
+
+  if (elf_getshdrstrndx(e, &shstrndx)) {
+    DP("Unable to get ELF strings index!\n");
+    elf_end(e);
+    return NULL;
+  }
+
+  while ((section = elf_nextscn(e, section))) {
+    GElf_Shdr hdr;
+    gelf_getshdr(section, &hdr);
+
+    if (!strcmp(elf_strptr(e, shstrndx, hdr.sh_name), OFFLOADSECTIONNAME)) {
+      entries_offset = hdr.sh_addr;
+      break;
+    }
+  }
+
+  if (!entries_offset) {
+    DP("Entries section offset not found\n");
+    elf_end(e);
+    return NULL;
+  }
+
+  DP("Offset of entries section is (" DPxMOD ").\n", DPxPTR(entries_offset));
+
+  // Load the dynamic library and get the entry points. We use the dl library
+  // to do the loading of the library, but we could do it directly to avoid the
+  // dump to the temporary file.
+  //
+  // 1) Create tmp file with the library contents.
+  // 2) Use dlopen to load the file and dlsym to retrieve the symbols.
+  char tmp_name[] = "/tmp/tmpfile_XXXXXX";
+  int tmp_fd = mkstemp(tmp_name);
+
+  if (tmp_fd == -1) {
+    elf_end(e);
+    return NULL;
+  }
+
+  FILE *ftmp = fdopen(tmp_fd, "wb");
+
+  if (!ftmp) {
+    elf_end(e);
+    return NULL;
+  }
+
+  fwrite(image->ImageStart, ImageSize, 1, ftmp);
+  fclose(ftmp);
+
+  // Copy the file name so it outlives this frame; the destructor removes the
+  // temporary file and frees the copy.
+  DynLibTy Lib = {strdup(tmp_name), dlopen(tmp_name, RTLD_LAZY)};
+
+  if (!Lib.Handle) {
+    DP("Target library loading error: %s\n", dlerror());
+    free(Lib.FileName);
+    elf_end(e);
+    return NULL;
+  }
+
+  DeviceInfo.DynLibs.push_back(Lib);
+
+  struct link_map *libInfo = (struct link_map *)Lib.Handle;
+
+  // The place where the entries info is loaded is the library base address
+  // plus the offset determined from the ELF file.
+  Elf64_Addr entries_addr = libInfo->l_addr + entries_offset;
+
+  DP("Pointer to first entry to be loaded is (" DPxMOD ").\n",
+     DPxPTR(entries_addr));
+
+  // Table of pointers to all the entries in the target.
+  __tgt_offload_entry *entries_table = (__tgt_offload_entry *)entries_addr;
+
+  __tgt_offload_entry *entries_begin = &entries_table[0];
+  __tgt_offload_entry *entries_end = entries_begin + NumEntries;
+
+  if (!entries_begin) {
+    DP("Can't obtain entries begin\n");
+    elf_end(e);
+    return NULL;
+  }
+
+  DP("Entries table range is (" DPxMOD ")->(" DPxMOD ")\n",
+     DPxPTR(entries_begin), DPxPTR(entries_end));
+  DeviceInfo.createOffloadTable(device_id, entries_begin, entries_end);
+
+  elf_end(e);
+
+  return DeviceInfo.getOffloadEntriesTable(device_id);
+}
+
+void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size) {
+  void *ptr = malloc(size);
+  return ptr;
+}
+
+int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
+    int64_t size) {
+  memcpy(tgt_ptr, hst_ptr, size);
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
+    int64_t size) {
+  memcpy(hst_ptr, tgt_ptr, size);
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
+  free(tgt_ptr);
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_run_target_team_region(int32_t device_id,
+    void *tgt_entry_ptr, void **tgt_args, int32_t arg_num, int32_t team_num,
+    int32_t thread_limit, uint64_t loop_tripcount /*not used*/) {
+  // Ignore team num and thread limit.
+
+  // Use libffi to launch execution.
+  ffi_cif cif;
+
+  // All args are references.
+  std::vector<ffi_type *> args_types(arg_num, &ffi_type_pointer);
+  std::vector<void *> args(arg_num);
+
+  for (int32_t i = 0; i < arg_num; ++i)
+    args[i] = &tgt_args[i];
+
+  ffi_status status = ffi_prep_cif(&cif, FFI_DEFAULT_ABI, arg_num,
+      &ffi_type_void, &args_types[0]);
+
+  assert(status == FFI_OK && "Unable to prepare target launch!");
+
+  if (status != FFI_OK)
+    return OFFLOAD_FAIL;
+
+  DP("Running entry point at " DPxMOD "...\n", DPxPTR(tgt_entry_ptr));
+
+  ffi_call(&cif, FFI_FN(tgt_entry_ptr), NULL, &args[0]);
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
+    void **tgt_args, int32_t arg_num) {
+  // Use one team and one thread.
+  return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
+      arg_num, 1, 1, 0);
+}
+
+#ifdef __cplusplus
+}
+#endif
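The libffi launch treats every target entry point as a void function taking
arg_num pointer arguments, so each avalue slot holds the address of the
pointer being passed. A self-contained sketch of the same calling pattern,
with a stand-in function in place of a real offload entry:

    #include <cstdio>
    #include <ffi.h>
    #include <vector>

    // Stand-in for an offload entry: void fn(void *a, void *b).
    static void entry(void *a, void *b) {
      printf("%d %d\n", *(int *)a, *(int *)b);
    }

    int main() {
      int x = 1, y = 2;
      void *params[] = {&x, &y};
      // libffi wants the address of each pointer argument.
      void *args[] = {&params[0], &params[1]};
      std::vector<ffi_type *> types(2, &ffi_type_pointer);

      ffi_cif cif;
      if (ffi_prep_cif(&cif, FFI_DEFAULT_ABI, 2, &ffi_type_void,
                       types.data()) != FFI_OK)
        return 1;
      ffi_call(&cif, FFI_FN(entry), NULL, args); // prints "1 2"
      return 0;
    }
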
Index: libomptarget/plugins/ppc64/CMakeLists.txt
===================================================================
--- /dev/null
+++ libomptarget/plugins/ppc64/CMakeLists.txt
@@ -0,0 +1,18 @@
+##===----------------------------------------------------------------------===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is dual licensed under the MIT and the University of Illinois Open
+# Source Licenses. See LICENSE.txt for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# Build a plugin for a ppc64 machine if available.
+#
+##===----------------------------------------------------------------------===##
+
+if(CMAKE_SYSTEM_NAME MATCHES "Linux")
+  build_generic_elf64("ppc64" "PPC64" "ppc64" "powerpc64-ibm-linux-gnu" "21")
+else()
+  libomptarget_say("Not building ppc64 offloading plugin: host system is not Linux.")
+endif()
\ No newline at end of file
Index: libomptarget/plugins/ppc64le/CMakeLists.txt
===================================================================
--- /dev/null
+++ libomptarget/plugins/ppc64le/CMakeLists.txt
@@ -0,0 +1,18 @@
+##===----------------------------------------------------------------------===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is dual licensed under the MIT and the University of Illinois Open
+# Source Licenses. See LICENSE.txt for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# Build a plugin for a ppc64le machine if available.
+#
+##===----------------------------------------------------------------------===##
+
+if(CMAKE_SYSTEM_NAME MATCHES "Linux")
+  build_generic_elf64("ppc64le" "PPC64le" "ppc64le" "powerpc64le-ibm-linux-gnu" "21")
+else()
+  libomptarget_say("Not building ppc64le offloading plugin: host system is not Linux.")
+endif()
\ No newline at end of file
Index: libomptarget/plugins/x86_64/CMakeLists.txt
===================================================================
--- /dev/null
+++ libomptarget/plugins/x86_64/CMakeLists.txt
@@ -0,0 +1,18 @@
+##===----------------------------------------------------------------------===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is dual licensed under the MIT and the University of Illinois Open
+# Source Licenses. See LICENSE.txt for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# Build a plugin for a x86_64 machine if available.
+#
+##===----------------------------------------------------------------------===##
+
+if(CMAKE_SYSTEM_NAME MATCHES "Linux")
+  build_generic_elf64("x86_64" "x86_64" "x86_64" "x86_64-pc-linux-gnu" "62")
+else()
+  libomptarget_say("Not building x86_64 offloading plugin: host system is not Linux.")
+endif()
\ No newline at end of file
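
For reference, the elf_machine_id arguments above line up with the e_machine
constants from <elf.h>: both PowerPC plugins pass EM_PPC64 (21) and the
x86_64 plugin passes EM_X86_64 (62). A tiny compile-time check makes the
mapping explicit; note EM_CUDA (190) may be absent from older <elf.h>
headers, which is presumably why the CUDA rtl.cpp hard-codes the literal:

    #include <elf.h>

    // The magic numbers used by the plugin CMake files above.
    static_assert(EM_PPC64 == 21, "ppc64/ppc64le plugins pass 21");
    static_assert(EM_X86_64 == 62, "x86_64 plugin passes 62");
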