Index: libomptarget/CMakeLists.txt
===================================================================
--- libomptarget/CMakeLists.txt
+++ libomptarget/CMakeLists.txt
@@ -108,6 +108,9 @@
 set(LIBOMPTARGET_LIBRARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
 endif()
 
+# Build offloading plugins if they are available.
+add_subdirectory(plugins)
+
 # Add tests.
 add_subdirectory(test)
 
Index: libomptarget/plugins/CMakeLists.txt
===================================================================
--- /dev/null
+++ libomptarget/plugins/CMakeLists.txt
@@ -0,0 +1,59 @@
+##===----------------------------------------------------------------------===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is dual licensed under the MIT and the University of Illinois Open
+# Source Licenses. See LICENSE.txt for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# Build plugins for the user system if available.
+#
+##===----------------------------------------------------------------------===##
+
+# void build_generic64(string tmachine, string tmachine_name,
+#                      string tmachine_libname, string tmachine_triple);
+# - build a plugin for a generic 64-bit target based on libffi.
+# - tmachine: name of the machine processor as used in the cmake build system.
+# - tmachine_name: name of the machine to be printed with the debug messages.
+# - tmachine_libname: machine name to be appended to the plugin library name.
+# - tmachine_triple: target triple reported to the parent scope in
+#   LIBOMPTARGET_SYSTEM_TARGETS.
+macro(build_generic64 tmachine tmachine_name tmachine_libname tmachine_triple)
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$")
+  if(LIBOMPTARGET_DEP_LIBFFI_FOUND)
+
+    libomptarget_say("Building ${tmachine_name} offloading plugin.")
+
+    include_directories(${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR})
+
+    # Define macro to be used as prefix of the runtime messages for this target.
+    add_definitions("-DTARGET_NAME=${tmachine_name}")
+
+    add_library("omptarget.rtl.${tmachine_libname}" SHARED
+      ${CMAKE_CURRENT_SOURCE_DIR}/../generic-64bit/src/rtl.cpp)
+
+    target_link_libraries(
+      "omptarget.rtl.${tmachine_libname}"
+      ${LIBOMPTARGET_DEP_LIBFFI_LIBRARIES}
+      ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES}
+      dl
+      "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports")
+
+    # Report to the parent scope that we are building a plugin.
+    set(LIBOMPTARGET_SYSTEM_TARGETS
+      "${LIBOMPTARGET_SYSTEM_TARGETS} ${tmachine_triple}" PARENT_SCOPE)
+
+  else(LIBOMPTARGET_DEP_LIBFFI_FOUND)
+    libomptarget_say("Not building ${tmachine_name} offloading plugin: libffi dependency not found.")
+  endif(LIBOMPTARGET_DEP_LIBFFI_FOUND)
+else()
+  libomptarget_say("Not building ${tmachine_name} offloading plugin: host processor does not match the target.")
+endif()
+endmacro()
+
+add_subdirectory(cuda)
+add_subdirectory(ppc64)
+add_subdirectory(ppc64le)
+add_subdirectory(x86_64)
+
+# Make sure the parent scope can see the plugins that will be created.
+set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
Index: libomptarget/plugins/cuda/CMakeLists.txt
===================================================================
--- /dev/null
+++ libomptarget/plugins/cuda/CMakeLists.txt
@@ -0,0 +1,42 @@
+##===----------------------------------------------------------------------===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is dual licensed under the MIT and the University of Illinois Open
+# Source Licenses. See LICENSE.txt for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# Build a plugin for a CUDA machine if available.
+#
+##===----------------------------------------------------------------------===##
+
+if(LIBOMPTARGET_DEP_CUDA_FOUND)
+  if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux")
+
+    libomptarget_say("Building CUDA offloading plugin.")
+
+    # Define the suffix for the runtime messaging dumps.
+    add_definitions(-DTARGET_NAME=CUDA)
+
+    if(CMAKE_BUILD_TYPE MATCHES Debug)
+      add_definitions(-DCUDA_ERROR_REPORT)
+    endif()
+
+    include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS})
+
+    add_library(omptarget.rtl.cuda SHARED src/rtl.cpp)
+    target_link_libraries(omptarget.rtl.cuda
+      ${LIBOMPTARGET_DEP_CUDA_LIBRARIES}
+      cuda
+      ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES}
+      "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports")
+
+    # Report to the parent scope that we are building a plugin for CUDA.
+    set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda" PARENT_SCOPE)
+  else()
+    libomptarget_say("Not building CUDA offloading plugin: CUDA is only supported on Linux x86_64 or ppc64le hosts.")
+  endif()
+else()
+  libomptarget_say("Not building CUDA offloading plugin: CUDA not found in the system.")
+endif()
Index: libomptarget/plugins/cuda/src/rtl.cpp
===================================================================
--- /dev/null
+++ libomptarget/plugins/cuda/src/rtl.cpp
@@ -0,0 +1,598 @@
+//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// RTL for CUDA machine
+//
+//===----------------------------------------------------------------------===//
+
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include <dlfcn.h>
+#include <elf.h>
+#include <list>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "omptarget.h"
+
+#ifndef TARGET_NAME
+#define TARGET_NAME CUDA
+#endif
+
+#define GETNAME2(name) #name
+#define GETNAME(name) GETNAME2(name)
+#define DP(...) DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__)
+
+// Utility for retrieving and printing CUDA error string
+#ifdef CUDA_ERROR_REPORT
+#define CUDA_ERR_STRING(err)                                                   \
+  do {                                                                         \
+    const char *errStr;                                                        \
+    cuGetErrorString(err, &errStr);                                            \
+    DP("CUDA error is: %s\n", errStr);                                         \
+  } while (0)
+#else
+#define CUDA_ERR_STRING(err)                                                   \
+  {}
+#endif
+
+// NVPTX image start encodes a struct that also includes the host entries begin
+// and end pointers. The host entries are used by the runtime to accelerate
+// the retrieval of the target entry pointers.
+struct __tgt_nvptx_device_image_start {
+  void *RealStart;                // Pointer to actual NVPTX elf image
+  char *TgtName;                  // Name of the target of the image
+  __tgt_offload_entry *HostStart; // Pointer to the host entries start
+  __tgt_offload_entry *HostEnd;   // Pointer to the host entries end
+};
+
+/// Account the memory allocated per device
+struct AllocMemEntryTy {
+  int64_t TotalSize;
+  std::vector<std::pair<void *, int64_t>> Ptrs;
+
+  AllocMemEntryTy() : TotalSize(0) {}
+};
+
+/// Keep entries table per device
+struct FuncOrGblEntryTy {
+  __tgt_target_table Table;
+  std::vector<__tgt_offload_entry> Entries;
+};
+
+/// Use a single entity to encode a kernel and a set of flags
+struct KernelTy {
+  CUfunction Func;
+  int SimdInfo;
+
+  // Keep track of the CUDA pointer to write to when the thread_limit value
+  // changes (check against the last value written to ThreadLimit).
+  CUdeviceptr ThreadLimitPtr;
+  int ThreadLimit;
+
+  KernelTy(CUfunction _Func, int _SimdInfo, CUdeviceptr _ThreadLimitPtr)
+      : Func(_Func), SimdInfo(_SimdInfo), ThreadLimitPtr(_ThreadLimitPtr) {
+    ThreadLimit = 0; // default (0) signals that it was not initialized
+  };
+};
+
+/// List that contains all the kernels.
+/// FIXME: we may need this to be per device and per library.
+std::list<KernelTy> KernelsList;
+/// Class containing all the device information
+class RTLDeviceInfoTy {
+  std::vector<FuncOrGblEntryTy> FuncGblEntries;
+
+public:
+  int NumberOfDevices;
+  std::vector<CUmodule> Modules;
+  std::vector<CUcontext> Contexts;
+  std::vector<int> ThreadsPerBlock;
+  std::vector<int> BlocksPerGrid;
+
+  // Record entry point associated with device
+  void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id];
+
+    E.Entries.push_back(entry);
+  }
+
+  // Return true if the entry is associated with device
+  bool findOffloadEntry(int32_t device_id, void *addr) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id];
+
+    for (unsigned i = 0; i < E.Entries.size(); ++i) {
+      if (E.Entries[i].addr == addr)
+        return true;
+    }
+
+    return false;
+  }
+
+  // Return the pointer to the target entries table
+  __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id];
+
+    int32_t size = E.Entries.size();
+
+    // Table is empty
+    if (!size)
+      return 0;
+
+    __tgt_offload_entry *begin = &E.Entries[0];
+    __tgt_offload_entry *end = &E.Entries[size - 1];
+
+    // Update table info according to the entries and return the pointer
+    E.Table.EntriesBegin = begin;
+    E.Table.EntriesEnd = ++end;
+
+    return &E.Table;
+  }
+
+  // Clear entries table for a device
+  void clearOffloadEntriesTable(int32_t device_id) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id];
+    E.Entries.clear();
+    E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
+  }
+
+  RTLDeviceInfoTy() {
+    DP("Start initializing CUDA\n");
+
+    CUresult err = cuInit(0);
+    if (err != CUDA_SUCCESS) {
+      DP("Error when initializing CUDA\n");
+      CUDA_ERR_STRING(err);
+      return;
+    }
+
+    NumberOfDevices = 0;
+
+    err = cuDeviceGetCount(&NumberOfDevices);
+    if (err != CUDA_SUCCESS) {
+      DP("Error when getting CUDA device count\n");
+      CUDA_ERR_STRING(err);
+      return;
+    }
+
+    if (NumberOfDevices == 0) {
+      DP("There are no devices supporting CUDA.\n");
+      return;
+    }
+
+    FuncGblEntries.resize(NumberOfDevices);
+    Contexts.resize(NumberOfDevices);
+    ThreadsPerBlock.resize(NumberOfDevices);
+    BlocksPerGrid.resize(NumberOfDevices);
+  }
+
+  ~RTLDeviceInfoTy() {
+    // Close modules
+    for (unsigned i = 0; i < Modules.size(); ++i)
+      if (Modules[i]) {
+        CUresult err = cuModuleUnload(Modules[i]);
+        if (err != CUDA_SUCCESS) {
+          DP("Error when unloading CUDA module\n");
+          CUDA_ERR_STRING(err);
+        }
+      }
+
+    // Destroy contexts
+    for (unsigned i = 0; i < Contexts.size(); ++i)
+      if (Contexts[i]) {
+        CUresult err = cuCtxDestroy(Contexts[i]);
+        if (err != CUDA_SUCCESS) {
+          DP("Error when destroying CUDA context\n");
+          CUDA_ERR_STRING(err);
+        }
+      }
+  }
+};
+
+static RTLDeviceInfoTy DeviceInfo;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int __tgt_rtl_device_type(int32_t device_id) {
+
+  if (device_id < DeviceInfo.NumberOfDevices)
+    return 190; // EM_CUDA
+
+  return 0;
+}
+
+int __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; }
+int32_t __tgt_rtl_init_device(int32_t device_id) {
+
+  CUdevice cuDevice;
+  DP("Getting device %d\n", device_id);
+  CUresult err = cuDeviceGet(&cuDevice, device_id);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when getting CUDA device with id = %d\n", device_id);
+    CUDA_ERR_STRING(err);
+    return OFFLOAD_FAIL;
+  }
+
+  // Create the context and save it to use whenever this device is selected.
+  err = cuCtxCreate(&DeviceInfo.Contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC,
+                    cuDevice);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when creating a CUDA context\n");
+    CUDA_ERR_STRING(err);
+    return OFFLOAD_FAIL;
+  }
+
+  // Scan properties to determine the number of threads per block and blocks
+  // per grid.
+  struct cudaDeviceProp Properties;
+  cudaError_t error = cudaGetDeviceProperties(&Properties, device_id);
+  if (error != cudaSuccess) {
+    DP("Error when getting device properties, using defaults\n");
+    DeviceInfo.BlocksPerGrid[device_id] = 32;
+    DeviceInfo.ThreadsPerBlock[device_id] = 512;
+  } else {
+    DeviceInfo.BlocksPerGrid[device_id] = Properties.multiProcessorCount;
+    // Exploit threads only along the x axis.
+    DeviceInfo.ThreadsPerBlock[device_id] = Properties.maxThreadsDim[0];
+    if (Properties.maxThreadsDim[0] < Properties.maxThreadsPerBlock) {
+      DP("Using up to %d threads along the x axis, fewer than the per-block "
+         "maximum of %d\n",
+         Properties.maxThreadsDim[0], Properties.maxThreadsPerBlock);
+    }
+  }
+  DP("Default number of blocks %d & threads %d\n",
+     DeviceInfo.BlocksPerGrid[device_id],
+     DeviceInfo.ThreadsPerBlock[device_id]);
+
+  // done
+  return OFFLOAD_SUCCESS;
+}
+
+__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
+                                          __tgt_device_image *image) {
+
+  // Set the context we are using
+  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when setting a CUDA context for device %d\n", device_id);
+    CUDA_ERR_STRING(err);
+    return NULL;
+  }
+
+  // Clear the offload table as we are going to create a new one
+  DeviceInfo.clearOffloadEntriesTable(device_id);
+
+  // Create the module and extract the function pointers
+
+  CUmodule cumod;
+  DP("load data from image %llx\n", (unsigned long long)image->ImageStart);
+  err = cuModuleLoadDataEx(&cumod, image->ImageStart, 0, NULL, NULL);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when loading CUDA module\n");
+    CUDA_ERR_STRING(err);
+    return NULL;
+  }
+
+  DP("CUDA module successfully loaded!\n");
+  DeviceInfo.Modules.push_back(cumod);
+
+  // Here, we take advantage of the data that is appended after img_end to get
+  // the symbol names we need to load. This data consists of the host entries
+  // begin and end as well as the target name (see the offloading linker script
+  // creation in the clang compiler).
+  // Find the symbols in the module by name. The name can be obtained by
+  // concatenating the host entry name with the target name.
+
+  __tgt_offload_entry *HostBegin = image->EntriesBegin;
+  __tgt_offload_entry *HostEnd = image->EntriesEnd;
+
+  for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) {
+
+    if (!e->addr) {
+      // FIXME: Probably we should fail when something like this happens; the
+      // host should always have something in the address to uniquely identify
+      // the target region.
+      DP("Analyzing host entry '<null>' (size = %lld)...\n",
+         (unsigned long long)e->size);
+
+      __tgt_offload_entry entry = *e;
+      DeviceInfo.addOffloadEntry(device_id, entry);
+      continue;
+    }
+
+    if (e->size) {
+
+      __tgt_offload_entry entry = *e;
+
+      CUdeviceptr cuptr;
+      size_t cusize;
+      err = cuModuleGetGlobal(&cuptr, &cusize, cumod, e->name);
+
+      if (err != CUDA_SUCCESS) {
+        DP("loading global '%s' (Failed)\n", e->name);
+        CUDA_ERR_STRING(err);
+        return NULL;
+      }
+
+      if ((int32_t)cusize != e->size) {
+        DP("loading global '%s' - size mismatch (%lld != %lld)\n", e->name,
+           (unsigned long long)cusize, (unsigned long long)e->size);
+        CUDA_ERR_STRING(err);
+        return NULL;
+      }
+
+      DP("Entry point %ld maps to global %s (%016lx)\n", e - HostBegin,
+         e->name, (long)cuptr);
+      entry.addr = (void *)cuptr;
+
+      DeviceInfo.addOffloadEntry(device_id, entry);
+
+      continue;
+    }
+    CUfunction fun;
+    err = cuModuleGetFunction(&fun, cumod, e->name);
+
+    if (err != CUDA_SUCCESS) {
+      DP("loading '%s' (Failed)\n", e->name);
+      CUDA_ERR_STRING(err);
+      return NULL;
+    }
+
+    DP("Entry point %ld maps to %s (%016lx)\n", e - HostBegin, e->name,
+       (Elf64_Addr)fun);
+
+    // default value
+    int8_t SimdInfoVal = 1;
+
+    // Obtain and save the simd_info value for the target region. The global
+    // name is the entry name plus the "_simd_info" suffix, so allocate room
+    // for both strings and the null terminator.
+    const char suffix[] = "_simd_info";
+    char *SimdInfoName =
+        (char *)malloc(strlen(e->name) + strlen(suffix) + 1);
+    sprintf(SimdInfoName, "%s%s", e->name, suffix);
+
+    CUdeviceptr SimdInfoPtr;
+    size_t cusize;
+    err = cuModuleGetGlobal(&SimdInfoPtr, &cusize, cumod, SimdInfoName);
+    if (err == CUDA_SUCCESS) {
+      if ((int32_t)cusize != sizeof(int8_t)) {
+        DP("loading global simd_info '%s' - size mismatch (%lld != %lld)\n",
+           SimdInfoName, (unsigned long long)cusize,
+           (unsigned long long)sizeof(int8_t));
+        CUDA_ERR_STRING(err);
+        return NULL;
+      }
+
+      err = cuMemcpyDtoH(&SimdInfoVal, (CUdeviceptr)SimdInfoPtr, cusize);
+      if (err != CUDA_SUCCESS) {
+        DP("Error when copying data from device to host. Pointers: "
+           "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
+           (Elf64_Addr)&SimdInfoVal, (Elf64_Addr)SimdInfoPtr,
+           (unsigned long long)cusize);
+        CUDA_ERR_STRING(err);
+        return NULL;
+      }
+      if (SimdInfoVal < 1) {
+        DP("Error: wrong simd_info value specified in the cubin file: %d\n",
+           SimdInfoVal);
+        return NULL;
+      }
+    }
+
+    // Obtain the CUDA pointer to the global tracking the thread limit.
+    const char SuffixTL[] = "_thread_limit";
+    char *ThreadLimitName =
+        (char *)malloc(strlen(e->name) + strlen(SuffixTL) + 1);
+    sprintf(ThreadLimitName, "%s%s", e->name, SuffixTL);
+
+    CUdeviceptr ThreadLimitPtr;
+    err = cuModuleGetGlobal(&ThreadLimitPtr, &cusize, cumod, ThreadLimitName);
+    if (err != CUDA_SUCCESS) {
+      DP("Error retrieving pointer for %s global\n", ThreadLimitName);
+      CUDA_ERR_STRING(err);
+      return NULL;
+    }
+    if ((int32_t)cusize != sizeof(int32_t)) {
+      DP("loading global thread_limit '%s' - size mismatch (%lld != %lld)\n",
+         ThreadLimitName, (unsigned long long)cusize,
+         (unsigned long long)sizeof(int32_t));
+      CUDA_ERR_STRING(err);
+      return NULL;
+    }
+
+    // Encode function and kernel.
+    KernelsList.push_back(KernelTy(fun, SimdInfoVal, ThreadLimitPtr));
+
+    __tgt_offload_entry entry = *e;
+    entry.addr = (void *)&KernelsList.back();
+    DeviceInfo.addOffloadEntry(device_id, entry);
+  }
+
+  return DeviceInfo.getOffloadEntriesTable(device_id);
+}
+
+void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size) {
+
+  // Set the context we are using
+  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+  if (err != CUDA_SUCCESS) {
+    DP("Error while trying to set CUDA current context\n");
+    CUDA_ERR_STRING(err);
+    return NULL;
+  }
+
+  CUdeviceptr ptr;
+  err = cuMemAlloc(&ptr, size);
+  if (err != CUDA_SUCCESS) {
+    DP("Error while trying to allocate %d\n", err);
+    CUDA_ERR_STRING(err);
+    return NULL;
+  }
+
+  void *vptr = (void *)ptr;
+  return vptr;
+}
+
+int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
+                              int64_t size) {
+  // Set the context we are using
+  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when setting CUDA context\n");
+    CUDA_ERR_STRING(err);
+    return OFFLOAD_FAIL;
+  }
+
+  err = cuMemcpyHtoD((CUdeviceptr)tgt_ptr, hst_ptr, size);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when copying data from host to device. Pointers: "
+       "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
+       (Elf64_Addr)hst_ptr, (Elf64_Addr)tgt_ptr, (unsigned long long)size);
+    CUDA_ERR_STRING(err);
+    return OFFLOAD_FAIL;
+  }
+  return OFFLOAD_SUCCESS;
+}
+int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr,
+                                void *tgt_ptr, int64_t size) {
+  // Set the context we are using
+  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when setting CUDA context\n");
+    CUDA_ERR_STRING(err);
+    return OFFLOAD_FAIL;
+  }
+
+  err = cuMemcpyDtoH(hst_ptr, (CUdeviceptr)tgt_ptr, size);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when copying data from device to host. Pointers: "
+       "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
+       (Elf64_Addr)hst_ptr, (Elf64_Addr)tgt_ptr, (unsigned long long)size);
+    CUDA_ERR_STRING(err);
+    return OFFLOAD_FAIL;
+  }
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
+  // Set the context we are using
+  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when setting CUDA context\n");
+    CUDA_ERR_STRING(err);
+    return OFFLOAD_FAIL;
+  }
+
+  err = cuMemFree((CUdeviceptr)tgt_ptr);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when freeing CUDA memory\n");
+    CUDA_ERR_STRING(err);
+    return OFFLOAD_FAIL;
+  }
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_run_target_team_region(int32_t device_id,
+                                         void *tgt_entry_ptr, void **tgt_args,
+                                         int32_t arg_num, int32_t team_num,
+                                         int32_t thread_limit) {
+  // Set the context we are using
+  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when setting CUDA context\n");
+    CUDA_ERR_STRING(err);
+    return OFFLOAD_FAIL;
+  }
+
+  // All args are references.
+  std::vector<void *> args(arg_num);
+
+  for (int32_t i = 0; i < arg_num; ++i)
+    args[i] = &tgt_args[i];
+
+  KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;
+
+  int cudaThreadsPerBlock = (thread_limit <= 0 ||
+                             thread_limit * KernelInfo->SimdInfo >
+                                 DeviceInfo.ThreadsPerBlock[device_id])
+                                ? DeviceInfo.ThreadsPerBlock[device_id]
+                                : thread_limit * KernelInfo->SimdInfo;
+
+  // Update the thread limit content in GPU memory if uninitialized or changed.
+  if (KernelInfo->ThreadLimit == 0 || KernelInfo->ThreadLimit != thread_limit) {
+    // The limit is always capped by the maximum number of threads in a block:
+    // even if 1 OMP thread is 1 independent CUDA thread, we may have up to
+    // max block size OMP threads. If the user requests thread_limit(tl) with
+    // tl > max block size, we only start max block size CUDA threads.
+    if (thread_limit > DeviceInfo.ThreadsPerBlock[device_id])
+      thread_limit = DeviceInfo.ThreadsPerBlock[device_id];
+
+    KernelInfo->ThreadLimit = thread_limit;
+    err = cuMemcpyHtoD(KernelInfo->ThreadLimitPtr, &thread_limit,
+                       sizeof(int32_t));
+
+    if (err != CUDA_SUCCESS) {
+      DP("Error when setting thread limit global\n");
+      return OFFLOAD_FAIL;
+    }
+  }
+
+  int blocksPerGrid =
+      team_num > 0 ? team_num : DeviceInfo.BlocksPerGrid[device_id];
+  int nshared = 0;
+
+  // Run on the device.
+  DP("launch kernel with %d blocks and %d threads\n", blocksPerGrid,
+     cudaThreadsPerBlock);
+
+  err = cuLaunchKernel(KernelInfo->Func, blocksPerGrid, 1, 1,
+                       cudaThreadsPerBlock, 1, 1, nshared, 0, &args[0], 0);
+  if (err != CUDA_SUCCESS) {
+    DP("Device kernel launching failed!\n");
+    CUDA_ERR_STRING(err);
+    assert(err == CUDA_SUCCESS && "Unable to launch target execution!");
+    return OFFLOAD_FAIL;
+  }
+
+  DP("Execution of entry point at %016lx successful!\n",
+     (Elf64_Addr)tgt_entry_ptr);
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
+                                    void **tgt_args, int32_t arg_num) {
+  // Use one team and the default thread count.
+  int32_t team_num = 1;
+  int32_t thread_limit = 0; // use default
+  return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
+                                          arg_num, team_num, thread_limit);
+}
+
+#ifdef __cplusplus
+}
+#endif
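For context, and not part of the patch itself, the sketch below shows how a host runtime could drive these entry points end to end for one device. The prototypes mirror the signatures above; `run_one_region` and the way the `__tgt_device_image` is obtained are illustrative assumptions (in practice the offload linker and libomptarget produce the image and entry table).

    // Hedged sketch: driving the plugin ABI directly (illustrative only).
    #include <cstdint>
    #include "omptarget.h" // declares __tgt_device_image, __tgt_target_table

    extern "C" {
    int32_t __tgt_rtl_init_device(int32_t device_id);
    __tgt_target_table *__tgt_rtl_load_binary(int32_t, __tgt_device_image *);
    void *__tgt_rtl_data_alloc(int32_t, int64_t);
    int32_t __tgt_rtl_data_submit(int32_t, void *, void *, int64_t);
    int32_t __tgt_rtl_data_retrieve(int32_t, void *, void *, int64_t);
    int32_t __tgt_rtl_run_target_region(int32_t, void *, void **, int32_t);
    int32_t __tgt_rtl_data_delete(int32_t, void *);
    }

    // Hypothetical driver: Img is assumed to come from the offload linker.
    int run_one_region(__tgt_device_image *Img, int64_t Bytes, void *HostBuf) {
      const int32_t Dev = 0;
      if (__tgt_rtl_init_device(Dev) != OFFLOAD_SUCCESS)
        return 1;
      __tgt_target_table *Table = __tgt_rtl_load_binary(Dev, Img);
      if (!Table)
        return 1;
      void *TgtBuf = __tgt_rtl_data_alloc(Dev, Bytes);    // device buffer
      __tgt_rtl_data_submit(Dev, TgtBuf, HostBuf, Bytes); // host -> device
      void *Entry = Table->EntriesBegin->addr;            // first target region
      void *Args[] = {TgtBuf};
      __tgt_rtl_run_target_region(Dev, Entry, Args, 1);
      __tgt_rtl_data_retrieve(Dev, HostBuf, TgtBuf, Bytes); // device -> host
      __tgt_rtl_data_delete(Dev, TgtBuf);
      return 0;
    }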
Index: libomptarget/plugins/exports
===================================================================
--- /dev/null
+++ libomptarget/plugins/exports
@@ -0,0 +1,15 @@
+VERS1.0 {
+  global:
+    __tgt_rtl_device_type;
+    __tgt_rtl_number_of_devices;
+    __tgt_rtl_init_device;
+    __tgt_rtl_load_binary;
+    __tgt_rtl_data_alloc;
+    __tgt_rtl_data_submit;
+    __tgt_rtl_data_retrieve;
+    __tgt_rtl_data_delete;
+    __tgt_rtl_run_target_team_region;
+    __tgt_rtl_run_target_region;
+  local:
+    *;
+};
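Because this version script keeps only the __tgt_rtl_* symbols global, the parent runtime can probe any plugin generically with dlopen/dlsym. A minimal sketch, not part of the patch; the library path is an assumption (the CMake target omptarget.rtl.cuda typically yields libomptarget.rtl.cuda.so on Linux):

    // Hedged sketch: probing a plugin the way a host runtime would.
    #include <cstdio>
    #include <dlfcn.h>

    int main() {
      // Hypothetical path/name for the built plugin.
      void *H = dlopen("libomptarget.rtl.cuda.so", RTLD_NOW);
      if (!H) {
        fprintf(stderr, "dlopen: %s\n", dlerror());
        return 1;
      }

      typedef int (*NumDevsTy)();
      NumDevsTy NumDevs = (NumDevsTy)dlsym(H, "__tgt_rtl_number_of_devices");
      if (NumDevs)
        printf("plugin reports %d device(s)\n", NumDevs());

      dlclose(H);
      return 0;
    }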
Index: libomptarget/plugins/generic-64bit/src/rtl.cpp
===================================================================
--- /dev/null
+++ libomptarget/plugins/generic-64bit/src/rtl.cpp
@@ -0,0 +1,316 @@
+//===-RTLs/generic-64bit/src/rtl.cpp - Target RTLs Implementation - C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// RTL for generic 64-bit machine
+//
+//===----------------------------------------------------------------------===//
+
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <dlfcn.h>
+#include <elf.h>
+#include <ffi.h>
+#include <gelf.h>
+#include <libelf.h>
+#include <link.h>
+#include <list>
+#include <utility>
+#include <vector>
+
+#include "omptarget.h"
+
+#ifndef TARGET_NAME
+#define TARGET_NAME Generic-64bit
+#endif
+
+#define GETNAME2(name) #name
+#define GETNAME(name) GETNAME2(name)
+#define DP(...) DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__)
+
+#define NUMBER_OF_DEVICES 4
+#define OFFLOADSECTIONNAME ".omp_offloading.entries"
+
+/// Array of Dynamic libraries loaded for this target
+struct DynLibTy {
+  char *FileName;
+  void *Handle;
+};
+
+/// Account the memory allocated per device
+struct AllocMemEntryTy {
+  int64_t TotalSize;
+  std::vector<std::pair<void *, int64_t>> Ptrs;
+
+  AllocMemEntryTy() : TotalSize(0) {}
+};
+
+/// Keep entries table per device
+struct FuncOrGblEntryTy {
+  __tgt_target_table Table;
+};
+
+/// Class containing all the device information
+class RTLDeviceInfoTy {
+  std::vector<FuncOrGblEntryTy> FuncGblEntries;
+
+public:
+  std::list<DynLibTy> DynLibs;
+
+  // Record entry point associated with device
+  void createOffloadTable(int32_t device_id, __tgt_offload_entry *begin,
+                          __tgt_offload_entry *end) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id];
+
+    E.Table.EntriesBegin = begin;
+    E.Table.EntriesEnd = end;
+  }
+
+  // Return true if the entry is associated with device
+  bool findOffloadEntry(int32_t device_id, void *addr) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id];
+
+    for (__tgt_offload_entry *i = E.Table.EntriesBegin,
+                             *e = E.Table.EntriesEnd;
+         i < e; ++i) {
+      if (i->addr == addr)
+        return true;
+    }
+
+    return false;
+  }
+
+  // Return the pointer to the target entries table
+  __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id];
+
+    return &E.Table;
+  }
+
+  RTLDeviceInfoTy(int32_t num_devices) { FuncGblEntries.resize(num_devices); }
+
+  ~RTLDeviceInfoTy() {
+    // Close dynamic libraries and remove their temporary files.
+    for (std::list<DynLibTy>::iterator ii = DynLibs.begin(),
+                                       ie = DynLibs.end();
+         ii != ie; ++ii)
+      if (ii->Handle) {
+        dlclose(ii->Handle);
+        remove(ii->FileName);
+      }
+  }
+};
+
+static RTLDeviceInfoTy DeviceInfo(NUMBER_OF_DEVICES);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int __tgt_rtl_device_type(int32_t device_id) {
+
+  if (device_id < NUMBER_OF_DEVICES)
+    return 21; // EM_PPC64
+
+  return 0;
+}
+
+int __tgt_rtl_number_of_devices() { return NUMBER_OF_DEVICES; }
+
+int32_t __tgt_rtl_init_device(int32_t device_id) {
+  return OFFLOAD_SUCCESS; // success
+}
+
+__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
+                                          __tgt_device_image *image) {
+
+  DP("Dev %d: load binary from 0x%llx image\n", device_id,
+     (long long)image->ImageStart);
+
+  assert(device_id >= 0 && device_id < NUMBER_OF_DEVICES && "bad dev id");
+
+  size_t ImageSize = (size_t)image->ImageEnd - (size_t)image->ImageStart;
+  size_t NumEntries = (size_t)(image->EntriesEnd - image->EntriesBegin);
+  DP("Expecting to have %ld entries defined.\n", (long)NumEntries);
+
+  // We do not need to set the ELF version because the caller of this function
+  // had to do that to decide the right runtime to use.
+
+  // Obtain elf handler
+  Elf *e = elf_memory((char *)image->ImageStart, ImageSize);
+  if (!e) {
+    DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1));
+    return NULL;
+  }
+
+  if (elf_kind(e) != ELF_K_ELF) {
+    DP("Invalid Elf kind!\n");
+    elf_end(e);
+    return NULL;
+  }
+
+  // Find the entries section offset
+  Elf_Scn *section = 0;
+  Elf64_Off entries_offset = 0;
+
+  size_t shstrndx;
+
+  if (elf_getshdrstrndx(e, &shstrndx)) {
+    DP("Unable to get ELF strings index!\n");
+    elf_end(e);
+    return NULL;
+  }
+
+  while ((section = elf_nextscn(e, section))) {
+    GElf_Shdr hdr;
+    gelf_getshdr(section, &hdr);
+
+    if (!strcmp(elf_strptr(e, shstrndx, hdr.sh_name), OFFLOADSECTIONNAME)) {
+      entries_offset = hdr.sh_addr;
+      break;
+    }
+  }
+
+  if (!entries_offset) {
+    DP("Entries Section Offset Not Found\n");
+    elf_end(e);
+    return NULL;
+  }
+
+  DP("Offset of entries section is (%016lx).\n", entries_offset);
+  // Load the dynamic library and get the entry points. We use the dl library
+  // to do the loading of the library, but we could do it directly to avoid the
+  // dump to the temporary file.
+  //
+  // 1) Create tmp file with the library contents.
+  // 2) Use dlopen to load the file and dlsym to retrieve the symbols.
+  char tmp_name[] = "/tmp/tmpfile_XXXXXX";
+  int tmp_fd = mkstemp(tmp_name);
+
+  if (tmp_fd == -1) {
+    elf_end(e);
+    return NULL;
+  }
+
+  FILE *ftmp = fdopen(tmp_fd, "wb");
+
+  if (!ftmp) {
+    elf_end(e);
+    return NULL;
+  }
+
+  fwrite(image->ImageStart, ImageSize, 1, ftmp);
+  fclose(ftmp);
+
+  DynLibTy Lib = {tmp_name, dlopen(tmp_name, RTLD_LAZY)};
+
+  if (!Lib.Handle) {
+    DP("target library loading error: %s\n", dlerror());
+    elf_end(e);
+    return NULL;
+  }
+
+  struct link_map *libInfo = (struct link_map *)Lib.Handle;
+
+  // The place where the entries info is loaded is the library base address
+  // plus the offset determined from the ELF file.
+  Elf64_Addr entries_addr = libInfo->l_addr + entries_offset;
+
+  DP("Pointer to first entry to be loaded is (%016lx).\n", entries_addr);
+
+  // Table of pointers to all the entries in the target.
+  __tgt_offload_entry *entries_table = (__tgt_offload_entry *)entries_addr;
+
+  __tgt_offload_entry *entries_begin = &entries_table[0];
+  __tgt_offload_entry *entries_end = entries_begin + NumEntries;
+
+  if (!entries_begin) {
+    DP("Can't obtain entries begin\n");
+    elf_end(e);
+    return NULL;
+  }
+
+  DP("Entries table range is (%016lx)->(%016lx)\n", (Elf64_Addr)entries_begin,
+     (Elf64_Addr)entries_end);
+  DeviceInfo.createOffloadTable(device_id, entries_begin, entries_end);
+
+  elf_end(e);
+
+  return DeviceInfo.getOffloadEntriesTable(device_id);
+}
+
+void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size) {
+  void *ptr = malloc(size);
+  return ptr;
+}
+
+int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
+                              int64_t size) {
+  memcpy(tgt_ptr, hst_ptr, size);
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr,
+                                void *tgt_ptr, int64_t size) {
+  memcpy(hst_ptr, tgt_ptr, size);
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
+  free(tgt_ptr);
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_run_target_team_region(int32_t device_id,
+                                         void *tgt_entry_ptr, void **tgt_args,
+                                         int32_t arg_num, int32_t team_num,
+                                         int32_t thread_limit) {
+  // Ignore team num and thread limit.
+
+  // Use libffi to launch execution.
+  ffi_cif cif;
+
+  // All args are references.
+  std::vector<ffi_type *> args_types(arg_num, &ffi_type_pointer);
+  std::vector<void *> args(arg_num);
+
+  for (int32_t i = 0; i < arg_num; ++i)
+    args[i] = &tgt_args[i];
+
+  ffi_status status = ffi_prep_cif(&cif, FFI_DEFAULT_ABI, arg_num,
+                                   &ffi_type_void, &args_types[0]);
+
+  assert(status == FFI_OK && "Unable to prepare target launch!");
+
+  if (status != FFI_OK)
+    return OFFLOAD_FAIL;
+
+  DP("Running entry point at %016lx...\n", (Elf64_Addr)tgt_entry_ptr);
+
+  ffi_call(&cif, FFI_FN(tgt_entry_ptr), NULL, &args[0]);
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
+                                    void **tgt_args, int32_t arg_num) {
+  // Use one team and one thread.
+  return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
+                                          arg_num, 1, 1);
+}
+
+#ifdef __cplusplus
+}
+#endif
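The launch path above relies on libffi to call a target entry whose arity is only known at run time, passing every argument as a pointer. The standalone sketch below, which is not part of the patch, exercises the same pattern; `host_entry` is a stand-in for a real target entry point:

    // Hedged sketch of the libffi launch pattern used by this RTL.
    #include <cstdio>
    #include <ffi.h>
    #include <vector>

    static void host_entry(int *a, int *b) { *b = *a + 1; } // stand-in "kernel"

    int main() {
      int x = 41, y = 0;
      void *raw_args[] = {&x, &y}; // what the RTL receives as tgt_args
      int arg_num = 2;

      std::vector<ffi_type *> types(arg_num, &ffi_type_pointer);
      std::vector<void *> args(arg_num);
      for (int i = 0; i < arg_num; ++i)
        args[i] = &raw_args[i]; // libffi wants the address of each argument slot

      ffi_cif cif;
      if (ffi_prep_cif(&cif, FFI_DEFAULT_ABI, arg_num, &ffi_type_void,
                       &types[0]) != FFI_OK)
        return 1;

      ffi_call(&cif, FFI_FN((void (*)())host_entry), NULL, &args[0]);
      printf("y = %d\n", y); // prints y = 42
      return 0;
    }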
Index: libomptarget/plugins/ppc64/CMakeLists.txt
===================================================================
--- /dev/null
+++ libomptarget/plugins/ppc64/CMakeLists.txt
@@ -0,0 +1,18 @@
+##===----------------------------------------------------------------------===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is dual licensed under the MIT and the University of Illinois Open
+# Source Licenses. See LICENSE.txt for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# Build a plugin for a ppc64 machine if available.
+#
+##===----------------------------------------------------------------------===##
+
+if(CMAKE_SYSTEM_NAME MATCHES "Linux")
+  build_generic64("ppc64" "PPC64" "ppc64" "powerpc64-ibm-linux-gnu")
+else()
+  libomptarget_say("Not building ppc64 offloading plugin: only supported on Linux hosts.")
+endif()
\ No newline at end of file
Index: libomptarget/plugins/ppc64le/CMakeLists.txt
===================================================================
--- /dev/null
+++ libomptarget/plugins/ppc64le/CMakeLists.txt
@@ -0,0 +1,18 @@
+##===----------------------------------------------------------------------===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is dual licensed under the MIT and the University of Illinois Open
+# Source Licenses. See LICENSE.txt for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# Build a plugin for a ppc64le machine if available.
+#
+##===----------------------------------------------------------------------===##
+
+if(CMAKE_SYSTEM_NAME MATCHES "Linux")
+  build_generic64("ppc64le" "PPC64le" "ppc64le" "powerpc64le-ibm-linux-gnu")
+else()
+  libomptarget_say("Not building ppc64le offloading plugin: only supported on Linux hosts.")
+endif()
\ No newline at end of file
Index: libomptarget/plugins/x86_64/CMakeLists.txt
===================================================================
--- /dev/null
+++ libomptarget/plugins/x86_64/CMakeLists.txt
@@ -0,0 +1,18 @@
+##===----------------------------------------------------------------------===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is dual licensed under the MIT and the University of Illinois Open
+# Source Licenses. See LICENSE.txt for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# Build a plugin for a x86_64 machine if available.
+#
+##===----------------------------------------------------------------------===##
+
+if(CMAKE_SYSTEM_NAME MATCHES "Linux")
+  build_generic64("x86_64" "x86_64" "x86_64" "x86_64-pc-linux-gnu")
+else()
+  libomptarget_say("Not building x86_64 offloading plugin: only supported on Linux hosts.")
+endif()
\ No newline at end of file
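For reference, and again not part of the patch, this is the kind of program the stack above ultimately serves. The build line is illustrative only; the exact offloading flags depend on the toolchain in use:

    // Hedged example: a target region the deployed plugins would execute.
    // Build (illustrative): clang++ -fopenmp -fopenmp-targets=<triple> test.cpp
    #include <cstdio>

    int main() {
      int x = 41;
      // The compiler records an entry in .omp_offloading.entries for this
      // region; libomptarget picks a plugin whose reported triple matches
      // and runs the region through the __tgt_rtl_* interface.
      #pragma omp target map(tofrom: x)
      { x += 1; }
      printf("x = %d\n", x); // x = 42 (on the device, or via host fallback)
      return 0;
    }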