Index: openmp/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake
===================================================================
--- openmp/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake
+++ openmp/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake
@@ -15,6 +15,7 @@
 # libffi : required to launch target kernels given function and argument
 #          pointers.
 # CUDA : required to control offloading to NVIDIA GPUs.
+# VEOS : required to control offloading to NEC SX-Aurora VE devices.
 
 include (FindPackageHandleStandardArgs)
 
@@ -162,6 +163,61 @@
 mark_as_advanced(LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES)
 
 ################################################################################
+# Looking for VEO (ve_offload.h, libveo) and libveosinfo...
+################################################################################
+
+find_path (
+  LIBOMPTARGET_DEP_VEO_INCLUDE_DIR
+  NAMES
+    ve_offload.h
+  PATHS
+    /usr/include
+    /usr/local/include
+    /opt/local/include
+    /sw/include
+    /opt/nec/ve/veos/include
+    ENV CPATH
+  PATH_SUFFIXES
+    libveo)
+
+find_library (
+  LIBOMPTARGET_DEP_VEO_LIBRARIES
+  NAMES
+    veo
+  PATHS
+    /usr/lib
+    /usr/local/lib
+    /opt/local/lib
+    /sw/lib
+    /opt/nec/ve/veos/lib64
+    ENV LIBRARY_PATH
+    ENV LD_LIBRARY_PATH)
+
+find_library(
+  LIBOMPTARGET_DEP_VEOSINFO_LIBRARIES
+  NAMES
+    veosinfo
+  PATHS
+    /usr/lib
+    /usr/local/lib
+    /opt/local/lib
+    /sw/lib
+    /opt/nec/ve/veos/lib64
+    ENV LIBRARY_PATH
+    ENV LD_LIBRARY_PATH)
+
+set(LIBOMPTARGET_DEP_VEO_INCLUDE_DIRS ${LIBOMPTARGET_DEP_VEO_INCLUDE_DIR})
+find_package_handle_standard_args(
+  LIBOMPTARGET_DEP_VEO
+  DEFAULT_MSG
+  LIBOMPTARGET_DEP_VEO_LIBRARIES
+  LIBOMPTARGET_DEP_VEOSINFO_LIBRARIES
+  LIBOMPTARGET_DEP_VEO_INCLUDE_DIRS)
+
+mark_as_advanced(
+  LIBOMPTARGET_DEP_VEO_FOUND
+  LIBOMPTARGET_DEP_VEO_INCLUDE_DIRS)
+
 # Looking for CUDA libdevice subdirectory
 #
 # Special case for Debian/Ubuntu to have nvidia-cuda-toolkit work
Index: openmp/libomptarget/plugins/CMakeLists.txt
===================================================================
--- openmp/libomptarget/plugins/CMakeLists.txt
+++ openmp/libomptarget/plugins/CMakeLists.txt
@@ -69,6 +69,7 @@
 add_subdirectory(cuda)
 add_subdirectory(ppc64)
 add_subdirectory(ppc64le)
+add_subdirectory(ve)
 add_subdirectory(x86_64)
 
 # Make sure the parent scope can see the plugins that will be created.
Index: openmp/libomptarget/plugins/common/elf_common.c
===================================================================
--- openmp/libomptarget/plugins/common/elf_common.c
+++ openmp/libomptarget/plugins/common/elf_common.c
@@ -71,3 +71,42 @@
   elf_end(e);
   return MachineID == target_id;
 }
+
+// Check whether the ELF image is a shared object (e_type == ET_DYN).
+// Returns 1 if it is. Returns 0 otherwise, including when no ELF handle or
+// ELF header can be obtained for the image.
+static inline int32_t elf_is_dynamic(__tgt_device_image *image) {
+  char *img_begin = (char *)image->ImageStart;
+  char *img_end = (char *)image->ImageEnd;
+  size_t img_size = img_end - img_begin;
+
+  // Obtain elf handler
+  Elf *e = elf_memory(img_begin, img_size);
+  if (!e) {
+    DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1));
+    return 0;
+  }
+
+  Elf64_Ehdr *eh64 = elf64_getehdr(e);
+  Elf32_Ehdr *eh32 = elf32_getehdr(e);
+
+  if (!eh64 && !eh32) {
+    DP("Unable to get ELF type from ELF file!\n");
+    elf_end(e);
+    return 0;
+  }
+
+  // The image is only accepted when exactly one header flavor is available.
+  if (eh64 && eh32) {
+    DP("Ambiguous ELF header!\n");
+    elf_end(e);
+    return 0;
+  }
+
+  // Read the type before elf_end() releases the ELF handle.
+  uint16_t Type = eh64 ? eh64->e_type : eh32->e_type;
+
+  elf_end(e);
+  DP("ELF Type: %d\n", Type);
+  return Type == ET_DYN;
+}
Index: openmp/libomptarget/plugins/ve/CMakeLists.txt
===================================================================
--- /dev/null
+++ openmp/libomptarget/plugins/ve/CMakeLists.txt
@@ -0,0 +1,48 @@
+##===----------------------------------------------------------------------===##
+#
+# Build a plugin for a NEC Aurora machine if available. (Can also run on host)
+#
+##===----------------------------------------------------------------------===##
+
+
+if(LIBOMPTARGET_DEP_VEO_FOUND)
+  libomptarget_say("Building SX-Aurora VE offloading plugin.")
+  # VEO libraries linked into the plugin in addition to libffi and libelf.
+  set(additional_libs ${LIBOMPTARGET_DEP_VEO_LIBRARIES}
+                      ${LIBOMPTARGET_DEP_VEOSINFO_LIBRARIES})
+
+  set(tmachine_name "ve")
+  set(tmachine_libname "ve")
+  set(tmachine_triple "ve-unknown-linux-unknown")
+  set(elf_machine_id 251)
+
+  include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR})
+  include_directories(${LIBOMPTARGET_DEP_VEO_INCLUDE_DIR})
+
+
+  # Define macro to be used as prefix of the runtime messages for this target.
+  add_definitions("-DTARGET_NAME=${tmachine_name}")
+
+  # Define macro with the ELF ID for this target.
+  add_definitions("-DTARGET_ELF_ID=${elf_machine_id}")
+
+  add_library("omptarget.rtl.${tmachine_libname}" SHARED
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/rtl.cpp)
+
+  # Install plugin under the lib destination folder.
+  install(TARGETS "omptarget.rtl.${tmachine_libname}"
+    LIBRARY DESTINATION lib${OPENMP_LIBDIR_SUFFIX})
+
+  target_link_libraries(
+    "omptarget.rtl.${tmachine_libname}"
+    ${LIBOMPTARGET_DEP_LIBFFI_LIBRARIES}
+    ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES}
+    ${additional_libs}
+    "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports -Wl,-z,defs")
+
+  # Report to the parent scope that we are building a plugin.
+  set(LIBOMPTARGET_SYSTEM_TARGETS
+    "${LIBOMPTARGET_SYSTEM_TARGETS} ${tmachine_triple}" PARENT_SCOPE)
+else()
+  libomptarget_say("Not building nec-aurora plugin: libveo or libveosinfo not found.")
+endif()
Index: openmp/libomptarget/plugins/ve/src/rtl.cpp
===================================================================
--- /dev/null
+++ openmp/libomptarget/plugins/ve/src/rtl.cpp
@@ -0,0 +1,464 @@
+//===-RTLs/nec-aurora/src/rtl.cpp - Target RTLs Implementation - C++ -*-======//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// RTL for NEC Aurora TSUBASA machines
+//
+//===----------------------------------------------------------------------===//
+
+#include "omptargetplugin.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cerrno>
+#include <cstring>
+#include <list>
+#include <stdlib.h>
+#include <string>
+#include <sys/stat.h>
+#include <ve_offload.h>
+#include <vector>
+#include <veosinfo/veosinfo.h>
+
+#ifndef TARGET_ELF_ID
+#define TARGET_ELF_ID 0
+#endif
+
+#ifdef OMPTARGET_DEBUG
+static int DebugLevel = 0;
+
+#define GETNAME2(name) #name
+#define GETNAME(name) GETNAME2(name)
+#define DP(...)                                                                \
+  do {                                                                         \
+    if (DebugLevel > 0) {                                                      \
+      DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__);              \
+    }                                                                          \
+  } while (false)
+#else // OMPTARGET_DEBUG
+#define DP(...)                                                                \
+  {}
+#endif // OMPTARGET_DEBUG
+
+#include "../../common/elf_common.c"
+
+struct DynLibTy {
+  char *FileName;
+  uint64_t VeoLibHandle;
+};
+
+/// Keep entries table per device.
+struct FuncOrGblEntryTy {
+  __tgt_target_table Table;
+  std::vector<__tgt_offload_entry> Entries;
+};
+
+/// Bookkeeping shared by all VE devices: VEO process handles and contexts,
+/// loaded library handles, and the offload tables built for each device.
+class RTLDeviceInfoTy {
+  std::vector<std::list<FuncOrGblEntryTy>> FuncOrGblEntry;
+
+public:
+  std::vector<struct veo_proc_handle *> ProcHandles;
+  std::vector<struct veo_thr_ctxt *> Contexts;
+  std::vector<uint64_t> LibraryHandles;
+  std::list<DynLibTy> DynLibs;
+  // Maps OpenMP device Ids to Ve nodeids
+  std::vector<int> NodeIds;
+
+  /// Append a new offload table for \p device_id holding one entry per host
+  /// entry in [HostBegin, HostEnd); unresolved symbols get an all-zero entry.
+  void buildOffloadTableFromHost(int32_t device_id, uint64_t VeoLibHandle,
+                                 __tgt_offload_entry *HostBegin,
+                                 __tgt_offload_entry *HostEnd) {
+    FuncOrGblEntry[device_id].emplace_back();
+    std::vector<__tgt_offload_entry> &T =
+        FuncOrGblEntry[device_id].back().Entries;
+    T.clear();
+    for (__tgt_offload_entry *i = HostBegin; i != HostEnd; ++i) {
+      char *SymbolName = i->name;
+      // we have not enough access to the target memory to conveniently parse
+      // the offload table there so we need to lookup every symbol with the host
+      // table
+      DP("Looking up symbol: %s\n", SymbolName);
+      uint64_t SymbolTargetAddr =
+          veo_get_sym(ProcHandles[device_id], VeoLibHandle, SymbolName);
+      __tgt_offload_entry Entry;
+      if (!SymbolTargetAddr) {
+        DP("Symbol %s not found in target image\n", SymbolName);
+        Entry = {NULL, NULL, 0, 0, 0};
+      } else {
+        DP("Found symbol %s successfully in target image (addr: %p)\n",
+           SymbolName, reinterpret_cast<void *>(SymbolTargetAddr));
+        Entry = {reinterpret_cast<void *>(SymbolTargetAddr), i->name, i->size,
+                 i->flags, 0};
+      }
+      T.push_back(Entry);
+    }
+
+    // data() is well-defined for an empty vector, unlike front() and back().
+    FuncOrGblEntry[device_id].back().Table.EntriesBegin = T.data();
+    FuncOrGblEntry[device_id].back().Table.EntriesEnd = T.data() + T.size();
+  }
+
+  /// Return the table most recently built for \p device_id.
+  __tgt_target_table *getOffloadTable(int32_t device_id) {
+    return &FuncOrGblEntry[device_id].back().Table;
+  }
+
+  RTLDeviceInfoTy() {
+#ifdef OMPTARGET_DEBUG
+    if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) {
+      DebugLevel = std::stoi(envStr);
+    }
+#endif // OMPTARGET_DEBUG
+
+    // node_info is only meaningful if the query succeeded.
+    struct ve_nodeinfo node_info;
+    if (ve_node_info(&node_info) != 0) {
+      DP("ve_node_info() failed, no VE devices available\n");
+      return;
+    }
+
+    // Build a predictable mapping between VE node ids and OpenMP device ids.
+    // This is necessary, because nodes can be missing or offline and (active)
+    // node ids are thus not consecutive. The entries in ve_nodeinfo may also
+    // not be in the order of their node ids, so NodeIds is sorted afterwards.
+    for (int i = 0; i < node_info.total_node_count; ++i) {
+      if (node_info.status[i] == 0) {
+        NodeIds.push_back(node_info.nodeid[i]);
+      }
+    }
+    std::sort(NodeIds.begin(), NodeIds.end());
+
+    int NumDevices = NodeIds.size();
+    DP("Found %i VE devices\n", NumDevices);
+    ProcHandles.resize(NumDevices, NULL);
+    Contexts.resize(NumDevices, NULL);
+    FuncOrGblEntry.resize(NumDevices);
+    LibraryHandles.resize(NumDevices);
+  }
+
+  ~RTLDeviceInfoTy() {
+    for (auto &ctx : Contexts) {
+      if (ctx != NULL && veo_context_close(ctx) != 0) {
+        DP("Failed to close VEO context.\n");
+      }
+    }
+    for (auto &hdl : ProcHandles) {
+      if (hdl != NULL) {
+        veo_proc_destroy(hdl);
+      }
+    }
+    // Remove the temporary image files recorded in DynLibs.
+    for (auto &lib : DynLibs) {
+      if (lib.FileName) {
+        remove(lib.FileName);
+      }
+    }
+  }
+};
+
+static RTLDeviceInfoTy DeviceInfo;
+
+// Call the function at FuncAddr on the VEO context of DeviceID and wait for
+// it to finish. RetVal is handed to veo_call_wait_result() for the result.
+static int target_run_function_wait(uint32_t DeviceID, uint64_t FuncAddr,
+                                    struct veo_args *args, uint64_t *RetVal) {
+  DP("Running function with entry point %p\n",
+     reinterpret_cast<void *>(FuncAddr));
+  uint64_t RequestHandle =
+      veo_call_async(DeviceInfo.Contexts[DeviceID], FuncAddr, args);
+  if (RequestHandle == VEO_REQUEST_ID_INVALID) {
+    DP("Execution of entry point %p failed\n",
+       reinterpret_cast<void *>(FuncAddr));
+    return OFFLOAD_FAIL;
+  }
+
+  DP("Function at address %p called (VEO request ID: %" PRIu64 ")\n",
+     reinterpret_cast<void *>(FuncAddr), RequestHandle);
+
+  int ret = veo_call_wait_result(DeviceInfo.Contexts[DeviceID], RequestHandle,
+                                 RetVal);
+  if (ret != 0) {
+    DP("Waiting for entry point %p failed (Error code %d)\n",
+       reinterpret_cast<void *>(FuncAddr), ret);
+    return OFFLOAD_FAIL;
+  }
+  return OFFLOAD_SUCCESS;
+}
+
+// Return the number of available devices of the type supported by the
+// target RTL.
+int32_t __tgt_rtl_number_of_devices(void) { return DeviceInfo.NodeIds.size(); }
+
+// Return an integer different from zero if the provided device image can be
+// supported by the runtime. The functionality is similar to comparing the
+// result of __tgt_rtl_load_binary to NULL. However, this is meant to be a
+// lightweight query to determine if the RTL is suitable for an image without
+// having to load the library, which can be expensive.
+int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
+#if TARGET_ELF_ID < 1
+  return 0;
+#else
+  return elf_check_machine(Image, TARGET_ELF_ID);
+#endif
+}
+
+// Initialize the specified device. In case of success return 0; otherwise
+// return an error code.
+int32_t __tgt_rtl_init_device(int32_t ID) {
+  DP("Available VEO version: %i\n", veo_api_version());
+
+  // At the moment we do not really initialize (i.e. create a process or
+  // context on) the device here, but in "__tgt_rtl_load_binary".
+  // The reason for this is, that, when we create a process for a statically
+  // linked binary, the VEO api needs us to already supply the binary (but we
+  // can load a dynamically linked binary later, after we create the process).
+  // At this stage, we cannot check if we have a dynamically or statically
+  // linked binary so we defer process creation until we know.
+  return OFFLOAD_SUCCESS;
+}
+
+// Pass an executable image section described by image to the specified
+// device and prepare an address table of target entities. In case of error,
+// return NULL. Otherwise, return a pointer to the built address table.
+// Individual entries in the table may also be NULL, when the corresponding
+// offload region is not supported on the target device.
+__tgt_target_table *__tgt_rtl_load_binary(int32_t ID,
+                                          __tgt_device_image *Image) {
+  DP("Dev %d: load binary from " DPxMOD " image\n", ID,
+     DPxPTR(Image->ImageStart));
+  assert(ID >= 0 && "bad dev id");
+
+  size_t ImageSize = (size_t)Image->ImageEnd - (size_t)Image->ImageStart;
+  size_t NumEntries = (size_t)(Image->EntriesEnd - Image->EntriesBegin);
+  DP("Expecting to have %zd entries defined.\n", NumEntries);
+
+  // The VEO calls below take a file name, so the image is first dumped to a
+  // temporary file:
+  // 1) Create tmp file with the library contents.
+  // 2) Pass the file to veo_proc_create_static() or veo_load_library().
+  char tmp_name[] = "/tmp/tmpfile_XXXXXX";
+  int tmp_fd = mkstemp(tmp_name);
+  if (tmp_fd == -1) {
+    return NULL;
+  }
+
+  // Failure paths from here on must not leave the temporary file behind.
+  auto Fail = [&tmp_name]() -> __tgt_target_table * {
+    remove(tmp_name);
+    return NULL;
+  };
+
+  FILE *ftmp = fdopen(tmp_fd, "wb");
+  if (!ftmp) {
+    DP("fdopen() for %s failed. Could not write target image\n", tmp_name);
+    return Fail();
+  }
+
+  size_t Written = fwrite(Image->ImageStart, ImageSize, 1, ftmp);
+  // at least for the static case we need to change the permissions
+  chmod(tmp_name, 0700);
+
+  if (fclose(ftmp) != 0 || Written != 1) {
+    DP("Could not write target image to %s\n", tmp_name);
+    return Fail();
+  }
+  DP("Wrote target image to %s. ImageSize=%zu\n", tmp_name, ImageSize);
+
+  // See comment in "__tgt_rtl_init_device"
+  bool is_dyn = true;
+  if (DeviceInfo.ProcHandles[ID] == NULL) {
+    struct veo_proc_handle *proc_handle;
+    is_dyn = elf_is_dynamic(Image);
+    // If we have a dynamically linked image, we create the process handle, then
+    // the thread, and then load the image.
+    // If we have a statically linked image, we need to create the process
+    // handle and load the image at the same time with veo_proc_create_static().
+    if (is_dyn) {
+      proc_handle = veo_proc_create(DeviceInfo.NodeIds[ID]);
+      if (!proc_handle) {
+        DP("veo_proc_create() failed for device %d\n", ID);
+        return Fail();
+      }
+    } else {
+      proc_handle = veo_proc_create_static(DeviceInfo.NodeIds[ID], tmp_name);
+      if (!proc_handle) {
+        DP("veo_proc_create_static() failed for device %d, image=%s\n", ID,
+           tmp_name);
+        return Fail();
+      }
+    }
+    DeviceInfo.ProcHandles[ID] = proc_handle;
+  }
+
+  if (DeviceInfo.Contexts[ID] == NULL) {
+    struct veo_thr_ctxt *ctx = veo_context_open(DeviceInfo.ProcHandles[ID]);
+    if (!ctx) {
+      DP("veo_context_open() failed: %s\n", std::strerror(errno));
+      return Fail();
+    }
+    DeviceInfo.Contexts[ID] = ctx;
+  }
+
+  DP("Aurora device successfully initialized with loaded binary: "
+     "proc_handle=%p, ctx=%p\n",
+     DeviceInfo.ProcHandles[ID], DeviceInfo.Contexts[ID]);
+
+  uint64_t LibHandle = 0UL;
+  if (is_dyn) {
+    LibHandle = veo_load_library(DeviceInfo.ProcHandles[ID], tmp_name);
+    if (!LibHandle) {
+      DP("veo_load_library() failed: LibHandle=%" PRIu64
+         " Name=%s. Set env VEORUN_BIN for static linked target code.\n",
+         LibHandle, tmp_name);
+      return Fail();
+    }
+    DP("Successfully loaded library dynamically\n");
+  } else {
+    DP("Symbol table is expected to have been created by "
+       "veo_proc_create_static()\n");
+  }
+
+  // tmp_name lives on this function's stack, while DeviceInfo keeps the name
+  // until its destructor removes the file, so record a heap copy. If strdup()
+  // fails FileName is NULL and the destructor skips the entry.
+  DynLibTy Lib = {strdup(tmp_name), LibHandle};
+  DeviceInfo.DynLibs.push_back(Lib);
+  DeviceInfo.LibraryHandles[ID] = LibHandle;
+
+  DeviceInfo.buildOffloadTableFromHost(ID, LibHandle, Image->EntriesBegin,
+                                       Image->EntriesEnd);
+
+  return DeviceInfo.getOffloadTable(ID);
+}
+
+// Allocate data on the particular target device, of the specified size.
+// HostPtr is a address of the host data the allocated target data
+// will be associated with (HostPtr may be NULL if it is not known at
+// allocation time, like for example it would be for target data that
+// is allocated by omp_target_alloc() API). Return address of the
+// allocated data on the target that will be used by libomptarget.so to
+// initialize the target data mapping structures. These addresses are
+// used to generate a table of target variables to pass to
+// __tgt_rtl_run_region(). The __tgt_rtl_data_alloc() returns NULL in
+// case an error occurred on the target device.
+void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr) {
+  uint64_t addr = 0;
+
+  if (DeviceInfo.ProcHandles[ID] == NULL) {
+    struct veo_proc_handle *proc_handle;
+    proc_handle = veo_proc_create(DeviceInfo.NodeIds[ID]);
+    if (!proc_handle) {
+      DP("veo_proc_create() failed for device %d\n", ID);
+      return NULL;
+    }
+    DeviceInfo.ProcHandles[ID] = proc_handle;
+    DP("Aurora device successfully initialized: proc_handle=%p\n", proc_handle);
+  }
+
+  int ret = veo_alloc_mem(DeviceInfo.ProcHandles[ID], &addr, Size);
+  DP("Allocate target memory: device=%d, target addr=%p, size=%" PRIu64 "\n",
+     ID, reinterpret_cast<void *>(addr), Size);
+  if (ret != 0) {
+    DP("veo_alloc_mem(%d, %p, %" PRIu64 ") failed with error code %d\n",
+       ID, reinterpret_cast<void *>(addr), Size, ret);
+    return NULL;
+  }
+
+  return reinterpret_cast<void *>(addr);
+}
+
+// Pass the data content to the target device using the target address.
+// In case of success, return zero. Otherwise, return an error code.
+int32_t __tgt_rtl_data_submit(int32_t ID, void *TargetPtr, void *HostPtr,
+                              int64_t Size) {
+  int ret = veo_write_mem(DeviceInfo.ProcHandles[ID], (uint64_t)TargetPtr,
+                          HostPtr, (size_t)Size);
+  if (ret != 0) {
+    DP("veo_write_mem() failed with error code %d\n", ret);
+    return OFFLOAD_FAIL;
+  }
+  return OFFLOAD_SUCCESS;
+}
+
+// Retrieve the data content from the target device using its address.
+// In case of success, return zero. Otherwise, return an error code.
+int32_t __tgt_rtl_data_retrieve(int32_t ID, void *HostPtr, void *TargetPtr,
+                                int64_t Size) {
+  int ret = veo_read_mem(DeviceInfo.ProcHandles[ID], HostPtr,
+                         (uint64_t)TargetPtr, Size);
+  if (ret != 0) {
+    DP("veo_read_mem() failed with error code %d\n", ret);
+    return OFFLOAD_FAIL;
+  }
+  return OFFLOAD_SUCCESS;
+}
+
+// De-allocate the data referenced by target ptr on the device. In case of
+// success, return zero. Otherwise, return an error code.
+int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr) {
+  int ret = veo_free_mem(DeviceInfo.ProcHandles[ID], (uint64_t)TargetPtr);
+
+  if (ret != 0) {
+    DP("veo_free_mem() failed with error code %d\n", ret);
+    return OFFLOAD_FAIL;
+  }
+  return OFFLOAD_SUCCESS;
+}
+
+// Similar to __tgt_rtl_run_target_region, but additionally specify the
+// number of teams to be created and a number of threads in each team.
+// This plugin ignores NumTeams, ThreadLimit and loop_tripcount. Returns
+// OFFLOAD_SUCCESS, or OFFLOAD_FAIL if the arguments cannot be set up or the
+// call fails.
+int32_t __tgt_rtl_run_target_team_region(int32_t ID, void *Entry, void **Args,
+                                         ptrdiff_t *Offsets, int32_t NumArgs,
+                                         int32_t NumTeams, int32_t ThreadLimit,
+                                         uint64_t loop_tripcount) {
+  struct veo_args *TargetArgs = veo_args_alloc();
+
+  if (TargetArgs == NULL) {
+    DP("Could not allocate VEO args\n");
+    return OFFLOAD_FAIL;
+  }
+
+  for (int i = 0; i < NumArgs; ++i) {
+    // The outlined function expects each target address with its offset
+    // already applied.
+    uint64_t Arg = (uint64_t)((intptr_t)Args[i] + Offsets[i]);
+    int ret = veo_args_set_u64(TargetArgs, i, Arg);
+
+    if (ret != 0) {
+      DP("veo_args_set_u64() has returned %d for argnum=%d and value %p\n",
+         ret, i, Args[i]);
+      veo_args_free(TargetArgs);
+      return OFFLOAD_FAIL;
+    }
+  }
+
+  // The function's return value is fetched but not used by the plugin.
+  uint64_t RetVal;
+  int rc = target_run_function_wait(ID, reinterpret_cast<uint64_t>(Entry),
+                                    TargetArgs, &RetVal);
+  // The argument list is no longer needed, whatever the outcome.
+  veo_args_free(TargetArgs);
+  return rc;
+}
+
+// Transfer control to the offloaded entry Entry on the target device.
+// Args and Offsets are arrays of NumArgs size of target addresses and
+// offsets. An offset should be added to the target address before passing it
+// to the outlined function on device side. In case of success, return zero.
+// Otherwise, return an error code.
+int32_t __tgt_rtl_run_target_region(int32_t ID, void *Entry, void **Args,
+                                    ptrdiff_t *Offsets, int32_t NumArgs) {
+  return __tgt_rtl_run_target_team_region(ID, Entry, Args, Offsets, NumArgs, 1,
+                                          1, 0);
+}
Index: openmp/libomptarget/src/rtl.cpp
===================================================================
--- openmp/libomptarget/src/rtl.cpp
+++ openmp/libomptarget/src/rtl.cpp
@@ -23,6 +23,7 @@
 
 // List of all plugins that can support offloading.
 static const char *RTLNames[] = {
+    /* SX-Aurora VE target */ "libomptarget.rtl.ve.so",
     /* PowerPC target */ "libomptarget.rtl.ppc64.so",
     /* x86_64 target */ "libomptarget.rtl.x86_64.so",
     /* CUDA target */ "libomptarget.rtl.cuda.so",