Index: CMakeLists.txt =================================================================== --- CMakeLists.txt +++ CMakeLists.txt @@ -3,3 +3,4 @@ set(OPENMP_LLVM_TOOLS_DIR "" CACHE PATH "Path to LLVM tools for testing") add_subdirectory(runtime) +add_subdirectory(libomptarget) Index: libomptarget/Build_With_CMake.txt =================================================================== --- /dev/null +++ libomptarget/Build_With_CMake.txt @@ -0,0 +1,142 @@ +# +#//===----------------------------------------------------------------------===// +#// +#// The LLVM Compiler Infrastructure +#// +#// This file is dual licensed under the MIT and the University of Illinois Open +#// Source Licenses. See LICENSE.txt for details. +#// +#//===----------------------------------------------------------------------===// +# + +===================================================================== +How to Build the LLVM* OpenMP* Offloading Runtime Library using CMake +===================================================================== + +==== Version of CMake required: v2.8.0 or above ==== + +============================================ +How to call cmake initially, then repeatedly +============================================ +- When calling cmake for the first time, all needed compiler options + must be specified on the command line. After this initial call to + cmake, the compiler definitions must not be included for further calls + to cmake. Other options can be specified on the command line multiple + times including all definitions in the Build options section below. +- Example of configuring, building, reconfiguring, rebuilding: + $ mkdir build + $ cd build + $ cmake -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ .. # Initial configuration + $ make + ... + $ make clean + $ cmake -DCMAKE_BUILD_TYPE=Debug .. # Second configuration + $ make + ... + $ rm -rf * + $ cmake -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ .. # Third configuration + $ make +- Notice in the example how the compiler definitions are only specified + for an empty build directory, but other Build options are used at any time. +- The file CMakeCache.txt which is created after the first call to cmake + is a configuration file which holds all the values for the Build options. + These configuration values can be changed using a text editor to modify + CMakeCache.txt as opposed to using definitions on the command line. +- To have cmake create a particular type of build generator file simply + inlude the -G option: + $ cmake -G "Unix Makefiles" ... + You can see a list of generators cmake supports by executing cmake with + no arguments and a list will be printed. + +===================== +Instructions to Build +===================== + $ cd libomptarget_top_level/ [ directory with plugins/ , deviceRTLs/ , etc. ] + $ mkdir build + $ cd build + + [ Unix* Libraries ] + $ cmake -DCMAKE_C_COMPILER= -DCMAKE_CXX_COMPILER= .. + + $ make + $ make install + +=========== +Tests +=========== +After the library has been built, there are optional tests that can be +performed. Some will be skipped based upon the platform. +To run the tests, +$ make check-libomptarget + +============= +CMake options +============= +-DCMAKE_C_COMPILER= +Specify the C compiler + +-DCMAKE_CXX_COMPILER= +Specify the C++ compiler + +==== First values listed are the default value ==== +-DCMAKE_BUILD_TYPE=Release|Debug|RelWithDebInfo +Build type can be Release, Debug, or RelWithDebInfo. + +-DLIBOMPTARGET_ENABLE_WERROR=true|false +Should consider warnings as errors. 
+ +-DLIBOMPTARGET_LLVM_LIT_EXECUTABLE="" +Full path to the llvm-lit tool. Required for testing in out-of-tree builds. + +-DLIBOMPTARGET_FILECHECK_EXECUTABLE="" +Full path to the FileCheck tool. Required for testing in out-of-tree builds. + +-DLIBOMPTARGET_OPENMP_HEADER_FOLDER="" +Path of the folder that contains omp.h. This is required for testing +out-of-tree builds. + +-DLIBOMPTARGET_OPENMP_HOST_RTL_FOLDER="" +Path of the folder that contains libomp.so. This is required for testing +out-of-tree builds. + +-DLIBOMPTARGET_NVPTX_ENABLE_BCLIB=false|true +Enable CUDA LLVM bitcode offloading device RTL. This is used for +link time optimization of the omp runtime and application code. + +-DLIBOMPTARGET_NVPTX_CUDA_COMPILER= +Location of a CUDA compiler capable of emitting LLVM bitcode. +Currently only the Clang compiler is supported. This is only used +when building the CUDA LLVM bitcode offloading device RTL. If +unspecified, the default paths are inspected. + +-DLIBOMPTARGET_NVPTX_BC_LINKER= +Location of a linker capable of linking LLVM bitcode objects. +This is only used when building the CUDA LLVM bitcode offloading +device RTL. If unspecified, the default paths are inspected. + +==== NVPTX device RTL specific ==== +-DLIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER="" +Host compiler to use with NVCC. This compiler is not going to be used to produce +any binary. Instead, this is used to overcome the input compiler checks done by +NVCC. E.g. if using a default host compiler that is not compatible with NVCC, +this option can be use to pass to NVCC a valid compiler to avoid the error. + +-DLIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY="35" +Comma-separated list of CUDA compute capabilities that should be supported by +the NVPTX device RTL. E.g. for compute capabilities 3.0 and 3.5, the option +"30,35" should be used. + +======================= +Example usages of CMake +======================= +---- Typical usage ---- +cmake -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ .. +cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ .. + +---- Request an NVPTX runtime library that supports compute capability 5.0 ---- +cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DLIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY="50" + +========= +Footnotes +========= +[*] Other names and brands may be claimed as the property of others. Index: libomptarget/CMakeLists.txt =================================================================== --- /dev/null +++ libomptarget/CMakeLists.txt @@ -0,0 +1,111 @@ +##===----------------------------------------------------------------------===## +# +# The LLVM Compiler Infrastructure +# +# This file is dual licensed under the MIT and the University of Illinois Open +# Source Licenses. See LICENSE.txt for details. +# +##===----------------------------------------------------------------------===## +# +# Build offloading library libomptarget.so. +# +##===----------------------------------------------------------------------===## + +# CMAKE libomptarget +cmake_minimum_required(VERSION 2.8 FATAL_ERROR) + +# Add cmake directory to search for custom cmake functions. +set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules ${CMAKE_MODULE_PATH}) + +# Standalone build or part of LLVM? 
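+# (When libomptarget is built on its own, CMAKE_SOURCE_DIR is either this
+# directory or the enclosing openmp/ checkout, which is what the comparison
+# below tests for; in an in-tree LLVM build it points at the llvm/ top level.)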
+set(LIBOMPTARGET_STANDALONE_BUILD FALSE) +if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}" OR + "${CMAKE_SOURCE_DIR}/libomptarget" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") + project(libomptarget C CXX) + set(LIBOMPTARGET_STANDALONE_BUILD TRUE) +endif() + + +if(${LIBOMPTARGET_STANDALONE_BUILD}) + set(LIBOMPTARGET_ENABLE_WERROR FALSE CACHE BOOL + "Enable -Werror flags to turn warnings into errors for supporting compilers.") + # CMAKE_BUILD_TYPE was not defined, set default to Release + if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) + endif() +else() + set(LIBOMP_ENABLE_WERROR ${LLVM_ENABLE_WERROR}) +endif() + +# Compiler flag checks. +include(config-ix) + +# Message utilities. +include(LibomptargetUtils) + +# Get dependencies for the different components of the project. +include(LibomptargetGetDependencies) + +# This is a list of all the targets that are supported/tested right now. +set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64le-ibm-linux-gnu") +set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64-ibm-linux-gnu") +set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} x86_64-pc-linux-gnu") +set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda") + +# Once the plugins for the different targets are validated, they will be added to +# the list of supported targets in the current system. +set (LIBOMPTARGET_SYSTEM_TARGETS "") + +# Set base directories - required for lit to locate the tests. +set(LIBOMPTARGET_BASE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +set(LIBOMPTARGET_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) + +# We need C++11 support. +if(LIBOMPTARGET_HAVE_STD_CPP11_FLAG) + + libomptarget_say("Building offloading runtime library libomptarget.") + + # Enable support for C++11. + add_definitions(-std=c++11) + + if(LIBOMPTARGET_ENABLE_WERROR AND LIBOMPTARGET_HAVE_WERROR_FLAG) + add_definitions(-Werror) + endif() + + # If building this library in debug mode, we define a macro to enable + # dumping progress messages at runtime. + string( TOLOWER "${CMAKE_BUILD_TYPE}" LIBOMPTARGET_CMAKE_BUILD_TYPE) + if(LIBOMPTARGET_CMAKE_BUILD_TYPE MATCHES debug) + add_definitions(-DOMPTARGET_DEBUG) + add_definitions(-g) + add_definitions(-O0) + endif() + + set(src_files + src/omptarget.cpp + ) + + include_directories(src/) + + # Build libomptarget library with libdl dependency. + add_library(omptarget SHARED ${src_files}) + target_link_libraries(omptarget + dl + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exports") + + # Install libomptarget under the lib destination folder. + install(TARGETS omptarget LIBRARY DESTINATION "lib") + + # Retrieve the path to the resulting library so that it can be used for + # testing. + get_target_property(LIBOMPTARGET_LIBRARY_DIR omptarget LIBRARY_OUTPUT_DIRECTORY) + if(NOT LIBOMPTARGET_LIBRARY_DIR) + set(LIBOMPTARGET_LIBRARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) + endif() + + # Add tests. 
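+  # (The lit-based test suite needs llvm-lit and FileCheck; for out-of-tree
+  # builds point LIBOMPTARGET_LLVM_LIT_EXECUTABLE, LIBOMPTARGET_FILECHECK_EXECUTABLE,
+  # LIBOMPTARGET_OPENMP_HEADER_FOLDER and LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER
+  # at the right locations, as described in Build_With_CMake.txt.)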
+ add_subdirectory(test) + +else(LIBOMPTARGET_HAVE_STD_CPP11_FLAG) + libomptarget_say("Not building offloading runtime library libomptarget: host compiler must have c++11 support.") +endif(LIBOMPTARGET_HAVE_STD_CPP11_FLAG) Index: libomptarget/README.txt =================================================================== --- /dev/null +++ libomptarget/README.txt @@ -0,0 +1,72 @@ + + README for the LLVM* OpenMP* Offloading Runtime Library (libomptarget) + ====================================================================== + +How to Build the LLVM* OpenMP* Offloading Runtime Library (libomptarget) +======================================================================== +In-tree build: + +$ cd where-you-want-to-live +Check out openmp (libomptarget lives under ./libomptarget) into llvm/projects +$ cd where-you-want-to-build +$ mkdir build && cd build +$ cmake path/to/llvm -DCMAKE_C_COMPILER= -DCMAKE_CXX_COMPILER= +$ make omptarget + +Out-of-tree build: + +$ cd where-you-want-to-live +Check out openmp (libomptarget lives under ./libomptarget) +$ cd where-you-want-to-live/openmp/libomptarget +$ mkdir build && cd build +$ cmake path/to/openmp -DCMAKE_C_COMPILER= -DCMAKE_CXX_COMPILER= +$ make + +For details about building, please look at Build_With_CMake.txt + +Architectures Supported +======================= +The current library has been only tested in Linux operating system and the +following host architectures: +* Intel(R) 64 architecture +* IBM(R) Power architecture (big endian) +* IBM(R) Power architecture (little endian) + +The currently supported offloading device architectures are: +* Intel(R) 64 architecture (generic 64-bit plugin - mostly for testing purposes) +* IBM(R) Power architecture (big endian) (generic 64-bit plugin - mostly for testing purposes) +* IBM(R) Power architecture (little endian) (generic 64-bit plugin - mostly for testing purposes) +* CUDA(R) enabled 64-bit NVIDIA(R) GPU architectures + +Supported RTL Build Configurations +================================== +Supported Architectures: Intel(R) 64, IBM(R) Power 7 and Power 8 + + --------------------------- + | gcc | clang | +--------------|------------|------------| +| Linux* OS | Yes(1) | Yes(2) | +----------------------------------------- + +(1) gcc version 4.8.2 or later is supported. +(2) clang version 3.7 or later is supported. + + +Front-end Compilers that work with this RTL +=========================================== + +The following compilers are known to do compatible code generation for +this RTL: + - clang (from the OpenMP development branch at http://clang-omp.github.io/ ) + - clang (development branch at http://clang.llvm.org - several features still + under development) + +----------------------------------------------------------------------- + +Notices +======= +This library and related compiler support is still under development, so the +employed interface is likely to change in the future. + +*Other names and brands may be claimed as the property of others. + Index: libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake =================================================================== --- /dev/null +++ libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake @@ -0,0 +1,124 @@ +# +#//===----------------------------------------------------------------------===// +#// +#// The LLVM Compiler Infrastructure +#// +#// This file is dual licensed under the MIT and the University of Illinois Open +#// Source Licenses. See LICENSE.txt for details. 
+#// +#//===----------------------------------------------------------------------===// +# + +# Try to detect in the system several dependencies required by the different +# components of libomptarget. These are the dependencies we have: +# +# libelf : required by some targets to handle the ELF files at runtime. +# libffi : required to launch target kernels given function and argument +# pointers. +# CUDA : required to control offloading to NVIDIA GPUs. + +include (FindPackageHandleStandardArgs) + +################################################################################ +# Looking for libelf... +################################################################################ + +find_path ( + LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR + NAMES + libelf.h + PATHS + /usr/include + /usr/local/include + /opt/local/include + /sw/include + ENV CPATH + PATH_SUFFIXES + libelf) + +find_library ( + LIBOMPTARGET_DEP_LIBELF_LIBRARIES + NAMES + elf + PATHS + /usr/lib + /usr/local/lib + /opt/local/lib + /sw/lib + ENV LIBRARY_PATH + ENV LD_LIBRARY_PATH) + +set(LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS ${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR}) +find_package_handle_standard_args( + LIBOMPTARGET_DEP_LIBELF + DEFAULT_MSG + LIBOMPTARGET_DEP_LIBELF_LIBRARIES + LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS) + +mark_as_advanced( + LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS + LIBOMPTARGET_DEP_LIBELF_LIBRARIES) + +################################################################################ +# Looking for libffi... +################################################################################ +find_package(PkgConfig) + +pkg_check_modules(LIBOMPTARGET_SEARCH_LIBFFI QUIET libffi) + +find_path ( + LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR + NAMES + ffi.h + HINTS + ${LIBOMPTARGET_SEARCH_LIBFFI_INCLUDEDIR} + ${LIBOMPTARGET_SEARCH_LIBFFI_INCLUDE_DIRS} + PATHS + /usr/include + /usr/local/include + /opt/local/include + /sw/include + ENV CPATH) + +# Don't bother look for the library if the header files were not found. +if (LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR) + find_library ( + LIBOMPTARGET_DEP_LIBFFI_LIBRARIES + NAMES + ffi + HINTS + ${LIBOMPTARGET_SEARCH_LIBFFI_LIBDIR} + ${LIBOMPTARGET_SEARCH_LIBFFI_LIBRARY_DIRS} + PATHS + /usr/lib + /usr/local/lib + /opt/local/lib + /sw/lib + ENV LIBRARY_PATH + ENV LD_LIBRARY_PATH) +endif() + +set(LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIRS ${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR}) +find_package_handle_standard_args( + LIBOMPTARGET_DEP_LIBFFI + DEFAULT_MSG + LIBOMPTARGET_DEP_LIBFFI_LIBRARIES + LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIRS) + +mark_as_advanced( + LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIRS + LIBOMPTARGET_DEP_LIBFFI_LIBRARIES) + +################################################################################ +# Looking for CUDA... 
+################################################################################ +find_package(CUDA QUIET) + +set(LIBOMPTARGET_DEP_CUDA_FOUND ${CUDA_FOUND}) +set(LIBOMPTARGET_DEP_CUDA_LIBRARIES ${CUDA_LIBRARIES}) +set(LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS ${CUDA_INCLUDE_DIRS}) + +mark_as_advanced( + LIBOMPTARGET_DEP_CUDA_FOUND + LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS + LIBOMPTARGET_DEP_CUDA_LIBRARIES) Index: libomptarget/cmake/Modules/LibomptargetUtils.cmake =================================================================== --- /dev/null +++ libomptarget/cmake/Modules/LibomptargetUtils.cmake @@ -0,0 +1,28 @@ +# +#//===----------------------------------------------------------------------===// +#// +#// The LLVM Compiler Infrastructure +#// +#// This file is dual licensed under the MIT and the University of Illinois Open +#// Source Licenses. See LICENSE.txt for details. +#// +#//===----------------------------------------------------------------------===// +# + +# void libomptarget_say(string message_to_user); +# - prints out message_to_user +macro(libomptarget_say message_to_user) + message(STATUS "LIBOMPTARGET: ${message_to_user}") +endmacro() + +# void libomptarget_warning_say(string message_to_user); +# - prints out message_to_user with a warning +macro(libomptarget_warning_say message_to_user) + message(WARNING "LIBOMPTARGET: ${message_to_user}") +endmacro() + +# void libomptarget_error_say(string message_to_user); +# - prints out message_to_user with an error and exits cmake +macro(libomptarget_error_say message_to_user) + message(FATAL_ERROR "LIBOMPTARGET: ${message_to_user}") +endmacro() Index: libomptarget/cmake/Modules/config-ix.cmake =================================================================== --- /dev/null +++ libomptarget/cmake/Modules/config-ix.cmake @@ -0,0 +1,17 @@ +# +#//===----------------------------------------------------------------------===// +#// +#// The LLVM Compiler Infrastructure +#// +#// This file is dual licensed under the MIT and the University of Illinois Open +#// Source Licenses. See LICENSE.txt for details. +#// +#//===----------------------------------------------------------------------===// +# + +include(CheckCCompilerFlag) +include(CheckCXXCompilerFlag) + +# Checking C, CXX +check_cxx_compiler_flag(-std=c++11 LIBOMPTARGET_HAVE_STD_CPP11_FLAG) +check_c_compiler_flag(-Werror LIBOMPTARGET_HAVE_WERROR_FLAG) Index: libomptarget/exports =================================================================== --- /dev/null +++ libomptarget/exports @@ -0,0 +1,28 @@ +VERS1.0 { + global: + __tgt_register_lib; + __tgt_unregister_lib; + __tgt_target_data_begin; + __tgt_target_data_end; + __tgt_target_data_update; + __tgt_target; + __tgt_target_teams; + __tgt_target_data_begin_nowait; + __tgt_target_data_end_nowait; + __tgt_target_data_update_nowait; + __tgt_target_nowait; + __tgt_target_teams_nowait; + omp_get_num_devices; + omp_get_initial_device; + omp_target_alloc; + omp_target_free; + omp_target_is_present; + omp_target_memcpy; + omp_target_memcpy_rect; + omp_target_associate_ptr; + omp_target_disassociate_ptr; + __kmpc_push_target_tripcount; + local: + *; +}; + Index: libomptarget/src/omptarget.h =================================================================== --- /dev/null +++ libomptarget/src/omptarget.h @@ -0,0 +1,234 @@ +//===-------- omptarget.h - Target independent OpenMP target RTL -- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. 
See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// Interface to be used by Clang during the codegen of a +// target region. +// +//===----------------------------------------------------------------------===// + +#ifndef _OMPTARGET_H_ +#define _OMPTARGET_H_ + +#include + +#define OFFLOAD_SUCCESS (0) +#define OFFLOAD_FAIL (~0) + +#define OFFLOAD_DEVICE_DEFAULT -1 +#define OFFLOAD_DEVICE_CONSTRUCTOR -2 +#define OFFLOAD_DEVICE_DESTRUCTOR -3 +#define HOST_DEVICE -10 + +/// Data attributes for each data reference used in an OpenMP target region. +enum tgt_map_type { + // No flags + OMP_TGT_MAPTYPE_NONE = 0x000, + // copy data from host to device + OMP_TGT_MAPTYPE_TO = 0x001, + // copy data from device to host + OMP_TGT_MAPTYPE_FROM = 0x002, + // copy regardless of the reference count + OMP_TGT_MAPTYPE_ALWAYS = 0x004, + // force unmapping of data + OMP_TGT_MAPTYPE_DELETE = 0x008, + // map the pointer as well as the pointee + OMP_TGT_MAPTYPE_PTR_AND_OBJ = 0x010, + // pass device base address to kernel + OMP_TGT_MAPTYPE_TARGET_PARAM = 0x020, + // return base device address of mapped data + OMP_TGT_MAPTYPE_RETURN_PARAM = 0x040, + // private variable - not mapped + OMP_TGT_MAPTYPE_PRIVATE = 0x080, + // copy by value - not mapped + OMP_TGT_MAPTYPE_LITERAL = 0x100, + // mapping is implicit + OMP_TGT_MAPTYPE_IMPLICIT = 0x200, + // member of struct, member given by 4 MSBs - 1 + OMP_TGT_MAPTYPE_MEMBER_OF = 0xffff000000000000 +}; + +enum OpenMPOffloadingDeclareTargetFlags { + /// Mark the entry as having a 'link' attribute. + OMP_DECLARE_TARGET_LINK = 0x01, + /// Mark the entry as being a global constructor. + OMP_DECLARE_TARGET_CTOR = 0x02, + /// Mark the entry as being a global destructor. + OMP_DECLARE_TARGET_DTOR = 0x04 +}; + +/// This struct is a record of an entry point or global. For a function +/// entry point the size is expected to be zero +struct __tgt_offload_entry { + void *addr; // Pointer to the offload entry info (function or global) + char *name; // Name of the function or global + size_t size; // Size of the entry info (0 if it is a function) + int32_t flags; // Flags associated with the entry, e.g. 'link'. + int32_t reserved; // Reserved, to be used by the runtime library. +}; + +/// This struct is a record of the device image information +struct __tgt_device_image { + void *ImageStart; // Pointer to the target code start + void *ImageEnd; // Pointer to the target code end + __tgt_offload_entry *EntriesBegin; // Begin of table with all target entries + __tgt_offload_entry *EntriesEnd; // End of table (non inclusive) +}; + +/// This struct is a record of all the host code that may be offloaded to a +/// target. +struct __tgt_bin_desc { + int32_t NumDeviceImages; // Number of device types supported + __tgt_device_image *DeviceImages; // Array of device images (1 per dev. type) + __tgt_offload_entry *HostEntriesBegin; // Begin of table with all host entries + __tgt_offload_entry *HostEntriesEnd; // End of table (non inclusive) +}; + +/// This struct contains the offload entries identified by the target runtime +struct __tgt_target_table { + __tgt_offload_entry *EntriesBegin; // Begin of the table with all the entries + __tgt_offload_entry + *EntriesEnd; // End of the table with all the entries (non inclusive) +}; + +#ifdef __cplusplus +extern "C" { +#endif + +// Implemented in libomp, they are called from within __tgt_* functions. 
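+// (They are declared with the weak attribute so that libomptarget does not
+// introduce a hard link-time dependency on libomp.)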
+int omp_get_default_device(void) __attribute__((weak)); +int32_t __kmpc_omp_taskwait(void *loc_ref, int32_t gtid) __attribute__((weak)); + +int omp_get_num_devices(void); +int omp_get_initial_device(void); +void *omp_target_alloc(size_t size, int device_num); +void omp_target_free(void *device_ptr, int device_num); +int omp_target_is_present(void *ptr, int device_num); +int omp_target_memcpy(void *dst, void *src, size_t length, size_t dst_offset, + size_t src_offset, int dst_device, int src_device); +int omp_target_memcpy_rect(void *dst, void *src, size_t element_size, + int num_dims, const size_t *volume, const size_t *dst_offsets, + const size_t *src_offsets, const size_t *dst_dimensions, + const size_t *src_dimensions, int dst_device, int src_device); +int omp_target_associate_ptr(void *host_ptr, void *device_ptr, size_t size, + size_t device_offset, int device_num); +int omp_target_disassociate_ptr(void *host_ptr, int device_num); + +/// adds a target shared library to the target execution image +void __tgt_register_lib(__tgt_bin_desc *desc); + +/// removes a target shared library from the target execution image +void __tgt_unregister_lib(__tgt_bin_desc *desc); + +// creates the host to target data mapping, stores it in the +// libomptarget.so internal structure (an entry in a stack of data maps) and +// passes the data to the device; +void __tgt_target_data_begin(int32_t device_id, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, + int32_t *arg_types); +void __tgt_target_data_begin_nowait(int32_t device_id, int32_t arg_num, + void **args_base, void **args, + int64_t *arg_sizes, int32_t *arg_types, + int32_t depNum, void *depList, + int32_t noAliasDepNum, + void *noAliasDepList); + +// passes data from the target, release target memory and destroys the +// host-target mapping (top entry from the stack of data maps) created by +// the last __tgt_target_data_begin +void __tgt_target_data_end(int32_t device_id, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int32_t *arg_types); +void __tgt_target_data_end_nowait(int32_t device_id, int32_t arg_num, + void **args_base, void **args, + int64_t *arg_sizes, int32_t *arg_types, + int32_t depNum, void *depList, + int32_t noAliasDepNum, void *noAliasDepList); + +/// passes data to/from the target +void __tgt_target_data_update(int32_t device_id, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, + int32_t *arg_types); +void __tgt_target_data_update_nowait(int32_t device_id, int32_t arg_num, + void **args_base, void **args, + int64_t *arg_sizes, int32_t *arg_types, + int32_t depNum, void *depList, + int32_t noAliasDepNum, + void *noAliasDepList); + +// Performs the same actions as data_begin in case arg_num is non-zero +// and initiates run of offloaded region on target platform; if arg_num +// is non-zero after the region execution is done it also performs the +// same action as data_end above. The following types are used; this +// function returns 0 if it was able to transfer the execution to a +// target and an int different from zero otherwise. 
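+//
+// Rough illustration (not part of this interface) of how the return value is
+// meant to be used by generated host code; host_fallback() is only a
+// placeholder for the outlined host version of the region:
+//
+//   if (__tgt_target(device_id, host_ptr, arg_num, args_base, args,
+//                    arg_sizes, arg_types) != OFFLOAD_SUCCESS) {
+//     host_fallback();   // offload failed or no device was available
+//   }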
+int __tgt_target(int32_t device_id, void *host_ptr, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, + int32_t *arg_types); +int __tgt_target_nowait(int32_t device_id, void *host_ptr, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, + int32_t *arg_types, int32_t depNum, void *depList, + int32_t noAliasDepNum, void *noAliasDepList); + +int __tgt_target_teams(int32_t device_id, void *host_ptr, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, + int32_t *arg_types, int32_t num_teams, + int32_t thread_limit); +int __tgt_target_teams_nowait(int32_t device_id, void *host_ptr, + int32_t arg_num, void **args_base, void **args, + int64_t *arg_sizes, int32_t *arg_types, + int32_t num_teams, int32_t thread_limit, + int32_t depNum, void *depList, + int32_t noAliasDepNum, void *noAliasDepList); +void __kmpc_push_target_tripcount(int32_t device_id, uint64_t loop_tripcount); + +#ifdef __cplusplus +} +#endif + +#ifdef OMPTARGET_DEBUG +#include +#define DEBUGP(prefix, ...) \ + { \ + fprintf(stderr, "%s --> ", prefix); \ + fprintf(stderr, __VA_ARGS__); \ + } + +#include +#define DPxMOD "0x%0*" PRIxPTR +#define DPxPTR(ptr) ((int)(2*sizeof(uintptr_t))), ((uintptr_t) (ptr)) + +/* + * To printf a pointer in hex with a fixed width of 16 digits and a leading 0x, + * use printf("ptr=" DPxMOD "...\n", DPxPTR(ptr)); + * + * DPxMOD expands to: + * "0x%0*" PRIxPTR + * where PRIxPTR expands to an appropriate modifier for the type uintptr_t on a + * specific platform, e.g. "lu" if uintptr_t is typedef'd as unsigned long: + * "0x%0*lu" + * + * Ultimately, the whole statement expands to: + * printf("ptr=0x%0*lu...\n", // the 0* modifier expects an extra argument + * // specifying the width of the output + * (int)(2*sizeof(uintptr_t)), // the extra argument specifying the width + * // 8 digits for 32bit systems + * // 16 digits for 64bit + * (uintptr_t) ptr); + */ +#else +#define DEBUGP(prefix, ...) \ + {} +#endif + +#ifdef __cplusplus +#define EXTERN extern "C" +#else +#define EXTERN extern +#endif + +#endif // _OMPTARGET_H_ Index: libomptarget/src/omptarget.cpp =================================================================== --- /dev/null +++ libomptarget/src/omptarget.cpp @@ -0,0 +1,2316 @@ +//===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// Implementation of the interface to be used by Clang during the codegen of a +// target region. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Header file global to this project +#include "omptarget.h" + +#define DP(...) DEBUGP("Libomptarget", __VA_ARGS__) +#define INF_REF_CNT (LONG_MAX>>1) // leave room for additions/subtractions +#define CONSIDERED_INF(x) (x > (INF_REF_CNT>>1)) + +// List of all plugins that can support offloading. 
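+// They are opened with dlopen() using these plain file names, so they are
+// looked up in the usual dynamic-linker search locations (e.g. the
+// directories listed in LD_LIBRARY_PATH).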
+static const char *RTLNames[] = { + /* PowerPC target */ "libomptarget.rtl.ppc64.so", + /* x86_64 target */ "libomptarget.rtl.x86_64.so", + /* CUDA target */ "libomptarget.rtl.cuda.so"}; + +// forward declarations +struct RTLInfoTy; +static int target(int32_t device_id, void *host_ptr, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, + int32_t team_num, int32_t thread_limit, int IsTeamConstruct); + +/// Map between host data and target data. +struct HostDataToTargetTy { + uintptr_t HstPtrBase; // host info. + uintptr_t HstPtrBegin; + uintptr_t HstPtrEnd; // non-inclusive. + + uintptr_t TgtPtrBegin; // target info. + + long RefCount; + + HostDataToTargetTy() + : HstPtrBase(0), HstPtrBegin(0), HstPtrEnd(0), + TgtPtrBegin(0), RefCount(0) {} + HostDataToTargetTy(uintptr_t BP, uintptr_t B, uintptr_t E, uintptr_t TB) + : HstPtrBase(BP), HstPtrBegin(B), HstPtrEnd(E), + TgtPtrBegin(TB), RefCount(1) {} +}; + +typedef std::list HostDataToTargetListTy; + +struct LookupResult { + struct { + unsigned IsContained : 1; + unsigned ExtendsBefore : 1; + unsigned ExtendsAfter : 1; + } Flags; + + HostDataToTargetListTy::iterator Entry; + + LookupResult() : Flags({0,0,0}), Entry(0) {} +}; + +/// Map for shadow pointers +struct ShadowPtrValTy { + void *HstPtrVal; + void *TgtPtrAddr; + void *TgtPtrVal; +}; +typedef std::map ShadowPtrListTy; + +/// +struct PendingCtorDtorListsTy { + std::list PendingCtors; + std::list PendingDtors; +}; +typedef std::map<__tgt_bin_desc *, PendingCtorDtorListsTy> + PendingCtorsDtorsPerLibrary; + +struct DeviceTy { + int32_t DeviceID; + RTLInfoTy *RTL; + int32_t RTLDeviceID; + + bool IsInit; + std::once_flag InitFlag; + bool HasPendingGlobals; + + HostDataToTargetListTy HostDataToTargetMap; + PendingCtorsDtorsPerLibrary PendingCtorsDtors; + + ShadowPtrListTy ShadowPtrMap; + + std::mutex DataMapMtx, PendingGlobalsMtx, ShadowMtx; + + uint64_t loopTripCnt; + + DeviceTy(RTLInfoTy *RTL) + : DeviceID(-1), RTL(RTL), RTLDeviceID(-1), IsInit(false), InitFlag(), + HasPendingGlobals(false), HostDataToTargetMap(), + PendingCtorsDtors(), ShadowPtrMap(), DataMapMtx(), PendingGlobalsMtx(), + ShadowMtx(), loopTripCnt(0) {} + + // The existence of mutexes makes DeviceTy non-copyable. We need to + // provide a copy constructor and an assignment operator explicitly. 
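+  // Note that copies get fresh, default-constructed mutexes and init flag;
+  // only the bookkeeping data is copied.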
+ DeviceTy(const DeviceTy &d) + : DeviceID(d.DeviceID), RTL(d.RTL), RTLDeviceID(d.RTLDeviceID), + IsInit(d.IsInit), InitFlag(), HasPendingGlobals(d.HasPendingGlobals), + HostDataToTargetMap(d.HostDataToTargetMap), + PendingCtorsDtors(d.PendingCtorsDtors), ShadowPtrMap(d.ShadowPtrMap), + DataMapMtx(), PendingGlobalsMtx(), + ShadowMtx(), loopTripCnt(d.loopTripCnt) {} + + DeviceTy& operator=(const DeviceTy &d) { + DeviceID = d.DeviceID; + RTL = d.RTL; + RTLDeviceID = d.RTLDeviceID; + IsInit = d.IsInit; + HasPendingGlobals = d.HasPendingGlobals; + HostDataToTargetMap = d.HostDataToTargetMap; + PendingCtorsDtors = d.PendingCtorsDtors; + ShadowPtrMap = d.ShadowPtrMap; + loopTripCnt = d.loopTripCnt; + + return *this; + } + + long getMapEntryRefCnt(void *HstPtrBegin); + LookupResult lookupMapping(void *HstPtrBegin, int64_t Size); + void *getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase, int64_t Size, + bool &IsNew, bool IsImplicit, bool UpdateRefCount = true); + void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size); + void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast, + bool UpdateRefCount); + int deallocTgtPtr(void *TgtPtrBegin, int64_t Size, bool ForceDelete); + int associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size); + int disassociatePtr(void *HstPtrBegin); + + // calls to RTL + int32_t initOnce(); + __tgt_target_table *load_binary(void *Img); + + int32_t data_submit(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size); + int32_t data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size); + + int32_t run_region(void *TgtEntryPtr, void **TgtVarsPtr, int32_t TgtVarsSize); + int32_t run_team_region(void *TgtEntryPtr, void **TgtVarsPtr, + int32_t TgtVarsSize, int32_t NumTeams, int32_t ThreadLimit, + uint64_t LoopTripCount); + +private: + // Call to RTL + void init(); // To be called only via DeviceTy::initOnce() +}; + +/// Map between Device ID (i.e. openmp device id) and its DeviceTy. +typedef std::vector DevicesTy; +static DevicesTy Devices; + +struct RTLInfoTy { + typedef int32_t(is_valid_binary_ty)(void *); + typedef int32_t(number_of_devices_ty)(); + typedef int32_t(init_device_ty)(int32_t); + typedef __tgt_target_table *(load_binary_ty)(int32_t, void *); + typedef void *(data_alloc_ty)(int32_t, int64_t); + typedef int32_t(data_submit_ty)(int32_t, void *, void *, int64_t); + typedef int32_t(data_retrieve_ty)(int32_t, void *, void *, int64_t); + typedef int32_t(data_delete_ty)(int32_t, void *); + typedef int32_t(run_region_ty)(int32_t, void *, void **, int32_t); + typedef int32_t(run_team_region_ty)(int32_t, void *, void **, int32_t, + int32_t, int32_t, uint64_t); + + int32_t Idx; // RTL index, index is the number of devices + // of other RTLs that were registered before, + // i.e. the OpenMP index of the first device + // to be registered with this RTL. + int32_t NumberOfDevices; // Number of devices this RTL deals with. + std::vector Devices; // one per device (NumberOfDevices in total). + + void *LibraryHandler; + +#ifdef OMPTARGET_DEBUG + std::string RTLName; +#endif + + // Functions implemented in the RTL. + is_valid_binary_ty *is_valid_binary; + number_of_devices_ty *number_of_devices; + init_device_ty *init_device; + load_binary_ty *load_binary; + data_alloc_ty *data_alloc; + data_submit_ty *data_submit; + data_retrieve_ty *data_retrieve; + data_delete_ty *data_delete; + run_region_ty *run_region; + run_team_region_ty *run_team_region; + + // Are there images associated with this RTL. 
+ bool isUsed; + + // Mutex for thread-safety when calling RTL interface functions. + // It is easier to enforce thread-safety at the libomptarget level, + // so that developers of new RTLs do not have to worry about it. + std::mutex Mtx; + + // The existence of the mutex above makes RTLInfoTy non-copyable. + // We need to provide a copy constructor explicitly. + RTLInfoTy() + : Idx(-1), NumberOfDevices(-1), Devices(), LibraryHandler(0), +#ifdef OMPTARGET_DEBUG + RTLName(), +#endif + is_valid_binary(0), number_of_devices(0), init_device(0), + load_binary(0), data_alloc(0), data_submit(0), data_retrieve(0), + data_delete(0), run_region(0), run_team_region(0), isUsed(false), + Mtx() {} + + RTLInfoTy(const RTLInfoTy &r) : Mtx() { + Idx = r.Idx; + NumberOfDevices = r.NumberOfDevices; + Devices = r.Devices; + LibraryHandler = r.LibraryHandler; +#ifdef OMPTARGET_DEBUG + RTLName = r.RTLName; +#endif + is_valid_binary = r.is_valid_binary; + number_of_devices = r.number_of_devices; + init_device = r.init_device; + load_binary = r.load_binary; + data_alloc = r.data_alloc; + data_submit = r.data_submit; + data_retrieve = r.data_retrieve; + data_delete = r.data_delete; + run_region = r.run_region; + run_team_region = r.run_team_region; + isUsed = r.isUsed; + } +}; + +/// RTLs identified in the system. +class RTLsTy { +private: + // Mutex-like object to guarantee thread-safety and unique initialization + // (i.e. the library attempts to load the RTLs (plugins) only once). + std::once_flag initFlag; + void LoadRTLs(); // not thread-safe + +public: + // List of the detected runtime libraries. + std::list AllRTLs; + + // Array of pointers to the detected runtime libraries that have compatible + // binaries. + std::vector UsedRTLs; + + explicit RTLsTy() {} + + // Load all the runtime libraries (plugins) if not done before. + void LoadRTLsOnce(); +}; + +void RTLsTy::LoadRTLs() { + // Parse environment variable OMP_TARGET_OFFLOAD (if set) + char *envStr = getenv("OMP_TARGET_OFFLOAD"); + if (envStr && !strcmp(envStr, "DISABLED")) { + DP("Target offloading disabled by environment\n"); + return; + } + + DP("Loading RTLs...\n"); + + // Attempt to open all the plugins and, if they exist, check if the interface + // is correct and if they are supporting any devices. + for (auto *Name : RTLNames) { + DP("Loading library '%s'...\n", Name); + void *dynlib_handle = dlopen(Name, RTLD_NOW); + + if (!dynlib_handle) { + // Library does not exist or cannot be found. + DP("Unable to load library '%s': %s!\n", Name, dlerror()); + continue; + } + + DP("Successfully loaded library '%s'!\n", Name); + + // Retrieve the RTL information from the runtime library. 
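+    // A plugin must export all of the following entry points; if any of them
+    // is missing, the corresponding dlsym() call below fails and the library
+    // is skipped:
+    //   __tgt_rtl_is_valid_binary, __tgt_rtl_number_of_devices,
+    //   __tgt_rtl_init_device, __tgt_rtl_load_binary, __tgt_rtl_data_alloc,
+    //   __tgt_rtl_data_submit, __tgt_rtl_data_retrieve, __tgt_rtl_data_delete,
+    //   __tgt_rtl_run_target_region, __tgt_rtl_run_target_team_region.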
+ RTLInfoTy R; + + R.LibraryHandler = dynlib_handle; + R.isUsed = false; + +#ifdef OMPTARGET_DEBUG + R.RTLName = Name; +#endif + + if (!(R.is_valid_binary = (RTLInfoTy::is_valid_binary_ty *)dlsym( + dynlib_handle, "__tgt_rtl_is_valid_binary"))) + continue; + if (!(R.number_of_devices = (RTLInfoTy::number_of_devices_ty *)dlsym( + dynlib_handle, "__tgt_rtl_number_of_devices"))) + continue; + if (!(R.init_device = (RTLInfoTy::init_device_ty *)dlsym( + dynlib_handle, "__tgt_rtl_init_device"))) + continue; + if (!(R.load_binary = (RTLInfoTy::load_binary_ty *)dlsym( + dynlib_handle, "__tgt_rtl_load_binary"))) + continue; + if (!(R.data_alloc = (RTLInfoTy::data_alloc_ty *)dlsym( + dynlib_handle, "__tgt_rtl_data_alloc"))) + continue; + if (!(R.data_submit = (RTLInfoTy::data_submit_ty *)dlsym( + dynlib_handle, "__tgt_rtl_data_submit"))) + continue; + if (!(R.data_retrieve = (RTLInfoTy::data_retrieve_ty *)dlsym( + dynlib_handle, "__tgt_rtl_data_retrieve"))) + continue; + if (!(R.data_delete = (RTLInfoTy::data_delete_ty *)dlsym( + dynlib_handle, "__tgt_rtl_data_delete"))) + continue; + if (!(R.run_region = (RTLInfoTy::run_region_ty *)dlsym( + dynlib_handle, "__tgt_rtl_run_target_region"))) + continue; + if (!(R.run_team_region = (RTLInfoTy::run_team_region_ty *)dlsym( + dynlib_handle, "__tgt_rtl_run_target_team_region"))) + continue; + + // No devices are supported by this RTL? + if (!(R.NumberOfDevices = R.number_of_devices())) { + DP("No devices supported in this RTL\n"); + continue; + } + + DP("Registering RTL %s supporting %d devices!\n", + R.RTLName.c_str(), R.NumberOfDevices); + + // The RTL is valid! Will save the information in the RTLs list. + AllRTLs.push_back(R); + } + + DP("RTLs loaded!\n"); + + return; +} + +void RTLsTy::LoadRTLsOnce() { + // RTL.LoadRTLs() is called only once in a thread-safe fashion. + std::call_once(initFlag, &RTLsTy::LoadRTLs, this); +} + +static RTLsTy RTLs; +static std::mutex RTLsMtx; + +/// Map between the host entry begin and the translation table. Each +/// registered library gets one TranslationTable. Use the map from +/// __tgt_offload_entry so that we may quickly determine whether we +/// are trying to (re)register an existing lib or really have a new one. +struct TranslationTable { + __tgt_target_table HostTable; + + // Image assigned to a given device. + std::vector<__tgt_device_image *> TargetsImages; // One image per device ID. + + // Table of entry points or NULL if it was not already computed. + std::vector<__tgt_target_table *> TargetsTable; // One table per device ID. +}; +typedef std::map<__tgt_offload_entry *, TranslationTable> + HostEntriesBeginToTransTableTy; +static HostEntriesBeginToTransTableTy HostEntriesBeginToTransTable; +static std::mutex TrlTblMtx; + +/// Map between the host ptr and a table index +struct TableMap { + TranslationTable *Table; // table associated with the host ptr. + uint32_t Index; // index in which the host ptr translated entry is found. + TableMap() : Table(0), Index(0) {} + TableMap(TranslationTable *table, uint32_t index) + : Table(table), Index(index) {} +}; +typedef std::map HostPtrToTableMapTy; +static HostPtrToTableMapTy HostPtrToTableMap; +static std::mutex TblMapMtx; + +/// Check whether a device has an associated RTL and initialize it if it's not +/// already initialized. +static bool device_is_ready(int device_num) { + DP("Checking whether device %d is ready.\n", device_num); + // Devices.size() can only change while registering a new + // library, so try to acquire the lock of RTLs' mutex. 
+ RTLsMtx.lock(); + size_t Devices_size = Devices.size(); + RTLsMtx.unlock(); + if (Devices_size <= (size_t)device_num) { + DP("Device ID %d does not have a matching RTL\n", device_num); + return false; + } + + // Get device info + DeviceTy &Device = Devices[device_num]; + + DP("Is the device %d (local ID %d) initialized? %d\n", device_num, + Device.RTLDeviceID, Device.IsInit); + + // Init the device if not done before + if (!Device.IsInit && Device.initOnce() != OFFLOAD_SUCCESS) { + DP("Failed to init device %d\n", device_num); + return false; + } + + DP("Device %d is ready to use.\n", device_num); + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +// Target API functions +// +EXTERN int omp_get_num_devices(void) { + RTLsMtx.lock(); + size_t Devices_size = Devices.size(); + RTLsMtx.unlock(); + + DP("Call to omp_get_num_devices returning %zd\n", Devices_size); + + return Devices_size; +} + +EXTERN int omp_get_initial_device(void) { + DP("Call to omp_get_initial_device returning %d\n", HOST_DEVICE); + return HOST_DEVICE; +} + +EXTERN void *omp_target_alloc(size_t size, int device_num) { + DP("Call to omp_target_alloc for device %d requesting %zu bytes\n", + device_num, size); + + if (size <= 0) { + DP("Call to omp_target_alloc with non-positive length\n"); + return NULL; + } + + void *rc = NULL; + + if (device_num == omp_get_initial_device()) { + rc = malloc(size); + DP("omp_target_alloc returns host ptr " DPxMOD "\n", DPxPTR(rc)); + return rc; + } + + if (!device_is_ready(device_num)) { + DP("omp_target_alloc returns NULL ptr\n"); + return NULL; + } + + DeviceTy &Device = Devices[device_num]; + rc = Device.RTL->data_alloc(Device.RTLDeviceID, size); + DP("omp_target_alloc returns device ptr " DPxMOD "\n", DPxPTR(rc)); + return rc; +} + +EXTERN void omp_target_free(void *device_ptr, int device_num) { + DP("Call to omp_target_free for device %d and address " DPxMOD "\n", + device_num, DPxPTR(device_ptr)); + + if (!device_ptr) { + DP("Call to omp_target_free with NULL ptr\n"); + return; + } + + if (device_num == omp_get_initial_device()) { + free(device_ptr); + DP("omp_target_free deallocated host ptr\n"); + return; + } + + if (!device_is_ready(device_num)) { + DP("omp_target_free returns, nothing to do\n"); + return; + } + + DeviceTy &Device = Devices[device_num]; + Device.RTL->data_delete(Device.RTLDeviceID, (void *)device_ptr); + DP("omp_target_free deallocated device ptr\n"); +} + +EXTERN int omp_target_is_present(void *ptr, int device_num) { + DP("Call to omp_target_is_present for device %d and address " DPxMOD "\n", + device_num, DPxPTR(ptr)); + + if (!ptr) { + DP("Call to omp_target_is_present with NULL ptr, returning false\n"); + return false; + } + + if (device_num == omp_get_initial_device()) { + DP("Call to omp_target_is_present on host, returning true\n"); + return true; + } + + DeviceTy& Device = Devices[device_num]; + bool IsLast; // not used + int rc = (Device.getTgtPtrBegin(ptr, 0, IsLast, false) != NULL); + DP("Call to omp_target_is_present returns %d\n", rc); + return rc; +} + +EXTERN int omp_target_memcpy(void *dst, void *src, size_t length, + size_t dst_offset, size_t src_offset, int dst_device, int src_device) { + DP("Call to omp_target_memcpy, dst device %d, src device %d, " + "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, " + "src offset %zu, length %zu\n", dst_device, src_device, DPxPTR(dst), + DPxPTR(src), dst_offset, src_offset, length); + + if (!dst || !src || length <= 0) { + DP("Call to 
omp_target_memcpy with invalid arguments\n"); + return OFFLOAD_FAIL; + } + + if (src_device != omp_get_initial_device() && !device_is_ready(src_device)) { + DP("omp_target_memcpy returns OFFLOAD_FAIL\n"); + return OFFLOAD_FAIL; + } + + if (dst_device != omp_get_initial_device() && !device_is_ready(dst_device)) { + DP("omp_target_memcpy returns OFFLOAD_FAIL\n"); + return OFFLOAD_FAIL; + } + + int rc = OFFLOAD_SUCCESS; + void *srcAddr = (char *)src + src_offset; + void *dstAddr = (char *)dst + dst_offset; + + if (src_device == omp_get_initial_device() && + dst_device == omp_get_initial_device()) { + DP("copy from host to host\n"); + const void *p = memcpy(dstAddr, srcAddr, length); + if (p == NULL) + rc = OFFLOAD_FAIL; + } else if (src_device == omp_get_initial_device()) { + DP("copy from host to device\n"); + DeviceTy& DstDev = Devices[dst_device]; + rc = DstDev.data_submit(dstAddr, srcAddr, length); + } else if (dst_device == omp_get_initial_device()) { + DP("copy from device to host\n"); + DeviceTy& SrcDev = Devices[src_device]; + rc = SrcDev.data_retrieve(dstAddr, srcAddr, length); + } else { + DP("copy from device to device\n"); + void *buffer = malloc(length); + DeviceTy& SrcDev = Devices[src_device]; + DeviceTy& DstDev = Devices[dst_device]; + rc = SrcDev.data_retrieve(buffer, srcAddr, length); + if (rc == OFFLOAD_SUCCESS) + rc = DstDev.data_submit(dstAddr, buffer, length); + } + + DP("omp_target_memcpy returns %d\n", rc); + return rc; +} + +EXTERN int omp_target_memcpy_rect(void *dst, void *src, size_t element_size, + int num_dims, const size_t *volume, const size_t *dst_offsets, + const size_t *src_offsets, const size_t *dst_dimensions, + const size_t *src_dimensions, int dst_device, int src_device) { + DP("Call to omp_target_memcpy_rect, dst device %d, src device %d, " + "dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", " + "src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", " + "volume " DPxMOD ", element size %zu, num_dims %d\n", dst_device, + src_device, DPxPTR(dst), DPxPTR(src), DPxPTR(dst_offsets), + DPxPTR(src_offsets), DPxPTR(dst_dimensions), DPxPTR(src_dimensions), + DPxPTR(volume), element_size, num_dims); + + if (!(dst || src)) { + DP("Call to omp_target_memcpy_rect returns max supported dimensions %d\n", + INT_MAX); + return INT_MAX; + } + + if (!dst || !src || element_size < 1 || num_dims < 1 || !volume || + !dst_offsets || !src_offsets || !dst_dimensions || !src_dimensions) { + DP("Call to omp_target_memcpy_rect with invalid arguments\n"); + return OFFLOAD_FAIL; + } + + int rc; + if (num_dims == 1) { + rc = omp_target_memcpy(dst, src, element_size * volume[0], + element_size * dst_offsets[0], element_size * src_offsets[0], + dst_device, src_device); + } else { + size_t dst_slice_size = element_size; + size_t src_slice_size = element_size; + for (int i=1; iHstPtrBegin) { + // Mapping exists + if (CONSIDERED_INF(ii->RefCount)) { + DP("Association found, removing it\n"); + HostDataToTargetMap.erase(ii); + DataMapMtx.unlock(); + return OFFLOAD_SUCCESS; + } else { + DP("Trying to disassociate a pointer which was not mapped via " + "omp_target_associate_ptr\n"); + break; + } + } + } + + // Mapping not found + DataMapMtx.unlock(); + DP("Association not found\n"); + return OFFLOAD_FAIL; +} + +// Get ref count of map entry containing HstPtrBegin +long DeviceTy::getMapEntryRefCnt(void *HstPtrBegin) { + uintptr_t hp = (uintptr_t)HstPtrBegin; + long RefCnt = -1; + + DataMapMtx.lock(); + for (auto &HT : HostDataToTargetMap) { + if (hp >= 
HT.HstPtrBegin && hp < HT.HstPtrEnd) { + DP("DeviceTy::getMapEntry: requested entry found\n"); + RefCnt = HT.RefCount; + break; + } + } + DataMapMtx.unlock(); + + if (RefCnt < 0) { + DP("DeviceTy::getMapEntry: requested entry not found\n"); + } + + return RefCnt; +} + +LookupResult DeviceTy::lookupMapping(void *HstPtrBegin, int64_t Size) { + uintptr_t hp = (uintptr_t)HstPtrBegin; + LookupResult lr; + + DP("Looking up mapping(HstPtrBegin=" DPxMOD ", Size=%ld)...\n", DPxPTR(hp), + Size); + for (lr.Entry = HostDataToTargetMap.begin(); + lr.Entry != HostDataToTargetMap.end(); ++lr.Entry) { + auto &HT = *lr.Entry; + // Is it contained? + lr.Flags.IsContained = hp >= HT.HstPtrBegin && hp < HT.HstPtrEnd && + (hp+Size) <= HT.HstPtrEnd; + // Does it extend into an already mapped region? + lr.Flags.ExtendsBefore = hp < HT.HstPtrBegin && (hp+Size) > HT.HstPtrBegin; + // Does it extend beyond the mapped region? + lr.Flags.ExtendsAfter = hp < HT.HstPtrEnd && (hp+Size) > HT.HstPtrEnd; + + if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || + lr.Flags.ExtendsAfter) { + break; + } + } + + if (lr.Flags.ExtendsBefore) { + DP("WARNING: Pointer is not mapped but section extends into already " + "mapped data\n"); + } + if (lr.Flags.ExtendsAfter) { + DP("WARNING: Pointer is already mapped but section extends beyond mapped " + "region\n"); + } + + return lr; +} + +// Used by target_data_begin +// Return the target pointer begin (where the data will be moved). +// Allocate memory if this is the first occurrence if this mapping. +// Increment the reference counter. +// If NULL is returned, then either data allocation failed or the user tried +// to do an illegal mapping. +void *DeviceTy::getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase, + int64_t Size, bool &IsNew, bool IsImplicit, bool UpdateRefCount) { + void *rc = NULL; + DataMapMtx.lock(); + LookupResult lr = lookupMapping(HstPtrBegin, Size); + + // Check if the pointer is contained. + if (lr.Flags.IsContained || + ((lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) && IsImplicit)) { + auto &HT = *lr.Entry; + IsNew = false; + + if (UpdateRefCount) + ++HT.RefCount; + + uintptr_t tp = HT.TgtPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin); + DP("Mapping exists%s with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", " + "Size=%ld,%s RefCount=%s\n", (IsImplicit ? " (implicit)" : ""), + DPxPTR(HstPtrBegin), DPxPTR(tp), Size, + (UpdateRefCount ? " updated" : ""), + (CONSIDERED_INF(HT.RefCount)) ? "INF" : + std::to_string(HT.RefCount).c_str()); + rc = (void *)tp; + } else if ((lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) && !IsImplicit) { + // Explicit extension of mapped data - not allowed. + DP("Explicit extension of mapping is not allowed.\n"); + } else if (Size) { + // If it is not contained and Size > 0 we should create a new entry for it. + IsNew = true; + uintptr_t tp = (uintptr_t)RTL->data_alloc(RTLDeviceID, Size); + DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD ", " + "HstEnd=" DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(HstPtrBase), + DPxPTR(HstPtrBegin), DPxPTR((uintptr_t)HstPtrBegin + Size), DPxPTR(tp)); + HostDataToTargetMap.push_front(HostDataToTargetTy((uintptr_t)HstPtrBase, + (uintptr_t)HstPtrBegin, (uintptr_t)HstPtrBegin + Size, tp)); + rc = (void *)tp; + } + + DataMapMtx.unlock(); + return rc; +} + +// Used by target_data_begin, target_data_end, target_data_update and target. +// Return the target pointer begin (where the data will be moved). +// Decrement the reference counter if called from target_data_end. 
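+// IsLast is set when this is the last remaining reference to the mapping
+// (RefCount <= 1), so that the caller can decide whether to copy data back
+// and release the target memory.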
+void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast, + bool UpdateRefCount) { + void *rc = NULL; + DataMapMtx.lock(); + LookupResult lr = lookupMapping(HstPtrBegin, Size); + + if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) { + auto &HT = *lr.Entry; + IsLast = !(HT.RefCount > 1); + + if (HT.RefCount > 1 && UpdateRefCount) + --HT.RefCount; + + uintptr_t tp = HT.TgtPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin); + DP("Mapping exists with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", " + "Size=%ld,%s RefCount=%s\n", DPxPTR(HstPtrBegin), DPxPTR(tp), Size, + (UpdateRefCount ? " updated" : ""), + (CONSIDERED_INF(HT.RefCount)) ? "INF" : + std::to_string(HT.RefCount).c_str()); + rc = (void *)tp; + } else { + IsLast = false; + } + + DataMapMtx.unlock(); + return rc; +} + +// Return the target pointer begin (where the data will be moved). +// Lock-free version called from within assertions. +void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size) { + uintptr_t hp = (uintptr_t)HstPtrBegin; + LookupResult lr = lookupMapping(HstPtrBegin, Size); + if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) { + auto &HT = *lr.Entry; + uintptr_t tp = HT.TgtPtrBegin + (hp - HT.HstPtrBegin); + return (void *)tp; + } + + return NULL; +} + +int DeviceTy::deallocTgtPtr(void *HstPtrBegin, int64_t Size, bool ForceDelete) { + // Check if the pointer is contained in any sub-nodes. + int rc; + DataMapMtx.lock(); + LookupResult lr = lookupMapping(HstPtrBegin, Size); + if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) { + auto &HT = *lr.Entry; + if (ForceDelete) + HT.RefCount = 1; + if (--HT.RefCount <= 0) { + assert(HT.RefCount == 0 && "did not expect a negative ref count"); + DP("Deleting tgt data " DPxMOD " of size %ld\n", + DPxPTR(HT.TgtPtrBegin), Size); + RTL->data_delete(RTLDeviceID, (void *)HT.TgtPtrBegin); + DP("Removing%s mapping with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD + ", Size=%ld\n", (ForceDelete ? " (forced)" : ""), + DPxPTR(HT.HstPtrBegin), DPxPTR(HT.TgtPtrBegin), Size); + HostDataToTargetMap.erase(lr.Entry); + } + rc = OFFLOAD_SUCCESS; + } else { + DP("Section to delete (hst addr " DPxMOD ") does not exist in the allocated" + " memory\n", DPxPTR(HstPtrBegin)); + rc = OFFLOAD_FAIL; + } + + DataMapMtx.unlock(); + return rc; +} + +/// Init device, should not be called directly. +void DeviceTy::init() { + int32_t rc = RTL->init_device(RTLDeviceID); + if (rc == OFFLOAD_SUCCESS) { + IsInit = true; + } +} + +/// Thread-safe method to initialize the device only once. +int32_t DeviceTy::initOnce() { + std::call_once(InitFlag, &DeviceTy::init, this); + + // At this point, if IsInit is true, then either this thread or some other + // thread in the past successfully initialized the device, so we can return + // OFFLOAD_SUCCESS. If this thread executed init() via call_once() and it + // failed, return OFFLOAD_FAIL. If call_once did not invoke init(), it means + // that some other thread already attempted to execute init() and if IsInit + // is still false, return OFFLOAD_FAIL. + if (IsInit) + return OFFLOAD_SUCCESS; + else + return OFFLOAD_FAIL; +} + +// Load binary to device. +__tgt_target_table *DeviceTy::load_binary(void *Img) { + RTL->Mtx.lock(); + __tgt_target_table *rc = RTL->load_binary(RTLDeviceID, Img); + RTL->Mtx.unlock(); + return rc; +} + +// Submit data to device. 
+int32_t DeviceTy::data_submit(void *TgtPtrBegin, void *HstPtrBegin, + int64_t Size) { + return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size); +} + +// Retrieve data from device. +int32_t DeviceTy::data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, + int64_t Size) { + return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size); +} + +// Run region on device +int32_t DeviceTy::run_region(void *TgtEntryPtr, void **TgtVarsPtr, + int32_t TgtVarsSize) { + return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtVarsSize); +} + +// Run team region on device. +int32_t DeviceTy::run_team_region(void *TgtEntryPtr, void **TgtVarsPtr, + int32_t TgtVarsSize, int32_t NumTeams, int32_t ThreadLimit, + uint64_t LoopTripCount) { + return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtVarsSize, + NumTeams, ThreadLimit, LoopTripCount); +} + +//////////////////////////////////////////////////////////////////////////////// +// Functionality for registering libs + +static void RegisterImageIntoTranslationTable(TranslationTable &TT, + RTLInfoTy &RTL, __tgt_device_image *image) { + + // same size, as when we increase one, we also increase the other. + assert(TT.TargetsTable.size() == TT.TargetsImages.size() && + "We should have as many images as we have tables!"); + + // Resize the Targets Table and Images to accommodate the new targets if + // required + unsigned TargetsTableMinimumSize = RTL.Idx + RTL.NumberOfDevices; + + if (TT.TargetsTable.size() < TargetsTableMinimumSize) { + TT.TargetsImages.resize(TargetsTableMinimumSize, 0); + TT.TargetsTable.resize(TargetsTableMinimumSize, 0); + } + + // Register the image in all devices for this target type. + for (int32_t i = 0; i < RTL.NumberOfDevices; ++i) { + // If we are changing the image we are also invalidating the target table. + if (TT.TargetsImages[RTL.Idx + i] != image) { + TT.TargetsImages[RTL.Idx + i] = image; + TT.TargetsTable[RTL.Idx + i] = 0; // lazy initialization of target table. + } + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Functionality for registering Ctors/Dtors + +static void RegisterGlobalCtorsDtorsForImage(__tgt_bin_desc *desc, + __tgt_device_image *img, RTLInfoTy *RTL) { + + for (int32_t i = 0; i < RTL->NumberOfDevices; ++i) { + DeviceTy &Device = Devices[RTL->Idx + i]; + Device.PendingGlobalsMtx.lock(); + Device.HasPendingGlobals = true; + for (__tgt_offload_entry *entry = img->EntriesBegin; + entry != img->EntriesEnd; ++entry) { + if (entry->flags & OMP_DECLARE_TARGET_CTOR) { + DP("Adding ctor " DPxMOD " to the pending list.\n", + DPxPTR(entry->addr)); + Device.PendingCtorsDtors[desc].PendingCtors.push_back(entry->addr); + } else if (entry->flags & OMP_DECLARE_TARGET_DTOR) { + // Dtors are pushed in reverse order so they are executed from end + // to beginning when unregistering the library! + DP("Adding dtor " DPxMOD " to the pending list.\n", + DPxPTR(entry->addr)); + Device.PendingCtorsDtors[desc].PendingDtors.push_front(entry->addr); + } + + if (entry->flags & OMP_DECLARE_TARGET_LINK) { + DP("The \"link\" attribute is not yet supported!\n"); + } + } + Device.PendingGlobalsMtx.unlock(); + } +} + +//////////////////////////////////////////////////////////////////////////////// +/// adds a target shared library to the target execution image +EXTERN void __tgt_register_lib(__tgt_bin_desc *desc) { + + // Attempt to load all plugins available in the system. 
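+  // (This is a no-op on every call after the first one, see
+  // RTLsTy::LoadRTLsOnce() above.)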
+ RTLs.LoadRTLsOnce(); + + RTLsMtx.lock(); + // Register the images with the RTLs that understand them, if any. + for (int32_t i = 0; i < desc->NumDeviceImages; ++i) { + // Obtain the image. + __tgt_device_image *img = &desc->DeviceImages[i]; + + RTLInfoTy *FoundRTL = NULL; + + // Scan the RTLs that have associated images until we find one that supports + // the current image. + for (auto &R : RTLs.AllRTLs) { + if (!R.is_valid_binary(img)) { + DP("Image " DPxMOD " is NOT compatible with RTL %s!\n", + DPxPTR(img->ImageStart), R.RTLName.c_str()); + continue; + } + + DP("Image " DPxMOD " is compatible with RTL %s!\n", + DPxPTR(img->ImageStart), R.RTLName.c_str()); + + // If this RTL is not already in use, initialize it. + if (!R.isUsed) { + // Initialize the device information for the RTL we are about to use. + DeviceTy device(&R); + + size_t start = Devices.size(); + Devices.resize(start + R.NumberOfDevices, device); + for (int32_t device_id = 0; device_id < R.NumberOfDevices; + device_id++) { + // global device ID + Devices[start + device_id].DeviceID = start + device_id; + // RTL local device ID + Devices[start + device_id].RTLDeviceID = device_id; + + // Save pointer to device in RTL in case we want to unregister the RTL + R.Devices.push_back(&Devices[start + device_id]); + } + + // Initialize the index of this RTL and save it in the used RTLs. + R.Idx = (RTLs.UsedRTLs.empty()) + ? 0 + : RTLs.UsedRTLs.back()->Idx + + RTLs.UsedRTLs.back()->NumberOfDevices; + assert((size_t) R.Idx == start && + "RTL index should equal the number of devices used so far."); + R.isUsed = true; + RTLs.UsedRTLs.push_back(&R); + + DP("RTL " DPxMOD " has index %d!\n", DPxPTR(R.LibraryHandler), R.Idx); + } + + // Initialize (if necessary) translation table for this library. + TrlTblMtx.lock(); + if(!HostEntriesBeginToTransTable.count(desc->HostEntriesBegin)){ + TranslationTable &tt = + HostEntriesBeginToTransTable[desc->HostEntriesBegin]; + tt.HostTable.EntriesBegin = desc->HostEntriesBegin; + tt.HostTable.EntriesEnd = desc->HostEntriesEnd; + } + + // Retrieve translation table for this library. + TranslationTable &TransTable = + HostEntriesBeginToTransTable[desc->HostEntriesBegin]; + + DP("Registering image " DPxMOD " with RTL %s!\n", + DPxPTR(img->ImageStart), R.RTLName.c_str()); + RegisterImageIntoTranslationTable(TransTable, R, img); + TrlTblMtx.unlock(); + FoundRTL = &R; + + // Load ctors/dtors for static objects + RegisterGlobalCtorsDtorsForImage(desc, img, FoundRTL); + + // if an RTL was found we are done - proceed to register the next image + break; + } + + if (!FoundRTL) { + DP("No RTL found for image " DPxMOD "!\n", DPxPTR(img->ImageStart)); + } + } + RTLsMtx.unlock(); + + + DP("Done registering entries!\n"); +} + +//////////////////////////////////////////////////////////////////////////////// +/// unloads a target shared library +EXTERN void __tgt_unregister_lib(__tgt_bin_desc *desc) { + DP("Unloading target library!\n"); + + RTLsMtx.lock(); + // Find which RTL understands each image, if any. + for (int32_t i = 0; i < desc->NumDeviceImages; ++i) { + // Obtain the image. + __tgt_device_image *img = &desc->DeviceImages[i]; + + RTLInfoTy *FoundRTL = NULL; + + // Scan the RTLs that have associated images until we find one that supports + // the current image. We only need to scan RTLs that are already being used. 
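// Illustrative note (not part of this patch): the device numbering set up
// during registration above. Assuming two used RTLs, the first exposing two
// devices and the second one device:
//   RTL A: Idx = 0, NumberOfDevices = 2  ->  global device IDs 0, 1 (local 0, 1)
//   RTL B: Idx = 2, NumberOfDevices = 1  ->  global device ID  2    (local 0)
// so Devices[gid].RTLDeviceID == gid - RTL.Idx, which is what the loop below
// relies on when indexing Devices[FoundRTL->Idx + i].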
+ for (auto *R : RTLs.UsedRTLs) { + + assert(R->isUsed && "Expecting used RTLs."); + + if (!R->is_valid_binary(img)) { + DP("Image " DPxMOD " is NOT compatible with RTL " DPxMOD "!\n", + DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler)); + continue; + } + + DP("Image " DPxMOD " is compatible with RTL " DPxMOD "!\n", + DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler)); + + FoundRTL = R; + + // Execute dtors for static objects if the device has been used, i.e. + // if its PendingCtors list has been emptied. + for (int32_t i = 0; i < FoundRTL->NumberOfDevices; ++i) { + DeviceTy &Device = Devices[FoundRTL->Idx + i]; + Device.PendingGlobalsMtx.lock(); + if (Device.PendingCtorsDtors[desc].PendingCtors.empty()) { + for (auto &dtor : Device.PendingCtorsDtors[desc].PendingDtors) { + int rc = target(Device.DeviceID, dtor, 0, NULL, NULL, NULL, NULL, 1, + 1, true /*team*/); + if (rc != OFFLOAD_SUCCESS) { + DP("Running destructor " DPxMOD " failed.\n", DPxPTR(dtor)); + } + } + // Remove this library's entry from PendingCtorsDtors + Device.PendingCtorsDtors.erase(desc); + } + Device.PendingGlobalsMtx.unlock(); + } + + DP("Unregistered image " DPxMOD " from RTL " DPxMOD "!\n", + DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler)); + + break; + } + + // if no RTL was found proceed to unregister the next image + if (!FoundRTL){ + DP("No RTLs in use support the image " DPxMOD "!\n", + DPxPTR(img->ImageStart)); + } + } + RTLsMtx.unlock(); + DP("Done unregistering images!\n"); + + // Remove entries from HostPtrToTableMap + TblMapMtx.lock(); + for (__tgt_offload_entry *cur = desc->HostEntriesBegin; + cur < desc->HostEntriesEnd; ++cur) { + HostPtrToTableMap.erase(cur->addr); + } + + // Remove translation table for this descriptor. + auto tt = HostEntriesBeginToTransTable.find(desc->HostEntriesBegin); + if (tt != HostEntriesBeginToTransTable.end()) { + DP("Removing translation table for descriptor " DPxMOD "\n", + DPxPTR(desc->HostEntriesBegin)); + HostEntriesBeginToTransTable.erase(tt); + } else { + DP("Translation table for descriptor " DPxMOD " cannot be found, probably " + "it has been already removed.\n", DPxPTR(desc->HostEntriesBegin)); + } + + TblMapMtx.unlock(); + + // TODO: Remove RTL and the devices it manages if it's not used anymore? + // TODO: Write some RTL->unload_image(...) function? + + DP("Done unregistering library!\n"); +} + +/// Map global data and execute pending ctors +static int InitLibrary(DeviceTy& Device) { + /* + * Map global data + */ + int32_t device_id = Device.DeviceID; + int rc = OFFLOAD_SUCCESS; + + Device.PendingGlobalsMtx.lock(); + TrlTblMtx.lock(); + for (HostEntriesBeginToTransTableTy::iterator + ii = HostEntriesBeginToTransTable.begin(); + ii != HostEntriesBeginToTransTable.end(); ++ii) { + TranslationTable *TransTable = &ii->second; + if (TransTable->TargetsTable[device_id] != 0) { + // Library entries have already been processed + continue; + } + + // 1) get image. + assert(TransTable->TargetsImages.size() > (size_t)device_id && + "Not expecting a device ID outside the table's bounds!"); + __tgt_device_image *img = TransTable->TargetsImages[device_id]; + if (!img) { + DP("No image loaded for device id %d.\n", device_id); + rc = OFFLOAD_FAIL; + break; + } + // 2) load image into the target table. + __tgt_target_table *TargetTable = + TransTable->TargetsTable[device_id] = Device.load_binary(img); + // Unable to get table for this image: invalidate image and fail. 
+ if (!TargetTable) { + DP("Unable to generate entries table for device id %d.\n", device_id); + TransTable->TargetsImages[device_id] = 0; + rc = OFFLOAD_FAIL; + break; + } + + // Verify whether the two table sizes match. + size_t hsize = + TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin; + size_t tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin; + + // Invalid image for these host entries! + if (hsize != tsize) { + DP("Host and Target tables mismatch for device id %d [%zx != %zx].\n", + device_id, hsize, tsize); + TransTable->TargetsImages[device_id] = 0; + TransTable->TargetsTable[device_id] = 0; + rc = OFFLOAD_FAIL; + break; + } + + // process global data that needs to be mapped. + Device.DataMapMtx.lock(); + __tgt_target_table *HostTable = &TransTable->HostTable; + for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin, + *CurrHostEntry = HostTable->EntriesBegin, + *EntryDeviceEnd = TargetTable->EntriesEnd; + CurrDeviceEntry != EntryDeviceEnd; + CurrDeviceEntry++, CurrHostEntry++) { + if (CurrDeviceEntry->size != 0) { + // has data. + assert(CurrDeviceEntry->size == CurrHostEntry->size && + "data size mismatch"); + assert(Device.getTgtPtrBegin(CurrHostEntry->addr, + CurrHostEntry->size) == NULL && + "data in declared target should not be already mapped"); + // add entry to map. + DP("Add mapping from host " DPxMOD " to device " DPxMOD " with size %zu" + "\n", DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr), + CurrDeviceEntry->size); + Device.HostDataToTargetMap.push_front(HostDataToTargetTy( + (uintptr_t)CurrHostEntry->addr, (uintptr_t)CurrHostEntry->addr, + (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size, + (uintptr_t)CurrDeviceEntry->addr)); + } + } + Device.DataMapMtx.unlock(); + } + TrlTblMtx.unlock(); + + if (rc != OFFLOAD_SUCCESS) { + Device.PendingGlobalsMtx.unlock(); + return rc; + } + + /* + * Run ctors for static objects + */ + if (!Device.PendingCtorsDtors.empty()) { + // Call all ctors for all libraries registered so far + for (auto &lib : Device.PendingCtorsDtors) { + if (!lib.second.PendingCtors.empty()) { + DP("Has pending ctors... call now\n"); + for (auto &entry : lib.second.PendingCtors) { + void *ctor = entry; + int rc = target(device_id, ctor, 0, NULL, NULL, NULL, + NULL, 1, 1, true /*team*/); + if (rc != OFFLOAD_SUCCESS) { + DP("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor)); + Device.PendingGlobalsMtx.unlock(); + return OFFLOAD_FAIL; + } + } + // Clear the list to indicate that this device has been used + lib.second.PendingCtors.clear(); + DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first)); + } + } + } + Device.HasPendingGlobals = false; + Device.PendingGlobalsMtx.unlock(); + + return OFFLOAD_SUCCESS; +} + +// Check whether a device has been initialized, global ctors have been +// executed and global data has been mapped; do so if not already done. +static int CheckDevice(int32_t device_id) { + // Is device ready? + if (!device_is_ready(device_id)) { + DP("Device %d is not ready.\n", device_id); + return OFFLOAD_FAIL; + } + + // Get device info. 
+  DeviceTy &Device = Devices[device_id];
+
+  // Check whether global data has been mapped for this device
+  Device.PendingGlobalsMtx.lock();
+  bool hasPendingGlobals = Device.HasPendingGlobals;
+  Device.PendingGlobalsMtx.unlock();
+  if (hasPendingGlobals) {
+    if (InitLibrary(Device) != OFFLOAD_SUCCESS) {
+      DP("Failed to init globals on device %d\n", device_id);
+      return OFFLOAD_FAIL;
+    }
+  }
+
+  return OFFLOAD_SUCCESS;
+}
+
+// Following datatypes and functions (tgt_oldmap_type, combined_entry_t,
+// translate_map, cleanup_map) will be removed once the compiler starts using
+// the new map types.
+
+// Old map types
+enum tgt_oldmap_type {
+  OMP_TGT_OLDMAPTYPE_TO          = 0x001, // copy data from host to device
+  OMP_TGT_OLDMAPTYPE_FROM        = 0x002, // copy data from device to host
+  OMP_TGT_OLDMAPTYPE_ALWAYS      = 0x004, // copy regardless of the ref. count
+  OMP_TGT_OLDMAPTYPE_DELETE      = 0x008, // force unmapping of data
+  OMP_TGT_OLDMAPTYPE_MAP_PTR     = 0x010, // map pointer as well as pointee
+  OMP_TGT_OLDMAPTYPE_FIRST_MAP   = 0x020, // first occurrence of mapped variable
+  OMP_TGT_OLDMAPTYPE_RETURN_PTR  = 0x040, // return TgtBase addr of mapped data
+  OMP_TGT_OLDMAPTYPE_PRIVATE_PTR = 0x080, // private variable - not mapped
+  OMP_TGT_OLDMAPTYPE_PRIVATE_VAL = 0x100  // copy by value - not mapped
+};
+
+// Temporary functions for map translation and cleanup
+struct combined_entry_t {
+  int num_members; // number of members in combined entry
+  void *base_addr; // base address of combined entry
+  void *begin_addr; // begin address of combined entry
+  void *end_addr; // end address of combined entry
+};
+
+static void translate_map(int32_t arg_num, void **args_base, void **args,
+    int64_t *arg_sizes, int32_t *arg_types, int32_t &new_arg_num,
+    void **&new_args_base, void **&new_args, int64_t *&new_arg_sizes,
+    int64_t *&new_arg_types, bool is_target_construct) {
+  if (arg_num <= 0) {
+    DP("Nothing to translate\n");
+    new_arg_num = 0;
+    return;
+  }
+
+  // array of combined entries
+  combined_entry_t *cmb_entries =
+      (combined_entry_t *) alloca(arg_num * sizeof(combined_entry_t));
+  // number of combined entries
+  long num_combined = 0;
+  // old entry is MAP_PTR?
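// Illustrative sketch (not part of this patch) of the translation performed
// below, assuming the old-style arguments produced for map(tofrom: s.a, s.b)
// where both fields share the base address &s:
//   old: {base=&s, begin=&s.a, size=4}  {base=&s, begin=&s.b, size=4}
//   new: one combined "parent" entry spanning [&s.a, &s.b + 4) carrying only
//        TARGET_PARAM, followed by the two original entries, each keeping its
//        TO/FROM flags and gaining a MEMBER_OF reference to the parent.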
+ bool *is_ptr_old = (bool *) alloca(arg_num * sizeof(bool)); + // old entry is member of member_of[old] cmb_entry + int *member_of = (int *) alloca(arg_num * sizeof(int)); + + DP("Translating %d map entries\n", arg_num); + for (int i = 0; i < arg_num; ++i) { + member_of[i] = -1; + is_ptr_old[i] = false; + // Scan previous entries to see whether this entry shares the same base + for (int j = 0; j < i; ++j) { + void *new_begin_addr = NULL; + void *new_end_addr = NULL; + + if (arg_types[i] & OMP_TGT_OLDMAPTYPE_MAP_PTR) { + if (args_base[i] == args[j]) { + if (!(arg_types[j] & OMP_TGT_OLDMAPTYPE_MAP_PTR)) { + DP("Entry %d has the same base as entry %d's begin address\n", i, + j); + new_begin_addr = args_base[i]; + new_end_addr = (char *)args_base[i] + sizeof(void *); + assert(arg_sizes[j] == sizeof(void *)); + is_ptr_old[j] = true; + } else { + DP("Entry %d has the same base as entry %d's begin address, but " + "%d's base was a MAP_PTR too\n", i, j, j); + } + } + } else { + if (!(arg_types[i] & OMP_TGT_OLDMAPTYPE_FIRST_MAP) && + args_base[i] == args_base[j]) { + DP("Entry %d has the same base address as entry %d\n", i, j); + new_begin_addr = args[i]; + new_end_addr = (char *)args[i] + arg_sizes[i]; + } + } + + // If we have combined the entry with a previous one + if (new_begin_addr) { + int id; + if(member_of[j] == -1) { + // We have a new entry + id = num_combined++; + DP("Creating new combined entry %d for old entry %d\n", id, j); + // Initialize new entry + cmb_entries[id].num_members = 1; + cmb_entries[id].base_addr = args_base[j]; + if (arg_types[j] & OMP_TGT_OLDMAPTYPE_MAP_PTR) { + cmb_entries[id].begin_addr = args_base[j]; + cmb_entries[id].end_addr = (char *)args_base[j] + arg_sizes[j]; + } else { + cmb_entries[id].begin_addr = args[j]; + cmb_entries[id].end_addr = (char *)args[j] + arg_sizes[j]; + } + member_of[j] = id; + } else { + // Reuse existing combined entry + DP("Reusing existing combined entry %d\n", member_of[j]); + id = member_of[j]; + } + + // Update combined entry + DP("Adding entry %d to combined entry %d\n", i, id); + cmb_entries[id].num_members++; + // base_addr stays the same + cmb_entries[id].begin_addr = + std::min(cmb_entries[id].begin_addr, new_begin_addr); + cmb_entries[id].end_addr = + std::max(cmb_entries[id].end_addr, new_end_addr); + member_of[i] = id; + break; + } + } + } + + DP("New entries: %ld combined + %d original\n", num_combined, arg_num); + new_arg_num = arg_num + num_combined; + new_args_base = (void **) malloc(new_arg_num * sizeof(void *)); + new_args = (void **) malloc(new_arg_num * sizeof(void *)); + new_arg_sizes = (int64_t *) malloc(new_arg_num * sizeof(int64_t)); + new_arg_types = (int64_t *) malloc(new_arg_num * sizeof(int64_t)); + + const int64_t alignment = 8; + + int next_id = 0; // next ID + int next_cid = 0; // next combined ID + int *combined_to_new_id = (int *) alloca(num_combined * sizeof(int)); + for (int i = 0; i < arg_num; ++i) { + // It is member_of + if (member_of[i] == next_cid) { + int cid = next_cid++; // ID of this combined entry + int nid = next_id++; // ID of the new (global) entry + combined_to_new_id[cid] = nid; + DP("Combined entry %3d will become new entry %3d\n", cid, nid); + + int64_t padding = (int64_t)cmb_entries[cid].begin_addr % alignment; + if (padding) { + DP("Using a padding of %" PRId64 " for begin address " DPxMOD "\n", + padding, DPxPTR(cmb_entries[cid].begin_addr)); + cmb_entries[cid].begin_addr = + (char *)cmb_entries[cid].begin_addr - padding; + } + + new_args_base[nid] = cmb_entries[cid].base_addr; + 
new_args[nid] = cmb_entries[cid].begin_addr; + new_arg_sizes[nid] = (int64_t) ((char *)cmb_entries[cid].end_addr - + (char *)cmb_entries[cid].begin_addr); + new_arg_types[nid] = OMP_TGT_MAPTYPE_TARGET_PARAM; + DP("Entry %3d: base_addr " DPxMOD ", begin_addr " DPxMOD ", " + "size %" PRId64 ", type 0x%" PRIx64 "\n", nid, + DPxPTR(new_args_base[nid]), DPxPTR(new_args[nid]), new_arg_sizes[nid], + new_arg_types[nid]); + } else if (member_of[i] != -1) { + DP("Combined entry %3d has been encountered before, do nothing\n", + member_of[i]); + } + + // Now that the combined entry (the one the old entry was a member of) has + // been inserted into the new arguments list, proceed with the old entry. + int nid = next_id++; + DP("Old entry %3d will become new entry %3d\n", i, nid); + + new_args_base[nid] = args_base[i]; + new_args[nid] = args[i]; + new_arg_sizes[nid] = arg_sizes[i]; + int64_t old_type = arg_types[i]; + + if (is_ptr_old[i]) { + // Reset TO and FROM flags + old_type &= ~(OMP_TGT_OLDMAPTYPE_TO | OMP_TGT_OLDMAPTYPE_FROM); + } + + if (member_of[i] == -1) { + if (!is_target_construct) + old_type &= ~OMP_TGT_MAPTYPE_TARGET_PARAM; + new_arg_types[nid] = old_type; + DP("Entry %3d: base_addr " DPxMOD ", begin_addr " DPxMOD ", size %" PRId64 + ", type 0x%" PRIx64 " (old entry %d not MEMBER_OF)\n", nid, + DPxPTR(new_args_base[nid]), DPxPTR(new_args[nid]), new_arg_sizes[nid], + new_arg_types[nid], i); + } else { + // Old entry is not FIRST_MAP + old_type &= ~OMP_TGT_OLDMAPTYPE_FIRST_MAP; + // Add MEMBER_OF + int new_member_of = combined_to_new_id[member_of[i]]; + old_type |= ((int64_t)new_member_of + 1) << 48; + new_arg_types[nid] = old_type; + DP("Entry %3d: base_addr " DPxMOD ", begin_addr " DPxMOD ", size %" PRId64 + ", type 0x%" PRIx64 " (old entry %d MEMBER_OF %d)\n", nid, + DPxPTR(new_args_base[nid]), DPxPTR(new_args[nid]), new_arg_sizes[nid], + new_arg_types[nid], i, new_member_of); + } + } +} + +static void cleanup_map(int32_t new_arg_num, void **new_args_base, + void **new_args, int64_t *new_arg_sizes, int64_t *new_arg_types, + int32_t arg_num, void **args_base) { + if (new_arg_num > 0) { + int offset = new_arg_num - arg_num; + for (int32_t i = 0; i < arg_num; ++i) { + // Restore old base address + args_base[i] = new_args_base[i+offset]; + } + free(new_args_base); + free(new_args); + free(new_arg_sizes); + free(new_arg_types); + } +} + +static short member_of(int64_t type) { + return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1; +} + +/// Internal function to do the mapping and transfer the data to the device +static int target_data_begin(DeviceTy &Device, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { + // process each input. + int rc = OFFLOAD_SUCCESS; + for (int32_t i = 0; i < arg_num; ++i) { + // Ignore private variables and arrays - there is no mapping for them. + if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) || + (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE)) + continue; + + void *HstPtrBegin = args[i]; + void *HstPtrBase = args_base[i]; + // Address of pointer on the host and device, respectively. + void *Pointer_HstPtrBegin, *Pointer_TgtPtrBegin; + bool IsNew, Pointer_IsNew; + bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT; + bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF); + if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) { + DP("Has a pointer entry: \n"); + // base is address of pointer. 
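// Illustrative note (not part of this patch): for a clause such as
// map(tofrom: p[0:N]) the compiler passes base = &p and begin = &p[0] with
// PTR_AND_OBJ set. The code below therefore
//   1) maps the pointer object itself (&p, sizeof(void *)),
//   2) maps the pointee section starting at &p[0], and
//   3) writes the translated device address into the device copy of p and
//      records a shadow entry so the original host value can be restored
//      when the data is copied back.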
+ Pointer_TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBase, HstPtrBase, + sizeof(void *), Pointer_IsNew, IsImplicit, UpdateRef); + if (!Pointer_TgtPtrBegin) { + DP("Call to getOrAllocTgtPtr returned null pointer (device failure or " + "illegal mapping).\n"); + } + DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new" + "\n", sizeof(void *), DPxPTR(Pointer_TgtPtrBegin), + (Pointer_IsNew ? "" : " not")); + Pointer_HstPtrBegin = HstPtrBase; + // modify current entry. + HstPtrBase = *(void **)HstPtrBase; + UpdateRef = true; // subsequently update ref count of pointee + } + + void *TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBegin, HstPtrBase, + arg_sizes[i], IsNew, IsImplicit, UpdateRef); + if (!TgtPtrBegin && arg_sizes[i]) { + // If arg_sizes[i]==0, then the argument is a pointer to NULL, so + // getOrAlloc() returning NULL is not an error. + DP("Call to getOrAllocTgtPtr returned null pointer (device failure or " + "illegal mapping).\n"); + } + DP("There are %" PRId64 " bytes allocated at target address " DPxMOD + " - is%s new\n", arg_sizes[i], DPxPTR(TgtPtrBegin), + (IsNew ? "" : " not")); + + if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) { + void *ret_ptr; + if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) + ret_ptr = Pointer_TgtPtrBegin; + else { + bool IsLast; // not used + ret_ptr = Device.getTgtPtrBegin(HstPtrBegin, 0, IsLast, false); + } + + DP("Returning device pointer " DPxMOD "\n", DPxPTR(ret_ptr)); + args_base[i] = ret_ptr; + } + + if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { + bool copy = false; + if (IsNew || (arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS)) { + copy = true; + } else if (arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) { + // Copy data only if the "parent" struct has RefCount==1. + short parent_idx = member_of(arg_types[i]); + long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]); + assert(parent_rc > 0 && "parent struct not found"); + if (parent_rc == 1) { + copy = true; + } + } + + if (copy) { + DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", + arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); + int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i]); + if (rt != OFFLOAD_SUCCESS) { + DP("Copying data to device failed.\n"); + rc = OFFLOAD_FAIL; + } + } + } + + if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) { + DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n", + DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin)); + uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; + void *TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta); + int rt = Device.data_submit(Pointer_TgtPtrBegin, &TgtPtrBase, + sizeof(void *)); + if (rt != OFFLOAD_SUCCESS) { + DP("Copying data to device failed.\n"); + rc = OFFLOAD_FAIL; + } + // create shadow pointers for this entry + Device.ShadowMtx.lock(); + Device.ShadowPtrMap[Pointer_HstPtrBegin] = {HstPtrBase, + Pointer_TgtPtrBegin, TgtPtrBase}; + Device.ShadowMtx.unlock(); + } + } + + return rc; +} + +EXTERN void __tgt_target_data_begin_nowait(int32_t device_id, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int32_t *arg_types, + int32_t depNum, void *depList, int32_t noAliasDepNum, + void *noAliasDepList) { + if (depNum + noAliasDepNum > 0) + __kmpc_omp_taskwait(NULL, 0); + + __tgt_target_data_begin(device_id, arg_num, args_base, args, arg_sizes, + arg_types); +} + +/// creates host-to-target data mapping, stores it in the +/// libomptarget.so internal structure (an entry in a stack of data maps) +/// and passes the data to the device. 
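// Illustrative sketch (not part of this patch) of a typical call into the
// entry point below, assuming the usual lowering of
//   #pragma omp target data map(tofrom: A[0:N])
// for a double array A on the default device:
//   void *base[]   = {A};
//   void *begin[]  = {A};
//   int64_t size[] = {(int64_t)(N * sizeof(double))};
//   int32_t type[] = {OMP_TGT_MAPTYPE_TO | OMP_TGT_MAPTYPE_FROM};
//   __tgt_target_data_begin(OFFLOAD_DEVICE_DEFAULT, 1, base, begin, size, type);
// The matching __tgt_target_data_end at the end of the region is called with
// the same arrays.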
+EXTERN void __tgt_target_data_begin(int32_t device_id, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int32_t *arg_types) { + DP("Entering data begin region for device %d with %d mappings\n", device_id, + arg_num); + + // No devices available? + if (device_id == OFFLOAD_DEVICE_DEFAULT) { + device_id = omp_get_default_device(); + DP("Use default device id %d\n", device_id); + } + + if (CheckDevice(device_id) != OFFLOAD_SUCCESS) { + DP("Failed to get device %d ready\n", device_id); + return; + } + + DeviceTy& Device = Devices[device_id]; + + // Translate maps + int32_t new_arg_num; + void **new_args_base; + void **new_args; + int64_t *new_arg_sizes; + int64_t *new_arg_types; + translate_map(arg_num, args_base, args, arg_sizes, arg_types, new_arg_num, + new_args_base, new_args, new_arg_sizes, new_arg_types, false); + + //target_data_begin(Device, arg_num, args_base, args, arg_sizes, arg_types); + target_data_begin(Device, new_arg_num, new_args_base, new_args, new_arg_sizes, + new_arg_types); + + // Cleanup translation memory + cleanup_map(new_arg_num, new_args_base, new_args, new_arg_sizes, + new_arg_types, arg_num, args_base); +} + +/// Internal function to undo the mapping and retrieve the data from the device. +static int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int64_t *arg_types) { + int rc = OFFLOAD_SUCCESS; + // process each input. + for (int32_t i = arg_num - 1; i >= 0; --i) { + // Ignore private variables and arrays - there is no mapping for them. + // Also, ignore the use_device_ptr directive, it has no effect here. + if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) || + (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE)) + continue; + + void *HstPtrBegin = args[i]; + bool IsLast; + bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) || + (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ); + bool ForceDelete = arg_types[i] & OMP_TGT_MAPTYPE_DELETE; + + // If PTR_AND_OBJ, HstPtrBegin is address of pointee + void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, arg_sizes[i], IsLast, + UpdateRef); + DP("There are %" PRId64 " bytes allocated at target address " DPxMOD + " - is%s last\n", arg_sizes[i], DPxPTR(TgtPtrBegin), + (IsLast ? "" : " not")); + + if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && + !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) { + IsLast = false; // protect parent struct from being deallocated + } + + bool DelEntry = IsLast || ForceDelete; + + if ((arg_types[i] & OMP_TGT_MAPTYPE_FROM) || DelEntry) { + // Move data back to the host + if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) { + bool Always = arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS; + bool CopyMember = false; + if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && + !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) { + // Copy data only if the "parent" struct has RefCount==1. 
+ short parent_idx = member_of(arg_types[i]); + long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]); + assert(parent_rc > 0 && "parent struct not found"); + if (parent_rc == 1) { + CopyMember = true; + } + } + + if (DelEntry || Always || CopyMember) { + DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", + arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); + int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, arg_sizes[i]); + if (rt != OFFLOAD_SUCCESS) { + DP("Copying data from device failed.\n"); + rc = OFFLOAD_FAIL; + } + } + } + + // If we copied back to the host a struct/array containing pointers, we + // need to restore the original host pointer values from their shadow + // copies. If the struct is going to be deallocated, remove any remaining + // shadow pointer entries for this struct. + uintptr_t lb = (uintptr_t) HstPtrBegin; + uintptr_t ub = (uintptr_t) HstPtrBegin + arg_sizes[i]; + Device.ShadowMtx.lock(); + for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin(); + it != Device.ShadowPtrMap.end(); ++it) { + void **ShadowHstPtrAddr = (void**) it->first; + + // An STL map is sorted on its keys; use this property + // to quickly determine when to break out of the loop. + if ((uintptr_t) ShadowHstPtrAddr < lb) + continue; + if ((uintptr_t) ShadowHstPtrAddr >= ub) + break; + + // If we copied the struct to the host, we need to restore the pointer. + if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) { + DP("Restoring original host pointer value " DPxMOD " for host " + "pointer " DPxMOD "\n", DPxPTR(it->second.HstPtrVal), + DPxPTR(ShadowHstPtrAddr)); + *ShadowHstPtrAddr = it->second.HstPtrVal; + } + // If the struct is to be deallocated, remove the shadow entry. + if (DelEntry) { + DP("Removing shadow pointer " DPxMOD "\n", DPxPTR(ShadowHstPtrAddr)); + Device.ShadowPtrMap.erase(it); + } + } + Device.ShadowMtx.unlock(); + + // Deallocate map + if (DelEntry) { + int rt = Device.deallocTgtPtr(HstPtrBegin, arg_sizes[i], ForceDelete); + if (rt != OFFLOAD_SUCCESS) { + DP("Deallocating data from device failed.\n"); + rc = OFFLOAD_FAIL; + } + } + } + } + + return rc; +} + +/// passes data from the target, releases target memory and destroys +/// the host-target mapping (top entry from the stack of data maps) +/// created by the last __tgt_target_data_begin. +EXTERN void __tgt_target_data_end(int32_t device_id, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int32_t *arg_types) { + DP("Entering data end region with %d mappings\n", arg_num); + + // No devices available? 
+ if (device_id == OFFLOAD_DEVICE_DEFAULT) { + device_id = omp_get_default_device(); + } + + RTLsMtx.lock(); + size_t Devices_size = Devices.size(); + RTLsMtx.unlock(); + if (Devices_size <= (size_t)device_id) { + DP("Device ID %d does not have a matching RTL.\n", device_id); + return; + } + + DeviceTy &Device = Devices[device_id]; + if (!Device.IsInit) { + DP("uninit device: ignore"); + return; + } + + // Translate maps + int32_t new_arg_num; + void **new_args_base; + void **new_args; + int64_t *new_arg_sizes; + int64_t *new_arg_types; + translate_map(arg_num, args_base, args, arg_sizes, arg_types, new_arg_num, + new_args_base, new_args, new_arg_sizes, new_arg_types, false); + + //target_data_end(Device, arg_num, args_base, args, arg_sizes, arg_types); + target_data_end(Device, new_arg_num, new_args_base, new_args, new_arg_sizes, + new_arg_types); + + // Cleanup translation memory + cleanup_map(new_arg_num, new_args_base, new_args, new_arg_sizes, + new_arg_types, arg_num, args_base); +} + +EXTERN void __tgt_target_data_end_nowait(int32_t device_id, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int32_t *arg_types, + int32_t depNum, void *depList, int32_t noAliasDepNum, + void *noAliasDepList) { + if (depNum + noAliasDepNum > 0) + __kmpc_omp_taskwait(NULL, 0); + + __tgt_target_data_end(device_id, arg_num, args_base, args, arg_sizes, + arg_types); +} + +/// passes data to/from the target. +EXTERN void __tgt_target_data_update(int32_t device_id, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int32_t *arg_types) { + DP("Entering data update with %d mappings\n", arg_num); + + // No devices available? + if (device_id == OFFLOAD_DEVICE_DEFAULT) { + device_id = omp_get_default_device(); + } + + if (CheckDevice(device_id) != OFFLOAD_SUCCESS) { + DP("Failed to get device %d ready\n", device_id); + return; + } + + DeviceTy& Device = Devices[device_id]; + + // process each input. 
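// Illustrative note (not part of this patch): "#pragma omp target update
// from(A[0:N])" reaches the loop below with OMP_TGT_MAPTYPE_FROM set and
// copies the section back via data_retrieve(), while "to(A[0:N])" sets
// OMP_TGT_MAPTYPE_TO and pushes it with data_submit(); shadow pointer
// entries inside the updated range are re-synchronized in the same direction.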
+ for (int32_t i = 0; i < arg_num; ++i) { + if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) || + (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE)) + continue; + + void *HstPtrBegin = args[i]; + int64_t MapSize = arg_sizes[i]; + bool IsLast; + void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, MapSize, IsLast, + false); + + if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) { + DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", + arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); + Device.data_retrieve(HstPtrBegin, TgtPtrBegin, MapSize); + + uintptr_t lb = (uintptr_t) HstPtrBegin; + uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize; + Device.ShadowMtx.lock(); + for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin(); + it != Device.ShadowPtrMap.end(); ++it) { + void **ShadowHstPtrAddr = (void**) it->first; + if ((uintptr_t) ShadowHstPtrAddr < lb) + continue; + if ((uintptr_t) ShadowHstPtrAddr >= ub) + break; + DP("Restoring original host pointer value " DPxMOD " for host pointer " + DPxMOD "\n", DPxPTR(it->second.HstPtrVal), + DPxPTR(ShadowHstPtrAddr)); + *ShadowHstPtrAddr = it->second.HstPtrVal; + } + Device.ShadowMtx.unlock(); + } + + if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { + DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", + arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); + Device.data_submit(TgtPtrBegin, HstPtrBegin, MapSize); + + uintptr_t lb = (uintptr_t) HstPtrBegin; + uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize; + Device.ShadowMtx.lock(); + for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin(); + it != Device.ShadowPtrMap.end(); ++it) { + void **ShadowHstPtrAddr = (void**) it->first; + if ((uintptr_t) ShadowHstPtrAddr < lb) + continue; + if ((uintptr_t) ShadowHstPtrAddr >= ub) + break; + DP("Restoring original target pointer value " DPxMOD " for target " + "pointer " DPxMOD "\n", DPxPTR(it->second.TgtPtrVal), + DPxPTR(it->second.TgtPtrAddr)); + Device.data_submit(it->second.TgtPtrAddr, + &it->second.TgtPtrVal, sizeof(void *)); + } + Device.ShadowMtx.unlock(); + } + } +} + +EXTERN void __tgt_target_data_update_nowait( + int32_t device_id, int32_t arg_num, void **args_base, void **args, + int64_t *arg_sizes, int32_t *arg_types, int32_t depNum, void *depList, + int32_t noAliasDepNum, void *noAliasDepList) { + if (depNum + noAliasDepNum > 0) + __kmpc_omp_taskwait(NULL, 0); + + __tgt_target_data_update(device_id, arg_num, args_base, args, arg_sizes, + arg_types); +} + +/// performs the same actions as data_begin in case arg_num is +/// non-zero and initiates run of the offloaded region on the target platform; +/// if arg_num is non-zero after the region execution is done it also +/// performs the same action as data_update and data_end above. This function +/// returns 0 if it was able to transfer the execution to a target and an +/// integer different from zero otherwise. +static int target(int32_t device_id, void *host_ptr, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, + int32_t team_num, int32_t thread_limit, int IsTeamConstruct) { + DeviceTy &Device = Devices[device_id]; + + // Find the table information in the map or look it up in the translation + // tables. + TableMap *TM = 0; + TblMapMtx.lock(); + HostPtrToTableMapTy::iterator TableMapIt = HostPtrToTableMap.find(host_ptr); + if (TableMapIt == HostPtrToTableMap.end()) { + // We don't have a map. So search all the registered libraries. 
+  TrlTblMtx.lock();
+  for (HostEntriesBeginToTransTableTy::iterator
+      ii = HostEntriesBeginToTransTable.begin(),
+      ie = HostEntriesBeginToTransTable.end();
+      !TM && ii != ie; ++ii) {
+    // get the translation table (which contains all the good info).
+    TranslationTable *TransTable = &ii->second;
+    // iterate over all the host table entries to see if we can locate the
+    // host_ptr.
+    __tgt_offload_entry *begin = TransTable->HostTable.EntriesBegin;
+    __tgt_offload_entry *end = TransTable->HostTable.EntriesEnd;
+    __tgt_offload_entry *cur = begin;
+    for (uint32_t i = 0; cur < end; ++cur, ++i) {
+      if (cur->addr != host_ptr)
+        continue;
+      // we got a match, now fill the HostPtrToTableMap so that we
+      // may avoid this search next time.
+      TM = &HostPtrToTableMap[host_ptr];
+      TM->Table = TransTable;
+      TM->Index = i;
+      break;
+    }
+  }
+  TrlTblMtx.unlock();
+  } else {
+    TM = &TableMapIt->second;
+  }
+  TblMapMtx.unlock();
+
+  // No map for this host pointer found!
+  if (!TM) {
+    DP("Host ptr " DPxMOD " does not have a matching target pointer.\n",
+       DPxPTR(host_ptr));
+    return OFFLOAD_FAIL;
+  }
+
+  // get target table.
+  TrlTblMtx.lock();
+  assert(TM->Table->TargetsTable.size() > (size_t)device_id &&
+         "Not expecting a device ID outside the table's bounds!");
+  __tgt_target_table *TargetTable = TM->Table->TargetsTable[device_id];
+  TrlTblMtx.unlock();
+  assert(TargetTable && "Global data has not been mapped\n");
+
+  // Move data to device.
+  int rc = target_data_begin(Device, arg_num, args_base, args, arg_sizes,
+      arg_types);
+
+  if (rc != OFFLOAD_SUCCESS) {
+    DP("Call to target_data_begin failed, skipping target execution.\n");
+    // Call target_data_end to dealloc whatever target_data_begin allocated
+    // and return OFFLOAD_FAIL.
+    target_data_end(Device, arg_num, args_base, args, arg_sizes, arg_types);
+    return OFFLOAD_FAIL;
+  }
+
+  std::vector<void *> tgt_args;
+
+  // List of (first-)private arrays allocated for this target region
+  std::vector<void *> fpArrays;
+
+  for (int32_t i = 0; i < arg_num; ++i) {
+    if (!(arg_types[i] & OMP_TGT_MAPTYPE_TARGET_PARAM)) {
+      // This is not a target parameter, do not push it into tgt_args.
+      continue;
+    }
+    void *HstPtrBegin = args[i];
+    void *HstPtrBase = args_base[i];
+    void *TgtPtrBase;
+    bool IsLast; // unused.
+    if (arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) {
+      DP("Forwarding first-private value " DPxMOD " to the target construct\n",
+          DPxPTR(HstPtrBase));
+      TgtPtrBase = HstPtrBase;
+    } else if (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE) {
+      // Allocate memory for (first-)private array
+      void *TgtPtrBegin = Device.RTL->data_alloc(Device.RTLDeviceID,
+          arg_sizes[i]);
+      if (!TgtPtrBegin) {
+        DP ("Data allocation for %sprivate array " DPxMOD " failed\n",
+            (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""),
+            DPxPTR(HstPtrBegin));
+        rc = OFFLOAD_FAIL;
+        break;
+      } else {
+        fpArrays.push_back(TgtPtrBegin);
+        uint64_t PtrDelta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
+        TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - PtrDelta);
+        DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD " for "
+            "%sprivate array " DPxMOD " - pushing target argument " DPxMOD "\n",
+            arg_sizes[i], DPxPTR(TgtPtrBegin),
+            (arg_types[i] & OMP_TGT_MAPTYPE_TO ?
"first-" : ""), + DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBase)); + // If first-private, copy data from host + if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { + int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i]); + if (rt != OFFLOAD_SUCCESS) { + DP ("Copying data to device failed.\n"); + rc = OFFLOAD_FAIL; + break; + } + } + } + } else if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) { + void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBase, sizeof(void *), + IsLast, false); + TgtPtrBase = TgtPtrBegin; // no offset for ptrs. + DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD " to " + "object " DPxMOD "\n", DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBase), + DPxPTR(HstPtrBase)); + } else { + void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, arg_sizes[i], + IsLast, false); + uint64_t PtrDelta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; + TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - PtrDelta); + DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n", + DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin)); + } + tgt_args.push_back(TgtPtrBase); + } + // Push omp handle. + tgt_args.push_back((void *)0); + + // Pop loop trip count + uint64_t ltc = Device.loopTripCnt; + Device.loopTripCnt = 0; + + // Launch device execution. + if (rc == OFFLOAD_SUCCESS) { + DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n", + TargetTable->EntriesBegin[TM->Index].name, + DPxPTR(TargetTable->EntriesBegin[TM->Index].addr), TM->Index); + if (IsTeamConstruct) { + rc = Device.run_team_region(TargetTable->EntriesBegin[TM->Index].addr, + &tgt_args[0], tgt_args.size(), team_num, thread_limit, ltc); + } else { + rc = Device.run_region(TargetTable->EntriesBegin[TM->Index].addr, + &tgt_args[0], tgt_args.size()); + } + } else { + DP("Errors occurred while obtaining target arguments, skipping kernel " + "execution\n"); + } + + // Deallocate (first-)private arrays + for (auto it : fpArrays) { + int rt = Device.RTL->data_delete(Device.RTLDeviceID, it); + if (rt != OFFLOAD_SUCCESS) { + DP("Deallocation of (first-)private arrays failed.\n"); + rc = OFFLOAD_FAIL; + } + } + + // Move data from device. + int rt = target_data_end(Device, arg_num, args_base, args, arg_sizes, + arg_types); + + if (rt != OFFLOAD_SUCCESS) { + DP("Call to target_data_end failed.\n"); + rc = OFFLOAD_FAIL; + } + + return rc; +} + +EXTERN int __tgt_target(int32_t device_id, void *host_ptr, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int32_t *arg_types) { + if (device_id == OFFLOAD_DEVICE_CONSTRUCTOR || + device_id == OFFLOAD_DEVICE_DESTRUCTOR) { + // Return immediately for the time being, target calls with device_id + // -2 or -3 will be removed from the compiler in the future. 
+ return OFFLOAD_SUCCESS; + } + + DP("Entering target region with entry point " DPxMOD " and device Id %d\n", + DPxPTR(host_ptr), device_id); + + if (device_id == OFFLOAD_DEVICE_DEFAULT) { + device_id = omp_get_default_device(); + } + + if (CheckDevice(device_id) != OFFLOAD_SUCCESS) { + DP("Failed to get device %d ready\n", device_id); + return OFFLOAD_FAIL; + } + + // Translate maps + int32_t new_arg_num; + void **new_args_base; + void **new_args; + int64_t *new_arg_sizes; + int64_t *new_arg_types; + translate_map(arg_num, args_base, args, arg_sizes, arg_types, new_arg_num, + new_args_base, new_args, new_arg_sizes, new_arg_types, true); + + //return target(device_id, host_ptr, arg_num, args_base, args, arg_sizes, + // arg_types, 0, 0, false /*team*/, false /*recursive*/); + int rc = target(device_id, host_ptr, new_arg_num, new_args_base, new_args, + new_arg_sizes, new_arg_types, 0, 0, false /*team*/); + + // Cleanup translation memory + cleanup_map(new_arg_num, new_args_base, new_args, new_arg_sizes, + new_arg_types, arg_num, args_base); + + return rc; +} + +EXTERN int __tgt_target_nowait(int32_t device_id, void *host_ptr, + int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, + int32_t *arg_types, int32_t depNum, void *depList, int32_t noAliasDepNum, + void *noAliasDepList) { + if (depNum + noAliasDepNum > 0) + __kmpc_omp_taskwait(NULL, 0); + + return __tgt_target(device_id, host_ptr, arg_num, args_base, args, arg_sizes, + arg_types); +} + +EXTERN int __tgt_target_teams(int32_t device_id, void *host_ptr, + int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, + int32_t *arg_types, int32_t team_num, int32_t thread_limit) { + if (device_id == OFFLOAD_DEVICE_CONSTRUCTOR || + device_id == OFFLOAD_DEVICE_DESTRUCTOR) { + // Return immediately for the time being, target calls with device_id + // -2 or -3 will be removed from the compiler in the future. 
+ return OFFLOAD_SUCCESS; + } + + DP("Entering target region with entry point " DPxMOD " and device Id %d\n", + DPxPTR(host_ptr), device_id); + + if (device_id == OFFLOAD_DEVICE_DEFAULT) { + device_id = omp_get_default_device(); + } + + if (CheckDevice(device_id) != OFFLOAD_SUCCESS) { + DP("Failed to get device %d ready\n", device_id); + return OFFLOAD_FAIL; + } + + // Translate maps + int32_t new_arg_num; + void **new_args_base; + void **new_args; + int64_t *new_arg_sizes; + int64_t *new_arg_types; + translate_map(arg_num, args_base, args, arg_sizes, arg_types, new_arg_num, + new_args_base, new_args, new_arg_sizes, new_arg_types, true); + + //return target(device_id, host_ptr, arg_num, args_base, args, arg_sizes, + // arg_types, team_num, thread_limit, true /*team*/, + // false /*recursive*/); + int rc = target(device_id, host_ptr, new_arg_num, new_args_base, new_args, + new_arg_sizes, new_arg_types, team_num, thread_limit, true /*team*/); + + // Cleanup translation memory + cleanup_map(new_arg_num, new_args_base, new_args, new_arg_sizes, + new_arg_types, arg_num, args_base); + + return rc; +} + +EXTERN int __tgt_target_teams_nowait(int32_t device_id, void *host_ptr, + int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, + int32_t *arg_types, int32_t team_num, int32_t thread_limit, int32_t depNum, + void *depList, int32_t noAliasDepNum, void *noAliasDepList) { + if (depNum + noAliasDepNum > 0) + __kmpc_omp_taskwait(NULL, 0); + + return __tgt_target_teams(device_id, host_ptr, arg_num, args_base, args, + arg_sizes, arg_types, team_num, thread_limit); +} + + +// The trip count mechanism will be revised - this scheme is not thread-safe. +EXTERN void __kmpc_push_target_tripcount(int32_t device_id, + uint64_t loop_tripcount) { + if (device_id == OFFLOAD_DEVICE_DEFAULT) { + device_id = omp_get_default_device(); + } + + if (CheckDevice(device_id) != OFFLOAD_SUCCESS) { + DP("Failed to get device %d ready\n", device_id); + return; + } + + DP("__kmpc_push_target_tripcount(%d, %" PRIu64 ")\n", device_id, + loop_tripcount); + Devices[device_id].loopTripCnt = loop_tripcount; +} + Index: libomptarget/test/CMakeLists.txt =================================================================== --- /dev/null +++ libomptarget/test/CMakeLists.txt @@ -0,0 +1,93 @@ +# CMakeLists.txt file for unit testing OpenMP Library +include(FindPythonInterp) +include(CheckTypeSize) +if(NOT PYTHONINTERP_FOUND) + libomptarget_warning_say("Could not find Python.") + libomptarget_warning_say("The check-libomptarget target will not be available!") + return() +endif() + +set(LIBOMPTARGET_TEST_CFLAGS "" CACHE STRING + "Extra compiler flags to send to the test compiler") + +if(${LIBOMPTARGET_STANDALONE_BUILD}) + # Make sure we can use the console pool for recent cmake and ninja > 1.5 + if(CMAKE_VERSION VERSION_LESS 3.1.20141117) + set(cmake_3_2_USES_TERMINAL) + else() + set(cmake_3_2_USES_TERMINAL USES_TERMINAL) + endif() + set(LIBOMPTARGET_TEST_C_COMPILER ${CMAKE_C_COMPILER} CACHE STRING + "C compiler to use for testing OpenMP offloading library") + set(LIBOMPTARGET_TEST_CXX_COMPILER ${CMAKE_CXX_COMPILER} CACHE STRING + "C++ compiler to use for testing OpenMP offloading library") + set(LIBOMPTARGET_TEST_OPENMP_FLAG -fopenmp CACHE STRING + "OpenMP compiler flag to use for testing OpenMP offloading library") + set(LIBOMPTARGET_LLVM_LIT_EXECUTABLE "" CACHE STRING + "Path to llvm-lit") + find_program(LIT_EXECUTABLE NAMES llvm-lit ${LIBOMPTARGET_LLVM_LIT_EXECUTABLE}) + if(NOT LIT_EXECUTABLE) + libomptarget_say("Cannot find 
llvm-lit.") + libomptarget_say("Please put llvm-lit in your PATH or set LIBOMPTARGET_LLVM_LIT_EXECUTABLE to its full path") + libomptarget_warning_say("The check-libomptarget target will not be available!") + return() + endif() + + set(LIBOMPTARGET_FILECHECK_EXECUTABLE "" CACHE STRING + "Path to FileCheck") + find_program(LIBOMPTARGET_FILECHECK NAMES FileCheck ${LIBOMPTARGET_FILECHECK_EXECUTABLE}) + if(NOT LIBOMPTARGET_FILECHECK) + libomptarget_say("Cannot find FileCheck.") + libomptarget_say("Please put FileCheck in your PATH or set LIBOMPTARGET_FILECHECK_EXECUTABLE to its full path") + libomptarget_warning_say("The check-libomptarget target will not be available!") + return() + endif() + + # Set lit arguments + # The -j 1 lets the actual tests run with the entire machine. + # We have one test thread that spawns the tests serially. This allows + # Each test to use the entire machine. + set(LIBOMPTARGET_LIT_ARGS_DEFAULT "-sv --show-unsupported --show-xfail -j 1") + if(MSVC OR XCODE) + set(LIBOMPTARGET_LIT_ARGS_DEFAULT "${LIBOMPTARGET_LIT_ARGS_DEFAULT} --no-progress-bar") + endif() + set(LIBOMPTARGET_LIT_ARGS "${LIBOMPTARGET_LIT_ARGS_DEFAULT}" CACHE STRING + "Default options for lit") + separate_arguments(LIBOMPTARGET_LIT_ARGS) + add_custom_target(check-libomptarget + COMMAND ${PYTHON_EXECUTABLE} ${LIT_EXECUTABLE} ${LIBOMPTARGET_LIT_ARGS} ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS omptarget + COMMENT "Running libomptarget tests" + ${cmake_3_2_USES_TERMINAL} + ) + + set(LIBOMPTARGET_OPENMP_HEADER_FOLDER "${CMAKE_CURRENT_BINARY_DIR}/../../runtime/src" CACHE STRING + "Path to folder containing omp.h") + set(LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER "${CMAKE_CURRENT_BINARY_DIR}/../../runtime/src" CACHE STRING + "Path to folder containing libomp.so") +else() + # LLVM source tree build, test just-built clang + if(NOT MSVC) + set(LIBOMPTARGET_TEST_C_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang) + set(LIBOMPTARGET_TEST_CXX_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang++) + set(LIBOMPTARGET_FILECHECK ${LLVM_RUNTIME_OUTPUT_INTDIR}/FileCheck) + else() + libomptarget_error_say("Not prepared to run tests on Windows systems.") + endif() + set(LIBOMPTARGET_TEST_OPENMP_FLAG -fopenmp=libomp) + # Use add_lit_testsuite() from LLVM CMake. This also depends on OpenMP + # implementation because it uses omp.h. + add_lit_testsuite(check-libomptarget + "Running libomptarget tests" + ${CMAKE_CURRENT_BINARY_DIR} + ARGS "-j 1" + DEPENDS omptarget omp + ) + + set(LIBOMPTARGET_OPENMP_HEADER_FOLDER "${LIBOMPTARGET_BINARY_DIR}/../runtime/src") +endif() + +# Configure the lit.site.cfg.in file +set(AUTO_GEN_COMMENT "## Autogenerated by libomptarget configuration.\n# Do not edit!") +configure_file(lit.site.cfg.in lit.site.cfg @ONLY) + Index: libomptarget/test/lit.cfg =================================================================== --- /dev/null +++ libomptarget/test/lit.cfg @@ -0,0 +1,116 @@ +# -*- Python -*- vim: set ft=python ts=4 sw=4 expandtab tw=79: +# Configuration file for the 'lit' test runner. + +import os +import lit.formats + +# Tell pylint that we know config and lit_config exist somewhere. +if 'PYLINT_IMPORT' in os.environ: + config = object() + lit_config = object() + +def append_dynamic_library_path(name, value, sep): + if name in config.environment: + config.environment[name] = value + sep + config.environment[name] + else: + config.environment[name] = value + +# name: The name of this test suite. +config.name = 'libomptarget' + +# suffixes: A list of file extensions to treat as test files. 
+config.suffixes = ['.c', '.cpp', '.cc']
+
+# test_source_root: The root path where tests are located.
+config.test_source_root = os.path.dirname(__file__)
+
+# test_exec_root: The root object directory where output is placed
+config.test_exec_root = config.libomptarget_obj_root
+
+# test format
+config.test_format = lit.formats.ShTest()
+
+# compiler flags
+config.test_cflags = config.test_openmp_flag + \
+    " -I " + config.test_source_root + \
+    " -I " + config.omp_header_directory + \
+    " -L " + config.library_dir;
+
+if config.omp_host_rtl_directory:
+    config.test_cflags = config.test_cflags + " -L " + \
+        config.omp_host_rtl_directory
+
+config.test_cflags = config.test_cflags + " " + config.test_extra_cflags
+
+# Setup environment to find dynamic library at runtime
+if config.operating_system == 'Windows':
+    append_dynamic_library_path('PATH', config.library_dir, ";")
+    append_dynamic_library_path('PATH', config.omp_host_rtl_directory, ";")
+elif config.operating_system == 'Darwin':
+    append_dynamic_library_path('DYLD_LIBRARY_PATH', config.library_dir, ":")
+    append_dynamic_library_path('DYLD_LIBRARY_PATH', \
+        config.omp_host_rtl_directory, ":")
+    config.test_cflags += " -Wl,-rpath," + config.library_dir
+    config.test_cflags += " -Wl,-rpath," + config.omp_host_rtl_directory
+else: # Unices
+    append_dynamic_library_path('LD_LIBRARY_PATH', config.library_dir, ":")
+    append_dynamic_library_path('LD_LIBRARY_PATH', \
+        config.omp_host_rtl_directory, ":")
+
+# substitutions
+# - for targets that exist in the system create the actual command.
+# - for valid targets that do not exist in the system, return false, so that the
+#   same test can be used for different targets.
+
+# Scan all the valid targets.
+for libomptarget_target in config.libomptarget_all_targets:
+    # Is this target in the current system? If so create a compile, run and test
+    # command. Otherwise create commands that return false.
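# Illustrative note (not part of this patch): for a target T available on this
# system the RUN-line substitutions defined below compose roughly as
#   %libomptarget-compile-run-and-check-T
#     -> %libomptarget-compile-and-run-T | <FileCheck> %s
#     -> %clang-T %s -o %t-T && %t-T | <FileCheck> %s
# whereas for a valid but unavailable target every substitution becomes
# "echo ignored-command", so the same test source can be used for all targets.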
+ if libomptarget_target in config.libomptarget_system_targets: + config.substitutions.append(("%libomptarget-compilexx-run-and-check-" + \ + libomptarget_target, \ + "%libomptarget-compilexx-and-run-" + libomptarget_target + \ + " | " + config.libomptarget_filecheck + " %s")) + config.substitutions.append(("%libomptarget-compile-run-and-check-" + \ + libomptarget_target, \ + "%libomptarget-compile-and-run-" + libomptarget_target + \ + " | " + config.libomptarget_filecheck + " %s")) + config.substitutions.append(("%libomptarget-compilexx-and-run-" + \ + libomptarget_target, \ + "%clangxx-" + libomptarget_target + " %s -o %t-" + \ + libomptarget_target + " && %t-" + libomptarget_target)) + config.substitutions.append(("%libomptarget-compile-and-run-" + \ + libomptarget_target, \ + "%clang-" + libomptarget_target + " %s -o %t-" + \ + libomptarget_target + " && %t-" + libomptarget_target)) + config.substitutions.append(("%clangxx-" + libomptarget_target, \ + "%clangxx %cflags -fopenmp-targets=" + libomptarget_target)) + config.substitutions.append(("%clang-" + libomptarget_target, \ + "%clang %cflags -fopenmp-targets=" + libomptarget_target)) + config.substitutions.append(("%fcheck-" + libomptarget_target, \ + config.libomptarget_filecheck + " %s")) + else: + config.substitutions.append(("%libomptarget-compile-run-and-check-" + \ + libomptarget_target, \ + "echo ignored-command")) + config.substitutions.append(("%libomptarget-compilexx-run-and-check-" + \ + libomptarget_target, \ + "echo ignored-command")) + config.substitutions.append(("%libomptarget-compile-and-run-" + \ + libomptarget_target, \ + "echo ignored-command")) + config.substitutions.append(("%libomptarget-compilexx-and-run-" + \ + libomptarget_target, \ + "echo ignored-command")) + config.substitutions.append(("%clang-" + libomptarget_target, \ + "echo ignored-command")) + config.substitutions.append(("%clangxx-" + libomptarget_target, \ + "echo ignored-command")) + config.substitutions.append(("%fcheck-" + libomptarget_target, \ + "echo ignored-command")) + +config.substitutions.append(("%clangxx", config.test_cxx_compiler)) +config.substitutions.append(("%clang", config.test_c_compiler)) +config.substitutions.append(("%openmp_flag", config.test_openmp_flag)) +config.substitutions.append(("%cflags", config.test_cflags)) + Index: libomptarget/test/lit.site.cfg.in =================================================================== --- /dev/null +++ libomptarget/test/lit.site.cfg.in @@ -0,0 +1,20 @@ +@AUTO_GEN_COMMENT@ + +config.test_c_compiler = "@LIBOMPTARGET_TEST_C_COMPILER@" +config.test_cxx_compiler = "@LIBOMPTARGET_TEST_CXX_COMPILER@" +config.test_openmp_flag = "@LIBOMPTARGET_TEST_OPENMP_FLAG@" +# For the moment we still need to pass libomptarget explicitly. Once the driver +# patch, lands, this is not required anymore. +config.test_extra_cflags = "-lomptarget @LIBOMPTARGET_TEST_CFLAGS@" +config.libomptarget_obj_root = "@CMAKE_CURRENT_BINARY_DIR@" +config.library_dir = "@LIBOMPTARGET_LIBRARY_DIR@" +config.omp_header_directory = "@LIBOMPTARGET_OPENMP_HEADER_FOLDER@" +config.omp_host_rtl_directory = "@LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER@" +config.operating_system = "@CMAKE_SYSTEM_NAME@" +config.libomptarget_all_targets = "@LIBOMPTARGET_ALL_TARGETS@".split() +config.libomptarget_system_targets = "@LIBOMPTARGET_SYSTEM_TARGETS@".split() +config.libomptarget_filecheck = "@LIBOMPTARGET_FILECHECK@" + +# Let the main config do the real work. 
+lit_config.load_config(config, "@LIBOMPTARGET_BASE_DIR@/test/lit.cfg")
+
Index: libomptarget/test/offloading/offloading_success.c
===================================================================
--- /dev/null
+++ libomptarget/test/offloading/offloading_success.c
@@ -0,0 +1,22 @@
+// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu
+
+#include <omp.h>
+#include <stdio.h>
+
+int main(void) {
+  int isHost = -1;
+
+#pragma omp target
+  { isHost = omp_is_initial_device(); }
+
+  if (isHost < 0) {
+    printf("Runtime error, isHost=%d\n", isHost);
+  }
+
+  // CHECK: Target region executed on the device
+  printf("Target region executed on the %s\n", isHost ? "host" : "device");
+
+  return isHost;
+}
Index: libomptarget/test/offloading/offloading_success.cpp
===================================================================
--- /dev/null
+++ libomptarget/test/offloading/offloading_success.cpp
@@ -0,0 +1,22 @@
+// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu
+
+#include <omp.h>
+#include <stdio.h>
+
+int main(void) {
+  int isHost = -1;
+
+#pragma omp target
+  { isHost = omp_is_initial_device(); }
+
+  if (isHost < 0) {
+    printf("Runtime error, isHost=%d\n", isHost);
+  }
+
+  // CHECK: Target region executed on the device
+  printf("Target region executed on the %s\n", isHost ? "host" : "device");
+
+  return isHost;
+}
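The following is an illustrative sketch, not part of this patch, of a test in
the same style as the two above that also exercises the explicit data-mapping
entry points (__tgt_target_data_begin/__tgt_target_data_end around a target
region). The file name and the single x86_64 RUN line are hypothetical; the
%libomptarget-compile-run-and-check-<triple> substitution is the one defined
in lit.cfg above.

// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu

#include <stdio.h>

int main(void) {
  int A[64];
  for (int i = 0; i < 64; ++i)
    A[i] = 0;

  // The data region maps A once; the enclosed target region reuses the
  // existing mapping via the reference count instead of remapping it.
#pragma omp target data map(tofrom: A[0:64])
  {
#pragma omp target
    for (int i = 0; i < 64; ++i)
      A[i] = i;
  }

  int errors = 0;
  for (int i = 0; i < 64; ++i)
    if (A[i] != i)
      ++errors;

  // CHECK: errors=0
  printf("errors=%d\n", errors);
  return errors;
}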