Index: CMakeLists.txt =================================================================== --- CMakeLists.txt +++ CMakeLists.txt @@ -1,2 +1,3 @@ cmake_minimum_required(VERSION 2.8 FATAL_ERROR) add_subdirectory(runtime) +add_subdirectory(libomptarget) Index: libomptarget/Build_With_CMake.txt =================================================================== --- /dev/null +++ libomptarget/Build_With_CMake.txt @@ -0,0 +1,127 @@ +# +#//===----------------------------------------------------------------------===// +#// +#// The LLVM Compiler Infrastructure +#// +#// This file is dual licensed under the MIT and the University of Illinois Open +#// Source Licenses. See LICENSE.txt for details. +#// +#//===----------------------------------------------------------------------===// +# + +===================================================================== +How to Build the LLVM* OpenMP* Offloading Runtime Library using CMake +===================================================================== + +==== Version of CMake required: v2.8.0 or above ==== + +============================================ +How to call cmake initially, then repeatedly +============================================ +- When calling cmake for the first time, all needed compiler options + must be specified on the command line. After this initial call to + cmake, the compiler definitions must not be included for further calls + to cmake. Other options can be specified on the command line multiple + times, including all definitions in the Build options section below. +- Example of configuring, building, reconfiguring, rebuilding: + $ mkdir build + $ cd build + $ cmake -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ .. # Initial configuration + $ make + ... + $ make clean + $ cmake -DCMAKE_BUILD_TYPE=Debug .. # Second configuration + $ make + ... + $ rm -rf * + $ cmake -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ .. # Third configuration + $ make +- Notice in the example how the compiler definitions are only specified + for an empty build directory, but other Build options can be used at any time. +- The file CMakeCache.txt, which is created after the first call to cmake, + is a configuration file that holds all the values for the Build options. + These configuration values can be changed using a text editor to modify + CMakeCache.txt as opposed to using definitions on the command line. +- To have cmake create a particular type of build generator file, simply + include the -G option: + $ cmake -G "Unix Makefiles" ... + You can see a list of the generators cmake supports by executing cmake with + no arguments. + +===================== +Instructions to Build +===================== + $ cd libomptarget_top_level/ [ directory with plugins/ , deviceRTLs/ , etc. ] + $ mkdir build + $ cd build + + [ Unix* Libraries ] + $ cmake -DCMAKE_C_COMPILER= -DCMAKE_CXX_COMPILER= .. + + $ make + $ make install + +=========== +Tests +=========== +After the library has been built, there are optional tests that can be +performed. Some will be skipped based upon the platform. +To run the tests: +$ make check-libomptarget + +============= +CMake options +============= +-DCMAKE_C_COMPILER= +Specify the C compiler + +-DCMAKE_CXX_COMPILER= +Specify the C++ compiler + +==== First values listed are the default value ==== +-DCMAKE_BUILD_TYPE=Release|Debug|RelWithDebInfo +Build type can be Release, Debug, or RelWithDebInfo. + +-DLIBOMPTARGET_ENABLE_WERROR=true|false +Whether to treat warnings as errors.
+ +-DLIBOMPTARGET_LLVM_LIT_EXECUTABLE="" +Full path to the llvm-lit tool. Required for testing in out-of-tree builds. + +-DLIBOMPTARGET_FILECHECK_EXECUTABLE="" +Full path to the FileCheck tool. Required for testing in out-of-tree builds. + +-DLIBOMPTARGET_OPENMP_HEADER_FOLDER="" +Path of the folder that contains omp.h. This is required for testing +out-of-tree builds. + +-DLIBOMPTARGET_OPENMP_HOST_RTL_FOLDER="" +Path of the folder that contains libomp.so. This is required for testing +out-of-tree builds. + +==== NVPTX device RTL specific ==== +-DLIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER="" +Host compiler to use with NVCC. This compiler is not going to be used to produce +any binary. Instead, this is used to overcome the input compiler checks done by +NVCC. E.g. if using a default host compiler that is not compatible with NVCC, +this option can be used to pass a valid compiler to NVCC and avoid the error. + +-DLIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY="35" +Comma-separated list of CUDA compute capabilities that should be supported by +the NVPTX device RTL. E.g. for compute capabilities 3.0 and 3.5, the option +"30,35" should be used. + +======================= +Example usages of CMake +======================= +---- Typical usage ---- +cmake -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ .. +cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ .. + +---- Request an NVPTX runtime library that supports compute capability 5.0 ---- +cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DLIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY="50" + +========= +Footnotes +========= +[*] Other names and brands may be claimed as the property of others. Index: libomptarget/CMakeLists.txt =================================================================== --- /dev/null +++ libomptarget/CMakeLists.txt @@ -0,0 +1,124 @@ +##===----------------------------------------------------------------------===## +# +# The LLVM Compiler Infrastructure +# +# This file is dual licensed under the MIT and the University of Illinois Open +# Source Licenses. See LICENSE.txt for details. +# +##===----------------------------------------------------------------------===## +# +# Build offloading library libomptarget.so. +# +##===----------------------------------------------------------------------===## + +# CMAKE libomptarget +cmake_minimum_required(VERSION 2.8 FATAL_ERROR) + +# Add cmake directory to search for custom cmake functions. +set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules ${CMAKE_MODULE_PATH}) + +# Standalone build or part of LLVM? +set(LIBOMPTARGET_STANDALONE_BUILD FALSE) +if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}" OR + "${CMAKE_SOURCE_DIR}/libomptarget" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") + project(libomptarget C CXX) + set(LIBOMPTARGET_STANDALONE_BUILD TRUE) +endif() + + +if(${LIBOMPTARGET_STANDALONE_BUILD}) + set(LIBOMPTARGET_ENABLE_WERROR FALSE CACHE BOOL + "Enable -Werror flags to turn warnings into errors for supporting compilers.") + # CMAKE_BUILD_TYPE was not defined, set default to Release + if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) + endif() +else() + set(LIBOMPTARGET_ENABLE_WERROR ${LLVM_ENABLE_WERROR}) +endif() + +# Compiler flag checks. +include(config-ix) + +# Message utilities. +include(LibomptargetUtils) + +# Get dependencies for the different components of the project. +include(LibomptargetGetDependencies) + +# This is a list of all the targets that are supported/tested right now.
+set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64le-ibm-linux-gnu") +set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64-ibm-linux-gnu") +set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} x86_64-pc-linux-gnu") +set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda") + +# Once the plugins for the different targets are validated, they will be added to +# the list of supported targets in the current system. +set (LIBOMPTARGET_SYSTEM_TARGETS "") + +# Set base directories - required for lit to locate the tests. +set(LIBOMPTARGET_BASE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +set(LIBOMPTARGET_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) + +# We need C++11 support. +if(LIBOMPTARGET_HAVE_STD_CPP11_FLAG) + + # We can only build the offloading library if libelf is available. + if(LIBOMPTARGET_DEP_LIBELF_FOUND) + + libomptarget_say("Building offloading runtime library libomptarget.") + + # Add libelf include directories. + include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS}) + + # Enable support for C++11. + add_definitions(-std=c++11) + + if(LIBOMPTARGET_ENABLE_WERROR AND LIBOMPTARGET_HAVE_WERROR_FLAG) + add_definitions(-Werror) + endif() + + # If building this library in debug mode, we define a macro to enable + # dumping progress messages at runtime. + string( TOLOWER "${CMAKE_BUILD_TYPE}" LIBOMPTARGET_CMAKE_BUILD_TYPE) + if(LIBOMPTARGET_CMAKE_BUILD_TYPE MATCHES debug) + add_definitions(-DOMPTARGET_DEBUG) + add_definitions(-g) + add_definitions(-O0) + endif() + + set(src_files + src/omptarget.cpp + src/targets_info.cpp + ) + + include_directories(src/) + + # Build libomptarget library with libelf and libdl dependencies. + add_library(omptarget SHARED ${src_files}) + target_link_libraries(omptarget + ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES} + dl + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exports") + + # Retrieve the path to the resulting library so that it can be used for + # testing. + get_target_property(LIBOMPTARGET_LIBRARY_DIR omptarget LIBRARY_OUTPUT_DIRECTORY) + if(NOT LIBOMPTARGET_LIBRARY_DIR) + set(LIBOMPTARGET_LIBRARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) + endif() + + # Build offloading plugins and the device RTL libraries if they are available. + add_subdirectory(plugins) + add_subdirectory(deviceRTLs) + + # Add tests. 
+ add_subdirectory(test) + + else(LIBOMPTARGET_DEP_LIBELF_FOUND) + libomptarget_say("Not building offloading runtime library libomptarget: libelf dependency wasn't found.") + endif(LIBOMPTARGET_DEP_LIBELF_FOUND) + +else(LIBOMPTARGET_HAVE_STD_CPP11_FLAG) + libomptarget_say("Not building offloading runtime library libomptarget: host compiler must have c++11 support.") +endif(LIBOMPTARGET_HAVE_STD_CPP11_FLAG) Index: libomptarget/README.txt =================================================================== --- /dev/null +++ libomptarget/README.txt @@ -0,0 +1,72 @@ + + README for the LLVM* OpenMP* Offloading Runtime Library (libomptarget) + ====================================================================== + +How to Build the LLVM* OpenMP* Offloading Runtime Library (libomptarget) +======================================================================== +In-tree build: + +$ cd where-you-want-to-live +Check out openmp (libomptarget lives under ./libomptarget) into llvm/projects +$ cd where-you-want-to-build +$ mkdir build && cd build +$ cmake path/to/llvm -DCMAKE_C_COMPILER= -DCMAKE_CXX_COMPILER= +$ make omptarget + +Out-of-tree build: + +$ cd where-you-want-to-live +Check out openmp (libomptarget lives under ./libomptarget) +$ cd where-you-want-to-live/openmp/libomptarget +$ mkdir build && cd build +$ cmake path/to/openmp -DCMAKE_C_COMPILER= -DCMAKE_CXX_COMPILER= +$ make + +For details about building, please look at Build_With_CMake.txt. + +Architectures Supported +======================= +The current library has only been tested on the Linux operating system, with the +following host architectures: +* Intel(R) 64 architecture +* IBM(R) Power architecture (big endian) +* IBM(R) Power architecture (little endian) + +The currently supported offloading device architectures are: +* Intel(R) 64 architecture (generic 64-bit plugin - mostly for testing purposes) +* IBM(R) Power architecture (big endian) (generic 64-bit plugin - mostly for testing purposes) +* IBM(R) Power architecture (little endian) (generic 64-bit plugin - mostly for testing purposes) +* CUDA(R) enabled 64-bit NVIDIA(R) GPU architectures + +Supported RTL Build Configurations +================================== +Supported Architectures: Intel(R) 64, IBM(R) Power 7 and Power 8 + + --------------------------- + | gcc | clang | +--------------|------------|------------| +| Linux* OS | Yes(1) | Yes(2) | +----------------------------------------- + +(1) gcc version 4.8.2 or later is supported. +(2) clang version 3.7 or later is supported. + + +Front-end Compilers that work with this RTL +=========================================== + +The following compilers are known to generate code compatible with +this RTL: + - clang (from the OpenMP development branch at http://clang-omp.github.io/ ) + - clang (development branch at http://clang.llvm.org - several features still + under development) + +----------------------------------------------------------------------- + +Notices +======= +This library and the related compiler support are still under development, so the +employed interface is likely to change in the future. + +*Other names and brands may be claimed as the property of others.
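For reference, the kind of program this runtime serves is a host application that contains OpenMP target regions. The snippet below is an illustrative sketch only (standard OpenMP 4.0 C, not taken from this patch): the front ends listed above outline the target region and emit the libomptarget calls that map the data and launch the region on the default device, falling back to the host when no device plugin is available.

  #include <stdio.h>

  int main(void) {
    int sum = 0;
    /* The front end outlines this region and generates calls into
       libomptarget that map 'sum' to the default device and run the
       region there (or on the host if no device is available). */
    #pragma omp target map(tofrom: sum)
    for (int i = 0; i < 100; ++i)
      sum += i;
    printf("sum = %d\n", sum);
    return 0;
  }

The exact compiler invocation and offloading flags depend on the clang branch being used and are not prescribed here.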
+ Index: libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake =================================================================== --- /dev/null +++ libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake @@ -0,0 +1,114 @@ +# +#//===----------------------------------------------------------------------===// +#// +#// The LLVM Compiler Infrastructure +#// +#// This file is dual licensed under the MIT and the University of Illinois Open +#// Source Licenses. See LICENSE.txt for details. +#// +#//===----------------------------------------------------------------------===// +# + +# Try to detect in the system several dependencies required by the different +# components of libomptarget. These are the dependencies we have: +# +# libelf : required by some targets to handle the ELF files at runtime. +# libffi : required to launch target kernels given function and argument +# pointers. +# CUDA : required to control offloading to NVIDIA GPUs. + +include (FindPackageHandleStandardArgs) + +################################################################################ +# Looking for libelf... +################################################################################ + +find_path ( + LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR + NAMES + libelf.h + PATHS + /usr/include + /usr/local/include + /opt/local/include + /sw/include + ENV CPATH + PATH_SUFFIXES + libelf) + +find_library ( + LIBOMPTARGET_DEP_LIBELF_LIBRARIES + NAMES + elf + PATHS + /usr/lib + /usr/local/lib + /opt/local/lib + /sw/lib + ENV LIBRARY_PATH + ENV LD_LIBRARY_PATH) + +set(LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS ${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR}) +find_package_handle_standard_args( + LIBOMPTARGET_DEP_LIBELF + DEFAULT_MSG + LIBOMPTARGET_DEP_LIBELF_LIBRARIES + LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS) + +mark_as_advanced( + LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS + LIBOMPTARGET_DEP_LIBELF_LIBRARIES) + +################################################################################ +# Looking for libffi... +################################################################################ + +find_path ( + LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR + NAMES + ffi.h + PATHS + /usr/include + /usr/local/include + /opt/local/include + /sw/include + ENV CPATH + PATH_SUFFIXES + libffi) + +find_library ( + LIBOMPTARGET_DEP_LIBFFI_LIBRARIES + NAMES + ffi + PATHS + /usr/lib + /usr/local/lib + /opt/local/lib + /sw/lib + ENV LIBRARY_PATH + ENV LD_LIBRARY_PATH) + +set(LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIRS ${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR}) +find_package_handle_standard_args( + LIBOMPTARGET_DEP_LIBFFI + DEFAULT_MSG + LIBOMPTARGET_DEP_LIBFFI_LIBRARIES + LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIRS) + +mark_as_advanced( + LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIRS + LIBOMPTARGET_DEP_LIBFFI_LIBRARIES) + +################################################################################ +# Looking for CUDA... 
+################################################################################ +find_package(CUDA QUIET) + +set(LIBOMPTARGET_DEP_CUDA_FOUND ${CUDA_FOUND}) +set(LIBOMPTARGET_DEP_CUDA_LIBRARIES ${CUDA_LIBRARIES}) +set(LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS ${CUDA_INCLUDE_DIRS}) + +mark_as_advanced( + LIBOMPTARGET_DEP_CUDA_FOUND + LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS + LIBOMPTARGET_DEP_CUDA_LIBRARIES) Index: libomptarget/cmake/Modules/LibomptargetUtils.cmake =================================================================== --- /dev/null +++ libomptarget/cmake/Modules/LibomptargetUtils.cmake @@ -0,0 +1,28 @@ +# +#//===----------------------------------------------------------------------===// +#// +#// The LLVM Compiler Infrastructure +#// +#// This file is dual licensed under the MIT and the University of Illinois Open +#// Source Licenses. See LICENSE.txt for details. +#// +#//===----------------------------------------------------------------------===// +# + +# void libomptarget_say(string message_to_user); +# - prints out message_to_user +macro(libomptarget_say message_to_user) + message(STATUS "LIBOMPTARGET: ${message_to_user}") +endmacro() + +# void libomptarget_warning_say(string message_to_user); +# - prints out message_to_user with a warning +macro(libomptarget_warning_say message_to_user) + message(WARNING "LIBOMPTARGET: ${message_to_user}") +endmacro() + +# void libomptarget_error_say(string message_to_user); +# - prints out message_to_user with an error and exits cmake +macro(libomptarget_error_say message_to_user) + message(FATAL_ERROR "LIBOMPTARGET: ${message_to_user}") +endmacro() Index: libomptarget/cmake/Modules/config-ix.cmake =================================================================== --- /dev/null +++ libomptarget/cmake/Modules/config-ix.cmake @@ -0,0 +1,17 @@ +# +#//===----------------------------------------------------------------------===// +#// +#// The LLVM Compiler Infrastructure +#// +#// This file is dual licensed under the MIT and the University of Illinois Open +#// Source Licenses. See LICENSE.txt for details. +#// +#//===----------------------------------------------------------------------===// +# + +include(CheckCCompilerFlag) +include(CheckCXXCompilerFlag) + +# Checking C, CXX +check_cxx_compiler_flag(-std=c++11 LIBOMPTARGET_HAVE_STD_CPP11_FLAG) +check_c_compiler_flag(-Werror LIBOMPTARGET_HAVE_WERROR_FLAG) Index: libomptarget/deviceRTLs/CMakeLists.txt =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/CMakeLists.txt @@ -0,0 +1,14 @@ +##===----------------------------------------------------------------------===## +# +# The LLVM Compiler Infrastructure +# +# This file is dual licensed under the MIT and the University of Illinois Open +# Source Licenses. See LICENSE.txt for details. +# +# ##===----------------------------------------------------------------------===## +# +# Build a device RTL for each available machine available. +# +##===----------------------------------------------------------------------===## + +add_subdirectory(nvptx) Index: libomptarget/deviceRTLs/nvptx/CMakeLists.txt =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/CMakeLists.txt @@ -0,0 +1,90 @@ +##===----------------------------------------------------------------------===## +# +# The LLVM Compiler Infrastructure +# +# This file is dual licensed under the MIT and the University of Illinois Open +# Source Licenses. See LICENSE.txt for details. 
+# +##===----------------------------------------------------------------------===## +# +# Build the NVPTX (CUDA) Device RTL if the CUDA tools are available +# +##===----------------------------------------------------------------------===## + +set(LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER "" CACHE STRING + "Path to alternate NVCC host compiler to be used by the NVPTX device RTL.") + +if(LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER) + find_program(CUDA_HOST_COMPILER NAMES ${LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER}) + if(NOT CUDA_HOST_COMPILER) + libomptarget_say("Not building CUDA offloading device RTL: invalid NVPTX alternate host compiler.") + endif() +endif() + +# We can't use clang as the NVCC host compiler, so we attempt to replace it with +# gcc. +if(CUDA_HOST_COMPILER MATCHES "(clang)|(.*/clang)$") + + find_program(LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER NAMES gcc) + + if(NOT LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER) + libomptarget_say("Not building CUDA offloading device RTL: clang is not supported as NVCC host compiler.") + libomptarget_say("Please include gcc in your path or set LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER to the full path of a valid compiler.") + return() + endif() + set(CUDA_HOST_COMPILER "${LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER}") +endif() + +if(LIBOMPTARGET_DEP_CUDA_FOUND) + libomptarget_say("Building CUDA offloading device RTL.") + + # We really don't have any host code, so we don't need to care about + # propagating host flags. + set(CUDA_PROPAGATE_HOST_FLAGS OFF) + + set(cuda_src_files + src/cancel.cu + src/critical.cu + src/debug.cu + src/libcall.cu + src/loop.cu + src/omptarget-nvptx.cu + src/parallel.cu + src/reduction.cu + src/stdio.cu + src/sync.cu + src/task.cu + ) + + include_directories( + src/ + ) + + # Get all the compute capabilities the user requested or use SM_35 by default. + if(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY) + string(REPLACE "," ";" nvptx_sm_list ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY}) + foreach(sm ${nvptx_sm_list}) + set(CUDA_ARCH ${CUDA_ARCH} -gencode arch=compute_${sm},code=sm_${sm}) + endforeach() + else() + set(CUDA_ARCH -arch sm_35) + endif() + + # Activate RTL message dumps if requested by the user. + if(LIBOMPTARGET_NVPTX_DEBUG) + set(CUDA_DEBUG -DOMPTARGET_NVPTX_DEBUG=-1 -g --ptxas-options=-v) + endif() + + # NVPTX runtime library has to be statically linked. Dynamic linking is not + # yet supported by the CUDA toolchain on the device. + set(BUILD_SHARED_LIBS OFF) + set(CUDA_SEPARABLE_COMPILATION ON) + + cuda_add_library(omptarget-nvptx STATIC ${cuda_src_files} + OPTIONS ${CUDA_ARCH} ${CUDA_DEBUG}) + + target_link_libraries(omptarget-nvptx ${CUDA_LIBRARIES}) + +else() + libomptarget_say("Not building CUDA offloading device RTL: CUDA tools not found in the system.") +endif() Index: libomptarget/deviceRTLs/nvptx/src/cancel.cu =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/cancel.cu @@ -0,0 +1,34 @@ +//===------ cancel.cu - NVPTX OpenMP cancel interface ------------ CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// Interface to be used in the implementation of OpenMP cancel.
+// +//===----------------------------------------------------------------------===// + +#include "../../../deviceRTLs/nvptx/src/omptarget-nvptx.h" + +EXTERN int32_t __kmpc_cancellationpoint( + kmp_Indent* loc, + int32_t global_tid, + int32_t cancelVal) +{ + PRINT(LD_IO, "call kmpc_cancellationpoint(cancel val %d)\n", cancelVal); + // disabled + return FALSE; +} + +EXTERN int32_t __kmpc_cancel( + kmp_Indent* loc, + int32_t global_tid, + int32_t cancelVal) +{ + PRINT(LD_IO, "call kmpc_cancel(cancel val %d)\n", cancelVal); + // disabled + return FALSE; +} Index: libomptarget/deviceRTLs/nvptx/src/counter_group.h =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/counter_group.h @@ -0,0 +1,56 @@ +//===------ counter_group.h - NVPTX OpenMP loop scheduling ------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// Interface to implement OpenMP loop scheduling +// +//===----------------------------------------------------------------------===// + +#ifndef SRC_COUNTER_GROUP_H_ +#define SRC_COUNTER_GROUP_H_ + +#include +#include + +#include + +// counter group type for synchronizations +class omptarget_nvptx_CounterGroup +{ +public: + // getters and setters + INLINE Counter & Event () { return v_event; } + INLINE volatile Counter & Start () { return v_start; } + INLINE Counter & Init () { return v_init; } + + // Synchronization Interface + + INLINE void Clear(); // first time start=event + INLINE void Reset(); // init = first + INLINE void Init(Counter & priv); // priv = init + INLINE Counter Next(); // just counts number of events + + // set priv to n, to be used in later waitOrRelease + INLINE void Complete(Counter & priv, Counter n); + + // check priv and decide if we have to wait or can free the other warps + INLINE void Release(Counter priv, Counter current_event_value); + INLINE void WaitOrRelease(Counter priv, Counter current_event_value); + +private: + Counter v_event; // counter of events (atomic) + + // volatile is needed to force loads to read from global + // memory or L2 cache and see the write by the last master + volatile Counter v_start; // signal when events registered are finished + + Counter v_init; // used to initialize local thread variables +}; + + +#endif /* SRC_COUNTER_GROUP_H_ */ Index: libomptarget/deviceRTLs/nvptx/src/counter_groupi.h =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/counter_groupi.h @@ -0,0 +1,89 @@ +//===----- counter_groupi.h - NVPTX OpenMP loop scheduling ------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. 
+// +//===----------------------------------------------------------------------===// +// +// Interface implementation for OpenMP loop scheduling +// +//===----------------------------------------------------------------------===// + +#include +#include "option.h" + +INLINE void omptarget_nvptx_CounterGroup::Clear() +{ + PRINT0(LD_SYNCD, "clear counters\n") + v_event = 0; + v_start = 0; + // v_init does not need to be reset (its value is dead) +} + +INLINE void omptarget_nvptx_CounterGroup::Reset() +{ + // done by master before entering parallel + ASSERT(LT_FUSSY, v_event==v_start, + "error, entry %lld !=start %lld at reset\n", P64(v_event), P64(v_start)); + v_init = v_start; +} + +INLINE void omptarget_nvptx_CounterGroup::Init(Counter & priv) +{ + PRINT(LD_SYNCD, "init priv counter 0x%llx with val %lld\n", + P64(&priv), P64(v_start)); + priv = v_start; +} + +// just counts number of events +INLINE Counter omptarget_nvptx_CounterGroup::Next() +{ + Counter oldVal = atomicAdd(&v_event, (Counter) 1); + PRINT(LD_SYNCD, "next event counter 0x%llx with val %lld->%lld\n", + P64(&v_event), P64(oldVal), P64(oldVal+1)); + + return oldVal; +} + +//set priv to n, to be used in later waitOrRelease +INLINE void omptarget_nvptx_CounterGroup::Complete(Counter & priv, Counter n) +{ + PRINT(LD_SYNCD, "complete priv counter 0x%llx with val %lld->%lld (+%d)\n", + P64(&priv), P64(priv), P64(priv+n), n); + priv += n; +} + +INLINE void omptarget_nvptx_CounterGroup::Release ( + Counter priv, + Counter current_event_value) +{ + if (priv - 1 == current_event_value) { + PRINT(LD_SYNCD, "Release start counter 0x%llx with val %lld->%lld\n", + P64(&v_start), P64(v_start), P64(priv)); + v_start = priv; + } +} + +// check priv and decide if we have to wait or can free the other warps +INLINE void omptarget_nvptx_CounterGroup::WaitOrRelease ( + Counter priv, + Counter current_event_value) +{ + if (priv - 1 == current_event_value) { + PRINT(LD_SYNCD, "Release start counter 0x%llx with val %lld->%lld\n", + P64(&v_start), P64(v_start), P64(priv)); + v_start = priv; + } else { + PRINT(LD_SYNCD, "Start waiting while start counter 0x%llx with val %lld < %lld\n", + P64(&v_start), P64(v_start), P64(priv)); + while (priv > v_start) { + // IDLE LOOP + // start is volatile: it will be re-loaded at each while loop + } + PRINT(LD_SYNCD, "Done waiting as start counter 0x%llx with val %lld >= %lld\n", + P64(&v_start), P64(v_start), P64(priv)); + } +} + Index: libomptarget/deviceRTLs/nvptx/src/critical.cu =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/critical.cu @@ -0,0 +1,40 @@ +//===------ critical.cu - NVPTX OpenMP critical ------------------ CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of critical with KMPC interface +// +//===----------------------------------------------------------------------===// + +#include +#include + +#include "../../../deviceRTLs/nvptx/src/omptarget-nvptx.h" + +EXTERN +void __kmpc_critical(kmp_Indent *loc, int32_t global_tid, kmp_CriticalName *lck) +{ + PRINT0(LD_IO, "call to kmpc_critical()\n"); + omptarget_nvptx_TeamDescr & teamDescr = getMyTeamDescriptor(); + omp_set_lock(teamDescr.CriticalLock()); +} + +EXTERN +void __kmpc_end_critical( kmp_Indent *loc, int32_t global_tid, kmp_CriticalName *lck ) +{ + PRINT0(LD_IO, "call to kmpc_end_critical()\n"); + omptarget_nvptx_TeamDescr & teamDescr = getMyTeamDescriptor(); + omp_unset_lock(teamDescr.CriticalLock()); +} + + + + + + + Index: libomptarget/deviceRTLs/nvptx/src/debug.h =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/debug.h @@ -0,0 +1,240 @@ +//===------------- debug.h - NVPTX OpenMP debug macros ----------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains debug macros to be used in the application. +// +// Usage guide +// +// PRINT0(flag, str) : if debug flag is on, print (no arguments) +// PRINT(flag, str, args) : if debug flag is on, print (arguments) +// DON(flag) : return true if debug flag is on +// +// ASSERT(flag, cond, str, args): if test flag is on, test the condition +// if the condition is false, print str+args +// and assert. 
+// CAUTION: cond may be evaluate twice +// AON(flag) : return true if test flag is on +// +// WARNING(flag, str, args) : if warning flag is on, print the warning +// WON(flag) : return true if warning flag is on +// +//===----------------------------------------------------------------------===// + +#ifndef _OMPTARGET_NVPTX_DEBUG_H_ +#define _OMPTARGET_NVPTX_DEBUG_H_ + +//////////////////////////////////////////////////////////////////////////////// +// set desired level of debugging +//////////////////////////////////////////////////////////////////////////////// + +#define LD_SET_NONE 0ULL /* none */ +#define LD_SET_ALL -1ULL /* all */ + +// pos 1 +#define LD_SET_LOOP 0x1ULL /* basic loop */ +#define LD_SET_LOOPD 0x2ULL /* basic loop */ +#define LD_SET_PAR 0x4ULL /* basic parallel */ +#define LD_SET_PARD 0x8ULL /* basic parallel */ + +// pos 2 +#define LD_SET_SYNC 0x10ULL /* sync info */ +#define LD_SET_SYNCD 0x20ULL /* sync info */ +#define LD_SET_WAIT 0x40ULL /* state when waiting */ +#define LD_SET_TASK 0x80ULL /* print task info (high level) */ + +// pos 3 +#define LD_SET_IO 0x100ULL /* big region io (excl atomic) */ +#define LD_SET_IOD 0x200ULL /* big region io (excl atomic) */ +#define LD_SET_ENV 0x400ULL /* env info */ +#define LD_SET_CANCEL 0x800ULL /* print cancel info */ + +// pos 4 +#define LD_SET_MEM 0x1000ULL /* malloc / free */ + + + +//////////////////////////////////////////////////////////////////////////////// +// set the desired flags to print selected output + +//#define OMPTARGET_NVPTX_DEBUG (LD_SET_ALL) +//#define OMPTARGET_NVPTX_DEBUG (LD_SET_LOOP) // limit to loop printfs to save on cuda buffer +//#define OMPTARGET_NVPTX_DEBUG (LD_SET_IO) +//#define OMPTARGET_NVPTX_DEBUG (LD_SET_IO | LD_SET_ENV) +//#define OMPTARGET_NVPTX_DEBUG (LD_SET_PAR) + +#ifndef OMPTARGET_NVPTX_DEBUG + #define OMPTARGET_NVPTX_DEBUG LD_SET_NONE +#elif OMPTARGET_NVPTX_DEBUG + #warning debug is used, not good for measurements +#endif + +//////////////////////////////////////////////////////////////////////////////// +// set desired level of asserts +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// available flags + +#define LT_SET_NONE 0x0 /* unsafe */ +#define LT_SET_SAFETY 0x1 /* check malloc type of stuff, input at creation, cheap */ +#define LT_SET_INPUT 0x2 /* check also all runtime inputs */ +#define LT_SET_FUSSY 0x4 /* fussy checks, expensive */ + +//////////////////////////////////////////////////////////////////////////////// +// set the desired flags + +#ifndef OMPTARGET_NVPTX_TEST + #if OMPTARGET_NVPTX_DEBUG + #define OMPTARGET_NVPTX_TEST (LT_SET_FUSSY) + #else + #define OMPTARGET_NVPTX_TEST (LT_SET_SAFETY) + #endif +#endif + +//////////////////////////////////////////////////////////////////////////////// +// set desired level of warnings +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// available flags + +#define LW_SET_ALL -1 +#define LW_SET_NONE 0x0 +#define LW_SET_ENV 0x1 +#define LW_SET_INPUT 0x2 +#define LW_SET_FUSSY 0x4 + +//////////////////////////////////////////////////////////////////////////////// +// set the desired flags + +#if OMPTARGET_NVPTX_DEBUG + #define OMPTARGET_NVPTX_WARNING (LW_SET_NONE) +#else + #define OMPTARGET_NVPTX_WARNING (LW_SET_FUSSY) +#endif + + 
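As a usage sketch (illustrative only, not part of this patch): the RTL sources combine the flag sets above with the macros described in the usage guide, e.g. cancel.cu calls PRINT(LD_IO, ...) and counter_groupi.h calls ASSERT(LT_FUSSY, ...). A hypothetical device-side helper could look as follows; 'exampleDispatch' and its arguments are invented for illustration, and INLINE is the same helper macro already used by counter_group.h.

  // Illustrative only: typical use of the debug, assert and warning macros.
  INLINE void exampleDispatch(int32_t tid, int64_t chunk) {
    PRINT0(LD_IO, "enter example dispatch\n");
    PRINT(LD_LOOP, "tid %d requested chunk %lld\n", tid, P64(chunk));
    ASSERT(LT_INPUT, chunk > 0, "invalid chunk %lld\n", P64(chunk));
    if (chunk > 1024) {
      WARNING(LW_INPUT, "large chunk %lld may limit parallelism\n", P64(chunk));
    }
  }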
+//////////////////////////////////////////////////////////////////////////////// +// implemtation for debug +//////////////////////////////////////////////////////////////////////////////// + +#if OMPTARGET_NVPTX_DEBUG || OMPTARGET_NVPTX_TEST || OMPTARGET_NVPTX_WARNING + #include +#endif +#if OMPTARGET_NVPTX_TEST + #include +#endif + +// set flags that are tested (inclusion properties) + +#define LD_ALL (LD_SET_ALL) + +#define LD_LOOP (LD_SET_LOOP | LD_SET_LOOPD) +#define LD_LOOPD (LD_SET_LOOPD) +#define LD_PAR (LD_SET_PAR | LD_SET_PARD) +#define LD_PARD (LD_SET_PARD) + +// pos 2 +#define LD_SYNC (LD_SET_SYNC | LD_SET_SYNCD) +#define LD_SYNCD (LD_SET_SYNCD) +#define LD_WAIT (LD_SET_WAIT) +#define LD_TASK (LD_SET_TASK) + +// pos 3 +#define LD_IO (LD_SET_IO | LD_SET_IOD) +#define LD_IOD (LD_SET_IOD) +#define LD_ENV (LD_SET_ENV) +#define LD_CANCEL (LD_SET_CANCEL) + +// pos 3 +#define LD_MEM (LD_SET_MEM) + +// implement +#if OMPTARGET_NVPTX_DEBUG + + #define DON(_flag) ((OMPTARGET_NVPTX_DEBUG) & (_flag)) + + #define PRINT0(_flag, _str) { if (DON(_flag)) { \ + printf(": " _str, blockIdx.x, threadIdx.x, \ + threadIdx.x / warpSize, threadIdx.x & 0x1F); }} + + #define PRINT(_flag, _str, _args...) { if (DON(_flag)) { \ + printf(": " _str, blockIdx.x, threadIdx.x, \ + threadIdx.x / warpSize, threadIdx.x & 0x1F, _args); }} +#else + + #define DON(_flag) (FALSE) + #define PRINT0(flag, str) + #define PRINT(flag, str, _args...) + +#endif + +// for printing without worring about precision, pointers... +#define P64(_x) ((unsigned long long)(_x)) + +//////////////////////////////////////////////////////////////////////////////// +// early defs for test +//////////////////////////////////////////////////////////////////////////////// + +#define LT_SAFETY (LT_SET_SAFETY | LT_SET_INPUT | LT_SET_FUSSY) +#define LT_INPUT (LT_SET_INPUT | LT_SET_FUSSY) +#define LT_FUSSY (LT_SET_FUSSY) + +#if OMPTARGET_NVPTX_TEST == LT_SET_SAFETY + + #define TON(_flag) ((OMPTARGET_NVPTX_TEST) & (_flag)) + #define ASSERT0(_flag, _cond, _str) { if (TON(_flag)) {assert(_cond);}} + #define ASSERT(_flag, _cond, _str, _args...) { if (TON(_flag)) {assert(_cond);}} + +#elif OMPTARGET_NVPTX_TEST >= LT_SET_INPUT + + #define TON(_flag) ((OMPTARGET_NVPTX_TEST) & (_flag)) + #define ASSERT0(_flag, _cond, _str) { if (TON(_flag) && !(_cond)) { \ + printf(" ASSERT: " _str "\n", blockIdx.x, threadIdx.x, \ + threadIdx.x / warpSize, threadIdx.x & 0x1F); assert(_cond);}} + #define ASSERT(_flag, _cond, _str, _args...) { if (TON(_flag) && !(_cond)) { \ + printf(" ASSERT: " _str "\n", blockIdx.x, threadIdx.x, \ + threadIdx.x / warpSize, threadIdx.x & 0x1F, _args); assert(_cond);}} + +#else + + #define TON(_flag) (FALSE) + #define ASSERT0(_flag, _cond, _str) + #define ASSERT(_flag, _cond, _str, _args...) + +#endif + +//////////////////////////////////////////////////////////////////////////////// +// early defs for warning + +#define LW_ALL (LW_SET_ALL) +#define LW_ENV (LW_SET_FUSSY | LW_SET_INPUT | LW_SET_ENV) +#define LW_INPUT (LW_SET_FUSSY | LW_SET_INPUT) +#define LW_FUSSY (LW_SET_FUSSY) + +#if OMPTARGET_NVPTX_WARNING + + #define WON(_flag) ((OMPTARGET_NVPTX_WARNING) & (_flag)) + #define WARNING0(_flag, _str) { if (WON(_flag)) { \ + printf(" WARNING: " _str, blockIdx.x, threadIdx.x, \ + threadIdx.x / warpSize, threadIdx.x & 0x1F); }} + #define WARNING(_flag, _str, _args...) 
{ if (WON(_flag)) { \ + printf(" WARNING: " _str, blockIdx.x, threadIdx.x, \ + threadIdx.x / warpSize, threadIdx.x & 0x1F, _args); }} + +#else + + #define WON(_flag) (FALSE) + #define WARNING0(_flag, _str) + #define WARNING(_flag, _str, _args...) + +#endif + + +#endif Index: libomptarget/deviceRTLs/nvptx/src/debug.cu =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/debug.cu @@ -0,0 +1,78 @@ +//===------------ debug.cu - NVPTX OpenMP debug utilities -------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of debug utilities to be +// used in the application. +// +//===----------------------------------------------------------------------===// + +#include "../../../deviceRTLs/nvptx/src/omptarget-nvptx.h" + +//////////////////////////////////////////////////////////////////////////////// +// print current state +//////////////////////////////////////////////////////////////////////////////// + +NOINLINE void PrintTaskDescr(omptarget_nvptx_TaskDescr *taskDescr, char *title, int level) +{ + omp_sched_t sched = taskDescr->GetRuntimeSched(); + PRINT(LD_ALL, "task descr %s %d: %s, in par %d, dyn %d, rt sched %d, chunk %lld;" + " tid %d, tnum %d, nthreads %d\n", + title, level, (taskDescr->IsParallelConstruct()?"par":"task"), + taskDescr->InParallelRegion(), taskDescr->IsDynamic(), + sched, taskDescr->RuntimeChunkSize(), + taskDescr->ThreadId(), taskDescr->ThreadsInTeam(), taskDescr->NThreads()); +} + +//////////////////////////////////////////////////////////////////////////////// +// debug for compiler (should eventually all vanish) +//////////////////////////////////////////////////////////////////////////////// + +EXTERN void __kmpc_print_str(char *title) +{ + PRINT(LD_ALL, " %s\n", title); +} + +EXTERN void __kmpc_print_title_int(char *title, int data) +{ + PRINT(LD_ALL, "%s val=%d\n", title, data); +} + +EXTERN void __kmpc_print_index(char *title, int i) +{ + PRINT(LD_ALL, "i = %d\n", i); +} + +EXTERN void __kmpc_print_int(int data) +{ + PRINT(LD_ALL, "val=%d\n", data); +} + +EXTERN void __kmpc_print_double(double data) +{ + PRINT(LD_ALL, "val=%lf\n", data); +} + +EXTERN void __kmpc_print_address_int64(int64_t data) +{ + PRINT(LD_ALL, "val=%016llx\n", data); +} + +//////////////////////////////////////////////////////////////////////////////// +// substitute for printf in kernel (should vanish) +//////////////////////////////////////////////////////////////////////////////// + +EXTERN void __kmpc_kernel_print(char *title) +{ + PRINT(LD_ALL, " %s\n", title); +} + +EXTERN void __kmpc_kernel_print_int8(char *title, int64_t data) +{ + PRINT(LD_ALL, " %s val=%lld\n", title, data); +} Index: libomptarget/deviceRTLs/nvptx/src/interface.h =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/interface.h @@ -0,0 +1,658 @@ +//===------- interface.h - NVPTX OpenMP interface definitions ---- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file contains debug macros to be used in the application. +// +// This file contains all the definitions that are relevant to +// the interface. The first section contains the interface as +// declared by OpenMP. A second section includes library private calls +// (mostly debug, temporary?) The third section includes the compiler +// specific interfaces. +// +//===----------------------------------------------------------------------===// + +#ifndef _INTERFACES_H_ +#define _INTERFACES_H_ + +//////////////////////////////////////////////////////////////////////////////// +// OpenMP interface +//////////////////////////////////////////////////////////////////////////////// + +typedef uint32_t omp_lock_t; /* arbitrary type of the right length */ +typedef uint64_t omp_nest_lock_t; /* arbitrary type of the right length */ + +typedef enum omp_sched_t { + omp_sched_static = 1, /* chunkSize >0 */ + omp_sched_dynamic = 2, /* chunkSize >0 */ + omp_sched_guided = 3, /* chunkSize >0 */ + omp_sched_auto = 4, /* no chunkSize */ +} omp_sched_t; + +typedef enum omp_proc_bind_t { + omp_proc_bind_false = 0, + omp_proc_bind_true = 1, + omp_proc_bind_master = 2, + omp_proc_bind_close = 3, + omp_proc_bind_spread = 4 +} omp_proc_bind_t; + + +EXTERN double omp_get_wtick(void); +EXTERN double omp_get_wtime(void); + +EXTERN void omp_set_num_threads(int num); +EXTERN int omp_get_num_threads(void); +EXTERN int omp_get_max_threads(void); +EXTERN int omp_get_thread_limit(void); +EXTERN int omp_get_thread_num(void); +EXTERN int omp_get_num_procs(void); +EXTERN int omp_in_parallel(void); +EXTERN int omp_in_final(void); +EXTERN void omp_set_dynamic(int flag); +EXTERN int omp_get_dynamic(void); +EXTERN void omp_set_nested(int flag); +EXTERN int omp_get_nested(void); +EXTERN void omp_set_max_active_levels(int level); +EXTERN int omp_get_max_active_levels(void); +EXTERN int omp_get_level(void); +EXTERN int omp_get_active_level(void); +EXTERN int omp_get_ancestor_thread_num(int level); +EXTERN int omp_get_team_size(int level); + +EXTERN void omp_init_lock(omp_lock_t *lock); +EXTERN void omp_init_nest_lock(omp_nest_lock_t *lock); +EXTERN void omp_destroy_lock(omp_lock_t *lock); +EXTERN void omp_destroy_nest_lock(omp_nest_lock_t *lock); +EXTERN void omp_set_lock(omp_lock_t *lock); +EXTERN void omp_set_nest_lock(omp_nest_lock_t *lock); +EXTERN void omp_unset_lock(omp_lock_t *lock); +EXTERN void omp_unset_nest_lock(omp_nest_lock_t *lock); +EXTERN int omp_test_lock(omp_lock_t *lock); +EXTERN int omp_test_nest_lock(omp_nest_lock_t *lock); + +EXTERN void omp_get_schedule(omp_sched_t * kind, int * modifier); +EXTERN void omp_set_schedule(omp_sched_t kind, int modifier); +EXTERN omp_proc_bind_t omp_get_proc_bind(void); +EXTERN int omp_get_cancellation(void); +EXTERN void omp_set_default_device(int deviceId); +EXTERN int omp_get_default_device(void); +EXTERN int omp_get_num_devices(void); +EXTERN int omp_get_num_teams(void); +EXTERN int omp_get_team_num(void); +EXTERN int omp_is_initial_device(void); + + +//////////////////////////////////////////////////////////////////////////////// +// OMPTARGET_NVPTX private (debug / temportary?) 
interface +//////////////////////////////////////////////////////////////////////////////// + +// for debug +EXTERN void __kmpc_print_str(char *title); +EXTERN void __kmpc_print_title_int(char *title, int data); +EXTERN void __kmpc_print_index(char *title, int i); +EXTERN void __kmpc_print_int(int data); +EXTERN void __kmpc_print_double(double data); +EXTERN void __kmpc_print_address_int64(int64_t data); + +//////////////////////////////////////////////////////////////////////////////// +// file below is swiped from kmpc host interface +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// kmp specifc types +//////////////////////////////////////////////////////////////////////////////// + +typedef enum kmp_sched_t { + kmp_sched_static_chunk = 33, + kmp_sched_static_nochunk = 34, + kmp_sched_dynamic = 35, + kmp_sched_guided = 36, + kmp_sched_runtime = 37, + kmp_sched_auto = 38, + + kmp_sched_static_ordered = 65, + kmp_sched_static_nochunk_ordered = 66, + kmp_sched_dynamic_ordered = 67, + kmp_sched_guided_ordered = 68, + kmp_sched_runtime_ordered = 69, + kmp_sched_auto_ordered = 70, + + kmp_sched_distr_static_chunk = 91, + kmp_sched_distr_static_nochunk = 92, + + kmp_sched_default = kmp_sched_static_nochunk, + kmp_sched_unordered_first = kmp_sched_static_chunk, + kmp_sched_unordered_last = kmp_sched_auto, + kmp_sched_ordered_first = kmp_sched_static_ordered, + kmp_sched_ordered_last = kmp_sched_auto_ordered, + kmp_sched_distribute_first = kmp_sched_distr_static_chunk, + kmp_sched_distribute_last = kmp_sched_distr_static_nochunk +} kmp_sched_t; + + +// parallel defs +typedef void kmp_Indent; +typedef void (* kmp_ParFctPtr)(int32_t *global_tid, int32_t *bound_tid, ...); +typedef void (* kmp_ReductFctPtr)(void *lhsData, void *rhsData); + +// task defs +typedef struct kmp_TaskDescr kmp_TaskDescr; +typedef int32_t (* kmp_TaskFctPtr)(int32_t global_tid, kmp_TaskDescr *taskDescr); +typedef struct kmp_TaskDescr { + void *sharedPointerTable; // ptr to a table of shared var ptrs + kmp_TaskFctPtr sub; // task subroutine + int32_t partId; // unused + kmp_TaskFctPtr destructors; // destructor of c++ first private +} kmp_TaskDescr; +// task dep defs +#define KMP_TASKDEP_IN 0x1u +#define KMP_TASKDEP_OUT 0x2u +typedef struct kmp_TaskDep_Public { + void *addr; + size_t len; + uint8_t flags; // bit 0: in, bit 1: out +} kmp_TaskDep_Public; + +// flags that interpret the interface part of tasking flags +#define KMP_TASK_IS_TIED 0x1 +#define KMP_TASK_FINAL 0x2 +#define KMP_TASK_MERGED_IF0 0x4 /* unused */ +#define KMP_TASK_DESTRUCTOR_THUNK 0x8 + +// flags for task setup return +#define KMP_CURRENT_TASK_NOT_SUSPENDED 0 +#define KMP_CURRENT_TASK_SUSPENDED 1 + +// sync defs +typedef int32_t kmp_CriticalName[8]; + + +//////////////////////////////////////////////////////////////////////////////// +// flags for kstate (all bits initially off) +//////////////////////////////////////////////////////////////////////////////// + +// first 2 bits used by kmp_Reduction (defined in kmp_reduction.cpp) +#define KMP_REDUCTION_MASK 0x3 +#define KMP_SKIP_NEXT_CALL 0x4 +#define KMP_SKIP_NEXT_CANCEL_BARRIER 0x8 + + +//////////////////////////////////////////////////////////////////////////////// +// data +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// external interface 
+//////////////////////////////////////////////////////////////////////////////// + +// query +EXTERN int32_t __kmpc_global_thread_num(kmp_Indent *loc); // missing +EXTERN int32_t __kmpc_global_num_threads(kmp_Indent *loc); // missing +EXTERN int32_t __kmpc_bound_thread_num(kmp_Indent *loc); // missing +EXTERN int32_t __kmpc_bound_num_threads(kmp_Indent *loc); // missing +EXTERN int32_t __kmpc_in_parallel(kmp_Indent *loc); // missing + +// parallel +EXTERN void __kmpc_push_num_threads(kmp_Indent *loc, int32_t global_tid, int32_t num_threads); +//aee ... not supported +//EXTERN void __kmpc_fork_call(kmp_Indent *loc, int32_t argc, kmp_ParFctPtr microtask, ...); +EXTERN void __kmpc_serialized_parallel(kmp_Indent *loc, uint32_t global_tid); +EXTERN void __kmpc_end_serialized_parallel(kmp_Indent *loc, uint32_t global_tid); + +// for static (no chunk or chunk) +EXTERN void __kmpc_for_static_init_4(kmp_Indent *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter, int32_t *plower, int32_t *pupper, + int32_t *pstride, int32_t incr, int32_t chunk); +EXTERN void __kmpc_for_static_init_4u(kmp_Indent *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter, uint32_t *plower, uint32_t *pupper, + int32_t *pstride, int32_t incr, int32_t chunk); +EXTERN void __kmpc_for_static_init_8(kmp_Indent *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter, int64_t *plower, int64_t *pupper, + int64_t *pstride, int64_t incr, int64_t chunk); +EXTERN void __kmpc_for_static_init_8u(kmp_Indent *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter1, uint64_t *plower, uint64_t *pupper, + int64_t *pstride, int64_t incr, int64_t chunk); + +EXTERN void __kmpc_for_static_fini(kmp_Indent *loc, int32_t global_tid); + +// for dynamic +EXTERN void __kmpc_dispatch_init_4(kmp_Indent *loc, int32_t global_tid, + int32_t sched, int32_t lower, int32_t upper, int32_t incr, + int32_t chunk); +EXTERN void __kmpc_dispatch_init_4u(kmp_Indent *loc, int32_t global_tid, + int32_t sched, uint32_t lower, uint32_t upper, int32_t incr, + int32_t chunk); +EXTERN void __kmpc_dispatch_init_8(kmp_Indent *loc, int32_t global_tid, + int32_t sched, int64_t lower, int64_t upper, int64_t incr, + int64_t chunk); +EXTERN void __kmpc_dispatch_init_8u(kmp_Indent *loc, int32_t global_tid, + int32_t sched, uint64_t lower, uint64_t upper, int64_t incr, + int64_t chunk); + +EXTERN int __kmpc_dispatch_next_4(kmp_Indent *loc, int32_t global_tid, + int32_t *plastiter, int32_t *plower, int32_t *pupper, int32_t *pstride); +EXTERN int __kmpc_dispatch_next_4u(kmp_Indent *loc, int32_t global_tid, + int32_t *plastiter, uint32_t *plower, uint32_t *pupper, int32_t *pstride); +EXTERN int __kmpc_dispatch_next_8(kmp_Indent *loc, int32_t global_tid, + int32_t *plastiter, int64_t *plower, int64_t *pupper, int64_t *pstride); +EXTERN int __kmpc_dispatch_next_8u(kmp_Indent *loc, int32_t global_tid, + int32_t *plastiter, uint64_t *plower, uint64_t *pupper,int64_t *pstride); + +EXTERN void __kmpc_dispatch_fini_4(kmp_Indent *loc, int32_t global_tid); +EXTERN void __kmpc_dispatch_fini_4u(kmp_Indent *loc, int32_t global_tid); +EXTERN void __kmpc_dispatch_fini_8(kmp_Indent *loc, int32_t global_tid); +EXTERN void __kmpc_dispatch_fini_8u(kmp_Indent *loc, int32_t global_tid); + +// reduction +EXTERN int32_t __kmpc_reduce41(kmp_Indent *loc, int32_t global_tid, + int32_t varNum, size_t reduceSize, void *reduceData, void *reduceArraySize, + kmp_ReductFctPtr *reductFct, kmp_CriticalName *lock); +EXTERN void __kmpc_end_reduce(kmp_Indent *loc, int32_t global_tid, + 
kmp_CriticalName *lock); +EXTERN int32_t __kmpc_reduce_nowait41(kmp_Indent *loc, + int32_t global_tid,int32_t varNum, size_t reduceSize, void *reduceData, + void *reduceArraySize, kmp_ReductFctPtr *reductFct, kmp_CriticalName *lock); +EXTERN void __kmpc_end_reduce_nowait(kmp_Indent *loc, + int32_t global_tid, kmp_CriticalName *lock); + +// sync barrier +EXTERN int32_t __kmpc_cancel_barrier(kmp_Indent *loc, int32_t global_tid); + +// single +EXTERN int32_t __kmpc_single(kmp_Indent *loc, int32_t global_tid); +EXTERN void __kmpc_end_single(kmp_Indent *loc, int32_t global_tid); + +// sync +EXTERN int32_t __kmpc_master(kmp_Indent *loc, int32_t global_tid); +EXTERN void __kmpc_end_master(kmp_Indent *loc, int32_t global_tid); +EXTERN void __kmpc_ordered(kmp_Indent *loc, int32_t global_tid); +EXTERN void __kmpc_end_ordered(kmp_Indent *loc, int32_t global_tid); +EXTERN void __kmpc_critical(kmp_Indent *loc, int32_t global_tid, + kmp_CriticalName *crit); +EXTERN void __kmpc_end_critical(kmp_Indent *loc, int32_t global_tid, + kmp_CriticalName *crit); +EXTERN void __kmpc_flush(kmp_Indent *loc); + +// tasks +EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc(kmp_Indent *loc, + uint32_t global_tid, int32_t flag, size_t sizeOfTaskInclPrivate, + size_t sizeOfSharedTable, + kmp_TaskFctPtr sub); +EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Indent *loc, + uint32_t global_tid, kmp_TaskDescr *newLegacyTaskDescr, + int32_t depNum, void * depList, int32_t noAliasDepNum, + void * noAliasDepList); +EXTERN void __kmpc_omp_task_begin_if0(kmp_Indent *loc, + uint32_t global_tid, kmp_TaskDescr *newLegacyTaskDescr); +EXTERN void __kmpc_omp_task_complete_if0(kmp_Indent *loc, + uint32_t global_tid, kmp_TaskDescr *newLegacyTaskDescr); +EXTERN void __kmpc_omp_wait_deps(kmp_Indent *loc, + uint32_t global_tid, int32_t depNum, void * depList, + int32_t noAliasDepNum, void * noAliasDepList); +EXTERN void __kmpc_taskgroup(kmp_Indent *loc, uint32_t global_tid); +EXTERN void __kmpc_end_taskgroup(kmp_Indent *loc, uint32_t global_tid); +EXTERN void __kmpc_omp_taskyield(kmp_Indent *loc, uint32_t global_tid); +EXTERN void __kmpc_omp_taskwait(kmp_Indent *loc, uint32_t global_tid); + +// cancel +EXTERN int32_t __kmpc_cancellationpoint(kmp_Indent* loc, int32_t global_tid, + int32_t cancelVal); +EXTERN int32_t __kmpc_cancel(kmp_Indent* loc, int32_t global_tid, + int32_t cancelVal); + +// target (no target call here) + +// atomic +//EXTERN void __array_atomicfixed4_add(kmp_Indent *loc, int gtid, int32_t *addr1, int32_t *val, int64_t n); +EXTERN void __kmpc_atomic_fixed1_wr (kmp_Indent *loc, int gtid, int8_t *addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1_add (kmp_Indent *loc, int gtid, int8_t *addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1_sub (kmp_Indent *loc, int gtid, int8_t *addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1_sub_rev (kmp_Indent *loc, int gtid, int8_t *addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1_mul (kmp_Indent *loc, int gtid, int8_t *addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1_div (kmp_Indent *loc, int gtid, int8_t *addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1_div_rev (kmp_Indent *loc, int gtid, int8_t *addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1u_div (kmp_Indent *loc, int gtid, uint8_t *addr, uint8_t val) ; +EXTERN void __kmpc_atomic_fixed1u_div_rev (kmp_Indent *loc, int gtid, uint8_t *addr, uint8_t val) ; +EXTERN void __kmpc_atomic_fixed1_min (kmp_Indent *loc, int gtid, int8_t *addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1_max (kmp_Indent *loc, int gtid, int8_t 
*addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1_andb (kmp_Indent *loc, int gtid, int8_t *addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1_orb (kmp_Indent *loc, int gtid, int8_t *addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1_xor (kmp_Indent *loc, int gtid, int8_t *addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1_andl (kmp_Indent *loc, int gtid, int8_t *addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1_orl (kmp_Indent *loc, int gtid, int8_t *addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1_eqv (kmp_Indent *loc, int gtid, int8_t *addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1_neqv (kmp_Indent *loc, int gtid, int8_t *addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1_shl (kmp_Indent *loc, int gtid, int8_t *addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1_shl_rev (kmp_Indent *loc, int gtid, int8_t *addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1_shr (kmp_Indent *loc, int gtid, int8_t *addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1_shr_rev (kmp_Indent *loc, int gtid, int8_t *addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1u_shr (kmp_Indent *loc, int gtid, uint8_t *addr, uint8_t val) ; +EXTERN void __kmpc_atomic_fixed1u_shr_rev (kmp_Indent *loc, int gtid, uint8_t *addr, uint8_t val) ; +EXTERN void __kmpc_atomic_fixed2_wr (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2_add (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2_sub (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2_sub_rev (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2_mul (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2_div (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2_div_rev (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2u_div (kmp_Indent *loc, int gtid, uint16_t *addr, uint16_t val) ; +EXTERN void __kmpc_atomic_fixed2u_div_rev (kmp_Indent *loc, int gtid, uint16_t *addr, uint16_t val) ; +EXTERN void __kmpc_atomic_fixed2_min (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2_max (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2_andb (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2_orb (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2_xor (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2_andl (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2_orl (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2_eqv (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2_neqv (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2_shl (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2_shl_rev (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2_shr (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2_shr_rev (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2u_shr (kmp_Indent *loc, int gtid, uint16_t *addr, uint16_t val) ; +EXTERN void __kmpc_atomic_fixed2u_shr_rev 
(kmp_Indent *loc, int gtid, uint16_t *addr, uint16_t val) ; +EXTERN void __kmpc_atomic_fixed2_swp (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed4_wr (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4_add (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4_sub (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4_sub_rev (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4_mul (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4_div (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4_div_rev (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4u_div (kmp_Indent *loc, int gtid, uint32_t *addr, uint32_t val) ; +EXTERN void __kmpc_atomic_fixed4u_div_rev (kmp_Indent *loc, int gtid, uint32_t *addr, uint32_t val) ; +EXTERN void __kmpc_atomic_fixed4_min (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4_max (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4_andb (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4_orb (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4_xor (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4_andl (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4_orl (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4_eqv (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4_neqv (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4_shl (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4_shl_rev (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4_shr (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4_shr_rev (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4u_shr (kmp_Indent *loc, int gtid, uint32_t *addr, uint32_t val) ; +EXTERN void __kmpc_atomic_fixed4u_shr_rev (kmp_Indent *loc, int gtid, uint32_t *addr, uint32_t val) ; +EXTERN void __kmpc_atomic_fixed4_swp (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed8_wr (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8_add (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8_sub (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8_sub_rev (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8_mul (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8_div (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8_div_rev (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8u_div (kmp_Indent *loc, int gtid, uint64_t *addr, uint64_t val) ; +EXTERN void __kmpc_atomic_fixed8u_div_rev (kmp_Indent *loc, int gtid, uint64_t *addr, uint64_t val) ; +EXTERN void __kmpc_atomic_fixed8_min (kmp_Indent *loc, int gtid, int64_t 
*addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8_max (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8_andb (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8_orb (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8_xor (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8_andl (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8_orl (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8_eqv (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8_neqv (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8_shl (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8_shl_rev (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8_shr (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8_shr_rev (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8u_shr (kmp_Indent *loc, int gtid, uint64_t *addr, uint64_t val) ; +EXTERN void __kmpc_atomic_fixed8u_shr_rev (kmp_Indent *loc, int gtid, uint64_t *addr, uint64_t val) ; +EXTERN void __kmpc_atomic_fixed8_swp (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_float4_add (kmp_Indent *loc, int gtid, float *addr, float val) ; +EXTERN void __kmpc_atomic_float4_sub (kmp_Indent *loc, int gtid, float *addr, float val) ; +EXTERN void __kmpc_atomic_float4_sub_rev (kmp_Indent *loc, int gtid, float *addr, float val) ; +EXTERN void __kmpc_atomic_float4_mul (kmp_Indent *loc, int gtid, float *addr, float val) ; +EXTERN void __kmpc_atomic_float4_div (kmp_Indent *loc, int gtid, float *addr, float val) ; +EXTERN void __kmpc_atomic_float4_div_rev (kmp_Indent *loc, int gtid, float *addr, float val) ; +EXTERN void __kmpc_atomic_float4_min (kmp_Indent *loc, int gtid, float *addr, float val) ; +EXTERN void __kmpc_atomic_float4_max (kmp_Indent *loc, int gtid, float *addr, float val) ; +EXTERN void __kmpc_atomic_float8_add (kmp_Indent *loc, int gtid, double *addr, double val) ; +EXTERN void __kmpc_atomic_float8_sub (kmp_Indent *loc, int gtid, double *addr, double val) ; +EXTERN void __kmpc_atomic_float8_sub_rev (kmp_Indent *loc, int gtid, double *addr, double val) ; +EXTERN void __kmpc_atomic_float8_mul (kmp_Indent *loc, int gtid, double *addr, double val) ; +EXTERN void __kmpc_atomic_float8_div (kmp_Indent *loc, int gtid, double *addr, double val) ; +EXTERN void __kmpc_atomic_float8_div_rev (kmp_Indent *loc, int gtid, double *addr, double val) ; +EXTERN void __kmpc_atomic_float8_min (kmp_Indent *loc, int gtid, double *addr, double val) ; +EXTERN void __kmpc_atomic_float8_max (kmp_Indent *loc, int gtid, double *addr, double val) ; +EXTERN void __kmpc_atomic_cmplx4_add (kmp_Indent *loc, int gtid, float _Complex *addr, float _Complex val) ; +EXTERN void __kmpc_atomic_cmplx4_sub (kmp_Indent *loc, int gtid, float _Complex *addr, float _Complex val) ; +EXTERN void __kmpc_atomic_cmplx4_sub_rev (kmp_Indent *loc, int gtid, float _Complex *addr, float _Complex val) ; +EXTERN void __kmpc_atomic_cmplx4_mul (kmp_Indent *loc, int gtid, float _Complex *addr, float _Complex val) ; +EXTERN void __kmpc_atomic_cmplx4_div (kmp_Indent *loc, int gtid, float _Complex *addr, float _Complex val) ; +EXTERN void 
__kmpc_atomic_cmplx4_div_rev (kmp_Indent *loc, int gtid, float _Complex *addr, float _Complex val) ; +EXTERN void __kmpc_atomic_cmplx4_swp (kmp_Indent *loc, int gtid, float _Complex *addr, float _Complex val) ; +EXTERN void __kmpc_atomic_cmplx8_add (kmp_Indent *loc, int gtid, double _Complex *addr, double _Complex val) ; +EXTERN void __kmpc_atomic_cmplx8_sub (kmp_Indent *loc, int gtid, double _Complex *addr, double _Complex val) ; +EXTERN void __kmpc_atomic_cmplx8_sub_rev (kmp_Indent *loc, int gtid, double _Complex *addr, double _Complex val) ; +EXTERN void __kmpc_atomic_cmplx8_mul (kmp_Indent *loc, int gtid, double _Complex *addr, double _Complex val) ; +EXTERN void __kmpc_atomic_cmplx8_div (kmp_Indent *loc, int gtid, double _Complex *addr, double _Complex val) ; +EXTERN void __kmpc_atomic_cmplx8_div_rev (kmp_Indent *loc, int gtid, double _Complex *addr, double _Complex val) ; +EXTERN void __kmpc_atomic_cmplx8_swp (kmp_Indent *loc, int gtid, double _Complex *addr, double _Complex val) ; +EXTERN int8_t __kmpc_atomic_fixed1_wr_cpt (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int atomicFlag) ; +EXTERN int8_t __kmpc_atomic_fixed1_add_cpt (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int atomicFlag) ; +EXTERN int8_t __kmpc_atomic_fixed1_sub_cpt (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int atomicFlag) ; +EXTERN int8_t __kmpc_atomic_fixed1_sub_cpt_rev (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int atomicFlag) ; +EXTERN int8_t __kmpc_atomic_fixed1_mul_cpt (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int atomicFlag) ; +EXTERN int8_t __kmpc_atomic_fixed1_div_cpt (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int atomicFlag) ; +EXTERN int8_t __kmpc_atomic_fixed1_div_cpt_rev (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int atomicFlag) ; +EXTERN uint8_t __kmpc_atomic_fixed1u_div_cpt (kmp_Indent *loc, int gtid, uint8_t *lhs, uint8_t rhs, int atomicFlag) ; +EXTERN uint8_t __kmpc_atomic_fixed1u_div_cpt_rev (kmp_Indent *loc, int gtid, uint8_t *lhs, uint8_t rhs, int atomicFlag) ; +EXTERN int8_t __kmpc_atomic_fixed1_min_cpt (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int atomicFlag) ; +EXTERN int8_t __kmpc_atomic_fixed1_max_cpt (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int atomicFlag) ; +EXTERN int8_t __kmpc_atomic_fixed1_andb_cpt (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int atomicFlag) ; +EXTERN int8_t __kmpc_atomic_fixed1_orb_cpt (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int atomicFlag) ; +EXTERN int8_t __kmpc_atomic_fixed1_xor_cpt (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int atomicFlag) ; +EXTERN int8_t __kmpc_atomic_fixed1_andl_cpt (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int atomicFlag) ; +EXTERN int8_t __kmpc_atomic_fixed1_orl_cpt (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int atomicFlag) ; +EXTERN int8_t __kmpc_atomic_fixed1_eqv_cpt (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int atomicFlag) ; +EXTERN int8_t __kmpc_atomic_fixed1_neqv_cpt (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int atomicFlag) ; +EXTERN int8_t __kmpc_atomic_fixed1_shl_cpt (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int atomicFlag) ; +EXTERN int8_t __kmpc_atomic_fixed1_shl_cpt_rev (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int atomicFlag) ; +EXTERN int8_t __kmpc_atomic_fixed1_shr_cpt (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int atomicFlag) ; +EXTERN int8_t __kmpc_atomic_fixed1_shr_cpt_rev (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int 
atomicFlag) ; +EXTERN uint8_t __kmpc_atomic_fixed1u_shr_cpt (kmp_Indent *loc, int gtid, uint8_t *lhs, uint8_t rhs, int atomicFlag) ; +EXTERN uint8_t __kmpc_atomic_fixed1u_shr_cpt_rev (kmp_Indent *loc, int gtid, uint8_t *lhs, uint8_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_wr_cpt (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_add_cpt (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_sub_cpt (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_sub_cpt_rev (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_mul_cpt (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_div_cpt (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_div_cpt_rev (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN uint16_t __kmpc_atomic_fixed2u_div_cpt (kmp_Indent *loc, int gtid, uint16_t *lhs, uint16_t rhs, int atomicFlag) ; +EXTERN uint16_t __kmpc_atomic_fixed2u_div_cpt_rev (kmp_Indent *loc, int gtid, uint16_t *lhs, uint16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_min_cpt (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_max_cpt (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_andb_cpt (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_orb_cpt (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_xor_cpt (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_andl_cpt (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_orl_cpt (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_eqv_cpt (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_neqv_cpt (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_shl_cpt (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_shl_cpt_rev (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_shr_cpt (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_shr_cpt_rev (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN uint16_t __kmpc_atomic_fixed2u_shr_cpt (kmp_Indent *loc, int gtid, uint16_t *lhs, uint16_t rhs, int atomicFlag) ; +EXTERN uint16_t __kmpc_atomic_fixed2u_shr_cpt_rev (kmp_Indent *loc, int gtid, uint16_t *lhs, uint16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_swp_cpt (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_wr_cpt (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_add_cpt (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_sub_cpt (kmp_Indent *loc, int 
gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_sub_cpt_rev (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_mul_cpt (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_div_cpt (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_div_cpt_rev (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN uint32_t __kmpc_atomic_fixed4u_div_cpt (kmp_Indent *loc, int gtid, uint32_t *lhs, uint32_t rhs, int atomicFlag) ; +EXTERN uint32_t __kmpc_atomic_fixed4u_div_cpt_rev (kmp_Indent *loc, int gtid, uint32_t *lhs, uint32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_min_cpt (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_max_cpt (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_andb_cpt (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_orb_cpt (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_xor_cpt (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_andl_cpt (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_orl_cpt (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_eqv_cpt (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_neqv_cpt (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_shl_cpt (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_shl_cpt_rev (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_shr_cpt (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_shr_cpt_rev (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN uint32_t __kmpc_atomic_fixed4u_shr_cpt (kmp_Indent *loc, int gtid, uint32_t *lhs, uint32_t rhs, int atomicFlag) ; +EXTERN uint32_t __kmpc_atomic_fixed4u_shr_cpt_rev (kmp_Indent *loc, int gtid, uint32_t *lhs, uint32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_swp_cpt (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_wr_cpt (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_add_cpt (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_sub_cpt (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_sub_cpt_rev (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_mul_cpt (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_div_cpt (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_div_cpt_rev (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN uint64_t 
__kmpc_atomic_fixed8u_div_cpt (kmp_Indent *loc, int gtid, uint64_t *lhs, uint64_t rhs, int atomicFlag) ; +EXTERN uint64_t __kmpc_atomic_fixed8u_div_cpt_rev (kmp_Indent *loc, int gtid, uint64_t *lhs, uint64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_min_cpt (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_max_cpt (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_andb_cpt (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_orb_cpt (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_xor_cpt (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_andl_cpt (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_orl_cpt (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_eqv_cpt (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_neqv_cpt (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_shl_cpt (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_shl_cpt_rev (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_shr_cpt (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_shr_cpt_rev (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN uint64_t __kmpc_atomic_fixed8u_shr_cpt (kmp_Indent *loc, int gtid, uint64_t *lhs, uint64_t rhs, int atomicFlag) ; +EXTERN uint64_t __kmpc_atomic_fixed8u_shr_cpt_rev (kmp_Indent *loc, int gtid, uint64_t *lhs, uint64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_swp_cpt (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN float __kmpc_atomic_float4_add_cpt (kmp_Indent *loc, int gtid, float *lhs, float rhs, int atomicFlag) ; +EXTERN float __kmpc_atomic_float4_sub_cpt (kmp_Indent *loc, int gtid, float *lhs, float rhs, int atomicFlag) ; +EXTERN float __kmpc_atomic_float4_sub_cpt_rev (kmp_Indent *loc, int gtid, float *lhs, float rhs, int atomicFlag) ; +EXTERN float __kmpc_atomic_float4_mul_cpt (kmp_Indent *loc, int gtid, float *lhs, float rhs, int atomicFlag) ; +EXTERN float __kmpc_atomic_float4_div_cpt (kmp_Indent *loc, int gtid, float *lhs, float rhs, int atomicFlag) ; +EXTERN float __kmpc_atomic_float4_div_cpt_rev (kmp_Indent *loc, int gtid, float *lhs, float rhs, int atomicFlag) ; +EXTERN float __kmpc_atomic_float4_min_cpt (kmp_Indent *loc, int gtid, float *lhs, float rhs, int atomicFlag) ; +EXTERN float __kmpc_atomic_float4_max_cpt (kmp_Indent *loc, int gtid, float *lhs, float rhs, int atomicFlag) ; +EXTERN double __kmpc_atomic_float8_add_cpt (kmp_Indent *loc, int gtid, double *lhs, double rhs, int atomicFlag) ; +EXTERN double __kmpc_atomic_float8_sub_cpt (kmp_Indent *loc, int gtid, double *lhs, double rhs, int atomicFlag) ; +EXTERN double __kmpc_atomic_float8_sub_cpt_rev (kmp_Indent *loc, int gtid, double *lhs, double rhs, int atomicFlag) ; +EXTERN double __kmpc_atomic_float8_mul_cpt (kmp_Indent *loc, int gtid, double *lhs, double rhs, int atomicFlag) ; +EXTERN double __kmpc_atomic_float8_div_cpt 
(kmp_Indent *loc, int gtid, double *lhs, double rhs, int atomicFlag) ; +EXTERN double __kmpc_atomic_float8_div_cpt_rev (kmp_Indent *loc, int gtid, double *lhs, double rhs, int atomicFlag) ; +EXTERN double __kmpc_atomic_float8_min_cpt (kmp_Indent *loc, int gtid, double *lhs, double rhs, int atomicFlag) ; +EXTERN double __kmpc_atomic_float8_max_cpt (kmp_Indent *loc, int gtid, double *lhs, double rhs, int atomicFlag) ; + +//special case according to iomp reference +EXTERN void __kmpc_atomic_cmplx4_add_cpt (kmp_Indent *loc, int gtid, float _Complex *lhs, float _Complex rhs, float _Complex *out, int atomicFlag) ; +EXTERN void __kmpc_atomic_cmplx4_sub_cpt (kmp_Indent *loc, int gtid, float _Complex *lhs, float _Complex rhs, float _Complex *out, int atomicFlag) ; +EXTERN void __kmpc_atomic_cmplx4_sub_cpt_rev (kmp_Indent *loc, int gtid, float _Complex *lhs, float _Complex rhs, float _Complex *out, int atomicFlag) ; +EXTERN void __kmpc_atomic_cmplx4_mul_cpt (kmp_Indent *loc, int gtid, float _Complex *lhs, float _Complex rhs, float _Complex *out, int atomicFlag) ; +EXTERN void __kmpc_atomic_cmplx4_div_cpt (kmp_Indent *loc, int gtid, float _Complex *lhs, float _Complex rhs, float _Complex *out, int atomicFlag) ; +EXTERN void __kmpc_atomic_cmplx4_div_cpt_rev (kmp_Indent *loc, int gtid, float _Complex *lhs, float _Complex rhs, float _Complex *out, int atomicFlag) ; +EXTERN void __kmpc_atomic_cmplx4_swp_cpt (kmp_Indent *loc, int gtid, float _Complex *lhs, float _Complex rhs, float _Complex *out, int atomicFlag) ; + +EXTERN double _Complex __kmpc_atomic_cmplx8_add_cpt (kmp_Indent *loc, int gtid, double _Complex *lhs, double _Complex rhs, int atomicFlag) ; +EXTERN double _Complex __kmpc_atomic_cmplx8_sub_cpt (kmp_Indent *loc, int gtid, double _Complex *lhs, double _Complex rhs, int atomicFlag) ; +EXTERN double _Complex __kmpc_atomic_cmplx8_sub_cpt_rev (kmp_Indent *loc, int gtid, double _Complex *lhs, double _Complex rhs, int atomicFlag) ; +EXTERN double _Complex __kmpc_atomic_cmplx8_mul_cpt (kmp_Indent *loc, int gtid, double _Complex *lhs, double _Complex rhs, int atomicFlag) ; +EXTERN double _Complex __kmpc_atomic_cmplx8_div_cpt (kmp_Indent *loc, int gtid, double _Complex *lhs, double _Complex rhs, int atomicFlag) ; +EXTERN double _Complex __kmpc_atomic_cmplx8_div_cpt_rev (kmp_Indent *loc, int gtid, double _Complex *lhs, double _Complex rhs, int atomicFlag) ; +EXTERN double _Complex __kmpc_atomic_cmplx8_swp_cpt (kmp_Indent *loc, int gtid, double _Complex *lhs, double _Complex rhs, int atomicFlag) ; + + +//atomic for array reduction + +EXTERN void __array_atomic_fixed1_add (kmp_Indent *id_ref, int32_t gtid, int8_t * lhs, int8_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed1_sub (kmp_Indent *id_ref, int32_t gtid, int8_t * lhs, int8_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed1_mul (kmp_Indent *id_ref, int32_t gtid, int8_t * lhs, int8_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed1_div (kmp_Indent *id_ref, int32_t gtid, int8_t * lhs, int8_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed1_min (kmp_Indent *id_ref, int32_t gtid, int8_t * lhs, int8_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed1_max (kmp_Indent *id_ref, int32_t gtid, int8_t * lhs, int8_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed1_andb (kmp_Indent *id_ref, int32_t gtid, int8_t * lhs, int8_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed1_orb (kmp_Indent *id_ref, int32_t gtid, int8_t * lhs, int8_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed1_xor (kmp_Indent *id_ref, int32_t gtid, 
int8_t * lhs, int8_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed1_shl (kmp_Indent *id_ref, int32_t gtid, int8_t * lhs, int8_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed1_shr (kmp_Indent *id_ref, int32_t gtid, int8_t * lhs, int8_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed1_andl (kmp_Indent *id_ref, int32_t gtid, int8_t * lhs, int8_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed1_orl (kmp_Indent *id_ref, int32_t gtid, int8_t * lhs, int8_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed1_eqv (kmp_Indent *id_ref, int32_t gtid, int8_t * lhs, int8_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed1_neqv (kmp_Indent *id_ref, int32_t gtid, int8_t * lhs, int8_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed2_add (kmp_Indent *id_ref, int32_t gtid, int16_t * lhs, int16_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed2_sub (kmp_Indent *id_ref, int32_t gtid, int16_t * lhs, int16_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed2_mul (kmp_Indent *id_ref, int32_t gtid, int16_t * lhs, int16_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed2_div (kmp_Indent *id_ref, int32_t gtid, int16_t * lhs, int16_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed2_min (kmp_Indent *id_ref, int32_t gtid, int16_t * lhs, int16_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed2_max (kmp_Indent *id_ref, int32_t gtid, int16_t * lhs, int16_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed2_andb (kmp_Indent *id_ref, int32_t gtid, int16_t * lhs, int16_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed2_orb (kmp_Indent *id_ref, int32_t gtid, int16_t * lhs, int16_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed2_xor (kmp_Indent *id_ref, int32_t gtid, int16_t * lhs, int16_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed2_shl (kmp_Indent *id_ref, int32_t gtid, int16_t * lhs, int16_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed2_shr (kmp_Indent *id_ref, int32_t gtid, int16_t * lhs, int16_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed2_andl (kmp_Indent *id_ref, int32_t gtid, int16_t * lhs, int16_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed2_orl (kmp_Indent *id_ref, int32_t gtid, int16_t * lhs, int16_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed2_eqv (kmp_Indent *id_ref, int32_t gtid, int16_t * lhs, int16_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed2_neqv (kmp_Indent *id_ref, int32_t gtid, int16_t * lhs, int16_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed4_add (kmp_Indent *id_ref, int32_t gtid, int32_t * lhs, int32_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed4_sub (kmp_Indent *id_ref, int32_t gtid, int32_t * lhs, int32_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed4_mul (kmp_Indent *id_ref, int32_t gtid, int32_t * lhs, int32_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed4_div (kmp_Indent *id_ref, int32_t gtid, int32_t * lhs, int32_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed4_min (kmp_Indent *id_ref, int32_t gtid, int32_t * lhs, int32_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed4_max (kmp_Indent *id_ref, int32_t gtid, int32_t * lhs, int32_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed4_andb (kmp_Indent *id_ref, int32_t gtid, int32_t * lhs, int32_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed4_orb (kmp_Indent *id_ref, int32_t gtid, int32_t * lhs, int32_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed4_xor (kmp_Indent *id_ref, int32_t gtid, int32_t * lhs, int32_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed4_shl 
(kmp_Indent *id_ref, int32_t gtid, int32_t * lhs, int32_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed4_shr (kmp_Indent *id_ref, int32_t gtid, int32_t * lhs, int32_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed4_andl (kmp_Indent *id_ref, int32_t gtid, int32_t * lhs, int32_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed4_orl (kmp_Indent *id_ref, int32_t gtid, int32_t * lhs, int32_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed4_eqv (kmp_Indent *id_ref, int32_t gtid, int32_t * lhs, int32_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed4_neqv (kmp_Indent *id_ref, int32_t gtid, int32_t * lhs, int32_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed8_add (kmp_Indent *id_ref, int32_t gtid, int64_t * lhs, int64_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed8_sub (kmp_Indent *id_ref, int32_t gtid, int64_t * lhs, int64_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed8_mul (kmp_Indent *id_ref, int32_t gtid, int64_t * lhs, int64_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed8_div (kmp_Indent *id_ref, int32_t gtid, int64_t * lhs, int64_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed8_min (kmp_Indent *id_ref, int32_t gtid, int64_t * lhs, int64_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed8_max (kmp_Indent *id_ref, int32_t gtid, int64_t * lhs, int64_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed8_andb (kmp_Indent *id_ref, int32_t gtid, int64_t * lhs, int64_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed8_orb (kmp_Indent *id_ref, int32_t gtid, int64_t * lhs, int64_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed8_xor (kmp_Indent *id_ref, int32_t gtid, int64_t * lhs, int64_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed8_shl (kmp_Indent *id_ref, int32_t gtid, int64_t * lhs, int64_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed8_shr (kmp_Indent *id_ref, int32_t gtid, int64_t * lhs, int64_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed8_andl (kmp_Indent *id_ref, int32_t gtid, int64_t * lhs, int64_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed8_orl (kmp_Indent *id_ref, int32_t gtid, int64_t * lhs, int64_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed8_eqv (kmp_Indent *id_ref, int32_t gtid, int64_t * lhs, int64_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed8_neqv (kmp_Indent *id_ref, int32_t gtid, int64_t * lhs, int64_t *rhs, int64_t n) ; +EXTERN void __array_atomic_float4_add (kmp_Indent *id_ref, int32_t gtid, float * lhs, float *rhs, int64_t n) ; +EXTERN void __array_atomic_float4_sub (kmp_Indent *id_ref, int32_t gtid, float * lhs, float *rhs, int64_t n) ; +EXTERN void __array_atomic_float4_mul (kmp_Indent *id_ref, int32_t gtid, float * lhs, float *rhs, int64_t n) ; +EXTERN void __array_atomic_float4_div (kmp_Indent *id_ref, int32_t gtid, float * lhs, float *rhs, int64_t n) ; +EXTERN void __array_atomic_float4_min (kmp_Indent *id_ref, int32_t gtid, float * lhs, float *rhs, int64_t n) ; +EXTERN void __array_atomic_float4_max (kmp_Indent *id_ref, int32_t gtid, float * lhs, float *rhs, int64_t n) ; +EXTERN void __array_atomic_float8_add (kmp_Indent *id_ref, int32_t gtid, double * lhs, double *rhs, int64_t n) ; +EXTERN void __array_atomic_float8_sub (kmp_Indent *id_ref, int32_t gtid, double * lhs, double *rhs, int64_t n) ; +EXTERN void __array_atomic_float8_mul (kmp_Indent *id_ref, int32_t gtid, double * lhs, double *rhs, int64_t n) ; +EXTERN void __array_atomic_float8_div (kmp_Indent *id_ref, int32_t gtid, double * lhs, double *rhs, int64_t n) ; +EXTERN void 
__array_atomic_float8_min (kmp_Indent *id_ref, int32_t gtid, double * lhs, double *rhs, int64_t n) ;
+EXTERN void __array_atomic_float8_max (kmp_Indent *id_ref, int32_t gtid, double * lhs, double *rhs, int64_t n) ;
+
+// non standard
+EXTERN void __kmpc_kernel_init(int OmpHandle, int ThreadLimit);
+EXTERN int __kmpc_kernel_prepare_parallel(int numThreads, int numLanes);
+EXTERN void __kmpc_kernel_parallel(int numLanes);
+EXTERN void __kmpc_kernel_end_parallel();
+
+#endif
+
Index: libomptarget/deviceRTLs/nvptx/src/libcall.cu
===================================================================
--- /dev/null
+++ libomptarget/deviceRTLs/nvptx/src/libcall.cu
@@ -0,0 +1,387 @@
+//===------------ libcall.cu - NVPTX OpenMP user calls ----------- CUDA -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the OpenMP runtime functions that can be
+// invoked by the user in an OpenMP region
+//
+//===----------------------------------------------------------------------===//
+
+#include "../../../deviceRTLs/nvptx/src/omptarget-nvptx.h"
+NOINLINE void PrintTaskDescr(omptarget_nvptx_TaskDescr *taskDescr, char *title, int level);
+
+EXTERN double omp_get_wtick(void)
+{
+  double rc = omptarget_nvptx_threadPrivateContext->GlobalICV()->gpuCycleTime;
+  PRINT(LD_IO, "call omp_get_wtick() returns %g\n", rc);
+  return rc;
+}
+
+EXTERN double omp_get_wtime(void)
+{
+  double rc = omptarget_nvptx_threadPrivateContext->GlobalICV()->gpuCycleTime * clock64();
+  PRINT(LD_IO, "call omp_get_wtime() returns %g\n", rc);
+  return rc;
+}
+
+EXTERN void omp_set_num_threads(int num)
+{
+  PRINT(LD_IO, "call omp_set_num_threads(num %d)\n", num);
+  if (num <= 0) {
+    WARNING0(LW_INPUT, "expected positive num; ignore\n");
+  } else {
+    omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+    currTaskDescr->NThreads() = num;
+  }
+}
+
+EXTERN int omp_get_num_threads(void)
+{
+  int gtid = GetGlobalThreadId();
+  int rc = GetNumberOfOmpThreads(gtid);
+  PRINT(LD_IO, "call omp_get_num_threads() return %d\n", rc);
+  return rc;
+}
+
+EXTERN int omp_get_max_threads(void)
+{
+  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+  int rc = 1; // default is 1 thread avail
+  if (! currTaskDescr->InParallelRegion()) {
+    // not currently in a parallel region... all are available
+    rc = GetNumberOfProcsInTeam();
+    ASSERT0(LT_FUSSY, rc >= 0, "bad number of threads");
+  }
+  PRINT(LD_IO, "call omp_get_max_threads() return %d\n", rc);
+  return rc;
+}
+
+EXTERN int omp_get_thread_limit(void)
+{
+  // per contention group.. meaning threads in current team
+  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+  int rc = currTaskDescr->ThreadLimit();
+  PRINT(LD_IO, "call omp_get_thread_limit() return %d\n", rc);
+  return rc;
+}
+
+EXTERN int omp_get_thread_num()
+{
+  int gtid = GetGlobalThreadId();
+  int rc = GetOmpThreadId(gtid);
+  PRINT(LD_IO, "call omp_get_thread_num() returns %d\n", rc);
+  return rc;
+}
+
+EXTERN int omp_get_num_procs(void)
+{
+  int rc = GetNumberOfThreadsInBlock();
+  PRINT(LD_IO, "call omp_get_num_procs() returns %d\n", rc);
+  return rc;
+}
+
+EXTERN int omp_in_parallel(void)
+{
+  int rc = 0;
+  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+  if (currTaskDescr->InParallelRegion()) {
+    rc = 1;
+  }
+  PRINT(LD_IO, "call omp_in_parallel() returns %d\n", rc);
+  return rc;
+}
+
+EXTERN int omp_in_final(void)
+{
+  // treat all tasks as final... Specs may expect runtime to keep
+  // track more precisely if a task was actively set by users... This
+  // is not explicitly specified; will treat as if runtime can
+  // actively decide to put a non-final task into a final one.
+  int rc = 1;
+  PRINT(LD_IO, "call omp_in_final() returns %d\n", rc);
+  return rc;
+}
+
+
+EXTERN void omp_set_dynamic(int flag)
+{
+  PRINT(LD_IO, "call omp_set_dynamic(%d)\n", flag);
+
+  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+  if (flag) {
+    currTaskDescr->SetDynamic();
+  } else {
+    currTaskDescr->ClearDynamic();
+  }
+}
+
+EXTERN int omp_get_dynamic(void)
+{
+  int rc = 0;
+  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+  if (currTaskDescr->IsDynamic()) {
+    rc = 1;
+  }
+  PRINT(LD_IO, "call omp_get_dynamic() returns %d\n", rc);
+  return rc;
+}
+
+EXTERN void omp_set_nested(int flag)
+{
+  PRINT(LD_IO, "call omp_set_nested(%d) is ignored (no nested support)\n", flag);
+}
+
+EXTERN int omp_get_nested(void)
+{
+  int rc = 0;
+  PRINT(LD_IO, "call omp_get_nested() returns %d\n", rc);
+  return rc;
+}
+
+EXTERN void omp_set_max_active_levels(int level)
+{
+  PRINT(LD_IO, "call omp_set_max_active_levels(%d) is ignored (no nested support)\n", level);
+}
+
+EXTERN int omp_get_max_active_levels(void)
+{
+  int rc = 1;
+  PRINT(LD_IO, "call omp_get_max_active_levels() returns %d\n", rc);
+  return rc;
+}
+
+EXTERN int omp_get_level(void)
+{
+  int level = 0;
+  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+  ASSERT0(LT_FUSSY, currTaskDescr, "do not expect fct to be called in a non-active thread");
+  do {
+    if (currTaskDescr->IsParallelConstruct()) {
+      level++;
+    }
+    currTaskDescr = currTaskDescr->GetPrevTaskDescr();
+  } while (currTaskDescr);
+  PRINT(LD_IO, "call omp_get_level() returns %d\n", level);
+  return level;
+}
+
+EXTERN int omp_get_active_level(void)
+{
+  int level = 0; // no active level parallelism
+  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+  ASSERT0(LT_FUSSY, currTaskDescr, "do not expect fct to be called in a non-active thread");
+  do {
+    if (currTaskDescr->ThreadsInTeam() > 1) {
+      // has a parallel with more than one thread in team
+      level = 1;
+      break;
+    }
+    currTaskDescr = currTaskDescr->GetPrevTaskDescr();
+  } while (currTaskDescr);
+  PRINT(LD_IO, "call omp_get_active_level() returns %d\n", level);
+  return level;
+}
+
+EXTERN int omp_get_ancestor_thread_num(int level)
+{
+  int rc = 0; // default at level 0
+  if (level>=0) {
+    int totLevel = omp_get_level();
+    if (level<=totLevel) {
+      omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+      int steps = totLevel - level;
+      PRINT(LD_IO, "backtrack %d steps\n", steps);
+      ASSERT0(LT_FUSSY, currTaskDescr, "do not expect fct to be called in a non-active thread");
+      do {
+        if (DON(LD_IOD)) PrintTaskDescr(currTaskDescr, (char *)"ancestor", steps);
+        if (currTaskDescr->IsParallelConstruct()) {
+          // found the level
+          if (! steps) {
+            rc = currTaskDescr->ThreadId();
+            break;
+          }
+          steps--;
+        }
+        currTaskDescr = currTaskDescr->GetPrevTaskDescr();
+      } while (currTaskDescr);
+      ASSERT0(LT_FUSSY, ! steps, "expected to find all steps");
+    }
+  }
+  PRINT(LD_IO, "call omp_get_ancestor_thread_num(level %d) returns %d\n", level, rc);
+  return rc;
+}
+
+
+EXTERN int omp_get_team_size(int level)
+{
+  int rc = 1; // default at level 0
+  if (level>=0) {
+    int totLevel = omp_get_level();
+    if (level<=totLevel) {
+      omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+      int steps = totLevel - level;
+      ASSERT0(LT_FUSSY, currTaskDescr, "do not expect fct to be called in a non-active thread");
+      do {
+        if (currTaskDescr->IsParallelConstruct()) {
+          if (! steps) {
+            // found the level
+            rc = currTaskDescr->ThreadsInTeam();
+            break;
+          }
+          steps--;
+        }
+        currTaskDescr = currTaskDescr->GetPrevTaskDescr();
+      } while (currTaskDescr);
+      ASSERT0(LT_FUSSY, ! steps, "expected to find all steps");
+    }
+  }
+  PRINT(LD_IO, "call omp_get_team_size(level %d) returns %d\n", level, rc);
+  return rc;
+}
+
+
+EXTERN void omp_get_schedule(omp_sched_t * kind, int * modifier)
+{
+  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+  *kind = currTaskDescr->GetRuntimeSched();
+  *modifier = currTaskDescr->RuntimeChunkSize();
+  PRINT(LD_IO, "call omp_get_schedule returns sched %d and modif %d\n",
+    (int) *kind, *modifier);
+}
+
+EXTERN void omp_set_schedule(omp_sched_t kind, int modifier)
+{
+  PRINT(LD_IO, "call omp_set_schedule(sched %d, modif %d)\n",
+    (int) kind, modifier);
+  if (kind>=omp_sched_static && kind<=omp_sched_auto) {
+    omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+    currTaskDescr->SetRuntimeSched(kind);
+    currTaskDescr->RuntimeChunkSize() = modifier;
+    PRINT(LD_IOD, "omp_set_schedule did set sched %d & modif %d\n",
+      (int) currTaskDescr->GetRuntimeSched(), currTaskDescr->RuntimeChunkSize());
+  }
+}
+
+EXTERN omp_proc_bind_t omp_get_proc_bind(void)
+{
+  PRINT0(LD_IO, "call omp_get_proc_bind() is true, regardless of state\n");
+  return omp_proc_bind_true;
+}
+
+EXTERN int omp_get_cancellation(void)
+{
+  int rc = omptarget_nvptx_threadPrivateContext->GlobalICV()->cancelPolicy;
+  PRINT(LD_IO, "call omp_get_cancellation() returns %d\n", rc);
+  return rc;
+}
+
+EXTERN void omp_set_default_device(int deviceId)
+{
+  PRINT0(LD_IO, "call omp_set_default_device() is undef on device\n");
+}
+
+EXTERN int omp_get_default_device(void)
+{
+  PRINT0(LD_IO, "call omp_get_default_device() is undef on device, returns 0\n");
+  return 0;
+}
+
+EXTERN int omp_get_num_devices(void)
+{
+  PRINT0(LD_IO, "call omp_get_num_devices() is undef on device, returns 0\n");
+  return 0;
+}
+
+EXTERN int omp_get_num_teams(void)
+{
+  int rc = GetNumberOfOmpTeams();
+  PRINT(LD_IO, "call omp_get_num_teams() returns %d\n", rc);
+  return rc;
+}
+
+EXTERN int omp_get_team_num()
+{
+  int rc = GetOmpTeamId();
+  PRINT(LD_IO, "call omp_get_team_num() returns %d\n", rc);
+  return rc;
+}
+
+EXTERN int omp_is_initial_device(void)
+{
+  PRINT0(LD_IO, "call omp_is_initial_device() returns 0\n");
+  return 0; // 0 by def on device
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// locks
+////////////////////////////////////////////////////////////////////////////////
+
+#define __OMP_SPIN 1000
+#define UNSET 0
+#define SET 1
+
+EXTERN void omp_init_lock(omp_lock_t * lock) +{ + *lock = UNSET; + PRINT0(LD_IO, "call omp_init_lock()\n"); +} + +EXTERN void omp_destroy_lock(omp_lock_t * lock) +{ + PRINT0(LD_IO, "call omp_destroy_lock()\n"); +} + +EXTERN void omp_set_lock(omp_lock_t * lock) +{ + // int atomicCAS(int* address, int compare, int val); + // (old == compare ? val : old) + int compare = UNSET; + int val = SET; + + // TODO: not sure spinning is a good idea here.. + while (atomicCAS(lock, compare, val) != UNSET) { + + clock_t start = clock(); + clock_t now; + for (;;) + { + now = clock(); + clock_t cycles = now > start ? now - start : now + (0xffffffff - start); + if (cycles >= __OMP_SPIN*blockIdx.x) { + break; + } + } + } // wait for 0 to be the read value + + PRINT0(LD_IO, "call omp_set_lock()\n"); +} + +EXTERN void omp_unset_lock(omp_lock_t * lock) +{ + int compare = SET; + int val = UNSET; + int old = atomicCAS (lock, compare, val); + + PRINT0(LD_IO, "call omp_unset_lock()\n"); +} + +EXTERN int omp_test_lock(omp_lock_t * lock) +{ + // int atomicCAS(int* address, int compare, int val); + // (old == compare ? val : old) + int compare = UNSET; + int val = SET; + + // TODO: should check for the lock to be SET? + int ret = atomicCAS(lock, compare, val); + + PRINT(LD_IO, "call omp_test_lock() return %d\n", ret); + + return ret; +} Index: libomptarget/deviceRTLs/nvptx/src/loop.cu =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/loop.cu @@ -0,0 +1,562 @@ +//===------------ loop.cu - NVPTX OpenMP loop constructs --------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of the KMPC interface +// for the loop construct plus other worksharing constructs that use the same +// interface as loops. +// +//===----------------------------------------------------------------------===// + +#include "../../../deviceRTLs/nvptx/src/omptarget-nvptx.h" + +EXTERN void CompleteCG(omptarget_nvptx_CounterGroup & cg, Counter * priv, + Counter n); + +//////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// +// template class that encapsulate all the helper functions +// +// T is loop iteration type (32 | 64) (unsigned | signed) +// ST is the signed version of T +//////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// + + +template +class omptarget_nvptx_LoopSupport { +public: + +//////////////////////////////////////////////////////////////////////////////// +// Loop with static scheduling with chunk + +// Generic implementation of OMP loop scheduling with static policy +/*! \brief Calculate initial bounds for static loop and stride + * @param[in] loc location in code of the call (not used here) + * @param[in] global_tid global thread id + * @param[in] schetype type of scheduling (see omptarget-nvptx.h) + * @param[in] plastiter pointer to last iteration + * @param[in,out] pointer to loop lower bound. it will contain value of + * lower bound of first chunk + * @param[in,out] pointer to loop upper bound. 
It will contain value of + * upper bound of first chunk + * @param[in,out] pointer to loop stride. It will contain value of stride + * between two successive chunks executed by the same thread + * @param[in] loop increment bump + * @param[in] chunk size + */ + +// helper function for static chunk +INLINE static void ForStaticChunk(T &lb, T &ub, ST &stride, ST chunk, + T entityId, T numberOfEntities) +{ + // each thread executes multiple chunks all of the same size, except + // the last one + + // distance between two successive chunks + stride = numberOfEntities * chunk; + lb = lb + entityId * chunk; + ub = lb + chunk -1; // Clang uses i <= ub +} + +//////////////////////////////////////////////////////////////////////////////// +// Loop with static scheduling without chunk + +// helper function for static no chunk +INLINE static void ForStaticNoChunk(T &lb, T &ub, ST &stride, ST &chunk, + T entityId, T numberOfEntities) +{ + // No chunk size specified. Each thread or warp gets at most one + // chunk; chunks are all almost of equal size + T loopSize = ub - lb + 1; + + chunk = loopSize / numberOfEntities; + T leftOver = loopSize - chunk * numberOfEntities; + + if (entityId < leftOver) { + chunk++; + lb = lb + entityId * chunk; + } else { + lb = lb + entityId * chunk + leftOver; + } + + ub = lb + chunk -1; // Clang uses i <= ub + stride = loopSize; // make sure we only do 1 chunk per warp +} + +//////////////////////////////////////////////////////////////////////////////// +// Support for Static Init + +INLINE static void for_static_init(int32_t schedtype, + T *plower, T *pupper, ST *pstride, ST chunk) +{ + int gtid = GetGlobalThreadId(); + + // Assume we are in teams region or that we use a single block + // per target region + ST numberOfActiveOMPThreads = GetNumberOfOmpThreads(gtid); + + // All warps that are in excess of the maximum requested, do + // not execute the loop + PRINT(LD_LOOP, + "OMP Thread %d: schedule type %d, chunk size = %lld, mytid %d, num tids %d\n", + GetOmpThreadId(gtid), schedtype, P64(chunk), GetOmpThreadId(gtid), + GetNumberOfOmpThreads(gtid)); + ASSERT0(LT_FUSSY, GetOmpThreadId(gtid) 0) { + entityId = GetOmpThreadId(gtid); + numberOfEntities = GetNumberOfOmpThreads(gtid); + ForStaticChunk(lb, ub, stride, chunk, entityId, numberOfEntities); + break; + } + } // note: if chunk <=0, use nochunk + case kmp_sched_static_nochunk : + { + entityId = GetOmpThreadId(gtid); + numberOfEntities = GetNumberOfOmpThreads(gtid); + ForStaticNoChunk(lb, ub, stride, chunk, entityId, numberOfEntities); + break; + } + case kmp_sched_distr_static_chunk : + { + if (chunk > 0) { + entityId = GetOmpTeamId(); + numberOfEntities = GetNumberOfOmpTeams(); + ForStaticChunk(lb, ub, stride, chunk, entityId, numberOfEntities); + break; + } // note: if chunk <=0, use nochunk + } + case kmp_sched_distr_static_nochunk : + { + entityId = GetOmpTeamId(); + numberOfEntities = GetNumberOfOmpTeams(); + + ForStaticNoChunk(lb, ub, stride, chunk, entityId, numberOfEntities); + break; + } + default: + { + ASSERT(LT_FUSSY, FALSE, "unknown schedtype %d", schedtype); + PRINT(LD_LOOP, "unknown schedtype %d, revert back to static chunk\n", + schedtype); + entityId = GetOmpThreadId(gtid); + numberOfEntities = GetNumberOfOmpThreads(gtid); + ForStaticChunk(lb, ub, stride, chunk, entityId, numberOfEntities); + } + } + // copy back + *plower = lb; + *pupper = ub; + *pstride = stride; + PRINT(LD_LOOP,"Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld\n", + GetNumberOfOmpThreads(gtid), 
GetNumberOfThreadsInBlock(), + P64(*plower), P64(*pupper), P64(*pstride)); +} + + +//////////////////////////////////////////////////////////////////////////////// +// Support for dispatch Init + +INLINE static int OrderedSchedule(kmp_sched_t schedule) +{ + return schedule >= kmp_sched_ordered_first && + schedule <= kmp_sched_ordered_last; +} + +INLINE static void dispatch_init(kmp_sched_t schedule, T lb, T ub, + ST st, ST chunk) +{ + ASSERT0(LT_FUSSY, lb==0, "exected normalized loop"); + lb = 0; + + int gtid = GetGlobalThreadId(); + omptarget_nvptx_TaskDescr * currTaskDescr = getMyTopTaskDescriptor(gtid); + T tnum = currTaskDescr->ThreadsInTeam(); + T tripCount = ub - lb + 1; // +1 because ub is inclusive + ASSERT0(LT_FUSSY, GetOmpThreadId(gtid)GetRuntimeSched(); + chunk = currTaskDescr->RuntimeChunkSize(); + switch (rtSched) { + case omp_sched_static : + { + if (chunk>0) schedule = kmp_sched_static_chunk; + else schedule = kmp_sched_static_nochunk; + break; + } + case omp_sched_auto : + { + schedule = kmp_sched_static_chunk; + chunk = 1; + break; + } + case omp_sched_dynamic : + case omp_sched_guided : + { + schedule = kmp_sched_dynamic; + break; + } + } + PRINT(LD_LOOP, "Runtime sched is %d with chunk %lld\n", schedule, P64(chunk)); + } else if (schedule == kmp_sched_auto) { + schedule = kmp_sched_static_chunk; + chunk = 1; + PRINT(LD_LOOP, "Auto sched is %d with chunk %lld\n", schedule, P64(chunk)); + } else { + PRINT(LD_LOOP, "Dyn sched is %d with chunk %lld\n", schedule, P64(chunk)); + ASSERT(LT_FUSSY, schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, + "unknown schedule %d & chunk %lld\n", + schedule, P64(chunk)); + } + + // save sched state + omptarget_nvptx_threadPrivateContext->ScheduleType(gtid) = schedule; + omptarget_nvptx_threadPrivateContext->LoopUpperBound(gtid) = ub; + + // init schedules + if (schedule == kmp_sched_static_chunk) { + ASSERT0(LT_FUSSY, chunk>0, "bad chunk value"); + // save ub + omptarget_nvptx_threadPrivateContext->LoopUpperBound(gtid) = ub; + // compute static chunk + ST stride; + T threadId = GetOmpThreadId(gtid); + ForStaticChunk(lb, ub, stride, chunk, threadId, tnum); + // save computed params + omptarget_nvptx_threadPrivateContext->Chunk(gtid) = chunk; + omptarget_nvptx_threadPrivateContext->NextLowerBound(gtid) = lb; + omptarget_nvptx_threadPrivateContext->Stride(gtid) = stride; + PRINT(LD_LOOP, + "dispatch init (static chunk) : num threads = %d, ub = %lld," + "next lower bound = %lld, stride = %lld\n", + GetNumberOfOmpThreads(gtid), + omptarget_nvptx_threadPrivateContext->LoopUpperBound(gtid), + omptarget_nvptx_threadPrivateContext->NextLowerBound(gtid), + omptarget_nvptx_threadPrivateContext->Stride(gtid)); + + } else if (schedule == kmp_sched_static_nochunk) { + ASSERT0(LT_FUSSY, chunk==0, "bad chunk value"); + // save ub + omptarget_nvptx_threadPrivateContext->LoopUpperBound(gtid) = ub; + // compute static chunk + ST stride; + T threadId = GetOmpThreadId(gtid); + ForStaticNoChunk(lb, ub, stride, chunk, threadId, tnum); + // save computed params + omptarget_nvptx_threadPrivateContext->Chunk(gtid) = chunk; + omptarget_nvptx_threadPrivateContext->NextLowerBound(gtid) = lb; + omptarget_nvptx_threadPrivateContext->Stride(gtid) = stride; + PRINT(LD_LOOP, + "dispatch init (static nochunk) : num threads = %d, ub = %lld," + "next lower bound = %lld, stride = %lld\n", + GetNumberOfOmpThreads(gtid), + omptarget_nvptx_threadPrivateContext->LoopUpperBound(gtid), + omptarget_nvptx_threadPrivateContext->NextLowerBound(gtid), + 
omptarget_nvptx_threadPrivateContext->Stride(gtid)); + + } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) { + if (chunk<1) chunk = 1; + Counter eventNum = ((tripCount -1) / chunk) + 1; // number of chunks + // but each thread (but one) must discover that it is last + eventNum += tnum; + omptarget_nvptx_threadPrivateContext->Chunk(gtid) = chunk; + omptarget_nvptx_threadPrivateContext->EventsNumber(gtid) = eventNum; + PRINT(LD_LOOP, + "dispatch init (dyn) : num threads = %d, ub = %lld, chunk %lld, " + "events number = %lld\n", + GetNumberOfOmpThreads(gtid), + omptarget_nvptx_threadPrivateContext->LoopUpperBound(gtid), + omptarget_nvptx_threadPrivateContext->Chunk(gtid), + omptarget_nvptx_threadPrivateContext->EventsNumber(gtid)); + } + +} + + +//////////////////////////////////////////////////////////////////////////////// +// Support for dispatch next + +INLINE static int DynamicNextChunk(omptarget_nvptx_CounterGroup & cg, Counter priv, + T & lb, T & ub, Counter & chunkId, Counter & currentEvent, + T chunkSize, T loopUpperBound) +{ + // get next event atomically + Counter nextEvent = cg.Next(); + // calculate chunk Id (priv was initialized upon entering the loop to 'start' == 'event') + chunkId = nextEvent - priv; + // calculate lower bound for all lanes in the warp + lb = chunkId * chunkSize; // this code assume normalization of LB + ub = lb + chunkSize -1; // Clang uses i <= ub + + // 3 result cases: + // a. lb and ub < loopUpperBound --> NOT_FINISHED + // b. lb < loopUpperBound and ub >= loopUpperBound: last chunk --> NOT_FINISHED + // c. lb and ub >= loopUpperBound: empty chunk --> FINISHED + currentEvent = nextEvent; + // a. + if (ub <= loopUpperBound) { + PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n", + P64(lb), P64(ub), P64(loopUpperBound)); + return NOT_FINISHED; + } + // b. + if (lb <= loopUpperBound) { + PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; clip to loop ub\n", + P64(lb), P64(ub), P64(loopUpperBound)); + ub = loopUpperBound; + return LAST_CHUNK; + } + // c. if we are here, we are in case 'c' + lb = loopUpperBound +1; + PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", + P64(lb), P64(ub), P64(loopUpperBound)); + return FINISHED; +} + + +INLINE static int dispatch_next(int32_t *plast, T *plower, T *pupper, ST *pstride) +{ + // ID of a thread in its own warp + + // automatically selects thread or warp ID based on selected implementation + int gtid = GetGlobalThreadId(); + ASSERT0(LT_FUSSY, GetOmpThreadId(gtid)ScheduleType(gtid); + + // xxx reduce to one + if (schedule == kmp_sched_static_chunk || schedule == kmp_sched_static_nochunk) { + T myLb = omptarget_nvptx_threadPrivateContext->NextLowerBound(gtid); + T ub = omptarget_nvptx_threadPrivateContext->LoopUpperBound(gtid); + // finished? 
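+  // Worked example with assumed numbers: for a static chunk of 4 and 32 OpenMP
+  // threads, thread 0 is handed [0,3] on its first call; NextLowerBound then
+  // advances by Stride = 32 * 4 = 128, so the next call yields [128,131], and
+  // so on until myLb steps past LoopUpperBound and DISPATCH_FINISHED is
+  // returned.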
+ if (myLb > ub) { + PRINT(LD_LOOP, "static loop finished with myLb %lld, ub %lld\n", P64(myLb), P64(ub)); + return DISPATCH_FINISHED; + } + // not finished, save current bounds + ST chunk = omptarget_nvptx_threadPrivateContext->Chunk(gtid); + *plower = myLb; + T myUb = myLb + chunk -1; // Clang uses i <= ub + if (myUb > ub) myUb = ub; + *pupper = myUb; + + // increment next lower bound by the stride + ST stride = omptarget_nvptx_threadPrivateContext->Stride(gtid); + omptarget_nvptx_threadPrivateContext->NextLowerBound(gtid) = myLb + stride; + PRINT(LD_LOOP, "static loop continues with myLb %lld, myUb %lld\n", P64(*plower), P64(*pupper)); + return DISPATCH_NOTFINISHED; + } + ASSERT0(LT_FUSSY, schedule==kmp_sched_dynamic || schedule==kmp_sched_guided, "bad sched"); + omptarget_nvptx_TeamDescr & teamDescr = getMyTeamDescriptor(); + T myLb, myUb; + Counter chunkId; + // xxx current event is now local + omptarget_nvptx_CounterGroup &cg = teamDescr.WorkDescr().CounterGroup(); + int finished = DynamicNextChunk(cg, + omptarget_nvptx_threadPrivateContext->Priv(gtid), myLb, myUb, chunkId, + omptarget_nvptx_threadPrivateContext->CurrentEvent(gtid), + omptarget_nvptx_threadPrivateContext->Chunk(gtid), + omptarget_nvptx_threadPrivateContext->LoopUpperBound(gtid)); + + if (finished == FINISHED) { + cg.Complete(omptarget_nvptx_threadPrivateContext->Priv(gtid), + omptarget_nvptx_threadPrivateContext->EventsNumber(gtid)); + cg.Release (omptarget_nvptx_threadPrivateContext->Priv(gtid), + omptarget_nvptx_threadPrivateContext->CurrentEvent(gtid)); + + return DISPATCH_FINISHED; + } + + // not finished (either not finished or last chunk) + *plower = myLb; + *pupper = myUb; + *pstride = 1; + + PRINT(LD_LOOP,"Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld\n", + GetNumberOfOmpThreads(gtid), GetNumberOfThreadsInBlock(), + P64(*plower), P64(*pupper), P64(*pstride)); + return DISPATCH_NOTFINISHED; +} + +INLINE static void dispatch_fini() +{ + // nothing +} + +//////////////////////////////////////////////////////////////////////////////// +// end of template class that encapsulate all the helper functions +//////////////////////////////////////////////////////////////////////////////// +} ; + + +//////////////////////////////////////////////////////////////////////////////// +// KMP interface implementation (dyn loops) +//////////////////////////////////////////////////////////////////////////////// + +// init +EXTERN void __kmpc_dispatch_init_4(kmp_Indent * loc, int32_t gtid, + int32_t schedule, int32_t lb, int32_t ub, int32_t st, int32_t chunk) +{ + PRINT0(LD_IO, "call kmpc_dispatch_init_4\n"); + omptarget_nvptx_LoopSupport::dispatch_init((kmp_sched_t) schedule, + lb, ub, st, chunk); +} + +EXTERN void __kmpc_dispatch_init_4u(kmp_Indent * loc, int32_t gtid, + int32_t schedule, uint32_t lb, uint32_t ub, int32_t st, int32_t chunk) +{ + PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n"); + omptarget_nvptx_LoopSupport::dispatch_init((kmp_sched_t) schedule, + lb, ub, st, chunk); +} + +EXTERN void __kmpc_dispatch_init_8(kmp_Indent * loc, int32_t gtid, + int32_t schedule, int64_t lb, int64_t ub, int64_t st, int64_t chunk) +{ + PRINT0(LD_IO, "call kmpc_dispatch_init_8\n"); + omptarget_nvptx_LoopSupport::dispatch_init((kmp_sched_t) schedule, + lb, ub, st, chunk); +} + +EXTERN void __kmpc_dispatch_init_8u(kmp_Indent * loc, int32_t gtid, + int32_t schedule, uint64_t lb, uint64_t ub, int64_t st, int64_t chunk) +{ + PRINT0(LD_IO, "call kmpc_dispatch_init_8u\n"); + 
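+  // Each width-specific entry point presumably instantiates the loop-support
+  // template with the (T, ST) pair implied by its suffix: (int32_t, int32_t)
+  // for _4, (uint32_t, int32_t) for _4u, (int64_t, int64_t) for _8, and
+  // (uint64_t, int64_t) for _8u. This call therefore dispatches with
+  // T = uint64_t and ST = int64_t.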
+  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init((kmp_sched_t) schedule,
+    lb, ub, st, chunk);
+}
+
+// next
+EXTERN int __kmpc_dispatch_next_4(kmp_Indent * loc, int32_t gtid,
+  int32_t * p_last, int32_t * p_lb, int32_t * p_ub, int32_t * p_st)
+{
+  PRINT0(LD_IO, "call kmpc_dispatch_next_4\n");
+  return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(p_last,
+    p_lb, p_ub, p_st);
+}
+
+EXTERN int __kmpc_dispatch_next_4u(kmp_Indent * loc, int32_t gtid,
+  int32_t * p_last, uint32_t * p_lb, uint32_t * p_ub, int32_t * p_st)
+{
+  PRINT0(LD_IO, "call kmpc_dispatch_next_4u\n");
+  return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(p_last,
+    p_lb, p_ub, p_st);
+}
+
+EXTERN int __kmpc_dispatch_next_8(kmp_Indent * loc, int32_t gtid,
+  int32_t * p_last, int64_t * p_lb, int64_t * p_ub, int64_t * p_st)
+{
+  PRINT0(LD_IO, "call kmpc_dispatch_next_8\n");
+  return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(p_last,
+    p_lb, p_ub, p_st);
+}
+
+EXTERN int __kmpc_dispatch_next_8u(kmp_Indent * loc, int32_t gtid,
+  int32_t * p_last, uint64_t * p_lb, uint64_t * p_ub, int64_t * p_st)
+{
+  PRINT0(LD_IO, "call kmpc_dispatch_next_8u\n");
+  return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(p_last,
+    p_lb, p_ub, p_st);
+}
+
+// fini
+EXTERN void __kmpc_dispatch_fini_4 (kmp_Indent * loc, int32_t gtid)
+{
+  PRINT0(LD_IO, "call kmpc_dispatch_fini_4\n");
+  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
+}
+
+EXTERN void __kmpc_dispatch_fini_4u (kmp_Indent * loc, int32_t gtid)
+{
+  PRINT0(LD_IO, "call kmpc_dispatch_fini_4u\n");
+  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
+}
+
+EXTERN void __kmpc_dispatch_fini_8 (kmp_Indent * loc, int32_t gtid)
+{
+  PRINT0(LD_IO, "call kmpc_dispatch_fini_8\n");
+  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
+}
+
+EXTERN void __kmpc_dispatch_fini_8u (kmp_Indent * loc, int32_t gtid)
+{
+  PRINT0(LD_IO, "call kmpc_dispatch_fini_8u\n");
+  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// KMP interface implementation (static loops)
+////////////////////////////////////////////////////////////////////////////////
+
+
+EXTERN void __kmpc_for_static_init_4(kmp_Indent *loc, int32_t global_tid,
+  int32_t schedtype, int32_t *plastiter, int32_t *plower, int32_t *pupper,
+  int32_t *pstride, int32_t incr, int32_t chunk)
+{
+  PRINT0(LD_IO, "call kmpc_for_static_init_4\n");
+  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(schedtype,
+    plower, pupper, pstride, chunk);
+}
+
+EXTERN void __kmpc_for_static_init_4u (kmp_Indent *loc, int32_t global_tid,
+  int32_t schedtype, int32_t *plastiter, uint32_t *plower, uint32_t *pupper,
+  int32_t *pstride, int32_t incr, int32_t chunk)
+{
+  PRINT0(LD_IO, "call kmpc_for_static_init_4u\n");
+  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(schedtype,
+    plower, pupper, pstride, chunk);
+}
+
+EXTERN void __kmpc_for_static_init_8(kmp_Indent *loc, int32_t global_tid,
+  int32_t schedtype, int32_t *plastiter, int64_t *plower, int64_t *pupper,
+  int64_t *pstride, int64_t incr, int64_t chunk)
+{
+  PRINT0(LD_IO, "call kmpc_for_static_init_8\n");
+  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(schedtype,
+    plower, pupper, pstride, chunk);
+}
+
+EXTERN void __kmpc_for_static_init_8u (kmp_Indent *loc, int32_t global_tid,
+  int32_t schedtype, int32_t *plastiter, uint64_t *plower, uint64_t *pupper,
+  int64_t *pstride, int64_t incr, int64_t chunk)
+{
+  PRINT0(LD_IO, "call kmpc_for_static_init_8u\n");
+  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(schedtype,
+    plower, pupper, pstride, chunk);
+}
+
+
+EXTERN void
__kmpc_for_static_fini(kmp_Indent *loc, int32_t global_tid) +{ + PRINT0(LD_IO, "call kmpc_for_static_fini\n"); +} Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -0,0 +1,247 @@ +//===---- omptarget-nvptx.h - NVPTX OpenMP GPU initialization ---- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of all library macros, types, +// and functions. +// +//===----------------------------------------------------------------------===// + +#ifndef __OMPTARGET_NVPTX_H +#define __OMPTARGET_NVPTX_H + +// std includes +#include +#include + +// cuda includes +#include +#include + +// local includes +#include "../../../deviceRTLs/nvptx/src/option.h" // choices we have +#include "../../../deviceRTLs/nvptx/src/counter_group.h" +#include "../../../deviceRTLs/nvptx/src/debug.h" // debug +#include "../../../deviceRTLs/nvptx/src/interface.h" // interfaces with omp, compiler, and user +#include "../../../deviceRTLs/nvptx/src/support.h" + +#define OMPTARGET_NVPTX_VERSION 1.1 + +// used by the library for the interface with the app +#define DISPATCH_FINISHED 0 +#define DISPATCH_NOTFINISHED 1 + +// used by dynamic scheduling +#define FINISHED 0 +#define NOT_FINISHED 1 +#define LAST_CHUNK 2 + + +#define TEAM_MASTER 0 +#define BARRIER_COUNTER 0 +#define ORDERED_COUNTER 1 + +//////////////////////////////////////////////////////////////////////////////// +// global ICV + +typedef struct omptarget_nvptx_GlobalICV { + double gpuCycleTime; // currently statically determined, should be set by host + uint8_t cancelPolicy; // 1 bit: enabled (true) or disabled (false) +} omptarget_nvptx_GlobalICV; + +//////////////////////////////////////////////////////////////////////////////// +// task ICV and (implicit & explicit) task state + +class omptarget_nvptx_TaskDescr { + public: + // methods for flags + INLINE omp_sched_t GetRuntimeSched(); + INLINE void SetRuntimeSched(omp_sched_t sched); + INLINE int IsDynamic() { return data.items.flags & TaskDescr_IsDynamic; } + INLINE void SetDynamic() { data.items.flags = data.items.flags | TaskDescr_IsDynamic; } + INLINE void ClearDynamic() { data.items.flags = data.items.flags & (~TaskDescr_IsDynamic); } + INLINE int InParallelRegion() { return data.items.flags & TaskDescr_InPar; } + INLINE int IsParallelConstruct() { return data.items.flags & TaskDescr_IsParConstr; } + INLINE int IsTaskConstruct() { return ! 
IsParallelConstruct(); } + // methods for other fields + INLINE uint16_t & NThreads() { return data.items.nthreads; } + INLINE uint16_t & ThreadLimit() { return data.items.threadlimit; } + INLINE uint16_t & ThreadId() { return data.items.threadId; } + INLINE uint16_t & ThreadsInTeam() { return data.items.threadsInTeam; } + INLINE uint64_t & RuntimeChunkSize() { return data.items.runtimeChunkSize; } + INLINE omptarget_nvptx_TaskDescr * GetPrevTaskDescr() { return prev; } + INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) { prev = taskDescr; } + // init & copy + INLINE void InitLevelZeroTaskDescr(); + INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr); + INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr); + INLINE void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr); + INLINE void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr); + INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr, uint16_t tnum); + INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr); + + private: + /* bits for flags: (6 used, 2 free) + 3 bits (SchedMask) for runtime schedule + 1 bit (IsDynamic) for dynamic schedule (false = static) + 1 bit (InPar) if this thread has encountered one or more parallel region + 1 bit (IsParConstr) if ICV for a parallel region (false = explicit task) + */ + static const uint8_t TaskDescr_SchedMask = (0x1 | 0x2 | 0x4); + static const uint8_t TaskDescr_IsDynamic = 0x8; + static const uint8_t TaskDescr_InPar = 0x10; + static const uint8_t TaskDescr_IsParConstr = 0x20; + + union { // both have same size + uint64_t vect[2]; + struct TaskDescr_items { + uint8_t flags; // 6 bit used (see flag above) + uint8_t unused; + uint16_t nthreads; // thread num for subsequent parallel regions + uint16_t threadlimit; // thread limit ICV + uint16_t threadId; // thread id + uint16_t threadsInTeam; // threads in current team + uint64_t runtimeChunkSize; // runtime chunk size + } items; + } data; + omptarget_nvptx_TaskDescr *prev; +}; + +// build on kmp +typedef struct omptarget_nvptx_ExplicitTaskDescr { + omptarget_nvptx_TaskDescr taskDescr; // omptarget_nvptx task description (must be first) + kmp_TaskDescr kmpTaskDescr; // kmp task description (must be last) +} omptarget_nvptx_ExplicitTaskDescr; + +class omptarget_nvptx_TeamDescr; +class omptarget_nvptx_GlobalICV; + +//////////////////////////////////////////////////////////////////////////////// +// thread private data (struct of arrays for better coalescing) +// tid refers here to the global thread id +// do not support multiple concurrent kernel a this time +class omptarget_nvptx_ThreadPrivateContext { +public: + // task + INLINE omptarget_nvptx_TaskDescr *Level1TaskDescr(int gtid) { return & levelOneTaskDescr[gtid]; } + INLINE void SetTopLevelTaskDescr(int gtid, omptarget_nvptx_TaskDescr *taskICV) { topTaskDescr[gtid] = taskICV; } + INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int gtid); + // parallel + INLINE uint16_t & NumThreadsForNextParallel(int gtid) { return tnumForNextPar[gtid]; } + // sync + INLINE Counter & Priv(int gtid) { return priv[gtid]; } + INLINE void IncrementPriv(int gtid, Counter val) { priv[gtid] += val; } + // schedule (for dispatch) + INLINE kmp_sched_t & ScheduleType(int gtid) { return schedule[gtid]; } + INLINE int64_t & Chunk(int gtid) { return chunk[gtid]; } + INLINE int64_t & LoopUpperBound(int gtid) { return loopUpperBound[gtid]; } + // state for dispatch with dyn/guided + INLINE Counter & CurrentEvent(int gtid) { 
return currEvent_or_nextLowerBound[gtid]; } + INLINE Counter & EventsNumber(int gtid) { return eventsNum_or_stride[gtid]; } + // state for dispatch with static + INLINE Counter & NextLowerBound(int gtid) { return currEvent_or_nextLowerBound[gtid]; } + INLINE Counter & Stride(int gtid) { return eventsNum_or_stride[gtid]; } + + INLINE void SetTeamContext(omptarget_nvptx_TeamDescr *teamContext) { + omptarget_nvptx_teamContext = teamContext; + } + INLINE void SetGlobalICV(omptarget_nvptx_GlobalICV *globalICV) { + omptarget_nvptx_globalICV = globalICV; + } + INLINE omptarget_nvptx_TeamDescr *TeamContext() { return omptarget_nvptx_teamContext; } + INLINE omptarget_nvptx_GlobalICV *GlobalICV() { return omptarget_nvptx_globalICV; } + + INLINE void InitThreadPrivateContext(int gtid); + +private: + // task ICV for implict threads in the only parallel region + omptarget_nvptx_TaskDescr levelOneTaskDescr[MAX_NUM_OMP_THREADS]; + // pointer where to find the current task ICV (top of the stack) + omptarget_nvptx_TaskDescr *topTaskDescr[MAX_NUM_OMP_THREADS]; + // parallel + uint16_t tnumForNextPar[MAX_NUM_OMP_THREADS]; + // sync + Counter priv[MAX_NUM_OMP_THREADS]; + // schedule (for dispatch) + kmp_sched_t schedule[MAX_NUM_OMP_THREADS]; // remember schedule type for #for + int64_t chunk[MAX_NUM_OMP_THREADS]; + int64_t loopUpperBound[MAX_NUM_OMP_THREADS]; + // state for dispatch with dyn/guided OR static (never use both at a time) + Counter currEvent_or_nextLowerBound[MAX_NUM_OMP_THREADS]; + Counter eventsNum_or_stride[MAX_NUM_OMP_THREADS]; + + // Pointers to other omp data in same instance + // These are initialized in kernel_init + omptarget_nvptx_TeamDescr *omptarget_nvptx_teamContext; + omptarget_nvptx_GlobalICV *omptarget_nvptx_globalICV; +}; + +//////////////////////////////////////////////////////////////////////////////// +// Descriptor of a parallel region (worksharing in general) + +class omptarget_nvptx_WorkDescr { + + public: + // access to data + INLINE omptarget_nvptx_CounterGroup & CounterGroup() { return cg; } + INLINE omptarget_nvptx_TaskDescr * WorkTaskDescr() { return & masterTaskICV; } + // init + INLINE void InitWorkDescr(); + +private: + omptarget_nvptx_CounterGroup cg; // for barrier (no other needed) + omptarget_nvptx_TaskDescr masterTaskICV; + bool hasCancel; +}; + + +//////////////////////////////////////////////////////////////////////////////// +// thread private data (struct of arrays for better coalescing) + +class omptarget_nvptx_TeamDescr { + public: + // access to data + INLINE omptarget_nvptx_TaskDescr *LevelZeroTaskDescr() { return &levelZeroTaskDescr; } + INLINE omptarget_nvptx_WorkDescr & WorkDescr() { return workDescrForActiveParallel; } + INLINE omp_lock_t *CriticalLock() {return &criticalLock; } + // init + INLINE void InitTeamDescr(); + + private: + omptarget_nvptx_TaskDescr levelZeroTaskDescr ; // icv for team master initial thread + omptarget_nvptx_WorkDescr workDescrForActiveParallel; // one, ONLY for the active par + omp_lock_t criticalLock; +}; + + +//////////////////////////////////////////////////////////////////////////////// +// global data tables +//////////////////////////////////////////////////////////////////////////////// + +extern __shared__ omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; + + +//////////////////////////////////////////////////////////////////////////////// +// get private data structures +//////////////////////////////////////////////////////////////////////////////// + +INLINE omptarget_nvptx_TeamDescr & 
getMyTeamDescriptor(); +INLINE omptarget_nvptx_WorkDescr & getMyWorkDescriptor(); +INLINE omptarget_nvptx_TaskDescr * getMyTopTaskDescriptor(); +INLINE omptarget_nvptx_TaskDescr * getMyTopTaskDescriptor(int globalThreadId); + + +//////////////////////////////////////////////////////////////////////////////// +// inlined implementation +//////////////////////////////////////////////////////////////////////////////// + +#include "../../../deviceRTLs/nvptx/src/supporti.h" +#include "../../../deviceRTLs/nvptx/src/omptarget-nvptxi.h" +#include "../../../deviceRTLs/nvptx/src/counter_groupi.h" + +#endif Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -0,0 +1,70 @@ +//===--- omptarget-nvptx.cu - NVPTX OpenMP GPU initialization ---- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the initialization code for the GPU +// +//===----------------------------------------------------------------------===// + +#include "../../../deviceRTLs/nvptx/src/omptarget-nvptx.h" + +//////////////////////////////////////////////////////////////////////////////// +// global data tables +//////////////////////////////////////////////////////////////////////////////// + +__device__ omptarget_nvptx_TeamDescr omptarget_nvptx_device_teamContexts[MAX_INSTANCES][MAX_NUM_TEAMS]; +__device__ omptarget_nvptx_ThreadPrivateContext omptarget_nvptx_device_threadPrivateContext[MAX_INSTANCES]; +__device__ omptarget_nvptx_GlobalICV omptarget_nvptx_device_globalICV[MAX_INSTANCES]; + +__shared__ omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; + +//////////////////////////////////////////////////////////////////////////////// +// init entry points +//////////////////////////////////////////////////////////////////////////////// + + +EXTERN void __kmpc_kernel_init(int OmpHandle, int ThreadLimit) +{ + PRINT(LD_IO, "call to __kmpc_kernel_init with version %f\n", OMPTARGET_NVPTX_VERSION); + ASSERT0(LT_FUSSY, OmpHandle >=0 && OmpHandle < MAX_INSTANCES, + "omp handle out of bounds"); + omptarget_nvptx_threadPrivateContext = &omptarget_nvptx_device_threadPrivateContext[OmpHandle]; + omptarget_nvptx_threadPrivateContext->SetTeamContext( + &omptarget_nvptx_device_teamContexts[OmpHandle][0]); + omptarget_nvptx_threadPrivateContext->SetGlobalICV( + &omptarget_nvptx_device_globalICV[OmpHandle]); + + // init thread private + int globalThreadId = GetGlobalThreadId(); + omptarget_nvptx_threadPrivateContext->InitThreadPrivateContext(globalThreadId); + + int threadIdInBlock = GetThreadIdInBlock(); + if (threadIdInBlock == TEAM_MASTER) { + PRINT0(LD_IO, "call to __kmpc_kernel_init for master\n"); + // init global icv + omptarget_nvptx_threadPrivateContext->GlobalICV()->gpuCycleTime = 1.0 / 745000000.0; // host reports 745 mHz + omptarget_nvptx_threadPrivateContext->GlobalICV()->cancelPolicy = FALSE; // currently false only + // init team context + omptarget_nvptx_TeamDescr & currTeamDescr = getMyTeamDescriptor(); + currTeamDescr.InitTeamDescr(); + // this thread will start execution... has to update its task ICV + // to points to the level zero task ICV. 
That ICV was init in + // InitTeamDescr() + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(globalThreadId, + currTeamDescr.LevelZeroTaskDescr()); + + // set number of threads and thread limit in team to started value + int globalThreadId = GetGlobalThreadId(); + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr( + globalThreadId); + currTaskDescr->NThreads() = GetNumberOfThreadsInBlock(); + currTaskDescr->ThreadLimit() = ThreadLimit; + } +} + Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h @@ -0,0 +1,164 @@ +//===---- omptarget-nvptxi.h - NVPTX OpenMP GPU initialization --- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of all library macros, types, +// and functions. +// +//===----------------------------------------------------------------------===// + +//////////////////////////////////////////////////////////////////////////////// +// Task Descriptor +//////////////////////////////////////////////////////////////////////////////// + +INLINE omp_sched_t omptarget_nvptx_TaskDescr::GetRuntimeSched() +{ + // sched starts from 1..4; encode it as 0..3; so add 1 here + uint8_t rc = (data.items.flags & TaskDescr_SchedMask) +1; + return (omp_sched_t) rc; +} + +INLINE void omptarget_nvptx_TaskDescr::SetRuntimeSched(omp_sched_t sched) +{ + // sched starts from 1..4; encode it as 0..3; so add 1 here + uint8_t val = ((uint8_t) sched) -1; + // clear current sched + data.items.flags &= ~TaskDescr_SchedMask; + // set new sched + data.items.flags |= val; +} + +INLINE void omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr() +{ + // xxx slow method + /* flag: + default sched is static, + dyn is off (unused now anyway, but may need to sample from host ?) 
+ not in parallel + */ + data.items.flags = 0; + data.items.nthreads = GetNumberOfProcsInTeam();; // threads: whatever was alloc by kernel + data.items.threadId = 0; // is master + data.items.threadsInTeam = 1; // sequential + data.items.runtimeChunkSize = 1; // prefered chunking statik with chunk 1 +} + +INLINE void omptarget_nvptx_TaskDescr::CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr) +{ + data.vect[0] = sourceTaskDescr->data.vect[0]; + data.vect[1] = sourceTaskDescr->data.vect[1]; +} + +INLINE void omptarget_nvptx_TaskDescr::Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr) +{ + CopyData(sourceTaskDescr); + prev = sourceTaskDescr->prev; +} + +INLINE void omptarget_nvptx_TaskDescr::CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr) +{ + CopyData(parentTaskDescr); + prev = parentTaskDescr; +} + +INLINE void omptarget_nvptx_TaskDescr::CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr) +{ + CopyParent(parentTaskDescr); + data.items.flags = data.items.flags & ~TaskDescr_IsParConstr; + ASSERT0(LT_FUSSY, IsTaskConstruct(), "expected task"); +} + +INLINE void omptarget_nvptx_TaskDescr::CopyToWorkDescr( + omptarget_nvptx_TaskDescr *masterTaskDescr, + uint16_t tnum) +{ + CopyParent(masterTaskDescr); + // overrwrite specific items; + data.items.flags |= TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel + data.items.threadsInTeam = tnum; // set number of threads +} + +INLINE void omptarget_nvptx_TaskDescr::CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr) +{ + Copy(workTaskDescr); + // overrwrite specific items; + data.items.threadId = GetThreadIdInBlock(); // get ids from cuda (only called for 1st level) +} + +//////////////////////////////////////////////////////////////////////////////// +// Thread Private Context +//////////////////////////////////////////////////////////////////////////////// + +INLINE omptarget_nvptx_TaskDescr *omptarget_nvptx_ThreadPrivateContext::GetTopLevelTaskDescr(int gtid) +{ + ASSERT0(LT_FUSSY, gtid < MAX_NUM_OMP_THREADS, + "Getting top level, gtid is larger than allocated data structure size"); + return topTaskDescr[gtid]; +} + +INLINE void omptarget_nvptx_ThreadPrivateContext::InitThreadPrivateContext(int gtid) +{ + // levelOneTaskDescr is init when starting the parallel region + // top task descr is NULL (team master version will be fixed separately) + topTaskDescr[gtid] = NULL; + // no num threads value has been pushed + tnumForNextPar[gtid] = 0; + // priv counter init to zero + priv[gtid] = 0; + // the following don't need to be init here; they are init when using dyn sched + // current_Event, events_Number, chunk, num_Iterations, schedule +} + +//////////////////////////////////////////////////////////////////////////////// +// Work Descriptor +//////////////////////////////////////////////////////////////////////////////// + +INLINE void omptarget_nvptx_WorkDescr::InitWorkDescr() +{ + cg.Clear(); // start and stop to zero too + // threadsInParallelTeam does not need to be init (done in start parallel) + hasCancel = FALSE; +} + +//////////////////////////////////////////////////////////////////////////////// +// Team Descriptor +//////////////////////////////////////////////////////////////////////////////// + +INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr() +{ + levelZeroTaskDescr.InitLevelZeroTaskDescr(); + workDescrForActiveParallel.InitWorkDescr(); + //omp_init_lock(criticalLock); +} + +//////////////////////////////////////////////////////////////////////////////// +// Get private data structure for 
thread +//////////////////////////////////////////////////////////////////////////////// + +// Utility routines for CUDA threads +INLINE omptarget_nvptx_TeamDescr & getMyTeamDescriptor() +{ + return omptarget_nvptx_threadPrivateContext->TeamContext()[GetOmpTeamId()]; +} + +INLINE omptarget_nvptx_WorkDescr & getMyWorkDescriptor() +{ + omptarget_nvptx_TeamDescr & currTeamDescr = getMyTeamDescriptor(); + return currTeamDescr.WorkDescr(); +} + +INLINE omptarget_nvptx_TaskDescr * getMyTopTaskDescriptor(int globalThreadId) +{ + return omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(globalThreadId); +} + +INLINE omptarget_nvptx_TaskDescr * getMyTopTaskDescriptor() +{ + return getMyTopTaskDescriptor(GetGlobalThreadId()); +} + Index: libomptarget/deviceRTLs/nvptx/src/option.h =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/option.h @@ -0,0 +1,78 @@ +//===------------ option.h - NVPTX OpenMP GPU options ------------ CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// GPU default options +// +//===----------------------------------------------------------------------===// +#ifndef _OPTION_H_ +#define _OPTION_H_ + +//////////////////////////////////////////////////////////////////////////////// +// Kernel options +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// following two defs must match absolute limit hardwired in the host RTL +#define TEAMS_ABSOLUTE_LIMIT 512 /* omptx limit (must match teamsAbsoluteLimit) */ +#define THREAD_ABSOLUTE_LIMIT 1024 /* omptx limit (must match threadAbsoluteLimit) */ + +// max number of blocks depend on the kernel we are executing - pick default here +#define MAX_NUM_TEAMS TEAMS_ABSOLUTE_LIMIT +#define WARPSIZE 32 +#define MAX_NUM_WARPS (MAX_NUM_TEAMS * THREAD_ABSOLUTE_LIMIT / WARPSIZE) +#define MAX_NUM_THREADS MAX_NUM_WARPS * WARPSIZE + +#ifdef OMPTHREAD_IS_WARP + // assume here one OpenMP thread per CUDA warp + #define MAX_NUM_OMP_THREADS MAX_NUM_WARPS +#else + // assume here one OpenMP thread per CUDA thread + #define MAX_NUM_OMP_THREADS MAX_NUM_THREADS +#endif + +#define MAX_INSTANCES 16 + +//////////////////////////////////////////////////////////////////////////////// +// algo options +//////////////////////////////////////////////////////////////////////////////// + + +//////////////////////////////////////////////////////////////////////////////// +// data options +//////////////////////////////////////////////////////////////////////////////// + + +// decide if counters are 32 or 64 bit +#define Counter unsigned long long + +// aee: KMP defines kmp_int to be 32 or 64 bits depending on the target. 
+// think we don't need it here (meaning we can be always 64 bit compatible) +/* +#ifdef KMP_I8 + typedef kmp_int64 kmp_int; +#else + typedef kmp_int32 kmp_int; +#endif +*/ + +//////////////////////////////////////////////////////////////////////////////// +// misc options (by def everythig here is device) +//////////////////////////////////////////////////////////////////////////////// + +#define EXTERN extern "C" __device__ +#define INLINE __inline__ __device__ +#define NOINLINE __noinline__ __device__ +#ifndef TRUE + #define TRUE 1 +#endif +#ifndef FALSE + #define FALSE 0 +#endif + +#endif Index: libomptarget/deviceRTLs/nvptx/src/parallel.cu =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -0,0 +1,244 @@ +//===---- parallel.cu - NVPTX OpenMP parallel implementation ----- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// Parallel implemention in the GPU. Here is the pattern: +// +// while (not finished) { +// +// if (master) { +// sequential code, decide which par loop to do, or if finished +// __kmpc_kernel_prepare_parallel() // exec by master only +// } +// syncthreads // A +// __kmpc_kernel_parallel() // exec by all +// if (this thread is included in the parallel) { +// switch () for all parallel loops +// __kmpc_kernel_end_parallel() // exec only by threads in parallel +// } +// +// +// The reason we don't exec end_parallel for the threads not included +// in the parallel loop is that for each barrier in the parallel +// region, these non-included threads will cycle through the +// syncthread A. Thus they must preserve their current threadId that +// is larger than thread in team. +// +// To make a long story short... +// +//===----------------------------------------------------------------------===// + +#include "../../../deviceRTLs/nvptx/src/omptarget-nvptx.h" + +//////////////////////////////////////////////////////////////////////////////// +// support for parallel that goes parallel (1 static level only) +//////////////////////////////////////////////////////////////////////////////// + +// return number of cuda threads that participate to parallel +// calculation has to consider simd implementation in nvptx +// i.e. 
(num omp threads * num lanes) +// +// cudathreads = +// if(num_threads != 0) { +// if(thread_limit > 0) { +// min (num_threads*numLanes ; thread_limit*numLanes); +// } else { +// min (num_threads*numLanes; blockDim.x) +// } +// } else { +// if (thread_limit != 0) { +// min (thread_limit*numLanes; blockDim.x) +// } else { // no thread_limit, no num_threads, use all cuda threads +// blockDim.x; +// } +// } +EXTERN int __kmpc_kernel_prepare_parallel(int NumThreads, int NumLanes) +{ + PRINT0(LD_IO , "call to __kmpc_kernel_prepare_parallel\n"); + int globalThreadId = GetGlobalThreadId(); + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(globalThreadId); + ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr"); + if (currTaskDescr->InParallelRegion()) { + PRINT0(LD_PAR, "already in parallel: go seq\n"); + + // todo: support nested parallelism + return FALSE; + } + + uint16_t CudaThreadsForParallel = 0; + uint16_t NumThreadsClause = + omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel( + globalThreadId); + + // we cannot have more than block size + uint16_t CudaThreadsAvail = GetNumberOfThreadsInBlock(); + + // this is different from ThreadAvail of OpenMP because we may be + // using some of the CUDA threads as SIMD lanes + + if (NumThreadsClause != 0) { + // reset request to avoid propagating to successive #parallel + omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel( + globalThreadId) = 0; + + // assume that thread_limit*numlanes is already <= CudaThreadsAvail + // because that is already checked on the host side (CUDA offloading rtl) + if (currTaskDescr->ThreadLimit() != 0) + CudaThreadsForParallel = + NumThreadsClause*NumLanes < currTaskDescr->ThreadLimit()*NumLanes ? + NumThreadsClause*NumLanes : currTaskDescr->ThreadLimit()*NumLanes; + else { + CudaThreadsForParallel = (NumThreadsClause*NumLanes > CudaThreadsAvail) ? + CudaThreadsAvail : NumThreadsClause*NumLanes; + } + } else { + if (currTaskDescr->ThreadLimit() != 0) { + CudaThreadsForParallel = + (currTaskDescr->ThreadLimit()*NumLanes > CudaThreadsAvail) ? + CudaThreadsAvail : currTaskDescr->ThreadLimit()*NumLanes; + } else + CudaThreadsForParallel = GetNumberOfThreadsInBlock(); + } + + ASSERT(LT_FUSSY, CudaThreadsForParallel > 0, "bad thread request of %d threads", CudaThreadsForParallel); + ASSERT0(LT_FUSSY, GetThreadIdInBlock() == TEAM_MASTER, "only team master can create parallel"); + + // set number of threads on work descriptor + // this is different from the number of cuda threads required for the parallel + // region + omptarget_nvptx_WorkDescr & workDescr = getMyWorkDescriptor(); + workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr, + CudaThreadsForParallel/NumLanes); + // init counters (copy start to init) + workDescr.CounterGroup().Reset(); + + return CudaThreadsForParallel; +} + +// works only for active parallel loop... 
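+// Editor's note: illustrative, editor-added sketch (kept under "#if 0", not
+// part of the patch) of the master/worker pattern described in the comment
+// at the top of this file. The shared variables and the inclusion test are
+// hypothetical compiler-generated state, not runtime APIs.
+#if 0
+__device__ void target_region_skeleton() {
+  __shared__ int cudaThreadsInParallel;
+  __shared__ int finished;
+  if (GetThreadIdInBlock() == TEAM_MASTER) finished = FALSE;
+  __syncthreads();
+  while (!finished) {
+    if (GetThreadIdInBlock() == TEAM_MASTER) {
+      // sequential part: decide which parallel region to run next,
+      // or set finished = TRUE when there is no more work
+      cudaThreadsInParallel =
+          __kmpc_kernel_prepare_parallel(/*NumThreads=*/0, /*NumLanes=*/1);
+    }
+    __syncthreads();                         // barrier "A" in the comment above
+    __kmpc_kernel_parallel(/*numLanes=*/1);  // executed by every thread
+    if (GetThreadIdInBlock() < cudaThreadsInParallel) {
+      // switch over the outlined parallel regions would go here
+      __kmpc_kernel_end_parallel();          // only threads in the parallel
+    }
+    __syncthreads();
+  }
+}
+#endif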
+EXTERN void __kmpc_kernel_parallel(int numLanes) +{ + PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n"); + // init work descriptor from workdesccr + int globalThreadId = GetGlobalThreadId(); + omptarget_nvptx_TaskDescr *newTaskDescr = + omptarget_nvptx_threadPrivateContext->Level1TaskDescr(globalThreadId); + omptarget_nvptx_WorkDescr & workDescr = getMyWorkDescriptor(); + ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr"); + newTaskDescr->CopyFromWorkDescr(workDescr.WorkTaskDescr()); + // install new top descriptor + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(globalThreadId, newTaskDescr); + // init private from int value + workDescr.CounterGroup().Init( + omptarget_nvptx_threadPrivateContext->Priv(globalThreadId)); + PRINT(LD_PAR, "thread will execute parallel region with id %d in a team of %d threads\n", + newTaskDescr->ThreadId(), newTaskDescr->NThreads()); + + // each thread sets its omp thread ID when entering a parallel region + // based on the number of simd lanes and its cuda thread ID + if (numLanes > 1) { + // the compiler is requesting lanes for #simd execution + // WARNING: assume thread number in #parallel is a multiple of numLanes + newTaskDescr->ThreadId() /= numLanes; + //newTaskDescr->ThreadsInTeam(); // = newTaskDescr->ThreadsInTeam()/numLanes; + } +// } else { +// // not a for with a simd inside: use only one lane +// // we may have started thread_limit*simd_info CUDA threads +// // and we need to set the number of threads to thread_limit value +// // FIXME: is this always the case, even if numLanes > 1? +//// newTaskDescr->ThreadId() = threadIdx.x; +// //newTaskDescr->ThreadsInTeam();// = newTaskDescr->ThreadLimit(); +// } +} + + + +EXTERN void __kmpc_kernel_end_parallel() +{ + PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_parallel\n"); + // pop stack + int globalThreadId = GetGlobalThreadId(); + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(globalThreadId); + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(globalThreadId, + currTaskDescr->GetPrevTaskDescr()); +} + + +//////////////////////////////////////////////////////////////////////////////// +// support for parallel that goes sequential +//////////////////////////////////////////////////////////////////////////////// + +EXTERN void __kmpc_serialized_parallel(kmp_Indent *loc, uint32_t global_tid) +{ + PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n"); + + // assume this is only called for nested parallel + int globalThreadId = GetGlobalThreadId(); + + // unlike actual parallel, threads in the same team do not share + // the workTaskDescr in this case and num threads is fixed to 1 + + // get current task + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(globalThreadId); + + // allocate new task descriptor and copy value from current one, set prev to it + omptarget_nvptx_TaskDescr *newTaskDescr = (omptarget_nvptx_TaskDescr *) + SafeMalloc(sizeof(omptarget_nvptx_TaskDescr), (char *) "new seq parallel task"); + newTaskDescr->CopyParent(currTaskDescr); + + // tweak values for serialized parallel case: + // - each thread becomes ID 0 in its serialized parallel, and + // - there is only one thread per team + newTaskDescr->ThreadId() = 0; + newTaskDescr->ThreadsInTeam() = 1; + + // set new task descriptor as top + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(globalThreadId, newTaskDescr); +} + +EXTERN void __kmpc_end_serialized_parallel(kmp_Indent *loc, uint32_t global_tid) +{ + PRINT0(LD_IO, "call to 
__kmpc_end_serialized_parallel\n"); + + // pop stack + int globalThreadId = GetGlobalThreadId(); + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(globalThreadId); + // set new top + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( + globalThreadId, + currTaskDescr->GetPrevTaskDescr()); + // free + SafeFree(currTaskDescr, (char *) "new seq parallel task"); +} + +//////////////////////////////////////////////////////////////////////////////// +// push params +//////////////////////////////////////////////////////////////////////////////// + + +EXTERN void __kmpc_push_num_threads (kmp_Indent * loc, int32_t gtid, + int32_t num_threads) +{ + PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads); + // only the team master updates the state + gtid = GetGlobalThreadId(); + omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(gtid) = num_threads; +} + +// Do not do nothing: the host guarantees we started the requested number of +// teams and we only need inspection gridDim + +EXTERN void __kmpc_push_num_teams (kmp_Indent * loc, int32_t gtid, + int32_t num_teams, int32_t thread_limit) +{ + PRINT(LD_IO, "call kmpc_push_num_teams %d\n", num_teams); + ASSERT0(LT_FUSSY, FALSE, "should never have anything with new teams on device"); +} + Index: libomptarget/deviceRTLs/nvptx/src/reduction.cu =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/reduction.cu @@ -0,0 +1,1247 @@ +//===---- reduction.cu - NVPTX OpenMP reduction implementation ---- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of reduction with KMPC interface. +// +//===----------------------------------------------------------------------===// + +#include +#include + +#include "../../../deviceRTLs/nvptx/src/omptarget-nvptx.h" + +//EXTERN int32_t __gpu__warpBlockRedu_fixed4_add(int32_t); + +EXTERN void omp_reduction_op_gpu(char *, char *); + + +//cannot implement atomic_start and atomic_end for GPU. 
Report runtime error +EXTERN void __kmpc_atomic_start() { + printf("__kmpc_atomic_start not supported\n"); \ + asm("trap;"); \ + return; \ +} + +EXTERN void __kmpc_atomic_end() { + printf("__kmpc_atomic_end not supported\n"); \ + asm("trap;"); \ + return; \ +} + +//EXTERN kmp_ReductFctPtr *gpu_callback = (kmp_ReductFctPtr *)omp_reduction_op_gpu; + +EXTERN +int32_t __gpu_block_reduce(){ + if (omp_get_num_threads() != blockDim.x) + return 0; + unsigned tnum = __ballot(1); + if (tnum != (~0x0)) { //assume swapSize is 32 + return 0; + } + return 1; +} + +EXTERN +int32_t __kmpc_reduce_gpu(kmp_Indent *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, void *reduce_array_size, kmp_ReductFctPtr *reductFct, kmp_CriticalName *lck) { + int globalThreadId = GetGlobalThreadId(); + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(globalThreadId); + int numthread; + if (currTaskDescr->IsParallelConstruct()) { + numthread = omp_get_num_threads(); + } else { + numthread = omp_get_num_teams(); + } + + if (numthread == 1) + return 1; + else if (!__gpu_block_reduce()) + return 2; + else { + if (threadIdx.x == 0) + return 1; + else + return 0; + } +// return 2; + /** + * Only when all the threads in a block are doing reduction, + * the warpBlockRedu is used. Otherwise atomic. + * check the data type, too. + * A special case: when the size of thread group is one, + * do reduction directly. + **/ + + // Note: this code provokes warning because it follows a "return" + + //since there is no thread interface yet, just infer from the + // result of ballot +#if 0 + unsigned tnum = __ballot(1); + if (tnum != (~0x0)) { //assume swapSize is 32 + return 2; + } + +#if 0 + if (threadIdx.x == 0) { + if ((void *)reductFct != (void *)omp_reduction_op_gpu) { + printf("function pointer value is not correct\n"); + } else { + printf("function pointer value is correct\n"); + } + } +#endif + + //printf("function pointer %p %d %p\n", reductFct, reduce_size, omp_reduction_op_gpu); + if (reduce_size == 0) { + (*reductFct)((char*)reduce_data, (char*)reduce_data); + } else { + //omp_reduction_op_gpu((char*)reduce_data, (char*)reduce_data); + (*gpu_callback)((char*)reduce_data, (char*)reduce_data); + } + + //int **myp = (int **) reduce_data; + // the results are with thread 0. 
Reduce to the shared one + if (threadIdx.x == 0) { + //printf("function pointer %p %p\n", reductFct, omp_reduction_op); + // printf("my result %d\n", *myp[0]); + return 1; + } else { + return 0; + } +#endif +} + +EXTERN +int32_t __kmpc_reduce_combined(kmp_Indent *loc) { + if (threadIdx.x == 0){ + return 2; + }else{ + return 0; + } +} + +EXTERN +int32_t __kmpc_reduce41(kmp_Indent *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, void *reduce_array_size, kmp_ReductFctPtr *reductFct, kmp_CriticalName *lck) { + return __kmpc_reduce_gpu(loc, global_tid, num_vars, reduce_size, reduce_data, reduce_array_size, reductFct, lck); +} + +EXTERN +int32_t __kmpc_reduce_nowait41(kmp_Indent *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, void *reduce_array_size, kmp_ReductFctPtr *reductFct, kmp_CriticalName *lck) { + int globalThreadId = GetGlobalThreadId(); + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(globalThreadId); + int numthread; + if (currTaskDescr->IsParallelConstruct()) { + numthread = omp_get_num_threads(); + } else { + numthread = omp_get_num_teams(); + } + + if (numthread == 1) + return 1; + else if (!__gpu_block_reduce()) + return 2; + else { + if (threadIdx.x == 0) + return 1; + else + return 0; + } + + // Notice: as above, uncomment if 0 once this code below is ready for shipping +#if 0 + unsigned tnum = __ballot(1); + if (tnum != (~0x0)) { //assume swapSize is 32 + return 2; + } + + if (threadIdx.x == 0) { + printf("choose block reduction\n"); + } + + (*reductFct)(reduce_data, reduce_data); + //omp_reduction_op((char*)reduce_data, (char*)reduce_data); + + int **myp = (int **) reduce_data; + // the results are with thread 0. Reduce to the shared one + if (threadIdx.x == 0) { + printf("my result %d\n", *myp[0]); + return 1; + } else { + return 0; + } +#endif +} + +EXTERN +void __kmpc_end_reduce( kmp_Indent *loc, int32_t global_tid, kmp_CriticalName *lck ) { +} + +EXTERN +void __kmpc_end_reduce_nowait( kmp_Indent *loc, int32_t global_tid, kmp_CriticalName *lck ) { +} + + + + +/* implement different data type or operations with atomicCAS + */ + +#define omptarget_nvptx_add(x, y) ((x)+(y)) +#define omptarget_nvptx_sub(x, y) ((x)-(y)) +#define omptarget_nvptx_sub_rev(y, x) ((x)-(y)) +#define omptarget_nvptx_mul(x, y) ((x)*(y)) +#define omptarget_nvptx_div(x, y) ((x)/(y)) +#define omptarget_nvptx_div_rev(y, x) ((x)/(y)) +#define omptarget_nvptx_min(x, y) ((x)>(y)?(y):(x)) +#define omptarget_nvptx_max(x, y) ((x)<(y)?(y):(x)) +#define omptarget_nvptx_andb(x, y) ((x) & (y)) +#define omptarget_nvptx_orb(x, y) ((x) | (y)) +#define omptarget_nvptx_xor(x, y) ((x) ^ (y)) +#define omptarget_nvptx_shl(x,y) ((x) << (y)) +#define omptarget_nvptx_shr(x,y) ((x) >> (y)) +#define omptarget_nvptx_andl(x, y) ((x) && (y)) +#define omptarget_nvptx_orl(x, y) ((x) || (y)) +#define omptarget_nvptx_eqv(x, y) ((x) == (y)) +#define omptarget_nvptx_neqv(x, y) ((x) != (y)) + +#if 0 +// keep for debugging +EXTERN +void __kmpc_atomic_fixed4_add(kmp_Indent *id_ref, int32_t gtid, int32_t * lhs, int32_t rhs) { + //if (gtid < 64) + PRINT(LD_LOOP, "thread %d participating in reduction, lhs = %p, rhs = %d\n", gtid, lhs, rhs); + atomicAdd(lhs, rhs); +} +#endif + + +INLINE __device__ float atomicCAS(float *_addr, float _compare, float _val) { + int *addr = (int *) _addr; + int compare = __float_as_int(_compare); + int val = __float_as_int(_val); + return __int_as_float(atomicCAS(addr, compare, val)); +} + +INLINE __device__ double 
atomicCAS(double *_addr, double _compare, double _val) { + unsigned long long int *addr = (unsigned long long int *) _addr; + unsigned long long int compare = __double_as_longlong(_compare); + unsigned long long int val = __double_as_longlong(_val); + return __longlong_as_double(atomicCAS(addr, compare, val)); +} + +INLINE __device__ long long int atomicCAS(long long int *_addr, long long int _compare, long long int _val) { + unsigned long long int *addr = (unsigned long long int *) _addr; + unsigned long long int compare = (unsigned long long int)(_compare); + unsigned long long int val = (unsigned long long int)(_val); + return (long long int) (atomicCAS(addr, compare, val)); +} + +INLINE __device__ int64_t atomicCAS(int64_t *_addr, int64_t _compare, int64_t _val) { + unsigned long long int *addr = (unsigned long long int *) _addr; + unsigned long long int compare = (unsigned long long int)(_compare); + unsigned long long int val = (unsigned long long int)(_val); + return (int64_t) (atomicCAS(addr, compare, val)); +} + +INLINE __device__ uint64_t atomicCAS(uint64_t *_addr, uint64_t _compare, uint64_t _val) { + unsigned long long int *addr = (unsigned long long int *) _addr; + unsigned long long int compare = (unsigned long long int)(_compare); + unsigned long long int val = (unsigned long long int)(_val); + return (uint64_t) (atomicCAS(addr, compare, val)); +} + +INLINE __device__ float complex atomicCAS(float complex *_addr, float complex _compare, float complex _val) { + double *addr = (double *) _addr; + double compare = (double)(_compare); + double val = (double)(_val); + return (float complex) (atomicCAS(addr, compare, val)); +} + +#define ATOMIC_GENOP_NATIVE(_name, _dtype, _op, _cudaop) \ + EXTERN void __kmpc_atomic_##_name##_##_op\ + (kmp_Indent *id_ref, int32_t gtid, _dtype * lhs, _dtype rhs) { \ + PRINT(LD_LOOP, "Reduction: thead %d\n", gtid); \ + atomic##_cudaop(lhs, rhs); \ +} \ + \ + EXTERN _dtype __kmpc_atomic_##_name##_##_op##_cpt\ + (kmp_Indent *id_ref, int32_t gtid, _dtype * lhs, _dtype rhs, int flag) { \ + _dtype old = atomic##_cudaop(lhs, rhs); \ + if (flag) { \ + return omptarget_nvptx_##_op(old, rhs); \ + } else {\ + return old; \ + } \ +} + +/*for types that are supported directly by atomicCAS */ +#define ATOMIC_GENOP_DIRECT(_name, _dtype, _op) \ + EXTERN void __kmpc_atomic_##_name##_##_op\ + (kmp_Indent *id_ref, int32_t gtid, _dtype * lhs, _dtype rhs) { \ + PRINT(LD_LOOP, "Reduction: thead %d\n", gtid); \ + _dtype *temp_lhs = lhs; \ + _dtype oldvalue = *temp_lhs; \ + _dtype saved ; \ + _dtype newvalue ; \ + do { \ + saved = oldvalue; \ + newvalue = (_dtype)omptarget_nvptx_##_op(saved, rhs); \ + oldvalue = atomicCAS(temp_lhs, saved, newvalue); \ + } while (saved != oldvalue) ;\ +} \ + \ + EXTERN _dtype __kmpc_atomic_##_name##_##_op##_cpt\ + (kmp_Indent *id_ref, int32_t gtid, _dtype * lhs, _dtype rhs, int flag) { \ + _dtype *temp_lhs = lhs; \ + _dtype oldvalue = *temp_lhs; \ + _dtype saved ; \ + _dtype newvalue ; \ + do { \ + saved = oldvalue; \ + newvalue = (_dtype)omptarget_nvptx_##_op(saved, rhs); \ + oldvalue = atomicCAS(temp_lhs, saved, newvalue); \ + } while (saved != oldvalue) ;\ + if (flag) return newvalue; \ + else return oldvalue ; \ +} + +#define ATOMIC_GENOP_DIRECT_REV(_name, _dtype, _op) \ + EXTERN void __kmpc_atomic_##_name##_##_op##_rev\ + (kmp_Indent *id_ref, int32_t gtid, _dtype * lhs, _dtype rhs) { \ + _dtype *temp_lhs = lhs; \ + _dtype oldvalue = *temp_lhs; \ + _dtype saved ; \ + _dtype newvalue ; \ + do { \ + saved = oldvalue; \ + newvalue = 
(_dtype)omptarget_nvptx_##_op(rhs, saved); \ + oldvalue = atomicCAS(temp_lhs, saved, newvalue); \ + } while (saved != oldvalue) ;\ +} \ + \ + EXTERN _dtype __kmpc_atomic_##_name##_##_op##_cpt##_rev\ + (kmp_Indent *id_ref, int32_t gtid, _dtype * lhs, _dtype rhs, int flag) { \ + _dtype *temp_lhs = lhs; \ + _dtype oldvalue = *temp_lhs; \ + _dtype saved ; \ + _dtype newvalue ; \ + do { \ + saved = oldvalue; \ + newvalue = (_dtype)omptarget_nvptx_##_op(rhs, saved); \ + oldvalue = atomicCAS(temp_lhs, saved, newvalue); \ + } while (saved != oldvalue) ;\ + if (flag) return newvalue; \ + else return oldvalue ; \ +} + + +INLINE __device__ void dc_add(double complex *lhs, double complex rhs) { + double *ptrl = (double *)lhs; + double *ptrr = (double *) &rhs; + ptrl[0] += ptrr[0]; + ptrl[1] += ptrr[1]; +} + +INLINE __device__ void dc_sub(double complex *lhs, double complex rhs) { + double *ptrl = (double *)lhs; + double *ptrr = (double *) &rhs; + ptrl[0] -= ptrr[0]; + ptrl[1] -= ptrr[1]; +} + +INLINE __device__ void dc_mul(double complex *lhs, double complex rhs) { + double *ptrl = (double *)lhs; + double *ptrr = (double *) &rhs; + double r1 = ptrl[0], r2 = ptrr[0]; + double i1 = ptrl[1], i2 = ptrr[1]; + ptrl[0] = r1*r2-i1*i2; + ptrl[1] = r1*i2+r2*i1; +} + +INLINE __device__ void dc_div(double complex *lhs, double complex rhs) { + double *ptrl = (double *)lhs; + double *ptrr = (double *) &rhs; + double r1 = ptrl[0], r2 = ptrr[0]; + double i1 = ptrl[1], i2 = ptrr[1]; + ptrl[0] = (r1*r2+i1*i2)/(r2*r2+i2*i2); + ptrl[1] = (i1*r2-r1*i2)/(r2*r2+i2*i2); +} + +#define ATOMIC_GENOP_DC(_op) \ + EXTERN void __kmpc_atomic_cmplx8_##_op\ + (kmp_Indent *id_ref, int32_t gtid, double _Complex * lhs, double _Complex rhs) { \ + printf("double complex atomic opertion not supported\n"); \ + asm("trap;"); \ + return; \ + }\ + EXTERN double _Complex __gpu_warpBlockRedu_cmplx8_##_op(double _Complex rhs) { \ + __shared__ double _Complex lhs; \ + if (threadIdx.x == 0 ) \ + lhs = rhs; \ + __syncthreads(); \ + for (int i= 1; i +INLINE __device__ OpType Compute(OpType a, OpType b) // a is old value, b is new value +{ + OpType res = 0; + if (binop == omptarget_nvptx_inc) res = a + b; + if (binop == omptarget_nvptx_dec) res = a - b; + if (binop == omptarget_nvptx_add) res = a + b; + if (binop == omptarget_nvptx_sub) res = a - b; + if (binop == omptarget_nvptx_sub_rev) res = b - a; + if (binop == omptarget_nvptx_mul) res = a * b; + if (binop == omptarget_nvptx_div) res = a / b; + if (binop == omptarget_nvptx_div_rev) res = b / a; + if (binop == omptarget_nvptx_min) res = a < b ? a : b; + if (binop == omptarget_nvptx_max) res = a > b ? 
a : b; + if (binop == omptarget_nvptx_rd) res = a; // read + if (binop == omptarget_nvptx_wr) res = b; // write and swap are the same + if (binop == omptarget_nvptx_swp) res = b; // write and swap are the same + if (binop == omptarget_nvptx_andb) res = a & b; + if (binop == omptarget_nvptx_orb) res = a | b; + if (binop == omptarget_nvptx_xor) res = a ^ b; + if (binop == omptarget_nvptx_andl) res = a && b; + if (binop == omptarget_nvptx_orl) res = a || b; + if (binop == omptarget_nvptx_eqv) res = a == b; + if (binop == omptarget_nvptx_neqv) res = a != b; + if (binop == omptarget_nvptx_shl) res = a << b; + if (binop == omptarget_nvptx_shl_rev) res = b << a; + if (binop == omptarget_nvptx_shr) res = a >> b; + if (binop == omptarget_nvptx_shr_rev) res = b >> a; + + return res; +} + + +template<> +INLINE __device__ float Compute(float a, float b) +{ + return a+b; +} + +template<> +INLINE __device__ float Compute(float a, float b) +{ + return a-b; +} + +template<> +INLINE __device__ float Compute(float a, float b) +{ + return a*b; +} + +template<> +INLINE __device__ float Compute(float a, float b) +{ + return a/b; +} + +template<> +INLINE __device__ float Compute(float a, float b) +{ + return a +INLINE __device__ float Compute(float a, float b) +{ + return a>b?a:b; +} + +template<> +INLINE __device__ double Compute(double a, double b) +{ + return a+b; +} + +template<> +INLINE __device__ double Compute(double a, double b) +{ + return a-b; +} + +template<> +INLINE __device__ double Compute(double a, double b) +{ + return a*b; +} + +template<> +INLINE __device__ double Compute(double a, double b) +{ + return a/b; +} + +template<> +INLINE __device__ double Compute(double a, double b) +{ + return a +INLINE __device__ double Compute(double a, double b) +{ + return a>b?a:b; +} + +#if 0 +template < + omptarget_nvptx_BINOP_t binop // enum describing the operation +> +INLINE __device__ float Compute(float a, float b) // a is old value, b is new value +{ + OpType res = 0; + if (binop == omptarget_nvptx_add) res = a + b; + if (binop == omptarget_nvptx_sub) res = a - b; + if (binop == omptarget_nvptx_mul) res = a * b; + if (binop == omptarget_nvptx_div) res = a / b; + if (binop == omptarget_nvptx_min) res = a < b ? a : b; + if (binop == omptarget_nvptx_max) res = a > b ? 
a : b; + return res; +} +#endif + + +//////////////////////////////////////////////////////////////////////////////// +// common atomic slicing functions (modifying only a part of a word) +//////////////////////////////////////////////////////////////////////////////// + +template < + typename MemType, // type of the underlying atomic memory operation + typename OpType // type of the operation performed +> +INLINE __device__ void ComputeAtomic_PrepareSlice( + OpType *addr, // original address + MemType **memAddrPtr, // truncated address to MemType boundary + MemType *memBitShiftRightPtr, // bits to shift to move val to rightmost position + MemType *memValMaskInPlacePtr) // mask of val in proper position +{ + //ASSERT(LT_FUSSY, sizeof(OpType) 0x3; long long -> 0x7 + unsigned long memAddrMask = sizeof(MemType) - 1; + // compute the addr of the atomic variable truncated to alignment of memType + *memAddrPtr = (MemType *) + ((unsigned long) addr & ~ memAddrMask); + // compute the number of bit shift to move the target atomic value in + // the rightmost position + unsigned long byteOffsetInMem = (unsigned long) addr & memAddrMask; + + /* for big-endian */ + //unsigned long lastByteOffsetInMem = byteOffsetInMem + sizeof(OpType) -1; +// unsigned long byteShiftRight = (sizeof(MemType) - 1) - lastByteOffsetInMem; + //*memBitShiftRightPtr = (MemType) (byteShiftRight << 3); // 3: byte to bits + + + /* for little-endian */ + unsigned long byteShiftRight = byteOffsetInMem; + *memBitShiftRightPtr = (MemType) (byteShiftRight << 3); // 3: byte to bits + + // mask to isolate target atomic value located in rightmost position + MemType memValMask = ((MemType) 1 << (sizeof(OpType) << 3)) -1; + // mask to isolate target atomic value located in place + *memValMaskInPlacePtr = memValMask << *memBitShiftRightPtr; +/* + printf( + "Atomic of size %d in mem size %d: addr 0x%llx, truncated addr 0x%llx, shift right %lld, mask in place 0x%llx\n", + sizeof(MemType), sizeof(OpType), (uint64_t) addr, (uint64_t) *memAddrPtr, + (uint64_t) *memBitShiftRightPtr, (uint64_t) *memValMaskInPlacePtr); +*/ +} + +template < + typename MemType, // type of the underlying atomic memory operation + typename OpType, // type of the operation performed + omptarget_nvptx_BINOP_t binop // enum describing the operation +> +INLINE __device__ MemType ComputeAtomic_ComputeSlice( + MemType oldMemVal, // old value + OpType val, // value to compute with + MemType memBitShiftRight, // bits to shift to move val to rightmost position + MemType memValMaskInPlace // mask of val in proper position + ) +{ + OpType oldValtmp; + OpType newValtmp; + // select target atomic val + MemType oldMemVal_targetVal = oldMemVal & memValMaskInPlace; + MemType oldMemVal_otherVal = oldMemVal & ~ memValMaskInPlace; + // shift target atomic val to rightmost place: this is the old value + + //type conversion?? + oldValtmp = (OpType) (oldMemVal_targetVal >> memBitShiftRight); + // perform op + + newValtmp = Compute(oldValtmp, val); + + // insert new value in old world mem + + //type conversion?? 
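+  // Editor's note: worked example, not part of the original patch. For the
+  // "fixed1" case (OpType = int8_t, MemType = int32_t) with an address whose
+  // low two bits are 2, ComputeAtomic_PrepareSlice produces (little-endian):
+  //   memAddrMask       = sizeof(int32_t) - 1   = 3
+  //   memAddr           = addr & ~3             (word-aligned address)
+  //   memBitShiftRight  = (addr & 3) << 3       = 16
+  //   memValMaskInPlace = 0xFF << 16            = 0x00FF0000
+  // This function then extracts the byte with (oldMemVal & 0x00FF0000) >> 16,
+  // applies the operation, and re-inserts the result below before the caller
+  // retries the word-wide atomicCAS.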
+ MemType newMemVal_targetVal = ((MemType) newValtmp) << memBitShiftRight; + newMemVal_targetVal &= memValMaskInPlace; + MemType newMemVal = oldMemVal_otherVal | newMemVal_targetVal; + return newMemVal; +} + + + +#define ATOMIC_GENOP_PARTIAL(_name, _dtype, _op, _memType) \ + EXTERN void __kmpc_atomic_##_name##_##_op\ + (kmp_Indent *id_ref, int32_t gtid, _dtype * lhs, _dtype rhs) { \ + _memType *memAddr ; \ + _memType memBitShiftRightPtr; \ + _memType memValMaskInPlacePtr; \ + ComputeAtomic_PrepareSlice<_memType, _dtype> \ + (lhs, &memAddr, &memBitShiftRightPtr, &memValMaskInPlacePtr); \ + _memType oldMemVal, newMemVal; \ + oldMemVal = *memAddr; \ + _memType savedMemVal; \ + do { \ + savedMemVal = oldMemVal; \ + newMemVal = ComputeAtomic_ComputeSlice <_memType, _dtype, omptarget_nvptx_##_op > \ + (oldMemVal, rhs, memBitShiftRightPtr, memValMaskInPlacePtr); \ + oldMemVal = atomicCAS(memAddr, savedMemVal, newMemVal); \ + } while (savedMemVal != oldMemVal); \ +} \ +\ + EXTERN _dtype __kmpc_atomic_##_name##_##_op##_cpt\ + (kmp_Indent *id_ref, int32_t gtid, _dtype * lhs, _dtype rhs, int flag) { \ + _memType *memAddr ; \ + _memType memBitShiftRightPtr; \ + _memType memValMaskInPlacePtr; \ + ComputeAtomic_PrepareSlice<_memType, _dtype> \ + (lhs, &memAddr, &memBitShiftRightPtr, &memValMaskInPlacePtr); \ + _memType oldMemVal, newMemVal; \ + oldMemVal = *memAddr; \ + _memType savedMemVal; \ + do { \ + savedMemVal = oldMemVal; \ + newMemVal = ComputeAtomic_ComputeSlice <_memType, _dtype, omptarget_nvptx_##_op > \ + (oldMemVal, rhs, memBitShiftRightPtr, memValMaskInPlacePtr); \ + oldMemVal = atomicCAS(memAddr, savedMemVal, newMemVal); \ + } while (savedMemVal != oldMemVal); \ + if (flag) \ + return (_dtype) ((newMemVal & memValMaskInPlacePtr) >> memBitShiftRightPtr); \ + else \ + return (_dtype) ((oldMemVal & memValMaskInPlacePtr) >> memBitShiftRightPtr); \ +} + + +#define ATOMIC_GENOP_PARTIAL_REV(_name, _dtype, _op, _memType) \ + EXTERN void __kmpc_atomic_##_name##_##_op##_rev\ + (kmp_Indent *id_ref, int32_t gtid, _dtype * lhs, _dtype rhs) { \ + _memType *memAddr ; \ + _memType memBitShiftRightPtr; \ + _memType memValMaskInPlacePtr; \ + ComputeAtomic_PrepareSlice<_memType, _dtype> \ + (lhs, &memAddr, &memBitShiftRightPtr, &memValMaskInPlacePtr); \ + _memType oldMemVal, newMemVal; \ + oldMemVal = *memAddr; \ + _memType savedMemVal; \ + do { \ + savedMemVal = oldMemVal; \ + newMemVal = ComputeAtomic_ComputeSlice <_memType, _dtype, omptarget_nvptx_##_op > \ + (oldMemVal, rhs, memBitShiftRightPtr, memValMaskInPlacePtr); \ + oldMemVal = atomicCAS(memAddr, savedMemVal, newMemVal); \ + } while (savedMemVal != oldMemVal); \ +} \ +\ + EXTERN _dtype __kmpc_atomic_##_name##_##_op##_cpt_rev\ + (kmp_Indent *id_ref, int32_t gtid, _dtype * lhs, _dtype rhs, int flag) { \ + _memType *memAddr ; \ + _memType memBitShiftRightPtr; \ + _memType memValMaskInPlacePtr; \ + ComputeAtomic_PrepareSlice<_memType, _dtype> \ + (lhs, &memAddr, &memBitShiftRightPtr, &memValMaskInPlacePtr); \ + _memType oldMemVal, newMemVal; \ + oldMemVal = *memAddr; \ + _memType savedMemVal; \ + do { \ + savedMemVal = oldMemVal; \ + newMemVal = ComputeAtomic_ComputeSlice <_memType, _dtype, omptarget_nvptx_##_op > \ + (oldMemVal, rhs, memBitShiftRightPtr, memValMaskInPlacePtr); \ + oldMemVal = atomicCAS(memAddr, savedMemVal, newMemVal); \ + } while (savedMemVal != oldMemVal); \ + if (flag) \ + return (_dtype) ((newMemVal & memValMaskInPlacePtr) >> memBitShiftRightPtr); \ + else \ + return (_dtype) ((oldMemVal & memValMaskInPlacePtr) >> 
memBitShiftRightPtr); \ +} + + + +#define ATOMIC_GENOP_ALL4(_name, _tname, _optype, _memtype) \ + _name(_tname, _optype, add, _memtype); \ + _name(_tname, _optype, sub, _memtype); \ + _name##_REV(_tname, _optype, sub_rev, _memtype); \ + _name(_tname, _optype, mul, _memtype); \ + _name(_tname, _optype, div, _memtype); \ + _name##_REV(_tname, _optype, div_rev, _memtype); \ + _name(_tname, _optype, min, _memtype); \ + _name(_tname, _optype, max, _memtype); \ + _name(_tname, _optype, andb, _memtype); \ + _name(_tname, _optype, orb, _memtype); \ + _name(_tname, _optype, xor, _memtype); \ + _name(_tname, _optype, andl, _memtype); \ + _name(_tname, _optype, orl, _memtype); \ + _name(_tname, _optype, eqv, _memtype); \ + _name(_tname, _optype, neqv, _memtype); \ + _name(_tname, _optype, shl, _memtype); \ + _name(_tname, _optype, shr, _memtype); + + +ATOMIC_GENOP_ALL4(ATOMIC_GENOP_PARTIAL, fixed1, int8_t, int32_t); +ATOMIC_GENOP_ALL4(ATOMIC_GENOP_PARTIAL, fixed1u, uint8_t, int32_t); +ATOMIC_GENOP_ALL4(ATOMIC_GENOP_PARTIAL, fixed2u, uint16_t, int32_t); +ATOMIC_GENOP_ALL4(ATOMIC_GENOP_PARTIAL, fixed2, int16_t, int32_t); + +/** cooperative reduction + * make use of warp, shared variable, and __syncthreads + **/ + + + + +template +INLINE __device__ T myshfldown(T val, unsigned int delta, int size=warpSize) { + return __shfl_down(val, delta, size); +#if 0 + T ret = 0; + int localv; + int remotev; + /* not finished */ + switch(sizeof(T)) { + case 1: + case 2: + localv = reinterpret_cast(val); + remotev = __shfl_down(localv, delta, size); + ret = reinterpret_cast(remotev); + break; + break; + + } + return ret; +#endif +} + +#if 0 +template<> +INLINE __device__ float myshfldown(float val, unsigned int delta, int size) { + int t = __float_as_int(val); + int t1 = __shfl_down(t, delta, size); + float ret = __int_as_float(t1); + return ret; +} +#endif + +template<> +INLINE __device__ int myshfldown(int val, unsigned int delta, int size) { + return __shfl_down(val, delta, size); +} + +template<> +INLINE __device__ unsigned int myshfldown(unsigned int val, unsigned int delta, int size) { + return __shfl_down(val, delta, size); +} + +template<> +INLINE __device__ int64_t myshfldown(int64_t val, unsigned int delta, int size) { + return __shfl_down(val, delta, size); +} + +template<> +INLINE __device__ uint64_t myshfldown(uint64_t val, unsigned int delta, int size) { + return __shfl_down(val, delta, size); +} + + +template<> +INLINE __device__ float myshfldown(float val, unsigned int delta, int size) { + return __shfl_down(val, delta, size); +} + +template<> +INLINE __device__ double myshfldown(double val, unsigned int delta, int size) { + return __shfl_down(val, delta, size); +} + +template<> +INLINE __device__ unsigned long long myshfldown(unsigned long long val, unsigned int delta, int size) { + return __shfl_down(val, delta, size); +} + + + +template +__inline__ __device__ +T reduInitVal() { + switch(binop) { + case omptarget_nvptx_inc: + case omptarget_nvptx_dec: + case omptarget_nvptx_add: + case omptarget_nvptx_sub: + case omptarget_nvptx_sub_rev: + return (T) 0; + case omptarget_nvptx_mul: + case omptarget_nvptx_div: + return (T) 1; + default: + //ASSERT(0); + return (T) 0; + } +} + +template +__inline__ __device__ +T warpReduceSum(T val, unsigned int size) { + for (int offset = size/2; offset > 0; offset /= 2) + val = Compute(val, myshfldown(val, offset, size)); + return val; +} + +//#define MYGSIZE warpSize +#define MYGSIZE 32 + +template +__inline__ __device__ T warpBlockReduction(T inputval) { + 
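+  // Overview of the block reduction implemented below:
+  //  - If the block has fewer than MYGSIZE threads, every thread writes its
+  //    value to shared memory and thread 0 combines them serially.
+  //  - Otherwise, the trailing "remainder" threads (blockDim.x modulo MYGSIZE)
+  //    seed the shared array and the unused slots are filled with the
+  //    operation's identity value; each full warp then reduces with
+  //    shuffle-downs, lane 0 folds its warp's partial into shared[], and
+  //    warp 0 reduces the per-warp partials.
+  // Only thread 0 returns the reduced value; every other thread returns 0.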
__shared__ T shared[MYGSIZE]; + + unsigned int remainder = blockDim.x & (MYGSIZE-1);; + unsigned int start_r = blockDim.x - remainder; + int lane = threadIdx.x % warpSize; + int wid = threadIdx.x / warpSize; + + if (blockDim.x < MYGSIZE) { + shared[threadIdx.x] = inputval; + } else { + if (threadIdx.x >= start_r) { + shared[threadIdx.x - start_r] = inputval; + } else if (threadIdx.x < MYGSIZE && threadIdx.x >= remainder) { + shared[threadIdx.x] = reduInitVal(); + } + } + __syncthreads(); + + if (blockDim.x < MYGSIZE) { + if (threadIdx.x == 0) { + T val = shared[0]; + for(unsigned i= 1; i < blockDim.x; i++) { + val = Compute(val, shared[i]); + } + return val; + } + return (T) 0; + } + + if (threadIdx.x < start_r) { + T val = warpReduceSum(inputval, MYGSIZE); + if (lane == 0) { + shared[wid] = Compute(shared[wid], val); + } + } + __syncthreads(); + + if (wid == 0) { + T val = warpReduceSum(shared[threadIdx.x], MYGSIZE); + if (threadIdx.x == 0) { +// printf("inside %d\n", val); + return val; + } + } + return (T) 0; +} + + +#define WARPBLOCK_GENREDU(_name, _dtype, _op) \ + EXTERN _dtype __gpu_warpBlockRedu_##_name##_##_op\ + (_dtype rhs) { \ + return warpBlockReduction<_dtype, omptarget_nvptx_##_op>(rhs); \ +} + +#define WARPBLOCK_GENREDU_ALLOP(_name, _dtype) \ + WARPBLOCK_GENREDU(_name, _dtype, add); \ + WARPBLOCK_GENREDU(_name, _dtype, sub); \ + WARPBLOCK_GENREDU(_name, _dtype, mul); \ + WARPBLOCK_GENREDU(_name, _dtype, div); \ + WARPBLOCK_GENREDU(_name, _dtype, min); \ + WARPBLOCK_GENREDU(_name, _dtype, max); \ + WARPBLOCK_GENREDU(_name, _dtype, andb); \ + WARPBLOCK_GENREDU(_name, _dtype, orb); \ + WARPBLOCK_GENREDU(_name, _dtype, xor); \ + WARPBLOCK_GENREDU(_name, _dtype, andl); \ + WARPBLOCK_GENREDU(_name, _dtype, orl); \ + WARPBLOCK_GENREDU(_name, _dtype, eqv); \ + WARPBLOCK_GENREDU(_name, _dtype, neqv); \ + WARPBLOCK_GENREDU(_name, _dtype, shl); \ + WARPBLOCK_GENREDU(_name, _dtype, shr); + + +WARPBLOCK_GENREDU_ALLOP(fixed1, int8_t); +WARPBLOCK_GENREDU_ALLOP(fixed1u, uint8_t); +WARPBLOCK_GENREDU_ALLOP(fixed2, int16_t); +WARPBLOCK_GENREDU_ALLOP(fixed2u, uint16_t); +WARPBLOCK_GENREDU_ALLOP(fixed4, int32_t); +WARPBLOCK_GENREDU_ALLOP(fixed4u, uint32_t); +WARPBLOCK_GENREDU_ALLOP(fixed8, int64_t); +WARPBLOCK_GENREDU_ALLOP(fixed8u, uint64_t); + +#define WARPBLOCK_GENREDU_ALLOP_F(_name, _dtype) \ + WARPBLOCK_GENREDU(_name, _dtype, add); \ + WARPBLOCK_GENREDU(_name, _dtype, sub); \ + WARPBLOCK_GENREDU(_name, _dtype, mul); \ + WARPBLOCK_GENREDU(_name, _dtype, div); \ + WARPBLOCK_GENREDU(_name, _dtype, min); \ + WARPBLOCK_GENREDU(_name, _dtype, max); +WARPBLOCK_GENREDU_ALLOP_F(float4, float); +WARPBLOCK_GENREDU_ALLOP_F(float8, double); + + +/************************************** +* runtime support for array reduction * +***************************************/ + +#define ARRAYATOMIC_GENOP(_name, _dtype, _op) \ + EXTERN void __array_atomic_##_name##_##_op\ + (kmp_Indent *id_ref, int32_t gtid, _dtype * lhs, _dtype *rhs, int64_t n) { \ + PRINT(LD_LOOP, "Reduction: thead %d\n", gtid); \ + for(int i = 0; i < n/sizeof(_dtype); i++) { \ + __kmpc_atomic_##_name##_##_op(id_ref, gtid, lhs+i, rhs[i]);\ + }\ + }\ + EXTERN void __gpu_array_warpBlockRedu_##_name##_##_op\ + (_dtype *ldata, int64_t n) {\ + for(int i = 0; i < n/sizeof(_dtype); i++) { \ + ldata[i] = __gpu_warpBlockRedu_##_name##_##_op(ldata[i]); \ + } \ +} + +#define ARRAY_GEN_ALLOP_INTEGER(_name, _tname, _optype) \ + _name(_tname, _optype, add) ;\ + _name(_tname, _optype, sub) ;\ + _name(_tname, _optype, mul) ;\ + _name(_tname, _optype, div) ;\ + 
_name(_tname, _optype, min) ;\ + _name(_tname, _optype, max) ;\ + _name(_tname, _optype, andb) ;\ + _name(_tname, _optype, orb) ;\ + _name(_tname, _optype, xor) ;\ + _name(_tname, _optype, shl) ;\ + _name(_tname, _optype, shr) ;\ + _name(_tname, _optype, andl) ;\ + _name(_tname, _optype, orl) ; \ + _name(_tname, _optype, eqv) ; \ + _name(_tname, _optype, neqv) ; \ + +#define ARRAY_GEN_ALLOP_FLOAT(_name, _tname, _optype) \ + _name(_tname, _optype, add) ;\ + _name(_tname, _optype, sub) ;\ + _name(_tname, _optype, mul) ;\ + _name(_tname, _optype, div) ;\ + _name(_tname, _optype, min) ;\ + _name(_tname, _optype, max) ; + + +//ARRAYATOMIC_GENOP(fixed4, int32_t, add); + +#if 1 +ARRAY_GEN_ALLOP_INTEGER(ARRAYATOMIC_GENOP, fixed1, int8_t); +ARRAY_GEN_ALLOP_INTEGER(ARRAYATOMIC_GENOP, fixed2, int16_t); +ARRAY_GEN_ALLOP_INTEGER(ARRAYATOMIC_GENOP, fixed4, int32_t); +ARRAY_GEN_ALLOP_INTEGER(ARRAYATOMIC_GENOP, fixed8, int64_t); +ARRAY_GEN_ALLOP_FLOAT(ARRAYATOMIC_GENOP,float4, float); +ARRAY_GEN_ALLOP_FLOAT(ARRAYATOMIC_GENOP,float8, double); +#endif + Index: libomptarget/deviceRTLs/nvptx/src/stdio.cu =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/stdio.cu @@ -0,0 +1,24 @@ +//===------------- stdio.cu - NVPTX OpenMP Std I/O --------------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// Implements standard IO functions. Note that varargs are not supported in +// CUDA, therefore the compiler needs to analyze the arguments passed to +// printf and generate a call to one of the functions defined here. +// +//===----------------------------------------------------------------------===// + + +#include "../../../deviceRTLs/nvptx/src/omptarget-nvptx.h" + +EXTERN int __kmpc_printf(const char* str) +{ + return printf(str); +} + + Index: libomptarget/deviceRTLs/nvptx/src/support.h =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/support.h @@ -0,0 +1,63 @@ +//===--------- support.h - NVPTX OpenMP support functions -------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// Wrapper to some functions natively supported by the GPU. 
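+// Covered here: thread/block geometry queries, the mapping from CUDA ids to
+// OpenMP thread/team ids, master checks, and checked wrappers around
+// device-side malloc/free plus an alignment padding helper.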
+// +//===----------------------------------------------------------------------===// + +//////////////////////////////////////////////////////////////////////////////// +// get info from machine +//////////////////////////////////////////////////////////////////////////////// + +// get global ids to locate tread/team info (constant regardless of OMP) +INLINE int GetGlobalThreadId(); +INLINE int GetGlobalTeamId(); + +// get global number of ids to size thread/team data structures +INLINE int GetNumberOfGlobalThreadIds(); +INLINE int GetNumberOfGlobalTeamIds(); + +// get OpenMP thread and team ids +INLINE int GetOmpThreadId(int globalThreadId); // omp_thread_num +INLINE int GetOmpTeamId(); // omp_team_num + +// get OpenMP number of threads and team +INLINE int GetNumberOfOmpThreads(int globalThreadId); // omp_num_threads +INLINE int GetNumberOfOmpTeams(); // omp_num_teams + +// get OpenMP number of procs +INLINE int GetNumberOfProcsInTeam(); + +// masters +INLINE int IsTeamMaster(int ompThreadId); +INLINE int IsWarpMaster(int ompThreadId); + + +// get low level ids of resources +INLINE int GetThreadIdInBlock(); +INLINE int GetWarpIdInBlock(); +INLINE int GetBlockIdInKernel(); + +// Get low level number of resources +INLINE int GetNumberOfThreadsInBlock(); +INLINE int GetNumberOfWarpsInBlock(); +INLINE int GetNumberOfBlocksInKernel(); + +//////////////////////////////////////////////////////////////////////////////// +// Memory +//////////////////////////////////////////////////////////////////////////////// + + +// safe alloc and free +INLINE void *SafeMalloc(size_t size, const char *msg); // check if success +INLINE void *SafeFree(void *ptr, const char *msg); +// pad to a alignment (power of 2 only) +INLINE unsigned long PadBytes(unsigned long size, unsigned long alignment); +#define ADD_BYTES(_addr, _bytes) ((void *)((char *)((void *)(_addr))+(_bytes))) +#define SUB_BYTES(_addr, _bytes) ((void *)((char *)((void *)(_addr))-(_bytes))) Index: libomptarget/deviceRTLs/nvptx/src/supporti.h =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/supporti.h @@ -0,0 +1,192 @@ +//===--------- supporti.h - NVPTX OpenMP support functions ------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// Wrapper implementation to some functions natively supported by the GPU. 
+// +//===----------------------------------------------------------------------===// + +//////////////////////////////////////////////////////////////////////////////// +// support: get info from machine +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// machine: get number of (assuming 1D layout) + +INLINE int GetNumberOfThreadsInBlock() +{ + return blockDim.x; +} + +INLINE int GetNumberOfWarpsInBlock() +{ + ASSERT(LT_FUSSY, GetNumberOfThreadsInBlock() % warpSize == 0, + "expected threads num %d to be a multiple of warp size %d\n", + GetNumberOfThreadsInBlock(), warpSize); + return GetNumberOfThreadsInBlock() / warpSize; +} + +INLINE int GetNumberOfBlocksInKernel() +{ + return gridDim.x; +} + + +//////////////////////////////////////////////////////////////////////////////// +// machine: get ids (assuming 1D layout) + +INLINE int GetThreadIdInBlock() +{ + return threadIdx.x; +} + +INLINE int GetWarpIdInBlock() +{ + ASSERT(LT_FUSSY, GetNumberOfThreadsInBlock() % warpSize == 0, + "expected threads num %d to be a multiple of warp size %d\n", + GetNumberOfThreadsInBlock(), warpSize); + return GetThreadIdInBlock() / warpSize; +} + +INLINE int GetBlockIdInKernel() +{ + return blockIdx.x; +} + + +//////////////////////////////////////////////////////////////////////////////// +// Global thread id used to locate thread info + +INLINE int GetGlobalThreadId() +{ + #ifdef OMPTHREAD_IS_WARP + return GetBlockIdInKernel() * GetNumberOfWarpsInBlock() + GetWarpIdInBlock(); + #else + return GetBlockIdInKernel() * GetNumberOfThreadsInBlock() + GetThreadIdInBlock(); + #endif +} + +INLINE int GetNumberOfGlobalThreadIds() +{ + #ifdef OMPTHREAD_IS_WARP + return GetNumberOfWarpsInBlock() * GetNumberOfBlockInKernel(); + #else + return GetNumberOfThreadsInBlock() * GetNumberOfBlocksInKernel(); + #endif +} + +//////////////////////////////////////////////////////////////////////////////// +// global team id used to locate team info + +INLINE int GetGlobalTeamId() +{ + return GetBlockIdInKernel(); +} + +INLINE int GetNumberOfGlobalTeamIds() +{ + return GetNumberOfBlocksInKernel(); +} + +//////////////////////////////////////////////////////////////////////////////// +// OpenMP Thread id linked to OpenMP + +INLINE int GetOmpThreadId(int globalThreadId) +{ + // omp_thread_num + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(globalThreadId); + int rc = currTaskDescr->ThreadId(); + return rc; +} + +INLINE int GetNumberOfOmpThreads(int globalThreadId) +{ + // omp_num_threads + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(globalThreadId); + + ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr"); + + int rc = currTaskDescr->ThreadsInTeam(); + return rc; +} + + +//////////////////////////////////////////////////////////////////////////////// +// Team id linked to OpenMP + +INLINE int GetOmpTeamId() +{ + // omp_team_num + return GetGlobalTeamId(); // assume 1 block per team +} + +INLINE int GetNumberOfOmpTeams() +{ + // omp_num_teams + return GetNumberOfGlobalTeamIds(); // assume 1 block per team +} + + +//////////////////////////////////////////////////////////////////////////////// +// get OpenMP number of procs + +INLINE int GetNumberOfProcsInTeam() +{ + #ifdef OMPTHREAD_IS_WARP + return GetNumberOfWarpsInBlock(); + #else + return GetNumberOfThreadsInBlock(); + #endif +} + + 
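+// Illustrative example (assuming the default build, where OMPTHREAD_IS_WARP is
+// not defined): with gridDim.x == 2 and blockDim.x == 128, the thread at
+// blockIdx.x == 1, threadIdx.x == 5 gets
+// GetGlobalThreadId() == 1 * 128 + 5 == 133, and
+// GetNumberOfGlobalThreadIds() == 2 * 128 == 256.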
+//////////////////////////////////////////////////////////////////////////////// +// Masters + +INLINE int IsTeamMaster(int ompThreadId) +{ + return (ompThreadId == 0); +} + +INLINE int IsWarpMaster(int ompThreadId) +{ + return (ompThreadId % warpSize == 0); +} + + +//////////////////////////////////////////////////////////////////////////////// +// Memory +//////////////////////////////////////////////////////////////////////////////// + +INLINE unsigned long PadBytes( + unsigned long size, + unsigned long alignment) // must be a power of 2 +{ + // compute the necessary padding to satify alignment constraint + ASSERT(LT_FUSSY, (alignment & (alignment - 1)) == 0, + "alignment %ld is not a power of 2\n", alignment); + return (~(unsigned long) size + 1) & (alignment - 1); +} + +INLINE void *SafeMalloc(size_t size, const char *msg) // check if success +{ + void * ptr = malloc(size); + PRINT(LD_MEM, "malloc data of size %d for %s: 0x%llx\n", size, msg, + P64(ptr)); + ASSERT(LT_SAFETY, ptr, "failed to allocate %d bytes for %s\n", size, msg); + return ptr; +} + +INLINE void *SafeFree(void *ptr, const char *msg) +{ + PRINT(LD_MEM, "free data ptr 0x%llx for %s\n", P64(ptr), msg); + free(ptr); + return NULL; +} Index: libomptarget/deviceRTLs/nvptx/src/sync.cu =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/sync.cu @@ -0,0 +1,109 @@ +//===------------ sync.h - NVPTX OpenMP synchronizations --------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// Include all synchronization. +// +//===----------------------------------------------------------------------===// + +#include "../../../deviceRTLs/nvptx/src/omptarget-nvptx.h" + +//////////////////////////////////////////////////////////////////////////////// +// KMP Ordered calls +//////////////////////////////////////////////////////////////////////////////// + + +EXTERN void __kmpc_ordered (kmp_Indent * loc, int32_t gtid) +{ + PRINT0(LD_IO, "call kmpc_ordered\n"); +} + + +EXTERN void __kmpc_end_ordered (kmp_Indent * loc, int32_t gtid) +{ + PRINT0(LD_IO, "call kmpc_end_ordered\n"); +} + + + +//////////////////////////////////////////////////////////////////////////////// +// KMP Barriers +//////////////////////////////////////////////////////////////////////////////// + +// a team is a block: we can use CUDA native synchronization mechanism +// FIXME: what if not all threads (warps) participate to the barrier? 
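+// (__syncthreads() assumes every thread of the block reaches the barrier; if
+// some warps have already exited or branch around it, the kernel can hang.)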
+// We may need to implement it differently + +EXTERN int32_t __kmpc_cancel_barrier (kmp_Indent* loc_ref, int32_t gtid) +{ + PRINT0(LD_IO, "call kmpc_cancel_barrier\n"); + __syncthreads(); + PRINT0(LD_SYNC, "completed kmpc_cancel_barrier\n"); + return 0; +} + +// aee this one shoud be discontinued +EXTERN void __kmpc_barrier (kmp_Indent* loc_ref, int32_t gtid) +{ + PRINT0(LD_IO, "call kmpc_barrier\n"); + __syncthreads(); + PRINT0(LD_SYNC, "completed kmpc_barrier\n"); +} + +//////////////////////////////////////////////////////////////////////////////// +// KMP MASTER +//////////////////////////////////////////////////////////////////////////////// + +INLINE int32_t IsMaster() +{ + // only the team master updates the state + int gtid = GetGlobalThreadId(); + int ompThreadId = GetOmpThreadId(gtid); + return IsTeamMaster(ompThreadId); +} + +EXTERN int32_t __kmpc_master(kmp_Indent *loc, int32_t global_tid) +{ + PRINT0(LD_IO, "call kmpc_master\n"); + return IsMaster(); +} + +EXTERN void __kmpc_end_master(kmp_Indent *loc, int32_t global_tid) +{ + PRINT0(LD_IO, "call kmpc_end_master\n"); + ASSERT0(LT_FUSSY, IsMaster(), "expected only master here"); +} + +//////////////////////////////////////////////////////////////////////////////// +// KMP SINGLE +//////////////////////////////////////////////////////////////////////////////// + +EXTERN int32_t __kmpc_single(kmp_Indent *loc, int32_t global_tid) +{ + PRINT0(LD_IO, "call kmpc_single\n"); + // decide to implement single with master; master get the single + return IsMaster(); +} + +EXTERN void __kmpc_end_single(kmp_Indent *loc, int32_t global_tid) +{ + PRINT0(LD_IO, "call kmpc_end_single\n"); + // decide to implement single with master: master get the single + ASSERT0(LT_FUSSY, IsMaster(), "expected only master here"); + // sync barrier is explicitely called... so that is not a problem +} + +//////////////////////////////////////////////////////////////////////////////// +// Flush +//////////////////////////////////////////////////////////////////////////////// + +EXTERN void __kmpc_flush(kmp_Indent *loc) +{ + PRINT0(LD_IO, "call kmpc_flush\n"); + // not aware of anything to do for flush +} Index: libomptarget/deviceRTLs/nvptx/src/task.cu =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/task.cu @@ -0,0 +1,184 @@ +//===------------- task.h - NVPTX OpenMP tasks support ----------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// Task implementation support. +// +// explicit task structure uses +// omptarget_nvptx task +// kmp_task +// +// where kmp_task is +// - klegacy_TaskDescr <- task pointer +// shared -> X +// routine +// part_id +// descr +// - private (of size given by task_alloc call). Accessed by +// task+sizeof(klegacy_TaskDescr) +// * private data * +// - shared: X. 
Accessed by shared ptr in klegacy_TaskDescr +// * pointer table to shared variables * +// - end +// +//===----------------------------------------------------------------------===// + +#include "../../../deviceRTLs/nvptx/src/omptarget-nvptx.h" + +EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc( + kmp_Indent *loc, // unused + uint32_t global_tid, // unused + int32_t flag, // unused (because in our impl, all are immediately exec + size_t sizeOfTaskInclPrivate, + size_t sizeOfSharedTable, + kmp_TaskFctPtr taskSub) +{ + PRINT(LD_IO, + "call __kmpc_omp_task_alloc(size priv&struct %lld, shared %lld, fct 0x%llx)\n", + P64(sizeOfTaskInclPrivate), P64(sizeOfSharedTable), P64(taskSub)); + // want task+priv to be a multiple of 8 bytes + size_t padForTaskInclPriv = PadBytes(sizeOfTaskInclPrivate, sizeof(void*)); + sizeOfTaskInclPrivate += padForTaskInclPriv; + size_t kmpSize = sizeOfTaskInclPrivate + sizeOfSharedTable; + ASSERT(LT_FUSSY, sizeof(omptarget_nvptx_TaskDescr) % sizeof(void *) == 0, + "need task descr of size %d to be a multiple of %d\n", + sizeof(omptarget_nvptx_TaskDescr), sizeof(void *)); + size_t totSize = sizeof(omptarget_nvptx_TaskDescr) + kmpSize; + omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = (omptarget_nvptx_ExplicitTaskDescr *) + SafeMalloc(totSize, "explicit task descriptor"); + kmp_TaskDescr *newKmpTaskDescr = & newExplicitTaskDescr->kmpTaskDescr; + ASSERT0(LT_FUSSY, (uint64_t) newKmpTaskDescr == + (uint64_t) ADD_BYTES(newExplicitTaskDescr, sizeof(omptarget_nvptx_TaskDescr)), + "bad size assumptions"); + // init kmp_TaskDescr + newKmpTaskDescr->sharedPointerTable = + (void *)((char *)newKmpTaskDescr + sizeOfTaskInclPrivate); + newKmpTaskDescr->sub = taskSub; + newKmpTaskDescr->destructors = NULL; + PRINT(LD_TASK, "return with task descr kmp: 0x%llx, omptarget-nvptx 0x%llx\n", + P64(newKmpTaskDescr), P64(newExplicitTaskDescr)); + + return newKmpTaskDescr; +} + +EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Indent *loc, + uint32_t global_tid, kmp_TaskDescr *newKmpTaskDescr, + int32_t depNum, void * depList, int32_t noAliasDepNum, + void * noAliasDepList) +{ + PRINT(LD_IO, "call to __kmpc_omp_task_with_deps(task 0x%llx)\n", + P64(newKmpTaskDescr)); + // 1. get explict task descr from kmp task descr + omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = (omptarget_nvptx_ExplicitTaskDescr *) + SUB_BYTES(newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr)); + ASSERT0(LT_FUSSY, & newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr, + "bad assumptions"); + omptarget_nvptx_TaskDescr *newTaskDescr = & newExplicitTaskDescr->taskDescr; + ASSERT0(LT_FUSSY, (uint64_t) newTaskDescr == (uint64_t) newExplicitTaskDescr, + "bad assumptions"); + + // 2. push new context: update new task descriptor + int gtid = GetGlobalThreadId(); + omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(gtid); + newTaskDescr->CopyForExplicitTask(parentTaskDescr); + // set new task descriptor as top + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(gtid, newTaskDescr); + + // 3. call sub + PRINT(LD_TASK, "call task sub 0x%llx(task descr 0x%llx)\n", + P64(newKmpTaskDescr->sub), P64(newKmpTaskDescr)); + newKmpTaskDescr->sub(0, newKmpTaskDescr); + PRINT(LD_TASK, "return from call task sub 0x%llx()\n", + P64(newKmpTaskDescr->sub)); + + // 4. pop context + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(gtid, parentTaskDescr); + // 5. 
free + SafeFree(newExplicitTaskDescr, "explicit task descriptor"); + return 0; +} + +EXTERN void __kmpc_omp_task_begin_if0(kmp_Indent *loc, + uint32_t global_tid, kmp_TaskDescr *newKmpTaskDescr) +{ + PRINT(LD_IO, "call to __kmpc_omp_task_begin_if0(task 0x%llx)\n", + P64(newKmpTaskDescr)); + // 1. get explict task descr from kmp task descr + omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = (omptarget_nvptx_ExplicitTaskDescr *) + SUB_BYTES(newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr)); + ASSERT0(LT_FUSSY, & newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr, + "bad assumptions"); + omptarget_nvptx_TaskDescr *newTaskDescr = & newExplicitTaskDescr->taskDescr; + ASSERT0(LT_FUSSY, (uint64_t) newTaskDescr == (uint64_t) newExplicitTaskDescr, + "bad assumptions"); + + // 2. push new context: update new task descriptor + int gtid = GetGlobalThreadId(); + omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(gtid); + newTaskDescr->CopyForExplicitTask(parentTaskDescr); + // set new task descriptor as top + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(gtid, newTaskDescr); + // 3... noting to call... is inline + // 4 & 5 ... done in complete +} + +EXTERN void __kmpc_omp_task_complete_if0(kmp_Indent *loc, + uint32_t global_tid, kmp_TaskDescr *newKmpTaskDescr) +{ + PRINT(LD_IO, "call to __kmpc_omp_task_complete_if0(task 0x%llx)\n", + P64(newKmpTaskDescr)); + // 1. get explict task descr from kmp task descr + omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = (omptarget_nvptx_ExplicitTaskDescr *) + SUB_BYTES(newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr)); + ASSERT0(LT_FUSSY, & newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr, + "bad assumptions"); + omptarget_nvptx_TaskDescr *newTaskDescr = & newExplicitTaskDescr->taskDescr; + ASSERT0(LT_FUSSY, (uint64_t) newTaskDescr == (uint64_t) newExplicitTaskDescr, + "bad assumptions"); + // 2. get parent + omptarget_nvptx_TaskDescr *parentTaskDescr = newTaskDescr->GetPrevTaskDescr(); + // 3... noting to call... is inline + // 4. pop context + int gtid = GetGlobalThreadId(); + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(gtid, parentTaskDescr); + // 5. 
free + SafeFree(newExplicitTaskDescr, "explicit task descriptor"); + +} + +EXTERN void __kmpc_omp_wait_deps(kmp_Indent *loc, + uint32_t global_tid, int32_t depNum, void * depList, + int32_t noAliasDepNum, void * noAliasDepList) +{ + PRINT0(LD_IO, "call to __kmpc_omp_wait_deps(..)\n"); + // nothing to do as all our tasks are executed as final +} + +EXTERN void __kmpc_taskgroup(kmp_Indent *loc, uint32_t global_tid) +{ + PRINT0(LD_IO, "call to __kmpc_taskgroup(..)\n"); + // nothing to do as all our tasks are executed as final +} + +EXTERN void __kmpc_end_taskgroup(kmp_Indent *loc, uint32_t global_tid) +{ + PRINT0(LD_IO, "call to __kmpc_end_taskgroup(..)\n"); + // nothing to do as all our tasks are executed as final +} + +EXTERN void __kmpc_omp_taskyield(kmp_Indent *loc, uint32_t global_tid) +{ + PRINT0(LD_IO, "call to __kmpc_taskyield()\n"); + // do nothing +} + +EXTERN void __kmpc_omp_taskwait(kmp_Indent *loc, uint32_t global_tid) +{ + PRINT0(LD_IO, "call to __kmpc_taskwait()\n"); + // nothing to do as all our tasks are executed as final +} Index: libomptarget/exports =================================================================== --- /dev/null +++ libomptarget/exports @@ -0,0 +1,18 @@ +VERS1.0 { + global: + __tgt_register_lib; + __tgt_unregister_lib; + __tgt_target_data_begin; + __tgt_target_data_end; + __tgt_target_data_update; + __tgt_target; + __tgt_target_teams; + __tgt_target_data_begin_nowait; + __tgt_target_data_end_nowait; + __tgt_target_data_update_nowait; + __tgt_target_nowait; + __tgt_target_teams_nowait; + local: + *; +}; + Index: libomptarget/plugins/CMakeLists.txt =================================================================== --- /dev/null +++ libomptarget/plugins/CMakeLists.txt @@ -0,0 +1,59 @@ +##===----------------------------------------------------------------------===## +# +# The LLVM Compiler Infrastructure +# +# This file is dual licensed under the MIT and the University of Illinois Open +# Source Licenses. See LICENSE.txt for details. +# +##===----------------------------------------------------------------------===## +# +# Build plugins for the user system if available. +# +##===----------------------------------------------------------------------===## + +# void build_generic64(string tmachine, string tmachine_name, string tmachine_libname); +# - build a plugin for a generic 64-bit target based on libffi. +# - tmachine: name of the machine processor as used in the cmake build system. +# - tmachine_name: name of the machine to be printed with the debug messages. +# - tmachine_libname: machine name to be appended to the plugin library name. +macro(build_generic64 tmachine tmachine_name tmachine_libname tmachine_triple) +if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$") + if(LIBOMPTARGET_DEP_LIBFFI_FOUND) + + libomptarget_say("Building ${tmachine_name} offloading plugin.") + + include_directories(${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR}) + + # Define macro to be used as prefix of the runtime messages for this target. + add_definitions("-DTARGET_NAME=${tmachine_name}") + + add_library("omptarget.rtl.${tmachine_libname}" SHARED + ${CMAKE_CURRENT_SOURCE_DIR}/../generic-64bit/src/rtl.cpp) + + target_link_libraries( + "omptarget.rtl.${tmachine_libname}" + ${LIBOMPTARGET_DEP_LIBFFI_LIBRARIES} + ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES} + dl + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports") + + # Report to the parent scope that we are building a plugin. 
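+  # build_generic64 is expected to be expanded from each generic plugin
+  # subdirectory (ppc64, ppc64le, x86_64), so PARENT_SCOPE propagates the
+  # updated triple list back to this directory, which re-exports it to the
+  # top-level project at the end of this file.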
+ set(LIBOMPTARGET_SYSTEM_TARGETS + "${LIBOMPTARGET_SYSTEM_TARGETS} ${tmachine_triple}" PARENT_SCOPE) + + else(LIBOMPTARGET_DEP_LIBFFI_FOUND) + libomptarget_say("Not building ${tmachine_name} offloading plugin: libffi dependency not found.") + endif(LIBOMPTARGET_DEP_LIBFFI_FOUND) +else() + libomptarget_say("Not building ${tmachine_name} offloading plugin: machine not found in the system.") +endif() +endmacro() + +add_subdirectory(cuda) +add_subdirectory(ppc64) +add_subdirectory(ppc64le) +add_subdirectory(x86_64) + +# Make sure the parent scope can see the plugins that will be created. +set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE) + Index: libomptarget/plugins/cuda/CMakeLists.txt =================================================================== --- /dev/null +++ libomptarget/plugins/cuda/CMakeLists.txt @@ -0,0 +1,42 @@ +##===----------------------------------------------------------------------===## +# +# The LLVM Compiler Infrastructure +# +# This file is dual licensed under the MIT and the University of Illinois Open +# Source Licenses. See LICENSE.txt for details. +# +##===----------------------------------------------------------------------===## +# +# Build a plugin for a CUDA machine if available. +# +##===----------------------------------------------------------------------===## + +if(LIBOMPTARGET_DEP_CUDA_FOUND) + if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux") + + libomptarget_say("Building CUDA offloading plugin.") + + # Define the suffix for the runtime messaging dumps. + add_definitions(-DTARGET_NAME=CUDA) + + if(CMAKE_BUILD_TYPE MATCHES Debug) + add_definitions(-DCUDA_ERROR_REPORT) + endif() + + include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS}) + + add_library(omptarget.rtl.cuda SHARED src/rtl.cpp) + target_link_libraries(omptarget.rtl.cuda + ${LIBOMPTARGET_DEP_CUDA_LIBRARIES} + cuda + ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES} + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports") + + # Report to the parent scope that we are building a plugin for CUDA. + set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda" PARENT_SCOPE) + else() + libomptarget_say("Not building CUDA offloading plugin: only support CUDA in linux x86_64 or ppc64le hosts.") + endif() +else() + libomptarget_say("Not building CUDA offloading plugin: CUDA not found in system.") +endif() Index: libomptarget/plugins/cuda/src/rtl.cpp =================================================================== --- /dev/null +++ libomptarget/plugins/cuda/src/rtl.cpp @@ -0,0 +1,600 @@ +//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// RTL for CUDA machine +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "omptarget.h" + +#ifndef TARGET_NAME +#define TARGET_NAME Generic-64bit +#endif + +#define GETNAME2(name) #name +#define GETNAME(name) GETNAME2(name) +#define DP(...) 
DEBUGP("Target " GETNAME(TARGET_NAME) " RTL",__VA_ARGS__) + +// Utility for retrieving and printing CUDA error string +#ifdef CUDA_ERROR_REPORT +#define CUDA_ERR_STRING(err) \ + do { \ + const char* errStr; \ + cuGetErrorString (err, &errStr); \ + DP("CUDA error is: %s\n", errStr); \ + } while (0) +#else +#define CUDA_ERR_STRING(err) {} +#endif + +// NVPTX image start encodes a struct that also includes the host entries begin +// and end pointers. The host entries are used by the runtime to accelerate +// the retrieval of the target entry pointers +struct __tgt_nvptx_device_image_start{ + void *RealStart; // Pointer to actual NVPTX elf image + char *TgtName; // Name of the target of the image + __tgt_offload_entry *HostStart; // Pointer to the host entries start + __tgt_offload_entry *HostEnd; // Pointer to the host entries end +}; + +/// Account the memory allocated per device +struct AllocMemEntryTy{ + int64_t TotalSize; + std::vector > Ptrs; + + AllocMemEntryTy() : TotalSize(0) {} +}; + +/// Keep entries table per device +struct FuncOrGblEntryTy{ + __tgt_target_table Table; + std::vector<__tgt_offload_entry> Entries; +}; + +/// Use a single entity to encode a kernel and a set of flags +struct KernelTy{ + CUfunction Func; + int SimdInfo; + + // keep track of cuda pointer to write to it when thread_limit value + // changes (check against last value written to ThreadLimit + CUdeviceptr ThreadLimitPtr; + int ThreadLimit; + + KernelTy(CUfunction _Func, int _SimdInfo, CUdeviceptr _ThreadLimitPtr) + : Func(_Func), SimdInfo(_SimdInfo), ThreadLimitPtr(_ThreadLimitPtr) { + ThreadLimit = 0; //default (0) signals that it was not initialized + }; +}; + +/// List that contains all the kernels. +/// FIXME: we may need this to be per device and per library. +std::list KernelsList; + +/// Class containing all the device information +class RTLDeviceInfoTy{ + std::vector FuncGblEntries; + +public: + int NumberOfDevices; + std::vector Modules; + std::vector Contexts; + std::vector ThreadsPerBlock; + std::vector BlocksPerGrid; + + // Record entry point associated with device + void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry ){ + assert( device_id < (int32_t)FuncGblEntries.size() && "Unexpected device id!"); + FuncOrGblEntryTy &E = FuncGblEntries[device_id]; + + E.Entries.push_back(entry); + } + + // Return true if the entry is associated with device + bool findOffloadEntry(int32_t device_id, void *addr){ + assert( device_id < (int32_t)FuncGblEntries.size() && "Unexpected device id!"); + FuncOrGblEntryTy &E = FuncGblEntries[device_id]; + + for(unsigned i=0; iImageStart); + err = cuModuleLoadDataEx (&cumod, image->ImageStart, 0, NULL, NULL); + if (err != CUDA_SUCCESS) + { + DP ("Error when loading CUDA module\n"); + CUDA_ERR_STRING (err); + return NULL; + } + + DP ("CUDA module successfully loaded!\n"); + DeviceInfo.Modules.push_back(cumod); + + // Here, we take advantage of the data that is appended after img_end to get + // the symbols' name we need to load. This data consist of the host entries + // begin and end as well as the target name (see the offloading linker script + // creation in clang compiler). + // Find the symbols in the module by name. 
The name can be obtain by + // concatenating the host entry name with the target name + + __tgt_offload_entry *HostBegin = image->EntriesBegin; + __tgt_offload_entry *HostEnd = image->EntriesEnd; + + for( __tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) { + + if( !e->addr ){ + // FIXME: Probably we should fail when something like this happen, the + // host should have always something in the address to uniquely identify + // the target region. + DP("Analyzing host entry '' (size = %lld)...\n", + (unsigned long long)e->size); + + __tgt_offload_entry entry = *e; + DeviceInfo.addOffloadEntry(device_id, entry); + continue; + } + + if( e->size ){ + + __tgt_offload_entry entry = *e; + + CUdeviceptr cuptr; + size_t cusize; + err = cuModuleGetGlobal(&cuptr,&cusize,cumod,e->name); + + if (err != CUDA_SUCCESS){ + DP("loading global '%s' (Failed)\n",e->name); + CUDA_ERR_STRING (err); + return NULL; + } + + if ((int32_t)cusize != e->size){ + DP("loading global '%s' - size mismatch (%lld != %lld)\n",e->name, + (unsigned long long)cusize, + (unsigned long long)e->size); + CUDA_ERR_STRING (err); + return NULL; + } + + DP("Entry point %ld maps to global %s (%016lx)\n",e-HostBegin,e->name,(long)cuptr); + entry.addr = (void*)cuptr; + + DeviceInfo.addOffloadEntry(device_id, entry); + + continue; + } + + CUfunction fun; + err = cuModuleGetFunction (&fun, cumod, e->name); + + if (err != CUDA_SUCCESS){ + DP("loading '%s' (Failed)\n",e->name); + CUDA_ERR_STRING (err); + return NULL; + } + + DP("Entry point %ld maps to %s (%016lx)\n",e-HostBegin,e->name,(Elf64_Addr)fun); + + // default value + int8_t SimdInfoVal = 1; + + // obtain and save simd_info value for target region + const char suffix[] = "_simd_info"; + char * SimdInfoName = (char *) malloc((strlen(e->name)+strlen(suffix))* + sizeof(char)); + sprintf(SimdInfoName, "%s%s", e->name, suffix); + + CUdeviceptr SimdInfoPtr; + size_t cusize; + err = cuModuleGetGlobal(&SimdInfoPtr,&cusize,cumod,SimdInfoName); + if (err == CUDA_SUCCESS) { + if ((int32_t)cusize != sizeof(int8_t)){ + DP("loading global simd_info '%s' - size mismatch (%lld != %lld)\n", SimdInfoName, (unsigned long long)cusize,(unsigned long long)sizeof(int8_t)); + CUDA_ERR_STRING (err); + return NULL; + } + + err = cuMemcpyDtoH(&SimdInfoVal,(CUdeviceptr)SimdInfoPtr,cusize); + if (err != CUDA_SUCCESS) + { + DP("Error when copying data from device to host. 
Pointers: " + "host = 0x%016lx, device = 0x%016lx, size = %lld\n",(Elf64_Addr)&SimdInfoVal, (Elf64_Addr)SimdInfoPtr,(unsigned long long)cusize); + CUDA_ERR_STRING (err); + return NULL; + } + if (SimdInfoVal < 1) { + DP("Error wrong simd_info value specified in cubin file: %d\n", SimdInfoVal); + return NULL; + } + } + + // obtain cuda pointer to global tracking thread limit + const char SuffixTL[] = "_thread_limit"; + char * ThreadLimitName = (char *) malloc((strlen(e->name)+strlen(SuffixTL))* + sizeof(char)); + sprintf(ThreadLimitName, "%s%s", e->name, SuffixTL); + + CUdeviceptr ThreadLimitPtr; + err = cuModuleGetGlobal(&ThreadLimitPtr,&cusize,cumod,ThreadLimitName); + if (err != CUDA_SUCCESS) { + DP("retrieving pointer for %s global\n", ThreadLimitName); + CUDA_ERR_STRING (err); + return NULL; + } + if ((int32_t)cusize != sizeof(int32_t)) { + DP("loading global thread_limit '%s' - size mismatch (%lld != %lld)\n", ThreadLimitName, (unsigned long long)cusize,(unsigned long long)sizeof(int32_t)); + CUDA_ERR_STRING (err); + return NULL; + } + + // encode function and kernel + KernelsList.push_back(KernelTy(fun, SimdInfoVal, ThreadLimitPtr)); + + __tgt_offload_entry entry = *e; + entry.addr = (void*)&KernelsList.back(); + DeviceInfo.addOffloadEntry(device_id, entry); + } + + return DeviceInfo.getOffloadEntriesTable(device_id); +} + +void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size){ + + //Set the context we are using + CUresult err = cuCtxSetCurrent (DeviceInfo.Contexts[device_id]); + if (err != CUDA_SUCCESS) + { + DP("Error while trying to set CUDA current context\n"); + CUDA_ERR_STRING (err); + return NULL; + } + + CUdeviceptr ptr; + err = cuMemAlloc(&ptr, size); + if (err != CUDA_SUCCESS) + { + DP("Error while trying to allocate %d\n", err); + CUDA_ERR_STRING (err); + return NULL; + } + + void *vptr = (void*) ptr; + return vptr; +} + +int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, int64_t size){ + //Set the context we are using + CUresult err = cuCtxSetCurrent (DeviceInfo.Contexts[device_id]); + if (err != CUDA_SUCCESS) + { + DP("Error when setting CUDA context\n"); + CUDA_ERR_STRING (err); + return OFFLOAD_FAIL; + } + + err = cuMemcpyHtoD((CUdeviceptr)tgt_ptr, hst_ptr, size); + if (err != CUDA_SUCCESS) + { + DP("Error when copying data from host to device. Pointers: " + "host = 0x%016lx, device = 0x%016lx, size = %lld\n", + (Elf64_Addr)hst_ptr, (Elf64_Addr)tgt_ptr, (unsigned long long)size); + CUDA_ERR_STRING (err); + return OFFLOAD_FAIL; + } + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, int64_t size){ + //Set the context we are using + CUresult err = cuCtxSetCurrent (DeviceInfo.Contexts[device_id]); + if (err != CUDA_SUCCESS) + { + DP("Error when setting CUDA context\n"); + CUDA_ERR_STRING (err); + return OFFLOAD_FAIL; + } + + err = cuMemcpyDtoH(hst_ptr,(CUdeviceptr)tgt_ptr,size); + if (err != CUDA_SUCCESS) + { + DP("Error when copying data from device to host. 
Pointers: " + "host = 0x%016lx, device = 0x%016lx, size = %lld\n", + (Elf64_Addr)hst_ptr, (Elf64_Addr)tgt_ptr, (unsigned long long)size); + CUDA_ERR_STRING (err); + return OFFLOAD_FAIL; + } + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_data_delete(int32_t device_id, void* tgt_ptr){ + //Set the context we are using + CUresult err = cuCtxSetCurrent (DeviceInfo.Contexts[device_id]); + if (err != CUDA_SUCCESS) + { + DP("Error when setting CUDA context\n"); + CUDA_ERR_STRING (err); + return OFFLOAD_FAIL; + } + + err = cuMemFree((CUdeviceptr)tgt_ptr); + if (err != CUDA_SUCCESS) + { + DP("Error when freeing CUDA memory\n"); + CUDA_ERR_STRING (err); + return OFFLOAD_FAIL; + } + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_run_target_team_region(int32_t device_id, + void *tgt_entry_ptr, void **tgt_args, int32_t arg_num, + int32_t team_num, int32_t thread_limit) +{ + //Set the context we are using + CUresult err = cuCtxSetCurrent (DeviceInfo.Contexts[device_id]); + if (err != CUDA_SUCCESS) + { + DP("Error when setting CUDA context\n"); + CUDA_ERR_STRING (err); + return OFFLOAD_FAIL; + } + + // All args are references + std::vector args(arg_num); + + for(int32_t i=0; iSimdInfo > DeviceInfo.ThreadsPerBlock[device_id])? + DeviceInfo.ThreadsPerBlock[device_id] : + thread_limit*KernelInfo->SimdInfo; + + // update thread limit content in gpu memory if un-initialized or changed + if (KernelInfo->ThreadLimit == 0 || KernelInfo->ThreadLimit != thread_limit) { + // always capped by maximum number of threads in a block: even if 1 OMP thread + // is 1 independent CUDA thread, we may have up to max block size OMP threads + // if the user request thread_limit(tl) with tl > max block size, we + // only start max block size CUDA threads + if (thread_limit > DeviceInfo.ThreadsPerBlock[device_id]) + thread_limit = DeviceInfo.ThreadsPerBlock[device_id]; + + KernelInfo->ThreadLimit = thread_limit; + err = cuMemcpyHtoD(KernelInfo->ThreadLimitPtr,&thread_limit,sizeof(int32_t)); + + if (err != CUDA_SUCCESS) { + DP("Error when setting thread limit global\n"); + return OFFLOAD_FAIL; + } + } + + int blocksPerGrid = team_num>0 ? team_num : + DeviceInfo.BlocksPerGrid[device_id]; + int nshared = 0; + + // Run on the device + DP("launch kernel with %d blocks and %d threads\n", blocksPerGrid, cudaThreadsPerBlock); + + err = cuLaunchKernel(KernelInfo->Func, + blocksPerGrid, 1, 1, cudaThreadsPerBlock, 1, 1, nshared, 0, &args[0], 0); + if( err != CUDA_SUCCESS ) + { + DP("Device kernel launching failed!\n"); + CUDA_ERR_STRING (err); + assert(err == CUDA_SUCCESS && "Unable to launch target execution!" 
); + return OFFLOAD_FAIL; + } + + DP("Execution of entry point at %016lx successful!\n", + (Elf64_Addr)tgt_entry_ptr); + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, + void **tgt_args, int32_t arg_num) +{ + // use one team and one thread + // fix thread num + int32_t team_num = 1; + int32_t thread_limit = 0; // use default + return __tgt_rtl_run_target_team_region(device_id, + tgt_entry_ptr, tgt_args, arg_num, team_num, thread_limit); +} + + +#ifdef __cplusplus +} +#endif Index: libomptarget/plugins/exports =================================================================== --- /dev/null +++ libomptarget/plugins/exports @@ -0,0 +1,15 @@ +VERS1.0 { + global: + __tgt_rtl_device_type; + __tgt_rtl_number_of_devices; + __tgt_rtl_init_device; + __tgt_rtl_load_binary; + __tgt_rtl_data_alloc; + __tgt_rtl_data_submit; + __tgt_rtl_data_retrieve; + __tgt_rtl_data_delete; + __tgt_rtl_run_target_team_region; + __tgt_rtl_run_target_region; + local: + *; +}; Index: libomptarget/plugins/generic-64bit/src/rtl.cpp =================================================================== --- /dev/null +++ libomptarget/plugins/generic-64bit/src/rtl.cpp @@ -0,0 +1,314 @@ +//===-RTLs/generic-64bit/src/rtl.cpp - Target RTLs Implementation - C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// RTL for generic 64-bit machine +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "omptarget.h" + +#ifndef TARGET_NAME +#define TARGET_NAME Generic-64bit +#endif + +#define GETNAME2(name) #name +#define GETNAME(name) GETNAME2(name) +#define DP(...) 
DEBUGP("Target " GETNAME(TARGET_NAME) " RTL",__VA_ARGS__) + +#define NUMBER_OF_DEVICES 4 +#define OFFLOADSECTIONNAME ".omp_offloading.entries" + +/// Array of Dynamic libraries loaded for this target +struct DynLibTy{ + char *FileName; + void* Handle; +}; + +/// Account the memory allocated per device +struct AllocMemEntryTy{ + int64_t TotalSize; + std::vector > Ptrs; + + AllocMemEntryTy() : TotalSize(0) {} +}; + +/// Keep entries table per device +struct FuncOrGblEntryTy{ + __tgt_target_table Table; +}; + +/// Class containing all the device information +class RTLDeviceInfoTy{ + std::vector FuncGblEntries; + +public: + std::list DynLibs; + + // Record entry point associated with device + void createOffloadTable(int32_t device_id, __tgt_offload_entry *begin, __tgt_offload_entry *end){ + assert( device_id < (int32_t)FuncGblEntries.size() && "Unexpected device id!"); + FuncOrGblEntryTy &E = FuncGblEntries[device_id]; + + E.Table.EntriesBegin = begin; + E.Table.EntriesEnd = end; + } + + // Return true if the entry is associated with device + bool findOffloadEntry(int32_t device_id, void *addr){ + assert( device_id < (int32_t)FuncGblEntries.size() && "Unexpected device id!"); + FuncOrGblEntryTy &E = FuncGblEntries[device_id]; + + + + for(__tgt_offload_entry *i= E.Table.EntriesBegin, + *e= E.Table.EntriesEnd; iaddr == addr) + return true; + } + + return false; + } + + // Return the pointer to the target entries table + __tgt_target_table *getOffloadEntriesTable(int32_t device_id){ + assert( device_id < (int32_t)FuncGblEntries.size() && "Unexpected device id!"); + FuncOrGblEntryTy &E = FuncGblEntries[device_id]; + + return &E.Table; + } + + RTLDeviceInfoTy(int32_t num_devices){ + FuncGblEntries.resize(num_devices); + } + + ~RTLDeviceInfoTy(){ + // Close dynamic libraries + for(std::list::iterator + ii = DynLibs.begin(), ie = DynLibs.begin(); ii!=ie; ++ii) + if(ii->Handle){ + dlclose(ii->Handle); + remove(ii->FileName); + } + } +}; + +static RTLDeviceInfoTy DeviceInfo(NUMBER_OF_DEVICES); + + +#ifdef __cplusplus +extern "C" { +#endif + +int __tgt_rtl_device_type(int32_t device_id){ + + if( device_id < NUMBER_OF_DEVICES) + return 21; // EM_PPC64 + + return 0; +} + +int __tgt_rtl_number_of_devices(){ + return NUMBER_OF_DEVICES; +} + +int32_t __tgt_rtl_init_device(int32_t device_id){ + return OFFLOAD_SUCCESS; // success +} + +__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, __tgt_device_image *image){ + + DP("Dev %d: load binary from 0x%llx image\n", device_id, + (long long)image->ImageStart); + + assert(device_id>=0 && device_idImageEnd - (size_t)image->ImageStart; + size_t NumEntries = (size_t) (image->EntriesEnd - image->EntriesBegin); + DP("Expecting to have %ld entries defined.\n", (long)NumEntries); + + // We do not need to set the ELF version because the caller of this function + // had to do that to decide the right runtime to use + + //Obtain elf handler + Elf *e = elf_memory ((char*)image->ImageStart, ImageSize); + if(!e){ + DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1)); + return NULL; + } + + if( elf_kind(e) != ELF_K_ELF){ + DP("Invalid Elf kind!\n"); + elf_end(e); + return NULL; + } + + //Find the entries section offset + Elf_Scn *section = 0; + Elf64_Off entries_offset = 0; + + size_t shstrndx; + + if (elf_getshdrstrndx (e , &shstrndx )) { + DP("Unable to get ELF strings index!\n"); + elf_end(e); + return NULL; + } + + while ((section = elf_nextscn(e,section))) { + GElf_Shdr hdr; + gelf_getshdr(section, &hdr); + + if 
(!strcmp(elf_strptr(e,shstrndx,hdr.sh_name),OFFLOADSECTIONNAME)){ + entries_offset = hdr.sh_addr; + break; + } + } + + if (!entries_offset) { + DP("Entries Section Offset Not Found\n"); + elf_end(e); + return NULL; + } + + DP("Offset of entries section is (%016lx).\n", entries_offset); + + // load dynamic library and get the entry points. We use the dl library + // to do the loading of the library, but we could do it directly to avoid the + // dump to the temporary file. + // + // 1) Create tmp file with the library contents + // 2) Use dlopen to load the file and dlsym to retrieve the symbols + char tmp_name[] = "/tmp/tmpfile_XXXXXX"; + int tmp_fd = mkstemp (tmp_name); + + if( tmp_fd == -1 ){ + elf_end(e); + return NULL; + } + + FILE *ftmp = fdopen(tmp_fd, "wb"); + + if( !ftmp ){ + elf_end(e); + return NULL; + } + + fwrite(image->ImageStart,ImageSize,1,ftmp); + fclose(ftmp); + + DynLibTy Lib = { tmp_name, dlopen(tmp_name,RTLD_LAZY) }; + + if(!Lib.Handle){ + DP("target library loading error: %s\n",dlerror()); + elf_end(e); + return NULL; + } + + struct link_map *libInfo = (struct link_map *)Lib.Handle; + + // The place where the entries info is loaded is the library base address + // plus the offset determined from the ELF file. + Elf64_Addr entries_addr = libInfo->l_addr + entries_offset; + + DP("Pointer to first entry to be loaded is (%016lx).\n", entries_addr); + + // Table of pointers to all the entries in the target + __tgt_offload_entry *entries_table = (__tgt_offload_entry*)entries_addr; + + + __tgt_offload_entry *entries_begin = &entries_table[0]; + __tgt_offload_entry *entries_end = entries_begin + NumEntries; + + if(!entries_begin){ + DP("Can't obtain entries begin\n"); + elf_end(e); + return NULL; + } + + DP("Entries table range is (%016lx)->(%016lx)\n",(Elf64_Addr)entries_begin,(Elf64_Addr)entries_end) + DeviceInfo.createOffloadTable(device_id,entries_begin,entries_end); + + elf_end(e); + + return DeviceInfo.getOffloadEntriesTable(device_id); +} + +void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size){ + void *ptr = malloc(size); + return ptr; +} + +int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, int64_t size){ + memcpy(tgt_ptr,hst_ptr,size); + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, int64_t size){ + memcpy(hst_ptr,tgt_ptr,size); + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_data_delete(int32_t device_id, void* tgt_ptr){ + free(tgt_ptr); + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, + void **tgt_args, int32_t arg_num, int32_t team_num, int32_t thread_limit) +{ + // ignore team num and thread limit + + // Use libffi to launch execution + ffi_cif cif; + + // All args are references + std::vector args_types(arg_num, &ffi_type_pointer); + std::vector args(arg_num); + + for(int32_t i=0; i + +#ifndef _OMPTARGET_H_ +#define _OMPTARGET_H_ + +#define OFFLOAD_SUCCESS (0) +#define OFFLOAD_FAIL (~0) + +#define OFFLOAD_DEVICE_DEFAULT -1 +#define OFFLOAD_DEVICE_CONSTRUCTOR -2 +#define OFFLOAD_DEVICE_DESTRUCTOR -3 + +/// Data attributes for each data reference used in an OpenMP target region. 
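+/// These values are bit flags that are combined per argument; note that
+/// tgt_map_delete (0x18) includes the tgt_map_release bit (0x08). A "tofrom"
+/// mapping is presumably encoded as (tgt_map_to | tgt_map_from).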
+enum tgt_map_type{ + tgt_map_alloc = 0x00, // allocate memory in the device for this reference + tgt_map_to = 0x01, // copy the data to the device but do not update the host memory + tgt_map_from = 0x02, // copy the data to the host but do not update the device memory + tgt_map_always = 0x04, + tgt_map_release = 0x08, + tgt_map_delete = 0x18, + tgt_map_pointer = 0x20, + tgt_map_extra = 0x40 +}; + +/// This struct is a record of an entry point or global. For a function +/// entry point the size is expected to be zero +struct __tgt_offload_entry{ + void *addr; // Pointer to the offload entry info (function or global) + char *name; // Name of the function or global + int64_t size; // Size of the entry info (0 if it a function) +}; + +/// This struct is a record of the device image information +struct __tgt_device_image{ + void *ImageStart; // Pointer to the target code start + void *ImageEnd; // Pointer to the target code end + __tgt_offload_entry *EntriesBegin; // Begin of the table with all the entries + __tgt_offload_entry *EntriesEnd; // End of the table with all the entries (non inclusive) +}; + +/// This struct is a record of all the host code that may be offloaded to a target. +struct __tgt_bin_desc{ + int32_t NumDevices; // Number of devices supported + __tgt_device_image *DeviceImages; // Arrays of device images (one per device) + __tgt_offload_entry *EntriesBegin; // Begin of the table with all the entries + __tgt_offload_entry *EntriesEnd; // End of the table with all the entries (non inclusive) +}; + +/// This struct contains the offload entries identified by the target runtime +struct __tgt_target_table{ + __tgt_offload_entry *EntriesBegin; // Begin of the table with all the entries + __tgt_offload_entry *EntriesEnd; // End of the table with all the entries (non inclusive) +}; + +#ifdef __cplusplus +extern "C" { +#endif + +void omp_set_default_device(int device_num); +int omp_get_default_device(void); +int omp_get_num_devices(void); +int omp_is_initial_device(void); + +/// adds a target shared library to the target execution image +void __tgt_register_lib(__tgt_bin_desc *desc); + +/// removes a target shared library to the target execution image +void __tgt_unregister_lib(__tgt_bin_desc *desc); + +// creates host to the target data mapping, store it in the +// libtarget.so internal structure (an entry in a stack of data maps) and +// passes the data to the device; +void __tgt_target_data_begin(int32_t device_id, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types); +void __tgt_target_data_begin_nowait(int32_t device_id, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types, + int32_t depNum, void * depList, int32_t noAliasDepNum, void * noAliasDepList); + +// passes data from the target, release target memory and destroys the +// host-target mapping (top entry from the stack of data maps) created by +// the last __tgt_target_data_begin +void __tgt_target_data_end(int32_t device_id, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types); +void __tgt_target_data_end_nowait(int32_t device_id, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types, + int32_t depNum, void * depList, int32_t noAliasDepNum, void * noAliasDepList); + +/// passes data to/from the target +void __tgt_target_data_update(int32_t device_id, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types); +void 
__tgt_target_data_update_nowait(int32_t device_id, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types, + int32_t depNum, void * depList, int32_t noAliasDepNum, void * noAliasDepList); + +// performs the same actions as data_begin in case arg_num is non-zero +// and initiates run of offloaded region on target platform; if arg_num +// is non-zero after the region execution is done it also performs the +// same action as data_update and data_end aboveThe following types are +// used; this function return 0 if it was able to transfer the execution +// to a target and an int different from zero otherwise +int __tgt_target(int32_t device_id, void *host_ptr, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types); +int __tgt_target_nowait(int32_t device_id, void *host_ptr, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types, + int32_t depNum, void * depList, int32_t noAliasDepNum, void * noAliasDepList); + +int __tgt_target_teams(int32_t device_id, void *host_ptr, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types, + int32_t num_teams, int32_t thread_limit); +int __tgt_target_teams_nowait(int32_t device_id, void *host_ptr, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types, + int32_t num_teams, int32_t thread_limit, + int32_t depNum, void * depList, int32_t noAliasDepNum, void * noAliasDepList); + +#ifdef __cplusplus +} +#endif + +#ifdef OMPTARGET_DEBUG +# define DEBUGP(prefix, ...) { fprintf(stderr, "%s --> ", prefix); \ + fprintf(stderr, __VA_ARGS__); } +#else +# define DEBUGP(prefix, ...) {} +#endif + +#ifdef __cplusplus + #define EXTERN extern "C" +#else + #define EXTERN extern +#endif + +#endif // _OMPTARGET_H_ Index: libomptarget/src/omptarget.cpp =================================================================== --- /dev/null +++ libomptarget/src/omptarget.cpp @@ -0,0 +1,999 @@ +//===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// Implementation of the interface to be used by Clang during the codegen of a +// target region. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Header file global to this project */ +#include "omptarget.h" +#include "targets_info.h" + +#define DP(...) DEBUGP("Libomptarget", __VA_ARGS__) + +extern targets_info_table targets_info; + +struct RTLInfoTy; + +/// Map between host data and target data +struct HostDataToTargetTy { + long HstPtrBase; // host info + long HstPtrBegin; + long HstPtrEnd; // non-inclusive + + long TgtPtrBegin; // target info + long TgtPtrEnd; // non-inclusive (aee: not needed???) 
+
+  long RefCount;
+
+  HostDataToTargetTy() :
+    HstPtrBase(0), HstPtrBegin(0), HstPtrEnd(0),
+    TgtPtrBegin(0), TgtPtrEnd(0), RefCount(0) {}
+  HostDataToTargetTy(long BP, long B, long E, long TB, long TE) :
+    HstPtrBase(BP), HstPtrBegin(B), HstPtrEnd(E),
+    TgtPtrBegin(TB), TgtPtrEnd(TE), RefCount(1) {}
+};
+
+typedef std::list<HostDataToTargetTy> HostDataToTargetListTy;
+
+struct DeviceTy {
+  int32_t DeviceID;
+  RTLInfoTy *RTL;
+  int32_t RTLDeviceID;
+
+  bool IsInit;
+  HostDataToTargetListTy HostDataToTargetMap;
+  std::list<void *> PendingConstrDestrHostPtrList;
+
+  DeviceTy(RTLInfoTy *RTL) : DeviceID(-1), RTL(RTL), RTLDeviceID(-1),
+    IsInit(false), HostDataToTargetMap(), PendingConstrDestrHostPtrList() {}
+
+  void *getOrAllocTgtPtr(void* HstPtrBegin, void* HstPtrBase, long Size,
+    long &IsNew, long UpdateRefCount = true);
+  void *getTgtPtrBegin(void* HstPtrBegin, long Size, long &IsLast,
+    long UpdateRefCount = true);
+  void deallocTgtPtr(void* TgtPtrBegin, long Size, long ForceDelete);
+
+  // calls to RTL
+  int32_t init();
+  __tgt_target_table *load_binary(void* Img);
+
+  int32_t data_submit(void* TgtPtrBegin, void* HstPtrBegin, int64_t Size);
+  int32_t data_retrieve(void* HstPtrBegin, void* TgtPtrBegin, int64_t Size);
+
+  int32_t run_region(void* TgtEntryPtr, void** TgtVarsPtr, int32_t TgtVarsSize);
+  int32_t run_team_region(void* TgtEntryPtr, void** TgtVarsPtr,
+    int32_t TgtVarsSize, int32_t NumTeams, int32_t ThreadLimit);
+};
+
+struct RTLInfoTy{
+  typedef int32_t (device_type_ty)(int32_t);
+  typedef int32_t (number_of_devices_ty)();
+  typedef int32_t (init_device_ty)(int32_t);
+  typedef __tgt_target_table* (load_binary_ty)(int32_t, void*);
+  typedef void* (data_alloc_ty)(int32_t, int64_t);
+  typedef int32_t (data_submit_ty)(int32_t, void*, void*, int64_t);
+  typedef int32_t (data_retrieve_ty)(int32_t, void*, void*, int64_t);
+  typedef int32_t (data_delete_ty)(int32_t, void*);
+  typedef int32_t (run_region_ty)(int32_t, void*, void**, int32_t);
+  typedef int32_t (run_team_region_ty)(int32_t, void*, void**, int32_t, int32_t, int32_t);
+
+  int32_t Idx;              // RTL index; the index equals the number of devices
+                            // of other RTLs that were registered before this one.
+  int32_t NumberOfDevices;  // Number of devices this RTL deals with.
+  std::vector<DeviceTy *> Devices;  // one per device (NumberOfDevices in total)
+
+  void *LibraryHandler;
+
+  // Functions implemented in the RTL.
+  device_type_ty *device_type;
+  number_of_devices_ty *number_of_devices;
+  init_device_ty *init_device;
+  load_binary_ty *load_binary;
+  data_alloc_ty *data_alloc;
+  data_submit_ty *data_submit;
+  data_retrieve_ty *data_retrieve;
+  data_delete_ty *data_delete;
+  run_region_ty *run_region;
+  run_team_region_ty *run_team_region;
+};
+
+/// RTLs identified in the system
+typedef std::list<RTLInfoTy> RTLsTy;
+static RTLsTy RTLs;
+
+/// Map between device type (ELF machine id) and RTL
+typedef std::map<uint16_t, RTLInfoTy *> DTypeToRTLMapTy;
+static DTypeToRTLMapTy DTypeToRTLMap;
+
+/// Map between Device ID (i.e. OpenMP device id) and its DeviceTy
+typedef std::vector<DeviceTy> DevicesTy;
+static DevicesTy Devices;
+
+/// Map between the host entry begin and the translation table. Each
+/// registered library gets one TranslationTable.
+/// Use the map from __tgt_offload_entry so that we can quickly determine
+/// whether we are trying to re-register an existing lib or really have a new one.
+struct TranslationTable{
+  __tgt_target_table HostTable;
+
+  // Image assigned to a given device.
+  std::vector<__tgt_device_image*> TargetsImages; // One image per device ID.
+
+  // Table of entry points or NULL if it was not already computed.
+  std::vector<__tgt_target_table*> TargetsTable; // One table per device ID.
+};
+typedef std::map<__tgt_offload_entry*, TranslationTable> HostEntriesBeginToTransTableTy;
+static HostEntriesBeginToTransTableTy HostEntriesBeginToTransTable;
+
+/// Map between the host ptr and a table index
+struct TableMap{
+  TranslationTable *Table; // table associated with the host ptr
+  uint32_t Index; // index at which the host ptr's translated entry is found
+  TableMap() : Table(0), Index(0) {}
+  TableMap(TranslationTable *table, uint32_t index) : Table(table), Index(index) {}
+};
+typedef std::map<void *, TableMap> HostPtrToTableMapTy;
+static HostPtrToTableMapTy HostPtrToTableMap;
+
+
+////////////////////////////////////////////////////////////////////////////////
+// getters and setters
+//
+// aee: non compliant. This needs to be integrated in KMPC; we can keep it
+// here for the moment.
+
+static int DefaultDevice = 0;
+
+// aee: non-compliant
+void omp_set_default_device(int device_num){
+  DefaultDevice = device_num;
+}
+
+// aee: non-compliant
+int omp_get_default_device(void){
+  return DefaultDevice;
+}
+
+int omp_get_num_devices(void) {
+  return Devices.size();
+}
+
+int omp_is_initial_device(void){
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// functionality for device
+
+// Return the target pointer begin (where the data will be moved to).
+void *DeviceTy::getTgtPtrBegin(void* HstPtrBegin, long Size, long &IsLast,
+    long UpdateRefCount)
+{
+  long hp = (long) HstPtrBegin;
+  IsLast = false;
+
+  for (HostDataToTargetListTy::iterator
+      ii=HostDataToTargetMap.begin(),
+      ie=HostDataToTargetMap.end(); ii!=ie ; ++ii) {
+    HostDataToTargetTy &HT = *ii;
+    if (hp >= HT.HstPtrBegin && hp < HT.HstPtrEnd){
+      if ((hp + Size) > HT.HstPtrEnd) {
+        DP("WARNING: Array contain pointer but does not contain the complete section\n");
+      }
+
+      IsLast = !(HT.RefCount > 1);
+
+      if (HT.RefCount > 1 && UpdateRefCount)
+        --HT.RefCount;
+
+      long tp = HT.TgtPtrBegin + (hp - HT.HstPtrBegin);
+      return (void*)tp;
+    }
+  }
+
+  return NULL;
+}
+
+// Return the target pointer begin, allocating target memory if needed.
+void *DeviceTy::getOrAllocTgtPtr(void* HstPtrBegin, void* HstPtrBase,
+    long Size, long &IsNew, long UpdateRefCount)
+{
+  long hp = (long) HstPtrBegin;
+  IsNew = false;
+
+  // Check if the pointer is already contained in an existing entry.
+  for (HostDataToTargetListTy::iterator
+      ii=HostDataToTargetMap.begin(),
+      ie=HostDataToTargetMap.end(); ii!=ie ; ++ii) {
+    HostDataToTargetTy &HT = *ii;
+
+    // Is it contained?
+ if (hp >= HT.HstPtrBegin && hp < HT.HstPtrEnd){ + if ((hp + Size) > HT.HstPtrEnd) { + DP("WARNING: Array contain pointer but does not contain the complete section\n"); + } + if (UpdateRefCount) + ++HT.RefCount; + long tp = HT.TgtPtrBegin + (hp - HT.HstPtrBegin); + return (void*)tp; + } + } + + //It is not contained we should create a new entry for it + IsNew = true; + long tp = (long) RTL->data_alloc(RTLDeviceID, Size); + HostDataToTargetMap.push_front( + HostDataToTargetTy((long)HstPtrBase, hp, hp+Size, tp, tp+Size)); + return (void*)tp; +} + + +void DeviceTy::deallocTgtPtr(void* HstPtrBegin, long Size, long ForceDelete) +{ + long hp = (long)HstPtrBegin; + + // Check if the pointer is contained in any sub-nodes + for (HostDataToTargetListTy::iterator + ii=HostDataToTargetMap.begin(), + ie=HostDataToTargetMap.end(); ii!=ie ; ++ii) { + HostDataToTargetTy &HT = *ii; + + // It is contained? + if (hp >= HT.HstPtrBegin && hp < HT.HstPtrEnd) { + if ((hp + Size) > HT.HstPtrEnd) { + DP("WARNING: Array contain pointer but does not contain the complete section\n"); + } + if (ForceDelete) HT.RefCount = 1; + if (--HT.RefCount <= 0) { + assert(HT.RefCount == 0 && "did not expect a negative ref count"); + DP("Deleting tgt data 0x%016llx of size %lld\n", + (long long) HT.TgtPtrBegin, + (long long)Size); + RTL->data_delete(RTLDeviceID, (void *) HT.TgtPtrBegin); + HostDataToTargetMap.erase(ii); + } + return; + } + } + DP("Section to delete (hst addr 0x%llx) does not exist in the allocated memory\n", + (unsigned long long) hp); +} + +// init device +int32_t DeviceTy::init() +{ + int32_t rc = RTL->init_device(RTLDeviceID); + if (rc == OFFLOAD_SUCCESS) { + IsInit = true; + } + return rc; +} + +// load binary to device +__tgt_target_table *DeviceTy::load_binary(void* Img) +{ + return RTL->load_binary(RTLDeviceID, Img); +} + +// submit data to device +int32_t DeviceTy::data_submit(void* TgtPtrBegin, void* HstPtrBegin, + int64_t Size) +{ + return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size); +} + +// retrieve data from device +int32_t DeviceTy::data_retrieve(void* HstPtrBegin, void* TgtPtrBegin, + int64_t Size) +{ + return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size); +} + +// run region on device +int32_t DeviceTy::run_region(void* TgtEntryPtr, void** TgtVarsPtr, + int32_t TgtVarsSize) +{ + return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtVarsSize); +} + +// run team region on device +int32_t DeviceTy::run_team_region(void* TgtEntryPtr, void** TgtVarsPtr, + int32_t TgtVarsSize, int32_t NumTeams, int32_t ThreadLimit) +{ + return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, + TgtVarsSize, NumTeams, ThreadLimit); +} + +//////////////////////////////////////////////////////////////////////////////// +// functionality for registering libs + +static void RegisterImageIntoTranslationTable(TranslationTable &TT, RTLInfoTy &RTL, + __tgt_device_image *image){ + + // sane size, as when we increase the one, we also increase the other + assert(TT.TargetsTable.size() == TT.TargetsImages.size() + && "We should have as many images as we have tables!"); + + // Resize the Targets Table and Images to accommodate the new targets if required + unsigned TargetsTableMinimumSize = RTL.Idx + RTL.NumberOfDevices; + + if (TT.TargetsTable.size() < TargetsTableMinimumSize){ + TT.TargetsImages.resize(TargetsTableMinimumSize, 0); + TT.TargetsTable.resize(TargetsTableMinimumSize, 0); + } + + // Register the image in all devices for this target type + for(int32_t i=0; 
iEntriesBegin){ + DP("Invalid Host entries!\n"); + return; + } + + DP("Registering entries starting at %016lx...\n", + (Elf64_Addr)desc->EntriesBegin->addr); + + HostEntriesBeginToTransTableTy::iterator TransTableIt = + HostEntriesBeginToTransTable.find(desc->EntriesBegin); + + if (TransTableIt != HostEntriesBeginToTransTable.end()){ + DP("Already registered!\n"); + return; + } + + // Initialize translation table for this + TranslationTable &TransTable = HostEntriesBeginToTransTable[desc->EntriesBegin]; + TransTable.HostTable.EntriesBegin = desc->EntriesBegin; + TransTable.HostTable.EntriesEnd = desc->EntriesEnd; + + // Scan all the device images + for(int32_t i=0; iNumDevices; ++i){ + + __tgt_device_image &img = desc->DeviceImages[i]; + char *img_begin = (char*)img.ImageStart; + char *img_end = (char*)img.ImageEnd; + size_t img_size = img_end - img_begin; + + //Obtain elf handler + Elf *e = elf_memory(img_begin, img_size); + if (!e){ + DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1)); + continue; + } + + //Check if ELF is the right kind + if (elf_kind(e) != ELF_K_ELF){ + DP("Unexpected ELF type!\n"); + continue; + } + Elf64_Ehdr *eh64 = elf64_getehdr(e); + Elf32_Ehdr *eh32 = elf32_getehdr(e); + uint16_t MachineID; + if (eh64 && !eh32) + MachineID = eh64->e_machine; + else if (eh32 && !eh64) + MachineID = eh32->e_machine; + else{ + DP("Ambiguous ELF header!\n"); + continue; + } + + DTypeToRTLMapTy::iterator RTLHandler = DTypeToRTLMap.find(MachineID); + + // We already have an handler for this device's RTL? + // If so insert the Image <-> RTL Map entry and continue + if (RTLHandler != DTypeToRTLMap.end()){ + + // We were unable to find a runtime for this device before... + if (RTLHandler->second == 0) + continue; + + RegisterImageIntoTranslationTable(TransTable, *RTLHandler->second, &img); + continue; + } + + // Locate the library name and create an handler + void *dynlib_handle = 0; + + DP("Looking for RTL libraries: with machine id %d\n", MachineID); + for(int32_t j=0; jNumberOfDevices, device); + for (int32_t device_id = 0; device_id < RTL->NumberOfDevices; device_id++) { + // global device ID + Devices[start + device_id].DeviceID = start + device_id; + // RTL local device ID + Devices[start + device_id].RTLDeviceID = device_id; + + // Save pointer to device in RTL in case we want to unregister the RTL + RTL->Devices.push_back(&Devices[start + device_id]); + } + + DP("Registering image %016lx with RTL!\n", (Elf64_Addr)img_begin); + RegisterImageIntoTranslationTable(TransTable, *RTL, &img); + } + + DP("Done register entries!\n"); +} + +//////////////////////////////////////////////////////////////////////////////// +/// unloads a target shared library +EXTERN void __tgt_unregister_lib(__tgt_bin_desc *desc){ + DP("Unloading target library!\n"); + return; +} + +/// Internal function to do the mapping and transfer the data to the device +static void target_data_begin(DeviceTy & Device, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types) +{ + // process each input + for(int32_t i=0; i (tgt:%016lx)\n", (long)arg_sizes[i], + (long)HstPtrBegin, (long)TgtPtrBegin); + Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i]); + } + + if (arg_types[i] & tgt_map_pointer /* && Pointer_IsNew */){ + DP("Update pointer (%016lx) -> [%016lx]\n", + (long)Pointer_TgtPtrBegin, (long)TgtPtrBegin); + uint64_t Delta = (uint64_t) HstPtrBegin - (uint64_t) HstPtrBase; + void *TgrPtrBase_Value = (void *)((uint64_t) TgtPtrBegin - Delta); + 
Device.data_submit(Pointer_TgtPtrBegin, &TgrPtrBase_Value, sizeof(void*)); + } + } +} + +EXTERN void __tgt_target_data_begin_nowait(int32_t device_id, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types, + int32_t depNum, void * depList, int32_t noAliasDepNum, void * noAliasDepList) +{ + __tgt_target_data_begin(device_id, arg_num, args_base, args, arg_sizes, + arg_types); +} + +/// creates host to the target data mapping, store it in the +/// libtarget.so internal structure (an entry in a stack of data maps) +/// and passes the data to the device; +EXTERN void __tgt_target_data_begin(int32_t device_id, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types) +{ + DP("Entering data begin region for device %d with %d mappings\n", + device_id, arg_num); + + // No devices available? + if (device_id == OFFLOAD_DEVICE_DEFAULT) { + device_id = omp_get_default_device(); + DP("Use default device id %d\n", device_id); + } + if (Devices.size() <= (size_t)device_id){ + DP("Device ID %d does not have a matching RTL.\n", device_id); + return; + } + + // Get device info + DeviceTy & Device = Devices[device_id]; + // Init the device if not done before + if (!Device.IsInit) { + if (Device.init() != OFFLOAD_SUCCESS) { + DP("failed to init device %d\n", device_id); + return; + } + } + + target_data_begin(Device, arg_num, args_base, args, arg_sizes, arg_types); +} + + +/// Internal function to undo the mapping and retrieve the data from the device +static void target_data_end(DeviceTy & Device, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types) +{ + // process each input + for(int32_t i=0; i (hst:%016lx)\n", (long)arg_sizes[i], + (long)TgtPtrBegin, (long)HstPtrBegin); + Device.data_retrieve(HstPtrBegin, TgtPtrBegin, arg_sizes[i]); + } + if (IsLast || ForceDelete) { + Device.deallocTgtPtr(HstPtrBegin, arg_sizes[i], ForceDelete); + } + } +} + +/// passes data from the target, release target memory and destroys +/// the host-target mapping (top entry from the stack of data maps) +/// created by the last __tgt_target_data_begin +EXTERN void __tgt_target_data_end(int32_t device_id, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types) +{ + DP("Entering data end region with %d mappings\n", arg_num); + + // No devices available? + if (device_id == OFFLOAD_DEVICE_DEFAULT) { + device_id = omp_get_default_device(); + } + if (Devices.size() <= (size_t)device_id){ + DP("Device ID %d does not have a matching RTL.\n", device_id); + return; + } + + DeviceTy & Device = Devices[device_id]; + if (!Device.IsInit) { + DP("uninit device: ignore"); + return; + } + + target_data_end(Device, arg_num, args_base, args, arg_sizes, arg_types); + +} + +EXTERN void __tgt_target_data_end_nowait(int32_t device_id, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types, + int32_t depNum, void * depList, int32_t noAliasDepNum, void * noAliasDepList) +{ + __tgt_target_data_end(device_id, arg_num, args_base, args, arg_sizes, + arg_types); +} + +/// passes data to/from the target +EXTERN void __tgt_target_data_update(int32_t device_id, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types) +{ + DP("Entering data update with %d mappings\n", arg_num); + + // No devices available? 
+ if (device_id == OFFLOAD_DEVICE_DEFAULT) { + device_id = omp_get_default_device(); + } + if (Devices.size() <= (size_t)device_id){ + DP("Device ID %d does not have a matching RTL.\n", device_id); + return; + } + + // Get device info + DeviceTy & Device = Devices[device_id]; + if (!Device.IsInit) { + DP("uninit device: ignore"); + return; + } + + // process each input + for(int32_t i=0; i (hst:%016lx)\n", (long)arg_sizes[i], + (long)TgtPtrBegin, (long)HstPtrBegin); + Device.data_retrieve(HstPtrBegin, TgtPtrBegin, arg_sizes[i]); + } + if (arg_types[i] & tgt_map_to) { + DP("Moving %ld bytes (hst:%016lx) -> (tgt:%016lx)\n", (long)arg_sizes[i], + (long)HstPtrBegin, (long)TgtPtrBegin); + Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i]); + } + } +} + +EXTERN void __tgt_target_data_update_nowait(int32_t device_id, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types, + int32_t depNum, void * depList, int32_t noAliasDepNum, void * noAliasDepList) +{ + __tgt_target_data_update(device_id, arg_num, args_base, args, arg_sizes, + arg_types); +} + +/// performs the same actions as data_begin in case arg_num is +/// non-zero and initiates run of offloaded region on target platform; +/// if arg_num is non-zero after the region execution is done it also +/// performs the same action as data_update and data_end aboveThe +/// following types are used; this function return 0 if it was able to +/// transfer the execution to a target and an int different from zero +/// otherwise +static int target(int32_t device_id, void *host_ptr, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types, + int32_t team_num, int32_t thread_limit, + int IsTeamConstruct, int IsConstrDestrRecursiveCall) +{ + DP("Entering target region with entry point %016lx and device Id %d\n", + (Elf64_Addr)host_ptr, device_id); + + if (device_id == OFFLOAD_DEVICE_DEFAULT) { + device_id = omp_get_default_device(); + } + // got a new constructor/destructor? + if (device_id == OFFLOAD_DEVICE_CONSTRUCTOR || + device_id == OFFLOAD_DEVICE_DESTRUCTOR) { + DP("Got a constructor/destructor\n"); + for(unsigned D=0; D=0 && (size_t)device_id::iterator + ii=Device.PendingConstrDestrHostPtrList.begin(), + ie=Device.PendingConstrDestrHostPtrList.end(); ii!=ie ; ++ii) { + void *ConstrDestrHostPtr = *ii; + int rc = target(device_id, ConstrDestrHostPtr, 0, NULL, NULL, NULL, NULL, 1, 1, + true /*team*/, true /*recursive*/); + if (rc != OFFLOAD_SUCCESS) { + DP("failed to run constr/destr... enqueue it\n"); + return OFFLOAD_FAIL; + } + } + DP("done with pending constr/destr\n"); + Device.PendingConstrDestrHostPtrList.clear(); + } + + // Find the table information in the map or look it up in the translation tables + TableMap *TM = 0; + HostPtrToTableMapTy::iterator TableMapIt = HostPtrToTableMap.find(host_ptr); + if (TableMapIt == HostPtrToTableMap.end()){ + // We don't have a map. 
So search all the registered libraries + for (HostEntriesBeginToTransTableTy::iterator + ii = HostEntriesBeginToTransTable.begin(), + ie = HostEntriesBeginToTransTable.end(); !TM && ii!=ie ; ++ii){ + // get the translation table (which contains all the good info) + TranslationTable *TransTable = &ii->second; + // iterate over all the host table entries to see if we can locate the host_ptr + __tgt_offload_entry *begin = TransTable->HostTable.EntriesBegin; + __tgt_offload_entry *end = TransTable->HostTable.EntriesEnd; + __tgt_offload_entry *cur = begin; + for (uint32_t i=0; cur < end; ++cur, ++i) { + if (cur->addr != host_ptr) + continue; + // we got a match, now fill the HostPtrToTableMap so that we + // may avoid this search next time. + TM = &HostPtrToTableMap[host_ptr]; + TM->Table = TransTable; + TM->Index = i; + break; + } + } + } else { + TM = &TableMapIt->second; + } + // No map for this host pointer found! + if (!TM){ + DP("Host ptr %016lx does not have a matching target pointer.\n", + (Elf64_Addr)host_ptr); + return OFFLOAD_FAIL; + } + + // get target table + assert(TM->Table->TargetsTable.size() > (size_t)device_id + && "Not expecting a device ID outside the tables bounds!"); + __tgt_target_table *TargetTable = TM->Table->TargetsTable[device_id]; + // if first call, need to move the data + if (!TargetTable) + { + // 1) get image + assert(TM->Table->TargetsImages.size() > (size_t)device_id + && "Not expecting a device ID outside the tables bounds!"); + __tgt_device_image *img = TM->Table->TargetsImages[device_id]; + if (!img){ + DP("No image loaded for device id %d.\n", device_id); + return OFFLOAD_FAIL; + } + // 2) load image into the target table + TargetTable = TM->Table->TargetsTable[device_id] = Device.load_binary(img); + // Unable to get table for this image: invalidate image and fail + if (!TargetTable){ + DP("Unable to generate entries table for device id %d.\n", device_id); + TM->Table->TargetsImages[device_id] = 0; + return OFFLOAD_FAIL; + } + + // Verify if the two tables sizes match + size_t hsize = TM->Table->HostTable.EntriesEnd - + TM->Table->HostTable.EntriesBegin; + size_t tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin; + + // Invalid image for this host entries! 
+ if (hsize != tsize){ + DP("Host and Target tables mismatch for device id %d [%lx != %lx].\n", + device_id, hsize, tsize); + TM->Table->TargetsImages[device_id] = 0; + TM->Table->TargetsTable[device_id] = 0; + return OFFLOAD_FAIL; + } + assert(TM->Index < hsize && + "Not expecting index greater than the table size"); + + // process global data that needs to be mapped + __tgt_target_table *HostTable = &TM->Table->HostTable; + for(__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin, + *CurrHostEntry = HostTable->EntriesBegin, + *EntryDeviceEnd = TargetTable->EntriesEnd; + CurrDeviceEntry != EntryDeviceEnd; + CurrDeviceEntry++, CurrHostEntry++) { + if (CurrDeviceEntry->size != 0) { + // has data + assert(CurrDeviceEntry->size == CurrHostEntry->size && "data size mismatch"); + long IsLast; + assert(Device.getTgtPtrBegin(CurrHostEntry->addr, CurrHostEntry->size, + IsLast, false) == NULL && "data in declared target should not be already mapped"); + // add entry to map + DP("add mapping from host 0x%llx to 0x%llx with size %lld\n\n", + (unsigned long long) CurrHostEntry->addr, + (unsigned long long)CurrDeviceEntry->addr, + (unsigned long long)CurrDeviceEntry->size); + Device.HostDataToTargetMap.push_front(HostDataToTargetTy( + (long)CurrHostEntry->addr, (long)CurrHostEntry->addr, + (long)CurrHostEntry->addr + CurrHostEntry->size, + (long)CurrDeviceEntry->addr, + (long)CurrDeviceEntry->addr+CurrDeviceEntry->size)); + } + } + } + + //Move data to device + target_data_begin(Device, arg_num, args_base, args, arg_sizes, arg_types); + + std::vector tgt_args; + + for(int32_t i=0; iEntriesBegin[TM->Index].addr, TM->Index); + if (IsTeamConstruct) { + rc = Device.run_team_region(TargetTable->EntriesBegin[TM->Index].addr, + &tgt_args[0], tgt_args.size(), team_num, thread_limit); + } else { + rc = Device.run_region(TargetTable->EntriesBegin[TM->Index].addr, + &tgt_args[0], tgt_args.size()); + } + + if (rc) + return OFFLOAD_FAIL; + + //Move data from device + target_data_end(Device, arg_num, args_base, args, arg_sizes, arg_types); + return OFFLOAD_SUCCESS; +} + +EXTERN int __tgt_target(int32_t device_id, void *host_ptr, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types) +{ + return target(device_id, host_ptr, arg_num, args_base, args, + arg_sizes, arg_types, 0, 0, false /*team*/, false /*recursive*/); +} + +EXTERN int __tgt_target_nowait(int32_t device_id, void *host_ptr, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types, + int32_t depNum, void * depList, int32_t noAliasDepNum, void * noAliasDepList) +{ + return __tgt_target(device_id, host_ptr, arg_num, args_base, args, arg_sizes, + arg_types); +} + +EXTERN int __tgt_target_teams(int32_t device_id, void *host_ptr, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types, + int32_t team_num, int32_t thread_limit) +{ + return target(device_id, host_ptr, arg_num, args_base, args, + arg_sizes, arg_types, team_num, thread_limit, + true /*team*/, false /*recursive*/); +} + +EXTERN int __tgt_target_teams_nowait(int32_t device_id, void *host_ptr, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types, + int32_t team_num, int32_t thread_limit, + int32_t depNum, void * depList, int32_t noAliasDepNum, void * noAliasDepList) +{ + return __tgt_target_teams(device_id, host_ptr, arg_num, args_base, args, + arg_sizes, arg_types, team_num, thread_limit); +} + + 
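
The __tgt_target* entry points above are normally invoked from compiler-generated code, so it may help to see the calling convention spelled out. The sketch below is illustrative only and not part of the patch: it shows how a single map(tofrom:) scalar could be passed to __tgt_target by hand. The symbol hypothetical_host_entry is made up; in real code it would be the address of the outlined target region registered through __tgt_register_lib.

    // Illustrative sketch: hand-rolled equivalent of the code Clang would emit
    // for "#pragma omp target map(tofrom: x)".
    #include <stdint.h>
    #include "omptarget.h"

    extern void *hypothetical_host_entry; // assumption: address of the outlined region

    int launch_example(int &x) {
      void   *args_base[] = { &x };
      void   *args[]      = { &x };
      int64_t arg_sizes[] = { sizeof(int) };
      int32_t arg_types[] = { tgt_map_to | tgt_map_from };

      // A zero return value (OFFLOAD_SUCCESS) means the region ran on a device;
      // a non-zero value tells the caller to run the host fallback version.
      return __tgt_target(OFFLOAD_DEVICE_DEFAULT, hypothetical_host_entry,
                          1, args_base, args, arg_sizes, arg_types);
    }

Internally, target() above performs the same mapping work as __tgt_target_data_begin before launching the entry and the same work as __tgt_target_data_end afterwards.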
+////////////////////////////////////////////////////////////////////////////////
+// Temporary for debugging (matching the ones in omptarget-nvptx).
+// REMOVE XXXXXX (here and in omptarget-nvptx)
+
+EXTERN void __kmpc_kernel_print(char *title)
+{
+  DP(" %s\n", title);
+}
+
+EXTERN void __kmpc_kernel_print_int8(char *title, int64_t data)
+{
+  DP(" %s val=%lld\n", title, (long long)data);
+}
+
Index: libomptarget/src/targets_info.h
===================================================================
--- /dev/null
+++ libomptarget/src/targets_info.h
@@ -0,0 +1,28 @@
+//===-------- targets_info.h - Information about Target RTLs ------ C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Map between ELF machine IDs and the RTL library that supports it
+//
+//===----------------------------------------------------------------------===//
+
+#include <stdint.h>
+
+#ifndef _TARGETS_INFO_H_
+#define _TARGETS_INFO_H_
+
+struct targets_info_table_entry{
+  uint16_t Machine_Elf_ID;
+  const char *Machine_RTL_Lib;
+};
+struct targets_info_table{
+  int32_t Number_of_Entries;
+  targets_info_table_entry *Entries;
+};
+
+#endif
Index: libomptarget/src/targets_info.cpp
===================================================================
--- /dev/null
+++ libomptarget/src/targets_info.cpp
@@ -0,0 +1,27 @@
+//===-------- targets_info.cpp - Information about Target RTLs ---- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+// +//===----------------------------------------------------------------------===// +// +// Map between ELF machine IDs and the RTL library that supports it +// +//===----------------------------------------------------------------------===// + + +#include "targets_info.h" + +static targets_info_table_entry targets_info_entries[] = { + { 21 /* EM_PPC64 */ , "libomptarget.rtl.ppc64.so"}, + { 62 /* EM_X86_64*/ , "libomptarget.rtl.x86_64.so"}, + { 190 /* EM_CUDA */ , "libomptarget.rtl.cuda.so"} +}; + +targets_info_table targets_info = { + sizeof(targets_info_entries) / sizeof(targets_info_table_entry), + &targets_info_entries[0] +}; + Index: libomptarget/test/CMakeLists.txt =================================================================== --- /dev/null +++ libomptarget/test/CMakeLists.txt @@ -0,0 +1,93 @@ +# CMakeLists.txt file for unit testing OpenMP Library +include(FindPythonInterp) +include(CheckTypeSize) +if(NOT PYTHONINTERP_FOUND) + libomptarget_warning_say("Could not find Python.") + libomptarget_warning_say("The check-libomptarget target will not be available!") + return() +endif() + +set(LIBOMPTARGET_TEST_CFLAGS "" CACHE STRING + "Extra compiler flags to send to the test compiler") + +if(${LIBOMPTARGET_STANDALONE_BUILD}) + # Make sure we can use the console pool for recent cmake and ninja > 1.5 + if(CMAKE_VERSION VERSION_LESS 3.1.20141117) + set(cmake_3_2_USES_TERMINAL) + else() + set(cmake_3_2_USES_TERMINAL USES_TERMINAL) + endif() + set(LIBOMPTARGET_TEST_C_COMPILER ${CMAKE_C_COMPILER} CACHE STRING + "C compiler to use for testing OpenMP offloading library") + set(LIBOMPTARGET_TEST_CXX_COMPILER ${CMAKE_CXX_COMPILER} CACHE STRING + "C++ compiler to use for testing OpenMP offloading library") + set(LIBOMPTARGET_TEST_OPENMP_FLAG -fopenmp CACHE STRING + "OpenMP compiler flag to use for testing OpenMP offloading library") + set(LIBOMPTARGET_LLVM_LIT_EXECUTABLE "" CACHE STRING + "Path to llvm-lit") + find_program(LIT_EXECUTABLE NAMES llvm-lit ${LIBOMPTARGET_LLVM_LIT_EXECUTABLE}) + if(NOT LIT_EXECUTABLE) + libomptarget_say("Cannot find llvm-lit.") + libomptarget_say("Please put llvm-lit in your PATH or set LIBOMPTARGET_LLVM_LIT_EXECUTABLE to its full path") + libomptarget_warning_say("The check-libomptarget target will not be available!") + return() + endif() + + set(LIBOMPTARGET_FILECHECK_EXECUTABLE "" CACHE STRING + "Path to FileCheck") + find_program(LIBOMPTARGET_FILECHECK NAMES FileCheck ${LIBOMPTARGET_FILECHECK_EXECUTABLE}) + if(NOT LIBOMPTARGET_FILECHECK) + libomptarget_say("Cannot find FileCheck.") + libomptarget_say("Please put FileCheck in your PATH or set LIBOMPTARGET_FILECHECK_EXECUTABLE to its full path") + libomptarget_warning_say("The check-libomptarget target will not be available!") + return() + endif() + + # Set lit arguments + # The -j 1 lets the actual tests run with the entire machine. + # We have one test thread that spawns the tests serially. This allows + # Each test to use the entire machine. 
+ set(LIBOMPTARGET_LIT_ARGS_DEFAULT "-sv --show-unsupported --show-xfail -j 1") + if(MSVC OR XCODE) + set(LIBOMPTARGET_LIT_ARGS_DEFAULT "${LIBOMPTARGET_LIT_ARGS_DEFAULT} --no-progress-bar") + endif() + set(LIBOMPTARGET_LIT_ARGS "${LIBOMPTARGET_LIT_ARGS_DEFAULT}" CACHE STRING + "Default options for lit") + separate_arguments(LIBOMPTARGET_LIT_ARGS) + add_custom_target(check-libomptarget + COMMAND ${PYTHON_EXECUTABLE} ${LIT_EXECUTABLE} ${LIBOMPTARGET_LIT_ARGS} ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS omptarget + COMMENT "Running libomptarget tests" + ${cmake_3_2_USES_TERMINAL} + ) + + set(LIBOMPTARGET_OPENMP_HEADER_FOLDER "" CACHE STRING + "Path to folder containing omp.h") + set(LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER "" CACHE STRING + "Path to folder containing libomp.h") +else() + # LLVM source tree build, test just-built clang + if(NOT MSVC) + set(LIBOMPTARGET_TEST_C_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang) + set(LIBOMPTARGET_TEST_CXX_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang++) + set(LIBOMPTARGET_FILECHECK ${LLVM_RUNTIME_OUTPUT_INTDIR}/FileCheck) + else() + libomptarget_error_say("Not prepared to run tests on Windows systems.") + endif() + set(LIBOMPTARGET_TEST_OPENMP_FLAG -fopenmp=libomp) + # Use add_lit_testsuite() from LLVM CMake. This also depends on OpenMP + # implementation because it uses omp.h. + add_lit_testsuite(check-libomptarget + "Running libomptarget tests" + ${CMAKE_CURRENT_BINARY_DIR} + ARGS "-j 1" + DEPENDS omptarget omp + ) + + set(LIBOMPTARGET_OPENMP_HEADER_FOLDER "${LIBOMPTARGET_BINARY_DIR}/../runtime/src") +endif() + +# Configure the lit.site.cfg.in file +set(AUTO_GEN_COMMENT "## Autogenerated by libomptarget configuration.\n# Do not edit!") +configure_file(lit.site.cfg.in lit.site.cfg @ONLY) + Index: libomptarget/test/lit.cfg =================================================================== --- /dev/null +++ libomptarget/test/lit.cfg @@ -0,0 +1,116 @@ +# -*- Python -*- vim: set ft=python ts=4 sw=4 expandtab tw=79: +# Configuration file for the 'lit' test runner. + +import os +import lit.formats + +# Tell pylint that we know config and lit_config exist somewhere. +if 'PYLINT_IMPORT' in os.environ: + config = object() + lit_config = object() + +def append_dynamic_library_path(name, value, sep): + if name in config.environment: + config.environment[name] = value + sep + config.environment[name] + else: + config.environment[name] = value + +# name: The name of this test suite. +config.name = 'libomptarget' + +# suffixes: A list of file extensions to treat as test files. +config.suffixes = ['.c', '.cpp', '.cc'] + +# test_source_root: The root path where tests are located. 
+config.test_source_root = os.path.dirname(__file__)
+
+# test_exec_root: The root object directory where output is placed.
+config.test_exec_root = config.libomptarget_obj_root
+
+# test format
+config.test_format = lit.formats.ShTest()
+
+# compiler flags
+config.test_cflags = config.test_openmp_flag + \
+    " -I " + config.test_source_root + \
+    " -I " + config.omp_header_directory + \
+    " -L " + config.library_dir
+
+if config.omp_host_rtl_directory:
+    config.test_cflags = config.test_cflags + " -L " + \
+        config.omp_host_rtl_directory
+
+config.test_cflags = config.test_cflags + " " + config.test_extra_cflags
+
+# Set up the environment so the dynamic libraries are found at runtime.
+if config.operating_system == 'Windows':
+    append_dynamic_library_path('PATH', config.library_dir, ";")
+    append_dynamic_library_path('PATH', config.omp_host_rtl_directory, ";")
+elif config.operating_system == 'Darwin':
+    append_dynamic_library_path('DYLD_LIBRARY_PATH', config.library_dir, ":")
+    append_dynamic_library_path('DYLD_LIBRARY_PATH', \
+        config.omp_host_rtl_directory, ":")
+    config.test_cflags += " -Wl,-rpath," + config.library_dir
+    config.test_cflags += " -Wl,-rpath," + config.omp_host_rtl_directory
+else: # Unices
+    append_dynamic_library_path('LD_LIBRARY_PATH', config.library_dir, ":")
+    append_dynamic_library_path('LD_LIBRARY_PATH', \
+        config.omp_host_rtl_directory, ":")
+
+# Substitutions:
+# - for targets that exist in the system, create the actual commands;
+# - for valid targets that are not available on this system, expand to a
+#   harmless echo so that the same test can be used for different targets.
+
+# Scan all the valid targets.
+for libomptarget_target in config.libomptarget_all_targets:
+    # Is this target available in the current system? If so, create compile,
+    # run and check commands. Otherwise create commands that just echo a message.
+ if libomptarget_target in config.libomptarget_system_targets: + config.substitutions.append(("%libomptarget-compilexx-run-and-check-" + \ + libomptarget_target, \ + "%libomptarget-compilexx-and-run-" + libomptarget_target + \ + " | " + config.libomptarget_filecheck + " %s")) + config.substitutions.append(("%libomptarget-compile-run-and-check-" + \ + libomptarget_target, \ + "%libomptarget-compile-and-run-" + libomptarget_target + \ + " | " + config.libomptarget_filecheck + " %s")) + config.substitutions.append(("%libomptarget-compilexx-and-run-" + \ + libomptarget_target, \ + "%clangxx-" + libomptarget_target + " %s -o %t-" + \ + libomptarget_target + " && %t-" + libomptarget_target)) + config.substitutions.append(("%libomptarget-compile-and-run-" + \ + libomptarget_target, \ + "%clang-" + libomptarget_target + " %s -o %t-" + \ + libomptarget_target + " && %t-" + libomptarget_target)) + config.substitutions.append(("%clangxx-" + libomptarget_target, \ + "%clangxx %cflags -omptargets=" + libomptarget_target)) + config.substitutions.append(("%clang-" + libomptarget_target, \ + "%clang %cflags -omptargets=" + libomptarget_target)) + config.substitutions.append(("%fcheck-" + libomptarget_target, \ + config.libomptarget_filecheck + " %s")) + else: + config.substitutions.append(("%libomptarget-compile-run-and-check-" + \ + libomptarget_target, \ + "echo ignored-command")) + config.substitutions.append(("%libomptarget-compilexx-run-and-check-" + \ + libomptarget_target, \ + "echo ignored-command")) + config.substitutions.append(("%libomptarget-compile-and-run-" + \ + libomptarget_target, \ + "echo ignored-command")) + config.substitutions.append(("%libomptarget-compilexx-and-run-" + \ + libomptarget_target, \ + "echo ignored-command")) + config.substitutions.append(("%clang-" + libomptarget_target, \ + "echo ignored-command")) + config.substitutions.append(("%clangxx-" + libomptarget_target, \ + "echo ignored-command")) + config.substitutions.append(("%fcheck-" + libomptarget_target, \ + "echo ignored-command")) + +config.substitutions.append(("%clangxx", config.test_cxx_compiler)) +config.substitutions.append(("%clang", config.test_c_compiler)) +config.substitutions.append(("%openmp_flag", config.test_openmp_flag)) +config.substitutions.append(("%cflags", config.test_cflags)) + Index: libomptarget/test/lit.site.cfg.in =================================================================== --- /dev/null +++ libomptarget/test/lit.site.cfg.in @@ -0,0 +1,20 @@ +@AUTO_GEN_COMMENT@ + +config.test_c_compiler = "@LIBOMPTARGET_TEST_C_COMPILER@" +config.test_cxx_compiler = "@LIBOMPTARGET_TEST_CXX_COMPILER@" +config.test_openmp_flag = "@LIBOMPTARGET_TEST_OPENMP_FLAG@" +# For the moment we still need to pass libomptarget explicitly. Once the driver +# patch, lands, this is not required anymore. +config.test_extra_cflags = "-lomptarget @LIBOMPTARGET_TEST_CFLAGS@" +config.libomptarget_obj_root = "@CMAKE_CURRENT_BINARY_DIR@" +config.library_dir = "@LIBOMPTARGET_LIBRARY_DIR@" +config.omp_header_directory = "@LIBOMPTARGET_OPENMP_HEADER_FOLDER@" +config.omp_host_rtl_directory = "@LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER@" +config.operating_system = "@CMAKE_SYSTEM_NAME@" +config.libomptarget_all_targets = "@LIBOMPTARGET_ALL_TARGETS@".split() +config.libomptarget_system_targets = "@LIBOMPTARGET_SYSTEM_TARGETS@".split() +config.libomptarget_filecheck = "@LIBOMPTARGET_FILECHECK@" + +# Let the main config do the real work. 
+lit_config.load_config(config, "@LIBOMPTARGET_BASE_DIR@/test/lit.cfg")
+
Index: libomptarget/test/offloading/offloading_success.c
===================================================================
--- /dev/null
+++ libomptarget/test/offloading/offloading_success.c
@@ -0,0 +1,21 @@
+// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu
+
+#include <stdio.h>
+#include <omp.h>
+
+int main(void) {
+  int isHost = 0;
+
+  #pragma omp target
+  {
+    isHost = omp_is_initial_device();
+  }
+
+  // The compiler does not yet support launching the target region on the device.
+  // CHECK: Target region executed on the host
+  printf("Target region executed on the %s\n", isHost ? "host" : "device");
+
+  return !isHost;
+}
Index: libomptarget/test/offloading/offloading_success.cpp
===================================================================
--- /dev/null
+++ libomptarget/test/offloading/offloading_success.cpp
@@ -0,0 +1,21 @@
+// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu
+
+#include <stdio.h>
+#include <omp.h>
+
+int main(void) {
+  int isHost = 0;
+
+  #pragma omp target
+  {
+    isHost = omp_is_initial_device();
+  }
+
+  // The compiler does not yet support launching the target region on the device.
+  // CHECK: Target region executed on the host
+  printf("Target region executed on the %s\n", isHost ? "host" : "device");
+
+  return !isHost;
+}
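
The two offloading_success tests only check omp_is_initial_device(). A natural follow-up test, sketched here for illustration and not part of this patch, would exercise the data-mapping path (target_data_begin/target_data_end) through an explicit map(tofrom:) clause. It reuses the RUN-line substitutions defined in lit.cfg; restricting it to x86_64-pc-linux-gnu is an assumption, and the CHECK line assumes the region still falls back to the host, as in the tests above.

    // RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu
    #include <stdio.h>
    #include <omp.h>

    int main(void) {
      int a[64];
      for (int i = 0; i < 64; ++i)
        a[i] = i;

      // map(tofrom:) copies the array to the device before the region and back
      // afterwards, exercising the mapping path described earlier.
      #pragma omp target map(tofrom: a)
      {
        for (int i = 0; i < 64; ++i)
          a[i] += 1;
      }

      int errors = 0;
      for (int i = 0; i < 64; ++i)
        if (a[i] != i + 1)
          ++errors;

      // CHECK: errors = 0
      printf("errors = %d\n", errors);
      return errors;
    }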