diff --git a/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake b/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake --- a/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake +++ b/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake @@ -15,7 +15,7 @@ if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER STREQUAL "") set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${LIBOMPTARGET_NVPTX_CUDA_COMPILER}) elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang") - set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${CMAKE_C_COMPILER}) + set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${CMAKE_CXX_COMPILER}) else() return() endif() diff --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h b/openmp/libomptarget/deviceRTLs/common/omptarget.h --- a/openmp/libomptarget/deviceRTLs/common/omptarget.h +++ b/openmp/libomptarget/deviceRTLs/common/omptarget.h @@ -273,9 +273,9 @@ /// Memory manager for statically allocated memory. class omptarget_nvptx_SimpleMemoryManager { private: - ALIGN(128) struct MemDataTy { + struct MemDataTy { volatile unsigned keys[OMP_STATE_COUNT]; - } MemData[MAX_SM]; + } MemData[MAX_SM] ALIGN(128); INLINE static uint32_t hash(unsigned key) { return key & (OMP_STATE_COUNT - 1); diff --git a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu --- a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu @@ -316,10 +316,10 @@ return rc; } -EXTERN int omp_is_initial_device(void) { - PRINT0(LD_IO, "call omp_is_initial_device() returns 0\n"); - return 0; // 0 by def on device -} +// EXTERN int omp_is_initial_device(void) { +// PRINT0(LD_IO, "call omp_is_initial_device() returns 0\n"); +// return 0; // 0 by def on device +// } // Unspecified on the device. EXTERN int omp_get_initial_device(void) { diff --git a/openmp/libomptarget/deviceRTLs/common/target_atomic.h b/openmp/libomptarget/deviceRTLs/common/target_atomic.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/deviceRTLs/common/target_atomic.h @@ -0,0 +1,46 @@ +//===---- target_atomic.h - OpenMP GPU target atomic functions ---- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Declarations of atomic functions provided by each target +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_TARGET_ATOMIC_H +#define OMPTARGET_TARGET_ATOMIC_H + +#include "target_impl.h" + +// We declared these template functions here. Their definition is in +// __clang_cuda_device_functions.h. +template T atomicAdd(T *, T); +template T atomicInc(T *, T); +template T atomicMax(T *, T); +template T atomicExch(T *, T); +template T atomicCAS(T *, T, T); + +template INLINE T __kmpc_atomic_add(T *address, T val) { + return atomicAdd(address, val); +} + +template INLINE T __kmpc_atomic_inc(T *address, T val) { + return atomicInc(address, val); +} + +template INLINE T __kmpc_atomic_max(T *address, T val) { + return atomicMax(address, val); +} + +template INLINE T __kmpc_atomic_exchange(T *address, T val) { + return atomicExch(address, val); +} + +template INLINE T __kmpc_atomic_cas(T *address, T compare, T val) { + return atomicCAS(address, compare, val); +} + +#endif diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt --- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt +++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt @@ -10,6 +10,19 @@ # ##===----------------------------------------------------------------------===## +# TODO: This part needs to be refined when libomptarget is going to support +# Windows! +if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64") + set(aux_triple x86_64-unknown-linux-gnu) +elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "ppc64le") + set(aux_triple powerpc64le-unknown-linux-gnu) +elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") + set(aux_triple aarch64-unknown-linux-gnu) +else() + libomptarget_say("Not building CUDA offloading device RTL: unknow host arch: ${CMAKE_HOST_SYSTEM_PROCESSOR}") + return() +endif() + get_filename_component(devicertl_base_directory ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) @@ -79,61 +92,81 @@ ) # Set flags for LLVM Bitcode compilation. - set(bc_flags ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS} + set(bc_flags -S -x c++ + -target nvptx64 + -Xclang -emit-llvm-bc + -Xclang -aux-triple -Xclang ${aux_triple} + -fopenmp -Xclang -fopenmp-is-device + -D__CUDACC__ + -fdeclspec -I${devicertl_base_directory} - -I${devicertl_nvptx_directory}/src) + -I${devicertl_nvptx_directory}/src + -I${CUDA_TOOLKIT_ROOT_DIR}/include + -include openmp_wrappers/__clang_openmp_device_functions.h) if(${LIBOMPTARGET_NVPTX_DEBUG}) - set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=-1) + list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=-1) else() - set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=0) + list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=0) endif() # Create target to build all Bitcode libraries. add_custom_target(omptarget-nvptx-bc) - # Generate a Bitcode library for all the compute capabilities the user requested. + # That's all PTX versions we know for now + set(nvptx_ptx_list 50 60 61 62 63 64 65 70 71) + + # Generate a Bitcode library for all the compute capabilities the user + # requested and all PTX version we know for now. foreach(sm ${nvptx_sm_list}) - set(cuda_arch --cuda-gpu-arch=sm_${sm}) - - # Compile CUDA files to bitcode. - set(bc_files "") - foreach(src ${cuda_src_files}) - get_filename_component(infile ${src} ABSOLUTE) - get_filename_component(outfile ${src} NAME) - - add_custom_command(OUTPUT ${outfile}-sm_${sm}.bc - COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} ${cuda_arch} ${MAX_SM_DEFINITION} - -c ${infile} -o ${outfile}-sm_${sm}.bc - DEPENDS ${infile} - IMPLICIT_DEPENDS CXX ${infile} - COMMENT "Building LLVM bitcode ${outfile}-sm_${sm}.bc" - VERBATIM + set(cuda_arch -Xclang -target-cpu -Xclang sm_${sm}) + + foreach(ptx_num ${nvptx_ptx_list}) + set(ptx_version -Xclang -target-feature -Xclang +ptx${ptx_num}) + + set(bc_files "") + foreach(src ${cuda_src_files}) + get_filename_component(infile ${src} ABSOLUTE) + get_filename_component(outfile ${src} NAME) + set(outfile "${outfile}-sm_${sm}-ptx${ptx_num}.bc") + + add_custom_command(OUTPUT ${outfile} + COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} + ${cuda_arch} ${MAX_SM_DEFINITION} ${infile} -o ${outfile} + DEPENDS ${infile} + IMPLICIT_DEPENDS CXX ${infile} + COMMENT "Building LLVM bitcode ${outfile}" + VERBATIM + ) + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}) + + list(APPEND bc_files ${outfile}) + endforeach() + + set(bclib_name "libomptarget-nvptx-sm_${sm}-ptx${ptx_num}.bc") + + # Link to a bitcode library. + add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} + COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER} + -o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} ${bc_files} + DEPENDS ${bc_files} + COMMENT "Linking LLVM bitcode ${bclib_name}" ) - set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}-sm_${sm}.bc) + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name}) - list(APPEND bc_files ${outfile}-sm_${sm}.bc) - endforeach() + set(bclib_target_name "omptarget-nvptx-sm_${sm}-ptx${ptx_num}-bc") + + add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}) + add_dependencies(omptarget-nvptx-bc ${bclib_target_name}) - # Link to a bitcode library. - add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc - COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER} - -o ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc ${bc_files} - DEPENDS ${bc_files} - COMMENT "Linking LLVM bitcode libomptarget-nvptx-sm_${sm}.bc" - ) - set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES libomptarget-nvptx-sm_${sm}.bc) - - add_custom_target(omptarget-nvptx-${sm}-bc ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc) - add_dependencies(omptarget-nvptx-bc omptarget-nvptx-${sm}-bc) - - # Copy library to destination. - add_custom_command(TARGET omptarget-nvptx-${sm}-bc POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc - ${LIBOMPTARGET_LIBRARY_DIR}) - - # Install bitcode library under the lib destination folder. - install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc DESTINATION "${OPENMP_INSTALL_LIBDIR}") + # Copy library to destination. + add_custom_command(TARGET ${bclib_target_name} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} + ${LIBOMPTARGET_LIBRARY_DIR}) + + # Install bitcode library under the lib destination folder. + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}") + endforeach() endforeach() endif() diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h b/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h @@ -11,7 +11,8 @@ #include -#define EXTERN extern "C" __device__ +#define EXTERN extern "C" + typedef uint32_t __kmpc_impl_lanemask_t; typedef uint32_t omp_lock_t; /* arbitrary type of the right length */ diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -20,11 +20,12 @@ #include "nvptx_interface.h" -#define DEVICE __device__ -#define INLINE __forceinline__ DEVICE -#define NOINLINE __noinline__ DEVICE -#define SHARED __shared__ -#define ALIGN(N) __align__(N) +#define DEVICE +#define INLINE inline __attribute__((always_inline)) +#define NOINLINE __attribute__((noinline)) +// FIXME: This attribute doesn't work here +#define SHARED __attribute__((shared)) +#define ALIGN(N) __attribute__((aligned(N))) //////////////////////////////////////////////////////////////////////////////// // Kernel options @@ -67,6 +68,15 @@ #define OMP_ACTIVE_PARALLEL_LEVEL 128 +// FIXME: Forward declaration +extern "C" { +uint32_t __ffs(uint32_t); +uint32_t __popc(uint32_t); +// FIXME: This function is defined in but due to unknown reason we +// cannot use #include directly. +uint32_t min(uint32_t, uint32_t); +} + // Data sharing related quantities, need to match what is used in the compiler. enum DATA_SHARING_SIZES { // The maximum number of workers in a kernel. @@ -131,24 +141,54 @@ DEVICE unsigned GetLaneId(); // Atomics -template INLINE T __kmpc_atomic_add(T *address, T val) { - return atomicAdd(address, val); +template INLINE T __kmpc_atomic_add(T *Address, T Val) { + T Old; +#pragma omp atomic capture + { + Old = *Address; + *Address += Val; + } + return Old; } -template INLINE T __kmpc_atomic_inc(T *address, T val) { - return atomicInc(address, val); +template INLINE T __kmpc_atomic_inc(T *Address, T Val) { + T Old; +#pragma omp atomic capture + { + Old = *Address; + *Address += Old >= Val ? 0 : 1; + } + return Old; } -template INLINE T __kmpc_atomic_max(T *address, T val) { - return atomicMax(address, val); +template INLINE T __kmpc_atomic_max(T *Address, T Val) { + T Old; +#pragma omp atomic capture + { + Old = *Address; + *Address = (Old > Val ? Old : Val); + } + return Old; } -template INLINE T __kmpc_atomic_exchange(T *address, T val) { - return atomicExch(address, val); +template INLINE T __kmpc_atomic_exchange(T *Address, T Val) { + T Old; +#pragma omp atomic capture + { + Old = *Address; + *Address = Val; + } + return Old; } -template INLINE T __kmpc_atomic_cas(T *address, T compare, T val) { - return atomicCAS(address, compare, val); +template INLINE T __kmpc_atomic_cas(T *Address, T Compare, T Val) { + T Old; +#pragma omp atomic capture + { + Old = *Address; + *Address = (Old == Compare ? Val : Old); + } + return Old; } // Locks