diff --git a/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake b/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake --- a/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake +++ b/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake @@ -15,7 +15,7 @@ if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER STREQUAL "") set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${LIBOMPTARGET_NVPTX_CUDA_COMPILER}) elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang") - set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${CMAKE_C_COMPILER}) + set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${CMAKE_CXX_COMPILER}) else() return() endif() diff --git a/openmp/libomptarget/deviceRTLs/common/allocator.h b/openmp/libomptarget/deviceRTLs/common/allocator.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/deviceRTLs/common/allocator.h @@ -0,0 +1,42 @@ +//===--------- allocator.h - OpenMP target memory allocator ------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Macros for allocating variables in different address spaces. +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_ALLOCATOR_H +#define OMPTARGET_ALLOCATOR_H + +// Follows the pattern in interface.h +// Clang sema checks this type carefully, needs to closely match that from omp.h +typedef enum omp_allocator_handle_t { + omp_null_allocator = 0, + omp_default_mem_alloc = 1, + omp_large_cap_mem_alloc = 2, + omp_const_mem_alloc = 3, + omp_high_bw_mem_alloc = 4, + omp_low_lat_mem_alloc = 5, + omp_cgroup_mem_alloc = 6, + omp_pteam_mem_alloc = 7, + omp_thread_mem_alloc = 8, + KMP_ALLOCATOR_MAX_HANDLE = ~(0U) +} omp_allocator_handle_t; + +#define __PRAGMA(STR) _Pragma(#STR) +#define OMP_PRAGMA(STR) __PRAGMA(omp STR) + +#define SHARED(NAME) \ + NAME [[clang::loader_uninitialized]]; \ + OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc)) + +#define EXTERN_SHARED(NAME) \ + NAME; \ + OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc)) + +#endif // OMPTARGET_ALLOCATOR_H diff --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h b/openmp/libomptarget/deviceRTLs/common/omptarget.h --- a/openmp/libomptarget/deviceRTLs/common/omptarget.h +++ b/openmp/libomptarget/deviceRTLs/common/omptarget.h @@ -14,11 +14,12 @@ #ifndef OMPTARGET_H #define OMPTARGET_H -#include "target_impl.h" -#include "common/debug.h" // debug -#include "interface.h" // interfaces with omp, compiler, and user +#include "common/allocator.h" +#include "common/debug.h" // debug #include "common/state-queue.h" #include "common/support.h" +#include "interface.h" // interfaces with omp, compiler, and user +#include "target_impl.h" #define OMPTARGET_NVPTX_VERSION 1.1 @@ -71,8 +72,8 @@ uint32_t nArgs; }; -extern DEVICE SHARED omptarget_nvptx_SharedArgs - omptarget_nvptx_globalArgs; +extern DEVICE + omptarget_nvptx_SharedArgs EXTERN_SHARED(omptarget_nvptx_globalArgs); // Worker slot type which is initialized with the default worker slot // size of 4*32 bytes. @@ -94,7 +95,7 @@ __kmpc_impl_lanemask_t ActiveThreads[DS_Max_Warp_Number]; }; -extern DEVICE SHARED DataSharingStateTy DataSharingState; +extern DEVICE DataSharingStateTy EXTERN_SHARED(DataSharingState); //////////////////////////////////////////////////////////////////////////////// // task ICV and (implicit & explicit) task state @@ -273,9 +274,9 @@ /// Memory manager for statically allocated memory. class omptarget_nvptx_SimpleMemoryManager { private: - ALIGN(128) struct MemDataTy { + struct MemDataTy { volatile unsigned keys[OMP_STATE_COUNT]; - } MemData[MAX_SM]; + } MemData[MAX_SM] ALIGN(128); INLINE static uint32_t hash(unsigned key) { return key & (OMP_STATE_COUNT - 1); @@ -294,18 +295,18 @@ extern DEVICE omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager; -extern DEVICE SHARED uint32_t usedMemIdx; -extern DEVICE SHARED uint32_t usedSlotIdx; -extern DEVICE SHARED uint8_t - parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; -extern DEVICE SHARED uint16_t threadLimit; -extern DEVICE SHARED uint16_t threadsInTeam; -extern DEVICE SHARED uint16_t nThreads; -extern DEVICE SHARED - omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; - -extern DEVICE SHARED uint32_t execution_param; -extern DEVICE SHARED void *ReductionScratchpadPtr; +extern DEVICE uint32_t EXTERN_SHARED(usedMemIdx); +extern DEVICE uint32_t EXTERN_SHARED(usedSlotIdx); +extern DEVICE uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; +#pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc) +extern DEVICE uint16_t EXTERN_SHARED(threadLimit); +extern DEVICE uint16_t EXTERN_SHARED(threadsInTeam); +extern DEVICE uint16_t EXTERN_SHARED(nThreads); +extern DEVICE omptarget_nvptx_ThreadPrivateContext * + EXTERN_SHARED(omptarget_nvptx_threadPrivateContext); + +extern DEVICE uint32_t EXTERN_SHARED(execution_param); +extern DEVICE void *EXTERN_SHARED(ReductionScratchpadPtr); //////////////////////////////////////////////////////////////////////////////// // work function (outlined parallel/simd functions) and arguments. @@ -313,8 +314,8 @@ //////////////////////////////////////////////////////////////////////////////// typedef void *omptarget_nvptx_WorkFn; -extern volatile DEVICE SHARED omptarget_nvptx_WorkFn - omptarget_nvptx_workFn; +extern volatile DEVICE + omptarget_nvptx_WorkFn EXTERN_SHARED(omptarget_nvptx_workFn); //////////////////////////////////////////////////////////////////////////////// // get private data structures diff --git a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu --- a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu @@ -316,10 +316,10 @@ return rc; } -EXTERN int omp_is_initial_device(void) { - PRINT0(LD_IO, "call omp_is_initial_device() returns 0\n"); - return 0; // 0 by def on device -} +// EXTERN int omp_is_initial_device(void) { +// PRINT0(LD_IO, "call omp_is_initial_device() returns 0\n"); +// return 0; // 0 by def on device +// } // Unspecified on the device. EXTERN int omp_get_initial_device(void) { diff --git a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu --- a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu @@ -11,8 +11,9 @@ //===----------------------------------------------------------------------===// #pragma omp declare target -#include "common/omptarget.h" +#include "common/allocator.h" #include "common/device_environment.h" +#include "common/omptarget.h" //////////////////////////////////////////////////////////////////////////////// // global device environment @@ -28,44 +29,44 @@ omptarget_nvptx_Queue omptarget_nvptx_device_State[MAX_SM]; -DEVICE omptarget_nvptx_SimpleMemoryManager - omptarget_nvptx_simpleMemoryManager; -DEVICE SHARED uint32_t usedMemIdx; -DEVICE SHARED uint32_t usedSlotIdx; +DEVICE omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager; +DEVICE uint32_t SHARED(usedMemIdx); +DEVICE uint32_t SHARED(usedSlotIdx); -DEVICE SHARED uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; -DEVICE SHARED uint16_t threadLimit; -DEVICE SHARED uint16_t threadsInTeam; -DEVICE SHARED uint16_t nThreads; +DEVICE uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; +#pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc) +DEVICE uint16_t SHARED(threadLimit); +DEVICE uint16_t SHARED(threadsInTeam); +DEVICE uint16_t SHARED(nThreads); // Pointer to this team's OpenMP state object -DEVICE SHARED - omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; +DEVICE omptarget_nvptx_ThreadPrivateContext * + SHARED(omptarget_nvptx_threadPrivateContext); //////////////////////////////////////////////////////////////////////////////// // The team master sets the outlined parallel function in this variable to // communicate with the workers. Since it is in shared memory, there is one // copy of these variables for each kernel, instance, and team. //////////////////////////////////////////////////////////////////////////////// -volatile DEVICE SHARED omptarget_nvptx_WorkFn omptarget_nvptx_workFn; +volatile DEVICE omptarget_nvptx_WorkFn SHARED(omptarget_nvptx_workFn); //////////////////////////////////////////////////////////////////////////////// // OpenMP kernel execution parameters //////////////////////////////////////////////////////////////////////////////// -DEVICE SHARED uint32_t execution_param; +DEVICE uint32_t SHARED(execution_param); //////////////////////////////////////////////////////////////////////////////// // Data sharing state //////////////////////////////////////////////////////////////////////////////// -DEVICE SHARED DataSharingStateTy DataSharingState; +DEVICE DataSharingStateTy SHARED(DataSharingState); //////////////////////////////////////////////////////////////////////////////// // Scratchpad for teams reduction. //////////////////////////////////////////////////////////////////////////////// -DEVICE SHARED void *ReductionScratchpadPtr; +DEVICE void *SHARED(ReductionScratchpadPtr); //////////////////////////////////////////////////////////////////////////////// // Data sharing related variables. //////////////////////////////////////////////////////////////////////////////// -DEVICE SHARED omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs; +DEVICE omptarget_nvptx_SharedArgs SHARED(omptarget_nvptx_globalArgs); #pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu --- a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu @@ -208,8 +208,8 @@ : /*Master thread only*/ 1; uint32_t TeamId = GetBlockIdInKernel(); uint32_t NumTeams = GetNumberOfBlocksInKernel(); - static SHARED unsigned Bound; - static SHARED unsigned ChunkTeamCount; + static unsigned SHARED(Bound); + static unsigned SHARED(ChunkTeamCount); // Block progress for teams greater than the current upper // limit. We always only allow a number of teams less or equal diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt --- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt +++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt @@ -10,6 +10,19 @@ # ##===----------------------------------------------------------------------===## +# TODO: This part needs to be refined when libomptarget is going to support +# Windows! +if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64") + set(aux_triple x86_64-unknown-linux-gnu) +elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "ppc64le") + set(aux_triple powerpc64le-unknown-linux-gnu) +elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") + set(aux_triple aarch64-unknown-linux-gnu) +else() + libomptarget_say("Not building CUDA offloading device RTL: unknow host arch: ${CMAKE_HOST_SYSTEM_PROCESSOR}") + return() +endif() + get_filename_component(devicertl_base_directory ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) @@ -79,61 +92,83 @@ ) # Set flags for LLVM Bitcode compilation. - set(bc_flags ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS} + set(bc_flags -S -x c++ + -target nvptx64 + -Xclang -emit-llvm-bc + -Xclang -aux-triple -Xclang ${aux_triple} + -fopenmp -Xclang -fopenmp-is-device + -D__CUDACC__ + -fdeclspec -I${devicertl_base_directory} -I${devicertl_nvptx_directory}/src) if(${LIBOMPTARGET_NVPTX_DEBUG}) - set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=-1) + list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=-1) else() - set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=0) + list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=0) endif() # Create target to build all Bitcode libraries. add_custom_target(omptarget-nvptx-bc) - # Generate a Bitcode library for all the compute capabilities the user requested. + # This correlation is from clang/lib/Driver/ToolChains/Cuda.cpp. + # The last element is the default case. + set(cuda_version_list 110 102 101 100 92 91 90 80) + set(ptx_feature_list 70 65 64 63 61 61 60 42) + + # Generate a Bitcode library for all the compute capabilities the user + # requested and all PTX version we know for now. foreach(sm ${nvptx_sm_list}) - set(cuda_arch --cuda-gpu-arch=sm_${sm}) - - # Compile CUDA files to bitcode. - set(bc_files "") - foreach(src ${cuda_src_files}) - get_filename_component(infile ${src} ABSOLUTE) - get_filename_component(outfile ${src} NAME) - - add_custom_command(OUTPUT ${outfile}-sm_${sm}.bc - COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} ${cuda_arch} ${MAX_SM_DEFINITION} - -c ${infile} -o ${outfile}-sm_${sm}.bc - DEPENDS ${infile} - IMPLICIT_DEPENDS CXX ${infile} - COMMENT "Building LLVM bitcode ${outfile}-sm_${sm}.bc" - VERBATIM + set(sm_flags -Xclang -target-cpu -Xclang sm_${sm} "-D__CUDA_ARCH__=${sm}0") + + foreach(cuda_version ptx_num IN ZIP_LISTS cuda_version_list ptx_feature_list) + set(cuda_flags ${sm_flags}) + list(APPEND cuda_flags -Xclang -target-feature -Xclang +ptx${ptx_num}) + list(APPEND cuda_flags "-DCUDA_VERSION=${cuda_version}00") + + set(bc_files "") + foreach(src ${cuda_src_files}) + get_filename_component(infile ${src} ABSOLUTE) + get_filename_component(outfile ${src} NAME) + set(outfile "${outfile}-sm_${sm}-cuda-${cuda_version}.bc") + + add_custom_command(OUTPUT ${outfile} + COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} + ${cuda_flags} ${MAX_SM_DEFINITION} ${infile} -o ${outfile} + DEPENDS ${infile} + IMPLICIT_DEPENDS CXX ${infile} + COMMENT "Building LLVM bitcode ${outfile}" + VERBATIM + ) + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}) + + list(APPEND bc_files ${outfile}) + endforeach() + + set(bclib_name "libomptarget-nvptx-sm_${sm}-cuda-${cuda_version}.bc") + + # Link to a bitcode library. + add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} + COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER} + -o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} ${bc_files} + DEPENDS ${bc_files} + COMMENT "Linking LLVM bitcode ${bclib_name}" ) - set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}-sm_${sm}.bc) + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name}) - list(APPEND bc_files ${outfile}-sm_${sm}.bc) - endforeach() + set(bclib_target_name "omptarget-nvptx-sm_${sm}-cuda-${cuda_version}-bc") + + add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}) + add_dependencies(omptarget-nvptx-bc ${bclib_target_name}) - # Link to a bitcode library. - add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc - COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER} - -o ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc ${bc_files} - DEPENDS ${bc_files} - COMMENT "Linking LLVM bitcode libomptarget-nvptx-sm_${sm}.bc" - ) - set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES libomptarget-nvptx-sm_${sm}.bc) - - add_custom_target(omptarget-nvptx-${sm}-bc ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc) - add_dependencies(omptarget-nvptx-bc omptarget-nvptx-${sm}-bc) - - # Copy library to destination. - add_custom_command(TARGET omptarget-nvptx-${sm}-bc POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc - ${LIBOMPTARGET_LIBRARY_DIR}) - - # Install bitcode library under the lib destination folder. - install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc DESTINATION "${OPENMP_INSTALL_LIBDIR}") + # Copy library to destination. + add_custom_command(TARGET ${bclib_target_name} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} + ${LIBOMPTARGET_LIBRARY_DIR}) + + # Install bitcode library under the lib destination folder. + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}") + endforeach() endforeach() endif() diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h b/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h @@ -11,7 +11,8 @@ #include -#define EXTERN extern "C" __device__ +#define EXTERN extern "C" + typedef uint32_t __kmpc_impl_lanemask_t; typedef uint32_t omp_lock_t; /* arbitrary type of the right length */ diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -13,18 +13,16 @@ #define _TARGET_IMPL_H_ #include -#include #include #include #include #include "nvptx_interface.h" -#define DEVICE __device__ -#define INLINE __forceinline__ DEVICE -#define NOINLINE __noinline__ DEVICE -#define SHARED __shared__ -#define ALIGN(N) __align__(N) +#define DEVICE +#define INLINE inline __attribute__((always_inline)) +#define NOINLINE __attribute__((noinline)) +#define ALIGN(N) __attribute__((aligned(N))) //////////////////////////////////////////////////////////////////////////////// // Kernel options @@ -61,6 +59,7 @@ #elif __CUDA_ARCH__ >= 600 #define MAX_SM 56 #else +#error "Wrong number!" #define MAX_SM 16 #endif #endif diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu @@ -14,8 +14,6 @@ #include "target_impl.h" #include "common/debug.h" -#include - // Forward declaration of CUDA primitives which will be evetually transformed // into LLVM intrinsics. extern "C" {