diff --git a/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake b/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake --- a/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake +++ b/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake @@ -15,7 +15,7 @@ if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER STREQUAL "") set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${LIBOMPTARGET_NVPTX_CUDA_COMPILER}) elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang") - set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${CMAKE_C_COMPILER}) + set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${CMAKE_CXX_COMPILER}) else() return() endif() diff --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h b/openmp/libomptarget/deviceRTLs/common/omptarget.h --- a/openmp/libomptarget/deviceRTLs/common/omptarget.h +++ b/openmp/libomptarget/deviceRTLs/common/omptarget.h @@ -71,8 +71,7 @@ uint32_t nArgs; }; -extern DEVICE SHARED omptarget_nvptx_SharedArgs - omptarget_nvptx_globalArgs; +extern DEVICE omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs; // Worker slot type which is initialized with the default worker slot // size of 4*32 bytes. @@ -94,7 +93,7 @@ __kmpc_impl_lanemask_t ActiveThreads[DS_Max_Warp_Number]; }; -extern DEVICE SHARED DataSharingStateTy DataSharingState; +extern DEVICE DataSharingStateTy DataSharingState; //////////////////////////////////////////////////////////////////////////////// // task ICV and (implicit & explicit) task state @@ -273,9 +272,9 @@ /// Memory manager for statically allocated memory. class omptarget_nvptx_SimpleMemoryManager { private: - ALIGN(128) struct MemDataTy { + struct MemDataTy { volatile unsigned keys[OMP_STATE_COUNT]; - } MemData[MAX_SM]; + } MemData[MAX_SM] ALIGN(128); INLINE static uint32_t hash(unsigned key) { return key & (OMP_STATE_COUNT - 1); @@ -294,18 +293,17 @@ extern DEVICE omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager; -extern DEVICE SHARED uint32_t usedMemIdx; -extern DEVICE SHARED uint32_t usedSlotIdx; -extern DEVICE SHARED uint8_t - parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; -extern DEVICE SHARED uint16_t threadLimit; -extern DEVICE SHARED uint16_t threadsInTeam; -extern DEVICE SHARED uint16_t nThreads; -extern DEVICE SHARED - omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; - -extern DEVICE SHARED uint32_t execution_param; -extern DEVICE SHARED void *ReductionScratchpadPtr; +extern DEVICE uint32_t usedMemIdx; +extern DEVICE uint32_t usedSlotIdx; +extern DEVICE uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; +extern DEVICE uint16_t threadLimit; +extern DEVICE uint16_t threadsInTeam; +extern DEVICE uint16_t nThreads; +extern DEVICE omptarget_nvptx_ThreadPrivateContext + *omptarget_nvptx_threadPrivateContext; + +extern DEVICE uint32_t execution_param; +extern DEVICE void *ReductionScratchpadPtr; //////////////////////////////////////////////////////////////////////////////// // work function (outlined parallel/simd functions) and arguments. @@ -313,8 +311,7 @@ //////////////////////////////////////////////////////////////////////////////// typedef void *omptarget_nvptx_WorkFn; -extern volatile DEVICE SHARED omptarget_nvptx_WorkFn - omptarget_nvptx_workFn; +extern volatile DEVICE omptarget_nvptx_WorkFn omptarget_nvptx_workFn; //////////////////////////////////////////////////////////////////////////////// // get private data structures diff --git a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu --- a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu @@ -316,10 +316,10 @@ return rc; } -EXTERN int omp_is_initial_device(void) { - PRINT0(LD_IO, "call omp_is_initial_device() returns 0\n"); - return 0; // 0 by def on device -} +// EXTERN int omp_is_initial_device(void) { +// PRINT0(LD_IO, "call omp_is_initial_device() returns 0\n"); +// return 0; // 0 by def on device +// } // Unspecified on the device. EXTERN int omp_get_initial_device(void) { diff --git a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu --- a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu @@ -11,8 +11,9 @@ //===----------------------------------------------------------------------===// #pragma omp declare target -#include "common/omptarget.h" +#include "common/allocator.h" #include "common/device_environment.h" +#include "common/omptarget.h" //////////////////////////////////////////////////////////////////////////////// // global device environment @@ -28,44 +29,44 @@ omptarget_nvptx_Queue omptarget_nvptx_device_State[MAX_SM]; -DEVICE omptarget_nvptx_SimpleMemoryManager - omptarget_nvptx_simpleMemoryManager; -DEVICE SHARED uint32_t usedMemIdx; -DEVICE SHARED uint32_t usedSlotIdx; +DEVICE omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager; +DEVICE uint32_t SHARED(usedMemIdx); +DEVICE uint32_t SHARED(usedSlotIdx); -DEVICE SHARED uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; -DEVICE SHARED uint16_t threadLimit; -DEVICE SHARED uint16_t threadsInTeam; -DEVICE SHARED uint16_t nThreads; +DEVICE uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; +#pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc) +DEVICE uint16_t SHARED(threadLimit); +DEVICE uint16_t SHARED(threadsInTeam); +DEVICE uint16_t SHARED(nThreads); // Pointer to this team's OpenMP state object -DEVICE SHARED - omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; +DEVICE omptarget_nvptx_ThreadPrivateContext * + SHARED(omptarget_nvptx_threadPrivateContext); //////////////////////////////////////////////////////////////////////////////// // The team master sets the outlined parallel function in this variable to // communicate with the workers. Since it is in shared memory, there is one // copy of these variables for each kernel, instance, and team. //////////////////////////////////////////////////////////////////////////////// -volatile DEVICE SHARED omptarget_nvptx_WorkFn omptarget_nvptx_workFn; +volatile DEVICE omptarget_nvptx_WorkFn SHARED(omptarget_nvptx_workFn); //////////////////////////////////////////////////////////////////////////////// // OpenMP kernel execution parameters //////////////////////////////////////////////////////////////////////////////// -DEVICE SHARED uint32_t execution_param; +DEVICE uint32_t SHARED(execution_param); //////////////////////////////////////////////////////////////////////////////// // Data sharing state //////////////////////////////////////////////////////////////////////////////// -DEVICE SHARED DataSharingStateTy DataSharingState; +DEVICE DataSharingStateTy SHARED(DataSharingState); //////////////////////////////////////////////////////////////////////////////// // Scratchpad for teams reduction. //////////////////////////////////////////////////////////////////////////////// -DEVICE SHARED void *ReductionScratchpadPtr; +DEVICE void *SHARED(ReductionScratchpadPtr); //////////////////////////////////////////////////////////////////////////////// // Data sharing related variables. //////////////////////////////////////////////////////////////////////////////// -DEVICE SHARED omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs; +DEVICE omptarget_nvptx_SharedArgs SHARED(omptarget_nvptx_globalArgs); #pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt --- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt +++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt @@ -10,6 +10,19 @@ # ##===----------------------------------------------------------------------===## +# TODO: This part needs to be refined when libomptarget is going to support +# Windows! +if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64") + set(aux_triple x86_64-unknown-linux-gnu) +elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "ppc64le") + set(aux_triple powerpc64le-unknown-linux-gnu) +elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") + set(aux_triple aarch64-unknown-linux-gnu) +else() + libomptarget_say("Not building CUDA offloading device RTL: unknow host arch: ${CMAKE_HOST_SYSTEM_PROCESSOR}") + return() +endif() + get_filename_component(devicertl_base_directory ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) @@ -79,61 +92,81 @@ ) # Set flags for LLVM Bitcode compilation. - set(bc_flags ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS} + set(bc_flags -S -x c++ + -target nvptx64 + -Xclang -emit-llvm-bc + -Xclang -aux-triple -Xclang ${aux_triple} + -fopenmp -Xclang -fopenmp-is-device + -D__CUDACC__ + -fdeclspec -I${devicertl_base_directory} - -I${devicertl_nvptx_directory}/src) + -I${devicertl_nvptx_directory}/src + -I${CUDA_TOOLKIT_ROOT_DIR}/include + -include openmp_wrappers/__clang_openmp_device_functions.h) if(${LIBOMPTARGET_NVPTX_DEBUG}) - set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=-1) + list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=-1) else() - set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=0) + list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=0) endif() # Create target to build all Bitcode libraries. add_custom_target(omptarget-nvptx-bc) - # Generate a Bitcode library for all the compute capabilities the user requested. + # That's all PTX versions we know for now + set(nvptx_ptx_list 50 60 61 62 63 64 65 70 71) + + # Generate a Bitcode library for all the compute capabilities the user + # requested and all PTX version we know for now. foreach(sm ${nvptx_sm_list}) - set(cuda_arch --cuda-gpu-arch=sm_${sm}) - - # Compile CUDA files to bitcode. - set(bc_files "") - foreach(src ${cuda_src_files}) - get_filename_component(infile ${src} ABSOLUTE) - get_filename_component(outfile ${src} NAME) - - add_custom_command(OUTPUT ${outfile}-sm_${sm}.bc - COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} ${cuda_arch} ${MAX_SM_DEFINITION} - -c ${infile} -o ${outfile}-sm_${sm}.bc - DEPENDS ${infile} - IMPLICIT_DEPENDS CXX ${infile} - COMMENT "Building LLVM bitcode ${outfile}-sm_${sm}.bc" - VERBATIM + set(cuda_arch -Xclang -target-cpu -Xclang sm_${sm}) + + foreach(ptx_num ${nvptx_ptx_list}) + set(ptx_version -Xclang -target-feature -Xclang +ptx${ptx_num}) + + set(bc_files "") + foreach(src ${cuda_src_files}) + get_filename_component(infile ${src} ABSOLUTE) + get_filename_component(outfile ${src} NAME) + set(outfile "${outfile}-sm_${sm}-ptx${ptx_num}.bc") + + add_custom_command(OUTPUT ${outfile} + COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} + ${cuda_arch} ${MAX_SM_DEFINITION} ${infile} -o ${outfile} + DEPENDS ${infile} + IMPLICIT_DEPENDS CXX ${infile} + COMMENT "Building LLVM bitcode ${outfile}" + VERBATIM + ) + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}) + + list(APPEND bc_files ${outfile}) + endforeach() + + set(bclib_name "libomptarget-nvptx-sm_${sm}-ptx${ptx_num}.bc") + + # Link to a bitcode library. + add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} + COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER} + -o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} ${bc_files} + DEPENDS ${bc_files} + COMMENT "Linking LLVM bitcode ${bclib_name}" ) - set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}-sm_${sm}.bc) + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name}) - list(APPEND bc_files ${outfile}-sm_${sm}.bc) - endforeach() + set(bclib_target_name "omptarget-nvptx-sm_${sm}-ptx${ptx_num}-bc") + + add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}) + add_dependencies(omptarget-nvptx-bc ${bclib_target_name}) - # Link to a bitcode library. - add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc - COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER} - -o ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc ${bc_files} - DEPENDS ${bc_files} - COMMENT "Linking LLVM bitcode libomptarget-nvptx-sm_${sm}.bc" - ) - set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES libomptarget-nvptx-sm_${sm}.bc) - - add_custom_target(omptarget-nvptx-${sm}-bc ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc) - add_dependencies(omptarget-nvptx-bc omptarget-nvptx-${sm}-bc) - - # Copy library to destination. - add_custom_command(TARGET omptarget-nvptx-${sm}-bc POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc - ${LIBOMPTARGET_LIBRARY_DIR}) - - # Install bitcode library under the lib destination folder. - install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc DESTINATION "${OPENMP_INSTALL_LIBDIR}") + # Copy library to destination. + add_custom_command(TARGET ${bclib_target_name} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} + ${LIBOMPTARGET_LIBRARY_DIR}) + + # Install bitcode library under the lib destination folder. + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}") + endforeach() endforeach() endif() diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h b/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h @@ -11,7 +11,8 @@ #include -#define EXTERN extern "C" __device__ +#define EXTERN extern "C" + typedef uint32_t __kmpc_impl_lanemask_t; typedef uint32_t omp_lock_t; /* arbitrary type of the right length */ diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -20,11 +20,10 @@ #include "nvptx_interface.h" -#define DEVICE __device__ -#define INLINE __forceinline__ DEVICE -#define NOINLINE __noinline__ DEVICE -#define SHARED __shared__ -#define ALIGN(N) __align__(N) +#define DEVICE +#define INLINE inline __attribute__((always_inline)) +#define NOINLINE __attribute__((noinline)) +#define ALIGN(N) __attribute__((aligned(N))) //////////////////////////////////////////////////////////////////////////////// // Kernel options @@ -67,6 +66,15 @@ #define OMP_ACTIVE_PARALLEL_LEVEL 128 +// FIXME: Forward declaration +extern "C" { +uint32_t __ffs(uint32_t); +uint32_t __popc(uint32_t); +// FIXME: This function is defined in but due to unknown reason we +// cannot use #include directly. +uint32_t min(uint32_t, uint32_t); +} + // Data sharing related quantities, need to match what is used in the compiler. enum DATA_SHARING_SIZES { // The maximum number of workers in a kernel. @@ -130,6 +138,15 @@ DEVICE unsigned GetWarpId(); DEVICE unsigned GetLaneId(); +// Forward declaration of atomics. Although they're template functions, we +// already have definitions for different types in CUDA internal headers with +// the right mangled names. +template DEVICE T atomicAdd(T *address, T val); +template DEVICE T atomicInc(T *address, T val); +template DEVICE T atomicMax(T *address, T val); +template DEVICE T atomicExch(T *address, T val); +template DEVICE T atomicCAS(T *address, T compare, T val); + // Atomics template INLINE T __kmpc_atomic_add(T *address, T val) { return atomicAdd(address, val);