diff --git a/openmp/libomptarget/deviceRTLs/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/CMakeLists.txt --- a/openmp/libomptarget/deviceRTLs/CMakeLists.txt +++ b/openmp/libomptarget/deviceRTLs/CMakeLists.txt @@ -6,8 +6,9 @@ # # ##===----------------------------------------------------------------------===## # -# Build a device RTL for each available machine available. +# Build a device RTL for each available machine. # ##===----------------------------------------------------------------------===## +add_subdirectory(amdgcn) add_subdirectory(nvptx) diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt @@ -0,0 +1,136 @@ +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build the AMDGCN Device RTL if the ROCM tools are available +# +##===----------------------------------------------------------------------===## + +find_package(LLVM QUIET CONFIG + PATHS + $ENV{AOMP} + $ENV{HOME}/rocm/aomp + /opt/rocm/aomp + /usr/lib/rocm/aomp + ${LIBOMPTARGET_NVPTX_CUDA_COMPILER_DIR} + ${LIBOMPTARGET_NVPTX_CUDA_LINKER_DIR} + ${CMAKE_CXX_COMPILER_DIR} + NO_DEFAULT_PATH) + +if (LLVM_DIR) + libomptarget_say("Found LLVM ${LLVM_PACKAGE_VERSION}. Configure: ${LLVM_DIR}/LLVMConfig.cmake") +else() + libomptarget_say("Not building AMDGCN device RTL: AOMP not found") + return() +endif() + +set(AOMP_INSTALL_PREFIX ${LLVM_INSTALL_PREFIX}) + +if (AOMP_INSTALL_PREFIX) + set(AOMP_BINDIR ${AOMP_INSTALL_PREFIX}/bin) +else() + set(AOMP_BINDIR ${LLVM_BUILD_BINARY_DIR}/bin) +endif() + +libomptarget_say("Building AMDGCN device RTL. LLVM_COMPILER_PATH=${AOMP_BINDIR}") + +project(omptarget-amdgcn) + +add_custom_target(omptarget-amdgcn ALL) + +#optimization level +set(optimization_level 2) + +# Activate RTL message dumps if requested by the user. +if(LIBOMPTARGET_NVPTX_DEBUG) + set(CUDA_DEBUG -DOMPTARGET_NVPTX_DEBUG=-1) +endif() + +get_filename_component(devicertl_base_directory + ${CMAKE_CURRENT_SOURCE_DIR} + DIRECTORY) + +set(cuda_sources + ${devicertl_base_directory}/common/src/cancel.cu + ${devicertl_base_directory}/common/src/critical.cu) + +set(h_files + ${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_interface.h + ${CMAKE_CURRENT_SOURCE_DIR}/src/target_impl.h + ${devicertl_base_directory}/common/debug.h + ${devicertl_base_directory}/common/device_environment.h + ${devicertl_base_directory}/common/state-queue.h + ${devicertl_base_directory}/common/state-queuei.h + ${devicertl_base_directory}/common/support.h) + +# for both in-tree and out-of-tree build +if (NOT CMAKE_ARCHIVE_OUTPUT_DIRECTORY) + set(OUTPUTDIR ${CMAKE_CURRENT_BINARY_DIR}) +else() + set(OUTPUTDIR ${CMAKE_ARCHIVE_OUTPUT_DIRECTORY}) +endif() + +# create libraries +set(mcpus gfx700 gfx701 gfx801 gfx803 gfx900) +if (DEFINED LIBOMPTARGET_AMDGCN_GFXLIST) + set(mcpus ${LIBOMPTARGET_AMDGCN_GFXLIST}) +endif() + +macro(add_cuda_bc_library) + set(cu_cmd ${AOMP_BINDIR}/clang++ + -std=c++11 + -fcuda-rdc + -fvisibility=default + --cuda-device-only + -Wno-unused-value + -x hip + -O${optimization_level} + --cuda-gpu-arch=${mcpu} + ${CUDA_DEBUG} + -I${CMAKE_CURRENT_SOURCE_DIR}/src + -I${devicertl_base_directory}) + + set(bc1_files) + + foreach(file ${ARGN}) + get_filename_component(fname ${file} NAME_WE) + set(bc1_filename ${fname}.${mcpu}.bc) + + add_custom_command( + OUTPUT ${bc1_filename} + COMMAND ${cu_cmd} ${file} -o ${bc1_filename} + DEPENDS ${file} ${h_files}) + + list(APPEND bc1_files ${bc1_filename}) + endforeach() + + add_custom_command( + OUTPUT linkout.cuda.${mcpu}.bc + COMMAND ${AOMP_BINDIR}/llvm-link ${bc1_files} -o linkout.cuda.${mcpu}.bc + DEPENDS ${bc1_files}) + + list(APPEND bc_files linkout.cuda.${mcpu}.bc) +endmacro() + +set(libname "omptarget-amdgcn") + +foreach(mcpu ${mcpus}) + set(bc_files) + add_cuda_bc_library(${cuda_sources}) + + set(bc_libname lib${libname}-${mcpu}.bc) + add_custom_command( + OUTPUT ${bc_libname} + COMMAND ${AOMP_BINDIR}/llvm-link ${bc_files} | ${AOMP_BINDIR}/opt --always-inline -o ${OUTPUTDIR}/${bc_libname} + DEPENDS ${bc_files}) + + add_custom_target(lib${libname}-${mcpu} ALL DEPENDS ${bc_libname}) + + install(FILES ${OUTPUTDIR}/${bc_libname} + DESTINATION "${OPENMP_INSTALL_LIBDIR}/libdevice" + ) +endforeach() diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h @@ -72,8 +72,6 @@ // thread's lane number in the warp EXTERN uint64_t __lanemask_gt(); -EXTERN void llvm_amdgcn_s_barrier(); - // CU id EXTERN unsigned __smid(); @@ -101,25 +99,21 @@ return __smid(); } -INLINE uint64_t __kmpc_impl_ffs(uint64_t x) { return __ffsll(x); } +INLINE uint64_t __kmpc_impl_ffs(uint64_t x) { return __builtin_ffsl(x); } -INLINE uint64_t __kmpc_impl_popc(uint64_t x) { return __popcll(x); } +INLINE uint64_t __kmpc_impl_popc(uint64_t x) { return __builtin_popcountl(x); } INLINE __kmpc_impl_lanemask_t __kmpc_impl_activemask() { return __ballot64(1); } -INLINE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t Var, - int32_t SrcLane) { - return __shfl(Var, SrcLane, WARPSIZE); -} +EXTERN int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t Var, + int32_t SrcLane); -INLINE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t Var, - uint32_t Delta, int32_t Width) { - return __shfl_down(Var, Delta, Width); -} +EXTERN int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t Var, + uint32_t Delta, int32_t Width); -INLINE void __kmpc_impl_syncthreads() { llvm_amdgcn_s_barrier(); } +INLINE void __kmpc_impl_syncthreads() { __builtin_amdgcn_s_barrier(); } INLINE void __kmpc_impl_named_sync(int barrier, uint32_t num_threads) { // we have protected the master warp from releasing from its barrier @@ -128,4 +122,15 @@ __builtin_amdgcn_s_barrier(); } +// DEVICE versions of part of libc +extern "C" { +DEVICE __attribute__((noreturn)) void +__assertfail(const char *, const char *, unsigned, const char *, size_t); +INLINE static void __assert_fail(const char *__message, const char *__file, + unsigned int __line, const char *__function) { + __assertfail(__message, __file, __line, __function, sizeof(char)); +} +DEVICE int printf(const char *, ...); +} + #endif diff --git a/openmp/libomptarget/deviceRTLs/common/debug.h b/openmp/libomptarget/deviceRTLs/common/debug.h --- a/openmp/libomptarget/deviceRTLs/common/debug.h +++ b/openmp/libomptarget/deviceRTLs/common/debug.h @@ -28,7 +28,7 @@ #ifndef _OMPTARGET_NVPTX_DEBUG_H_ #define _OMPTARGET_NVPTX_DEBUG_H_ -#include "device_environment.h" +#include "common/device_environment.h" //////////////////////////////////////////////////////////////////////////////// // set desired level of debugging @@ -128,7 +128,7 @@ #if OMPTARGET_NVPTX_DEBUG || OMPTARGET_NVPTX_TEST || OMPTARGET_NVPTX_WARNING #include -#include "support.h" +#include "common/support.h" template NOINLINE static void log(const char *fmt, Arguments... parameters) { diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/device_environment.h b/openmp/libomptarget/deviceRTLs/common/device_environment.h rename from openmp/libomptarget/deviceRTLs/nvptx/src/device_environment.h rename to openmp/libomptarget/deviceRTLs/common/device_environment.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/device_environment.h +++ b/openmp/libomptarget/deviceRTLs/common/device_environment.h @@ -19,6 +19,6 @@ int32_t debug_level; }; -extern __device__ omptarget_device_environmentTy omptarget_device_environment; +extern DEVICE omptarget_device_environmentTy omptarget_device_environment; #endif diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/support.h b/openmp/libomptarget/deviceRTLs/common/support.h rename from openmp/libomptarget/deviceRTLs/nvptx/src/support.h rename to openmp/libomptarget/deviceRTLs/common/support.h diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu @@ -11,7 +11,7 @@ //===----------------------------------------------------------------------===// #include "omptarget-nvptx.h" -#include "device_environment.h" +#include "common/device_environment.h" //////////////////////////////////////////////////////////////////////////////// // global device environment diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -24,7 +24,7 @@ #include "common/debug.h" // debug #include "interface.h" // interfaces with omp, compiler, and user #include "common/state-queue.h" -#include "support.h" +#include "common/support.h" #define OMPTARGET_NVPTX_VERSION 1.1 diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/support.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/support.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/support.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/support.cu @@ -10,7 +10,7 @@ // //===----------------------------------------------------------------------===// -#include "support.h" +#include "common/support.h" #include "common/debug.h" #include "omptarget-nvptx.h"