diff --git a/openmp/libomptarget/DeviceRTL/CMakeLists.txt b/openmp/libomptarget/DeviceRTL/CMakeLists.txt --- a/openmp/libomptarget/DeviceRTL/CMakeLists.txt +++ b/openmp/libomptarget/DeviceRTL/CMakeLists.txt @@ -21,6 +21,12 @@ return() endif() +if (NOT LIBOMPTARGET_LLVM_INCLUDE_DIRS) + libomptarget_say("Not building device RTL: Missing definition for LIBOMPTARGET_LLVM_INCLUDE_DIRS") + return() +endif() + + # Check if we can create an LLVM bitcode implementation of the runtime library # that could be inlined in the user application. For that we need to find # a Clang compiler capable of compiling our CUDA files to LLVM bitcode and @@ -132,6 +138,10 @@ set(clang_opt_flags -O1 -mllvm -openmp-opt-disable -DSHARED_SCRATCHPAD_SIZE=2048) set(link_opt_flags -O1 -openmp-opt-disable) +# Prepend -I to each list element +set (LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL "${LIBOMPTARGET_LLVM_INCLUDE_DIRS}") +list(TRANSFORM LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL PREPEND "-I") + # Set flags for LLVM Bitcode compilation. set(bc_flags -S -x c++ -std=c++17 ${clang_opt_flags} @@ -141,6 +151,7 @@ -fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device -Xclang -target-feature -Xclang +ptx61 -I${include_directory} + ${LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL} ) if(${LIBOMPTARGET_DEVICE_DEBUG}) diff --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp --- a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp @@ -16,6 +16,8 @@ #pragma omp declare target +#include "llvm/Frontend/OpenMP/OMPGridValues.h" + using namespace _OMP; namespace _OMP { @@ -26,6 +28,10 @@ ///{ #pragma omp begin declare variant match(device = {arch(amdgcn)}) +constexpr const llvm::omp::GV &getGridValue() { + return llvm::omp::AMDGPUGridValues; +} + uint32_t getGridDim(uint32_t n, uint16_t d) { uint32_t q = n / d; return q + (n > q * d); @@ -86,8 +92,6 @@ return mapping::getThreadIdInBlock() / mapping::getWarpSize(); } -uint32_t getWarpSize() { return 64; } - uint32_t getNumberOfWarpsInBlock() { return mapping::getBlockSize() / mapping::getWarpSize(); } @@ -101,6 +105,10 @@ #pragma omp begin declare variant match( \ device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)}) +constexpr const llvm::omp::GV &getGridValue() { + return llvm::omp::NVPTXGridValues; +} + LaneMaskTy activemask() { unsigned int Mask; asm("activemask.b32 %0;" : "=r"(Mask)); @@ -144,8 +152,6 @@ return mapping::getThreadIdInBlock() / mapping::getWarpSize(); } -uint32_t getWarpSize() { return 32; } - uint32_t getNumberOfWarpsInBlock() { return (mapping::getBlockSize() + mapping::getWarpSize() - 1) / mapping::getWarpSize(); @@ -154,6 +160,8 @@ #pragma omp end declare variant ///} +uint32_t getWarpSize() { return getGridValue().GV_Warp_Size; } + } // namespace impl } // namespace _OMP diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt --- a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt +++ b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt @@ -18,6 +18,12 @@ return() endif() +if (NOT LIBOMPTARGET_LLVM_INCLUDE_DIRS) + libomptarget_say("Not building AMDGCN device RTL: Missing definition for LIBOMPTARGET_LLVM_INCLUDE_DIRS") + return() +endif() + + # Copied from nvptx CMakeLists if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64") set(aux_triple x86_64-unknown-linux-gnu) @@ -103,6 +109,10 @@ set(mcpus ${LIBOMPTARGET_AMDGCN_GFXLIST}) endif() +# Prepend -I to each list element +set (LIBOMPTARGET_LLVM_INCLUDE_DIRS_AMDGCN "${LIBOMPTARGET_LLVM_INCLUDE_DIRS}") +list(TRANSFORM LIBOMPTARGET_LLVM_INCLUDE_DIRS_AMDGCN PREPEND "-I") + macro(add_cuda_bc_library) set(cu_cmd ${CLANG_TOOL} -xc++ @@ -123,7 +133,8 @@ ${CUDA_DEBUG} -I${CMAKE_CURRENT_SOURCE_DIR}/src -I${devicertl_base_directory}/common/include - -I${devicertl_base_directory}) + -I${devicertl_base_directory} + ${LIBOMPTARGET_LLVM_INCLUDE_DIRS_AMDGCN}) set(bc1_files) diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h @@ -31,6 +31,12 @@ #define NOINLINE __attribute__((noinline)) #define ALIGN(N) __attribute__((aligned(N))) +#include "llvm/Frontend/OpenMP/OMPGridValues.h" + +INLINE constexpr const llvm::omp::GV &getGridValue() { + return llvm::omp::AMDGPUGridValues; +} + //////////////////////////////////////////////////////////////////////////////// // Kernel options //////////////////////////////////////////////////////////////////////////////// @@ -38,9 +44,8 @@ //////////////////////////////////////////////////////////////////////////////// // The following def must match the absolute limit hardwired in the host RTL // max number of threads per team -#define MAX_THREADS_PER_TEAM 1024 - -#define WARPSIZE 64 +enum { MAX_THREADS_PER_TEAM = getGridValue().GV_Max_WG_Size }; +enum { WARPSIZE = getGridValue().GV_Warp_Size }; // Maximum number of omp state objects per SM allocated statically in global // memory. @@ -52,11 +57,11 @@ // Data sharing related quantities, need to match what is used in the compiler. enum DATA_SHARING_SIZES { // The size reserved for data in a shared memory slot. - DS_Slot_Size = 256, + DS_Slot_Size = getGridValue().GV_Slot_Size, // The slot size that should be reserved for a working warp. - DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size, + DS_Worker_Warp_Slot_Size = getGridValue().warpSlotSize(), // The maximum number of warps in use - DS_Max_Warp_Number = 16, + DS_Max_Warp_Number = getGridValue().maxWarpNumber(), }; enum : __kmpc_impl_lanemask_t { diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt --- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt +++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt @@ -19,6 +19,11 @@ return() endif() +if (NOT LIBOMPTARGET_LLVM_INCLUDE_DIRS) + libomptarget_say("Not building NVPTX device RTL: Missing definition for LIBOMPTARGET_LLVM_INCLUDE_DIRS") + return() +endif() + # Check if we can create an LLVM bitcode implementation of the runtime library # that could be inlined in the user application. For that we need to find # a Clang compiler capable of compiling our CUDA files to LLVM bitcode and @@ -151,6 +156,10 @@ src/target_impl.cu ) +# Prepend -I to each list element +set (LIBOMPTARGET_LLVM_INCLUDE_DIRS_NVPTX "${LIBOMPTARGET_LLVM_INCLUDE_DIRS}") +list(TRANSFORM LIBOMPTARGET_LLVM_INCLUDE_DIRS_NVPTX PREPEND "-I") + # Set flags for LLVM Bitcode compilation. set(bc_flags -S -x c++ -O1 -std=c++14 -mllvm -openmp-opt-disable @@ -162,7 +171,8 @@ -D__CUDACC__ -I${devicertl_base_directory} -I${devicertl_common_directory}/include - -I${devicertl_nvptx_directory}/src) + -I${devicertl_nvptx_directory}/src + ${LIBOMPTARGET_LLVM_INCLUDE_DIRS_NVPTX}) if(${LIBOMPTARGET_NVPTX_DEBUG}) list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=-1 -g) diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -24,6 +24,12 @@ #define NOINLINE __attribute__((noinline)) #define ALIGN(N) __attribute__((aligned(N))) +#include "llvm/Frontend/OpenMP/OMPGridValues.h" + +INLINE constexpr const llvm::omp::GV &getGridValue() { + return llvm::omp::NVPTXGridValues; +} + //////////////////////////////////////////////////////////////////////////////// // Kernel options //////////////////////////////////////////////////////////////////////////////// @@ -31,9 +37,8 @@ //////////////////////////////////////////////////////////////////////////////// // The following def must match the absolute limit hardwired in the host RTL // max number of threads per team -#define MAX_THREADS_PER_TEAM 1024 - -#define WARPSIZE 32 +enum { MAX_THREADS_PER_TEAM = getGridValue().GV_Max_WG_Size }; +enum { WARPSIZE = getGridValue().GV_Warp_Size }; // Maximum number of omp state objects per SM allocated statically in global // memory. @@ -64,11 +69,11 @@ // Data sharing related quantities, need to match what is used in the compiler. enum DATA_SHARING_SIZES { // The size reserved for data in a shared memory slot. - DS_Slot_Size = 256, + DS_Slot_Size = getGridValue().GV_Slot_Size, // The slot size that should be reserved for a working warp. - DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size, + DS_Worker_Warp_Slot_Size = getGridValue().warpSlotSize(), // The maximum number of warps in use - DS_Max_Warp_Number = 32, + DS_Max_Warp_Number = getGridValue().maxWarpNumber(), }; enum : __kmpc_impl_lanemask_t {