diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -712,33 +712,30 @@ CC1Args.push_back("-mlink-builtin-bitcode"); CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile)); + std::string CudaVersionStr; + // New CUDA versions often introduce new instructions that are only supported // by new PTX version, so we need to raise PTX level to enable them in NVPTX // back-end. const char *PtxFeature = nullptr; switch (CudaInstallation.version()) { - case CudaVersion::CUDA_110: - PtxFeature = "+ptx70"; - break; - case CudaVersion::CUDA_102: - PtxFeature = "+ptx65"; - break; - case CudaVersion::CUDA_101: - PtxFeature = "+ptx64"; - break; - case CudaVersion::CUDA_100: - PtxFeature = "+ptx63"; - break; - case CudaVersion::CUDA_92: - PtxFeature = "+ptx61"; - break; - case CudaVersion::CUDA_91: - PtxFeature = "+ptx61"; - break; - case CudaVersion::CUDA_90: - PtxFeature = "+ptx60"; +#define CASE_CUDA_VERSION(CUDA_VER, PTX_VER) \ + case CudaVersion::CUDA_##CUDA_VER: \ + CudaVersionStr = #CUDA_VER; \ + PtxFeature = "+ptx" #PTX_VER; \ break; + CASE_CUDA_VERSION(110, 70); + CASE_CUDA_VERSION(102, 65); + CASE_CUDA_VERSION(101, 64); + CASE_CUDA_VERSION(100, 63); + CASE_CUDA_VERSION(92, 61); + CASE_CUDA_VERSION(91, 61); + CASE_CUDA_VERSION(90, 60); +#undef CASE_CUDA_VERSION default: + // If unknown CUDA version, we take it as CUDA 8.0. Same assumption is also + // made in libomptarget/deviceRTLs. + CudaVersionStr = "80"; PtxFeature = "+ptx42"; } CC1Args.append({"-target-feature", PtxFeature}); @@ -784,8 +781,9 @@ } else { bool FoundBCLibrary = false; - std::string LibOmpTargetName = - "libomptarget-nvptx-" + GpuArch.str() + ".bc"; + std::string LibOmpTargetName = "libomptarget-nvptx-cuda-" + + CudaVersionStr + "-" + GpuArch.str() + + ".bc"; for (StringRef LibraryPath : LibraryPaths) { SmallString<128> LibOmpTargetFile(LibraryPath); diff --git a/openmp/libomptarget/deviceRTLs/common/allocator.h b/openmp/libomptarget/deviceRTLs/common/allocator.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/deviceRTLs/common/allocator.h @@ -0,0 +1,42 @@ +//===--------- allocator.h - OpenMP target memory allocator ------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Macros for allocating variables in different address spaces. +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_ALLOCATOR_H +#define OMPTARGET_ALLOCATOR_H + +// Follows the pattern in interface.h +// Clang sema checks this type carefully, needs to closely match that from omp.h +typedef enum omp_allocator_handle_t { + omp_null_allocator = 0, + omp_default_mem_alloc = 1, + omp_large_cap_mem_alloc = 2, + omp_const_mem_alloc = 3, + omp_high_bw_mem_alloc = 4, + omp_low_lat_mem_alloc = 5, + omp_cgroup_mem_alloc = 6, + omp_pteam_mem_alloc = 7, + omp_thread_mem_alloc = 8, + KMP_ALLOCATOR_MAX_HANDLE = ~(0U) +} omp_allocator_handle_t; + +#define __PRAGMA(STR) _Pragma(#STR) +#define OMP_PRAGMA(STR) __PRAGMA(omp STR) + +#define SHARED(NAME) \ + NAME [[clang::loader_uninitialized]]; \ + OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc)) + +#define EXTERN_SHARED(NAME) \ + NAME; \ + OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc)) + +#endif // OMPTARGET_ALLOCATOR_H diff --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h b/openmp/libomptarget/deviceRTLs/common/omptarget.h --- a/openmp/libomptarget/deviceRTLs/common/omptarget.h +++ b/openmp/libomptarget/deviceRTLs/common/omptarget.h @@ -14,11 +14,12 @@ #ifndef OMPTARGET_H #define OMPTARGET_H -#include "target_impl.h" -#include "common/debug.h" // debug -#include "interface.h" // interfaces with omp, compiler, and user +#include "common/allocator.h" +#include "common/debug.h" // debug #include "common/state-queue.h" #include "common/support.h" +#include "interface.h" // interfaces with omp, compiler, and user +#include "target_impl.h" #define OMPTARGET_NVPTX_VERSION 1.1 @@ -71,8 +72,8 @@ uint32_t nArgs; }; -extern DEVICE SHARED omptarget_nvptx_SharedArgs - omptarget_nvptx_globalArgs; +extern DEVICE + omptarget_nvptx_SharedArgs EXTERN_SHARED(omptarget_nvptx_globalArgs); // Worker slot type which is initialized with the default worker slot // size of 4*32 bytes. @@ -94,7 +95,7 @@ __kmpc_impl_lanemask_t ActiveThreads[DS_Max_Warp_Number]; }; -extern DEVICE SHARED DataSharingStateTy DataSharingState; +extern DEVICE DataSharingStateTy EXTERN_SHARED(DataSharingState); //////////////////////////////////////////////////////////////////////////////// // task ICV and (implicit & explicit) task state @@ -273,9 +274,9 @@ /// Memory manager for statically allocated memory. class omptarget_nvptx_SimpleMemoryManager { private: - ALIGN(128) struct MemDataTy { + struct MemDataTy { volatile unsigned keys[OMP_STATE_COUNT]; - } MemData[MAX_SM]; + } MemData[MAX_SM] ALIGN(128); INLINE static uint32_t hash(unsigned key) { return key & (OMP_STATE_COUNT - 1); @@ -294,18 +295,18 @@ extern DEVICE omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager; -extern DEVICE SHARED uint32_t usedMemIdx; -extern DEVICE SHARED uint32_t usedSlotIdx; -extern DEVICE SHARED uint8_t - parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; -extern DEVICE SHARED uint16_t threadLimit; -extern DEVICE SHARED uint16_t threadsInTeam; -extern DEVICE SHARED uint16_t nThreads; -extern DEVICE SHARED - omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; - -extern DEVICE SHARED uint32_t execution_param; -extern DEVICE SHARED void *ReductionScratchpadPtr; +extern DEVICE uint32_t EXTERN_SHARED(usedMemIdx); +extern DEVICE uint32_t EXTERN_SHARED(usedSlotIdx); +extern DEVICE uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; +#pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc) +extern DEVICE uint16_t EXTERN_SHARED(threadLimit); +extern DEVICE uint16_t EXTERN_SHARED(threadsInTeam); +extern DEVICE uint16_t EXTERN_SHARED(nThreads); +extern DEVICE omptarget_nvptx_ThreadPrivateContext * + EXTERN_SHARED(omptarget_nvptx_threadPrivateContext); + +extern DEVICE uint32_t EXTERN_SHARED(execution_param); +extern DEVICE void *EXTERN_SHARED(ReductionScratchpadPtr); //////////////////////////////////////////////////////////////////////////////// // work function (outlined parallel/simd functions) and arguments. @@ -313,8 +314,8 @@ //////////////////////////////////////////////////////////////////////////////// typedef void *omptarget_nvptx_WorkFn; -extern volatile DEVICE SHARED omptarget_nvptx_WorkFn - omptarget_nvptx_workFn; +extern volatile DEVICE + omptarget_nvptx_WorkFn EXTERN_SHARED(omptarget_nvptx_workFn); //////////////////////////////////////////////////////////////////////////////// // get private data structures diff --git a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu --- a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu @@ -316,10 +316,10 @@ return rc; } -EXTERN int omp_is_initial_device(void) { - PRINT0(LD_IO, "call omp_is_initial_device() returns 0\n"); - return 0; // 0 by def on device -} +// EXTERN int omp_is_initial_device(void) { +// PRINT0(LD_IO, "call omp_is_initial_device() returns 0\n"); +// return 0; // 0 by def on device +// } // Unspecified on the device. EXTERN int omp_get_initial_device(void) { diff --git a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu --- a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu @@ -11,8 +11,9 @@ //===----------------------------------------------------------------------===// #pragma omp declare target -#include "common/omptarget.h" +#include "common/allocator.h" #include "common/device_environment.h" +#include "common/omptarget.h" //////////////////////////////////////////////////////////////////////////////// // global device environment @@ -28,44 +29,44 @@ omptarget_nvptx_Queue omptarget_nvptx_device_State[MAX_SM]; -DEVICE omptarget_nvptx_SimpleMemoryManager - omptarget_nvptx_simpleMemoryManager; -DEVICE SHARED uint32_t usedMemIdx; -DEVICE SHARED uint32_t usedSlotIdx; +DEVICE omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager; +DEVICE uint32_t SHARED(usedMemIdx); +DEVICE uint32_t SHARED(usedSlotIdx); -DEVICE SHARED uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; -DEVICE SHARED uint16_t threadLimit; -DEVICE SHARED uint16_t threadsInTeam; -DEVICE SHARED uint16_t nThreads; +DEVICE uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; +#pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc) +DEVICE uint16_t SHARED(threadLimit); +DEVICE uint16_t SHARED(threadsInTeam); +DEVICE uint16_t SHARED(nThreads); // Pointer to this team's OpenMP state object -DEVICE SHARED - omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; +DEVICE omptarget_nvptx_ThreadPrivateContext * + SHARED(omptarget_nvptx_threadPrivateContext); //////////////////////////////////////////////////////////////////////////////// // The team master sets the outlined parallel function in this variable to // communicate with the workers. Since it is in shared memory, there is one // copy of these variables for each kernel, instance, and team. //////////////////////////////////////////////////////////////////////////////// -volatile DEVICE SHARED omptarget_nvptx_WorkFn omptarget_nvptx_workFn; +volatile DEVICE omptarget_nvptx_WorkFn SHARED(omptarget_nvptx_workFn); //////////////////////////////////////////////////////////////////////////////// // OpenMP kernel execution parameters //////////////////////////////////////////////////////////////////////////////// -DEVICE SHARED uint32_t execution_param; +DEVICE uint32_t SHARED(execution_param); //////////////////////////////////////////////////////////////////////////////// // Data sharing state //////////////////////////////////////////////////////////////////////////////// -DEVICE SHARED DataSharingStateTy DataSharingState; +DEVICE DataSharingStateTy SHARED(DataSharingState); //////////////////////////////////////////////////////////////////////////////// // Scratchpad for teams reduction. //////////////////////////////////////////////////////////////////////////////// -DEVICE SHARED void *ReductionScratchpadPtr; +DEVICE void *SHARED(ReductionScratchpadPtr); //////////////////////////////////////////////////////////////////////////////// // Data sharing related variables. //////////////////////////////////////////////////////////////////////////////// -DEVICE SHARED omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs; +DEVICE omptarget_nvptx_SharedArgs SHARED(omptarget_nvptx_globalArgs); #pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu --- a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu @@ -208,8 +208,8 @@ : /*Master thread only*/ 1; uint32_t TeamId = GetBlockIdInKernel(); uint32_t NumTeams = GetNumberOfBlocksInKernel(); - static SHARED unsigned Bound; - static SHARED unsigned ChunkTeamCount; + static unsigned SHARED(Bound); + static unsigned SHARED(ChunkTeamCount); // Block progress for teams greater than the current upper // limit. We always only allow a number of teams less or equal diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt --- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt +++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt @@ -10,6 +10,21 @@ # ##===----------------------------------------------------------------------===## +# TODO: This part needs to be refined when libomptarget is going to support +# Windows! +# TODO: This part can also be removed if we can change the clang driver to make +# it support device only compilation. +if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64") + set(aux_triple x86_64-unknown-linux-gnu) +elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "ppc64le") + set(aux_triple powerpc64le-unknown-linux-gnu) +elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") + set(aux_triple aarch64-unknown-linux-gnu) +else() + libomptarget_say("Not building CUDA offloading device RTL: unknown host arch: ${CMAKE_HOST_SYSTEM_PROCESSOR}") + return() +endif() + get_filename_component(devicertl_base_directory ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) @@ -79,61 +94,82 @@ ) # Set flags for LLVM Bitcode compilation. - set(bc_flags ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS} + set(bc_flags -S -x c++ + -target nvptx64 + -Xclang -emit-llvm-bc + -Xclang -aux-triple -Xclang ${aux_triple} + -fopenmp -Xclang -fopenmp-is-device + -D__CUDACC__ -I${devicertl_base_directory} -I${devicertl_nvptx_directory}/src) if(${LIBOMPTARGET_NVPTX_DEBUG}) - set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=-1) + list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=-1) else() - set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=0) + list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=0) endif() # Create target to build all Bitcode libraries. add_custom_target(omptarget-nvptx-bc) - # Generate a Bitcode library for all the compute capabilities the user requested. + # This correlation is from clang/lib/Driver/ToolChains/Cuda.cpp. + # The last element is the default case. + set(cuda_version_list 110 102 101 100 92 91 90 80) + set(ptx_feature_list 70 65 64 63 61 61 60 42) + + # Generate a Bitcode library for all the compute capabilities the user + # requested and all PTX version we know for now. foreach(sm ${nvptx_sm_list}) - set(cuda_arch --cuda-gpu-arch=sm_${sm}) - - # Compile CUDA files to bitcode. - set(bc_files "") - foreach(src ${cuda_src_files}) - get_filename_component(infile ${src} ABSOLUTE) - get_filename_component(outfile ${src} NAME) - - add_custom_command(OUTPUT ${outfile}-sm_${sm}.bc - COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} ${cuda_arch} ${MAX_SM_DEFINITION} - -c ${infile} -o ${outfile}-sm_${sm}.bc - DEPENDS ${infile} - IMPLICIT_DEPENDS CXX ${infile} - COMMENT "Building LLVM bitcode ${outfile}-sm_${sm}.bc" - VERBATIM + set(sm_flags -Xclang -target-cpu -Xclang sm_${sm} "-D__CUDA_ARCH__=${sm}0") + + foreach(cuda_version ptx_num IN ZIP_LISTS cuda_version_list ptx_feature_list) + set(cuda_flags ${sm_flags}) + list(APPEND cuda_flags -Xclang -target-feature -Xclang +ptx${ptx_num}) + list(APPEND cuda_flags "-DCUDA_VERSION=${cuda_version}00") + + set(bc_files "") + foreach(src ${cuda_src_files}) + get_filename_component(infile ${src} ABSOLUTE) + get_filename_component(outfile ${src} NAME) + set(outfile "${outfile}-cuda_${cuda_version}-sm_${sm}.bc") + + add_custom_command(OUTPUT ${outfile} + COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} + ${cuda_flags} ${MAX_SM_DEFINITION} ${infile} -o ${outfile} + DEPENDS ${infile} + IMPLICIT_DEPENDS CXX ${infile} + COMMENT "Building LLVM bitcode ${outfile}" + VERBATIM + ) + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}) + + list(APPEND bc_files ${outfile}) + endforeach() + + set(bclib_name "libomptarget-nvptx-cuda_${cuda_version}-sm_${sm}.bc") + + # Link to a bitcode library. + add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} + COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER} + -o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} ${bc_files} + DEPENDS ${bc_files} + COMMENT "Linking LLVM bitcode ${bclib_name}" ) - set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}-sm_${sm}.bc) + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name}) - list(APPEND bc_files ${outfile}-sm_${sm}.bc) - endforeach() + set(bclib_target_name "omptarget-nvptx-cuda_${cuda_version}-sm_${sm}-bc") + + add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}) + add_dependencies(omptarget-nvptx-bc ${bclib_target_name}) - # Link to a bitcode library. - add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc - COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER} - -o ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc ${bc_files} - DEPENDS ${bc_files} - COMMENT "Linking LLVM bitcode libomptarget-nvptx-sm_${sm}.bc" - ) - set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES libomptarget-nvptx-sm_${sm}.bc) - - add_custom_target(omptarget-nvptx-${sm}-bc ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc) - add_dependencies(omptarget-nvptx-bc omptarget-nvptx-${sm}-bc) - - # Copy library to destination. - add_custom_command(TARGET omptarget-nvptx-${sm}-bc POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc - ${LIBOMPTARGET_LIBRARY_DIR}) - - # Install bitcode library under the lib destination folder. - install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc DESTINATION "${OPENMP_INSTALL_LIBDIR}") + # Copy library to destination. + add_custom_command(TARGET ${bclib_target_name} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} + ${LIBOMPTARGET_LIBRARY_DIR}) + + # Install bitcode library under the lib destination folder. + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}") + endforeach() endforeach() endif() diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h b/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h @@ -11,7 +11,8 @@ #include -#define EXTERN extern "C" __device__ +#define EXTERN extern "C" + typedef uint32_t __kmpc_impl_lanemask_t; typedef uint32_t omp_lock_t; /* arbitrary type of the right length */ diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -13,18 +13,16 @@ #define _TARGET_IMPL_H_ #include -#include #include #include #include #include "nvptx_interface.h" -#define DEVICE __device__ -#define INLINE __forceinline__ DEVICE -#define NOINLINE __noinline__ DEVICE -#define SHARED __shared__ -#define ALIGN(N) __align__(N) +#define DEVICE +#define INLINE inline __attribute__((always_inline)) +#define NOINLINE __attribute__((noinline)) +#define ALIGN(N) __attribute__((aligned(N))) //////////////////////////////////////////////////////////////////////////////// // Kernel options diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu @@ -14,25 +14,6 @@ #include "target_impl.h" #include "common/debug.h" -#include - -// Forward declaration of CUDA primitives which will be evetually transformed -// into LLVM intrinsics. -extern "C" { -unsigned int __activemask(); -unsigned int __ballot(unsigned); -// The default argument here is based on NVIDIA's website -// https://developer.nvidia.com/blog/using-cuda-warp-level-primitives/ -int __shfl_sync(unsigned mask, int val, int src_line, int width = WARPSIZE); -int __shfl(int val, int src_line, int width = WARPSIZE); -int __shfl_down(int var, unsigned detla, int width); -int __shfl_down_sync(unsigned mask, int var, unsigned detla, int width); -void __syncwarp(int mask); -void __threadfence(); -void __threadfence_block(); -void __threadfence_system(); -} - DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) { asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val)); } @@ -74,10 +55,12 @@ // In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask(). DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() { -#if CUDA_VERSION >= 9000 - return __activemask(); +#if CUDA_VERSION < 9020 + return __nvvm_vote_ballot(1); #else - return __ballot(1); + unsigned int mask; + asm volatile("activemask.b32 %0;" : "=r"(mask)); + return mask; #endif } @@ -85,9 +68,9 @@ DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var, int32_t SrcLane) { #if CUDA_VERSION >= 9000 - return __shfl_sync(Mask, Var, SrcLane); + return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f); #else - return __shfl(Var, SrcLane); + return __nvvm_shfl_idx_i32(Var, SrcLane, 0x1f); #endif // CUDA_VERSION } @@ -95,9 +78,10 @@ int32_t Var, uint32_t Delta, int32_t Width) { #if CUDA_VERSION >= 9000 - return __shfl_down_sync(Mask, Var, Delta, Width); + return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, + ((WARPSIZE - Width) << 8) | 0x1f); #else - return __shfl_down(Var, Delta, Width); + return __nvvm_shfl_down_i32(Var, Delta, ((WARPSIZE - Width) << 8) | 0x1f); #endif // CUDA_VERSION } @@ -105,7 +89,7 @@ DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) { #if CUDA_VERSION >= 9000 - __syncwarp(Mask); + __nvvm_bar_warp_sync(Mask); #else // In Cuda < 9.0 no need to sync threads in warps. #endif // CUDA_VERSION @@ -126,9 +110,9 @@ : "memory"); } -DEVICE void __kmpc_impl_threadfence() { __threadfence(); } -DEVICE void __kmpc_impl_threadfence_block() { __threadfence_block(); } -DEVICE void __kmpc_impl_threadfence_system() { __threadfence_system(); } +DEVICE void __kmpc_impl_threadfence() { __nvvm_membar_gl(); } +DEVICE void __kmpc_impl_threadfence_block() { __nvvm_membar_cta(); } +DEVICE void __kmpc_impl_threadfence_system() { __nvvm_membar_sys(); } // Calls to the NVPTX layer (assuming 1D layout) DEVICE int GetThreadIdInBlock() { return __nvvm_read_ptx_sreg_tid_x(); }