diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt --- a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt +++ b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt @@ -68,6 +68,7 @@ ${devicertl_base_directory}/common/src/parallel.cu ${devicertl_base_directory}/common/src/reduction.cu ${devicertl_base_directory}/common/src/support.cu + ${devicertl_base_directory}/common/src/shuffle.cpp ${devicertl_base_directory}/common/src/sync.cu ${devicertl_base_directory}/common/src/task.cu) @@ -112,6 +113,7 @@ -O${optimization_level} ${CUDA_DEBUG} -I${CMAKE_CURRENT_SOURCE_DIR}/src + -I${devicertl_base_directory}/common/include -I${devicertl_base_directory}) set(bc1_files) diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip @@ -52,22 +52,6 @@ return __builtin_amdgcn_read_exec(); } -EXTERN int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t var, - int32_t srcLane) { - int width = WARPSIZE; - int self = GetLaneId(); - int index = srcLane + (self & ~(width - 1)); - return __builtin_amdgcn_ds_bpermute(index << 2, var); -} - -EXTERN int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t var, - uint32_t laneDelta, int32_t width) { - int self = GetLaneId(); - int index = self + laneDelta; - index = (int)(laneDelta + (self & (width - 1))) >= width ? 
self : index; - return __builtin_amdgcn_ds_bpermute(index << 2, var); -} - uint32_t __kmpc_L1_Barrier [[clang::loader_uninitialized]]; #pragma allocate(__kmpc_L1_Barrier) allocator(omp_pteam_mem_alloc) diff --git a/openmp/libomptarget/deviceRTLs/common/include/target/shuffle.h b/openmp/libomptarget/deviceRTLs/common/include/target/shuffle.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/deviceRTLs/common/include/target/shuffle.h @@ -0,0 +1,107 @@ +//===- shuffle.h - OpenMP variants of the shuffle idiom for all targets -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Shuffle function implementations for all supported targets. +// +// Note: We unify the mask type to uint64_t instead of __kmpc_impl_lanemask_t. +// +//===----------------------------------------------------------------------===// + +#ifndef LIBOMPTARGET_DEVICERTL_SHUFFLE_H +#define LIBOMPTARGET_DEVICERTL_SHUFFLE_H + +#include <assert.h> +#include <stdint.h> + +#pragma omp declare target + +/// External shuffle API +/// C-linkage entry points; the definitions live in common/src/shuffle.cpp. +///{ + +extern "C" { +int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size); +int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size); +} + +///} + +/// Forward declarations +/// Helpers the shuffle code calls; this header only declares them. +///{ +unsigned GetLaneId(); +unsigned GetWarpSize(); +void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi); +uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi); +///} + +/// Fallback implementations of the shuffle sync idiom. 
+/// Selected only when no target variant below matches; they abort rather than fall off the end of a non-void function. +///{ + +inline int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var, + int32_t SrcLane) { + assert(false && "Fallback version of __kmpc_impl_shfl_sync is not available!"); + __builtin_trap(); +} + +inline int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var, + uint32_t Delta, int32_t Width) { + assert(false && "Fallback version of __kmpc_impl_shfl_down_sync is not available!"); + __builtin_trap(); +} + +///} + +/// AMDGCN implementations of the shuffle sync idiom. +/// Both read the value through __builtin_amdgcn_ds_bpermute with a lane index scaled by 4; Mask is not used. +///{ +#pragma omp begin declare variant match(device = {arch(amdgcn)}) + +inline int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var, + int32_t SrcLane) { + int Width = GetWarpSize(); + int Self = GetLaneId(); + int Index = SrcLane + (Self & ~(Width - 1)); + return __builtin_amdgcn_ds_bpermute(Index << 2, Var); +} + +inline int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var, + uint32_t LaneDelta, int32_t Width) { + int Self = GetLaneId(); + int Index = Self + LaneDelta; + Index = (int)(LaneDelta + (Self & (Width - 1))) >= Width ? Self : Index; + return __builtin_amdgcn_ds_bpermute(Index << 2, Var); +} + +#pragma omp end declare variant +///} + +/// NVPTX implementations of the shuffle and shuffle sync idiom. 
+/// Mask is forwarded to the __nvvm_shfl_sync_ builtins, as the code removed from nvptx/src/target_impl.cu did. +///{ +#pragma omp begin declare variant match( \ + device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)}) + +inline int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var, + int32_t SrcLane) { + return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f); +} + +inline int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var, + uint32_t Delta, int32_t Width) { + int32_t T = ((GetWarpSize() - Width) << 8) | 0x1f; + return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T); +} + +#pragma omp end declare variant +///} + +#pragma omp end declare target + +#endif diff --git a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu --- a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu @@ -12,6 +12,7 @@ #pragma omp declare target #include "common/omptarget.h" +#include "target/shuffle.h" #include "target_impl.h" // Return true if this is the master thread. diff --git a/openmp/libomptarget/deviceRTLs/common/src/loop.cu b/openmp/libomptarget/deviceRTLs/common/src/loop.cu --- a/openmp/libomptarget/deviceRTLs/common/src/loop.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/loop.cu @@ -14,6 +14,7 @@ #pragma omp declare target #include "common/omptarget.h" +#include "target/shuffle.h" #include "target_impl.h" //////////////////////////////////////////////////////////////////////////////// diff --git a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu --- a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu @@ -12,6 +12,7 @@ #pragma omp declare target #include "common/omptarget.h" +#include "target/shuffle.h" #include "target_impl.h" EXTERN @@ -20,18 +21,6 @@ EXTERN void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid) {} -EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) { - return 
__kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, val, delta, size); -} - -EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) { - uint32_t lo, hi; - __kmpc_impl_unpack(val, lo, hi); - hi = __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, hi, delta, size); - lo = __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, lo, delta, size); - return __kmpc_impl_pack(lo, hi); -} - INLINE static void gpu_regular_warp_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) { for (uint32_t mask = WARPSIZE / 2; mask > 0; mask /= 2) { diff --git a/openmp/libomptarget/deviceRTLs/common/src/shuffle.cpp b/openmp/libomptarget/deviceRTLs/common/src/shuffle.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/deviceRTLs/common/src/shuffle.cpp @@ -0,0 +1,29 @@ +//===--- shuffle.cpp - Implementation of the external shuffle idiom API -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Definitions of the external shuffle API declared in target/shuffle.h. +//===----------------------------------------------------------------------===// + +#include "target/shuffle.h" + +#pragma omp declare target + +static constexpr uint64_t AllLanes = -1; // -1 converts to a uint64_t with every bit set + +int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) { + return __kmpc_impl_shfl_down_sync(AllLanes, val, delta, size); +} + +int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) { + uint32_t lo, hi; // filled by __kmpc_impl_unpack, shuffled one at a time, then re-packed + __kmpc_impl_unpack(val, lo, hi); + hi = __kmpc_impl_shfl_down_sync(AllLanes, hi, delta, size); + lo = __kmpc_impl_shfl_down_sync(AllLanes, lo, delta, size); + return __kmpc_impl_pack(lo, hi); +} + +#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt --- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt +++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt @@ -128,6 +128,7 @@ ${devicertl_common_directory}/src/support.cu ${devicertl_common_directory}/src/sync.cu ${devicertl_common_directory}/src/task.cu + ${devicertl_common_directory}/src/shuffle.cpp src/target_impl.cu ) @@ -140,6 +141,7 @@ -Xclang -target-feature -Xclang +ptx61 -D__CUDACC__ -I${devicertl_base_directory} + -I${devicertl_common_directory}/include -I${devicertl_nvptx_directory}/src) if(${LIBOMPTARGET_NVPTX_DEBUG}) diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu @@ -59,18 +59,6 @@ return Mask; } -DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var, - int32_t SrcLane) { - return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f); -} - -DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask, - 
int32_t Var, uint32_t Delta, - int32_t Width) { - int32_t T = ((WARPSIZE - Width) << 8) | 0x1f; - return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T); -} - DEVICE void __kmpc_impl_syncthreads() { __syncthreads(); } DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) { diff --git a/openmp/libomptarget/deviceRTLs/target_interface.h b/openmp/libomptarget/deviceRTLs/target_interface.h --- a/openmp/libomptarget/deviceRTLs/target_interface.h +++ b/openmp/libomptarget/deviceRTLs/target_interface.h @@ -57,12 +57,6 @@ EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask(); -EXTERN int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var, - int32_t SrcLane); -EXTERN int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask, - int32_t Var, uint32_t Delta, - int32_t Width); - EXTERN void __kmpc_impl_syncthreads(); EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask);