diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "omptarget-nvptx.h" +#include "target_impl.h" //////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// @@ -381,20 +382,18 @@ INLINE static int64_t Shuffle(unsigned active, int64_t val, int leader) { int lo, hi; - asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val)); + __kmpc_impl_unpack(val, lo, hi); hi = __SHFL_SYNC(active, hi, leader); lo = __SHFL_SYNC(active, lo, leader); - asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi)); - return val; + return __kmpc_impl_pack(lo, hi); } INLINE static uint64_t NextIter() { - unsigned int active = __ACTIVEMASK(); - int leader = __ffs(active) - 1; - int change = __popc(active); - unsigned lane_mask_lt; - asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lane_mask_lt)); - unsigned int rank = __popc(active & lane_mask_lt); + __kmpc_impl_lanemask_t active = __ACTIVEMASK(); + int leader = __kmpc_impl_ffs(active) - 1; + int change = __kmpc_impl_popc(active); + __kmpc_impl_lanemask_t lane_mask_lt = __kmpc_impl_lanemask_lt(); + unsigned int rank = __kmpc_impl_popc(active & lane_mask_lt); uint64_t warp_res; if (rank == 0) { warp_res = atomicAdd( diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -0,0 +1,41 @@ +//===------------ target_impl.h - NVPTX OpenMP GPU options ------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Definitions of target specific functions +// +//===----------------------------------------------------------------------===// +#ifndef _TARGET_IMPL_H_ +#define _TARGET_IMPL_H_ + +#include + +#define FORCEINLINE __forceinline__ __device__ + +FORCEINLINE void __kmpc_impl_unpack(int64_t val, int32_t &lo, int32_t &hi) { + asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val)); +} + +FORCEINLINE int64_t __kmpc_impl_pack(int32_t lo, int32_t hi) { + int64_t val; + asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi)); + return val; +} + +typedef uint32_t __kmpc_impl_lanemask_t; + +FORCEINLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() { + __kmpc_impl_lanemask_t res; + asm("mov.u32 %0, %%lanemask_lt;" : "=r"(res)); + return res; +} + +FORCEINLINE int __kmpc_impl_ffs(uint32_t x) { return __ffs(x); } + +FORCEINLINE int __kmpc_impl_popc(uint32_t x) { return __popc(x); } + +#endif