diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
@@ -50,22 +50,6 @@
   return __builtin_amdgcn_read_exec();
 }
 
-DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t var,
-                                     int32_t srcLane) {
-  int width = WARPSIZE;
-  int self = GetLaneId();
-  int index = srcLane + (self & ~(width - 1));
-  return __builtin_amdgcn_ds_bpermute(index << 2, var);
-}
-
-DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t var,
-                                          uint32_t laneDelta, int32_t width) {
-  int self = GetLaneId();
-  int index = self + laneDelta;
-  index = (int)(laneDelta + (self & (width - 1))) >= width ? self : index;
-  return __builtin_amdgcn_ds_bpermute(index << 2, var);
-}
-
 static DEVICE SHARED uint32_t L1_Barrier;
 
 DEVICE void __kmpc_impl_target_init() {
diff --git a/openmp/libomptarget/deviceRTLs/common/include/shuffle.h b/openmp/libomptarget/deviceRTLs/common/include/shuffle.h
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/deviceRTLs/common/include/shuffle.h
@@ -0,0 +1,133 @@
+//===- shuffle.h - OpenMP variants of the shuffle idiom for all targets -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Shuffle function implementations for all supported targets.
+//
+// Note: We unify the mask type to int64_t instead of __kmpc_impl_lanemask_t.
+//       The value might therefore be extended and later truncated, but those
+//       operations are no-ops and will eventually be removed.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LIBOMPTARGET_DEVICERTL_SHUFFLE_H
+#define LIBOMPTARGET_DEVICERTL_SHUFFLE_H
+
+#include <inttypes.h>
+
+#pragma omp declare target
+
+/// Forward declarations
+///
+///{
+unsigned GetLaneId();
+unsigned GetWarpSize();
+void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi);
+uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi);
+///}
+
+/// Fallback implementations of the shuffle sync idiom.
+///
+///{
+#pragma omp begin declare variant match(                                       \
+    device = {arch(amdgcn, nvptx, nvptx64)},                                   \
+    implementation = {extension(match_none)})
+
+inline int32_t __kmpc_impl_shfl_sync(int64_t Mask, int32_t Var,
+                                     int32_t SrcLane) { // placeholder only
+  static_assert(false, // unconditional: compiling this body is an error
+                "Fallback version of __kmpc_impl_shfl_sync is not available!");
+}
+
+inline int32_t __kmpc_impl_shfl_down_sync(int64_t Mask, int32_t Var,
+                                          uint32_t Delta, int32_t Width) {
+  static_assert(
+      false, // unconditional: compiling this placeholder body is an error
+      "Fallback version of __kmpc_impl_shfl_down_sync is not available!");
+}
+
+#pragma omp end declare variant
+///}
+
+/// AMDGCN implementations of the shuffle sync idiom.
+///
+///{
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+inline int32_t __kmpc_impl_shfl_sync(int64_t Mask, int32_t Var,
+                                     int32_t SrcLane) {
+  // Round this lane's id down to a multiple of the warp size (assumed 2^k).
+  int LowBits = GetWarpSize() - 1;
+  int Base = GetLaneId() & ~LowBits;
+  return __builtin_amdgcn_ds_bpermute((SrcLane + Base) << 2, Var);
+}
+
+inline int32_t __kmpc_impl_shfl_down_sync(int64_t Mask, int32_t Var,
+                                          uint32_t LaneDelta, int32_t Width) {
+  int Lane = GetLaneId();
+  // A source position at or beyond Width selects the lane's own index.
+  bool OutOfGroup = (int)(LaneDelta + (Lane & (Width - 1))) >= Width;
+  int Source = OutOfGroup ? Lane : (int)(Lane + LaneDelta);
+  return __builtin_amdgcn_ds_bpermute(Source << 2, Var);
+}
+
+#pragma omp end declare variant
+///}
+
+/// NVPTX implementations of the shuffle sync idiom.
+///
+///{
+#pragma omp begin declare variant match(                                       \
+    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
+
+inline int32_t __kmpc_impl_shfl_sync(int64_t Mask, int32_t Var,
+                                     int32_t SrcLane) {
+  const int32_t LastArg = 0x1f; // same final operand in both forms
+#if CUDA_VERSION < 9000
+  return __nvvm_shfl_idx_i32(Var, SrcLane, LastArg); // form without 'mask'
+#else
+  return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, LastArg); // Cuda 9.0+
+#endif // CUDA_VERSION
+}
+
+inline int32_t __kmpc_impl_shfl_down_sync(int64_t Mask, int32_t Var,
+                                          uint32_t Delta, int32_t Width) {
+  // Last intrinsic operand: (warp size - Width) shifted left by 8, or'ed 0x1f.
+  int32_t Packed = 0x1f | ((GetWarpSize() - Width) << 8);
+#if CUDA_VERSION < 9000
+  return __nvvm_shfl_down_i32(Var, Delta, Packed); // no 'mask' before Cuda 9.0
+#else
+  return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, Packed);
+#endif // CUDA_VERSION
+}
+
+#pragma omp end declare variant
+///}
+
+/// External shuffle API
+///
+///{
+
+extern "C" {
+// NOTE(review): both wrappers are 'inline'; presumably a definition is only
+// emitted where one is used -- TODO confirm external callers still link.
+inline int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) {
+  return __kmpc_impl_shfl_down_sync(-1, val, delta, size); // -1: all mask bits
+}
+
+inline int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) {
+  uint32_t lo, hi; // filled in by __kmpc_impl_unpack from val
+  __kmpc_impl_unpack(val, lo, hi);
+  hi = __kmpc_impl_shfl_down_sync(-1, hi, delta, size); // shuffled separately
+  lo = __kmpc_impl_shfl_down_sync(-1, lo, delta, size);
+  return __kmpc_impl_pack(lo, hi); // recombine into one 64-bit result
+}
+} // extern "C"
+///}
+
+#pragma omp end declare target
+
+#endif
diff --git a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
@@ -13,6 +13,7 @@
 
 #include "common/omptarget.h"
 #include "target_impl.h"
+#include "shuffle.h"
 
 // Return true if this is the master thread.
 INLINE static bool IsMasterThread(bool isSPMDExecutionMode) {
diff --git a/openmp/libomptarget/deviceRTLs/common/src/loop.cu b/openmp/libomptarget/deviceRTLs/common/src/loop.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/loop.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/loop.cu
@@ -14,6 +14,7 @@
 #pragma omp declare target
 
 #include "common/omptarget.h"
+#include "shuffle.h"
 #include "target_impl.h"
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
@@ -12,6 +12,7 @@
 #pragma omp declare target
 
 #include "common/omptarget.h"
+#include "shuffle.h"
 #include "target_impl.h"
 
 EXTERN
@@ -20,18 +21,6 @@
 EXTERN
 void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid) {}
 
-EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) {
-  return __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, val, delta, size);
-}
-
-EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) {
-   uint32_t lo, hi;
-   __kmpc_impl_unpack(val, lo, hi);
-   hi = __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, hi, delta, size);
-   lo = __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, lo, delta, size);
-   return __kmpc_impl_pack(lo, hi);
-}
-
 INLINE static void gpu_regular_warp_reduce(void *reduce_data,
                                            kmp_ShuffleReductFctPtr shflFct) {
   for (uint32_t mask = WARPSIZE / 2; mask > 0; mask /= 2) {
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
--- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
+++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
@@ -139,6 +139,7 @@
              -fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device
              -D__CUDACC__
              -I${devicertl_base_directory}
+             -I${devicertl_common_directory}/include
              -I${devicertl_nvptx_directory}/src)
 
 if(${LIBOMPTARGET_NVPTX_DEBUG})
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -64,27 +64,6 @@
 #endif
 }
 
-// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
-DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
-                                     int32_t SrcLane) {
-#if CUDA_VERSION >= 9000
-  return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f);
-#else
-  return __nvvm_shfl_idx_i32(Var, SrcLane, 0x1f);
-#endif // CUDA_VERSION
-}
-
-DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
-                                          int32_t Var, uint32_t Delta,
-                                          int32_t Width) {
-  int32_t T = ((WARPSIZE - Width) << 8) | 0x1f;
-#if CUDA_VERSION >= 9000
-  return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
-#else
-  return __nvvm_shfl_down_i32(Var, Delta, T);
-#endif // CUDA_VERSION
-}
-
 DEVICE void __kmpc_impl_syncthreads() { __syncthreads(); }
 
 DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
diff --git a/openmp/libomptarget/deviceRTLs/target_interface.h b/openmp/libomptarget/deviceRTLs/target_interface.h
--- a/openmp/libomptarget/deviceRTLs/target_interface.h
+++ b/openmp/libomptarget/deviceRTLs/target_interface.h
@@ -57,12 +57,6 @@
 
 EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask();
 
-EXTERN int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
-                                     int32_t SrcLane);
-EXTERN int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
-                                          int32_t Var, uint32_t Delta,
-                                          int32_t Width);
-
 EXTERN void __kmpc_impl_syncthreads();
 EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask);