diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
@@ -10,6 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 #include "omptarget-nvptx.h"
+#include "target_impl.h"
 #include <stdio.h>
 
 // Warp ID in the CUDA block
@@ -430,9 +431,10 @@
     }
   }
   // Get address from lane 0.
-  ((int *)&FrameP)[0] = __SHFL_SYNC(CurActive, ((int *)&FrameP)[0], 0);
+  int *FP = (int *)&FrameP;
+  FP[0] = __kmpc_impl_shfl_sync(CurActive, FP[0], 0);
   if (sizeof(FrameP) == 8)
-    ((int *)&FrameP)[1] = __SHFL_SYNC(CurActive, ((int *)&FrameP)[1], 0);
+    FP[1] = __kmpc_impl_shfl_sync(CurActive, FP[1], 0);
 
   return FrameP;
 }
@@ -551,8 +553,7 @@
     if (GetThreadIdInBlock() == 0) {
       *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
     }
-    // FIXME: use __syncthreads instead when the function copy is fixed in LLVM.
-    __SYNCTHREADS();
+    __kmpc_impl_syncthreads();
     return;
   }
   ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
@@ -566,8 +567,7 @@
   if (is_shared)
     return;
   if (isSPMDExecutionMode) {
-    // FIXME: use __syncthreads instead when the function copy is fixed in LLVM.
-    __SYNCTHREADS();
+    __kmpc_impl_syncthreads();
     if (GetThreadIdInBlock() == 0) {
       omptarget_nvptx_simpleMemoryManager.Release();
     }
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu
@@ -383,8 +383,8 @@
   INLINE static int64_t Shuffle(unsigned active, int64_t val, int leader) {
     int lo, hi;
     __kmpc_impl_unpack(val, lo, hi);
-    hi = __SHFL_SYNC(active, hi, leader);
-    lo = __SHFL_SYNC(active, lo, leader);
+    hi = __kmpc_impl_shfl_sync(active, hi, leader);
+    lo = __kmpc_impl_shfl_sync(active, lo, leader);
     return __kmpc_impl_pack(lo, hi);
   }
 
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -51,28 +51,11 @@
 #ifndef CUDA_VERSION
 #error CUDA_VERSION macro is undefined, something wrong with cuda.
 #elif CUDA_VERSION >= 9000
-#define __SHFL_SYNC(mask, var, srcLane) __shfl_sync((mask), (var), (srcLane))
-#define __SHFL_DOWN_SYNC(mask, var, delta, width)                              \
-  __shfl_down_sync((mask), (var), (delta), (width))
 #define __ACTIVEMASK() __activemask()
-#define __SYNCWARP(Mask) __syncwarp(Mask)
 #else
-#define __SHFL_SYNC(mask, var, srcLane) __shfl((var), (srcLane))
-#define __SHFL_DOWN_SYNC(mask, var, delta, width)                              \
-  __shfl_down((var), (delta), (width))
 #define __ACTIVEMASK() __ballot(1)
-// In Cuda < 9.0 no need to sync threads in warps.
-#define __SYNCWARP(Mask)
 #endif // CUDA_VERSION
 
-#define __SYNCTHREADS_N(n) asm volatile("bar.sync %0;" : : "r"(n) : "memory");
-// Use original __syncthreads if compiled by nvcc or clang >= 9.0.
-#if !defined(__clang__) || __clang_major__ >= 9
-#define __SYNCTHREADS() __syncthreads()
-#else
-#define __SYNCTHREADS() __SYNCTHREADS_N(0)
-#endif
-
 // arguments needed for L0 parallelism only.
 class omptarget_nvptx_SharedArgs {
 public:
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "omptarget-nvptx.h"
+#include "target_impl.h"
 
 ////////////////////////////////////////////////////////////////////////////////
 // global data tables
@@ -106,7 +107,7 @@
   }
   if (!RequiresOMPRuntime) {
     // Runtime is not required - exit.
-    __SYNCTHREADS();
+    __kmpc_impl_syncthreads();
     return;
   }
 
@@ -125,8 +126,7 @@
     // init team context
     currTeamDescr.InitTeamDescr();
   }
-  // FIXME: use __syncthreads instead when the function copy is fixed in LLVM.
-  __SYNCTHREADS();
+  __kmpc_impl_syncthreads();
 
   omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
   omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
@@ -168,8 +168,7 @@
   if (!RequiresOMPRuntime)
     return;
 
-  // FIXME: use __syncthreads instead when the function copy is fixed in LLVM.
-  __SYNCTHREADS();
+  __kmpc_impl_syncthreads();
   int threadId = GetThreadIdInBlock();
   if (threadId == 0) {
     // Enqueue omp state object for use by another team.
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu
@@ -33,6 +33,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "omptarget-nvptx.h"
+#include "target_impl.h"
 
 typedef struct ConvergentSimdJob {
   omptarget_nvptx_TaskDescr taskDescr;
@@ -64,7 +65,7 @@
       omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId);
   job->slimForNextSimd = SimdLimit;
 
-  int32_t SimdLimitSource = __SHFL_SYNC(Mask, SimdLimit, *LaneSource);
+  int32_t SimdLimitSource = __kmpc_impl_shfl_sync(Mask, SimdLimit, *LaneSource);
   // reset simdlimit to avoid propagating to successive #simd
   if (SimdLimitSource > 0 && threadId == sourceThreadId)
     omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) = 0;
@@ -138,7 +139,8 @@
       omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);
   job->tnumForNextPar = NumThreadsClause;
 
-  int32_t NumThreadsSource = __SHFL_SYNC(Mask, NumThreadsClause, *LaneSource);
+  int32_t NumThreadsSource =
+      __kmpc_impl_shfl_sync(Mask, NumThreadsClause, *LaneSource);
   // reset numthreads to avoid propagating to successive #parallel
   if (NumThreadsSource > 0 && threadId == sourceThreadId)
     omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
@@ -15,6 +15,7 @@
 #include <stdio.h>
 
 #include "omptarget-nvptx.h"
+#include "target_impl.h"
 
 EXTERN
 void __kmpc_nvptx_end_reduce(int32_t global_tid) {}
@@ -23,14 +24,14 @@
 void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid) {}
 
 EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) {
-  return __SHFL_DOWN_SYNC(0xFFFFFFFF, val, delta, size);
+  return __kmpc_impl_shfl_down_sync(0xFFFFFFFF, val, delta, size);
 }
 
 EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) {
    int lo, hi;
    asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
-   hi = __SHFL_DOWN_SYNC(0xFFFFFFFF, hi, delta, size);
-   lo = __SHFL_DOWN_SYNC(0xFFFFFFFF, lo, delta, size);
+   hi = __kmpc_impl_shfl_down_sync(0xFFFFFFFF, hi, delta, size);
+   lo = __kmpc_impl_shfl_down_sync(0xFFFFFFFF, lo, delta, size);
    asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
    return val;
 }
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -14,6 +14,8 @@
 // Execution Parameters
 ////////////////////////////////////////////////////////////////////////////////
 
+#include "target_impl.h"
+
 INLINE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode) {
   execution_param = EMode;
   execution_param |= RMode;
@@ -203,7 +205,7 @@
 
 INLINE void IncParallelLevel(bool ActiveParallel) {
   unsigned Active = __ACTIVEMASK();
-  __SYNCWARP(Active);
+  __kmpc_impl_syncwarp(Active);
   unsigned LaneMaskLt;
   asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt));
   unsigned Rank = __popc(Active & LaneMaskLt);
@@ -212,12 +214,12 @@
         (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
     __threadfence();
   }
-  __SYNCWARP(Active);
+  __kmpc_impl_syncwarp(Active);
 }
 
 INLINE void DecParallelLevel(bool ActiveParallel) {
   unsigned Active = __ACTIVEMASK();
-  __SYNCWARP(Active);
+  __kmpc_impl_syncwarp(Active);
   unsigned LaneMaskLt;
   asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt));
   unsigned Rank = __popc(Active & LaneMaskLt);
@@ -226,7 +228,7 @@
         (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
     __threadfence();
   }
-  __SYNCWARP(Active);
+  __kmpc_impl_syncwarp(Active);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu
@@ -75,8 +75,7 @@
 // parallel region and that all worker threads participate.
 EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid) {
   PRINT0(LD_SYNC, "call kmpc_barrier_simple_spmd\n");
-  // FIXME: use __syncthreads instead when the function copy is fixed in LLVM.
-  __SYNCTHREADS();
+  __kmpc_impl_syncthreads();
   PRINT0(LD_SYNC, "completed kmpc_barrier_simple_spmd\n");
 }
 
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -38,6 +38,46 @@
 
 INLINE int __kmpc_impl_popc(uint32_t x) { return __popc(x); }
 
-INLINE void __kmpc_impl_syncwarp(int32_t Mask) { __SYNCWARP(Mask); }
+#ifndef CUDA_VERSION
+#error CUDA_VERSION macro is undefined, something wrong with cuda.
+#endif
+
+// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
+
+INLINE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
+                                     int32_t SrcLane) {
+#if CUDA_VERSION >= 9000
+  return __shfl_sync(Mask, Var, SrcLane);
+#else
+  return __shfl(Var, SrcLane);
+#endif // CUDA_VERSION
+}
+
+INLINE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
+                                          int32_t Var, uint32_t Delta,
+                                          int32_t Width) {
+#if CUDA_VERSION >= 9000
+  return __shfl_down_sync(Mask, Var, Delta, Width);
+#else
+  return __shfl_down(Var, Delta, Width);
+#endif // CUDA_VERSION
+}
+
+INLINE void __kmpc_impl_syncthreads() {
+  // Use original __syncthreads if compiled by nvcc or clang >= 9.0.
+#if !defined(__clang__) || __clang_major__ >= 9
+  __syncthreads();
+#else
+  asm volatile("bar.sync %0;" : : "r"(0) : "memory");
+#endif // __clang__
+}
+
+INLINE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
+#if CUDA_VERSION >= 9000
+  __syncwarp(Mask);
+#else
+  // In Cuda < 9.0 no need to sync threads in warps.
+#endif // CUDA_VERSION
+}
 
 #endif