diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -10,9 +10,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "target_impl.h"
 #include "common/debug.h"
 #include "common/target_atomic.h"
+#include "target_impl.h"
 
 #include <cuda.h>
 
@@ -56,23 +56,23 @@
 }
 
 // In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask().
-
 DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
 #if CUDA_VERSION >= 9000
-  return __activemask();
+  uint32_t Mask; // written by the PTX activemask instruction below
+  asm volatile("activemask.b32 %0;" : "=r"(Mask));
+  return Mask;
 #else
-  return __ballot(1);
+  return __nvvm_vote_ballot(1);
 #endif
 }
 
 // In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
-
 DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
                                      int32_t SrcLane) {
 #if CUDA_VERSION >= 9000
-  return __shfl_sync(Mask, Var, SrcLane);
+  return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, WARPSIZE - 1); // last operand: presumably the PTX shfl clamp for a full warp -- confirm
 #else
-  return __shfl(Var, SrcLane);
+  return __nvvm_shfl_idx_i32(Var, SrcLane, WARPSIZE - 1); // same clamp operand
 #endif // CUDA_VERSION
 }
 
@@ -80,24 +80,18 @@
                                           int32_t Var, uint32_t Delta,
                                           int32_t Width) {
 #if CUDA_VERSION >= 9000
-  return __shfl_down_sync(Mask, Var, Delta, Width);
+  return __nvvm_shfl_sync_down_i32(Mask, Var, Delta,
+                                   ((WARPSIZE - Width) << 8) | 0x1f);
 #else
-  return __shfl_down(Var, Delta, Width);
+  return __nvvm_shfl_down_i32(Var, Delta, ((WARPSIZE - Width) << 8) | 0x1f);
 #endif // CUDA_VERSION
 }
 
-DEVICE void __kmpc_impl_syncthreads() {
-  // Use original __syncthreads if compiled by nvcc or clang >= 9.0.
-#if !defined(__clang__) || __clang_major__ >= 9
-  __syncthreads();
-#else
-  asm volatile("bar.sync %0;" : : "r"(0) : "memory");
-#endif // __clang__
-}
+DEVICE void __kmpc_impl_syncthreads() { __syncthreads(); } // block barrier
 
 DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
 #if CUDA_VERSION >= 9000
-  __syncwarp(Mask);
+  __nvvm_bar_warp_sync(Mask);
 #else
   // In Cuda < 9.0 no need to sync threads in warps.
 #endif // CUDA_VERSION
@@ -145,11 +139,11 @@
 DEVICE void __kmpc_impl_set_lock(omp_lock_t *lock) {
   // TODO: not sure spinning is a good idea here..
   while (__kmpc_atomic_cas(lock, UNSET, SET) != UNSET) {
-    clock_t start = clock();
-    clock_t now;
+    int32_t start = __nvvm_read_ptx_sreg_clock();
+    int32_t now;
     for (;;) {
-      now = clock();
-      clock_t cycles = now > start ? now - start : now + (0xffffffff - start);
+      now = __nvvm_read_ptx_sreg_clock();
+      int32_t cycles = now > start ? now - start : now + (0xffffffff - start);
       if (cycles >= __OMP_SPIN * GetBlockIdInKernel()) {
         break;
       }