diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
@@ -10,6 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 #include "omptarget-nvptx.h"
+#include "target_impl.h"
 #include <stdio.h>
 
 // Warp ID in the CUDA block
@@ -19,7 +20,7 @@
 
 // Return true if this is the first active thread in the warp.
 INLINE static bool IsWarpMasterActiveThread() {
-  unsigned long long Mask = __ACTIVEMASK();
+  unsigned long long Mask = __kmpc_impl_activemask();
   unsigned long long ShNum = WARPSIZE - (GetThreadIdInBlock() % WARPSIZE);
   unsigned long long Sh = Mask << ShNum;
   // Truncate Sh to the 32 lower bits
@@ -111,7 +112,7 @@
           (unsigned long long)SharingDefaultDataSize);
 
   unsigned WID = getWarpId();
-  unsigned CurActiveThreads = __ACTIVEMASK();
+  unsigned CurActiveThreads = __kmpc_impl_activemask();
 
   __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
   void *&StackP = DataSharingState.StackPtr[WID];
@@ -251,7 +252,7 @@
     return;
   }
 
-  int32_t CurActive = __ACTIVEMASK();
+  int32_t CurActive = __kmpc_impl_activemask();
 
   // Only the warp master can restore the stack and frame information, and only
   // if there are no other threads left behind in this environment (i.e. the
@@ -377,7 +378,7 @@
   // Frame pointer must be visible to all workers in the same warp.
   const unsigned WID = getWarpId();
   void *FrameP = 0;
-  int32_t CurActive = __ACTIVEMASK();
+  int32_t CurActive = __kmpc_impl_activemask();
 
   if (IsWarpMaster) {
     // SlotP will point to either the shared memory slot or an existing
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu
@@ -389,7 +389,7 @@
   }
 
   INLINE static uint64_t NextIter() {
-    __kmpc_impl_lanemask_t active = __ACTIVEMASK();
+    __kmpc_impl_lanemask_t active = __kmpc_impl_activemask();
     int leader = __kmpc_impl_ffs(active) - 1;
     int change = __kmpc_impl_popc(active);
     __kmpc_impl_lanemask_t lane_mask_lt = __kmpc_impl_lanemask_lt();
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -47,20 +47,17 @@
 
 // Macros for Cuda intrinsics
 // In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
-// Also, __ballot(1) in Cuda 8.0 is replaced with __activemask().
 #ifndef CUDA_VERSION
 #error CUDA_VERSION macro is undefined, something wrong with cuda.
 #elif CUDA_VERSION >= 9000
 #define __SHFL_SYNC(mask, var, srcLane) __shfl_sync((mask), (var), (srcLane))
 #define __SHFL_DOWN_SYNC(mask, var, delta, width)                              \
   __shfl_down_sync((mask), (var), (delta), (width))
-#define __ACTIVEMASK() __activemask()
 #define __SYNCWARP(Mask) __syncwarp(Mask)
 #else
 #define __SHFL_SYNC(mask, var, srcLane) __shfl((var), (srcLane))
 #define __SHFL_DOWN_SYNC(mask, var, delta, width)                              \
   __shfl_down((var), (delta), (width))
-#define __ACTIVEMASK() __ballot(1)
 // In Cuda < 9.0 no need to sync threads in warps.
 #define __SYNCWARP(Mask)
 #endif // CUDA_VERSION
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
@@ -15,6 +15,7 @@
 #include <stdio.h>
 
 #include "omptarget-nvptx.h"
+#include "target_impl.h"
 
 EXTERN
 void __kmpc_nvptx_end_reduce(int32_t global_tid) {}
@@ -64,11 +65,11 @@
   uint32_t size, remote_id, physical_lane_id;
   physical_lane_id = GetThreadIdInBlock() % WARPSIZE;
   asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
-  uint32_t Liveness = __ACTIVEMASK();
+  uint32_t Liveness = __kmpc_impl_activemask();
   uint32_t logical_lane_id = __popc(Liveness & lanemask_lt) * 2;
   asm("mov.u32 %0, %%lanemask_gt;" : "=r"(lanemask_gt));
   do {
-    Liveness = __ACTIVEMASK();
+    Liveness = __kmpc_impl_activemask();
     remote_id = __ffs(Liveness & lanemask_gt);
     size = __popc(Liveness);
     logical_lane_id /= 2;
@@ -83,7 +84,7 @@
                                         size_t reduce_size, void *reduce_data,
                                         kmp_ShuffleReductFctPtr shflFct,
                                         kmp_InterWarpCopyFctPtr cpyFct) {
-  uint32_t Liveness = __ACTIVEMASK();
+  uint32_t Liveness = __kmpc_impl_activemask();
   if (Liveness == 0xffffffff) {
     gpu_regular_warp_reduce(reduce_data, shflFct);
     return GetThreadIdInBlock() % WARPSIZE ==
@@ -144,7 +145,7 @@
   }
   return BlockThreadId == 0;
 #else
-  uint32_t Liveness = __ACTIVEMASK();
+  uint32_t Liveness = __kmpc_impl_activemask();
   if (Liveness == 0xffffffff) // Full warp
     gpu_regular_warp_reduce(reduce_data, shflFct);
   else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
@@ -319,7 +320,7 @@
     ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1);
 
   // Reduce across warps to the warp master.
-  uint32_t Liveness = __ACTIVEMASK();
+  uint32_t Liveness = __kmpc_impl_activemask();
   if (Liveness == 0xffffffff) // Full warp
     gpu_regular_warp_reduce(reduce_data, shflFct);
   else // Partial warp but contiguous lanes
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -14,6 +14,8 @@
 // Execution Parameters
 ////////////////////////////////////////////////////////////////////////////////
 
+#include "target_impl.h"
+
 INLINE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode) {
   execution_param = EMode;
   execution_param |= RMode;
@@ -202,7 +204,7 @@
 // Parallel level
 
 INLINE void IncParallelLevel(bool ActiveParallel) {
-  unsigned Active = __ACTIVEMASK();
+  unsigned Active = __kmpc_impl_activemask();
   __SYNCWARP(Active);
   unsigned LaneMaskLt;
   asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt));
@@ -216,7 +218,7 @@
 }
 
 INLINE void DecParallelLevel(bool ActiveParallel) {
-  unsigned Active = __ACTIVEMASK();
+  unsigned Active = __kmpc_impl_activemask();
   __SYNCWARP(Active);
   unsigned LaneMaskLt;
   asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt));
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu
@@ -143,7 +143,7 @@
 
 EXTERN int32_t __kmpc_warp_active_thread_mask() {
   PRINT0(LD_IO, "call __kmpc_warp_active_thread_mask\n");
-  return __ACTIVEMASK();
+  return __kmpc_impl_activemask();
 }
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -38,6 +38,19 @@
 
 INLINE int __kmpc_impl_popc(uint32_t x) { return __popc(x); }
 
+#ifndef CUDA_VERSION
+#error CUDA_VERSION macro is undefined, something wrong with cuda.
+#endif
+
+// __ballot(1) in Cuda 8.0 is replaced with __activemask().
+INLINE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
+#if CUDA_VERSION >= 9000
+  return __activemask();
+#else
+  return __ballot(1);
+#endif
+}
+
 INLINE void __kmpc_impl_syncwarp(int32_t Mask) { __SYNCWARP(Mask); }
 
 #endif