diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -81,48 +81,17 @@
   DS_Shared_Memory_Size = 128,
 };
 
-INLINE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
-  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
-}
-
-INLINE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
-  uint64_t val;
-  asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
-  return val;
-}
-
 enum : __kmpc_impl_lanemask_t {
   __kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0
 };
 
-INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
-  __kmpc_impl_lanemask_t res;
-  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(res));
-  return res;
-}
-
-INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() {
-  __kmpc_impl_lanemask_t res;
-  asm("mov.u32 %0, %%lanemask_gt;" : "=r"(res));
-  return res;
-}
-
-INLINE uint32_t __kmpc_impl_smid() {
-  uint32_t id;
-  asm("mov.u32 %0, %%smid;" : "=r"(id));
-  return id;
-}
-
-INLINE double __kmpc_impl_get_wtick() {
-  // Timer precision is 1ns
-  return ((double)1E-9);
-}
-
-INLINE double __kmpc_impl_get_wtime() {
-  unsigned long long nsecs;
-  asm("mov.u64 %0, %%globaltimer;" : "=l"(nsecs));
-  return (double)nsecs * __kmpc_impl_get_wtick();
-}
+DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi);
+DEVICE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi);
+DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt();
+DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt();
+DEVICE uint32_t __kmpc_impl_smid();
+DEVICE double __kmpc_impl_get_wtick();
+DEVICE double __kmpc_impl_get_wtime();
 
 INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __ffs(x); }
 
@@ -136,90 +105,45 @@
 #error CUDA_VERSION macro is undefined, something wrong with cuda.
 #endif
 
-// In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask().
+DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask();
 
-INLINE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
-#if CUDA_VERSION >= 9000
-  return __activemask();
-#else
-  return __ballot(1);
-#endif
-}
-
-// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
+DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
+                                     int32_t SrcLane);
 
-INLINE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
-                                     int32_t SrcLane) {
-#if CUDA_VERSION >= 9000
-  return __shfl_sync(Mask, Var, SrcLane);
-#else
-  return __shfl(Var, SrcLane);
-#endif // CUDA_VERSION
-}
-
-INLINE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
+DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
                                           int32_t Var, uint32_t Delta,
-                                          int32_t Width) {
-#if CUDA_VERSION >= 9000
-  return __shfl_down_sync(Mask, Var, Delta, Width);
-#else
-  return __shfl_down(Var, Delta, Width);
-#endif // CUDA_VERSION
-}
+                                          int32_t Width);
 
-INLINE void __kmpc_impl_syncthreads() {
-  // Use original __syncthreads if compiled by nvcc or clang >= 9.0.
-#if !defined(__clang__) || __clang_major__ >= 9
-  __syncthreads();
-#else
-  asm volatile("bar.sync %0;" : : "r"(0) : "memory");
-#endif // __clang__
-}
-
-INLINE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
-#if CUDA_VERSION >= 9000
-  __syncwarp(Mask);
-#else
-  // In Cuda < 9.0 no need to sync threads in warps.
-#endif // CUDA_VERSION
-}
+DEVICE void __kmpc_impl_syncthreads();
+DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask);
 
 // NVPTX specific kernel initialization
-INLINE void __kmpc_impl_target_init() { /* nvptx needs no extra setup */
-}
+DEVICE void __kmpc_impl_target_init();
 
 // Barrier until num_threads arrive.
-INLINE void __kmpc_impl_named_sync(uint32_t num_threads) {
-  // The named barrier for active parallel threads of a team in an L1 parallel
-  // region to synchronize with each other.
-  int barrier = 1;
-  asm volatile("bar.sync %0, %1;"
-               :
-               : "r"(barrier), "r"(num_threads)
-               : "memory");
-}
+DEVICE void __kmpc_impl_named_sync(uint32_t num_threads);
 
-INLINE void __kmpc_impl_threadfence(void) { __threadfence(); }
-INLINE void __kmpc_impl_threadfence_block(void) { __threadfence_block(); }
-INLINE void __kmpc_impl_threadfence_system(void) { __threadfence_system(); }
+DEVICE void __kmpc_impl_threadfence();
+DEVICE void __kmpc_impl_threadfence_block();
+DEVICE void __kmpc_impl_threadfence_system();
 
 // Calls to the NVPTX layer (assuming 1D layout)
-INLINE int GetThreadIdInBlock() { return threadIdx.x; }
-INLINE int GetBlockIdInKernel() { return blockIdx.x; }
-INLINE int GetNumberOfBlocksInKernel() { return gridDim.x; }
-INLINE int GetNumberOfThreadsInBlock() { return blockDim.x; }
-INLINE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
-INLINE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }
+DEVICE int GetThreadIdInBlock();
+DEVICE int GetBlockIdInKernel();
+DEVICE int GetNumberOfBlocksInKernel();
+DEVICE int GetNumberOfThreadsInBlock();
+DEVICE unsigned GetWarpId();
+DEVICE unsigned GetLaneId();
 
 // Locks
-EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock);
-EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock);
-EXTERN void __kmpc_impl_set_lock(omp_lock_t *lock);
-EXTERN void __kmpc_impl_unset_lock(omp_lock_t *lock);
-EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock);
+DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock);
+DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *lock);
+DEVICE void __kmpc_impl_set_lock(omp_lock_t *lock);
+DEVICE void __kmpc_impl_unset_lock(omp_lock_t *lock);
+DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock);
 
 // Memory
-INLINE void *__kmpc_impl_malloc(size_t x) { return malloc(x); }
-INLINE void __kmpc_impl_free(void *x) { free(x); }
+DEVICE void *__kmpc_impl_malloc(size_t);
+DEVICE void __kmpc_impl_free(void *);
 
 #endif
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -14,19 +14,135 @@
 #include "common/debug.h"
 #include "common/target_atomic.h"
 
+#include <cuda.h>
+
+DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
+  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
+}
+
+DEVICE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
+  uint64_t val;
+  asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
+  return val;
+}
+
+DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
+  __kmpc_impl_lanemask_t res;
+  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(res));
+  return res;
+}
+
+DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() {
+  __kmpc_impl_lanemask_t res;
+  asm("mov.u32 %0, %%lanemask_gt;" : "=r"(res));
+  return res;
+}
+
+DEVICE uint32_t __kmpc_impl_smid() {
+  uint32_t id;
+  asm("mov.u32 %0, %%smid;" : "=r"(id));
+  return id;
+}
+
+DEVICE double __kmpc_impl_get_wtick() {
+  // Timer precision is 1ns
+  return ((double)1E-9);
+}
+
+DEVICE double __kmpc_impl_get_wtime() {
+  unsigned long long nsecs;
+  asm("mov.u64 %0, %%globaltimer;" : "=l"(nsecs));
+  return (double)nsecs * __kmpc_impl_get_wtick();
+}
+
+// In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask().
+
+DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
+#if CUDA_VERSION >= 9000
+  return __activemask();
+#else
+  return __ballot(1);
+#endif
+}
+
+// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
+
+DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
+                                     int32_t SrcLane) {
+#if CUDA_VERSION >= 9000
+  return __shfl_sync(Mask, Var, SrcLane);
+#else
+  return __shfl(Var, SrcLane);
+#endif // CUDA_VERSION
+}
+
+DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
+                                          int32_t Var, uint32_t Delta,
+                                          int32_t Width) {
+#if CUDA_VERSION >= 9000
+  return __shfl_down_sync(Mask, Var, Delta, Width);
+#else
+  return __shfl_down(Var, Delta, Width);
+#endif // CUDA_VERSION
+}
+
+DEVICE void __kmpc_impl_syncthreads() {
+  // Use original __syncthreads if compiled by nvcc or clang >= 9.0.
+#if !defined(__clang__) || __clang_major__ >= 9
+  __syncthreads();
+#else
+  asm volatile("bar.sync %0;" : : "r"(0) : "memory");
+#endif // __clang__
+}
+
+DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
+#if CUDA_VERSION >= 9000
+  __syncwarp(Mask);
+#else
+  // In Cuda < 9.0 no need to sync threads in warps.
+#endif // CUDA_VERSION
+}
+
+// NVPTX specific kernel initialization
+DEVICE void __kmpc_impl_target_init() { /* nvptx needs no extra setup */
+}
+
+// Barrier until num_threads arrive.
+DEVICE void __kmpc_impl_named_sync(uint32_t num_threads) {
+  // The named barrier for active parallel threads of a team in an L1 parallel
+  // region to synchronize with each other.
+  int barrier = 1;
+  asm volatile("bar.sync %0, %1;"
+               :
+               : "r"(barrier), "r"(num_threads)
+               : "memory");
+}
+
+DEVICE void __kmpc_impl_threadfence() { __threadfence(); }
+DEVICE void __kmpc_impl_threadfence_block() { __threadfence_block(); }
+DEVICE void __kmpc_impl_threadfence_system() { __threadfence_system(); }
+
+// Calls to the NVPTX layer (assuming 1D layout)
+DEVICE int GetThreadIdInBlock() { return threadIdx.x; }
+DEVICE int GetBlockIdInKernel() { return blockIdx.x; }
+DEVICE int GetNumberOfBlocksInKernel() { return gridDim.x; }
+DEVICE int GetNumberOfThreadsInBlock() { return blockDim.x; }
+DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
+DEVICE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }
+
 #define __OMP_SPIN 1000
 #define UNSET 0u
 #define SET 1u
 
-EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock) {
+DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock) {
   __kmpc_impl_unset_lock(lock);
 }
 
-EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock) {
+DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *lock) {
   __kmpc_impl_unset_lock(lock);
 }
 
-EXTERN void __kmpc_impl_set_lock(omp_lock_t *lock) {
+DEVICE void __kmpc_impl_set_lock(omp_lock_t *lock) {
   // TODO: not sure spinning is a good idea here..
   while (__kmpc_atomic_cas(lock, UNSET, SET) != UNSET) {
     clock_t start = clock();
@@ -41,10 +157,13 @@
   } // wait for 0 to be the read value
 }
 
-EXTERN void __kmpc_impl_unset_lock(omp_lock_t *lock) {
+DEVICE void __kmpc_impl_unset_lock(omp_lock_t *lock) {
   (void)__kmpc_atomic_exchange(lock, UNSET);
 }
 
-EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock) {
+DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock) {
   return __kmpc_atomic_add(lock, 0u);
 }
+
+DEVICE void *__kmpc_impl_malloc(size_t x) { return malloc(x); }
+DEVICE void __kmpc_impl_free(void *x) { free(x); }
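
The patch turns the header's INLINE definitions into DEVICE declarations and moves the bodies into target_impl.cu, so the definitions are no longer visible in every translation unit that includes target_impl.h; cross-TU calls to these __device__ functions then rely on relocatable device code (nvcc -rdc=true, or clang's -fgpu-rdc). Below is a minimal sketch of the same declaration/definition split; demo_pack, demo_kernel, and the local DEVICE macro are hypothetical stand-ins, not part of this patch, and the header and source halves are compressed into one file so the sketch compiles as-is with nvcc.

// Hypothetical demo (not from the patch): the header/source split in one file.
#include <cstdint>
#include <cstdio>

#define DEVICE __device__ // stand-in for the deviceRTL's DEVICE macro

// What stays in the header: a declaration only.
DEVICE uint64_t demo_pack(uint32_t lo, uint32_t hi);

// A caller that compiles against the declaration alone.
__global__ void demo_kernel(uint64_t *out) { *out = demo_pack(1u, 2u); }

// What moves into the .cu file: the out-of-line definition
// (the same PTX idiom as __kmpc_impl_pack above).
DEVICE uint64_t demo_pack(uint32_t lo, uint32_t hi) {
  uint64_t val;
  asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
  return val;
}

int main() {
  uint64_t *d = nullptr, h = 0;
  cudaMalloc(&d, sizeof(uint64_t));
  demo_kernel<<<1, 1>>>(d);
  cudaMemcpy(&h, d, sizeof(uint64_t), cudaMemcpyDeviceToHost);
  printf("0x%llx\n", (unsigned long long)h); // 0x200000001: hi lands in the top word
  cudaFree(d);
  return 0;
}

When the two halves really live in separate translation units, as they do after this patch, it is the device-side link step that resolves the call, which is why the deviceRTL build must use separate compilation for device code.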