diff --git a/openmp/libomptarget/deviceRTLs/common/debug.h b/openmp/libomptarget/deviceRTLs/common/debug.h
--- a/openmp/libomptarget/deviceRTLs/common/debug.h
+++ b/openmp/libomptarget/deviceRTLs/common/debug.h
@@ -29,6 +29,9 @@
 #define _OMPTARGET_NVPTX_DEBUG_H_
 
 #include "common/device_environment.h"
+#include "target_interface.h"
+
+#include <cassert>
 
 ////////////////////////////////////////////////////////////////////////////////
 // set desired level of debugging
diff --git a/openmp/libomptarget/deviceRTLs/common/device_environment.h b/openmp/libomptarget/deviceRTLs/common/device_environment.h
--- a/openmp/libomptarget/deviceRTLs/common/device_environment.h
+++ b/openmp/libomptarget/deviceRTLs/common/device_environment.h
@@ -13,7 +13,7 @@
 #ifndef _OMPTARGET_DEVICE_ENVIRONMENT_H_
 #define _OMPTARGET_DEVICE_ENVIRONMENT_H_
 
-#include "target_impl.h"
+#include "interface.h"
 
 struct omptarget_device_environmentTy {
   int32_t debug_level;
diff --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h b/openmp/libomptarget/deviceRTLs/common/omptarget.h
--- a/openmp/libomptarget/deviceRTLs/common/omptarget.h
+++ b/openmp/libomptarget/deviceRTLs/common/omptarget.h
@@ -14,11 +14,10 @@
 #ifndef OMPTARGET_H
 #define OMPTARGET_H
 
-#include "target_impl.h"
-#include "common/debug.h"     // debug
-#include "interface.h" // interfaces with omp, compiler, and user
+#include "common/debug.h" // debug
 #include "common/state-queue.h"
 #include "common/support.h"
+#include "interface.h" // interfaces with omp, compiler, and user
 
 #define OMPTARGET_NVPTX_VERSION 1.1
 
diff --git a/openmp/libomptarget/deviceRTLs/common/src/cancel.cu b/openmp/libomptarget/deviceRTLs/common/src/cancel.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/cancel.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/cancel.cu
@@ -11,8 +11,9 @@
 //===----------------------------------------------------------------------===//
 #pragma omp declare target
 
-#include "interface.h"
 #include "common/debug.h"
+#include "interface.h"
+#include "target_interface.h"
 
 EXTERN int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid,
                                         int32_t cancelVal) {
diff --git a/openmp/libomptarget/deviceRTLs/common/src/critical.cu b/openmp/libomptarget/deviceRTLs/common/src/critical.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/critical.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/critical.cu
@@ -11,8 +11,8 @@
 //===----------------------------------------------------------------------===//
 #pragma omp declare target
 
-#include "interface.h"
 #include "common/debug.h"
+#include "interface.h"
 
 EXTERN
 void __kmpc_critical(kmp_Ident *loc, int32_t global_tid,
diff --git a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
@@ -12,7 +12,6 @@
 #pragma omp declare target
 
 #include "common/omptarget.h"
-#include "target_impl.h"
 
 // Return true if this is the master thread.
 INLINE static bool IsMasterThread(bool isSPMDExecutionMode) {
diff --git a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
@@ -13,7 +13,6 @@
 #pragma omp declare target
 
 #include "common/omptarget.h"
-#include "target_impl.h"
 
 EXTERN double omp_get_wtick(void) {
   double rc = __kmpc_impl_get_wtick();
diff --git a/openmp/libomptarget/deviceRTLs/common/src/loop.cu b/openmp/libomptarget/deviceRTLs/common/src/loop.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/loop.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/loop.cu
@@ -14,7 +14,6 @@
 #pragma omp declare target
 
 #include "common/omptarget.h"
-#include "target_impl.h"
 
 ////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu
@@ -11,8 +11,8 @@
 //===----------------------------------------------------------------------===//
 #pragma omp declare target
 
-#include "common/omptarget.h"
 #include "common/device_environment.h"
+#include "common/omptarget.h"
 
 ////////////////////////////////////////////////////////////////////////////////
 // global device environment
diff --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
@@ -12,7 +12,6 @@
 #pragma omp declare target
 
 #include "common/omptarget.h"
-#include "target_impl.h"
 
 ////////////////////////////////////////////////////////////////////////////////
 // global data tables
diff --git a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
@@ -34,7 +34,6 @@
 #pragma omp declare target
 
 #include "common/omptarget.h"
-#include "target_impl.h"
 
 ////////////////////////////////////////////////////////////////////////////////
 // support for parallel that goes parallel (1 static level only)
diff --git a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
@@ -12,7 +12,6 @@
 #pragma omp declare target
 
 #include "common/omptarget.h"
-#include "target_impl.h"
 
 EXTERN
 void __kmpc_nvptx_end_reduce(int32_t global_tid) {}
diff --git a/openmp/libomptarget/deviceRTLs/common/src/support.cu b/openmp/libomptarget/deviceRTLs/common/src/support.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/support.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/support.cu
@@ -11,9 +11,9 @@
 //===----------------------------------------------------------------------===//
 #pragma omp declare target
 
-#include "common/support.h"
 #include "common/debug.h"
 #include "common/omptarget.h"
+#include "common/support.h"
 
 ////////////////////////////////////////////////////////////////////////////////
 // Execution Parameters
diff --git a/openmp/libomptarget/deviceRTLs/common/src/sync.cu b/openmp/libomptarget/deviceRTLs/common/src/sync.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/sync.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/sync.cu
@@ -12,7 +12,6 @@
 #pragma omp declare target
 
 #include "common/omptarget.h"
-#include "target_impl.h"
 
 ////////////////////////////////////////////////////////////////////////////////
 // KMP Ordered calls
diff --git a/openmp/libomptarget/deviceRTLs/common/state-queue.h b/openmp/libomptarget/deviceRTLs/common/state-queue.h
--- a/openmp/libomptarget/deviceRTLs/common/state-queue.h
+++ b/openmp/libomptarget/deviceRTLs/common/state-queue.h
@@ -19,9 +19,7 @@
 #ifndef __STATE_QUEUE_H
 #define __STATE_QUEUE_H
 
-#include <stdint.h>
-
-#include "target_impl.h"
+#include "interface.h"
 
 template <typename ElementType, uint32_t SIZE> class omptarget_nvptx_Queue {
 private:
diff --git a/openmp/libomptarget/deviceRTLs/common/support.h b/openmp/libomptarget/deviceRTLs/common/support.h
--- a/openmp/libomptarget/deviceRTLs/common/support.h
+++ b/openmp/libomptarget/deviceRTLs/common/support.h
@@ -14,7 +14,6 @@
 #define OMPTARGET_SUPPORT_H
 
 #include "interface.h"
-#include "target_impl.h"
 
 ////////////////////////////////////////////////////////////////////////////////
 // Execution Parameters
diff --git a/openmp/libomptarget/deviceRTLs/interface.h b/openmp/libomptarget/deviceRTLs/interface.h
--- a/openmp/libomptarget/deviceRTLs/interface.h
+++ b/openmp/libomptarget/deviceRTLs/interface.h
@@ -16,9 +16,6 @@
 #ifndef _INTERFACES_H_
 #define _INTERFACES_H_
 
-#include <stddef.h>
-#include <stdint.h>
-
 #ifdef __AMDGCN__
 #include "amdgcn/src/amdgcn_interface.h"
 #endif
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h b/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h
@@ -9,10 +9,73 @@
 #ifndef _NVPTX_INTERFACE_H_
 #define _NVPTX_INTERFACE_H_
 
-#include <stdint.h>
+#define DEVICE __device__
+#define EXTERN extern "C" DEVICE
+#define INLINE __forceinline__ DEVICE
+#define NOINLINE __noinline__ DEVICE
+#define SHARED __shared__
+#define ALIGN(N) __align__(N)
+
+////////////////////////////////////////////////////////////////////////////////
+// Kernel options
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+// The following def must match the absolute limit hardwired in the host RTL
+// max number of threads per team
+#define MAX_THREADS_PER_TEAM 1024
+
+#define WARPSIZE 32
+
+// Maximum number of preallocated arguments to an outlined parallel/simd
+// function. Anything more requires dynamic memory allocation.
+#define MAX_SHARED_ARGS 20
+
+// Maximum number of omp state objects per SM allocated statically in global
+// memory.
+#if __CUDA_ARCH__ >= 600
+#define OMP_STATE_COUNT 32
+#else
+#define OMP_STATE_COUNT 16
+#endif
+
+#if !defined(MAX_SM)
+#if __CUDA_ARCH__ >= 900
+#error unsupported compute capability, define MAX_SM via LIBOMPTARGET_NVPTX_MAX_SM cmake option
+#elif __CUDA_ARCH__ >= 800
+// GA100 design has a maxinum of 128 SMs but A100 product only has 108 SMs
+// GA102 design has a maxinum of 84 SMs
+#define MAX_SM 108
+#elif __CUDA_ARCH__ >= 700
+#define MAX_SM 84
+#elif __CUDA_ARCH__ >= 600
+#define MAX_SM 56
+#else
+#define MAX_SM 16
+#endif
+#endif
+
+#define OMP_ACTIVE_PARALLEL_LEVEL 128
+
+// Data sharing related quantities, need to match what is used in the compiler.
+enum DATA_SHARING_SIZES {
+  // The maximum number of workers in a kernel.
+  DS_Max_Worker_Threads = 992,
+  // The size reserved for data in a shared memory slot.
+  DS_Slot_Size = 256,
+  // The slot size that should be reserved for a working warp.
+  DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size,
+  // The maximum number of warps in use
+  DS_Max_Warp_Number = 32,
+  // The size of the preallocated shared memory buffer per team
+  DS_Shared_Memory_Size = 128,
+};
 
-#define EXTERN extern "C" __device__
 typedef uint32_t __kmpc_impl_lanemask_t;
 typedef uint32_t omp_lock_t; /* arbitrary type of the right length */
 
+enum : __kmpc_impl_lanemask_t {
+  __kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0
+};
+
 #endif
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
deleted file mode 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ /dev/null
@@ -1,157 +0,0 @@
-//===------------ target_impl.h - NVPTX OpenMP GPU options ------- CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Definitions of target specific functions
-//
-//===----------------------------------------------------------------------===//
-#ifndef _TARGET_IMPL_H_
-#define _TARGET_IMPL_H_
-
-#include <assert.h>
-#include <cuda.h>
-#include <inttypes.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "nvptx_interface.h"
-
-#define DEVICE __device__
-#define INLINE __forceinline__ DEVICE
-#define NOINLINE __noinline__ DEVICE
-#define SHARED __shared__
-#define ALIGN(N) __align__(N)
-
-////////////////////////////////////////////////////////////////////////////////
-// Kernel options
-////////////////////////////////////////////////////////////////////////////////
-
-////////////////////////////////////////////////////////////////////////////////
-// The following def must match the absolute limit hardwired in the host RTL
-// max number of threads per team
-#define MAX_THREADS_PER_TEAM 1024
-
-#define WARPSIZE 32
-
-// Maximum number of preallocated arguments to an outlined parallel/simd function.
-// Anything more requires dynamic memory allocation.
-#define MAX_SHARED_ARGS 20
-
-// Maximum number of omp state objects per SM allocated statically in global
-// memory.
-#if __CUDA_ARCH__ >= 600
-#define OMP_STATE_COUNT 32
-#else
-#define OMP_STATE_COUNT 16
-#endif
-
-#if !defined(MAX_SM)
-#if __CUDA_ARCH__ >= 900
-#error unsupported compute capability, define MAX_SM via LIBOMPTARGET_NVPTX_MAX_SM cmake option
-#elif __CUDA_ARCH__ >= 800
-// GA100 design has a maxinum of 128 SMs but A100 product only has 108 SMs
-// GA102 design has a maxinum of 84 SMs
-#define MAX_SM 108
-#elif __CUDA_ARCH__ >= 700
-#define MAX_SM 84
-#elif __CUDA_ARCH__ >= 600
-#define MAX_SM 56
-#else
-#define MAX_SM 16
-#endif
-#endif
-
-#define OMP_ACTIVE_PARALLEL_LEVEL 128
-
-// Data sharing related quantities, need to match what is used in the compiler.
-enum DATA_SHARING_SIZES {
-  // The maximum number of workers in a kernel.
-  DS_Max_Worker_Threads = 992,
-  // The size reserved for data in a shared memory slot.
-  DS_Slot_Size = 256,
-  // The slot size that should be reserved for a working warp.
-  DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size,
-  // The maximum number of warps in use
-  DS_Max_Warp_Number = 32,
-  // The size of the preallocated shared memory buffer per team
-  DS_Shared_Memory_Size = 128,
-};
-
-enum : __kmpc_impl_lanemask_t {
-  __kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0
-};
-
-DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi);
-DEVICE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi);
-DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt();
-DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt();
-DEVICE uint32_t __kmpc_impl_smid();
-DEVICE double __kmpc_impl_get_wtick();
-DEVICE double __kmpc_impl_get_wtime();
-
-INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __builtin_ffs(x); }
-INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __builtin_popcount(x); }
-
-#ifndef CUDA_VERSION
-#error CUDA_VERSION macro is undefined, something wrong with cuda.
-#endif
-
-DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask();
-
-DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
-                                     int32_t SrcLane);
-
-DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
-                                          int32_t Var, uint32_t Delta,
-                                          int32_t Width);
-
-DEVICE void __kmpc_impl_syncthreads();
-DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask);
-
-// NVPTX specific kernel initialization
-DEVICE void __kmpc_impl_target_init();
-
-// Barrier until num_threads arrive.
-DEVICE void __kmpc_impl_named_sync(uint32_t num_threads);
-
-DEVICE void __kmpc_impl_threadfence();
-DEVICE void __kmpc_impl_threadfence_block();
-DEVICE void __kmpc_impl_threadfence_system();
-
-// Calls to the NVPTX layer (assuming 1D layout)
-DEVICE int GetThreadIdInBlock();
-DEVICE int GetBlockIdInKernel();
-DEVICE int GetNumberOfBlocksInKernel();
-DEVICE int GetNumberOfThreadsInBlock();
-DEVICE unsigned GetWarpId();
-DEVICE unsigned GetLaneId();
-
-// Atomics
-DEVICE uint32_t __kmpc_atomic_add(uint32_t *, uint32_t);
-DEVICE uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t);
-DEVICE uint32_t __kmpc_atomic_max(uint32_t *, uint32_t);
-DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t);
-DEVICE uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t);
-
-static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
-DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *,
-                                                 unsigned long long);
-DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *,
-                                            unsigned long long);
-
-// Locks
-DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock);
-DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *lock);
-DEVICE void __kmpc_impl_set_lock(omp_lock_t *lock);
-DEVICE void __kmpc_impl_unset_lock(omp_lock_t *lock);
-DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock);
-
-// Memory
-DEVICE void *__kmpc_impl_malloc(size_t);
-DEVICE void __kmpc_impl_free(void *);
-
-#endif
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -11,8 +11,8 @@
 //===----------------------------------------------------------------------===//
 #pragma omp declare target
 
-#include "target_impl.h"
 #include "common/debug.h"
+#include "nvptx_interface.h"
 
 #include <cuda.h>
 
@@ -33,6 +33,10 @@
 void __threadfence_system();
 }
 
+DEVICE uint32_t __kmpc_impl_ffs(uint32_t x) { return __builtin_ffs(x); }
+
+DEVICE uint32_t __kmpc_impl_popc(uint32_t x) { return __builtin_popcount(x); }
+
 DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
   asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
 }
diff --git a/openmp/libomptarget/deviceRTLs/target_interface.h b/openmp/libomptarget/deviceRTLs/target_interface.h
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/deviceRTLs/target_interface.h
@@ -0,0 +1,82 @@
+//===------------- target_interface.h - Target interfaces --------- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains interfaces that must be implemented by each target.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _OMPTARGET_TARGET_INTERFACE_H_
+#define _OMPTARGET_TARGET_INTERFACE_H_
+
+#include "interface.h"
+
+// Calls to the NVPTX layer (assuming 1D layout)
+EXTERN int GetThreadIdInBlock();
+EXTERN int GetBlockIdInKernel();
+EXTERN int GetNumberOfBlocksInKernel();
+EXTERN int GetNumberOfThreadsInBlock();
+EXTERN unsigned GetWarpId();
+EXTERN unsigned GetLaneId();
+
+// Atomics
+extern DEVICE uint32_t __kmpc_atomic_add(uint32_t *, uint32_t);
+extern DEVICE uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t);
+extern DEVICE uint32_t __kmpc_atomic_max(uint32_t *, uint32_t);
+extern DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t);
+extern DEVICE uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t);
+static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
+extern DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *,
+                                                        unsigned long long);
+extern DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *,
+                                                   unsigned long long);
+
+// Locks
+EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock);
+EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock);
+EXTERN void __kmpc_impl_set_lock(omp_lock_t *lock);
+EXTERN void __kmpc_impl_unset_lock(omp_lock_t *lock);
+EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock);
+
+EXTERN void __kmpc_impl_threadfence();
+EXTERN void __kmpc_impl_threadfence_block();
+EXTERN void __kmpc_impl_threadfence_system();
+
+EXTERN double __kmpc_impl_get_wtick();
+EXTERN double __kmpc_impl_get_wtime();
+
+EXTERN void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi);
+EXTERN uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi);
+EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt();
+EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt();
+EXTERN uint32_t __kmpc_impl_smid();
+
+EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask();
+
+EXTERN int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
+                                     int32_t SrcLane);
+EXTERN int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
+                                          int32_t Var, uint32_t Delta,
+                                          int32_t Width);
+
+EXTERN uint32_t __kmpc_impl_ffs(uint32_t x);
+EXTERN uint32_t __kmpc_impl_popc(uint32_t x);
+
+EXTERN void __kmpc_impl_syncthreads();
+EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask);
+
+// Kernel initialization
+EXTERN void __kmpc_impl_target_init();
+
+// Memory
+EXTERN void *__kmpc_impl_malloc(size_t);
+EXTERN void __kmpc_impl_free(void *);
+
+// Barrier until num_threads arrive.
+EXTERN void __kmpc_impl_named_sync(uint32_t num_threads);
+
+#endif // _OMPTARGET_TARGET_INTERFACE_H_