diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
--- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
+++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
@@ -55,6 +55,7 @@
       src/omptarget-nvptx.cu
       src/parallel.cu
       src/reduction.cu
+      src/support.cu
       src/sync.cu
       src/task.cu
   )
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -385,6 +385,5 @@
 ////////////////////////////////////////////////////////////////////////////////
 
 #include "omptarget-nvptxi.h"
-#include "supporti.h"
 
 #endif
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/support.h b/openmp/libomptarget/deviceRTLs/nvptx/src/support.h
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/support.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/support.h
@@ -10,7 +10,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#ifndef OMPTARGET_SUPPORT_H
+#define OMPTARGET_SUPPORT_H
+
+#include "interface.h"
 #include "target_impl.h"
+
 ////////////////////////////////////////////////////////////////////////////////
 // Execution Parameters
 ////////////////////////////////////////////////////////////////////////////////
@@ -26,58 +31,67 @@
   RuntimeMask = 0x02u,
 };
 
-INLINE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode);
-INLINE bool isGenericMode();
-INLINE bool isSPMDMode();
-INLINE bool isRuntimeUninitialized();
-INLINE bool isRuntimeInitialized();
+DEVICE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode);
+DEVICE bool isGenericMode();
+DEVICE bool isSPMDMode();
+DEVICE bool isRuntimeUninitialized();
+DEVICE bool isRuntimeInitialized();
+
+////////////////////////////////////////////////////////////////////////////////
+// Execution Modes based on location parameter fields
+////////////////////////////////////////////////////////////////////////////////
+
+DEVICE bool checkSPMDMode(kmp_Ident *loc);
+DEVICE bool checkGenericMode(kmp_Ident *loc);
+DEVICE bool checkRuntimeUninitialized(kmp_Ident *loc);
+DEVICE bool checkRuntimeInitialized(kmp_Ident *loc);
 
 ////////////////////////////////////////////////////////////////////////////////
 // get info from machine
 ////////////////////////////////////////////////////////////////////////////////
 
 // get low level ids of resources
-INLINE int GetThreadIdInBlock();
-INLINE int GetBlockIdInKernel();
-INLINE int GetNumberOfBlocksInKernel();
-INLINE int GetNumberOfThreadsInBlock();
-INLINE unsigned GetWarpId();
-INLINE unsigned GetLaneId();
+DEVICE int GetThreadIdInBlock();
+DEVICE int GetBlockIdInKernel();
+DEVICE int GetNumberOfBlocksInKernel();
+DEVICE int GetNumberOfThreadsInBlock();
+DEVICE unsigned GetWarpId();
+DEVICE unsigned GetLaneId();
 
 // get global ids to locate tread/team info (constant regardless of OMP)
-INLINE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode);
-INLINE int GetMasterThreadID();
-INLINE int GetNumberOfWorkersInTeam();
+DEVICE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode);
+DEVICE int GetMasterThreadID();
+DEVICE int GetNumberOfWorkersInTeam();
 
 // get OpenMP thread and team ids
-INLINE int GetOmpThreadId(int threadId,
+DEVICE int GetOmpThreadId(int threadId,
                           bool isSPMDExecutionMode);    // omp_thread_num
-INLINE int GetOmpTeamId();                              // omp_team_num
+DEVICE int GetOmpTeamId();                              // omp_team_num
 
 // get OpenMP number of threads and team
-INLINE int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads
-INLINE int GetNumberOfOmpTeams();                           // omp_num_teams
+DEVICE int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads
+DEVICE int GetNumberOfOmpTeams();                           // omp_num_teams
 
 // get OpenMP number of procs
-INLINE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode);
-INLINE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode);
+DEVICE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode);
+DEVICE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode);
 
 // masters
-INLINE int IsTeamMaster(int ompThreadId);
+DEVICE int IsTeamMaster(int ompThreadId);
 
 // Parallel level
-INLINE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
-INLINE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
+DEVICE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
+DEVICE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
 
 ////////////////////////////////////////////////////////////////////////////////
 // Memory
 ////////////////////////////////////////////////////////////////////////////////
 
 // safe alloc and free
-INLINE void *SafeMalloc(size_t size, const char *msg); // check if success
-INLINE void *SafeFree(void *ptr, const char *msg);
+DEVICE void *SafeMalloc(size_t size, const char *msg); // check if success
+DEVICE void *SafeFree(void *ptr, const char *msg);
 // pad to a alignment (power of 2 only)
-INLINE unsigned long PadBytes(unsigned long size, unsigned long alignment);
+DEVICE unsigned long PadBytes(unsigned long size, unsigned long alignment);
 #define ADD_BYTES(_addr, _bytes)                                               \
   ((void *)((char *)((void *)(_addr)) + (_bytes)))
 #define SUB_BYTES(_addr, _bytes)                                               \
@@ -86,6 +100,8 @@
 ////////////////////////////////////////////////////////////////////////////////
 // Teams Reduction Scratchpad Helpers
 ////////////////////////////////////////////////////////////////////////////////
-INLINE unsigned int *GetTeamsReductionTimestamp();
-INLINE char *GetTeamsReductionScratchpad();
-INLINE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr);
+DEVICE unsigned int *GetTeamsReductionTimestamp();
+DEVICE char *GetTeamsReductionScratchpad();
+DEVICE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr);
+
+#endif
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h b/openmp/libomptarget/deviceRTLs/nvptx/src/support.cu
rename from openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
rename to openmp/libomptarget/deviceRTLs/nvptx/src/support.cu
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/support.cu
@@ -1,4 +1,4 @@
-//===--------- supporti.h - NVPTX OpenMP support functions ------- CUDA -*-===//
+//===--------- support.cu - NVPTX OpenMP support functions ------- CUDA -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -10,26 +10,28 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "support.h"
+#include "debug.h"
+#include "omptarget-nvptx.h"
+
 ////////////////////////////////////////////////////////////////////////////////
 // Execution Parameters
 ////////////////////////////////////////////////////////////////////////////////
 
-#include "target_impl.h"
-
-INLINE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode) {
+DEVICE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode) {
   execution_param = EMode;
   execution_param |= RMode;
 }
 
-INLINE bool isGenericMode() { return (execution_param & ModeMask) == Generic; }
+DEVICE bool isGenericMode() { return (execution_param & ModeMask) == Generic; }
 
-INLINE bool isSPMDMode() { return (execution_param & ModeMask) == Spmd; }
+DEVICE bool isSPMDMode() { return (execution_param & ModeMask) == Spmd; }
 
-INLINE bool isRuntimeUninitialized() {
+DEVICE bool isRuntimeUninitialized() {
   return (execution_param & RuntimeMask) == RuntimeUninitialized;
 }
 
-INLINE bool isRuntimeInitialized() {
+DEVICE bool isRuntimeInitialized() {
   return (execution_param & RuntimeMask) == RuntimeInitialized;
 }
 
@@ -37,7 +39,7 @@
 // Execution Modes based on location parameter fields
 ////////////////////////////////////////////////////////////////////////////////
 
-INLINE bool checkSPMDMode(kmp_Ident *loc) {
+DEVICE bool checkSPMDMode(kmp_Ident *loc) {
   if (!loc)
     return isSPMDMode();
 
@@ -55,11 +57,11 @@
   return isSPMDMode();
 }
 
-INLINE bool checkGenericMode(kmp_Ident *loc) {
+DEVICE bool checkGenericMode(kmp_Ident *loc) {
   return !checkSPMDMode(loc);
 }
 
-INLINE bool checkRuntimeUninitialized(kmp_Ident *loc) {
+DEVICE bool checkRuntimeUninitialized(kmp_Ident *loc) {
   if (!loc)
     return isRuntimeUninitialized();
 
@@ -82,7 +84,7 @@
   return isRuntimeUninitialized();
 }
 
-INLINE bool checkRuntimeInitialized(kmp_Ident *loc) {
+DEVICE bool checkRuntimeInitialized(kmp_Ident *loc) {
   return !checkRuntimeUninitialized(loc);
 }
 
@@ -96,17 +98,17 @@
 //
 ////////////////////////////////////////////////////////////////////////////////
 
-INLINE int GetThreadIdInBlock() { return threadIdx.x; }
+DEVICE int GetThreadIdInBlock() { return threadIdx.x; }
 
-INLINE int GetBlockIdInKernel() { return blockIdx.x; }
+DEVICE int GetBlockIdInKernel() { return blockIdx.x; }
 
-INLINE int GetNumberOfBlocksInKernel() { return gridDim.x; }
+DEVICE int GetNumberOfBlocksInKernel() { return gridDim.x; }
 
-INLINE int GetNumberOfThreadsInBlock() { return blockDim.x; }
+DEVICE int GetNumberOfThreadsInBlock() { return blockDim.x; }
 
-INLINE unsigned GetWarpId() { return threadIdx.x / WARPSIZE; }
+DEVICE unsigned GetWarpId() { return threadIdx.x / WARPSIZE; }
 
-INLINE unsigned GetLaneId() { return threadIdx.x & (WARPSIZE - 1); }
+DEVICE unsigned GetLaneId() { return threadIdx.x & (WARPSIZE - 1); }
 
 ////////////////////////////////////////////////////////////////////////////////
 //
@@ -122,11 +124,11 @@
 //      If NumThreads is 1024, master id is 992.
 //
 // Called in Generic Execution Mode only.
-INLINE int GetMasterThreadID() { return (blockDim.x - 1) & ~(WARPSIZE - 1); }
+DEVICE int GetMasterThreadID() { return (blockDim.x - 1) & ~(WARPSIZE - 1); }
 
 // The last warp is reserved for the master; other warps are workers.
 // Called in Generic Execution Mode only.
-INLINE int GetNumberOfWorkersInTeam() { return GetMasterThreadID(); }
+DEVICE int GetNumberOfWorkersInTeam() { return GetMasterThreadID(); }
 
 ////////////////////////////////////////////////////////////////////////////////
 // get thread id in team
@@ -135,7 +137,7 @@
 // or a serial region by the master.  If the master (whose CUDA thread
 // id is GetMasterThreadID()) calls this routine, we return 0 because
 // it is a shadow for the first worker.
-INLINE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode) {
+DEVICE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode) {
   // Implemented using control flow (predication) instead of with a modulo
   // operation.
   int tid = GetThreadIdInBlock();
@@ -151,7 +153,7 @@
 //
 ////////////////////////////////////////////////////////////////////////////////
 
-INLINE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode) {
+DEVICE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode) {
   // omp_thread_num
   int rc;
   if ((parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1)) > 1) {
@@ -167,7 +169,7 @@
   return rc;
 }
 
-INLINE int GetNumberOfOmpThreads(bool isSPMDExecutionMode) {
+DEVICE int GetNumberOfOmpThreads(bool isSPMDExecutionMode) {
   // omp_num_threads
   int rc;
   int Level = parallelLevel[GetWarpId()];
@@ -185,12 +187,12 @@
 ////////////////////////////////////////////////////////////////////////////////
 // Team id linked to OpenMP
 
-INLINE int GetOmpTeamId() {
+DEVICE int GetOmpTeamId() {
   // omp_team_num
   return GetBlockIdInKernel(); // assume 1 block per team
 }
 
-INLINE int GetNumberOfOmpTeams() {
+DEVICE int GetNumberOfOmpTeams() {
   // omp_num_teams
   return GetNumberOfBlocksInKernel(); // assume 1 block per team
 }
@@ -198,12 +200,12 @@
 ////////////////////////////////////////////////////////////////////////////////
 // Masters
 
-INLINE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); }
+DEVICE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); }
 
 ////////////////////////////////////////////////////////////////////////////////
 // Parallel level
 
-INLINE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) {
+DEVICE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) {
   __kmpc_impl_syncwarp(Mask);
   __kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt();
   unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt);
@@ -215,7 +217,7 @@
   __kmpc_impl_syncwarp(Mask);
 }
 
-INLINE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) {
+DEVICE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) {
   __kmpc_impl_syncwarp(Mask);
   __kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt();
   unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt);
@@ -231,13 +233,13 @@
 // get OpenMP number of procs
 
 // Get the number of processors in the device.
-INLINE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode) {
+DEVICE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode) {
   if (!isSPMDExecutionMode)
     return GetNumberOfWorkersInTeam();
   return GetNumberOfThreadsInBlock();
 }
 
-INLINE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode) {
+DEVICE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode) {
   return GetNumberOfProcsInDevice(isSPMDExecutionMode);
 }
 
@@ -245,7 +247,7 @@
 // Memory
 ////////////////////////////////////////////////////////////////////////////////
 
-INLINE unsigned long PadBytes(unsigned long size,
+DEVICE unsigned long PadBytes(unsigned long size,
                               unsigned long alignment) // must be a power of 2
 {
   // compute the necessary padding to satisfy alignment constraint
@@ -254,7 +256,7 @@
   return (~(unsigned long)size + 1) & (alignment - 1);
 }
 
-INLINE void *SafeMalloc(size_t size, const char *msg) // check if success
+DEVICE void *SafeMalloc(size_t size, const char *msg) // check if success
 {
   void *ptr = malloc(size);
   PRINT(LD_MEM, "malloc data of size %llu for %s: 0x%llx\n",
@@ -262,7 +264,7 @@
   return ptr;
 }
 
-INLINE void *SafeFree(void *ptr, const char *msg) {
+DEVICE void *SafeFree(void *ptr, const char *msg) {
   PRINT(LD_MEM, "free data ptr 0x%llx for %s\n", (unsigned long long)ptr, msg);
   free(ptr);
   return NULL;
@@ -272,14 +274,14 @@
 // Teams Reduction Scratchpad Helpers
 ////////////////////////////////////////////////////////////////////////////////
 
-INLINE unsigned int *GetTeamsReductionTimestamp() {
+DEVICE unsigned int *GetTeamsReductionTimestamp() {
   return static_cast<unsigned int *>(ReductionScratchpadPtr);
 }
 
-INLINE char *GetTeamsReductionScratchpad() {
+DEVICE char *GetTeamsReductionScratchpad() {
   return static_cast<char *>(ReductionScratchpadPtr) + 256;
 }
 
-INLINE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr) {
+DEVICE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr) {
   ReductionScratchpadPtr = ScratchpadPtr;
 }
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -15,8 +15,9 @@
 #include <cuda.h>
 #include "nvptx_interface.h"
 
-#define INLINE __forceinline__ __device__
-#define NOINLINE __noinline__ __device__
+#define DEVICE __device__
+#define INLINE __forceinline__ DEVICE
+#define NOINLINE __noinline__ DEVICE
 
 ////////////////////////////////////////////////////////////////////////////////
 // Kernel options