diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
--- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
+++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
@@ -55,7 +55,6 @@
       src/omptarget-nvptx.cu
       src/parallel.cu
       src/reduction.cu
-      src/support.cu
       src/sync.cu
       src/task.cu
   )
@@ -89,7 +88,7 @@
   set(BUILD_SHARED_LIBS OFF)
   set(CUDA_SEPARABLE_COMPILATION ON)
   list(APPEND CUDA_NVCC_FLAGS -I${devicertl_base_directory})
-  cuda_add_library(omptarget-nvptx STATIC unity.cu
+  cuda_add_library(omptarget-nvptx STATIC ${cuda_src_files} ${omp_data_objects}
       OPTIONS ${CUDA_ARCH} ${CUDA_DEBUG})
 
   # Install device RTL under the lib destination folder.
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
@@ -13,6 +13,11 @@
 #include "target_impl.h"
 #include <stdio.h>
 
+// Warp ID in the CUDA block
+INLINE static unsigned getWarpId() { return threadIdx.x / WARPSIZE; }
+// Lane ID in the CUDA warp.
+INLINE static unsigned getLaneId() { return threadIdx.x % WARPSIZE; }
+
 // Return true if this is the first active thread in the warp.
 INLINE static bool IsWarpMasterActiveThread() {
   unsigned long long Mask = __kmpc_impl_activemask();
@@ -62,7 +67,7 @@
   DSPRINT0(DSFLAG_INIT,
            "Entering __kmpc_initialize_data_sharing_environment\n");
 
-  unsigned WID = GetWarpId();
+  unsigned WID = getWarpId();
   DSPRINT(DSFLAG_INIT, "Warp ID: %u\n", WID);
 
   omptarget_nvptx_TeamDescr *teamDescr =
@@ -106,7 +111,7 @@
   DSPRINT(DSFLAG, "Default Data Size %016llx\n",
           (unsigned long long)SharingDefaultDataSize);
 
-  unsigned WID = GetWarpId();
+  unsigned WID = getWarpId();
   __kmpc_impl_lanemask_t CurActiveThreads = __kmpc_impl_activemask();
 
   __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
@@ -226,7 +231,7 @@
 
   DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_end\n");
 
-  unsigned WID = GetWarpId();
+  unsigned WID = getWarpId();
 
   if (IsEntryPoint) {
     if (IsWarpMasterActiveThread()) {
@@ -354,7 +359,7 @@
   // This function initializes the stack pointer with the pointer to the
   // statically allocated shared memory slots. The size of a shared memory
   // slot is pre-determined to be 256 bytes.
-  if (GetThreadIdInBlock() == 0)
+  if (threadIdx.x == 0)
     data_sharing_init_stack_common();
 
   __threadfence_block();
@@ -372,7 +377,7 @@
   PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment;
 
   // Frame pointer must be visible to all workers in the same warp.
-  const unsigned WID = GetWarpId();
+  const unsigned WID = getWarpId();
   void *FrameP = 0;
   __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask();
 
@@ -462,7 +467,7 @@
   // Compute the start address of the frame of each thread in the warp.
   uintptr_t FrameStartAddress =
       (uintptr_t) data_sharing_push_stack_common(PushSize);
-  FrameStartAddress += (uintptr_t) (GetLaneId() * DataSize);
+  FrameStartAddress += (uintptr_t) (getLaneId() * DataSize);
   return (void *)FrameStartAddress;
 }
 
@@ -477,7 +482,7 @@
   __threadfence_block();
 
   if (GetThreadIdInBlock() % WARPSIZE == 0) {
-    unsigned WID = GetWarpId();
+    unsigned WID = getWarpId();
 
     // Current slot
     __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h b/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h
@@ -128,12 +128,12 @@
 
 #if OMPTARGET_NVPTX_DEBUG || OMPTARGET_NVPTX_TEST || OMPTARGET_NVPTX_WARNING
 #include <stdio.h>
-#include "support.h"
+#include "target_impl.h"
 
 template <typename... Arguments>
 NOINLINE static void log(const char *fmt, Arguments... parameters) {
-  printf(fmt, (int)GetBlockIdInKernel(), (int)GetThreadIdInBlock(),
-         (int)GetWarpId(), (int)GetLaneId(), parameters...);
+  printf(fmt, (int)blockIdx.x, (int)threadIdx.x, (int)(threadIdx.x / WARPSIZE),
+         (int)(threadIdx.x & 0x1F), parameters...);
 }
 
 #endif
@@ -144,8 +144,9 @@
 NOINLINE static void check(bool cond, const char *fmt,
                            Arguments... parameters) {
   if (!cond)
-    printf(fmt, (int)GetBlockIdInKernel(), (int)GetThreadIdInBlock(),
-           (int)GetWarpId(), (int)GetLaneId(), parameters...);
+    printf(fmt, (int)blockIdx.x, (int)threadIdx.x,
+           (int)(threadIdx.x / WARPSIZE), (int)(threadIdx.x & 0x1F),
+           parameters...);
   assert(cond);
 }
 
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu
@@ -364,7 +364,7 @@
     for (;;) {
       now = clock();
       clock_t cycles = now > start ? now - start : now + (0xffffffff - start);
-      if (cycles >= __OMP_SPIN * GetBlockIdInKernel()) {
+      if (cycles >= __OMP_SPIN * blockIdx.x) {
         break;
       }
     }
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -385,5 +385,6 @@
 ////////////////////////////////////////////////////////////////////////////////
 
 #include "omptarget-nvptxi.h"
+#include "supporti.h"
 
 #endif
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/support.h b/openmp/libomptarget/deviceRTLs/nvptx/src/support.h
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/support.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/support.h
@@ -10,12 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef OMPTARGET_SUPPORT_H
-#define OMPTARGET_SUPPORT_H
-
-#include "interface.h"
 #include "target_impl.h"
-
 ////////////////////////////////////////////////////////////////////////////////
 // Execution Parameters
 ////////////////////////////////////////////////////////////////////////////////
@@ -31,70 +26,58 @@
   RuntimeMask = 0x02u,
 };
 
-DEVICE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode);
-DEVICE bool isGenericMode();
-DEVICE bool isSPMDMode();
-DEVICE bool isRuntimeUninitialized();
-DEVICE bool isRuntimeInitialized();
-
-////////////////////////////////////////////////////////////////////////////////
-// Execution Modes based on location parameter fields
-////////////////////////////////////////////////////////////////////////////////
-
-DEVICE bool checkSPMDMode(kmp_Ident *loc);
-
-DEVICE bool checkGenericMode(kmp_Ident *loc);
-
-DEVICE bool checkRuntimeUninitialized(kmp_Ident *loc);
-
-DEVICE bool checkRuntimeInitialized(kmp_Ident *loc);
+INLINE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode);
+INLINE bool isGenericMode();
+INLINE bool isSPMDMode();
+INLINE bool isRuntimeUninitialized();
+INLINE bool isRuntimeInitialized();
 
 ////////////////////////////////////////////////////////////////////////////////
 // get info from machine
 ////////////////////////////////////////////////////////////////////////////////
 
 // get low level ids of resources
-DEVICE int GetThreadIdInBlock();
-DEVICE int GetBlockIdInKernel();
-DEVICE int GetNumberOfBlocksInKernel();
-DEVICE int GetNumberOfThreadsInBlock();
-DEVICE unsigned GetWarpId();
-DEVICE unsigned GetLaneId();
+INLINE int GetThreadIdInBlock();
+INLINE int GetBlockIdInKernel();
+INLINE int GetNumberOfBlocksInKernel();
+INLINE int GetNumberOfThreadsInBlock();
+INLINE unsigned GetWarpId();
+INLINE unsigned GetLaneId();
 
 // get global ids to locate tread/team info (constant regardless of OMP)
-DEVICE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode);
-DEVICE int GetMasterThreadID();
-DEVICE int GetNumberOfWorkersInTeam();
+INLINE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode);
+INLINE int GetMasterThreadID();
+INLINE int GetNumberOfWorkersInTeam();
 
 // get OpenMP thread and team ids
-DEVICE int GetOmpThreadId(int threadId,
+INLINE int GetOmpThreadId(int threadId,
                           bool isSPMDExecutionMode);    // omp_thread_num
-DEVICE int GetOmpTeamId();                              // omp_team_num
+INLINE int GetOmpTeamId();                              // omp_team_num
 
 // get OpenMP number of threads and team
-DEVICE int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads
-DEVICE int GetNumberOfOmpTeams();                           // omp_num_teams
+INLINE int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads
+INLINE int GetNumberOfOmpTeams();                           // omp_num_teams
 
 // get OpenMP number of procs
-DEVICE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode);
-DEVICE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode);
+INLINE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode);
+INLINE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode);
 
 // masters
-DEVICE int IsTeamMaster(int ompThreadId);
+INLINE int IsTeamMaster(int ompThreadId);
 
 // Parallel level
-DEVICE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
-DEVICE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
+INLINE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
+INLINE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
 
 ////////////////////////////////////////////////////////////////////////////////
 // Memory
 ////////////////////////////////////////////////////////////////////////////////
 
 // safe alloc and free
-DEVICE void *SafeMalloc(size_t size, const char *msg); // check if success
-DEVICE void *SafeFree(void *ptr, const char *msg);
+INLINE void *SafeMalloc(size_t size, const char *msg); // check if success
+INLINE void *SafeFree(void *ptr, const char *msg);
 // pad to a alignment (power of 2 only)
-DEVICE unsigned long PadBytes(unsigned long size, unsigned long alignment);
+INLINE unsigned long PadBytes(unsigned long size, unsigned long alignment);
 #define ADD_BYTES(_addr, _bytes)                                               \
   ((void *)((char *)((void *)(_addr)) + (_bytes)))
 #define SUB_BYTES(_addr, _bytes)                                               \
@@ -103,8 +86,6 @@
 ////////////////////////////////////////////////////////////////////////////////
 // Teams Reduction Scratchpad Helpers
 ////////////////////////////////////////////////////////////////////////////////
-DEVICE unsigned int *GetTeamsReductionTimestamp();
-DEVICE char *GetTeamsReductionScratchpad();
-DEVICE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr);
-
-#endif
+INLINE unsigned int *GetTeamsReductionTimestamp();
+INLINE char *GetTeamsReductionScratchpad();
+INLINE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr);
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/support.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
rename from openmp/libomptarget/deviceRTLs/nvptx/src/support.cu
rename to openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/support.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -1,4 +1,4 @@
-//===--------- support.cu - NVPTX OpenMP support functions ------- CUDA -*-===//
+//===--------- supporti.h - NVPTX OpenMP support functions ------- CUDA -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -10,14 +10,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "support.h"
-#include "debug.h"
-#include "omptarget-nvptx.h"
-
 ////////////////////////////////////////////////////////////////////////////////
 // Execution Parameters
 ////////////////////////////////////////////////////////////////////////////////
 
+#include "target_impl.h"
+
 INLINE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode) {
   execution_param = EMode;
   execution_param |= RMode;
@@ -106,9 +104,9 @@
 
 INLINE int GetNumberOfThreadsInBlock() { return blockDim.x; }
 
-INLINE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
+INLINE unsigned GetWarpId() { return threadIdx.x / WARPSIZE; }
 
-INLINE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }
+INLINE unsigned GetLaneId() { return threadIdx.x & (WARPSIZE - 1); }
 
 ////////////////////////////////////////////////////////////////////////////////
 //
@@ -124,9 +122,7 @@
 //      If NumThreads is 1024, master id is 992.
 //
 // Called in Generic Execution Mode only.
-INLINE int GetMasterThreadID() {
-  return (GetNumberOfThreadsInBlock() - 1) & ~(WARPSIZE - 1);
-}
+INLINE int GetMasterThreadID() { return (blockDim.x - 1) & ~(WARPSIZE - 1); }
 
 // The last warp is reserved for the master; other warps are workers.
 // Called in Generic Execution Mode only.
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -15,9 +15,8 @@
 #include <cuda.h>
 #include "nvptx_interface.h"
 
-#define DEVICE __device__
-#define INLINE __forceinline__ DEVICE
-#define NOINLINE __noinline__ DEVICE
+#define INLINE __forceinline__ __device__
+#define NOINLINE __noinline__ __device__
 
 ////////////////////////////////////////////////////////////////////////////////
 // Kernel options
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/unity.cu b/openmp/libomptarget/deviceRTLs/nvptx/unity.cu
deleted file mode 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/unity.cu
+++ /dev/null
@@ -1,26 +0,0 @@
-//===------ unity.cu - Unity build of NVPTX deviceRTL ------------ CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Support compilers, specifically NVCC, which have not implemented link time
-// optimisation. This removes the runtime cost of moving inline functions into
-// source files in exchange for preventing efficient incremental builds.
-//
-//===----------------------------------------------------------------------===//
-
-#include "src/cancel.cu"
-#include "src/critical.cu"
-#include "src/data_sharing.cu"
-#include "src/libcall.cu"
-#include "src/loop.cu"
-#include "src/omp_data.cu"
-#include "src/omptarget-nvptx.cu"
-#include "src/parallel.cu"
-#include "src/reduction.cu"
-#include "src/support.cu"
-#include "src/sync.cu"
-#include "src/task.cu"