diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -195,6 +195,7 @@
 __OMP_RTL(__kmpc_cancel_barrier, false, Int32, IdentPtr, Int32)
 __OMP_RTL(__kmpc_flush, false, Void, IdentPtr)
 __OMP_RTL(__kmpc_global_thread_num, false, Int32, IdentPtr)
+__OMP_RTL(__kmpc_get_hardware_thread_id_in_block, false, Int32, )
 __OMP_RTL(__kmpc_fork_call, true, Void, IdentPtr, Int32, ParallelTaskPtr)
 __OMP_RTL(__kmpc_omp_taskwait, false, Int32, IdentPtr, Int32)
 __OMP_RTL(__kmpc_omp_taskyield, false, Int32, IdentPtr, Int32, /* Int */ Int32)
diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
@@ -133,7 +133,7 @@
                            __builtin_amdgcn_workgroup_size_x());
 }
 
-EXTERN unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
+EXTERN unsigned GetWarpId() { return __kmpc_get_hardware_thread_id_in_block() / WARPSIZE; }
 EXTERN unsigned GetWarpSize() { return WARPSIZE; }
 EXTERN unsigned GetLaneId() {
   return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
@@ -212,7 +212,7 @@
 }
 
 // Calls to the AMDGCN layer (assuming 1D layout)
-EXTERN int GetThreadIdInBlock() { return __builtin_amdgcn_workitem_id_x(); }
+EXTERN int __kmpc_get_hardware_thread_id_in_block() { return __builtin_amdgcn_workitem_id_x(); }
 EXTERN int GetBlockIdInKernel() { return __builtin_amdgcn_workgroup_id_x(); }
 
 #pragma omp end declare target
diff --git a/openmp/libomptarget/deviceRTLs/common/debug.h b/openmp/libomptarget/deviceRTLs/common/debug.h
--- a/openmp/libomptarget/deviceRTLs/common/debug.h
+++ b/openmp/libomptarget/deviceRTLs/common/debug.h
@@ -132,8 +132,9 @@
 
 template <typename... Arguments>
 NOINLINE static void log(const char *fmt, Arguments... parameters) {
-  printf(fmt, (int)GetBlockIdInKernel(), (int)GetThreadIdInBlock(),
-         (int)GetWarpId(), (int)GetLaneId(), parameters...);
+  printf(fmt, (int)GetBlockIdInKernel(),
+         (int)__kmpc_get_hardware_thread_id_in_block(), (int)GetWarpId(),
+         (int)GetLaneId(), parameters...);
 }
 
 #endif
@@ -143,8 +144,9 @@
 NOINLINE static void check(bool cond, const char *fmt,
                            Arguments... parameters) {
   if (!cond) {
-    printf(fmt, (int)GetBlockIdInKernel(), (int)GetThreadIdInBlock(),
-           (int)GetWarpId(), (int)GetLaneId(), parameters...);
+    printf(fmt, (int)GetBlockIdInKernel(),
+           (int)__kmpc_get_hardware_thread_id_in_block(), (int)GetWarpId(),
+           (int)GetLaneId(), parameters...);
     __builtin_trap();
   }
 }
diff --git a/openmp/libomptarget/deviceRTLs/common/omptargeti.h b/openmp/libomptarget/deviceRTLs/common/omptargeti.h
--- a/openmp/libomptarget/deviceRTLs/common/omptargeti.h
+++ b/openmp/libomptarget/deviceRTLs/common/omptargeti.h
@@ -54,7 +54,8 @@
 
   items.flags = TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
   items.threadId =
-      GetThreadIdInBlock();   // get ids from cuda (only called for 1st level)
+      __kmpc_get_hardware_thread_id_in_block(); // get ids from cuda (only
+                                                // called for 1st level)
   items.runtimeChunkSize = 1; // preferred chunking statik with chunk 1
   prev = parentTaskDescr;
 }
@@ -97,16 +98,16 @@
   //
   // overwrite specific items;
   //
-  // The threadID should be GetThreadIdInBlock() % GetMasterThreadID().
-  // This is so that the serial master (first lane in the master warp)
-  // gets a threadId of 0.
-  // However, we know that this function is always called in a parallel
-  // region where only workers are active.  The serial master thread
-  // never enters this region.  When a parallel region is executed serially,
-  // the threadId is set to 0 elsewhere and the kmpc_serialized_* functions
-  // are called, which never activate this region.
+  // The threadID should be __kmpc_get_hardware_thread_id_in_block() %
+  // GetMasterThreadID(). This is so that the serial master (first lane in the
+  // master warp) gets a threadId of 0. However, we know that this function is
+  // always called in a parallel region where only workers are active.  The
+  // serial master thread never enters this region.  When a parallel region is
+  // executed serially, the threadId is set to 0 elsewhere and the
+  // kmpc_serialized_* functions are called, which never activate this region.
   items.threadId =
-      GetThreadIdInBlock(); // get ids from cuda (only called for 1st level)
+      __kmpc_get_hardware_thread_id_in_block(); // get ids from cuda (only
+                                                // called for 1st level)
 }
 
 INLINE void omptarget_nvptx_TaskDescr::CopyConvergentParent(
diff --git a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
@@ -48,7 +48,8 @@
   void *Ptr;
   __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask();
   unsigned LeaderID = __kmpc_impl_ffs(CurActive) - 1;
-  bool IsWarpLeader = (GetThreadIdInBlock() % WARPSIZE) == LeaderID;
+  bool IsWarpLeader =
+      (__kmpc_get_hardware_thread_id_in_block() % WARPSIZE) == LeaderID;
   if (IsWarpLeader)
     Ptr = Alloc();
   // Get address from the first active lane.
@@ -61,7 +62,7 @@
 
 EXTERN void *__kmpc_alloc_shared(size_t Bytes) {
   Bytes = Bytes + (Bytes % MinBytes);
-  int TID = GetThreadIdInBlock();
+  int TID = __kmpc_get_hardware_thread_id_in_block();
   if (__kmpc_is_generic_main_thread(TID)) {
     // Main thread alone, use shared memory if space is available.
     if (MainSharedStack.Usage[0] + Bytes <= MainSharedStack.MaxSize) {
@@ -97,7 +98,8 @@
 EXTERN void __kmpc_free_shared(void *Ptr) {
   __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask();
   unsigned LeaderID = __kmpc_impl_ffs(CurActive) - 1;
-  bool IsWarpLeader = (GetThreadIdInBlock() % WARPSIZE) == LeaderID;
+  bool IsWarpLeader =
+      (__kmpc_get_hardware_thread_id_in_block() % WARPSIZE) == LeaderID;
   __kmpc_syncwarp(CurActive);
   if (IsWarpLeader) {
     if (Ptr >= &MainSharedStack.Data[0] &&
@@ -190,13 +192,14 @@
     return;
   }
   if (isSPMDExecutionMode) {
-    if (GetThreadIdInBlock() == 0) {
+    if (__kmpc_get_hardware_thread_id_in_block() == 0) {
       *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
     }
     __kmpc_impl_syncthreads();
     return;
   }
-  ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
+  ASSERT0(LT_FUSSY,
+          __kmpc_get_hardware_thread_id_in_block() == GetMasterThreadID(),
           "Must be called only in the target master thread.");
   *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
   __kmpc_impl_threadfence();
@@ -208,13 +211,14 @@
     return;
   if (isSPMDExecutionMode) {
     __kmpc_impl_syncthreads();
-    if (GetThreadIdInBlock() == 0) {
+    if (__kmpc_get_hardware_thread_id_in_block() == 0) {
       omptarget_nvptx_simpleMemoryManager.Release();
     }
     return;
   }
   __kmpc_impl_threadfence();
-  ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
+  ASSERT0(LT_FUSSY,
+          __kmpc_get_hardware_thread_id_in_block() == GetMasterThreadID(),
           "Must be called only in the target master thread.");
   omptarget_nvptx_simpleMemoryManager.Release();
 }
diff --git a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
@@ -142,7 +142,7 @@
 
 EXTERN int omp_get_ancestor_thread_num(int level) {
   if (__kmpc_is_spmd_exec_mode())
-    return level == 1 ? GetThreadIdInBlock() : 0;
+    return level == 1 ? __kmpc_get_hardware_thread_id_in_block() : 0;
   int rc = -1;
   // If level is 0 or all parallel regions are not active - return 0.
   unsigned parLevel = parallelLevel[GetWarpId()];
diff --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
@@ -34,7 +34,7 @@
   if (GetLaneId() == 0)
     parallelLevel[GetWarpId()] = 0;
 
-  int threadIdInBlock = GetThreadIdInBlock();
+  int threadIdInBlock = __kmpc_get_hardware_thread_id_in_block();
   if (threadIdInBlock != GetMasterThreadID())
     return;
 
@@ -87,7 +87,7 @@
 
   setExecutionParameters(Spmd, RequiresFullRuntime ? RuntimeInitialized
                          : RuntimeUninitialized);
-  int threadId = GetThreadIdInBlock();
+  int threadId = __kmpc_get_hardware_thread_id_in_block();
   if (threadId == 0) {
     usedSlotIdx = __kmpc_impl_smid() % MAX_SM;
   }
@@ -147,7 +147,7 @@
     return;
 
   __kmpc_impl_syncthreads();
-  int threadId = GetThreadIdInBlock();
+  int threadId = __kmpc_get_hardware_thread_id_in_block();
   if (threadId == 0) {
     // Enqueue omp state object for use by another team.
     int slot = usedSlotIdx;
@@ -169,7 +169,7 @@
 
 static void __kmpc_target_region_state_machine(ident_t *Ident) {
 
-  int TId = GetThreadIdInBlock();
+  int TId = __kmpc_get_hardware_thread_id_in_block();
   do {
     void* WorkFn = 0;
 
@@ -199,7 +199,7 @@
 int32_t __kmpc_target_init(ident_t *Ident, bool IsSPMD,
                            bool UseGenericStateMachine,
                            bool RequiresFullRuntime) {
-  int TId = GetThreadIdInBlock();
+  int TId = __kmpc_get_hardware_thread_id_in_block();
   if (IsSPMD)
     __kmpc_spmd_kernel_init(RequiresFullRuntime);
   else
diff --git a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
@@ -105,7 +105,8 @@
 
   ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
          (int)NumThreads);
-  ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
+  ASSERT0(LT_FUSSY,
+          __kmpc_get_hardware_thread_id_in_block() == GetMasterThreadID(),
           "only team master can create parallel");
 
   // Set number of threads on work descriptor.
@@ -133,7 +134,7 @@
 
   // Only the worker threads call this routine and the master warp
   // never arrives here.  Therefore, use the nvptx thread id.
-  int threadId = GetThreadIdInBlock();
+  int threadId = __kmpc_get_hardware_thread_id_in_block();
   omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
   // Set to true for workers participating in the parallel region.
   bool isActive = false;
@@ -166,7 +167,7 @@
 
   // Only the worker threads call this routine and the master warp
   // never arrives here.  Therefore, use the nvptx thread id.
-  int threadId = GetThreadIdInBlock();
+  int threadId = __kmpc_get_hardware_thread_id_in_block();
   omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
   omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
       threadId, currTaskDescr->GetPrevTaskDescr());
diff --git a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
@@ -47,7 +47,7 @@
 INLINE static uint32_t
 gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) {
   uint32_t size, remote_id, physical_lane_id;
-  physical_lane_id = GetThreadIdInBlock() % WARPSIZE;
+  physical_lane_id = __kmpc_get_hardware_thread_id_in_block() % WARPSIZE;
   __kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt();
   __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
   uint32_t logical_lane_id = __kmpc_impl_popc(Liveness & lanemask_lt) * 2;
@@ -95,9 +95,10 @@
   if ((NumThreads % WARPSIZE == 0) || (WarpId < WarpsNeeded - 1))
     gpu_regular_warp_reduce(reduce_data, shflFct);
   else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
-    gpu_irregular_warp_reduce(reduce_data, shflFct,
-                              /*LaneCount=*/NumThreads % WARPSIZE,
-                              /*LaneId=*/GetThreadIdInBlock() % WARPSIZE);
+    gpu_irregular_warp_reduce(
+        reduce_data, shflFct,
+        /*LaneCount=*/NumThreads % WARPSIZE,
+        /*LaneId=*/__kmpc_get_hardware_thread_id_in_block() % WARPSIZE);
 
   // When we have more than [warpsize] number of threads
   // a block reduction is performed here.
@@ -118,9 +119,10 @@
   if (Liveness == __kmpc_impl_all_lanes) // Full warp
     gpu_regular_warp_reduce(reduce_data, shflFct);
   else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
-    gpu_irregular_warp_reduce(reduce_data, shflFct,
-                              /*LaneCount=*/__kmpc_impl_popc(Liveness),
-                              /*LaneId=*/GetThreadIdInBlock() % WARPSIZE);
+    gpu_irregular_warp_reduce(
+        reduce_data, shflFct,
+        /*LaneCount=*/__kmpc_impl_popc(Liveness),
+        /*LaneId=*/__kmpc_get_hardware_thread_id_in_block() % WARPSIZE);
   else if (!isRuntimeUninitialized) // Dispersed lanes. Only threads in L2
                                     // parallel region may enter here; return
                                     // early.
@@ -185,7 +187,7 @@
 
   // Terminate all threads in non-SPMD mode except for the master thread.
   if (!__kmpc_is_spmd_exec_mode() &&
-      !__kmpc_is_generic_main_thread(GetThreadIdInBlock()))
+      !__kmpc_is_generic_main_thread(__kmpc_get_hardware_thread_id_in_block()))
     return 0;
 
   uint32_t ThreadId = GetLogicalThreadIdInBlock();
diff --git a/openmp/libomptarget/deviceRTLs/common/src/support.cu b/openmp/libomptarget/deviceRTLs/common/src/support.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/support.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/support.cu
@@ -70,7 +70,7 @@
 int GetLogicalThreadIdInBlock() {
   // Implemented using control flow (predication) instead of with a modulo
   // operation.
-  int tid = GetThreadIdInBlock();
+  int tid = __kmpc_get_hardware_thread_id_in_block();
   if (__kmpc_is_generic_main_thread(tid))
     return 0;
   else
@@ -84,7 +84,7 @@
 ////////////////////////////////////////////////////////////////////////////////
 
 int GetOmpThreadId() {
-  int tid = GetThreadIdInBlock();
+  int tid = __kmpc_get_hardware_thread_id_in_block();
   if (__kmpc_is_generic_main_thread(tid))
     return 0;
   // omp_thread_num
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -60,7 +60,7 @@
   return Mask;
 }
 
-EXTERN void __kmpc_impl_syncthreads() { 
+EXTERN void __kmpc_impl_syncthreads() {
   int barrier = 2;
   asm volatile("barrier.sync %0;"
                :
@@ -92,15 +92,21 @@
 EXTERN void __kmpc_impl_threadfence_system() { __nvvm_membar_sys(); }
 
 // Calls to the NVPTX layer (assuming 1D layout)
-EXTERN int GetThreadIdInBlock() { return __nvvm_read_ptx_sreg_tid_x(); }
+EXTERN int __kmpc_get_hardware_thread_id_in_block() {
+  return __nvvm_read_ptx_sreg_tid_x();
+}
 EXTERN int GetBlockIdInKernel() { return __nvvm_read_ptx_sreg_ctaid_x(); }
 EXTERN int GetNumberOfBlocksInKernel() {
   return __nvvm_read_ptx_sreg_nctaid_x();
 }
 EXTERN int GetNumberOfThreadsInBlock() { return __nvvm_read_ptx_sreg_ntid_x(); }
-EXTERN unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
+EXTERN unsigned GetWarpId() {
+  return __kmpc_get_hardware_thread_id_in_block() / WARPSIZE;
+}
 EXTERN unsigned GetWarpSize() { return WARPSIZE; }
-EXTERN unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }
+EXTERN unsigned GetLaneId() {
+  return __kmpc_get_hardware_thread_id_in_block() & (WARPSIZE - 1);
+}
 
 // Atomics
 uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {
diff --git a/openmp/libomptarget/deviceRTLs/target_interface.h b/openmp/libomptarget/deviceRTLs/target_interface.h
--- a/openmp/libomptarget/deviceRTLs/target_interface.h
+++ b/openmp/libomptarget/deviceRTLs/target_interface.h
@@ -16,7 +16,7 @@
 #include "target_impl.h"
 
 // Calls to the NVPTX layer (assuming 1D layout)
-EXTERN int GetThreadIdInBlock();
+EXTERN int __kmpc_get_hardware_thread_id_in_block();
 EXTERN int GetBlockIdInKernel();
 EXTERN int GetNumberOfBlocksInKernel();
 EXTERN int GetNumberOfThreadsInBlock();