diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
@@ -47,8 +47,6 @@
 #define OMP_STATE_COUNT 32
 #define MAX_SM 64
 
-#define OMP_ACTIVE_PARALLEL_LEVEL 128
-
 // Data sharing related quantities, need to match what is used in the compiler.
 enum DATA_SHARING_SIZES {
   // The maximum number of workers in a kernel.
diff --git a/openmp/libomptarget/deviceRTLs/common/include/ICVs.h b/openmp/libomptarget/deviceRTLs/common/include/ICVs.h
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/deviceRTLs/common/include/ICVs.h
@@ -0,0 +1,113 @@
+//===--------- ICVs.h - OpenMP ICV handling ----------------------- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+#ifndef OMPTARGET_ICVS_H
+#define OMPTARGET_ICVS_H
+
+struct ICVStateTy {
+  int nthreads_var;
+
+  int levels_var;
+
+  /// The `active-level` describes which of the parallel levels counted with the
+  /// `levels-var` is active. There can only be one.
+  ///
+  /// active-levels-var is 1, if active_level is not 0, otherweise it is 0.
+  int active_level;
+
+  static bool ensureICVStateForThread(unsigned TId);
+
+  static int &getICVForThread(int ICVStateTy::*Var);
+  static int incICVForThread(int ICVStateTy::*Var, int UpdateVal);
+  static int setICVForThread(int ICVStateTy::*Var, int UpdateVal);
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#pragma omp declare target
+
+/// ICV: dyn-var, constant 0
+///
+/// setter: ignored.
+/// getter: returns 0.
+///
+///{
+void omp_set_dynamic(int);
+int omp_get_dynamic(void);
+///}
+
+/// ICV: nthreads-var, integer
+///
+/// scope: data environment
+///
+/// setter: ignored.
+/// getter: returns false.
+///
+/// implementation notes:
+///
+///
+///{
+void omp_set_num_threads(int);
+int omp_get_max_threads(void);
+///}
+
+/// ICV: thread-limit-var, computed
+///
+/// getter: returns thread limited defined during launch.
+///
+///{
+int omp_get_thread_limit(void);
+///}
+
+/// ICV: max-active-levels-var, constant 1
+///
+/// setter: ignored.
+/// getter: returns 1.
+///
+///{
+void omp_set_max_active_levels(int);
+int omp_get_max_active_levels(void);
+///}
+
+/// ICV: places-partition-var
+///
+///
+///{
+///}
+
+/// ICV: active-levels-var, 0 or 1
+///
+/// getter: returns 0 or 1.
+///
+///{
+int omp_get_active_levels(void);
+///}
+
+/// ICV: levels-var
+///
+/// getter: returns parallel region nesting
+///
+///{
+int omp_get_levels(void);
+///}
+
+/// TODO this is incomplete.
+int omp_get_num_threads(void);
+int omp_get_thread_num(void);
+
+#pragma omp end declare target
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/openmp/libomptarget/deviceRTLs/common/include/Mapping.h b/openmp/libomptarget/deviceRTLs/common/include/Mapping.h
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/deviceRTLs/common/include/Mapping.h
@@ -0,0 +1,30 @@
+//===--------- Mapping.h - OpenMP device runtime mapping helpers -- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+#ifndef OMPTARGET_MAPPING_H
+#define OMPTARGET_MAPPING_H
+
+namespace omp {
+
+#pragma omp declare target
+
+/// TODO
+bool isMainThreadInGenericMode();
+
+bool isLeaderInSIMD();
+
+unsigned getNumberOfThreadsAccessingSharedMem();
+unsigned getThreadIdForSharedMemArrayAccess();
+
+#pragma omp end declare target
+
+} // namespace omp
+
+#endif
diff --git a/openmp/libomptarget/deviceRTLs/common/include/TeamState.h b/openmp/libomptarget/deviceRTLs/common/include/TeamState.h
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/deviceRTLs/common/include/TeamState.h
@@ -0,0 +1,46 @@
+//===--------- TeamState.h - OpenMP team state description -------- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+#ifndef OMPTARGET_TEAM_STATE_H
+#define OMPTARGET_TEAM_STATE_H
+
+#include <stdint.h>
+
+#include "ICVs.h"
+#include "allocator.h"
+
+namespace omp {
+
+#pragma omp declare target
+
+struct TeamStateTy {
+
+  /// TODO: provide a proper init function.
+  void init() {}
+
+  /// ICVs
+  ///
+  /// Preallocated storage for ICV values that are used if the threads have not
+  /// set a custom default. The latter is supported but unlikely and slow(er).
+  ///
+  ///{
+  ICVStateTy ICVState;
+  ///}
+
+  uint16_t ParallelTeamSize;
+};
+
+extern TeamStateTy EXTERN_SHARED(TeamState);
+
+#pragma omp end declare target
+
+} // namespace omp
+
+#endif
diff --git a/openmp/libomptarget/deviceRTLs/common/include/ThreadState.h b/openmp/libomptarget/deviceRTLs/common/include/ThreadState.h
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/deviceRTLs/common/include/ThreadState.h
@@ -0,0 +1,59 @@
+//===--------- ThreadState.h - OpenMP thread state description ---- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+#ifndef OMPTARGET_THREAD_STATE_H
+#define OMPTARGET_THREAD_STATE_H
+
+#include "ICVs.h"
+#include "TeamState.h"
+#include "Utils.h"
+#include "allocator.h"
+
+namespace omp {
+
+#pragma omp declare target
+
+struct ThreadStateTy {
+
+  /// ICVs have preallocated storage in the TeamStateTy which is used if a
+  /// thread has not set a custom value. The latter is supported but unlikely.
+  /// When it happens we will allocate dynamic memory to hold the values of all
+  /// ICVs. Thus, the first time an ICV is set by a thread we will allocate an
+  /// ICV struct to hold them all. This is slower than alternatives but allows
+  /// users to pay only for what they use.
+  ///
+  ICVStateTy ICVState;
+
+  ThreadStateTy *PreviousThreadState;
+
+  void init() {
+    ICVState = TeamState.ICVState;
+    PreviousThreadState = nullptr;
+  }
+
+  void init(ThreadStateTy &PreviousTS) {
+    ICVState = PreviousTS.ICVState;
+    PreviousThreadState = &PreviousTS;
+  }
+
+  static void dropForThread(unsigned TId);
+
+  static void enterDataEnvironment();
+  static void exitDataEnvironment();
+};
+
+extern ThreadStateTy *ThreadStates[MAX_THREADS_PER_TEAM];
+#pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc)
+
+#pragma omp end declare target
+
+} // namespace omp
+
+#endif
diff --git a/openmp/libomptarget/deviceRTLs/common/include/Utils.h b/openmp/libomptarget/deviceRTLs/common/include/Utils.h
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/deviceRTLs/common/include/Utils.h
@@ -0,0 +1,28 @@
+//===--------- Utils.h - OpenMP device runtime utility functions -- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+#ifndef OMPTARGET_UTILS_H
+#define OMPTARGET_UTILS_H
+
+#include <stdint.h>
+
+#include "ICVs.h"
+#include "Mapping.h"
+#include "target_interface.h"
+
+namespace omp {
+
+//#pragma omp declare target
+
+//#pragma omp end declare target
+
+} // namespace omp
+
+#endif
diff --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h b/openmp/libomptarget/deviceRTLs/common/omptarget.h
--- a/openmp/libomptarget/deviceRTLs/common/omptarget.h
+++ b/openmp/libomptarget/deviceRTLs/common/omptarget.h
@@ -234,10 +234,6 @@
     topTaskDescr[tid] = taskICV;
   }
   INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid) const;
-  // parallel
-  INLINE uint16_t &NumThreadsForNextParallel(int tid) {
-    return nextRegion.tnum[tid];
-  }
   // schedule (for dispatch)
   INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; }
   INLINE int64_t &Chunk(int tid) { return chunk[tid]; }
@@ -257,11 +253,6 @@
   omptarget_nvptx_TaskDescr levelOneTaskDescr[MAX_THREADS_PER_TEAM];
   // pointer where to find the current task ICV (top of the stack)
   omptarget_nvptx_TaskDescr *topTaskDescr[MAX_THREADS_PER_TEAM];
-  union {
-    // Only one of the two is live at the same time.
-    // parallel
-    uint16_t tnum[MAX_THREADS_PER_TEAM];
-  } nextRegion;
   // schedule (for dispatch)
   kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for
   int64_t chunk[MAX_THREADS_PER_TEAM];
@@ -298,16 +289,6 @@
     omptarget_nvptx_simpleMemoryManager;
 extern DEVICE uint32_t EXTERN_SHARED(usedMemIdx);
 extern DEVICE uint32_t EXTERN_SHARED(usedSlotIdx);
-#if _OPENMP
-extern DEVICE uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
-#pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc)
-#else
-extern DEVICE
-    uint8_t EXTERN_SHARED(parallelLevel)[MAX_THREADS_PER_TEAM / WARPSIZE];
-#endif
-extern DEVICE uint16_t EXTERN_SHARED(threadLimit);
-extern DEVICE uint16_t EXTERN_SHARED(threadsInTeam);
-extern DEVICE uint16_t EXTERN_SHARED(nThreads);
 extern DEVICE omptarget_nvptx_ThreadPrivateContext *
     EXTERN_SHARED(omptarget_nvptx_threadPrivateContext);
 
diff --git a/openmp/libomptarget/deviceRTLs/common/omptargeti.h b/openmp/libomptarget/deviceRTLs/common/omptargeti.h
--- a/openmp/libomptarget/deviceRTLs/common/omptargeti.h
+++ b/openmp/libomptarget/deviceRTLs/common/omptargeti.h
@@ -157,8 +157,6 @@
   // levelOneTaskDescr is init when starting the parallel region
   // top task descr is NULL (team master version will be fixed separately)
   topTaskDescr[tid] = NULL;
-  // no num threads value has been pushed
-  nextRegion.tnum[tid] = 0;
   // the following don't need to be init here; they are init when using dyn
   // sched
   // current_Event, events_Number, chunk, num_Iterations, schedule
diff --git a/openmp/libomptarget/deviceRTLs/common/src/ICVs.cpp b/openmp/libomptarget/deviceRTLs/common/src/ICVs.cpp
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/deviceRTLs/common/src/ICVs.cpp
@@ -0,0 +1,206 @@
+//===------------ libcall.cu - OpenMP GPU user calls ------------- CUDA -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the OpenMP runtime functions that can be
+// invoked by the user in an OpenMP region
+//
+//===----------------------------------------------------------------------===//
+
+#include "ICVs.h"
+#include "Mapping.h"
+#include "TeamState.h"
+#include "ThreadState.h"
+#include "omptarget.h"
+#include "support.h"
+#include "target_interface.h"
+#include <string.h>
+
+#define ICV_DEBUG(...)
+
+using namespace omp;
+
+#pragma omp declare target
+
+template <typename Ty> static Ty &getICVForThreadImpl(Ty ICVStateTy::*Var) {
+  unsigned TId = getThreadIdForSharedMemArrayAccess();
+  if (!ThreadStates[TId])
+    return TeamState.ICVState.*Var;
+
+  return ThreadStates[TId]->ICVState.*Var;
+}
+
+int &ICVStateTy::getICVForThread(int ICVStateTy::*Var) {
+  return getICVForThreadImpl<int>(Var);
+}
+
+template <typename Ty>
+static Ty incICVForThreadImpl(Ty ICVStateTy::*Var, Ty UpdateVal) {
+  unsigned TId = getThreadIdForSharedMemArrayAccess();
+  ICVStateTy::ensureICVStateForThread(TId);
+
+  ThreadStates[TId]->ICVState.*Var += UpdateVal;
+
+  return ThreadStates[TId]->ICVState.*Var;
+}
+int ICVStateTy::incICVForThread(int ICVStateTy::*Var, int UpdateVal) {
+  return incICVForThreadImpl<int>(Var, UpdateVal);
+}
+
+template <typename Ty>
+static Ty setICVForThreadImpl(Ty ICVStateTy::*Var, Ty UpdateVal) {
+  unsigned TId = getThreadIdForSharedMemArrayAccess();
+  ICVStateTy::ensureICVStateForThread(TId);
+
+  ThreadStates[TId]->ICVState.*Var = UpdateVal;
+
+  return ThreadStates[TId]->ICVState.*Var;
+}
+int ICVStateTy::setICVForThread(int ICVStateTy::*Var, int UpdateVal) {
+  return setICVForThreadImpl<int>(Var, UpdateVal);
+}
+
+bool ICVStateTy::ensureICVStateForThread(unsigned TId) {
+  if (ThreadStates[TId])
+    return false;
+
+  ThreadStates[TId] =
+      static_cast<ThreadStateTy *>(malloc(sizeof(ThreadStateTy)));
+  ThreadStates[TId]->init();
+  return true;
+}
+
+void omp_set_dynamic(int V) { ICV_DEBUG("(%i); ignored", V); }
+int omp_get_dynamic(void) {
+  ICV_DEBUG("0; constant");
+  return 0;
+}
+
+void omp_set_num_threads(int V) {
+  if (isMainThreadInGenericMode()) {
+    ICV_DEBUG("(%i); stored in team state", V);
+    TeamState.ICVState.nthreads_var = V;
+    return;
+  }
+
+  unsigned TId = getThreadIdForSharedMemArrayAccess();
+  if (!ThreadStates[TId] && TeamState.ICVState.nthreads_var == V) {
+    ICV_DEBUG("(%i); equal to team state setting, ignored", V);
+    return;
+  }
+
+  ICVStateTy::ensureICVStateForThread(TId);
+
+  ICV_DEBUG("(%i); set nthreads-var ICV for thread", V);
+  ThreadStates[TId]->ICVState.nthreads_var = V;
+}
+
+int omp_get_max_threads(void) {
+  return ICVStateTy::getICVForThread(&ICVStateTy::nthreads_var);
+}
+
+/// TODO not all functions below belong here.
+
+int omp_get_level(void) {
+  int LevelsVar = ICVStateTy::getICVForThread(&ICVStateTy::levels_var);
+  __builtin_assume(LevelsVar >= 0);
+  return LevelsVar;
+}
+
+int omp_get_active_level(void) {
+  return !!ICVStateTy::getICVForThread(&ICVStateTy::active_level);
+}
+
+int omp_in_parallel(void) {
+  return !!ICVStateTy::getICVForThread(&ICVStateTy::active_level);
+}
+
+static int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
+                                    int OutOfBoundsVal = -1) {
+  if (Level == 0)
+    return DefaultVal;
+  int LevelsVar = omp_get_level();
+  if (Level < 0 || Level > LevelsVar)
+    return OutOfBoundsVal;
+  int ActiveLevel = ICVStateTy::getICVForThread(&ICVStateTy::active_level);
+  if (Level != ActiveLevel)
+    return DefaultVal;
+  return Val;
+}
+
+int omp_get_ancestor_thread_num(int Level) {
+  return returnValIfLevelIsActive(Level, getThreadIdForSharedMemArrayAccess(),
+                                  0);
+}
+
+int omp_get_thread_num(void) {
+  return omp_get_ancestor_thread_num(omp_get_level());
+}
+
+int omp_get_team_size(int Level) {
+  return returnValIfLevelIsActive(Level, TeamState.ParallelTeamSize, 1);
+}
+
+int omp_get_num_threads(void) { return omp_get_team_size(omp_get_level()); }
+
+void ThreadStateTy::enterDataEnvironment() {
+  unsigned TId = getThreadIdForSharedMemArrayAccess();
+  if (ICVStateTy::ensureICVStateForThread(TId))
+    return;
+
+  ThreadStateTy *NewThreadState =
+      static_cast<ThreadStateTy *>(malloc(sizeof(ThreadStateTy)));
+  NewThreadState->init(*ThreadStates[TId]);
+  ThreadStates[TId] = NewThreadState;
+}
+
+void ThreadStateTy::exitDataEnvironment() {
+  unsigned TId = getThreadIdForSharedMemArrayAccess();
+  // assert(ThreadStates[TId] && "exptected thread state");
+  free(ThreadStates[TId]);
+  ThreadStates[TId] = ThreadStates[TId]->PreviousThreadState;
+}
+
+bool omp::isMainThreadInGenericMode() {
+  if (isSPMDMode())
+    return false;
+
+  int TId = GetThreadIdInBlock();
+  return TId == ((GetNumberOfThreadsInBlock() - 1) & ~(WARPSIZE - 1));
+}
+
+bool omp::isLeaderInSIMD() {
+  __kmpc_impl_lanemask_t Active = __kmpc_impl_activemask();
+  __kmpc_impl_lanemask_t LaneMaskLT = __kmpc_impl_lanemask_lt();
+  unsigned int Position = __kmpc_impl_popc(Active & LaneMaskLT);
+  return Position == 0;
+}
+
+unsigned omp::getNumberOfThreadsAccessingSharedMem() {
+  return GetNumberOfThreadsInBlock();
+}
+
+unsigned omp::getThreadIdForSharedMemArrayAccess() {
+  return GetThreadIdInBlock();
+}
+
+void ThreadStateTy::dropForThread(unsigned TId) {
+  if (!ThreadStates[TId])
+    return;
+
+  // assert(!ThreadStates[TId]->PreviousThreadState && "leftover thread state");
+  free(ThreadStates[TId]);
+  ThreadStates[TId] = nullptr;
+}
+
+DEVICE TeamStateTy SHARED(omp::TeamState);
+
+[[clang::loader_uninitialized]] DEVICE ThreadStateTy
+    *omp::ThreadStates[MAX_THREADS_PER_TEAM];
+#pragma omp allocate(omp::ThreadStates) allocator(omp_pteam_mem_alloc)
+
+#pragma omp end declare target
diff --git a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
@@ -27,66 +27,20 @@
   return rc;
 }
 
-EXTERN void omp_set_num_threads(int num) {
-  // Ignore it for SPMD mode.
-  if (isSPMDMode())
-    return;
-  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");
-  PRINT(LD_IO, "call omp_set_num_threads(num %d)\n", num);
-  if (num <= 0) {
-    WARNING0(LW_INPUT, "expected positive num; ignore\n");
-  } else if (parallelLevel[GetWarpId()] == 0) {
-    nThreads = num;
-  }
-}
-
-EXTERN int omp_get_num_threads(void) {
-  int rc = GetNumberOfOmpThreads(isSPMDMode());
-  PRINT(LD_IO, "call omp_get_num_threads() return %d\n", rc);
-  return rc;
-}
-
-EXTERN int omp_get_max_threads(void) {
-  if (parallelLevel[GetWarpId()] > 0)
-    // We're already in parallel region.
-    return 1; // default is 1 thread avail
-  // Not currently in a parallel region, return what was set.
-  int rc = 1;
-  if (parallelLevel[GetWarpId()] == 0)
-    rc = nThreads;
-  ASSERT0(LT_FUSSY, rc >= 0, "bad number of threads");
-  PRINT(LD_IO, "call omp_get_max_threads() return %d\n", rc);
-  return rc;
-}
-
 EXTERN int omp_get_thread_limit(void) {
   if (isSPMDMode())
     return GetNumberOfThreadsInBlock();
-  int rc = threadLimit;
+  int rc = GetNumberOfWorkersInTeam();
   PRINT(LD_IO, "call omp_get_thread_limit() return %d\n", rc);
   return rc;
 }
 
-EXTERN int omp_get_thread_num() {
-  bool isSPMDExecutionMode = isSPMDMode();
-  int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
-  int rc = GetOmpThreadId(tid, isSPMDExecutionMode);
-  PRINT(LD_IO, "call omp_get_thread_num() returns %d\n", rc);
-  return rc;
-}
-
 EXTERN int omp_get_num_procs(void) {
   int rc = GetNumberOfProcsInDevice(isSPMDMode());
   PRINT(LD_IO, "call omp_get_num_procs() returns %d\n", rc);
   return rc;
 }
 
-EXTERN int omp_in_parallel(void) {
-  int rc = parallelLevel[GetWarpId()] > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0;
-  PRINT(LD_IO, "call omp_in_parallel() returns %d\n", rc);
-  return rc;
-}
-
 EXTERN int omp_in_final(void) {
   // treat all tasks as final... Specs may expect runtime to keep
   // track more precisely if a task was actively set by users... This
@@ -97,16 +51,6 @@
   return rc;
 }
 
-EXTERN void omp_set_dynamic(int flag) {
-  PRINT(LD_IO, "call omp_set_dynamic(%d) is ignored (no support)\n", flag);
-}
-
-EXTERN int omp_get_dynamic(void) {
-  int rc = 0;
-  PRINT(LD_IO, "call omp_get_dynamic() returns %d\n", rc);
-  return rc;
-}
-
 EXTERN void omp_set_nested(int flag) {
   PRINT(LD_IO, "call omp_set_nested(%d) is ignored (no nested support)\n",
         flag);
@@ -130,91 +74,6 @@
   return rc;
 }
 
-EXTERN int omp_get_level(void) {
-  int level = parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1);
-  PRINT(LD_IO, "call omp_get_level() returns %d\n", level);
-  return level;
-}
-
-EXTERN int omp_get_active_level(void) {
-  int level = parallelLevel[GetWarpId()] > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0;
-  PRINT(LD_IO, "call omp_get_active_level() returns %d\n", level)
-  return level;
-}
-
-EXTERN int omp_get_ancestor_thread_num(int level) {
-  if (isSPMDMode())
-    return level == 1 ? GetThreadIdInBlock() : 0;
-  int rc = -1;
-  // If level is 0 or all parallel regions are not active - return 0.
-  unsigned parLevel = parallelLevel[GetWarpId()];
-  if (level == 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL) {
-    int totLevel = omp_get_level();
-    if (level <= totLevel) {
-      omptarget_nvptx_TaskDescr *currTaskDescr =
-          getMyTopTaskDescriptor(/*isSPMDExecutionMode=*/false);
-      int steps = totLevel - level;
-      PRINT(LD_IO, "backtrack %d steps\n", steps);
-      ASSERT0(LT_FUSSY, currTaskDescr,
-              "do not expect fct to be called in a non-active thread");
-      do {
-        if (DON(LD_IOD)) {
-          // print current state
-          omp_sched_t sched = currTaskDescr->GetRuntimeSched();
-          PRINT(LD_ALL,
-                "task descr %s %d: %s, in par %d, rt sched %d,"
-                " chunk %" PRIu64 "; tid %d, tnum %d, nthreads %d\n",
-                "ancestor", steps,
-                (currTaskDescr->IsParallelConstruct() ? "par" : "task"),
-                (int)currTaskDescr->InParallelRegion(), (int)sched,
-                currTaskDescr->RuntimeChunkSize(),
-                (int)currTaskDescr->ThreadId(), (int)threadsInTeam,
-                (int)nThreads);
-        }
-
-        if (currTaskDescr->IsParallelConstruct()) {
-          // found the level
-          if (!steps) {
-            rc = currTaskDescr->ThreadId();
-            break;
-          }
-          steps--;
-        }
-        currTaskDescr = currTaskDescr->GetPrevTaskDescr();
-      } while (currTaskDescr);
-      ASSERT0(LT_FUSSY, !steps, "expected to find all steps");
-    }
-  } else if (level == 0 ||
-             (level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL &&
-              level <= parLevel) ||
-             (level > 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL &&
-              level <= (parLevel - OMP_ACTIVE_PARALLEL_LEVEL))) {
-    rc = 0;
-  }
-  PRINT(LD_IO, "call omp_get_ancestor_thread_num(level %d) returns %d\n", level,
-        rc)
-  return rc;
-}
-
-EXTERN int omp_get_team_size(int level) {
-  if (isSPMDMode())
-    return level == 1 ? GetNumberOfThreadsInBlock() : 1;
-  int rc = -1;
-  unsigned parLevel = parallelLevel[GetWarpId()];
-  // If level is 0 or all parallel regions are not active - return 1.
-  if (level == 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL) {
-    rc = threadsInTeam;
-  } else if (level == 0 ||
-             (level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL &&
-              level <= parLevel) ||
-             (level > 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL &&
-              level <= (parLevel - OMP_ACTIVE_PARALLEL_LEVEL))) {
-    rc = 1;
-  }
-  PRINT(LD_IO, "call omp_get_team_size(level %d) returns %d\n", level, rc)
-  return rc;
-}
-
 EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier) {
   if (isRuntimeUninitialized()) {
     ASSERT0(LT_FUSSY, isSPMDMode(),
diff --git a/openmp/libomptarget/deviceRTLs/common/src/loop.cu b/openmp/libomptarget/deviceRTLs/common/src/loop.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/loop.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/loop.cu
@@ -11,6 +11,7 @@
 // interface as loops.
 //
 //===----------------------------------------------------------------------===//
+#include "ICVs.h"
 #pragma omp declare target
 
 #include "common/omptarget.h"
@@ -103,7 +104,7 @@
 
     // Assume we are in teams region or that we use a single block
     // per target region
-    ST numberOfActiveOMPThreads = GetNumberOfOmpThreads(IsSPMDExecutionMode);
+    ST numberOfActiveOMPThreads = omp_get_num_threads();
 
     // All warps that are in excess of the maximum requested, do
     // not execute the loop
@@ -211,7 +212,7 @@
     }
     int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
     omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid);
-    T tnum = GetNumberOfOmpThreads(checkSPMDMode(loc));
+    T tnum = omp_get_num_threads();
     T tripCount = ub - lb + 1; // +1 because ub is inclusive
     ASSERT0(LT_FUSSY, threadId < tnum,
             "current thread is not needed here; error");
@@ -453,7 +454,7 @@
 
     // automatically selects thread or warp ID based on selected implementation
     int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
-    ASSERT0(LT_FUSSY, gtid < GetNumberOfOmpThreads(checkSPMDMode(loc)),
+    ASSERT0(LT_FUSSY, gtid < omp_get_num_threads(),
             "current thread is not needed here; error");
     // retrieve schedule
     kmp_sched_t schedule =
@@ -507,9 +508,9 @@
     PRINT(LD_LOOP,
           "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, "
           "last %d\n",
-          (int)GetNumberOfOmpThreads(isSPMDMode()),
-          (int)GetNumberOfWorkersInTeam(), (long long)*plower,
-          (long long)*pupper, (long long)*pstride, (int)*plast);
+          (int)omp_get_num_threads(), (int)GetNumberOfWorkersInTeam(),
+          (long long)*plower, (long long)*pupper, (long long)*pstride,
+          (int)*plast);
     return DISPATCH_NOTFINISHED;
   }
 
diff --git a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu
@@ -34,13 +34,6 @@
 DEVICE uint32_t SHARED(usedMemIdx);
 DEVICE uint32_t SHARED(usedSlotIdx);
 
-// SHARED doesn't work with array so we add the attribute explicitly.
-[[clang::loader_uninitialized]] DEVICE uint8_t
-    parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
-#pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc)
-DEVICE uint16_t SHARED(threadLimit);
-DEVICE uint16_t SHARED(threadsInTeam);
-DEVICE uint16_t SHARED(nThreads);
 // Pointer to this team's OpenMP state object
 DEVICE omptarget_nvptx_ThreadPrivateContext *
     SHARED(omptarget_nvptx_threadPrivateContext);
diff --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
@@ -11,8 +11,13 @@
 //===----------------------------------------------------------------------===//
 #pragma omp declare target
 
+#include "TeamState.h"
+#include "ThreadState.h"
 #include "common/omptarget.h"
 #include "target_impl.h"
+#include "target_interface.h"
+
+#include <string.h>
 
 ////////////////////////////////////////////////////////////////////////////////
 // global data tables
@@ -26,14 +31,12 @@
 // init entry points
 ////////////////////////////////////////////////////////////////////////////////
 
-EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) {
+EXTERN void __kmpc_kernel_init(int, int16_t RequiresOMPRuntime) {
   PRINT(LD_IO, "call to __kmpc_kernel_init with version %f\n",
         OMPTARGET_NVPTX_VERSION);
   ASSERT0(LT_FUSSY, RequiresOMPRuntime,
           "Generic always requires initialized runtime.");
   setExecutionParameters(Generic, RuntimeInitialized);
-  for (int I = 0; I < MAX_THREADS_PER_TEAM / WARPSIZE; ++I)
-    parallelLevel[I] = 0;
 
   int threadIdInBlock = GetThreadIdInBlock();
   ASSERT0(LT_FUSSY, threadIdInBlock == GetMasterThreadID(),
@@ -62,9 +65,16 @@
   // set number of threads and thread limit in team to started value
   omptarget_nvptx_TaskDescr *currTaskDescr =
       omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
-  nThreads = GetNumberOfThreadsInBlock();
-  threadLimit = ThreadLimit;
   __kmpc_impl_target_init();
+
+  if (threadId == 0) {
+    omp::TeamState.ICVState.nthreads_var = GetNumberOfThreadsInBlock();
+    omp::TeamState.ICVState.levels_var = 0;
+    omp::TeamState.ICVState.active_level = -1;
+    omp::TeamState.ParallelTeamSize = -1;
+    memset(omp::ThreadStates, 0,
+           GetNumberOfThreadsInBlock() * sizeof(omp::ThreadStates[0]));
+  }
 }
 
 EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) {
@@ -72,27 +82,26 @@
   ASSERT0(LT_FUSSY, IsOMPRuntimeInitialized,
           "Generic always requires initialized runtime.");
   // Enqueue omp state object for use by another team.
-  int slot = usedSlotIdx;
+  int slot = __kmpc_impl_smid() % MAX_SM;
   omptarget_nvptx_device_State[slot].Enqueue(
       omptarget_nvptx_threadPrivateContext);
   // Done with work.  Kill the workers.
   omptarget_nvptx_workFn = 0;
 }
 
-EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit,
-                                    int16_t RequiresOMPRuntime) {
+EXTERN void __kmpc_spmd_kernel_init(int, int16_t RequiresOMPRuntime) {
   PRINT0(LD_IO, "call to __kmpc_spmd_kernel_init\n");
 
   setExecutionParameters(Spmd, RequiresOMPRuntime ? RuntimeInitialized
                                                   : RuntimeUninitialized);
   int threadId = GetThreadIdInBlock();
   if (threadId == 0) {
-    usedSlotIdx = __kmpc_impl_smid() % MAX_SM;
-    parallelLevel[0] =
-        1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0);
-  } else if (GetLaneId() == 0) {
-    parallelLevel[GetWarpId()] =
-        1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0);
+    omp::TeamState.ICVState.nthreads_var = 1;
+    omp::TeamState.ICVState.levels_var = 1;
+    omp::TeamState.ICVState.active_level = GetNumberOfThreadsInBlock() > 1;
+    omp::TeamState.ParallelTeamSize = GetNumberOfThreadsInBlock();
+    memset(omp::ThreadStates, 0,
+           GetNumberOfThreadsInBlock() * sizeof(omp::ThreadStates[0]));
   }
   if (!RequiresOMPRuntime) {
     // Runtime is not required - exit.
diff --git a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
@@ -31,31 +31,27 @@
 //    To make a long story short...
 //
 //===----------------------------------------------------------------------===//
+#include "ICVs.h"
+#include "TeamState.h"
+#include "ThreadState.h"
+#include "interface.h"
 #pragma omp declare target
 
 #include "common/omptarget.h"
 #include "target_impl.h"
 
+using namespace omp;
+
 ////////////////////////////////////////////////////////////////////////////////
 // support for parallel that goes parallel (1 static level only)
 ////////////////////////////////////////////////////////////////////////////////
 
-INLINE static uint16_t determineNumberOfThreads(uint16_t NumThreadsClause,
-                                                uint16_t NThreadsICV,
-                                                uint16_t ThreadLimit) {
-  uint16_t ThreadsRequested = NThreadsICV;
-  if (NumThreadsClause != 0) {
-    ThreadsRequested = NumThreadsClause;
-  }
-
-  uint16_t ThreadsAvailable = GetNumberOfWorkersInTeam();
-  if (ThreadLimit != 0 && ThreadLimit < ThreadsAvailable) {
-    ThreadsAvailable = ThreadLimit;
-  }
+INLINE static uint16_t determineNumberOfThreads() {
+  int NThreadsICV = ICVStateTy::getICVForThread(&ICVStateTy::nthreads_var);
 
-  uint16_t NumThreads = ThreadsAvailable;
-  if (ThreadsRequested != 0 && ThreadsRequested < NumThreads) {
-    NumThreads = ThreadsRequested;
+  uint16_t NumThreads = GetNumberOfWorkersInTeam();
+  if (NThreadsICV != 0 && NThreadsICV < NumThreads) {
+    NumThreads = NThreadsICV;
   }
 
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
@@ -92,26 +88,25 @@
     return;
   }
 
-  uint16_t &NumThreadsClause =
-      omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);
-
-  uint16_t NumThreads =
-      determineNumberOfThreads(NumThreadsClause, nThreads, threadLimit);
-
-  if (NumThreadsClause != 0) {
-    // Reset request to avoid propagating to successive #parallel
-    NumThreadsClause = 0;
-  }
+  uint16_t NumThreads = determineNumberOfThreads();
+  TeamState.ParallelTeamSize = NumThreads;
 
-  ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
-         (int)NumThreads);
   ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
           "only team master can create parallel");
 
   // Set number of threads on work descriptor.
   omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
   workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr);
-  threadsInTeam = NumThreads;
+
+  // We do *not* create a new data environment because all threads in the team
+  // that are active are now running this parallel region. They share the
+  // TeamState, which has an increase level-var and potentially active-level
+  // set, but they do not have individual ThreadStates yet. If they ever
+  // modify the ICVs beyond this point a ThreadStates will be allocated.
+  auto Level = TeamState.ICVState.levels_var += 1;
+  bool IsActiveParallelRegion = NumThreads > 1;
+  if (IsActiveParallelRegion)
+    TeamState.ICVState.active_level = Level;
 }
 
 // All workers call this function.  Deactivate those not needed.
@@ -122,6 +117,11 @@
 EXTERN bool __kmpc_kernel_parallel(void **WorkFn) {
   PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n");
 
+  int threadId = GetThreadIdInBlock();
+  // In case we have modified an ICV for this thread before a ThreadState was
+  // created. We drop it now to not contaminate the next parallel region.
+  ThreadStateTy::dropForThread(threadId);
+
   // Work function and arguments for L1 parallel region.
   *WorkFn = omptarget_nvptx_workFn;
 
@@ -131,14 +131,15 @@
     return false;
   }
 
+  uint16_t NumThreads = determineNumberOfThreads();
+
   // Only the worker threads call this routine and the master warp
   // never arrives here.  Therefore, use the nvptx thread id.
-  int threadId = GetThreadIdInBlock();
   omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
   // Set to true for workers participating in the parallel region.
-  bool isActive = false;
+  bool ThreadIsActive = threadId < NumThreads;
   // Initialize state for active threads.
-  if (threadId < threadsInTeam) {
+  if (ThreadIsActive) {
     // init work descriptor from workdesccr
     omptarget_nvptx_TaskDescr *newTaskDescr =
         omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId);
@@ -152,21 +153,9 @@
           "thread will execute parallel region with id %d in a team of "
           "%d threads\n",
           (int)newTaskDescr->ThreadId(), (int)nThreads);
-
-    isActive = true;
-    // Reconverge the threads at the end of the parallel region to correctly
-    // handle parallel levels.
-    // In Cuda9+ in non-SPMD mode we have either 1 worker thread or the whole
-    // warp. If only 1 thread is active, not need to reconverge the threads.
-    // If we have the whole warp, reconverge all the threads in the warp before
-    // actually trying to change the parallel level. Otherwise, parallel level
-    // can be changed incorrectly because of threads divergence.
-    bool IsActiveParallelRegion = threadsInTeam != 1;
-    IncParallelLevel(IsActiveParallelRegion,
-                     IsActiveParallelRegion ? __kmpc_impl_all_lanes : 1u);
   }
 
-  return isActive;
+  return ThreadIsActive;
 }
 
 EXTERN void __kmpc_kernel_end_parallel() {
@@ -181,16 +170,13 @@
   omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
       threadId, currTaskDescr->GetPrevTaskDescr());
 
-  // Reconverge the threads at the end of the parallel region to correctly
-  // handle parallel levels.
-  // In Cuda9+ in non-SPMD mode we have either 1 worker thread or the whole
-  // warp. If only 1 thread is active, not need to reconverge the threads.
-  // If we have the whole warp, reconverge all the threads in the warp before
-  // actually trying to change the parallel level. Otherwise, parallel level can
-  // be changed incorrectly because of threads divergence.
-  bool IsActiveParallelRegion = threadsInTeam != 1;
-  DecParallelLevel(IsActiveParallelRegion,
-                   IsActiveParallelRegion ? __kmpc_impl_all_lanes : 1u);
+  // We did *not* create a new data environment because all threads in the team
+  // that were active were running the parallel region. We used the TeamState
+  // which needs adjustment now.
+  TeamState.ICVState.levels_var -= 1;
+  bool IsActiveParallelRegion = omp_get_num_threads() > 1;
+  if (IsActiveParallelRegion)
+    TeamState.ICVState.active_level = 0;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -200,7 +186,8 @@
 EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) {
   PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n");
 
-  IncParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask());
+  ThreadStateTy::enterDataEnvironment();
+  ICVStateTy::incICVForThread(&ICVStateTy::levels_var, 1);
 
   if (checkRuntimeUninitialized(loc)) {
     ASSERT0(LT_FUSSY, checkSPMDMode(loc),
@@ -239,7 +226,8 @@
                                            uint32_t global_tid) {
   PRINT0(LD_IO, "call to __kmpc_end_serialized_parallel\n");
 
-  DecParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask());
+  ThreadStateTy::exitDataEnvironment();
+  ICVStateTy::incICVForThread(&ICVStateTy::levels_var, -1);
 
   if (checkRuntimeUninitialized(loc)) {
     ASSERT0(LT_FUSSY, checkSPMDMode(loc),
@@ -261,8 +249,7 @@
 
 EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) {
   PRINT0(LD_IO, "call to __kmpc_parallel_level\n");
-
-  return parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1);
+  return omp_get_level();
 }
 
 // This kmpc call returns the thread id across all teams. It's value is
@@ -270,22 +257,16 @@
 // it's cheap to recalculate this value so we never use the result
 // of this call.
 EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) {
-  int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
-  return GetOmpThreadId(tid, checkSPMDMode(loc));
+  return omp_get_thread_num();
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 // push params
 ////////////////////////////////////////////////////////////////////////////////
 
-EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t tid,
-                                    int32_t num_threads) {
-  PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads);
-  ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
-          "Runtime must be initialized.");
-  tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
-  omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) =
-      num_threads;
+EXTERN void __kmpc_push_num_threads(kmp_Ident *Loc, int32_t TId,
+                                    int32_t NumThreads) {
+  ICVStateTy::setICVForThread(&ICVStateTy::nthreads_var, NumThreads);
 }
 
 // Do nothing. The host guarantees we started the requested number of
diff --git a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
@@ -9,6 +9,7 @@
 // This file contains the implementation of reduction with KMPC interface.
 //
 //===----------------------------------------------------------------------===//
+#include "ICVs.h"
 #pragma omp declare target
 
 #include "common/omptarget.h"
@@ -81,7 +82,7 @@
     kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
     bool isSPMDExecutionMode, bool isRuntimeUninitialized) {
   uint32_t BlockThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
-  uint32_t NumThreads = GetNumberOfOmpThreads(isSPMDExecutionMode);
+  uint32_t NumThreads = omp_get_num_threads();
   if (NumThreads == 1)
     return 1;
     /*
@@ -204,8 +205,7 @@
   // In generic mode only the team master participates in the teams
   // reduction because the workers are waiting for parallel work.
   uint32_t NumThreads =
-      checkSPMDMode(loc) ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true)
-                         : /*Master thread only*/ 1;
+      checkSPMDMode(loc) ? omp_get_num_threads() : /*Master thread only*/ 1;
   uint32_t TeamId = GetBlockIdInKernel();
   uint32_t NumTeams = GetNumberOfBlocksInKernel();
   static unsigned SHARED(Bound);
diff --git a/openmp/libomptarget/deviceRTLs/common/src/support.cu b/openmp/libomptarget/deviceRTLs/common/src/support.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/support.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/support.cu
@@ -136,37 +136,6 @@
 //
 ////////////////////////////////////////////////////////////////////////////////
 
-DEVICE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode) {
-  // omp_thread_num
-  int rc;
-  if ((parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1)) > 1) {
-    rc = 0;
-  } else if (isSPMDExecutionMode) {
-    rc = GetThreadIdInBlock();
-  } else {
-    omptarget_nvptx_TaskDescr *currTaskDescr =
-        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
-    ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
-    rc = currTaskDescr->ThreadId();
-  }
-  return rc;
-}
-
-DEVICE int GetNumberOfOmpThreads(bool isSPMDExecutionMode) {
-  // omp_num_threads
-  int rc;
-  int Level = parallelLevel[GetWarpId()];
-  if (Level != OMP_ACTIVE_PARALLEL_LEVEL + 1) {
-    rc = 1;
-  } else if (isSPMDExecutionMode) {
-    rc = GetNumberOfThreadsInBlock();
-  } else {
-    rc = threadsInTeam;
-  }
-
-  return rc;
-}
-
 ////////////////////////////////////////////////////////////////////////////////
 // Team id linked to OpenMP
 
@@ -185,33 +154,6 @@
 
 DEVICE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); }
 
-////////////////////////////////////////////////////////////////////////////////
-// Parallel level
-
-DEVICE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) {
-  __kmpc_impl_syncwarp(Mask);
-  __kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt();
-  unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt);
-  if (Rank == 0) {
-    parallelLevel[GetWarpId()] +=
-        (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
-    __kmpc_impl_threadfence();
-  }
-  __kmpc_impl_syncwarp(Mask);
-}
-
-DEVICE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) {
-  __kmpc_impl_syncwarp(Mask);
-  __kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt();
-  unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt);
-  if (Rank == 0) {
-    parallelLevel[GetWarpId()] -=
-        (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
-    __kmpc_impl_threadfence();
-  }
-  __kmpc_impl_syncwarp(Mask);
-}
-
 ////////////////////////////////////////////////////////////////////////////////
 // get OpenMP number of procs
 
diff --git a/openmp/libomptarget/deviceRTLs/common/src/sync.cu b/openmp/libomptarget/deviceRTLs/common/src/sync.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/sync.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/sync.cu
@@ -9,6 +9,7 @@
 // Include all synchronization.
 //
 //===----------------------------------------------------------------------===//
+#include "ICVs.h"
 #pragma omp declare target
 
 #include "common/omptarget.h"
@@ -48,8 +49,7 @@
     __kmpc_barrier_simple_spmd(loc_ref, tid);
   } else {
     tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc_ref));
-    int numberOfActiveOMPThreads =
-        GetNumberOfOmpThreads(checkSPMDMode(loc_ref));
+    int numberOfActiveOMPThreads = omp_get_num_threads();
     if (numberOfActiveOMPThreads > 1) {
       if (checkSPMDMode(loc_ref)) {
         __kmpc_barrier_simple_spmd(loc_ref, tid);
diff --git a/openmp/libomptarget/deviceRTLs/common/support.h b/openmp/libomptarget/deviceRTLs/common/support.h
--- a/openmp/libomptarget/deviceRTLs/common/support.h
+++ b/openmp/libomptarget/deviceRTLs/common/support.h
@@ -56,13 +56,10 @@
 DEVICE int GetNumberOfWorkersInTeam();
 
 // get OpenMP thread and team ids
-DEVICE int GetOmpThreadId(int threadId,
-                          bool isSPMDExecutionMode); // omp_thread_num
-DEVICE int GetOmpTeamId();                           // omp_team_num
+DEVICE int GetOmpTeamId(); // omp_team_num
 
 // get OpenMP number of threads and team
-DEVICE int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads
-DEVICE int GetNumberOfOmpTeams();                           // omp_num_teams
+DEVICE int GetNumberOfOmpTeams(); // omp_num_teams
 
 // get OpenMP number of procs
 DEVICE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode);
@@ -71,10 +68,6 @@
 // masters
 DEVICE int IsTeamMaster(int ompThreadId);
 
-// Parallel level
-DEVICE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
-DEVICE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
-
 ////////////////////////////////////////////////////////////////////////////////
 // Memory
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
--- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
+++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
@@ -68,6 +68,8 @@
 get_filename_component(devicertl_base_directory
   ${CMAKE_CURRENT_SOURCE_DIR}
   DIRECTORY)
+set(devicertl_include_directory
+  ${devicertl_base_directory}/common/include)
 set(devicertl_common_directory
   ${devicertl_base_directory}/common)
 set(devicertl_nvptx_directory
@@ -128,6 +130,7 @@
   ${devicertl_common_directory}/src/support.cu
   ${devicertl_common_directory}/src/sync.cu
   ${devicertl_common_directory}/src/task.cu
+  ${devicertl_common_directory}/src/ICVs.cpp
   src/target_impl.cu
 )
 
@@ -139,6 +142,8 @@
              -fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device
              -D__CUDACC__
              -I${devicertl_base_directory}
+             -I${devicertl_common_directory}
+             -I${devicertl_include_directory}
              -I${devicertl_nvptx_directory}/src)
 
 if(${LIBOMPTARGET_NVPTX_DEBUG})
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -62,8 +62,6 @@
 #endif
 #endif
 
-#define OMP_ACTIVE_PARALLEL_LEVEL 128
-
 // Data sharing related quantities, need to match what is used in the compiler.
 enum DATA_SHARING_SIZES {
   // The maximum number of workers in a kernel.
diff --git a/openmp/libomptarget/test/offloading/omp_get_team_size.c b/openmp/libomptarget/test/offloading/omp_get_team_size.c
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/omp_get_team_size.c
@@ -0,0 +1,36 @@
+// TODO: make this a test case
+#include <omp.h>
+#include <stdio.h>
+#define N 12
+
+void foo(int *A) {
+  if (omp_get_thread_num() == N / 2)
+  {
+    for (int i = 0; i < 4; ++i)
+      A[i] = omp_get_team_size(i);
+    #pragma omp parallel
+    {
+      for (int i = 0; i < 4; ++i)
+        A[i+4] = omp_get_team_size(i);
+      #pragma omp parallel
+      {
+        for (int i = 0; i < 4; ++i)
+          A[i+8] = omp_get_team_size(i);
+      }
+    }
+  }
+}
+
+int main() {
+  int A[N];
+  for (int i = 0; i < N; ++i)
+    A[i] = 42424242;
+  #pragma omp target teams num_teams(1) thread_limit(N) map(tofrom:A[:N])
+  {
+    #pragma omp parallel
+    foo(A);
+  }
+  for (int i = 0; i < N; ++i)
+    printf("%i : %i\n", i, A[i]);
+  return 0;
+}
diff --git a/openmp/libomptarget/test/offloading/omp_set_num_threads.c b/openmp/libomptarget/test/offloading/omp_set_num_threads.c
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/omp_set_num_threads.c
@@ -0,0 +1,45 @@
+// TODO: make this a test case
+#include <omp.h>
+#include <stdio.h>
+#define N 12
+
+void foo(int *A) {
+  if (omp_get_thread_num() == N / 2)
+  {
+    for (int i = 0; i < 2; ++i)
+      A[i] = omp_get_max_threads();
+    omp_set_num_threads(42);
+    for (int i = 2; i < 4; ++i)
+      A[i] = omp_get_max_threads();
+    #pragma omp parallel
+    {
+      for (int i = 0; i < 2; ++i)
+        A[i+4] = omp_get_max_threads();
+      omp_set_num_threads(4242);
+      for (int i = 2; i < 4; ++i)
+        A[i+4] = omp_get_max_threads();
+      #pragma omp parallel
+      {
+        for (int i = 0; i < 2; ++i)
+          A[i+8] = omp_get_max_threads();
+        omp_set_num_threads(424242);
+        for (int i = 2; i < 4; ++i)
+          A[i+8] = omp_get_max_threads();
+      }
+    }
+  }
+}
+
+int main() {
+  int A[N];
+  for (int i = 0; i < N; ++i)
+    A[i] = 42424242;
+  #pragma omp target teams num_teams(1) thread_limit(N) map(tofrom:A[:N])
+  {
+    #pragma omp parallel
+    foo(A);
+  }
+  for (int i = 0; i < N; ++i)
+    printf("%i : %i\n", i, A[i]);
+  return 0;
+}