Index: libomptarget/deviceRTLs/nvptx/CMakeLists.txt
===================================================================
--- libomptarget/deviceRTLs/nvptx/CMakeLists.txt
+++ libomptarget/deviceRTLs/nvptx/CMakeLists.txt
@@ -93,7 +93,7 @@
     # Find a clang compiler capable of compiling cuda files to LLVM bitcode and
     # an LLVM linker.
     # We use the one provided by the user, attempt to use the one used to build
-    # libomptarget, attempt to use clang in the PATH, or just fail.
+    # libomptarget or just fail.
 
     set(LIBOMPTARGET_NVPTX_CUDA_COMPILER "" CACHE STRING
       "Location of a CUDA compiler capable of emitting LLVM bitcode.")
@@ -105,8 +105,8 @@
     elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang")
       set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${CMAKE_C_COMPILER})
     else()
-      libomptarget_say("Cannot find a CUDA compiler capable of emitting LLVM bitcode.")
-      libomptarget_say("Please configure with flag -DLIBOMPTARGET_NVPTX_CUDA_COMPILER")
+      libomptarget_error_say("Cannot find a CUDA compiler capable of emitting LLVM bitcode.")
+      libomptarget_error_say("Please configure with flag -DLIBOMPTARGET_NVPTX_CUDA_COMPILER")
     endif()
 
     # Get compiler directory to try to locate a suitable linker
@@ -118,8 +118,8 @@
       # Use llvm-link from the directory containing clang
       set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER ${COMPILER_DIR}/llvm-link)
     else()
-      libomptarget_say("Cannot find a linker capable of linking LLVM bitcode objects.")
-      libomptarget_say("Please configure with flag -DLIBOMPTARGET_NVPTX_BC_LINKER")
+      libomptarget_error_say("Cannot find a linker capable of linking LLVM bitcode objects.")
+      libomptarget_error_say("Please configure with flag -DLIBOMPTARGET_NVPTX_BC_LINKER")
     endif()
 
     if(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER AND LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER)
Index: libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
===================================================================
--- libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
+++ libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
@@ -23,7 +23,7 @@
 }
 // Warp ID in the CUDA block
 __device__ static unsigned getWarpId() {
-  return threadIdx.x >> DS_Max_Worker_Warp_Size_Bits;
+  return threadIdx.x / WARPSIZE;
 }
 
 // The CUDA thread ID of the master thread.
@@ -300,7 +300,7 @@
 
   // Get the frame used by the requested thread.
 
-  unsigned SourceWID = SourceThreadID >> DS_Max_Worker_Warp_Size_Bits;
+  unsigned SourceWID = SourceThreadID / WARPSIZE;
 
   DSPRINT(DSFLAG,"Source  warp: %d\n", SourceWID);
 
Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
===================================================================
--- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
+++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
@@ -42,12 +42,6 @@
   return id;
 }
 
-INLINE unsigned n_sm() {
-  unsigned n_sm;
-  asm("mov.u32 %0, %%nsmid;" : "=r"(n_sm));
-  return n_sm;
-}
-
 EXTERN void __kmpc_kernel_init_params(void *Ptr) {
   PRINT(LD_IO, "call to __kmpc_kernel_init_params with version %f\n",
         OMPTARGET_NVPTX_VERSION);
@@ -172,7 +166,7 @@
 
   if (RequiresDataSharing && threadId % WARPSIZE == 0) {
     // Warp master innitializes data sharing environment.
-    unsigned WID = threadId >> DS_Max_Worker_Warp_Size_Bits;
+    unsigned WID = threadId / WARPSIZE;
     __kmpc_data_sharing_slot *RootS = currTeamDescr.RootS(WID);
     DataSharingState.SlotPtr[WID] = RootS;
     DataSharingState.StackPtr[WID] = (void*)&RootS->Data[0];
Index: libomptarget/deviceRTLs/nvptx/src/option.h
===================================================================
--- libomptarget/deviceRTLs/nvptx/src/option.h
+++ libomptarget/deviceRTLs/nvptx/src/option.h
@@ -19,10 +19,8 @@
 
 ////////////////////////////////////////////////////////////////////////////////
 // The following def must match the absolute limit hardwired in the host RTL
-#define THREAD_ABSOLUTE_LIMIT 1024 /* omptx limit (must match threadAbsoluteLimit) */
-
 // max number of threads per team
-#define MAX_THREADS_PER_TEAM THREAD_ABSOLUTE_LIMIT
+#define MAX_THREADS_PER_TEAM 1024
 
 #define WARPSIZE 32
 
Index: libomptarget/deviceRTLs/nvptx/src/reduction.cu
===================================================================
--- libomptarget/deviceRTLs/nvptx/src/reduction.cu
+++ libomptarget/deviceRTLs/nvptx/src/reduction.cu
@@ -234,8 +234,8 @@
   // Get the OMP thread Id. This is different from BlockThreadId in the case of
   // an L2 parallel region.
   return GetOmpThreadId(BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized) == 0;
-}
 #endif // __CUDA_ARCH__ >= 700
+}
 
 EXTERN
 int32_t __kmpc_nvptx_parallel_reduce_nowait(
@@ -382,10 +382,10 @@
     if (WarpId == 0)
       gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
   }
+#endif // __CUDA_ARCH__ >= 700
 
   return ThreadId == 0;
 }
-#endif // __CUDA_ARCH__ >= 700
 
 EXTERN
 int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars,