Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu
===================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu
@@ -381,7 +381,7 @@
 // Support for dispatch next
 
 INLINE static int64_t Shuffle(unsigned active, int64_t val, int leader) {
-  int lo, hi;
+  uint32_t lo, hi;
   __kmpc_impl_unpack(val, lo, hi);
   hi = __kmpc_impl_shfl_sync(active, hi, leader);
   lo = __kmpc_impl_shfl_sync(active, lo, leader);
@@ -390,8 +390,8 @@
 
 INLINE static uint64_t NextIter() {
   __kmpc_impl_lanemask_t active = __ACTIVEMASK();
-  int leader = __kmpc_impl_ffs(active) - 1;
-  int change = __kmpc_impl_popc(active);
+  uint32_t leader = __kmpc_impl_ffs(active) - 1;
+  uint32_t change = __kmpc_impl_popc(active);
   __kmpc_impl_lanemask_t lane_mask_lt = __kmpc_impl_lanemask_lt();
   unsigned int rank = __kmpc_impl_popc(active & lane_mask_lt);
   uint64_t warp_res;
Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu
===================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu
@@ -49,13 +49,12 @@
                                           int32_t *LaneId, int32_t *NumLanes) {
   PRINT0(LD_IO, "call to __kmpc_kernel_convergent_simd\n");
   uint32_t ConvergentMask = Mask;
-  int32_t ConvergentSize = __popc(ConvergentMask);
+  int32_t ConvergentSize = __kmpc_impl_popc(ConvergentMask);
   uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
-  *LaneSource += __ffs(WorkRemaining);
-  *IsFinal = __popc(WorkRemaining) == 1;
-  uint32_t lanemask_lt;
-  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
-  *LaneId = __popc(ConvergentMask & lanemask_lt);
+  *LaneSource += __kmpc_impl_ffs(WorkRemaining);
+  *IsFinal = __kmpc_impl_popc(WorkRemaining) == 1;
+  uint32_t lanemask_lt = __kmpc_impl_lanemask_lt();
+  *LaneId = __kmpc_impl_popc(ConvergentMask & lanemask_lt);
   int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
   int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;
 
@@ -123,13 +122,12 @@
                                               int32_t *LaneSource) {
   PRINT0(LD_IO, "call to __kmpc_kernel_convergent_parallel\n");
   uint32_t ConvergentMask = Mask;
-  int32_t ConvergentSize = __popc(ConvergentMask);
+  int32_t ConvergentSize = __kmpc_impl_popc(ConvergentMask);
   uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
-  *LaneSource += __ffs(WorkRemaining);
-  *IsFinal = __popc(WorkRemaining) == 1;
-  uint32_t lanemask_lt;
-  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
-  uint32_t OmpId = __popc(ConvergentMask & lanemask_lt);
+  *LaneSource += __kmpc_impl_ffs(WorkRemaining);
+  *IsFinal = __kmpc_impl_popc(WorkRemaining) == 1;
+  uint32_t lanemask_lt = __kmpc_impl_lanemask_lt();
+  uint32_t OmpId = __kmpc_impl_popc(ConvergentMask & lanemask_lt);
   int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
   int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;
 
Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu
===================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu
@@ -28,12 +28,11 @@
 }
 
 EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) {
-  int lo, hi;
-  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
+  uint32_t lo, hi;
+  __kmpc_impl_unpack(val, lo, hi);
   hi = __kmpc_impl_shfl_down_sync(0xFFFFFFFF, hi, delta, size);
   lo = __kmpc_impl_shfl_down_sync(0xFFFFFFFF, lo, delta, size);
-  asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
-  return val;
+  return __kmpc_impl_pack(lo, hi);
 }
 
 INLINE static void gpu_regular_warp_reduce(void *reduce_data,
@@ -60,18 +59,16 @@
 
 INLINE static uint32_t
 gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) {
-  uint32_t lanemask_lt;
-  uint32_t lanemask_gt;
   uint32_t size, remote_id, physical_lane_id;
   physical_lane_id = GetThreadIdInBlock() % WARPSIZE;
-  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
+  uint32_t lanemask_lt = __kmpc_impl_lanemask_lt();
   uint32_t Liveness = __ACTIVEMASK();
-  uint32_t logical_lane_id = __popc(Liveness & lanemask_lt) * 2;
-  asm("mov.u32 %0, %%lanemask_gt;" : "=r"(lanemask_gt));
+  uint32_t logical_lane_id = __kmpc_impl_popc(Liveness & lanemask_lt) * 2;
+  uint32_t lanemask_gt = __kmpc_impl_lanemask_gt();
   do {
     Liveness = __ACTIVEMASK();
-    remote_id = __ffs(Liveness & lanemask_gt);
-    size = __popc(Liveness);
+    remote_id = __kmpc_impl_ffs(Liveness & lanemask_gt);
+    size = __kmpc_impl_popc(Liveness);
     logical_lane_id /= 2;
     shflFct(reduce_data, /*LaneId =*/logical_lane_id,
             /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
@@ -150,7 +147,7 @@
     gpu_regular_warp_reduce(reduce_data, shflFct);
   else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
     gpu_irregular_warp_reduce(reduce_data, shflFct,
-                              /*LaneCount=*/__popc(Liveness),
+                              /*LaneCount=*/__kmpc_impl_popc(Liveness),
                               /*LaneId=*/GetThreadIdInBlock() % WARPSIZE);
   else if (!isRuntimeUninitialized) // Dispersed lanes. Only threads in L2
                                     // parallel region may enter here; return
@@ -325,7 +322,7 @@
     gpu_regular_warp_reduce(reduce_data, shflFct);
   else // Partial warp but contiguous lanes
     gpu_irregular_warp_reduce(reduce_data, shflFct,
-                              /*LaneCount=*/__popc(Liveness),
+                              /*LaneCount=*/__kmpc_impl_popc(Liveness),
                               /*LaneId=*/ThreadId % WARPSIZE);
 
   // When we have more than [warpsize] number of threads
Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h
===================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -206,9 +206,8 @@
 INLINE void IncParallelLevel(bool ActiveParallel) {
   unsigned Active = __ACTIVEMASK();
   __kmpc_impl_syncwarp(Active);
-  unsigned LaneMaskLt;
-  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt));
-  unsigned Rank = __popc(Active & LaneMaskLt);
+  unsigned LaneMaskLt = __kmpc_impl_lanemask_lt();
+  unsigned Rank = __kmpc_impl_popc(Active & LaneMaskLt);
   if (Rank == 0) {
     parallelLevel[GetWarpId()] +=
         (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
@@ -220,9 +219,8 @@
 INLINE void DecParallelLevel(bool ActiveParallel) {
   unsigned Active = __ACTIVEMASK();
   __kmpc_impl_syncwarp(Active);
-  unsigned LaneMaskLt;
-  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt));
-  unsigned Rank = __popc(Active & LaneMaskLt);
+  unsigned LaneMaskLt = __kmpc_impl_lanemask_lt();
+  unsigned Rank = __kmpc_impl_popc(Active & LaneMaskLt);
   if (Rank == 0) {
     parallelLevel[GetWarpId()] -=
         (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/target_impl.h
===================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -16,12 +16,12 @@
 
 #include "option.h"
 
-INLINE void __kmpc_impl_unpack(int64_t val, int32_t &lo, int32_t &hi) {
+INLINE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
   asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
 }
 
-INLINE int64_t __kmpc_impl_pack(int32_t lo, int32_t hi) {
-  int64_t val;
+INLINE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
+  uint64_t val;
   asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
   return val;
 }
@@ -34,9 +34,15 @@
   return res;
 }
 
-INLINE int __kmpc_impl_ffs(uint32_t x) { return __ffs(x); }
+INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() {
+  __kmpc_impl_lanemask_t res;
+  asm("mov.u32 %0, %%lanemask_gt;" : "=r"(res));
+  return res;
+}
+
+INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __ffs(x); }
 
-INLINE int __kmpc_impl_popc(uint32_t x) { return __popc(x); }
+INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __popc(x); }
 
 #ifndef CUDA_VERSION
 #error CUDA_VERSION macro is undefined, something wrong with cuda.