Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu
===================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu
@@ -381,7 +381,7 @@
 // Support for dispatch next
 
 INLINE static int64_t Shuffle(unsigned active, int64_t val, int leader) {
-  int lo, hi;
+  uint32_t lo, hi;
   __kmpc_impl_unpack(val, lo, hi);
   hi = __kmpc_impl_shfl_sync(active, hi, leader);
   lo = __kmpc_impl_shfl_sync(active, lo, leader);
@@ -390,8 +390,8 @@
 
 INLINE static uint64_t NextIter() {
   __kmpc_impl_lanemask_t active = __ACTIVEMASK();
-  int leader = __kmpc_impl_ffs(active) - 1;
-  int change = __kmpc_impl_popc(active);
+  uint32_t leader = __kmpc_impl_ffs(active) - 1;
+  uint32_t change = __kmpc_impl_popc(active);
   __kmpc_impl_lanemask_t lane_mask_lt = __kmpc_impl_lanemask_lt();
   unsigned int rank = __kmpc_impl_popc(active & lane_mask_lt);
   uint64_t warp_res;
Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu
===================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu
@@ -49,13 +49,12 @@
                                           int32_t *LaneId, int32_t *NumLanes) {
   PRINT0(LD_IO, "call to __kmpc_kernel_convergent_simd\n");
   uint32_t ConvergentMask = Mask;
-  int32_t ConvergentSize = __popc(ConvergentMask);
+  int32_t ConvergentSize = __kmpc_impl_popc(ConvergentMask);
   uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
-  *LaneSource += __ffs(WorkRemaining);
-  *IsFinal = __popc(WorkRemaining) == 1;
-  uint32_t lanemask_lt;
-  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
-  *LaneId = __popc(ConvergentMask & lanemask_lt);
+  *LaneSource += __kmpc_impl_ffs(WorkRemaining);
+  *IsFinal = __kmpc_impl_popc(WorkRemaining) == 1;
+  uint32_t lanemask_lt = __kmpc_impl_lanemask_lt();
+  *LaneId = __kmpc_impl_popc(ConvergentMask & lanemask_lt);
   int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
   int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;
 
@@ -123,13 +122,12 @@
                                               int32_t *LaneSource) {
   PRINT0(LD_IO, "call to __kmpc_kernel_convergent_parallel\n");
   uint32_t ConvergentMask = Mask;
-  int32_t ConvergentSize = __popc(ConvergentMask);
+  int32_t ConvergentSize = __kmpc_impl_popc(ConvergentMask);
   uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
-  *LaneSource += __ffs(WorkRemaining);
-  *IsFinal = __popc(WorkRemaining) == 1;
-  uint32_t lanemask_lt;
-  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
-  uint32_t OmpId = __popc(ConvergentMask & lanemask_lt);
+  *LaneSource += __kmpc_impl_ffs(WorkRemaining);
+  *IsFinal = __kmpc_impl_popc(WorkRemaining) == 1;
+  uint32_t lanemask_lt = __kmpc_impl_lanemask_lt();
+  uint32_t OmpId = __kmpc_impl_popc(ConvergentMask & lanemask_lt);
   int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
   int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;
 
Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu
===================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu
@@ -28,12 +28,11 @@
 }
 
 EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) {
-  int lo, hi;
-  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
+  uint32_t lo, hi;
+  __kmpc_impl_unpack(val, lo, hi);
   hi = __kmpc_impl_shfl_down_sync(0xFFFFFFFF, hi, delta, size);
   lo = __kmpc_impl_shfl_down_sync(0xFFFFFFFF, lo, delta, size);
-  asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
-  return val;
+  return __kmpc_impl_pack(lo, hi);
 }
 
 INLINE static void gpu_regular_warp_reduce(void *reduce_data,
@@ -60,18 +59,16 @@
 
 INLINE static uint32_t
 gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) {
-  uint32_t lanemask_lt;
-  uint32_t lanemask_gt;
   uint32_t size, remote_id, physical_lane_id;
   physical_lane_id = GetThreadIdInBlock() % WARPSIZE;
-  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
+  uint32_t lanemask_lt = __kmpc_impl_lanemask_lt();
   uint32_t Liveness = __ACTIVEMASK();
-  uint32_t logical_lane_id = __popc(Liveness & lanemask_lt) * 2;
-  asm("mov.u32 %0, %%lanemask_gt;" : "=r"(lanemask_gt));
+  uint32_t logical_lane_id = __kmpc_impl_popc(Liveness & lanemask_lt) * 2;
+  uint32_t lanemask_gt = __kmpc_impl_lanemask_gt();
   do {
     Liveness = __ACTIVEMASK();
-    remote_id = __ffs(Liveness & lanemask_gt);
-    size = __popc(Liveness);
+    remote_id = __kmpc_impl_ffs(Liveness & lanemask_gt);
+    size = __kmpc_impl_popc(Liveness);
     logical_lane_id /= 2;
     shflFct(reduce_data, /*LaneId =*/logical_lane_id,
             /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
@@ -150,7 +147,7 @@
     gpu_regular_warp_reduce(reduce_data, shflFct);
   else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
     gpu_irregular_warp_reduce(reduce_data, shflFct,
-                              /*LaneCount=*/__popc(Liveness),
+                              /*LaneCount=*/__kmpc_impl_popc(Liveness),
                               /*LaneId=*/GetThreadIdInBlock() % WARPSIZE);
   else if (!isRuntimeUninitialized) // Dispersed lanes. Only threads in L2
                                     // parallel region may enter here; return
@@ -325,7 +322,7 @@
     gpu_regular_warp_reduce(reduce_data, shflFct);
   else // Partial warp but contiguous lanes
     gpu_irregular_warp_reduce(reduce_data, shflFct,
-                              /*LaneCount=*/__popc(Liveness),
+                              /*LaneCount=*/__kmpc_impl_popc(Liveness),
                               /*LaneId=*/ThreadId % WARPSIZE);
 
   // When we have more than [warpsize] number of threads
Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h
===================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -206,9 +206,8 @@
 INLINE void IncParallelLevel(bool ActiveParallel) {
   unsigned Active = __ACTIVEMASK();
   __kmpc_impl_syncwarp(Active);
-  unsigned LaneMaskLt;
-  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt));
-  unsigned Rank = __popc(Active & LaneMaskLt);
+  unsigned LaneMaskLt = __kmpc_impl_lanemask_lt();
+  unsigned Rank = __kmpc_impl_popc(Active & LaneMaskLt);
   if (Rank == 0) {
     parallelLevel[GetWarpId()] +=
         (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
@@ -220,9 +219,8 @@
 INLINE void DecParallelLevel(bool ActiveParallel) {
   unsigned Active = __ACTIVEMASK();
   __kmpc_impl_syncwarp(Active);
-  unsigned LaneMaskLt;
-  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt));
-  unsigned Rank = __popc(Active & LaneMaskLt);
+  unsigned LaneMaskLt = __kmpc_impl_lanemask_lt();
+  unsigned Rank = __kmpc_impl_popc(Active & LaneMaskLt);
   if (Rank == 0) {
     parallelLevel[GetWarpId()] -=
         (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/target_impl.h
===================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -16,12 +16,12 @@
 
 #include "option.h"
 
-INLINE void __kmpc_impl_unpack(int64_t val, int32_t &lo, int32_t &hi) {
+INLINE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
   asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
 }
 
-INLINE int64_t __kmpc_impl_pack(int32_t lo, int32_t hi) {
-  int64_t val;
+INLINE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
+  uint64_t val;
   asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
   return val;
 }
@@ -34,9 +34,15 @@
   return res;
 }
 
-INLINE int __kmpc_impl_ffs(uint32_t x) { return __ffs(x); }
+INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() {
+  __kmpc_impl_lanemask_t res;
+  asm("mov.u32 %0, %%lanemask_gt;" : "=r"(res));
+  return res;
+}
+
+INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __ffs(x); }
 
-INLINE int __kmpc_impl_popc(uint32_t x) { return __popc(x); }
+INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __popc(x); }
 
 #ifndef CUDA_VERSION
 #error CUDA_VERSION macro is undefined, something wrong with cuda.