Changeset View
Changeset View
Standalone View
Standalone View
libomptarget/deviceRTLs/nvptx/src/target_impl.h
Show First 20 Lines • Show All 92 Lines • ▼ Show 20 Lines | |||||
// Warp-level barrier over the lanes named in Mask.
// Only toolkits reporting CUDA_VERSION >= 9000 emit the __syncwarp call; for
// anything older (or when CUDA_VERSION is not defined) the body is empty.
INLINE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
#if CUDA_VERSION < 9000
  // Before CUDA 9.0 threads in a warp do not need an explicit sync.
#else
  __syncwarp(Mask);
#endif // CUDA_VERSION
}
// Inline-asm operand constraint used below for pointer operands:
// "l" when __LP64__ is defined, "r" otherwise.
#if !defined(__LP64__)
#define PTR_CONSTRAINT "r"
#else // __LP64__
#define PTR_CONSTRAINT "l"
#endif // __LP64__
// Reads the parallel-level byte referenced by ParLevel and returns it widened
// to int32_t.
//
// When CUDA_VERSION is defined and <= 8000 the read is issued as an explicit
// volatile PTX byte load, to keep ptxas from applying dangerous optimizations
// to the access. Otherwise an ordinary read is used.
INLINE int32_t __kmpc_impl_get_parallel_level(uint8_t &ParLevel) {
#if defined(CUDA_VERSION) && CUDA_VERSION <= 8000
  int32_t Loaded;
  // %0 receives the loaded byte, %1 is the address of ParLevel. The "memory"
  // clobber keeps the compiler from reordering memory accesses around it.
  asm volatile("ld.volatile.u8 %0, [%1];"
               : "=r"(Loaded)
               : PTR_CONSTRAINT(&ParLevel)
               : "memory");
  return Loaded;
#else
  return ParLevel;
#endif // CUDA_VERSION
}
// Writes ParLevelVal, narrowed to one byte, into the location referenced by
// ParLevel.
//
// When CUDA_VERSION is defined and <= 8000 the write is issued as an explicit
// volatile PTX byte store; otherwise a plain assignment is used.
INLINE void __kmpc_impl_set_parallel_level(uint8_t &ParLevel,
                                           int32_t ParLevelVal) {
#if defined(CUDA_VERSION) && CUDA_VERSION <= 8000
  // No outputs; %0 is the value to store and %1 the address of ParLevel.
  asm volatile("st.volatile.u8 [%1], %0;"
               : /* no outputs */
               : "r"(ParLevelVal), PTR_CONSTRAINT(&ParLevel)
               : "memory");
#else // CUDA_VERSION
  ParLevel = static_cast<uint8_t>(ParLevelVal);
#endif // CUDA_VERSION
}
// The constraint macro is local to the two accessors above.
#undef PTR_CONSTRAINT
#endif | #endif |