diff --git a/clang/lib/Headers/__clang_cuda_intrinsics.h b/clang/lib/Headers/__clang_cuda_intrinsics.h
--- a/clang/lib/Headers/__clang_cuda_intrinsics.h
+++ b/clang/lib/Headers/__clang_cuda_intrinsics.h
@@ -512,6 +512,78 @@
 __device__ inline cuuint32_t __nvvm_get_smem_pointer(void *__ptr) {
   return __nv_cvta_generic_to_shared_impl(__ptr);
 }
+
+// Warp-level integer reductions implemented with the redux.sync builtins
+// (PTX redux.sync needs sm_80 or newer). Every wrapper takes the member mask
+// first, following the __reduce_*_sync convention, whereas the
+// __nvvm_redux_sync_* builtins expect the value to reduce first and the
+// member mask second, so the two arguments are swapped at each call.
+__device__ inline unsigned __reduce_add_sync_unsigned_impl(unsigned __mask,
+                                                           unsigned __value) {
+  return __nvvm_redux_sync_add(__value, __mask);
+}
+__device__ inline int __reduce_add_sync_signed_impl(unsigned __mask,
+                                                    int __value) {
+  return __nvvm_redux_sync_add(__value, __mask);
+}
+__device__ inline unsigned __reduce_min_sync_unsigned_impl(unsigned __mask,
+                                                           unsigned __value) {
+  return __nvvm_redux_sync_umin(__value, __mask);
+}
+__device__ inline unsigned __reduce_max_sync_unsigned_impl(unsigned __mask,
+                                                           unsigned __value) {
+  return __nvvm_redux_sync_umax(__value, __mask);
+}
+__device__ inline int __reduce_min_sync_signed_impl(unsigned __mask,
+                                                    int __value) {
+  return __nvvm_redux_sync_min(__value, __mask);
+}
+__device__ inline int __reduce_max_sync_signed_impl(unsigned __mask,
+                                                    int __value) {
+  return __nvvm_redux_sync_max(__value, __mask);
+}
+__device__ inline unsigned __reduce_or_sync_unsigned_impl(unsigned __mask,
+                                                          unsigned __value) {
+  return __nvvm_redux_sync_or(__value, __mask);
+}
+__device__ inline unsigned __reduce_and_sync_unsigned_impl(unsigned __mask,
+                                                           unsigned __value) {
+  return __nvvm_redux_sync_and(__value, __mask);
+}
+__device__ inline unsigned __reduce_xor_sync_unsigned_impl(unsigned __mask,
+                                                           unsigned __value) {
+  return __nvvm_redux_sync_xor(__value, __mask);
+}
+
+// Asynchronous global-to-shared copies implemented with the cp.async.ca
+// builtins; the _4/_8/_16 suffix selects the builtin of matching copy size.
+// __dst and __src arrive as generic pointers and are cast to the shared
+// (address space 3) and global (address space 1) address spaces that the
+// builtins take; __src_size is passed through unchanged. Parameter names
+// carry the reserved "__" prefix, like the rest of this header, so that a
+// user macro named dst/src/src_size cannot break these definitions.
+__device__ inline void
+__nv_memcpy_async_shared_global_4_impl(void *__dst, const void *__src,
+                                       unsigned __src_size) {
+  __nvvm_cp_async_ca_shared_global_4(
+      (void __attribute__((address_space(3))) *)__dst,
+      (const void __attribute__((address_space(1))) *)__src, __src_size);
+}
+__device__ inline void
+__nv_memcpy_async_shared_global_8_impl(void *__dst, const void *__src,
+                                       unsigned __src_size) {
+  __nvvm_cp_async_ca_shared_global_8(
+      (void __attribute__((address_space(3))) *)__dst,
+      (const void __attribute__((address_space(1))) *)__src, __src_size);
+}
+__device__ inline void
+__nv_memcpy_async_shared_global_16_impl(void *__dst, const void *__src,
+                                        unsigned __src_size) {
+  __nvvm_cp_async_ca_shared_global_16(
+      (void __attribute__((address_space(3))) *)__dst,
+      (const void __attribute__((address_space(1))) *)__src, __src_size);
+}
+
 } // extern "C"
 
 #endif // CUDA_VERSION >= 11000