Index: cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def
===================================================================
--- cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def
+++ cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def
@@ -402,6 +402,17 @@
 BUILTIN(__nvvm_bar0_and, "ii", "")
 BUILTIN(__nvvm_bar0_or, "ii", "")
 
+// Shuffle
+
+BUILTIN(__builtin_ptx_shfl_down_i32, "iiii", "")
+BUILTIN(__builtin_ptx_shfl_down_f32, "ffii", "")
+BUILTIN(__builtin_ptx_shfl_up_i32, "iiii", "")
+BUILTIN(__builtin_ptx_shfl_up_f32, "ffii", "")
+BUILTIN(__builtin_ptx_shfl_bfly_i32, "iiii", "")
+BUILTIN(__builtin_ptx_shfl_bfly_f32, "ffii", "")
+BUILTIN(__builtin_ptx_shfl_idx_i32, "iiii", "")
+BUILTIN(__builtin_ptx_shfl_idx_f32, "ffii", "")
+
 // Membar
 
 BUILTIN(__nvvm_membar_cta, "v", "")
Index: cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h
===================================================================
--- cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h
+++ cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h
@@ -26,6 +26,76 @@
 #error "This file is for CUDA compilation only."
 #endif
 
+// sm_30 intrinsics: __shfl_{up,down,xor}.
+
+#define __SM_30_INTRINSICS_H__
+#define __SM_30_INTRINSICS_HPP__
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
+
+#pragma push_macro("__MAKE_SHUFFLES")
+#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask)    \
+  inline __device__ int __FnName(int __in, int __offset,                       \
+                                 int __width = warpSize) {                     \
+    return __IntIntrinsic(__in, __offset,                                      \
+                          ((warpSize - __width) << 8) | (__Mask));             \
+  }                                                                            \
+  inline __device__ float __FnName(float __in, int __offset,                   \
+                                   int __width = warpSize) {                   \
+    return __FloatIntrinsic(__in, __offset,                                    \
+                            ((warpSize - __width) << 8) | (__Mask));           \
+  }                                                                            \
+  inline __device__ unsigned int __FnName(unsigned int __in, int __offset,     \
+                                          int __width = warpSize) {            \
+    return static_cast<unsigned int>(                                          \
+        ::__FnName(static_cast<int>(__in), __offset, __width));                \
+  }                                                                            \
+  inline __device__ long long __FnName(long long __in, int __offset,           \
+                                       int __width = warpSize) {               \
+    struct __Bits {                                                            \
+      int __a, __b;                                                            \
+    };                                                                         \
+    _Static_assert(sizeof(__in) == sizeof(__Bits));                            \
+    _Static_assert(sizeof(__Bits) == 2 * sizeof(int));                         \
+    __Bits __tmp;                                                              \
+    memcpy(&__tmp, &__in, sizeof(__in));                                       \
+    __tmp.__a = ::__FnName(__tmp.__a, __offset, __width);                      \
+    __tmp.__b = ::__FnName(__tmp.__b, __offset, __width);                      \
+    long long __out;                                                           \
+    memcpy(&__out, &__tmp, sizeof(__tmp));                                     \
+    return __out;                                                              \
+  }                                                                            \
+  inline __device__ unsigned long long __FnName(                               \
+      unsigned long long __in, int __offset, int __width = warpSize) {         \
+    return static_cast<unsigned long long>(                                    \
+        ::__FnName(static_cast<long long>(__in), __offset, __width));          \
+  }                                                                            \
+  inline __device__ double __FnName(double __in, int __offset,                 \
+                                    int __width = warpSize) {                  \
+    long long __tmp;                                                           \
+    _Static_assert(sizeof(__tmp) == sizeof(__in));                             \
+    memcpy(&__tmp, &__in, sizeof(__in));                                       \
+    __tmp = ::__FnName(__tmp, __offset, __width);                              \
+    double __out;                                                              \
+    memcpy(&__out, &__tmp, sizeof(__out));                                     \
+    return __out;                                                              \
+  }
+
+__MAKE_SHUFFLES(__shfl, __builtin_ptx_shfl_idx_i32, __builtin_ptx_shfl_idx_f32,
+                0x1f);
+// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
+// maxLane.
+__MAKE_SHUFFLES(__shfl_up, __builtin_ptx_shfl_up_i32, __builtin_ptx_shfl_up_f32,
+                0);
+__MAKE_SHUFFLES(__shfl_down, __builtin_ptx_shfl_down_i32,
+                __builtin_ptx_shfl_down_f32, 0x1f);
+__MAKE_SHUFFLES(__shfl_xor, __builtin_ptx_shfl_bfly_i32,
+                __builtin_ptx_shfl_bfly_f32, 0x1f);
+
+#pragma pop_macro("__MAKE_SHUFFLES")
+
+#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
+
 // sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.
 
 // Prevent the vanilla sm_32 intrinsics header from being included.
Index: cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h
===================================================================
--- cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h
+++ cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h
@@ -198,13 +198,14 @@
 #include "sm_20_atomic_functions.hpp"
 #include "sm_20_intrinsics.hpp"
 #include "sm_32_atomic_functions.hpp"
-// sm_30_intrinsics.h has declarations that use default argument, so
-// we have to include it and it will in turn include .hpp
-#include "sm_30_intrinsics.h"
-
-// Don't include sm_32_intrinsics.h. That header defines __ldg using inline
-// asm, but we want to define it using builtins, because we can't use the
-// [addr+imm] addressing mode if we use the inline asm in the header.
+
+// Don't include sm_30_intrinsics.h and sm_32_intrinsics.h. These define the
+// __shfl and __ldg intrinsics using inline (volatile) asm, but we want to
+// define them using builtins so that the optimizer can reason about and across
+// these instructions. In particular, using intrinsics for ldg gets us the
+// [addr+imm] addressing mode, which, although it doesn't actually exist in the
+// hardware, seems to generate faster machine code because ptxas can more easily
+// reason about our code.
 
 #undef __MATH_FUNCTIONS_HPP__