diff --git a/clang/lib/Headers/__clang_cuda_intrinsics.h b/clang/lib/Headers/__clang_cuda_intrinsics.h --- a/clang/lib/Headers/__clang_cuda_intrinsics.h +++ b/clang/lib/Headers/__clang_cuda_intrinsics.h @@ -45,7 +45,7 @@ _Static_assert(sizeof(__val) == sizeof(__Bits)); \ _Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \ __Bits __tmp; \ - memcpy(&__tmp, &__val, sizeof(__val)); \ + memcpy(&__tmp, &__val, sizeof(__val)); \ __tmp.__a = ::__FnName(__tmp.__a, __offset, __width); \ __tmp.__b = ::__FnName(__tmp.__b, __offset, __width); \ long long __ret; \ @@ -71,8 +71,8 @@ } \ inline __device__ unsigned long long __FnName( \ unsigned long long __val, __Type __offset, int __width = warpSize) { \ - return static_cast(::__FnName( \ - static_cast(__val), __offset, __width)); \ + return static_cast( \ + ::__FnName(static_cast(__val), __offset, __width)); \ } \ inline __device__ double __FnName(double __val, __Type __offset, \ int __width = warpSize) { \ @@ -139,8 +139,8 @@ inline __device__ unsigned long long __FnName( \ unsigned int __mask, unsigned long long __val, __Type __offset, \ int __width = warpSize) { \ - return static_cast(::__FnName( \ - __mask, static_cast(__val), __offset, __width)); \ + return static_cast( \ + ::__FnName(__mask, static_cast(__val), __offset, __width)); \ } \ inline __device__ long __FnName(unsigned int __mask, long __val, \ __Type __offset, int __width = warpSize) { \ @@ -234,8 +234,8 @@ return __nvvm_match_any_sync_i32(mask, value); } -inline __device__ unsigned int -__match64_any_sync(unsigned int mask, unsigned long long value) { +inline __device__ unsigned int __match64_any_sync(unsigned int mask, + unsigned long long value) { return __nvvm_match_any_sync_i64(mask, value); } diff --git a/clang/test/CodeGenCUDA/shuffle_long_long.cu b/clang/test/CodeGenCUDA/shuffle_long_long.cu new file mode 100644 --- /dev/null +++ b/clang/test/CodeGenCUDA/shuffle_long_long.cu @@ -0,0 +1,64 @@ +// REQUIRES: nvptx-registered-target + +// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -emit-llvm %s -o - | FileCheck %s --check-prefix=NO_SYNC +// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -emit-llvm -target-feature +ptx70 -DSYNC -DCUDA_VERSION=9000 %s -o - | FileCheck %s --check-prefix=SYNC + +#include "Inputs/cuda.h" + +#undef __CUDA_ARCH__ +#define __CUDA_ARCH__ 300 + +__device__ void *memcpy(void *dest, const void *src, size_t n); + +#define warpSize 32 +#include "__clang_cuda_intrinsics.h" + +__device__ void use(unsigned long long, long long); + +// Test function, 4 shfl calls. +// NO_SYNC: define{{.*}} @_Z14test_long_longv +// NO_SYNC: call noundef i64 @_Z6__shflyii( +// NO_SYNC: call noundef i64 @_Z6__shflxii( + +// SYNC: define{{.*}} @_Z14test_long_longv +// SYNC: call noundef i64 @_Z11__shfl_syncjyii( +// SYNC: call noundef i64 @_Z11__shfl_syncjxii( + +// unsigned long long -> long long +// NO_SYNC: define{{.*}} @_Z6__shflyii +// NO_SYNC: call noundef i64 @_Z6__shflxii( + +// long long -> int + int +// NO_SYNC: define{{.*}} @_Z6__shflxii +// NO_SYNC: call noundef i32 @_Z6__shfliii( +// NO_SYNC: call noundef i32 @_Z6__shfliii( + +// NO_SYNC: define{{.*}} @_Z6__shfliii +// NO_SYNC: call i32 @llvm.nvvm.shfl.idx.i32 + +// unsigned long long -> long long +// SYNC: _Z11__shfl_syncjyii +// SYNC: call noundef i64 @_Z11__shfl_syncjxii( + +// long long -> int + int +// SYNC: define{{.*}} @_Z11__shfl_syncjxii +// SYNC: call noundef i32 @_Z11__shfl_syncjiii( +// SYNC: call noundef i32 @_Z11__shfl_syncjiii( + +// SYNC: define{{.*}} @_Z11__shfl_syncjiii +// SYNC: call i32 @llvm.nvvm.shfl.sync.idx.i32 + +__device__ void test_long_long() { + unsigned long long ull = 13; + long long ll = 17; +#ifndef SYNC + ull = __shfl(ull, 7, 32); + ll = __shfl(ll, 7, 32); + use(ull, ll); +#else + ull = __shfl_sync(0x11, ull, 7, 32); + ll = __shfl_sync(0x11, ll, 7, 32); + use(ull, ll); +#endif +} +