Diff 446575

clang/lib/Headers/__clang_cuda_intrinsics.h

Show First 20 Lines • Show All 65 Lines • ▼ Show 20 Lines	#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask, \
} \		} \
inline __device__ unsigned long __FnName( \		inline __device__ unsigned long __FnName( \
unsigned long __val, __Type __offset, int __width = warpSize) { \		unsigned long __val, __Type __offset, int __width = warpSize) { \
return static_cast<unsigned long>( \		return static_cast<unsigned long>( \
::__FnName(static_cast<long>(__val), __offset, __width)); \		::__FnName(static_cast<long>(__val), __offset, __width)); \
} \		} \
inline __device__ unsigned long long __FnName( \		inline __device__ unsigned long long __FnName( \
unsigned long long __val, __Type __offset, int __width = warpSize) { \		unsigned long long __val, __Type __offset, int __width = warpSize) { \
return static_cast<unsigned long long>(::__FnName( \		return static_cast<unsigned long long>( \
static_cast<unsigned long long>(__val), __offset, __width)); \		::__FnName(static_cast<long long>(__val), __offset, __width)); \
} \		} \
inline __device__ double __FnName(double __val, __Type __offset, \		inline __device__ double __FnName(double __val, __Type __offset, \
int __width = warpSize) { \		int __width = warpSize) { \
long long __tmp; \		long long __tmp; \
_Static_assert(sizeof(__tmp) == sizeof(__val)); \		_Static_assert(sizeof(__tmp) == sizeof(__val)); \
memcpy(&__tmp, &__val, sizeof(__val)); \		memcpy(&__tmp, &__val, sizeof(__val)); \
__tmp = ::__FnName(__tmp, __offset, __width); \		__tmp = ::__FnName(__tmp, __offset, __width); \
double __ret; \		double __ret; \
▲ Show 20 Lines • Show All 50 Lines • ▼ Show 20 Lines	inline __device__ long long __FnName(unsigned int __mask, long long __val, \
__tmp.__b = ::__FnName(__mask, __tmp.__b, __offset, __width); \		__tmp.__b = ::__FnName(__mask, __tmp.__b, __offset, __width); \
long long __ret; \		long long __ret; \
memcpy(&__ret, &__tmp, sizeof(__tmp)); \		memcpy(&__ret, &__tmp, sizeof(__tmp)); \
return __ret; \		return __ret; \
} \		} \
inline __device__ unsigned long long __FnName( \		inline __device__ unsigned long long __FnName( \
unsigned int __mask, unsigned long long __val, __Type __offset, \		unsigned int __mask, unsigned long long __val, __Type __offset, \
int __width = warpSize) { \		int __width = warpSize) { \
return static_cast<unsigned long long>(::__FnName( \		return static_cast<unsigned long long>( \
__mask, static_cast<unsigned long long>(__val), __offset, __width)); \		::__FnName(__mask, static_cast<long long>(__val), __offset, __width)); \
} \		} \
inline __device__ long __FnName(unsigned int __mask, long __val, \		inline __device__ long __FnName(unsigned int __mask, long __val, \
__Type __offset, int __width = warpSize) { \		__Type __offset, int __width = warpSize) { \
_Static_assert(sizeof(long) == sizeof(long long) || \		_Static_assert(sizeof(long) == sizeof(long long) || \
sizeof(long) == sizeof(int)); \		sizeof(long) == sizeof(int)); \
if (sizeof(long) == sizeof(long long)) { \		if (sizeof(long) == sizeof(long long)) { \
return static_cast<long>(::__FnName( \		return static_cast<long>(::__FnName( \
__mask, static_cast<long long>(__val), __offset, __width)); \		__mask, static_cast<long long>(__val), __offset, __width)); \
▲ Show 20 Lines • Show All 78 Lines • ▼ Show 20 Lines
// Define __match* builtins CUDA-9 headers expect to see.		// Define __match* builtins CUDA-9 headers expect to see.
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700		#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
inline __device__ unsigned int __match32_any_sync(unsigned int mask,		inline __device__ unsigned int __match32_any_sync(unsigned int mask,
unsigned int value) {		unsigned int value) {
return __nvvm_match_any_sync_i32(mask, value);		return __nvvm_match_any_sync_i32(mask, value);
}		}

inline __device__ unsigned int		inline __device__ unsigned int
__match64_any_sync(unsigned int mask, unsigned long long value) {		__match64_any_sync(unsigned int mask, unsigned long long value) {
return __nvvm_match_any_sync_i64(mask, value);		return __nvvm_match_any_sync_i64(mask, value);
		traUnsubmitted Done Reply Inline Actions Nit: this change is irrelevant to the patch and can be removed. tra: Nit: this change is irrelevant to the patch and can be removed.
		jdoerfertAuthorUnsubmitted Done Reply Inline Actions me running clang format on the file. I'll push it nfc before. jdoerfert: me running clang format on the file. I'll push it nfc before.
}		}

inline __device__ unsigned int		inline __device__ unsigned int
__match32_all_sync(unsigned int mask, unsigned int value, int *pred) {		__match32_all_sync(unsigned int mask, unsigned int value, int *pred) {
return __nvvm_match_all_sync_i32p(mask, value, pred);		return __nvvm_match_all_sync_i32p(mask, value, pred);
}		}

inline __device__ unsigned int		inline __device__ unsigned int
▲ Show 20 Lines • Show All 271 Lines • Show Last 20 Lines

clang/test/CodeGenCUDA/shuffle_long_long.cu

This file was added.

				// REQUIRES: nvptx-registered-target

				// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -emit-llvm -target-cpu sm_30 %s -o - | FileCheck %s --check-prefix=NO_SYNC
				// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -emit-llvm -target-cpu sm_30 -target-feature +ptx70 -DSYNC -DCUDA_VERSION=9000 %s -o - | FileCheck %s --check-prefix=SYNC

				#include "Inputs/cuda.h"

				__device__ void *memcpy(void *dest, const void *src, size_t n);

				#define warpSize 32
				traUnsubmitted Done Reply Inline Actions This macro should not be set. If you do need something to be compiled for sm_30, you should've specified via `-target-cpu sm_30`. tra: This macro should not be set. If you do need something to be compiled for sm_30, you should've…
				#include <__clang_cuda_intrinsics.h>

				__device__ void use(unsigned long long, long long);

				// Test function, 4 shfl calls.
				traUnsubmitted Done Reply Inline Actions Nit: this should be `<...>` as we want the include to be found in compiler's include paths. tra: Nit: this should be `<...>` as we want the include to be found in compiler's include paths.
				// NO_SYNC: define{{.*}} @_Z14test_long_longv
				// NO_SYNC: call noundef i64 @_Z6__shflyii(
				// NO_SYNC: call noundef i64 @_Z6__shflxii(

				// SYNC: define{{.*}} @_Z14test_long_longv
				// SYNC: call noundef i64 @_Z11__shfl_syncjyii(
				// SYNC: call noundef i64 @_Z11__shfl_syncjxii(

				// unsigned long long -> long long
				// NO_SYNC: define{{.*}} @_Z6__shflyii
				// NO_SYNC: call noundef i64 @_Z6__shflxii(

				// long long -> int + int
				// NO_SYNC: define{{.*}} @_Z6__shflxii
				// NO_SYNC: call noundef i32 @_Z6__shfliii(
				// NO_SYNC: call noundef i32 @_Z6__shfliii(

				// NO_SYNC: define{{.*}} @_Z6__shfliii
				// NO_SYNC: call i32 @llvm.nvvm.shfl.idx.i32

				// unsigned long long -> long long
				// SYNC: _Z11__shfl_syncjyii
				// SYNC: call noundef i64 @_Z11__shfl_syncjxii(

				// long long -> int + int
				// SYNC: define{{.*}} @_Z11__shfl_syncjxii
				// SYNC: call noundef i32 @_Z11__shfl_syncjiii(
				// SYNC: call noundef i32 @_Z11__shfl_syncjiii(

				// SYNC: define{{.*}} @_Z11__shfl_syncjiii
				// SYNC: call i32 @llvm.nvvm.shfl.sync.idx.i32

				__device__ void test_long_long() {
				unsigned long long ull = 13;
				long long ll = 17;
				#ifndef SYNC
				ull = __shfl(ull, 7, 32);
				ll = __shfl(ll, 7, 32);
				traUnsubmitted Done Reply Inline Actions This crashes LLVM when we target sm_70 where these instructions no longer exist. We should probably disable those sync wrappers when we compile for GPUs where they are not available, so we'd get a proper compiler error instead of a crash. Also, we should probably make non-sync instruction use conditional on SYNC. https://godbolt.org/z/7n4vsb41v tra: This crashes LLVM when we target sm_70 where these instructions no longer exist. We should…
				use(ull, ll);
				#else
				ull = __shfl_sync(0x11, ull, 7, 32);
				ll = __shfl_sync(0x11, ll, 7, 32);
				use(ull, ll);
				#endif
				}

This is an archive of the discontinued LLVM Phabricator instance.

[CUDA][FIX] Make shfl[_sync] for unsigned long long non-recursive
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 446575

clang/lib/Headers/__clang_cuda_intrinsics.h

clang/test/CodeGenCUDA/shuffle_long_long.cu

This is an archive of the discontinued LLVM Phabricator instance.

[CUDA][FIX] Make shfl[_sync] for unsigned long long non-recursive — Closed, Public

Details

Diff Detail

Event Timeline

Revision Contents

Diff 446575

clang/lib/Headers/__clang_cuda_intrinsics.h

clang/test/CodeGenCUDA/shuffle_long_long.cu

[CUDA][FIX] Make shfl[_sync] for unsigned long long non-recursive
ClosedPublic