Index: cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def
===================================================================
--- cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def
+++ cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def
@@ -390,6 +390,15 @@
 BUILTIN(__nvvm_shfl_idx_i32, "iiii", "")
 BUILTIN(__nvvm_shfl_idx_f32, "ffii", "")
 
+TARGET_BUILTIN(__nvvm_shfl_sync_down_i32, "iUiiii", "", "ptx60")
+TARGET_BUILTIN(__nvvm_shfl_sync_down_f32, "fUifii", "", "ptx60")
+TARGET_BUILTIN(__nvvm_shfl_sync_up_i32, "iUiiii", "", "ptx60")
+TARGET_BUILTIN(__nvvm_shfl_sync_up_f32, "fUifii", "", "ptx60")
+TARGET_BUILTIN(__nvvm_shfl_sync_bfly_i32, "iUiiii", "", "ptx60")
+TARGET_BUILTIN(__nvvm_shfl_sync_bfly_f32, "fUifii", "", "ptx60")
+TARGET_BUILTIN(__nvvm_shfl_sync_idx_i32, "iUiiii", "", "ptx60")
+TARGET_BUILTIN(__nvvm_shfl_sync_idx_f32, "fUifii", "", "ptx60")
+
 // Membar
 BUILTIN(__nvvm_membar_cta, "v", "")
Index: cfe/trunk/lib/Driver/ToolChains/Cuda.cpp
===================================================================
--- cfe/trunk/lib/Driver/ToolChains/Cuda.cpp
+++ cfe/trunk/lib/Driver/ToolChains/Cuda.cpp
@@ -507,11 +507,17 @@
   CC1Args.push_back("-mlink-cuda-bitcode");
   CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));
 
-  // Libdevice in CUDA-7.0 requires PTX version that's more recent
-  // than LLVM defaults to. Use PTX4.2 which is the PTX version that
-  // came with CUDA-7.0.
-  CC1Args.push_back("-target-feature");
-  CC1Args.push_back("+ptx42");
+  if (CudaInstallation.version() >= CudaVersion::CUDA_90) {
+    // CUDA-9 uses new instructions that are only available in PTX6.0.
+    CC1Args.push_back("-target-feature");
+    CC1Args.push_back("+ptx60");
+  } else {
+    // Libdevice in CUDA-7.0 requires a PTX version that's more recent than
+    // LLVM defaults to. Use PTX4.2, the PTX version that came with CUDA-7.0.
+    CC1Args.push_back("-target-feature");
+    CC1Args.push_back("+ptx42");
+  }
 }
 
 void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
Index: cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h
===================================================================
--- cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h
+++ cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h
@@ -92,6 +92,74 @@
 
 #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
 
+// __shfl_sync_* variants available in CUDA-9.
+#if CUDA_VERSION >= 9000 && (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300)
+#pragma push_macro("__MAKE_SYNC_SHUFFLES")
+#define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic,      \
+                             __Mask)                                          \
+  inline __device__ int __FnName(unsigned int __mask, int __val,              \
+                                 int __offset, int __width = warpSize) {      \
+    return __IntIntrinsic(__mask, __val, __offset,                            \
+                          ((warpSize - __width) << 8) | (__Mask));            \
+  }                                                                           \
+  inline __device__ float __FnName(unsigned int __mask, float __val,          \
+                                   int __offset, int __width = warpSize) {    \
+    return __FloatIntrinsic(__mask, __val, __offset,                          \
+                            ((warpSize - __width) << 8) | (__Mask));          \
+  }                                                                           \
+  inline __device__ unsigned int __FnName(unsigned int __mask,                \
+                                          unsigned int __val, int __offset,   \
+                                          int __width = warpSize) {           \
+    return static_cast<unsigned int>(                                         \
+        ::__FnName(__mask, static_cast<int>(__val), __offset, __width));      \
+  }                                                                           \
+  inline __device__ long long __FnName(unsigned int __mask, long long __val,  \
+                                       int __offset,                          \
+                                       int __width = warpSize) {              \
+    struct __Bits {                                                           \
+      int __a, __b;                                                           \
+    };                                                                        \
+    _Static_assert(sizeof(__val) == sizeof(__Bits));                          \
+    _Static_assert(sizeof(__Bits) == 2 * sizeof(int));                        \
+    __Bits __tmp;                                                             \
+    memcpy(&__tmp, &__val, sizeof(__val));                                    \
+    __tmp.__a = ::__FnName(__mask, __tmp.__a, __offset, __width);             \
+    __tmp.__b = ::__FnName(__mask, __tmp.__b, __offset, __width);             \
+    long long __ret;                                                          \
+    memcpy(&__ret, &__tmp, sizeof(__tmp));                                    \
+    return __ret;                                                             \
+  }                                                                           \
+  inline __device__ unsigned long long __FnName(                              \
+      unsigned int __mask, unsigned long long __val, int __offset,            \
+      int __width = warpSize) {                                               \
+    return static_cast<unsigned long long>(::__FnName(                        \
+        __mask, static_cast<long long>(__val), __offset, __width));           \
+  }                                                                           \
+  inline __device__ double __FnName(unsigned int __mask, double __val,        \
+                                    int __offset, int __width = warpSize) {   \
+    long long __tmp;                                                          \
+    _Static_assert(sizeof(__tmp) == sizeof(__val));                           \
+    memcpy(&__tmp, &__val, sizeof(__val));                                    \
+    __tmp = ::__FnName(__mask, __tmp, __offset, __width);                     \
+    double __ret;                                                             \
+    memcpy(&__ret, &__tmp, sizeof(__ret));                                    \
+    return __ret;                                                             \
+  }
+__MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32,
+                     __nvvm_shfl_sync_idx_f32, 0x1f);
+// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
+// maxLane.
+__MAKE_SYNC_SHUFFLES(__shfl_sync_up, __nvvm_shfl_sync_up_i32,
+                     __nvvm_shfl_sync_up_f32, 0);
+__MAKE_SYNC_SHUFFLES(__shfl_sync_down, __nvvm_shfl_sync_down_i32,
+                     __nvvm_shfl_sync_down_f32, 0x1f);
+__MAKE_SYNC_SHUFFLES(__shfl_sync_xor, __nvvm_shfl_sync_bfly_i32,
+                     __nvvm_shfl_sync_bfly_f32, 0x1f);
+
+#pragma pop_macro("__MAKE_SYNC_SHUFFLES")
+
+#endif // CUDA_VERSION >= 9000 && (!defined(__CUDA_ARCH__) ||
+       // __CUDA_ARCH__ >= 300)
+
 // sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.
 
 // Prevent the vanilla sm_32 intrinsics header from being included.
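For context, here is a minimal usage sketch of the wrappers defined above (illustrative, not part of the patch; the kernel and variable names are invented). It assumes a CUDA-9 toolchain targeting sm_30 or newer, where clang provides these wrappers implicitly during CUDA compilation:

// Illustrative only: warp-wide sum via the butterfly (xor) variant.
// After the loop, every lane holds the total, because each step folds
// values across lanes whose IDs differ in exactly one bit.
__global__ void warp_sum(const int *in, int *out) {
  int v = in[threadIdx.x];
  for (int offset = warpSize / 2; offset > 0; offset /= 2)
    v += __shfl_sync_xor(0xffffffffu, v, offset);
  if (threadIdx.x % warpSize == 0)
    out[threadIdx.x / warpSize] = v;
}

With the default __width == warpSize, the wrapper passes ((32 - 32) << 8) | 0x1f == 0x1f as shfl.sync's mask_and_clamp operand, matching the behavior of the older non-sync __shfl_xor.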
Index: cfe/trunk/test/CodeGen/builtins-nvptx-ptx60.cu
===================================================================
--- cfe/trunk/test/CodeGen/builtins-nvptx-ptx60.cu
+++ cfe/trunk/test/CodeGen/builtins-nvptx-ptx60.cu
@@ -0,0 +1,40 @@
+// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_60 \
+// RUN:            -fcuda-is-device -target-feature +ptx60 \
+// RUN:            -S -emit-llvm -o - -x cuda %s \
+// RUN:   | FileCheck -check-prefix=CHECK %s
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_60 \
+// RUN:   -fcuda-is-device -S -o /dev/null -x cuda -verify %s
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __shared__ __attribute__((shared))
+#define __constant__ __attribute__((constant))
+
+// CHECK-LABEL: nvvm_shfl_sync
+__device__ void nvvm_shfl_sync(unsigned mask, int i, float f, int a, int b) {
+  // CHECK: call i32 @llvm.nvvm.shfl.sync.down.i32(i32 {{%[0-9]+}}, i32
+  // expected-error@+1 {{'__nvvm_shfl_sync_down_i32' needs target feature ptx60}}
+  __nvvm_shfl_sync_down_i32(mask, i, a, b);
+  // CHECK: call float @llvm.nvvm.shfl.sync.down.f32(i32 {{%[0-9]+}}, float
+  // expected-error@+1 {{'__nvvm_shfl_sync_down_f32' needs target feature ptx60}}
+  __nvvm_shfl_sync_down_f32(mask, f, a, b);
+  // CHECK: call i32 @llvm.nvvm.shfl.sync.up.i32(i32 {{%[0-9]+}}, i32
+  // expected-error@+1 {{'__nvvm_shfl_sync_up_i32' needs target feature ptx60}}
+  __nvvm_shfl_sync_up_i32(mask, i, a, b);
+  // CHECK: call float @llvm.nvvm.shfl.sync.up.f32(i32 {{%[0-9]+}}, float
+  // expected-error@+1 {{'__nvvm_shfl_sync_up_f32' needs target feature ptx60}}
+  __nvvm_shfl_sync_up_f32(mask, f, a, b);
+  // CHECK: call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 {{%[0-9]+}}, i32
+  // expected-error@+1 {{'__nvvm_shfl_sync_bfly_i32' needs target feature ptx60}}
+  __nvvm_shfl_sync_bfly_i32(mask, i, a, b);
+  // CHECK: call float @llvm.nvvm.shfl.sync.bfly.f32(i32 {{%[0-9]+}}, float
+  // expected-error@+1 {{'__nvvm_shfl_sync_bfly_f32' needs target feature ptx60}}
+  __nvvm_shfl_sync_bfly_f32(mask, f, a, b);
+  // CHECK: call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 {{%[0-9]+}}, i32
+  // expected-error@+1 {{'__nvvm_shfl_sync_idx_i32' needs target feature ptx60}}
+  __nvvm_shfl_sync_idx_i32(mask, i, a, b);
+  // CHECK: call float @llvm.nvvm.shfl.sync.idx.f32(i32 {{%[0-9]+}}, float
+  // expected-error@+1 {{'__nvvm_shfl_sync_idx_f32' needs target feature ptx60}}
+  __nvvm_shfl_sync_idx_f32(mask, f, a, b);
+  // CHECK: ret void
+}
Index: cfe/trunk/test/CodeGen/builtins-nvptx.c
===================================================================
--- cfe/trunk/test/CodeGen/builtins-nvptx.c
+++ cfe/trunk/test/CodeGen/builtins-nvptx.c
@@ -636,3 +636,24 @@
   typedef double double2 __attribute__((ext_vector_type(2)));
   __nvvm_ldg_d2((const double2 *)p);
 }
+
+// CHECK-LABEL: nvvm_shfl
+__device__ void nvvm_shfl(int i, float f, int a, int b) {
+  // CHECK: call i32 @llvm.nvvm.shfl.down.i32(i32
+  __nvvm_shfl_down_i32(i, a, b);
+  // CHECK: call float @llvm.nvvm.shfl.down.f32(float
+  __nvvm_shfl_down_f32(f, a, b);
+  // CHECK: call i32 @llvm.nvvm.shfl.up.i32(i32
+  __nvvm_shfl_up_i32(i, a, b);
+  // CHECK: call float @llvm.nvvm.shfl.up.f32(float
+  __nvvm_shfl_up_f32(f, a, b);
+  // CHECK: call i32 @llvm.nvvm.shfl.bfly.i32(i32
+  __nvvm_shfl_bfly_i32(i, a, b);
+  // CHECK: call float @llvm.nvvm.shfl.bfly.f32(float
+  __nvvm_shfl_bfly_f32(f, a, b);
+  // CHECK: call i32 @llvm.nvvm.shfl.idx.i32(i32
+  __nvvm_shfl_idx_i32(i, a, b);
+  // CHECK: call float @llvm.nvvm.shfl.idx.f32(float
+  __nvvm_shfl_idx_f32(f, a, b);
+  // CHECK: ret void
+}
Index: llvm/trunk/include/llvm/IR/IntrinsicsNVVM.td
===================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicsNVVM.td
+++ llvm/trunk/include/llvm/IR/IntrinsicsNVVM.td
@@ -3736,4 +3736,48 @@
   Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
             [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.idx.f32">,
   GCCBuiltin<"__nvvm_shfl_idx_f32">;
+
+// Synchronizing shfl variants available in CUDA-9.
+// On sm_70 these don't have to be convergent, so we may eventually want to
+// implement a non-convergent variant of these intrinsics.
+
+// shfl.sync.down.b32 dest, threadmask, val, offset, mask_and_clamp
+def int_nvvm_shfl_sync_down_i32 :
+  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.down.i32">,
+  GCCBuiltin<"__nvvm_shfl_sync_down_i32">;
+def int_nvvm_shfl_sync_down_f32 :
+  Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.down.f32">,
+  GCCBuiltin<"__nvvm_shfl_sync_down_f32">;
+
+// shfl.sync.up.b32 dest, threadmask, val, offset, mask_and_clamp
+def int_nvvm_shfl_sync_up_i32 :
+  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.up.i32">,
+  GCCBuiltin<"__nvvm_shfl_sync_up_i32">;
+def int_nvvm_shfl_sync_up_f32 :
+  Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.up.f32">,
+  GCCBuiltin<"__nvvm_shfl_sync_up_f32">;
+
+// shfl.sync.bfly.b32 dest, threadmask, val, offset, mask_and_clamp
+def int_nvvm_shfl_sync_bfly_i32 :
+  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.bfly.i32">,
+  GCCBuiltin<"__nvvm_shfl_sync_bfly_i32">;
+def int_nvvm_shfl_sync_bfly_f32 :
+  Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.bfly.f32">,
+  GCCBuiltin<"__nvvm_shfl_sync_bfly_f32">;
+
+// shfl.sync.idx.b32 dest, threadmask, val, lane, mask_and_clamp
+def int_nvvm_shfl_sync_idx_i32 :
+  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.idx.i32">,
+  GCCBuiltin<"__nvvm_shfl_sync_idx_i32">;
+def int_nvvm_shfl_sync_idx_f32 :
+  Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.idx.f32">,
+  GCCBuiltin<"__nvvm_shfl_sync_idx_f32">;
 }
Index: llvm/trunk/lib/Target/NVPTX/NVPTXIntrinsics.td
===================================================================
--- llvm/trunk/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ llvm/trunk/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -111,8 +111,80 @@
 defm INT_SHFL_IDX_I32 : SHFL<Int32Regs, "idx", int_nvvm_shfl_idx_i32>;
 defm INT_SHFL_IDX_F32 : SHFL<Float32Regs, "idx", int_nvvm_shfl_idx_f32>;
 
-} // isConvergent = 1
+multiclass SHFL_SYNC<NVPTXRegClass regclass, string mode, Intrinsic IntOp> {
+  // Threadmask and the last two parameters to shfl.sync can be regs or imms.
+  // ptxas is smart enough to inline constant registers, so strictly speaking we
+  // don't need to handle immediates here. But it's easy enough, and it makes
+  // our ptx more readable.
+  def rrr : NVPTXInst<
+      (outs regclass:$dst),
+      (ins Int32Regs:$threadmask, regclass:$src, Int32Regs:$offset, Int32Regs:$mask),
+      !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
+      [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src,
+                                  Int32Regs:$offset, Int32Regs:$mask))]>;
+
+  def rri : NVPTXInst<
+      (outs regclass:$dst),
+      (ins Int32Regs:$threadmask, regclass:$src, Int32Regs:$offset, i32imm:$mask),
+      !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
+      [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src,
+                                  Int32Regs:$offset, imm:$mask))]>;
+
+  def rir : NVPTXInst<
+      (outs regclass:$dst),
+      (ins Int32Regs:$threadmask, regclass:$src, i32imm:$offset, Int32Regs:$mask),
+      !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
+      [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src,
+                                  imm:$offset, Int32Regs:$mask))]>;
+
+  def rii : NVPTXInst<
+      (outs regclass:$dst),
+      (ins Int32Regs:$threadmask, regclass:$src, i32imm:$offset, i32imm:$mask),
+      !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
+      [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src,
+                                  imm:$offset, imm:$mask))]>;
+
+  def irr : NVPTXInst<
+      (outs regclass:$dst),
+      (ins i32imm:$threadmask, regclass:$src, Int32Regs:$offset, Int32Regs:$mask),
+      !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
+      [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src,
+                                  Int32Regs:$offset, Int32Regs:$mask))]>;
+
+  def iri : NVPTXInst<
+      (outs regclass:$dst),
+      (ins i32imm:$threadmask, regclass:$src, Int32Regs:$offset, i32imm:$mask),
+      !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
+      [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src,
+                                  Int32Regs:$offset, imm:$mask))]>;
+
+  def iir : NVPTXInst<
+      (outs regclass:$dst),
+      (ins i32imm:$threadmask, regclass:$src, i32imm:$offset, Int32Regs:$mask),
+      !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
+      [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src,
+                                  imm:$offset, Int32Regs:$mask))]>;
+
+  def iii : NVPTXInst<
+      (outs regclass:$dst),
+      (ins i32imm:$threadmask, regclass:$src, i32imm:$offset, i32imm:$mask),
+      !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
+      [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src,
+                                  imm:$offset, imm:$mask))]>;
+}
+
+// On sm_70 these don't have to be convergent, so we may eventually want to
+// implement a non-convergent variant of these intrinsics.
+defm INT_SHFL_SYNC_DOWN_I32 : SHFL_SYNC<Int32Regs, "down", int_nvvm_shfl_sync_down_i32>;
+defm INT_SHFL_SYNC_DOWN_F32 : SHFL_SYNC<Float32Regs, "down", int_nvvm_shfl_sync_down_f32>;
+defm INT_SHFL_SYNC_UP_I32 : SHFL_SYNC<Int32Regs, "up", int_nvvm_shfl_sync_up_i32>;
+defm INT_SHFL_SYNC_UP_F32 : SHFL_SYNC<Float32Regs, "up", int_nvvm_shfl_sync_up_f32>;
+defm INT_SHFL_SYNC_BFLY_I32 : SHFL_SYNC<Int32Regs, "bfly", int_nvvm_shfl_sync_bfly_i32>;
+defm INT_SHFL_SYNC_BFLY_F32 : SHFL_SYNC<Float32Regs, "bfly", int_nvvm_shfl_sync_bfly_f32>;
+defm INT_SHFL_SYNC_IDX_I32 : SHFL_SYNC<Int32Regs, "idx", int_nvvm_shfl_sync_idx_i32>;
+defm INT_SHFL_SYNC_IDX_F32 : SHFL_SYNC<Float32Regs, "idx", int_nvvm_shfl_sync_idx_f32>;
+
+} // isConvergent = 1
 
 //-----------------------------------
 // Explicit Memory Fence Functions
Index: llvm/trunk/test/CodeGen/NVPTX/shfl-sync.ll
===================================================================
--- llvm/trunk/test/CodeGen/NVPTX/shfl-sync.ll
+++ llvm/trunk/test/CodeGen/NVPTX/shfl-sync.ll
@@ -0,0 +1,94 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 | FileCheck %s
+
+declare i32 @llvm.nvvm.shfl.sync.down.i32(i32, i32, i32, i32)
+declare float @llvm.nvvm.shfl.sync.down.f32(i32, float, i32, i32)
+declare i32 @llvm.nvvm.shfl.sync.up.i32(i32, i32, i32, i32)
+declare float @llvm.nvvm.shfl.sync.up.f32(i32, float, i32, i32)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32)
+declare float @llvm.nvvm.shfl.sync.bfly.f32(i32, float, i32, i32)
+declare i32 @llvm.nvvm.shfl.sync.idx.i32(i32, i32, i32, i32)
+declare float @llvm.nvvm.shfl.sync.idx.f32(i32, float, i32, i32)
+
+; CHECK-LABEL: .func{{.*}}shfl.sync.rrr
+define i32 @shfl.sync.rrr(i32 %mask, i32 %a, i32 %b, i32 %c) {
+  ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], [[B]], [[C]], [[MASK]];
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 %mask, i32 %a, i32 %b, i32 %c)
+  ret i32 %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.sync.irr
+define i32 @shfl.sync.irr(i32 %a, i32 %b, i32 %c) {
+  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], [[B]], [[C]], 1;
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 1, i32 %a, i32 %b, i32 %c)
+  ret i32 %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.sync.rri
+define i32 @shfl.sync.rri(i32 %mask, i32 %a, i32 %b) {
+  ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
+  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], [[B]], 1, [[MASK]];
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 %mask, i32 %a, i32 %b, i32 1)
+  ret i32 %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.sync.iri
+define i32 @shfl.sync.iri(i32 %a, i32 %b) {
+  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
+  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], [[B]], 2, 1;
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 1, i32 %a, i32 %b, i32 2)
+  ret i32 %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.sync.rir
+define i32 @shfl.sync.rir(i32 %mask, i32 %a, i32 %c) {
+  ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], 1, [[C]], [[MASK]];
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 %mask, i32 %a, i32 1, i32 %c)
+  ret i32 %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.sync.iir
+define i32 @shfl.sync.iir(i32 %a, i32 %c) {
+  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], 2, [[C]], 1;
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 1, i32 %a, i32 2, i32 %c)
+  ret i32 %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.sync.rii
+define i32 @shfl.sync.rii(i32 %mask, i32 %a) {
+  ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
+  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], 1, 2, [[MASK]];
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 %mask, i32 %a, i32 1, i32 2)
+  ret i32 %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.sync.iii
+define i32 @shfl.sync.iii(i32 %a, i32 %b) {
+  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
+  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], 2, 3, 1;
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 1, i32 %a, i32 2, i32 3)
+  ret i32 %val
+}
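As a closing sanity check on the width encoding (again a sketch, not part of the patch; the kernel name is invented): with a sub-warp __width of 16, the header computes ((32 - 16) << 8) | 0x1f == 0x101f for mask_and_clamp, so the underlying shfl.sync.idx.b32 clamps lane indices within each 16-lane segment:

// Illustrative only: broadcast lane 0's value within each 16-lane
// segment using the idx-mode wrapper (__shfl_sync).
__global__ void broadcast16(int *data) {
  int v = data[threadIdx.x];
  v = __shfl_sync(0xffffffffu, v, /*srcLane=*/0, /*width=*/16);
  data[threadIdx.x] = v;
}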