Index: cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def
===================================================================
--- cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def
+++ cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def
@@ -390,6 +390,15 @@
 BUILTIN(__nvvm_shfl_idx_i32, "iiii", "")
 BUILTIN(__nvvm_shfl_idx_f32, "ffii", "")
 
+TARGET_BUILTIN(__nvvm_shfl_sync_down_i32, "iUiiii", "", "ptx60")
+TARGET_BUILTIN(__nvvm_shfl_sync_down_f32, "fUifii", "", "ptx60")
+TARGET_BUILTIN(__nvvm_shfl_sync_up_i32, "iUiiii", "", "ptx60")
+TARGET_BUILTIN(__nvvm_shfl_sync_up_f32, "fUifii", "", "ptx60")
+TARGET_BUILTIN(__nvvm_shfl_sync_bfly_i32, "iUiiii", "", "ptx60")
+TARGET_BUILTIN(__nvvm_shfl_sync_bfly_f32, "fUifii", "", "ptx60")
+TARGET_BUILTIN(__nvvm_shfl_sync_idx_i32, "iUiiii", "", "ptx60")
+TARGET_BUILTIN(__nvvm_shfl_sync_idx_f32, "fUifii", "", "ptx60")
+
 // Membar
 BUILTIN(__nvvm_membar_cta, "v", "")
Index: cfe/trunk/lib/Driver/ToolChains/Cuda.cpp
===================================================================
--- cfe/trunk/lib/Driver/ToolChains/Cuda.cpp
+++ cfe/trunk/lib/Driver/ToolChains/Cuda.cpp
@@ -507,11 +507,17 @@
   CC1Args.push_back("-mlink-cuda-bitcode");
   CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));
 
-  // Libdevice in CUDA-7.0 requires PTX version that's more recent
-  // than LLVM defaults to. Use PTX4.2 which is the PTX version that
-  // came with CUDA-7.0.
-  CC1Args.push_back("-target-feature");
-  CC1Args.push_back("+ptx42");
+  if (CudaInstallation.version() >= CudaVersion::CUDA_90) {
+    // CUDA-9 uses new instructions that are only available in PTX6.0.
+    CC1Args.push_back("-target-feature");
+    CC1Args.push_back("+ptx60");
+  } else {
+    // Libdevice in CUDA-7.0 requires a PTX version that's more recent than
+    // LLVM defaults to. Use PTX4.2, the PTX version that came with CUDA-7.0.
+    CC1Args.push_back("-target-feature");
+    CC1Args.push_back("+ptx42");
+  }
 }
 
 void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
Index: cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h
===================================================================
--- cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h
+++ cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h
@@ -92,6 +92,74 @@
 
 #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
 
+// __shfl_sync_* variants available in CUDA-9.
+#if CUDA_VERSION >= 9000 && (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300)
+#pragma push_macro("__MAKE_SYNC_SHUFFLES")
+#define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic,      \
+                             __Mask)                                          \
+  inline __device__ int __FnName(unsigned int __mask, int __val,              \
+                                 int __offset, int __width = warpSize) {      \
+    return __IntIntrinsic(__mask, __val, __offset,                            \
+                          ((warpSize - __width) << 8) | (__Mask));            \
+  }                                                                           \
+  inline __device__ float __FnName(unsigned int __mask, float __val,          \
+                                   int __offset, int __width = warpSize) {    \
+    return __FloatIntrinsic(__mask, __val, __offset,                          \
+                            ((warpSize - __width) << 8) | (__Mask));          \
+  }                                                                           \
+  inline __device__ unsigned int __FnName(unsigned int __mask,                \
+                                          unsigned int __val, int __offset,   \
+                                          int __width = warpSize) {           \
+    return static_cast<unsigned int>(                                         \
+        ::__FnName(__mask, static_cast<int>(__val), __offset, __width));      \
+  }                                                                           \
+  inline __device__ long long __FnName(unsigned int __mask, long long __val,  \
+                                       int __offset,                          \
+                                       int __width = warpSize) {              \
+    struct __Bits {                                                           \
+      int __a, __b;                                                           \
+    };                                                                        \
+    _Static_assert(sizeof(__val) == sizeof(__Bits));                          \
+    _Static_assert(sizeof(__Bits) == 2 * sizeof(int));                        \
+    __Bits __tmp;                                                             \
+    memcpy(&__tmp, &__val, sizeof(__val));                                    \
+    __tmp.__a = ::__FnName(__mask, __tmp.__a, __offset, __width);             \
+    __tmp.__b = ::__FnName(__mask, __tmp.__b, __offset, __width);             \
+    long long __ret;                                                          \
+    memcpy(&__ret, &__tmp, sizeof(__tmp));                                    \
+    return __ret;                                                             \
+  }                                                                           \
+  inline __device__ unsigned long long __FnName(                              \
+      unsigned int __mask, unsigned long long __val, int __offset,            \
+      int __width = warpSize) {                                               \
+    return static_cast<unsigned long long>(::__FnName(                        \
+        __mask, static_cast<long long>(__val), __offset, __width));           \
+  }                                                                           \
+  inline __device__ double __FnName(unsigned int __mask, double __val,        \
+                                    int __offset, int __width = warpSize) {   \
+    long long __tmp;                                                          \
+    _Static_assert(sizeof(__tmp) == sizeof(__val));                           \
+    memcpy(&__tmp, &__val, sizeof(__val));                                    \
+    __tmp = ::__FnName(__mask, __tmp, __offset, __width);                     \
+    double __ret;                                                             \
+    memcpy(&__ret, &__tmp, sizeof(__ret));                                    \
+    return __ret;                                                             \
+  }
+__MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32,
+                     __nvvm_shfl_sync_idx_f32, 0x1f);
+// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
+// maxLane.
+__MAKE_SYNC_SHUFFLES(__shfl_sync_up, __nvvm_shfl_sync_up_i32,
+                     __nvvm_shfl_sync_up_f32, 0);
+__MAKE_SYNC_SHUFFLES(__shfl_sync_down, __nvvm_shfl_sync_down_i32,
+                     __nvvm_shfl_sync_down_f32, 0x1f);
+__MAKE_SYNC_SHUFFLES(__shfl_sync_xor, __nvvm_shfl_sync_bfly_i32,
+                     __nvvm_shfl_sync_bfly_f32, 0x1f);
+
+#pragma pop_macro("__MAKE_SYNC_SHUFFLES")
+
+#endif // CUDA_VERSION >= 9000 && (!defined(__CUDA_ARCH__) ||
+       // __CUDA_ARCH__ >= 300)
+
 // sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.
 
 // Prevent the vanilla sm_32 intrinsics header from being included.
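For context, here is a minimal usage sketch of the wrappers defined above (illustrative, not part of the patch; the kernel and variable names are invented). It assumes a CUDA-9 toolchain targeting sm_30 or newer, where clang provides these wrappers implicitly during CUDA compilation:

// Illustrative only: warp-wide sum via the butterfly (xor) variant.
// After the loop, every lane holds the total, because each step folds
// values across lanes whose IDs differ in exactly one bit.
__global__ void warp_sum(const int *in, int *out) {
  int v = in[threadIdx.x];
  for (int offset = warpSize / 2; offset > 0; offset /= 2)
    v += __shfl_sync_xor(0xffffffffu, v, offset);
  if (threadIdx.x % warpSize == 0)
    out[threadIdx.x / warpSize] = v;
}

With the default __width == warpSize, the wrapper passes ((32 - 32) << 8) | 0x1f == 0x1f as shfl.sync's mask_and_clamp operand, matching the behavior of the older non-sync __shfl_xor.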
Index: cfe/trunk/test/CodeGen/builtins-nvptx-ptx60.cu
===================================================================
--- cfe/trunk/test/CodeGen/builtins-nvptx-ptx60.cu
+++ cfe/trunk/test/CodeGen/builtins-nvptx-ptx60.cu
@@ -0,0 +1,40 @@
+// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_60 \
+// RUN:            -fcuda-is-device -target-feature +ptx60 \
+// RUN:            -S -emit-llvm -o - -x cuda %s \
+// RUN:   | FileCheck -check-prefix=CHECK %s
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_60 \
+// RUN:   -fcuda-is-device -S -o /dev/null -x cuda -verify %s
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __shared__ __attribute__((shared))
+#define __constant__ __attribute__((constant))
+
+// CHECK-LABEL: nvvm_shfl_sync
+__device__ void nvvm_shfl_sync(unsigned mask, int i, float f, int a, int b) {
+  // CHECK: call i32 @llvm.nvvm.shfl.sync.down.i32(i32 {{%[0-9]+}}, i32
+  // expected-error@+1 {{'__nvvm_shfl_sync_down_i32' needs target feature ptx60}}
+  __nvvm_shfl_sync_down_i32(mask, i, a, b);
+  // CHECK: call float @llvm.nvvm.shfl.sync.down.f32(i32 {{%[0-9]+}}, float
+  // expected-error@+1 {{'__nvvm_shfl_sync_down_f32' needs target feature ptx60}}
+  __nvvm_shfl_sync_down_f32(mask, f, a, b);
+  // CHECK: call i32 @llvm.nvvm.shfl.sync.up.i32(i32 {{%[0-9]+}}, i32
+  // expected-error@+1 {{'__nvvm_shfl_sync_up_i32' needs target feature ptx60}}
+  __nvvm_shfl_sync_up_i32(mask, i, a, b);
+  // CHECK: call float @llvm.nvvm.shfl.sync.up.f32(i32 {{%[0-9]+}}, float
+  // expected-error@+1 {{'__nvvm_shfl_sync_up_f32' needs target feature ptx60}}
+  __nvvm_shfl_sync_up_f32(mask, f, a, b);
+  // CHECK: call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 {{%[0-9]+}}, i32
+  // expected-error@+1 {{'__nvvm_shfl_sync_bfly_i32' needs target feature ptx60}}
+  __nvvm_shfl_sync_bfly_i32(mask, i, a, b);
+  // CHECK: call float @llvm.nvvm.shfl.sync.bfly.f32(i32 {{%[0-9]+}}, float
+  // expected-error@+1 {{'__nvvm_shfl_sync_bfly_f32' needs target feature ptx60}}
+  __nvvm_shfl_sync_bfly_f32(mask, f, a, b);
+  // CHECK: call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 {{%[0-9]+}}, i32
+  // expected-error@+1 {{'__nvvm_shfl_sync_idx_i32' needs target feature ptx60}}
+  __nvvm_shfl_sync_idx_i32(mask, i, a, b);
+  // CHECK: call float @llvm.nvvm.shfl.sync.idx.f32(i32 {{%[0-9]+}}, float
+  // expected-error@+1 {{'__nvvm_shfl_sync_idx_f32' needs target feature ptx60}}
+  __nvvm_shfl_sync_idx_f32(mask, f, a, b);
+  // CHECK: ret void
+}
Index: cfe/trunk/test/CodeGen/builtins-nvptx.c
===================================================================
--- cfe/trunk/test/CodeGen/builtins-nvptx.c
+++ cfe/trunk/test/CodeGen/builtins-nvptx.c
@@ -636,3 +636,24 @@
   typedef double double2 __attribute__((ext_vector_type(2)));
   __nvvm_ldg_d2((const double2 *)p);
 }
+
+// CHECK-LABEL: nvvm_shfl
+__device__ void nvvm_shfl(int i, float f, int a, int b) {
+  // CHECK: call i32 @llvm.nvvm.shfl.down.i32(i32
+  __nvvm_shfl_down_i32(i, a, b);
+  // CHECK: call float @llvm.nvvm.shfl.down.f32(float
+  __nvvm_shfl_down_f32(f, a, b);
+  // CHECK: call i32 @llvm.nvvm.shfl.up.i32(i32
+  __nvvm_shfl_up_i32(i, a, b);
+  // CHECK: call float @llvm.nvvm.shfl.up.f32(float
+  __nvvm_shfl_up_f32(f, a, b);
+  // CHECK: call i32 @llvm.nvvm.shfl.bfly.i32(i32
+  __nvvm_shfl_bfly_i32(i, a, b);
+  // CHECK: call float @llvm.nvvm.shfl.bfly.f32(float
+  __nvvm_shfl_bfly_f32(f, a, b);
+  // CHECK: call i32 @llvm.nvvm.shfl.idx.i32(i32
+  __nvvm_shfl_idx_i32(i, a, b);
+  // CHECK: call float @llvm.nvvm.shfl.idx.f32(float
+  __nvvm_shfl_idx_f32(f, a, b);
+  // CHECK: ret void
+}
Index: llvm/trunk/include/llvm/IR/IntrinsicsNVVM.td
===================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicsNVVM.td
+++ llvm/trunk/include/llvm/IR/IntrinsicsNVVM.td
@@ -3736,4 +3736,48 @@
   Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
             [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.idx.f32">,
   GCCBuiltin<"__nvvm_shfl_idx_f32">;
+
+// Synchronizing shfl variants available in CUDA-9.
+// On sm_70 these don't have to be convergent, so we may eventually want to
+// implement a non-convergent variant of these intrinsics.
+
+// shfl.sync.down.b32 dest, threadmask, val, offset, mask_and_clamp
+def int_nvvm_shfl_sync_down_i32 :
+  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.down.i32">,
+  GCCBuiltin<"__nvvm_shfl_sync_down_i32">;
+def int_nvvm_shfl_sync_down_f32 :
+  Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.down.f32">,
+  GCCBuiltin<"__nvvm_shfl_sync_down_f32">;
+
+// shfl.sync.up.b32 dest, threadmask, val, offset, mask_and_clamp
+def int_nvvm_shfl_sync_up_i32 :
+  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.up.i32">,
+  GCCBuiltin<"__nvvm_shfl_sync_up_i32">;
+def int_nvvm_shfl_sync_up_f32 :
+  Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.up.f32">,
+  GCCBuiltin<"__nvvm_shfl_sync_up_f32">;
+
+// shfl.sync.bfly.b32 dest, threadmask, val, offset, mask_and_clamp
+def int_nvvm_shfl_sync_bfly_i32 :
+  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.bfly.i32">,
+  GCCBuiltin<"__nvvm_shfl_sync_bfly_i32">;
+def int_nvvm_shfl_sync_bfly_f32 :
+  Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.bfly.f32">,
+  GCCBuiltin<"__nvvm_shfl_sync_bfly_f32">;
+
+// shfl.sync.idx.b32 dest, threadmask, val, lane, mask_and_clamp
+def int_nvvm_shfl_sync_idx_i32 :
+  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.idx.i32">,
+  GCCBuiltin<"__nvvm_shfl_sync_idx_i32">;
+def int_nvvm_shfl_sync_idx_f32 :
+  Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.idx.f32">,
+  GCCBuiltin<"__nvvm_shfl_sync_idx_f32">;
 }
Index: llvm/trunk/lib/Target/NVPTX/NVPTXIntrinsics.td
===================================================================
--- llvm/trunk/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ llvm/trunk/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -111,8 +111,80 @@
 defm INT_SHFL_IDX_I32 : SHFL<Int32Regs, "idx", int_nvvm_shfl_idx_i32>;
 defm INT_SHFL_IDX_F32 : SHFL<Float32Regs, "idx", int_nvvm_shfl_idx_f32>;
 
-} // isConvergent = 1
+multiclass SHFL_SYNC<NVPTXRegClass regclass, string mode, Intrinsic IntOp> {
+  // Threadmask and the last two parameters to shfl.sync can be regs or imms.
+  // ptxas is smart enough to inline constant registers, so strictly speaking we
+  // don't need to handle immediates here. But it's easy enough, and it makes
+  // our ptx more readable.
+  def rrr : NVPTXInst<
+      (outs regclass:$dst),
+      (ins Int32Regs:$threadmask, regclass:$src, Int32Regs:$offset, Int32Regs:$mask),
+      !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
+      [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src,
+                                  Int32Regs:$offset, Int32Regs:$mask))]>;
+
+  def rri : NVPTXInst<
+      (outs regclass:$dst),
+      (ins Int32Regs:$threadmask, regclass:$src, Int32Regs:$offset, i32imm:$mask),
+      !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
+      [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src,
+                                  Int32Regs:$offset, imm:$mask))]>;
+
+  def rir : NVPTXInst<
+      (outs regclass:$dst),
+      (ins Int32Regs:$threadmask, regclass:$src, i32imm:$offset, Int32Regs:$mask),
+      !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
+      [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src,
+                                  imm:$offset, Int32Regs:$mask))]>;
+
+  def rii : NVPTXInst<
+      (outs regclass:$dst),
+      (ins Int32Regs:$threadmask, regclass:$src, i32imm:$offset, i32imm:$mask),
+      !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
+      [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src,
+                                  imm:$offset, imm:$mask))]>;
+
+  def irr : NVPTXInst<
+      (outs regclass:$dst),
+      (ins i32imm:$threadmask, regclass:$src, Int32Regs:$offset, Int32Regs:$mask),
+      !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
+      [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src,
+                                  Int32Regs:$offset, Int32Regs:$mask))]>;
+
+  def iri : NVPTXInst<
+      (outs regclass:$dst),
+      (ins i32imm:$threadmask, regclass:$src, Int32Regs:$offset, i32imm:$mask),
+      !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
+      [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src,
+                                  Int32Regs:$offset, imm:$mask))]>;
+
+  def iir : NVPTXInst<
+      (outs regclass:$dst),
+      (ins i32imm:$threadmask, regclass:$src, i32imm:$offset, Int32Regs:$mask),
+      !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
+      [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src,
+                                  imm:$offset, Int32Regs:$mask))]>;
+
+  def iii : NVPTXInst<
+      (outs regclass:$dst),
+      (ins i32imm:$threadmask, regclass:$src, i32imm:$offset, i32imm:$mask),
+      !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
+      [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src,
+                                  imm:$offset, imm:$mask))]>;
+}
+
+// On sm_70 these don't have to be convergent, so we may eventually want to
+// implement a non-convergent variant of these intrinsics.
+defm INT_SHFL_SYNC_DOWN_I32 : SHFL_SYNC<Int32Regs, "down", int_nvvm_shfl_sync_down_i32>;
+defm INT_SHFL_SYNC_DOWN_F32 : SHFL_SYNC<Float32Regs, "down", int_nvvm_shfl_sync_down_f32>;
+defm INT_SHFL_SYNC_UP_I32 : SHFL_SYNC<Int32Regs, "up", int_nvvm_shfl_sync_up_i32>;
+defm INT_SHFL_SYNC_UP_F32 : SHFL_SYNC<Float32Regs, "up", int_nvvm_shfl_sync_up_f32>;
+defm INT_SHFL_SYNC_BFLY_I32 : SHFL_SYNC<Int32Regs, "bfly", int_nvvm_shfl_sync_bfly_i32>;
+defm INT_SHFL_SYNC_BFLY_F32 : SHFL_SYNC<Float32Regs, "bfly", int_nvvm_shfl_sync_bfly_f32>;
+defm INT_SHFL_SYNC_IDX_I32 : SHFL_SYNC<Int32Regs, "idx", int_nvvm_shfl_sync_idx_i32>;
+defm INT_SHFL_SYNC_IDX_F32 : SHFL_SYNC<Float32Regs, "idx", int_nvvm_shfl_sync_idx_f32>;
+
+} // isConvergent = 1
 
 //-----------------------------------
 // Explicit Memory Fence Functions
Index: llvm/trunk/test/CodeGen/NVPTX/shfl-sync.ll
===================================================================
--- llvm/trunk/test/CodeGen/NVPTX/shfl-sync.ll
+++ llvm/trunk/test/CodeGen/NVPTX/shfl-sync.ll
@@ -0,0 +1,94 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 | FileCheck %s
+
+declare i32 @llvm.nvvm.shfl.sync.down.i32(i32, i32, i32, i32)
+declare float @llvm.nvvm.shfl.sync.down.f32(i32, float, i32, i32)
+declare i32 @llvm.nvvm.shfl.sync.up.i32(i32, i32, i32, i32)
+declare float @llvm.nvvm.shfl.sync.up.f32(i32, float, i32, i32)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32)
+declare float @llvm.nvvm.shfl.sync.bfly.f32(i32, float, i32, i32)
+declare i32 @llvm.nvvm.shfl.sync.idx.i32(i32, i32, i32, i32)
+declare float @llvm.nvvm.shfl.sync.idx.f32(i32, float, i32, i32)
+
+; CHECK-LABEL: .func{{.*}}shfl.sync.rrr
+define i32 @shfl.sync.rrr(i32 %mask, i32 %a, i32 %b, i32 %c) {
+  ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], [[B]], [[C]], [[MASK]];
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 %mask, i32 %a, i32 %b, i32 %c)
+  ret i32 %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.sync.irr
+define i32 @shfl.sync.irr(i32 %a, i32 %b, i32 %c) {
+  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], [[B]], [[C]], 1;
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 1, i32 %a, i32 %b, i32 %c)
+  ret i32 %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.sync.rri
+define i32 @shfl.sync.rri(i32 %mask, i32 %a, i32 %b) {
+  ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
+  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], [[B]], 1, [[MASK]];
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 %mask, i32 %a, i32 %b, i32 1)
+  ret i32 %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.sync.iri
+define i32 @shfl.sync.iri(i32 %a, i32 %b) {
+  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
+  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], [[B]], 2, 1;
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 1, i32 %a, i32 %b, i32 2)
+  ret i32 %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.sync.rir
+define i32 @shfl.sync.rir(i32 %mask, i32 %a, i32 %c) {
+  ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], 1, [[C]], [[MASK]];
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 %mask, i32 %a, i32 1, i32 %c)
+  ret i32 %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.sync.iir
+define i32 @shfl.sync.iir(i32 %a, i32 %c) {
+  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], 2, [[C]], 1;
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 1, i32 %a, i32 2, i32 %c)
+  ret i32 %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.sync.rii
+define i32 @shfl.sync.rii(i32 %mask, i32 %a) {
+  ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
+  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], 1, 2, [[MASK]];
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 %mask, i32 %a, i32 1, i32 2)
+  ret i32 %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.sync.iii
+define i32 @shfl.sync.iii(i32 %a, i32 %b) {
+  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
+  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], 2, 3, 1;
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 1, i32 %a, i32 2, i32 3)
+  ret i32 %val
+}
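As a closing sanity check on the width encoding (again a sketch, not part of the patch; the kernel name is invented): with a sub-warp __width of 16, the header computes ((32 - 16) << 8) | 0x1f == 0x101f for mask_and_clamp, so the underlying shfl.sync.idx.b32 clamps lane indices within each 16-lane segment:

// Illustrative only: broadcast lane 0's value within each 16-lane
// segment using the idx-mode wrapper (__shfl_sync).
__global__ void broadcast16(int *data) {
  int v = data[threadIdx.x];
  v = __shfl_sync(0xffffffffu, v, /*srcLane=*/0, /*width=*/16);
  data[threadIdx.x] = v;
}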