Index: include/llvm/IR/IntrinsicsNVVM.td
===================================================================
--- include/llvm/IR/IntrinsicsNVVM.td
+++ include/llvm/IR/IntrinsicsNVVM.td
@@ -730,15 +730,15 @@
 
 // Bar.Sync
   def int_cuda_syncthreads : GCCBuiltin<"__syncthreads">,
-      Intrinsic<[], [], [IntrNoDuplicate]>;
+      Intrinsic<[], [], [IntrConvergent]>;
   def int_nvvm_barrier0 : GCCBuiltin<"__nvvm_bar0">,
-      Intrinsic<[], [], [IntrNoDuplicate]>;
+      Intrinsic<[], [], [IntrConvergent]>;
   def int_nvvm_barrier0_popc : GCCBuiltin<"__nvvm_bar0_popc">,
-      Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoDuplicate]>;
+      Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrConvergent]>;
   def int_nvvm_barrier0_and : GCCBuiltin<"__nvvm_bar0_and">,
-      Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoDuplicate]>;
+      Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrConvergent]>;
   def int_nvvm_barrier0_or : GCCBuiltin<"__nvvm_bar0_or">,
-      Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoDuplicate]>;
+      Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrConvergent]>;
 
   // Membar
   def int_nvvm_membar_cta : GCCBuiltin<"__nvvm_membar_cta">,
Index: lib/Transforms/Scalar/JumpThreading.cpp
===================================================================
--- lib/Transforms/Scalar/JumpThreading.cpp
+++ lib/Transforms/Scalar/JumpThreading.cpp
@@ -273,7 +273,7 @@
     // as having cost of 2 total, and if they are a vector intrinsic, we model
     // them as having cost 1.
     if (const CallInst *CI = dyn_cast<CallInst>(I)) {
-      if (CI->cannotDuplicate())
+      if (CI->cannotDuplicate() || CI->hasFnAttr(Attribute::Convergent))
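-        // Blocks with NoDuplicate are modelled as having infinite cost, so they
-        // are never duplicated.
+        // Blocks containing NoDuplicate or convergent calls are modelled as
+        // having infinite cost, so they are never duplicated.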
         return ~0U;
Index: test/CodeGen/NVPTX/convergent-syncthreads.ll
===================================================================
--- /dev/null
+++ test/CodeGen/NVPTX/convergent-syncthreads.ll
@@ -0,0 +1,51 @@
+; RUN: opt < %s -O3 -S | FileCheck %s
+
+; Make sure the call to syncthreads is not duplicated here by the LLVM
+; optimizations, because it has the convergent attribute set.
+
+; CHECK: call void @llvm.cuda.syncthreads
+; CHECK-NOT: call void @llvm.cuda.syncthreads
+
+; Function Attrs: nounwind
+define void @foo(i32 %idx, float* %output, float* %output2) #1 {
+entry:
+  %cmp = icmp ult i32 %idx, 10
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  %0 = load float, float* %output, align 4
+  %conv1 = fadd float %0, 1.000000e+00
+  store float %conv1, float* %output, align 4
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %1 = load float, float* %output2, align 4
+  %conv4 = fadd float %1, 2.000000e+00
+  store float %conv4, float* %output2, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  tail call void @llvm.cuda.syncthreads()
+  br i1 %cmp, label %if.then.6, label %if.else.10
+
+if.then.6:                                        ; preds = %if.end
+  %2 = load float, float* %output, align 4
+  %conv9 = fadd float %2, 3.000000e+00
+  store float %conv9, float* %output, align 4
+  br label %if.end.14
+
+if.else.10:                                       ; preds = %if.end
+  %3 = load float, float* %output2, align 4
+  %conv13 = fadd float %3, 4.000000e+00
+  store float %conv13, float* %output2, align 4
+  br label %if.end.14
+
+if.end.14:                                        ; preds = %if.else.10, %if.then.6
+  ret void
+}
+
+; Function Attrs: convergent nounwind
+declare void @llvm.cuda.syncthreads() #2
+
+!0 = !{void (i32, float*, float*)* @foo, !"kernel", i32 1}
+!1 = !{null, !"align", i32 8}
Index: test/CodeGen/NVPTX/noduplicate-syncthreads.ll
===================================================================
--- test/CodeGen/NVPTX/noduplicate-syncthreads.ll
+++ /dev/null
@@ -1,74 +0,0 @@
-; RUN: opt < %s -O3 -S | FileCheck %s
-
-; Make sure the call to syncthreads is not duplicate here by the LLVM
-; optimizations, because it has the noduplicate attribute set.
-
-; CHECK: call void @llvm.cuda.syncthreads
-; CHECK-NOT: call void @llvm.cuda.syncthreads
-
-; Function Attrs: nounwind
-define void @foo(float* %output) #1 {
-entry:
-  %output.addr = alloca float*, align 8
-  store float* %output, float** %output.addr, align 8
-  %0 = load float*, float** %output.addr, align 8
-  %arrayidx = getelementptr inbounds float, float* %0, i64 0
-  %1 = load float, float* %arrayidx, align 4
-  %conv = fpext float %1 to double
-  %cmp = fcmp olt double %conv, 1.000000e+01
-  br i1 %cmp, label %if.then, label %if.else
-
-if.then:                                          ; preds = %entry
-  %2 = load float*, float** %output.addr, align 8
-  %3 = load float, float* %2, align 4
-  %conv1 = fpext float %3 to double
-  %add = fadd double %conv1, 1.000000e+00
-  %conv2 = fptrunc double %add to float
-  store float %conv2, float* %2, align 4
-  br label %if.end
-
-if.else:                                          ; preds = %entry
-  %4 = load float*, float** %output.addr, align 8
-  %5 = load float, float* %4, align 4
-  %conv3 = fpext float %5 to double
-  %add4 = fadd double %conv3, 2.000000e+00
-  %conv5 = fptrunc double %add4 to float
-  store float %conv5, float* %4, align 4
-  br label %if.end
-
-if.end:                                           ; preds = %if.else, %if.then
-  call void @llvm.cuda.syncthreads()
-  %6 = load float*, float** %output.addr, align 8
-  %arrayidx6 = getelementptr inbounds float, float* %6, i64 0
-  %7 = load float, float* %arrayidx6, align 4
-  %conv7 = fpext float %7 to double
-  %cmp8 = fcmp olt double %conv7, 1.000000e+01
-  br i1 %cmp8, label %if.then9, label %if.else13
-
-if.then9:                                         ; preds = %if.end
-  %8 = load float*, float** %output.addr, align 8
-  %9 = load float, float* %8, align 4
-  %conv10 = fpext float %9 to double
-  %add11 = fadd double %conv10, 3.000000e+00
-  %conv12 = fptrunc double %add11 to float
-  store float %conv12, float* %8, align 4
-  br label %if.end17
-
-if.else13:                                        ; preds = %if.end
-  %10 = load float*, float** %output.addr, align 8
-  %11 = load float, float* %10, align 4
-  %conv14 = fpext float %11 to double
-  %add15 = fadd double %conv14, 4.000000e+00
-  %conv16 = fptrunc double %add15 to float
-  store float %conv16, float* %10, align 4
-  br label %if.end17
-
-if.end17:                                         ; preds = %if.else13, %if.then9
-  ret void
-}
-
-; Function Attrs: noduplicate nounwind
-declare void @llvm.cuda.syncthreads() #2
-
-!0 = !{void (float*)* @foo, !"kernel", i32 1}
-!1 = !{null, !"align", i32 8}