Index: include/llvm/IR/IntrinsicsNVVM.td =================================================================== --- include/llvm/IR/IntrinsicsNVVM.td +++ include/llvm/IR/IntrinsicsNVVM.td @@ -730,15 +730,15 @@ // Bar.Sync def int_cuda_syncthreads : GCCBuiltin<"__syncthreads">, - Intrinsic<[], [], [IntrNoDuplicate]>; + Intrinsic<[], [], [IntrConvergent]>; def int_nvvm_barrier0 : GCCBuiltin<"__nvvm_bar0">, - Intrinsic<[], [], [IntrNoDuplicate]>; + Intrinsic<[], [], [IntrConvergent]>; def int_nvvm_barrier0_popc : GCCBuiltin<"__nvvm_bar0_popc">, - Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoDuplicate]>; + Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrConvergent]>; def int_nvvm_barrier0_and : GCCBuiltin<"__nvvm_bar0_and">, - Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoDuplicate]>; + Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrConvergent]>; def int_nvvm_barrier0_or : GCCBuiltin<"__nvvm_bar0_or">, - Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoDuplicate]>; + Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrConvergent]>; // Membar def int_nvvm_membar_cta : GCCBuiltin<"__nvvm_membar_cta">, Index: lib/Transforms/Scalar/JumpThreading.cpp =================================================================== --- lib/Transforms/Scalar/JumpThreading.cpp +++ lib/Transforms/Scalar/JumpThreading.cpp @@ -273,7 +273,7 @@ // as having cost of 2 total, and if they are a vector intrinsic, we model // them as having cost 1. if (const CallInst *CI = dyn_cast<CallInst>(I)) { - if (CI->cannotDuplicate()) + if (CI->cannotDuplicate() || CI->hasFnAttr(Attribute::Convergent)) // Blocks with NoDuplicate are modelled as having infinite cost, so they // are never duplicated. 
return ~0U; Index: test/CodeGen/NVPTX/convergent-syncthreads.ll =================================================================== --- /dev/null +++ test/CodeGen/NVPTX/convergent-syncthreads.ll @@ -0,0 +1,51 @@ +; RUN: opt < %s -O3 -S | FileCheck %s + +; Make sure the call to syncthreads is not duplicated here by the LLVM +; optimizations, because it has the convergent attribute set. + +; CHECK: call void @llvm.cuda.syncthreads +; CHECK-NOT: call void @llvm.cuda.syncthreads + +; Function Attrs: nounwind +define void @foo(i32 %idx, float* %output, float* %output2) #1 { +entry: + %cmp = icmp ult i32 %idx, 10 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %0 = load float, float* %output, align 4 + %conv1 = fadd float %0, 1.000000e+00 + store float %conv1, float* %output, align 4 + br label %if.end + +if.else: ; preds = %entry + %1 = load float, float* %output2, align 4 + %conv4 = fadd float %1, 2.000000e+00 + store float %conv4, float* %output2, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + tail call void @llvm.cuda.syncthreads() + br i1 %cmp, label %if.then.6, label %if.else.10 + +if.then.6: ; preds = %if.end + %2 = load float, float* %output, align 4 + %conv9 = fadd float %2, 3.000000e+00 + store float %conv9, float* %output, align 4 + br label %if.end.14 + +if.else.10: ; preds = %if.end + %3 = load float, float* %output2, align 4 + %conv13 = fadd float %3, 4.000000e+00 + store float %conv13, float* %output2, align 4 + br label %if.end.14 + +if.end.14: ; preds = %if.else.10, %if.then.6 + ret void +} + +; Function Attrs: convergent nounwind +declare void @llvm.cuda.syncthreads() #2 + +!0 = !{void (i32, float*, float*)* @foo, !"kernel", i32 1} +!1 = !{null, !"align", i32 8} Index: test/CodeGen/NVPTX/noduplicate-syncthreads.ll =================================================================== --- test/CodeGen/NVPTX/noduplicate-syncthreads.ll +++ /dev/null @@ -1,74 +0,0 @@ -; RUN: opt < %s -O3 -S | FileCheck %s - -; 
Make sure the call to syncthreads is not duplicate here by the LLVM -; optimizations, because it has the noduplicate attribute set. - -; CHECK: call void @llvm.cuda.syncthreads -; CHECK-NOT: call void @llvm.cuda.syncthreads - -; Function Attrs: nounwind -define void @foo(float* %output) #1 { -entry: - %output.addr = alloca float*, align 8 - store float* %output, float** %output.addr, align 8 - %0 = load float*, float** %output.addr, align 8 - %arrayidx = getelementptr inbounds float, float* %0, i64 0 - %1 = load float, float* %arrayidx, align 4 - %conv = fpext float %1 to double - %cmp = fcmp olt double %conv, 1.000000e+01 - br i1 %cmp, label %if.then, label %if.else - -if.then: ; preds = %entry - %2 = load float*, float** %output.addr, align 8 - %3 = load float, float* %2, align 4 - %conv1 = fpext float %3 to double - %add = fadd double %conv1, 1.000000e+00 - %conv2 = fptrunc double %add to float - store float %conv2, float* %2, align 4 - br label %if.end - -if.else: ; preds = %entry - %4 = load float*, float** %output.addr, align 8 - %5 = load float, float* %4, align 4 - %conv3 = fpext float %5 to double - %add4 = fadd double %conv3, 2.000000e+00 - %conv5 = fptrunc double %add4 to float - store float %conv5, float* %4, align 4 - br label %if.end - -if.end: ; preds = %if.else, %if.then - call void @llvm.cuda.syncthreads() - %6 = load float*, float** %output.addr, align 8 - %arrayidx6 = getelementptr inbounds float, float* %6, i64 0 - %7 = load float, float* %arrayidx6, align 4 - %conv7 = fpext float %7 to double - %cmp8 = fcmp olt double %conv7, 1.000000e+01 - br i1 %cmp8, label %if.then9, label %if.else13 - -if.then9: ; preds = %if.end - %8 = load float*, float** %output.addr, align 8 - %9 = load float, float* %8, align 4 - %conv10 = fpext float %9 to double - %add11 = fadd double %conv10, 3.000000e+00 - %conv12 = fptrunc double %add11 to float - store float %conv12, float* %8, align 4 - br label %if.end17 - -if.else13: ; preds = %if.end - %10 = load float*, 
float** %output.addr, align 8 - %11 = load float, float* %10, align 4 - %conv14 = fpext float %11 to double - %add15 = fadd double %conv14, 4.000000e+00 - %conv16 = fptrunc double %add15 to float - store float %conv16, float* %10, align 4 - br label %if.end17 - -if.end17: ; preds = %if.else13, %if.then9 - ret void -} - -; Function Attrs: noduplicate nounwind -declare void @llvm.cuda.syncthreads() #2 - -!0 = !{void (float*)* @foo, !"kernel", i32 1} -!1 = !{null, !"align", i32 8}