Index: llvm/trunk/include/llvm/IR/IntrinsicsNVVM.td
===================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicsNVVM.td
+++ llvm/trunk/include/llvm/IR/IntrinsicsNVVM.td
@@ -733,6 +733,13 @@
   // intrinsics in this file, this one is a user-facing API.
   def int_nvvm_barrier0 : GCCBuiltin<"__syncthreads">,
       Intrinsic<[], [], [IntrConvergent]>;
+  // Synchronize all threads in the CTA at barrier 'n'.
+  def int_nvvm_barrier_n : GCCBuiltin<"__nvvm_bar_n">,
+      Intrinsic<[], [llvm_i32_ty], [IntrConvergent]>;
+  // Synchronize 'm' threads (arg 2), where 'm' is a multiple of the warp
+  // size, in the CTA at barrier 'n' (arg 1).
+  def int_nvvm_barrier : GCCBuiltin<"__nvvm_bar">,
+      Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrConvergent]>;
   def int_nvvm_barrier0_popc : GCCBuiltin<"__nvvm_bar0_popc">,
       Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrConvergent]>;
   def int_nvvm_barrier0_and : GCCBuiltin<"__nvvm_bar0_and">,
Index: llvm/trunk/lib/Target/NVPTX/NVPTXIntrinsics.td
===================================================================
--- llvm/trunk/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ llvm/trunk/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -36,6 +36,12 @@
 def INT_BARRIER0 : NVPTXInst<(outs), (ins),
                   "bar.sync \t0;",
       [(int_nvvm_barrier0)]>;
+def INT_BARRIERN : NVPTXInst<(outs), (ins Int32Regs:$src1),
+                  "bar.sync \t$src1;",
+      [(int_nvvm_barrier_n Int32Regs:$src1)]>;
+def INT_BARRIER : NVPTXInst<(outs), (ins Int32Regs:$src1, Int32Regs:$src2),
+                  "bar.sync \t$src1, $src2;",
+      [(int_nvvm_barrier Int32Regs:$src1, Int32Regs:$src2)]>;
 def INT_BARRIER0_POPC : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
   !strconcat("{{ \n\t",
              ".reg .pred \t%p1; \n\t",
Index: llvm/trunk/test/CodeGen/NVPTX/named-barriers.ll
===================================================================
--- llvm/trunk/test/CodeGen/NVPTX/named-barriers.ll
+++ llvm/trunk/test/CodeGen/NVPTX/named-barriers.ll
@@ -0,0 +1,42 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+
+; Use bar.sync to arrive at a pre-computed barrier number and
+; wait for all threads in CTA to also arrive:
+define ptx_device void @test_barrier_named_cta() {
+; CHECK-LABEL: test_barrier_named_cta(
+; CHECK: mov.u32 %r[[REG0:[0-9]+]], 0;
+; CHECK: bar.sync %r[[REG0]];
+; CHECK: mov.u32 %r[[REG1:[0-9]+]], 10;
+; CHECK: bar.sync %r[[REG1]];
+; CHECK: mov.u32 %r[[REG2:[0-9]+]], 15;
+; CHECK: bar.sync %r[[REG2]];
+; CHECK: ret;
+  call void @llvm.nvvm.barrier.n(i32 0)
+  call void @llvm.nvvm.barrier.n(i32 10)
+  call void @llvm.nvvm.barrier.n(i32 15)
+  ret void
+}
+
+; Use bar.sync to arrive at a pre-computed barrier number and
+; wait for fixed number of cooperating threads to arrive:
+define ptx_device void @test_barrier_named() {
+; CHECK-LABEL: test_barrier_named(
+; CHECK: mov.u32 %r[[REG0A:[0-9]+]], 32;
+; CHECK: mov.u32 %r[[REG0B:[0-9]+]], 0;
+; CHECK: bar.sync %r[[REG0B]], %r[[REG0A]];
+; CHECK: mov.u32 %r[[REG1A:[0-9]+]], 352;
+; CHECK: mov.u32 %r[[REG1B:[0-9]+]], 10;
+; CHECK: bar.sync %r[[REG1B]], %r[[REG1A]];
+; CHECK: mov.u32 %r[[REG2A:[0-9]+]], 992;
+; CHECK: mov.u32 %r[[REG2B:[0-9]+]], 15;
+; CHECK: bar.sync %r[[REG2B]], %r[[REG2A]];
+; CHECK: ret;
+  call void @llvm.nvvm.barrier(i32 0, i32 32)
+  call void @llvm.nvvm.barrier(i32 10, i32 352)
+  call void @llvm.nvvm.barrier(i32 15, i32 992)
+  ret void
+}
+
+declare void @llvm.nvvm.barrier(i32, i32)
+declare void @llvm.nvvm.barrier.n(i32)