Index: include/llvm/IR/IntrinsicsNVVM.td
===================================================================
--- include/llvm/IR/IntrinsicsNVVM.td
+++ include/llvm/IR/IntrinsicsNVVM.td
@@ -737,6 +737,10 @@
       Intrinsic<[], [], [IntrNoDuplicate, IntrConvergent]>;
   def int_nvvm_barrier0 : GCCBuiltin<"__nvvm_bar0">,
       Intrinsic<[], [], [IntrNoDuplicate, IntrConvergent]>;
+  def int_nvvm_barrier_n : GCCBuiltin<"__nvvm_bar_n">,
+      Intrinsic<[], [llvm_i32_ty], [IntrNoDuplicate, IntrConvergent]>;
+  def int_nvvm_barrier : GCCBuiltin<"__nvvm_bar">,
+      Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrNoDuplicate, IntrConvergent]>;
   def int_nvvm_barrier0_popc : GCCBuiltin<"__nvvm_bar0_popc">,
       Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoDuplicate, IntrConvergent]>;
   def int_nvvm_barrier0_and : GCCBuiltin<"__nvvm_bar0_and">,
Index: lib/Target/NVPTX/NVPTXIntrinsics.td
===================================================================
--- lib/Target/NVPTX/NVPTXIntrinsics.td
+++ lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -39,6 +39,12 @@
 def INT_BARRIER0 : NVPTXInst<(outs), (ins),
                   "bar.sync \t0;",
       [(int_nvvm_barrier0)]>;
+def INT_BARRIERN : NVPTXInst<(outs), (ins Int32Regs:$src1),
+                  "bar.sync \t$src1;",
+      [(int_nvvm_barrier_n Int32Regs:$src1)]>;
+def INT_BARRIER : NVPTXInst<(outs), (ins Int32Regs:$src1, Int32Regs:$src2),
+                  "bar.sync \t$src1, $src2;",
+      [(int_nvvm_barrier Int32Regs:$src1, Int32Regs:$src2)]>;
 def INT_BARRIER0_POPC : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
   !strconcat("{{ \n\t",
   !strconcat(".reg .pred \t%p1; \n\t",
Index: test/CodeGen/NVPTX/named-barriers.ll
===================================================================
--- /dev/null
+++ test/CodeGen/NVPTX/named-barriers.ll
@@ -0,0 +1,41 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+
+; Use bar.sync to arrive at a pre-computed barrier number and
+; wait for all threads in CTA to also arrive:
+define ptx_device void @test_barrier_named_cta() {
+; CHECK: mov.u32 %r[[REG0:[0-9]+]], 0;
+; CHECK: bar.sync %r[[REG0]];
+; CHECK: mov.u32 %r[[REG1:[0-9]+]], 10;
+; CHECK: bar.sync %r[[REG1]];
+; CHECK: mov.u32 %r[[REG2:[0-9]+]], 15;
+; CHECK: bar.sync %r[[REG2]];
+; CHECK: ret;
+  call void @llvm.nvvm.barrier.n(i32 0)
+  call void @llvm.nvvm.barrier.n(i32 10)
+  call void @llvm.nvvm.barrier.n(i32 15)
+  ret void
+}
+
+; Use bar.sync to arrive at a pre-computed barrier number and
+; wait for a fixed number of cooperating threads to arrive:
+define ptx_device void @test_barrier_named() {
+; CHECK: mov.u32 %r[[REG0A:[0-9]+]], 32;
+; CHECK: mov.u32 %r[[REG0B:[0-9]+]], 0;
+; CHECK: bar.sync %r[[REG0B]], %r[[REG0A]];
+; CHECK: mov.u32 %r[[REG1A:[0-9]+]], 352;
+; CHECK: mov.u32 %r[[REG1B:[0-9]+]], 10;
+; CHECK: bar.sync %r[[REG1B]], %r[[REG1A]];
+; CHECK: mov.u32 %r[[REG2A:[0-9]+]], 992;
+; CHECK: mov.u32 %r[[REG2B:[0-9]+]], 15;
+; CHECK: bar.sync %r[[REG2B]], %r[[REG2A]];
+; CHECK: ret;
+  call void @llvm.nvvm.barrier(i32 0, i32 32)
+  call void @llvm.nvvm.barrier(i32 10, i32 352)
+  call void @llvm.nvvm.barrier(i32 15, i32 992)
+  ret void
+}
+
+declare void @llvm.nvvm.barrier(i32, i32)
+declare void @llvm.nvvm.barrier.n(i32)
+
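
Note (illustration only, not part of the patch): because the INT_BARRIERN and INT_BARRIER instruction patterns match Int32Regs operands rather than immediates, the barrier id and thread count passed to the new intrinsics do not have to be compile-time constants, even though the test above only exercises constants. A minimal IR sketch of that use, assuming this patch is applied; the function name @barrier_dynamic and its arguments are made up for the example:

define ptx_device void @barrier_dynamic(i32 %id, i32 %cnt) {
  ; PTX restricts the logical barrier id to the range 0-15 and requires the
  ; thread count to be a multiple of the warp size; the caller is assumed to
  ; guarantee both here before the values reach bar.sync.
  call void @llvm.nvvm.barrier(i32 %id, i32 %cnt)
  ret void
}

declare void @llvm.nvvm.barrier(i32, i32)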