diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -582,6 +582,11 @@
 TARGET_BUILTIN(__nvvm_barrier_sync, "vUi", "n", PTX60)
 TARGET_BUILTIN(__nvvm_barrier_sync_cnt, "vUiUi", "n", PTX60)
 
+TARGET_BUILTIN(__nvvm_barrier_cluster_arrive, "v", "n", AND(SM_90,PTX78))
+TARGET_BUILTIN(__nvvm_barrier_cluster_arrive_relaxed, "v", "n", AND(SM_90,PTX80))
+TARGET_BUILTIN(__nvvm_barrier_cluster_wait, "v", "n", AND(SM_90,PTX78))
+TARGET_BUILTIN(__nvvm_fence_sc_cluster, "v", "n", AND(SM_90,PTX78))
+
 // Shuffle
 
 BUILTIN(__nvvm_shfl_down_i32, "iiii", "")
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18962,6 +18962,18 @@
     return Builder.CreateCall(
         CGM.getIntrinsic(Intrinsic::nvvm_getctarank_shared_cluster),
         EmitScalarExpr(E->getArg(0)));
+  case NVPTX::BI__nvvm_barrier_cluster_arrive:
+    return Builder.CreateCall(
+        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_arrive));
+  case NVPTX::BI__nvvm_barrier_cluster_arrive_relaxed:
+    return Builder.CreateCall(
+        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_arrive_relaxed));
+  case NVPTX::BI__nvvm_barrier_cluster_wait:
+    return Builder.CreateCall(
+        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_wait));
+  case NVPTX::BI__nvvm_fence_sc_cluster:
+    return Builder.CreateCall(
+        CGM.getIntrinsic(Intrinsic::nvvm_fence_sc_cluster));
   default:
     return nullptr;
   }
diff --git a/clang/test/CodeGenCUDA/builtins-sm90.cu b/clang/test/CodeGenCUDA/builtins-sm90.cu
--- a/clang/test/CodeGenCUDA/builtins-sm90.cu
+++ b/clang/test/CodeGenCUDA/builtins-sm90.cu
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 "-triple" "nvptx64-nvidia-cuda" "-target-feature" "+ptx78" "-target-cpu" "sm_90" -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
+// RUN: %clang_cc1 "-triple" "nvptx64-nvidia-cuda" "-target-feature" "+ptx80" "-target-cpu" "sm_90" -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
 
 // CHECK: define{{.*}} void @_Z6kernelPlPvj(
 __attribute__((global)) void kernel(long *out, void *ptr, unsigned u) {
@@ -57,5 +57,14 @@
   // CHECK: call i32 @llvm.nvvm.getctarank.shared.cluster(ptr addrspace(3) {{.*}})
   out[i++] = __nvvm_getctarank_shared_cluster(sptr);
 
+  // CHECK: call void @llvm.nvvm.barrier.cluster.arrive()
+  __nvvm_barrier_cluster_arrive();
+  // CHECK: call void @llvm.nvvm.barrier.cluster.arrive.relaxed()
+  __nvvm_barrier_cluster_arrive_relaxed();
+  // CHECK: call void @llvm.nvvm.barrier.cluster.wait()
+  __nvvm_barrier_cluster_wait();
+  // CHECK: call void @llvm.nvvm.fence.sc.cluster()
+  __nvvm_fence_sc_cluster();
+
   // CHECK: ret void
 }
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1358,6 +1358,14 @@
       Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoCallback]>,
       ClangBuiltin<"__nvvm_barrier_sync_cnt">;
 
+  // barrier.cluster.[wait, arrive, arrive.relaxed]
+  def int_nvvm_barrier_cluster_arrive :
+      Intrinsic<[], [], [IntrConvergent, IntrNoCallback]>;
+  def int_nvvm_barrier_cluster_arrive_relaxed :
+      Intrinsic<[], [], [IntrConvergent, IntrNoCallback]>;
+  def int_nvvm_barrier_cluster_wait :
+      Intrinsic<[], [], [IntrConvergent, IntrNoCallback]>;
+
   // Membar
   def int_nvvm_membar_cta : ClangBuiltin<"__nvvm_membar_cta">,
       Intrinsic<[], [], [IntrNoCallback]>;
@@ -1365,6 +1373,8 @@
       Intrinsic<[], [], [IntrNoCallback]>;
   def int_nvvm_membar_sys : ClangBuiltin<"__nvvm_membar_sys">,
       Intrinsic<[], [], [IntrNoCallback]>;
+  def int_nvvm_fence_sc_cluster:
+      Intrinsic<[], [], [IntrNoCallback]>;
 
   // Async Copy
   def int_nvvm_cp_async_mbarrier_arrive :
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -132,6 +132,18 @@
                   "barrier.sync \t$id, $cnt;",
                   [(int_nvvm_barrier_sync_cnt imm:$id, imm:$cnt)]>,
         Requires<[hasPTX<60>, hasSM<30>]>;
 
+class INT_BARRIER_CLUSTER<string variant, Intrinsic Intr,
+                          list<Predicate> Preds = [hasPTX<78>, hasSM<90>]>:
+        NVPTXInst<(outs), (ins), "barrier.cluster."# variant #";", [(Intr)]>,
+        Requires<Preds>;
+
+def barrier_cluster_arrive:
+        INT_BARRIER_CLUSTER<"arrive", int_nvvm_barrier_cluster_arrive>;
+def barrier_cluster_arrive_relaxed:
+        INT_BARRIER_CLUSTER<"arrive.relaxed",
+            int_nvvm_barrier_cluster_arrive_relaxed, [hasPTX<80>, hasSM<90>]>;
+def barrier_cluster_wait:
+        INT_BARRIER_CLUSTER<"wait", int_nvvm_barrier_cluster_wait>;
 
 class SHFL_INSTR
@@ -303,6 +315,9 @@
 def INT_MEMBAR_GL : MEMBAR<"membar.gl;", int_nvvm_membar_gl>;
 def INT_MEMBAR_SYS : MEMBAR<"membar.sys;", int_nvvm_membar_sys>;
 
+def INT_FENCE_SC_CLUSTER:
+       MEMBAR<"fence.sc.cluster;", int_nvvm_fence_sc_cluster>,
+       Requires<[hasPTX<78>, hasSM<90>]>;
 
 //-----------------------------------
 // Async Copy Functions
diff --git a/llvm/test/CodeGen/NVPTX/intrinsics-sm90.ll b/llvm/test/CodeGen/NVPTX/intrinsics-sm90.ll
--- a/llvm/test/CodeGen/NVPTX/intrinsics-sm90.ll
+++ b/llvm/test/CodeGen/NVPTX/intrinsics-sm90.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78| FileCheck --check-prefixes=CHECK %s
-; RUN: %if ptxas-11.8 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78| %ptxas-verify -arch=sm_90 %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK %s
+; RUN: %if ptxas-11.8 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %}
 
 ; CHECK-LABEL: test_isspacep
 define i1 @test_isspacep_shared_cluster(ptr %p) {
@@ -120,6 +120,19 @@
   ret i1 %x
 }
 
+; CHECK-LABEL: test_barrier_cluster(
+define void @test_barrier_cluster() {
+; CHECK: barrier.cluster.arrive;
+  call void @llvm.nvvm.barrier.cluster.arrive()
+; CHECK: barrier.cluster.arrive.relaxed;
+  call void @llvm.nvvm.barrier.cluster.arrive.relaxed()
+; CHECK: barrier.cluster.wait;
+  call void @llvm.nvvm.barrier.cluster.wait()
+; CHECK: fence.sc.cluster
+  call void @llvm.nvvm.fence.sc.cluster()
+  ret void
+}
+
 declare i1 @llvm.nvvm.isspacep.shared.cluster(ptr %p);
 
 declare ptr @llvm.nvvm.mapa(ptr %p, i32 %r);
@@ -137,3 +150,7 @@
 declare i32 @llvm.nvvm.read.ptx.sreg.cluster.ctarank()
 declare i32 @llvm.nvvm.read.ptx.sreg.cluster.nctarank()
 declare i1 @llvm.nvvm.is_explicit_cluster()
+declare void @llvm.nvvm.barrier.cluster.arrive()
+declare void @llvm.nvvm.barrier.cluster.arrive.relaxed()
+declare void @llvm.nvvm.barrier.cluster.wait()
+declare void @llvm.nvvm.fence.sc.cluster()
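
For context, a minimal usage sketch of the new builtins (hypothetical, not part of this patch), assuming a clang CUDA compile for sm_90 with PTX 8.0 and a kernel launched as a thread-block cluster: barrier.cluster is split into a non-blocking arrive and a blocking wait, so independent work can overlap between the two, while __nvvm_fence_sc_cluster() provides a cluster-scope sequentially consistent fence.

// Hypothetical sm_90 CUDA kernel, illustration only; names and launch setup
// are assumptions, not part of this patch.
__global__ void cluster_producer(float *out, const float *in) {
  __shared__ float buf[1024];   // assumes blockDim.x <= 1024
  unsigned i = blockIdx.x * blockDim.x + threadIdx.x;

  // Each CTA fills its own shared-memory tile.
  buf[threadIdx.x] = in[i];

  // Cluster-scope sequentially consistent fence; shown for illustration
  // (the plain arrive below already carries release semantics in PTX).
  __nvvm_fence_sc_cluster();

  // Non-blocking: signal this thread's arrival at the cluster barrier.
  // __nvvm_barrier_cluster_arrive_relaxed() is the same arrival without
  // the implicit release ordering.
  __nvvm_barrier_cluster_arrive();

  // Work that does not touch peer CTAs' shared memory may overlap here.

  // Blocking: wait until all CTAs in the cluster have arrived; after this,
  // data they published before their arrive is safe to read.
  __nvvm_barrier_cluster_wait();

  out[i] = buf[threadIdx.x];
}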