diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -1569,25 +1569,29 @@ // isspacep.{const, global, local, shared} def int_nvvm_isspacep_const - : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty], + : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty], [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>], "llvm.nvvm.isspacep.const">, ClangBuiltin<"__nvvm_isspacep_const">; def int_nvvm_isspacep_global - : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty], + : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty], [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>], "llvm.nvvm.isspacep.global">, ClangBuiltin<"__nvvm_isspacep_global">; def int_nvvm_isspacep_local - : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty], + : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty], [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>], "llvm.nvvm.isspacep.local">, ClangBuiltin<"__nvvm_isspacep_local">; def int_nvvm_isspacep_shared - : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty], + : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty], [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>], "llvm.nvvm.isspacep.shared">, ClangBuiltin<"__nvvm_isspacep_shared">; +def int_nvvm_isspacep_shared_cluster + : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty], + [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>], + "llvm.nvvm.isspacep.shared.cluster">; // Environment register read def int_nvvm_read_ptx_sreg_envreg0 @@ -4341,30 +4345,29 @@ // Accessing special registers. + +class PTXReadSRegIntrinsicNB_r32 + : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable, NoUndef<RetIndex>]>; +class PTXReadSRegIntrinsic_r32<string name> + : PTXReadSRegIntrinsicNB_r32, ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>; + multiclass PTXReadSRegIntrinsic_v4i32<string regname> { // FIXME: Do we need the 128-bit integer type version?
// def _r64 : Intrinsic<[llvm_i128_ty], [], [IntrNoMem, IntrSpeculatable]>; // FIXME: Enable this once v4i32 support is enabled in back-end. // def _v4i16 : Intrinsic<[llvm_v4i32_ty], [], [IntrNoMem, IntrSpeculatable]>; + foreach suffix = ["_x", "_y", "_z", "_w"] in + def suffix : PTXReadSRegIntrinsic_r32<regname # suffix>; +} - def _x : DefaultAttrsIntrinsic<[llvm_i32_ty], [], - [IntrNoMem, IntrSpeculatable, NoUndef<RetIndex>]>, - ClangBuiltin<"__nvvm_read_ptx_sreg_" # regname # "_x">; - def _y : DefaultAttrsIntrinsic<[llvm_i32_ty], [], - [IntrNoMem, IntrSpeculatable, NoUndef<RetIndex>]>, - ClangBuiltin<"__nvvm_read_ptx_sreg_" # regname # "_y">; - def _z : DefaultAttrsIntrinsic<[llvm_i32_ty], [], - [IntrNoMem, IntrSpeculatable, NoUndef<RetIndex>]>, - ClangBuiltin<"__nvvm_read_ptx_sreg_" # regname # "_z">; - def _w : DefaultAttrsIntrinsic<[llvm_i32_ty], [], - [IntrNoMem, IntrSpeculatable, NoUndef<RetIndex>]>, - ClangBuiltin<"__nvvm_read_ptx_sreg_" # regname # "_w">; +// Same, but without automatic clang builtins. It will be used for +// registers that require particular GPU or PTX version.
+multiclass PTXReadSRegIntrinsicNB_v4i32 { + foreach suffix = ["_x", "_y", "_z", "_w"] in + def suffix : PTXReadSRegIntrinsicNB_r32; } -class PTXReadSRegIntrinsic_r32<string name> - : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable, NoUndef<RetIndex>]>, - ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>; class PTXReadSRegIntrinsic_r64<string name> : DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable, NoUndef<RetIndex>]>, ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>; @@ -4413,6 +4416,15 @@ def int_nvvm_read_ptx_sreg_warpsize : PTXReadSRegIntrinsic_r32<"warpsize">; +// sm90+, PTX7.8+ +defm int_nvvm_read_ptx_sreg_clusterid : PTXReadSRegIntrinsicNB_v4i32; +defm int_nvvm_read_ptx_sreg_nclusterid : PTXReadSRegIntrinsicNB_v4i32; +defm int_nvvm_read_ptx_sreg_cluster_ctaid : PTXReadSRegIntrinsicNB_v4i32; +defm int_nvvm_read_ptx_sreg_cluster_nctaid : PTXReadSRegIntrinsicNB_v4i32; + +def int_nvvm_read_ptx_sreg_cluster_ctarank : PTXReadSRegIntrinsicNB_r32; +def int_nvvm_read_ptx_sreg_cluster_nctarank : PTXReadSRegIntrinsicNB_r32; + // // SHUFFLE // @@ -4661,4 +4673,25 @@ } } +def int_nvvm_mapa + : DefaultAttrsIntrinsic<[llvm_ptr_ty], [llvm_ptr_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>], + "llvm.nvvm.mapa">; +def int_nvvm_mapa_shared_cluster + : DefaultAttrsIntrinsic<[llvm_shared_i8ptr_ty], [llvm_shared_i8ptr_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>], + "llvm.nvvm.mapa.shared.cluster">; +def int_nvvm_getctarank + : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_ptr_ty], + [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>], + "llvm.nvvm.getctarank">; +def int_nvvm_getctarank_shared_cluster + : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_shared_i8ptr_ty], + [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>], + "llvm.nvvm.getctarank.shared.cluster">; +def int_nvvm_is_explicit_cluster + : DefaultAttrsIntrinsic<[llvm_i1_ty], [], + [IntrNoMem, IntrSpeculatable, NoUndef<RetIndex>], + "llvm.nvvm.is_explicit_cluster">; + } // let TargetPrefix = "nvvm" diff --git
a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -2480,41 +2480,24 @@ // isspacep -def ISSPACEP_CONST_32 - : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a), - "isspacep.const \t$d, $a;", - [(set Int1Regs:$d, (int_nvvm_isspacep_const Int32Regs:$a))]>, - Requires<[hasPTX<31>]>; -def ISSPACEP_CONST_64 - : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), - "isspacep.const \t$d, $a;", - [(set Int1Regs:$d, (int_nvvm_isspacep_const Int64Regs:$a))]>, - Requires<[hasPTX<31>]>; -def ISSPACEP_GLOBAL_32 - : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a), - "isspacep.global \t$d, $a;", - [(set Int1Regs:$d, (int_nvvm_isspacep_global Int32Regs:$a))]>; -def ISSPACEP_GLOBAL_64 - : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), - "isspacep.global \t$d, $a;", - [(set Int1Regs:$d, (int_nvvm_isspacep_global Int64Regs:$a))]>; -def ISSPACEP_LOCAL_32 - : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a), - "isspacep.local \t$d, $a;", - [(set Int1Regs:$d, (int_nvvm_isspacep_local Int32Regs:$a))]>; -def ISSPACEP_LOCAL_64 - : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), - "isspacep.local \t$d, $a;", - [(set Int1Regs:$d, (int_nvvm_isspacep_local Int64Regs:$a))]>; -def ISSPACEP_SHARED_32 - : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a), - "isspacep.shared \t$d, $a;", - [(set Int1Regs:$d, (int_nvvm_isspacep_shared Int32Regs:$a))]>; -def ISSPACEP_SHARED_64 - : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), - "isspacep.shared \t$d, $a;", - [(set Int1Regs:$d, (int_nvvm_isspacep_shared Int64Regs:$a))]>; - +multiclass ISSPACEP<string suffix, Intrinsic Intr, list<Predicate> Preds = []> { + def _32: NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a), + "isspacep." # suffix # "\t$d, $a;", + [(set Int1Regs:$d, (Intr Int32Regs:$a))]>, + Requires<Preds>; + def _64: NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), + "isspacep."
# suffix # "\t$d, $a;", + [(set Int1Regs:$d, (Intr Int64Regs:$a))]>, + Requires<Preds>; +} + +defm isspace_const : ISSPACEP<"const", int_nvvm_isspacep_const, [hasPTX<31>]>; +defm isspace_global : ISSPACEP<"global", int_nvvm_isspacep_global>; +defm isspace_local : ISSPACEP<"local", int_nvvm_isspacep_local>; +defm isspace_shared : ISSPACEP<"shared", int_nvvm_isspacep_shared>; +defm isspace_shared_cluster : ISSPACEP<"shared::cluster", + int_nvvm_isspacep_shared_cluster, + [hasPTX<78>, hasSM<90>]>; // Special register reads def MOV_SPECIAL : NVPTXInst<(outs Int32Regs:$d), @@ -6213,35 +6196,51 @@ // Read Special Registers //----------------------------------- -class PTX_READ_SREG_R64<string regname, Intrinsic intop> +class PTX_READ_SREG_R64<string regname, Intrinsic intop, list<Predicate> Preds=[]> : NVPTXInst<(outs Int64Regs:$d), (ins), !strconcat("mov.u64 \t$d, %", regname, ";"), - [(set Int64Regs:$d, (intop))]>; + [(set Int64Regs:$d, (intop))]>, + Requires<Preds>; -class PTX_READ_SREG_R32<string regname, Intrinsic intop> +class PTX_READ_SREG_R32<string regname, Intrinsic intop, list<Predicate> Preds=[]> : NVPTXInst<(outs Int32Regs:$d), (ins), !strconcat("mov.u32 \t$d, %", regname, ";"), - [(set Int32Regs:$d, (intop))]>; + [(set Int32Regs:$d, (intop))]>, + Requires<Preds>; + +multiclass PTX_READ_SREG_R32V4<string regname, list<Predicate> Preds=[]> { + foreach suffix = ["x", "y", "z", "w"] in { + defvar reg = regname # "."
# suffix; + defvar intr = !cast<Intrinsic>("int_nvvm_read_ptx_sreg_" # regname # "_" # suffix); + def "_"#suffix : PTX_READ_SREG_R32<reg, intr, Preds>; + } +} // TODO Add read vector-version of special registers -def INT_PTX_SREG_TID_X : - PTX_READ_SREG_R32<"tid.x", int_nvvm_read_ptx_sreg_tid_x>; -def INT_PTX_SREG_TID_Y : - PTX_READ_SREG_R32<"tid.y", int_nvvm_read_ptx_sreg_tid_y>; -def INT_PTX_SREG_TID_Z : - PTX_READ_SREG_R32<"tid.z", int_nvvm_read_ptx_sreg_tid_z>; -def INT_PTX_SREG_TID_W : - PTX_READ_SREG_R32<"tid.w", int_nvvm_read_ptx_sreg_tid_w>; - -def INT_PTX_SREG_NTID_X : - PTX_READ_SREG_R32<"ntid.x", int_nvvm_read_ptx_sreg_ntid_x>; -def INT_PTX_SREG_NTID_Y : - PTX_READ_SREG_R32<"ntid.y", int_nvvm_read_ptx_sreg_ntid_y>; -def INT_PTX_SREG_NTID_Z : - PTX_READ_SREG_R32<"ntid.z", int_nvvm_read_ptx_sreg_ntid_z>; -def INT_PTX_SREG_NTID_W : - PTX_READ_SREG_R32<"ntid.w", int_nvvm_read_ptx_sreg_ntid_w>; +defm INT_PTX_SREG_TID : PTX_READ_SREG_R32V4<"tid">; +defm INT_PTX_SREG_NTID : PTX_READ_SREG_R32V4<"ntid">; +defm INT_PTX_SREG_CTAID : PTX_READ_SREG_R32V4<"ctaid">; +defm INT_PTX_SREG_NCTAID: PTX_READ_SREG_R32V4<"nctaid">; + +defm INT_PTX_SREG_CLUSTERID : + PTX_READ_SREG_R32V4<"clusterid", [hasSM<90>, hasPTX<78>]>; +defm INT_PTX_SREG_NCLUSTERID : + PTX_READ_SREG_R32V4<"nclusterid", [hasSM<90>, hasPTX<78>]>; +defm INT_PTX_SREG_CLUSTER_CTAID : + PTX_READ_SREG_R32V4<"cluster_ctaid", [hasSM<90>, hasPTX<78>]>; +defm INT_PTX_SREG_CLUSTER_NCTAID: + PTX_READ_SREG_R32V4<"cluster_nctaid", [hasSM<90>, hasPTX<78>]>; + +def INT_PTX_SREG_CLUSTER_CTARANK : + PTX_READ_SREG_R32<"cluster_ctarank", + int_nvvm_read_ptx_sreg_cluster_ctarank, + [hasSM<90>, hasPTX<78>]>; +def INT_PTX_SREG_CLUSTER_NCTARANK: + PTX_READ_SREG_R32<"cluster_nctarank", + int_nvvm_read_ptx_sreg_cluster_nctarank, + [hasSM<90>, hasPTX<78>]>; + def INT_PTX_SREG_LANEID : PTX_READ_SREG_R32<"laneid", int_nvvm_read_ptx_sreg_laneid>; @@ -6249,25 +6248,6 @@ PTX_READ_SREG_R32<"warpid", int_nvvm_read_ptx_sreg_warpid>; def INT_PTX_SREG_NWARPID :
PTX_READ_SREG_R32<"nwarpid", int_nvvm_read_ptx_sreg_nwarpid>; - -def INT_PTX_SREG_CTAID_X : - PTX_READ_SREG_R32<"ctaid.x", int_nvvm_read_ptx_sreg_ctaid_x>; -def INT_PTX_SREG_CTAID_Y : - PTX_READ_SREG_R32<"ctaid.y", int_nvvm_read_ptx_sreg_ctaid_y>; -def INT_PTX_SREG_CTAID_Z : - PTX_READ_SREG_R32<"ctaid.z", int_nvvm_read_ptx_sreg_ctaid_z>; -def INT_PTX_SREG_CTAID_W : - PTX_READ_SREG_R32<"ctaid.w", int_nvvm_read_ptx_sreg_ctaid_w>; - -def INT_PTX_SREG_NCTAID_X : - PTX_READ_SREG_R32<"nctaid.x", int_nvvm_read_ptx_sreg_nctaid_x>; -def INT_PTX_SREG_NCTAID_Y : - PTX_READ_SREG_R32<"nctaid.y", int_nvvm_read_ptx_sreg_nctaid_y>; -def INT_PTX_SREG_NCTAID_Z : - PTX_READ_SREG_R32<"nctaid.z", int_nvvm_read_ptx_sreg_nctaid_z>; -def INT_PTX_SREG_NCTAID_W : - PTX_READ_SREG_R32<"nctaid.w", int_nvvm_read_ptx_sreg_nctaid_w>; - def INT_PTX_SREG_SMID : PTX_READ_SREG_R32<"smid", int_nvvm_read_ptx_sreg_smid>; def INT_PTX_SREG_NSMID : @@ -6704,3 +6684,45 @@ // Build intrinsic->instruction patterns for all MMA instructions. 
foreach mma = !listconcat(MMAs, WMMAs, MMA_LDSTs, LDMATRIXs) in def : MMA_PAT<mma>; + +multiclass MAPA<string suffix, Intrinsic Intr> { + def _32: NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a, Int32Regs:$b), + "mapa" # suffix # ".u32\t$d, $a, $b;", + [(set Int32Regs:$d, (Intr Int32Regs:$a, Int32Regs:$b))]>, + Requires<[hasSM<90>, hasPTX<78>]>; + def _32i: NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a, i32imm:$b), + "mapa" # suffix # ".u32\t$d, $a, $b;", + [(set Int32Regs:$d, (Intr Int32Regs:$a, imm:$b))]>, + Requires<[hasSM<90>, hasPTX<78>]>; + def _64: NVPTXInst<(outs Int64Regs:$d), (ins Int64Regs:$a, Int32Regs:$b), + "mapa" # suffix # ".u64\t$d, $a, $b;", + [(set Int64Regs:$d, (Intr Int64Regs:$a, Int32Regs:$b))]>, + Requires<[hasSM<90>, hasPTX<78>]>; + def _64i: NVPTXInst<(outs Int64Regs:$d), (ins Int64Regs:$a, i32imm:$b), + "mapa" # suffix # ".u64\t$d, $a, $b;", + [(set Int64Regs:$d, (Intr Int64Regs:$a, imm:$b))]>, + Requires<[hasSM<90>, hasPTX<78>]>; +} + +defm mapa : MAPA<"", int_nvvm_mapa>; +defm mapa_shared_cluster : MAPA<".shared::cluster", int_nvvm_mapa_shared_cluster>; + + +multiclass GETCTARANK<string suffix, Intrinsic Intr> { + def _32: NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a), + "getctarank" # suffix # ".u32\t$d, $a;", + [(set Int32Regs:$d, (Intr Int32Regs:$a))]>, + Requires<[hasSM<90>, hasPTX<78>]>; + def _64: NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "getctarank" # suffix # ".u64\t$d, $a;", + [(set Int32Regs:$d, (Intr Int64Regs:$a))]>, + Requires<[hasSM<90>, hasPTX<78>]>; +} + +defm getctarank : GETCTARANK<"", int_nvvm_getctarank>; +defm getctarank_shared_cluster : GETCTARANK<".shared::cluster", int_nvvm_getctarank_shared_cluster>; + +def is_explicit_cluster: NVPTXInst<(outs Int1Regs:$d), (ins), + "mov.pred\t$d, %is_explicit_cluster;", + [(set Int1Regs:$d, (int_nvvm_is_explicit_cluster))]>, + Requires<[hasSM<90>, hasPTX<78>]>; diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++
b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -291,6 +291,7 @@ case Intrinsic::nvvm_isspacep_local: return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_LOCAL); case Intrinsic::nvvm_isspacep_shared: + case Intrinsic::nvvm_isspacep_shared_cluster: return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_SHARED); default: break; diff --git a/llvm/test/CodeGen/NVPTX/intrinsics-sm90.ll b/llvm/test/CodeGen/NVPTX/intrinsics-sm90.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/intrinsics-sm90.ll @@ -0,0 +1,139 @@ +; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78| FileCheck --check-prefixes=CHECK,CHECK64 %s +; RUN: %if ptxas-11.8 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78| %ptxas-verify -arch=sm_90 %} + +; CHECK-LABEL: test_isspacep +define i1 @test_isspacep_shared_cluster(ptr %p) { +; CHECK: isspacep.shared::cluster + %a = tail call i1 @llvm.nvvm.isspacep.shared.cluster(ptr %p) +; CHECK: ret + ret i1 %a +} + +; CHECK-LABEL: test_mapa( +define ptr @test_mapa(ptr %p, i32 %r) { +; CHECK64: mapa.u64 + %a = call ptr @llvm.nvvm.mapa(ptr %p, i32 %r) + ret ptr %a +} + +; CHECK-LABEL: test_mapa_shared_cluster( +define ptr addrspace(3) @test_mapa_shared_cluster(ptr addrspace(3) %p, i32 %r) { +; CHECK: mapa.shared::cluster.u64 + %a = call ptr addrspace(3) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %p, i32 %r) + ret ptr addrspace(3) %a +} + +; CHECK-LABEL: test_getctarank( +define i32 @test_getctarank(ptr %p) { +; CHECK: getctarank.u64 + %a = call i32 @llvm.nvvm.getctarank(ptr %p) + ret i32 %a +} + +; CHECK-LABEL: test_getctarank_shared_cluster( +define i32 @test_getctarank_shared_cluster(ptr addrspace(3) %p) { +; CHECK64: getctarank.shared::cluster.u64 +; CHECK32: getctarank.shared::cluster.u32 + %a = call i32 @llvm.nvvm.getctarank.shared.cluster(ptr addrspace(3) %p) + ret i32 %a +} + +; CHECK-LABEL: test_clusterid_x( +define i32 @test_clusterid_x() { +; CHECK: mov.u32 %r{{[0-9]+}}, %clusterid.x; +; CHECK: ret; + %x = call i32
@llvm.nvvm.read.ptx.sreg.clusterid.x() + ret i32 %x +} +; CHECK-LABEL: test_clusterid_y( +define i32 @test_clusterid_y() { +; CHECK: mov.u32 %r{{[0-9]+}}, %clusterid.y; +; CHECK: ret; + %x = call i32 @llvm.nvvm.read.ptx.sreg.clusterid.y() + ret i32 %x +} +; CHECK-LABEL: test_clusterid_z( +define i32 @test_clusterid_z() { +; CHECK: mov.u32 %r{{[0-9]+}}, %clusterid.z; +; CHECK: ret; + %x = call i32 @llvm.nvvm.read.ptx.sreg.clusterid.z() + ret i32 %x +} +; CHECK-LABEL: test_clusterid_w( +define i32 @test_clusterid_w() { +; CHECK: mov.u32 %r{{[0-9]+}}, %clusterid.w; +; CHECK: ret; + %x = call i32 @llvm.nvvm.read.ptx.sreg.clusterid.w() + ret i32 %x +} + +; CHECK-LABEL: test_nclusterid_x( +define i32 @test_nclusterid_x() { +; CHECK: mov.u32 %r{{[0-9]+}}, %nclusterid.x; +; CHECK: ret; + %x = call i32 @llvm.nvvm.read.ptx.sreg.nclusterid.x() + ret i32 %x +} +; CHECK-LABEL: test_nclusterid_y( +define i32 @test_nclusterid_y() { +; CHECK: mov.u32 %r{{[0-9]+}}, %nclusterid.y; +; CHECK: ret; + %x = call i32 @llvm.nvvm.read.ptx.sreg.nclusterid.y() + ret i32 %x +} +; CHECK-LABEL: test_nclusterid_z( +define i32 @test_nclusterid_z() { +; CHECK: mov.u32 %r{{[0-9]+}}, %nclusterid.z; +; CHECK: ret; + %x = call i32 @llvm.nvvm.read.ptx.sreg.nclusterid.z() + ret i32 %x +} +; CHECK-LABEL: test_nclusterid_w( +define i32 @test_nclusterid_w() { +; CHECK: mov.u32 %r{{[0-9]+}}, %nclusterid.w; +; CHECK: ret; + %x = call i32 @llvm.nvvm.read.ptx.sreg.nclusterid.w() + ret i32 %x +} + +; CHECK-LABEL: test_cluster_ctarank( +define i32 @test_cluster_ctarank() { +; CHECK: mov.u32 %r{{[0-9]+}}, %cluster_ctarank; +; CHECK: ret; + %x = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctarank() + ret i32 %x +} + +; CHECK-LABEL: test_cluster_nctarank( +define i32 @test_cluster_nctarank() { +; CHECK: mov.u32 %r{{[0-9]+}}, %cluster_nctarank; +; CHECK: ret; + %x = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctarank() + ret i32 %x +} + +; CHECK-LABEL: test_is_explicit_cluster( +define i1 @test_is_explicit_cluster() { 
+; CHECK: mov.pred %p{{[0-9]+}}, %is_explicit_cluster; +; CHECK: ret; + %x = call i1 @llvm.nvvm.is_explicit_cluster() + ret i1 %x +} + + +declare i1 @llvm.nvvm.isspacep.shared.cluster(ptr %p); +declare ptr @llvm.nvvm.mapa(ptr %p, i32 %r); +declare ptr addrspace(3) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %p, i32 %r); +declare i32 @llvm.nvvm.getctarank(ptr %p); +declare i32 @llvm.nvvm.getctarank.shared.cluster(ptr addrspace(3) %p); +declare i32 @llvm.nvvm.read.ptx.sreg.clusterid.x() +declare i32 @llvm.nvvm.read.ptx.sreg.clusterid.y() +declare i32 @llvm.nvvm.read.ptx.sreg.clusterid.z() +declare i32 @llvm.nvvm.read.ptx.sreg.clusterid.w() +declare i32 @llvm.nvvm.read.ptx.sreg.nclusterid.x() +declare i32 @llvm.nvvm.read.ptx.sreg.nclusterid.y() +declare i32 @llvm.nvvm.read.ptx.sreg.nclusterid.z() +declare i32 @llvm.nvvm.read.ptx.sreg.nclusterid.w() +declare i32 @llvm.nvvm.read.ptx.sreg.cluster.ctarank() +declare i32 @llvm.nvvm.read.ptx.sreg.cluster.nctarank() +declare i1 @llvm.nvvm.is_explicit_cluster()