Index: clang/include/clang/Basic/BuiltinsNVPTX.def
===================================================================
--- clang/include/clang/Basic/BuiltinsNVPTX.def
+++ clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -465,6 +465,29 @@
 BUILTIN(__nvvm_membar_gl, "v", "")
 BUILTIN(__nvvm_membar_sys, "v", "")
 
+// mbarrier
+
+TARGET_BUILTIN(__nvvm_mbarrier_init_b64, "vWi*i", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_mbarrier_init_shared_b64, "vWi*3i", "", AND(SM_80,PTX70))
+
+TARGET_BUILTIN(__nvvm_mbarrier_inval_b64, "vWi*", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_mbarrier_inval_shared_b64, "vWi*3", "", AND(SM_80,PTX70))
+
+TARGET_BUILTIN(__nvvm_mbarrier_arrive_b64, "WiWi*", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_mbarrier_arrive_shared_b64, "WiWi*3", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_mbarrier_arrive_noComplete_b64, "WiWi*i", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_mbarrier_arrive_noComplete_shared_b64, "WiWi*3i", "", AND(SM_80,PTX70))
+
+TARGET_BUILTIN(__nvvm_mbarrier_arrive_drop_b64, "WiWi*", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_mbarrier_arrive_drop_shared_b64, "WiWi*3", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_mbarrier_arrive_drop_noComplete_b64, "WiWi*i", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_mbarrier_arrive_drop_noComplete_shared_b64, "WiWi*3i", "", AND(SM_80,PTX70))
+
+TARGET_BUILTIN(__nvvm_mbarrier_test_wait_b64, "bWi*Wi", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_mbarrier_test_wait_shared_b64, "bWi*3Wi", "", AND(SM_80,PTX70))
+
+TARGET_BUILTIN(__nvvm_mbarrier_pending_count_b64, "iWi", "", AND(SM_80,PTX70))
+
 // Memcpy, Memset
 
 BUILTIN(__nvvm_memcpy, "vUc*Uc*zi","")
@@ -729,6 +752,21 @@
 TARGET_BUILTIN(__imma_m8n8k32_mma_u4, "vi*iC*iC*iC*IiIi", "", AND(SM_75,PTX63))
 TARGET_BUILTIN(__imma_m8n8k32_st_c_i32, "vi*iC*UiIi", "", AND(SM_75,PTX63))
 
+// Async Copy
+TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive_b64, "vWi*", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive_shared_b64, "vWi*3", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive_noinc_b64, "vWi*", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive_noinc_shared_b64, "vWi*3", "", AND(SM_80,PTX70))
+
+TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_4, "vv*3vC*1", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_8, "vv*3vC*1", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_16, "vv*3vC*1", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_cp_async_cg_shared_global_16, "vv*3vC*1", "", AND(SM_80,PTX70))
+
+TARGET_BUILTIN(__nvvm_cp_async_commit_group, "v", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_cp_async_wait_group, "vIi", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_cp_async_wait_all, "v", "", AND(SM_80,PTX70))
+
 #undef BUILTIN
 #undef TARGET_BUILTIN
 #pragma pop_macro("AND")
Index: clang/test/CodeGen/builtins-nvptx.c
===================================================================
--- clang/test/CodeGen/builtins-nvptx.c
+++ clang/test/CodeGen/builtins-nvptx.c
@@ -1,4 +1,10 @@
 // REQUIRES: nvptx-registered-target
+// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
+// RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
+// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP32 %s
+// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
+// RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
+// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP64 %s
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_60 \
 // RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=LP32 %s
@@ -672,3 +678,78 @@
   __nvvm_vote_ballot(pred);
 // CHECK: ret void
 }
+
+__device__ void nvvm_mbarrier(long long* addr, __attribute__((address_space(3))) long long* sharedAddr, int count, long long state) {
+  #if __CUDA_ARCH__ >= 800
+  __nvvm_mbarrier_init_b64(addr, count);
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.init.b64
+  __nvvm_mbarrier_init_shared_b64(sharedAddr, count);
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.init.shared.b64
+
+  __nvvm_mbarrier_inval_b64(addr);
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.inval.b64
+  __nvvm_mbarrier_inval_shared_b64(sharedAddr);
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.inval.shared.b64
+
+  __nvvm_mbarrier_arrive_b64(addr);
+  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.b64
+  __nvvm_mbarrier_arrive_shared_b64(sharedAddr);
+  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.shared.b64
+  __nvvm_mbarrier_arrive_noComplete_b64(addr, count);
+  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.noComplete.b64
+  __nvvm_mbarrier_arrive_noComplete_shared_b64(sharedAddr, count);
+  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.noComplete.shared.b64
+
+  __nvvm_mbarrier_arrive_drop_b64(addr);
+  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.b64
+  __nvvm_mbarrier_arrive_drop_shared_b64(sharedAddr);
+  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.shared.b64
+  __nvvm_mbarrier_arrive_drop_noComplete_b64(addr, count);
+  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete.b64
+  __nvvm_mbarrier_arrive_drop_noComplete_shared_b64(sharedAddr, count);
+  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete.shared.b64
+
+  __nvvm_mbarrier_test_wait_b64(addr, state);
+  // CHECK_PTX70_SM80: call i1 @llvm.nvvm.mbarrier.test.wait.b64
+  __nvvm_mbarrier_test_wait_shared_b64(sharedAddr, state);
+  // CHECK_PTX70_SM80: call i1 @llvm.nvvm.mbarrier.test.wait.shared.b64
+
+  __nvvm_mbarrier_pending_count_b64(state);
+  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.mbarrier.pending.count.b64
+  #endif
+  // CHECK: ret void
+}
+
+__device__ void nvvm_async_copy(__attribute__((address_space(3))) void* dst, __attribute__((address_space(1))) const void* src, long long* addr, __attribute__((address_space(3))) long long* sharedAddr) {
+  #if __CUDA_ARCH__ >= 800
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.b64
+  __nvvm_cp_async_mbarrier_arrive_b64(addr);
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.shared.b64
+  __nvvm_cp_async_mbarrier_arrive_shared_b64(sharedAddr);
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.b64
+  __nvvm_cp_async_mbarrier_arrive_noinc_b64(addr);
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared.b64
+  __nvvm_cp_async_mbarrier_arrive_noinc_shared_b64(sharedAddr);
+
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.4
+  __nvvm_cp_async_ca_shared_global_4(dst, src);
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.8
+  __nvvm_cp_async_ca_shared_global_8(dst, src);
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.16
+  __nvvm_cp_async_ca_shared_global_16(dst, src);
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.cg.shared.global.16
+  __nvvm_cp_async_cg_shared_global_16(dst, src);
+
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.commit.group
+  __nvvm_cp_async_commit_group();
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 0)
+  __nvvm_cp_async_wait_group(0);
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 8)
+  __nvvm_cp_async_wait_group(8);
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 16)
+  __nvvm_cp_async_wait_group(16);
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.all
+  __nvvm_cp_async_wait_all();
+  #endif
+  // CHECK: ret void
+}
\ No newline at end of file
Index: llvm/include/llvm/IR/IntrinsicsNVVM.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -31,7 +31,12 @@
 // * llvm.nvvm.max.ull            --> ibid.
 // * llvm.nvvm.h2f                --> llvm.convert.to.fp16.f32
 
+def llvm_i8ptr_ty : LLVMPointerType<llvm_i8_ty>;                // i8*
+def llvm_globali8ptr_ty : LLVMQualPointerType<llvm_i8_ty, 1>;   // (global)i8*
+def llvm_sharedi8ptr_ty : LLVMQualPointerType<llvm_i8_ty, 3>;   // (shared)i8*
+def llvm_i64ptr_ty : LLVMPointerType<llvm_i64_ty>;              // i64*
 def llvm_anyi64ptr_ty : LLVMAnyPointerType<llvm_i64_ty>;        // (space)i64*
+def llvm_sharedi64ptr_ty : LLVMQualPointerType<llvm_i64_ty, 3>; // (shared)i64*
 
 //
 // MISC
@@ -1052,6 +1057,106 @@
 def int_nvvm_membar_sys : GCCBuiltin<"__nvvm_membar_sys">,
     Intrinsic<[], [], []>;
 
+// Async Copy
+def int_nvvm_cp_async_mbarrier_arrive_b64 :
+    GCCBuiltin<"__nvvm_cp_async_mbarrier_arrive_b64">,
+    Intrinsic<[],[llvm_i64ptr_ty],[IntrConvergent]>;
+def int_nvvm_cp_async_mbarrier_arrive_shared_b64 :
+    GCCBuiltin<"__nvvm_cp_async_mbarrier_arrive_shared_b64">,
+    Intrinsic<[],[llvm_sharedi64ptr_ty],[IntrConvergent]>;
+def int_nvvm_cp_async_mbarrier_arrive_noinc_b64 :
+    GCCBuiltin<"__nvvm_cp_async_mbarrier_arrive_noinc_b64">,
+    Intrinsic<[],[llvm_i64ptr_ty],[IntrConvergent]>;
+def int_nvvm_cp_async_mbarrier_arrive_noinc_shared_b64 :
+    GCCBuiltin<"__nvvm_cp_async_mbarrier_arrive_noinc_shared_b64">,
+    Intrinsic<[],[llvm_sharedi64ptr_ty],[IntrConvergent]>;
+
+def int_nvvm_cp_async_ca_shared_global_4 :
+    GCCBuiltin<"__nvvm_cp_async_ca_shared_global_4">,
+    Intrinsic<[],[llvm_sharedi8ptr_ty, llvm_globali8ptr_ty],
+              [IntrArgMemOnly, NoAlias<ArgIndex<0>>, NoAlias<ArgIndex<1>>,
+               WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>],
+              "llvm.nvvm.cp.async.ca.shared.global.4">;
+def int_nvvm_cp_async_ca_shared_global_8 :
+    GCCBuiltin<"__nvvm_cp_async_ca_shared_global_8">,
+    Intrinsic<[],[llvm_sharedi8ptr_ty, llvm_globali8ptr_ty],
+              [IntrArgMemOnly, NoAlias<ArgIndex<0>>, NoAlias<ArgIndex<1>>,
+               WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>],
+              "llvm.nvvm.cp.async.ca.shared.global.8">;
+def int_nvvm_cp_async_ca_shared_global_16 :
+    GCCBuiltin<"__nvvm_cp_async_ca_shared_global_16">,
+    Intrinsic<[],[llvm_sharedi8ptr_ty, llvm_globali8ptr_ty],
+              [IntrArgMemOnly, NoAlias<ArgIndex<0>>, NoAlias<ArgIndex<1>>,
+               WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>],
+              "llvm.nvvm.cp.async.ca.shared.global.16">;
+def int_nvvm_cp_async_cg_shared_global_16 :
+    GCCBuiltin<"__nvvm_cp_async_cg_shared_global_16">,
+    Intrinsic<[],[llvm_sharedi8ptr_ty, llvm_globali8ptr_ty],
+              [IntrArgMemOnly, NoAlias<ArgIndex<0>>, NoAlias<ArgIndex<1>>,
+               WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>],
+              "llvm.nvvm.cp.async.cg.shared.global.16">;
+
+def int_nvvm_cp_async_commit_group :
+    GCCBuiltin<"__nvvm_cp_async_commit_group">,
+    Intrinsic<[],[],[]>;
+
+def int_nvvm_cp_async_wait_group :
+    GCCBuiltin<"__nvvm_cp_async_wait_group">,
+    Intrinsic<[],[llvm_i32_ty],[ImmArg<ArgIndex<0>>]>;
+
+def int_nvvm_cp_async_wait_all :
+    GCCBuiltin<"__nvvm_cp_async_wait_all">,
+    Intrinsic<[],[],[]>;
+
+// mbarrier
+def int_nvvm_mbarrier_init_b64 : GCCBuiltin<"__nvvm_mbarrier_init_b64">,
+    Intrinsic<[],[llvm_i64ptr_ty, llvm_i32_ty],[IntrConvergent]>;
+def int_nvvm_mbarrier_init_shared_b64 :
+    GCCBuiltin<"__nvvm_mbarrier_init_shared_b64">,
+    Intrinsic<[],[llvm_sharedi64ptr_ty, llvm_i32_ty],[IntrConvergent]>;
+
+def int_nvvm_mbarrier_inval_b64 : GCCBuiltin<"__nvvm_mbarrier_inval_b64">,
+    Intrinsic<[],[llvm_i64ptr_ty],[IntrConvergent]>;
+def int_nvvm_mbarrier_inval_shared_b64 :
+    GCCBuiltin<"__nvvm_mbarrier_inval_shared_b64">,
+    Intrinsic<[],[llvm_sharedi64ptr_ty],[IntrConvergent]>;
+
+def int_nvvm_mbarrier_arrive_b64 : GCCBuiltin<"__nvvm_mbarrier_arrive_b64">,
+    Intrinsic<[llvm_i64_ty],[llvm_i64ptr_ty],[IntrConvergent]>;
+def int_nvvm_mbarrier_arrive_shared_b64 :
+    GCCBuiltin<"__nvvm_mbarrier_arrive_shared_b64">,
+    Intrinsic<[llvm_i64_ty],[llvm_sharedi64ptr_ty],[IntrConvergent]>;
+def int_nvvm_mbarrier_arrive_noComplete_b64 :
+    GCCBuiltin<"__nvvm_mbarrier_arrive_noComplete_b64">,
+    Intrinsic<[llvm_i64_ty],[llvm_i64ptr_ty, llvm_i32_ty],[IntrConvergent]>;
+def int_nvvm_mbarrier_arrive_noComplete_shared_b64 :
+    GCCBuiltin<"__nvvm_mbarrier_arrive_noComplete_shared_b64">,
+    Intrinsic<[llvm_i64_ty],[llvm_sharedi64ptr_ty, llvm_i32_ty],[IntrConvergent]>;
+
+def int_nvvm_mbarrier_arrive_drop_b64 :
+    GCCBuiltin<"__nvvm_mbarrier_arrive_drop_b64">,
+    Intrinsic<[llvm_i64_ty],[llvm_i64ptr_ty],[IntrConvergent]>;
+def int_nvvm_mbarrier_arrive_drop_shared_b64 :
+    GCCBuiltin<"__nvvm_mbarrier_arrive_drop_shared_b64">,
+    Intrinsic<[llvm_i64_ty],[llvm_sharedi64ptr_ty],[IntrConvergent]>;
+def int_nvvm_mbarrier_arrive_drop_noComplete_b64 :
+    GCCBuiltin<"__nvvm_mbarrier_arrive_drop_noComplete_b64">,
+    Intrinsic<[llvm_i64_ty],[llvm_i64ptr_ty, llvm_i32_ty],[IntrConvergent]>;
+def int_nvvm_mbarrier_arrive_drop_noComplete_shared_b64 :
+    GCCBuiltin<"__nvvm_mbarrier_arrive_drop_noComplete_shared_b64">,
+    Intrinsic<[llvm_i64_ty],[llvm_sharedi64ptr_ty, llvm_i32_ty],[IntrConvergent]>;
+
+def int_nvvm_mbarrier_test_wait_b64 :
+    GCCBuiltin<"__nvvm_mbarrier_test_wait_b64">,
+    Intrinsic<[llvm_i1_ty],[llvm_i64ptr_ty, llvm_i64_ty],[IntrConvergent]>;
+def int_nvvm_mbarrier_test_wait_shared_b64 :
+    GCCBuiltin<"__nvvm_mbarrier_test_wait_shared_b64">,
+    Intrinsic<[llvm_i1_ty],[llvm_sharedi64ptr_ty, llvm_i64_ty],[IntrConvergent]>;
+
+def int_nvvm_mbarrier_pending_count_b64 :
+    GCCBuiltin<"__nvvm_mbarrier_pending_count_b64">,
+    Intrinsic<[llvm_i32_ty],[llvm_i64_ty],[IntrConvergent]>;
+
 // Generated within nvvm. Use for ldu on sm_20 or later. Second arg is the
 // pointer's alignment.
 def int_nvvm_ldu_global_i : Intrinsic<[llvm_anyint_ty],
Index: llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
===================================================================
--- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -144,11 +144,13 @@
 def hasPTX61 : Predicate<"Subtarget->getPTXVersion() >= 61">;
 def hasPTX63 : Predicate<"Subtarget->getPTXVersion() >= 63">;
 def hasPTX64 : Predicate<"Subtarget->getPTXVersion() >= 64">;
+def hasPTX70 : Predicate<"Subtarget->getPTXVersion() >= 70">;
 
 def hasSM30 : Predicate<"Subtarget->getSmVersion() >= 30">;
 def hasSM70 : Predicate<"Subtarget->getSmVersion() >= 70">;
 def hasSM72 : Predicate<"Subtarget->getSmVersion() >= 72">;
 def hasSM75 : Predicate<"Subtarget->getSmVersion() >= 75">;
+def hasSM80 : Predicate<"Subtarget->getSmVersion() >= 80">;
 
 // non-sync shfl instructions are not available on sm_70+ in PTX6.4+
 def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70"
Index: llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
===================================================================
--- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -288,6 +288,211 @@
 def INT_MEMBAR_SYS : MEMBAR<"membar.sys;", int_nvvm_membar_sys>;
 
+//-----------------------------------
+// Async Copy Functions
+//-----------------------------------
+
+multiclass CP_ASYNC_MBARRIER_ARRIVE<string NoInc, string AddrSpace, Intrinsic Intrin> {
+  def _32 : NVPTXInst<(outs), (ins Int32Regs:$addr),
+            !strconcat("cp.async.mbarrier.arrive", NoInc, AddrSpace, ".b64 [$addr];"),
+            [(Intrin Int32Regs:$addr)]>,
+    Requires<[hasPTX70, hasSM80]>;
+  def _64 : NVPTXInst<(outs), (ins Int64Regs:$addr),
+            !strconcat("cp.async.mbarrier.arrive", NoInc, AddrSpace, ".b64 [$addr];"),
+            [(Intrin Int64Regs:$addr)]>,
+    Requires<[hasPTX70, hasSM80]>;
+}
+
+defm CP_ASYNC_MBARRIER_ARRIVE :
+  CP_ASYNC_MBARRIER_ARRIVE<"", "", int_nvvm_cp_async_mbarrier_arrive_b64>;
+defm CP_ASYNC_MBARRIER_ARRIVE_SHARED :
+  CP_ASYNC_MBARRIER_ARRIVE<"", ".shared", int_nvvm_cp_async_mbarrier_arrive_shared_b64>;
+defm CP_ASYNC_MBARRIER_ARRIVE_NOINC :
+  CP_ASYNC_MBARRIER_ARRIVE<".noinc", "", int_nvvm_cp_async_mbarrier_arrive_noinc_b64>;
+defm CP_ASYNC_MBARRIER_ARRIVE_NOINC_SHARED :
+  CP_ASYNC_MBARRIER_ARRIVE<".noinc", ".shared", int_nvvm_cp_async_mbarrier_arrive_noinc_shared_b64>;
+
+multiclass CP_ASYNC_CA_SHARED_GLOBAL_I<string cpsize, Intrinsic Intrin> {
+  def _32 : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src),
+            !strconcat("cp.async.ca.shared.global [$dst], [$src], ", cpsize, ";"),
+            [(Intrin Int32Regs:$dst, Int32Regs:$src)]>,
+    Requires<[hasPTX70, hasSM80]>;
+  def _64 : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src),
+            !strconcat("cp.async.ca.shared.global [$dst], [$src], ", cpsize, ";"),
+            [(Intrin Int64Regs:$dst, Int64Regs:$src)]>,
+    Requires<[hasPTX70, hasSM80]>;
+}
+
+defm CP_ASYNC_CA_SHARED_GLOBAL_4 :
+  CP_ASYNC_CA_SHARED_GLOBAL_I<"4", int_nvvm_cp_async_ca_shared_global_4>;
+
+defm CP_ASYNC_CA_SHARED_GLOBAL_8 :
+  CP_ASYNC_CA_SHARED_GLOBAL_I<"8", int_nvvm_cp_async_ca_shared_global_8>;
+
+defm CP_ASYNC_CA_SHARED_GLOBAL_16 :
+  CP_ASYNC_CA_SHARED_GLOBAL_I<"16", int_nvvm_cp_async_ca_shared_global_16>;
+
+multiclass CP_ASYNC_CG_SHARED_GLOBAL<string cpsize, Intrinsic Intrin> {
+  def _32 : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src),
+            !strconcat("cp.async.cg.shared.global [$dst], [$src], ", cpsize, ";"),
+            [(Intrin Int32Regs:$dst, Int32Regs:$src)]>,
+    Requires<[hasPTX70, hasSM80]>;
+  def _64 : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src),
+            !strconcat("cp.async.cg.shared.global [$dst], [$src], ", cpsize, ";"),
+            [(Intrin Int64Regs:$dst, Int64Regs:$src)]>,
+    Requires<[hasPTX70, hasSM80]>;
+}
+
+defm CP_ASYNC_CG_SHARED_GLOBAL_16 :
+  CP_ASYNC_CG_SHARED_GLOBAL<"16", int_nvvm_cp_async_cg_shared_global_16>;
+
+def CP_ASYNC_COMMIT_GROUP :
+  NVPTXInst<(outs), (ins), "cp.async.commit_group;", [(int_nvvm_cp_async_commit_group)]>,
+  Requires<[hasPTX70, hasSM80]>;
+
+def CP_ASYNC_WAIT_GROUP :
+  NVPTXInst<(outs), (ins i32imm:$n), "cp.async.wait_group $n;",
+  [(int_nvvm_cp_async_wait_group (i32 timm:$n))]>,
+  Requires<[hasPTX70, hasSM80]>;
+
+def CP_ASYNC_WAIT_ALL :
+  NVPTXInst<(outs), (ins), "cp.async.wait_all;",
+  [(int_nvvm_cp_async_wait_all)]>,
+  Requires<[hasPTX70, hasSM80]>;
+
+//-----------------------------------
+// MBarrier Functions
+//-----------------------------------
+
+multiclass MBARRIER_INIT<string AddrSpace, Intrinsic Intrin> {
+  def _32 : NVPTXInst<(outs), (ins Int32Regs:$addr, Int32Regs:$count),
+            !strconcat("mbarrier.init", AddrSpace, ".b64 [$addr], $count;"),
+            [(Intrin Int32Regs:$addr, Int32Regs:$count)]>,
+    Requires<[hasPTX70, hasSM80]>;
+  def _64 : NVPTXInst<(outs), (ins Int64Regs:$addr, Int32Regs:$count),
+            !strconcat("mbarrier.init", AddrSpace, ".b64 [$addr], $count;"),
+            [(Intrin Int64Regs:$addr, Int32Regs:$count)]>,
+    Requires<[hasPTX70, hasSM80]>;
+}
+
+defm MBARRIER_INIT : MBARRIER_INIT<"", int_nvvm_mbarrier_init_b64>;
+defm MBARRIER_INIT_SHARED : MBARRIER_INIT<".shared",
+  int_nvvm_mbarrier_init_shared_b64>;
+
+multiclass MBARRIER_INVAL<string AddrSpace, Intrinsic Intrin> {
+  def _32 : NVPTXInst<(outs), (ins Int32Regs:$addr),
+            !strconcat("mbarrier.inval", AddrSpace, ".b64 [$addr];"),
+            [(Intrin Int32Regs:$addr)]>,
+    Requires<[hasPTX70, hasSM80]>;
+  def _64 : NVPTXInst<(outs), (ins Int64Regs:$addr),
+            !strconcat("mbarrier.inval", AddrSpace, ".b64 [$addr];"),
+            [(Intrin Int64Regs:$addr)]>,
+    Requires<[hasPTX70, hasSM80]>;
+}
+
+defm MBARRIER_INVAL : MBARRIER_INVAL<"", int_nvvm_mbarrier_inval_b64>;
+defm MBARRIER_INVAL_SHARED : MBARRIER_INVAL<".shared",
+  int_nvvm_mbarrier_inval_shared_b64>;
+
+multiclass MBARRIER_ARRIVE<string AddrSpace, Intrinsic Intrin> {
+  def _32 : NVPTXInst<(outs Int64Regs:$state), (ins Int32Regs:$addr),
+            !strconcat("mbarrier.arrive", AddrSpace, ".b64 $state, [$addr];"),
+            [(set Int64Regs:$state, (Intrin Int32Regs:$addr))]>,
+    Requires<[hasPTX70, hasSM80]>;
+  def _64 : NVPTXInst<(outs Int64Regs:$state), (ins Int64Regs:$addr),
+            !strconcat("mbarrier.arrive", AddrSpace, ".b64 $state, [$addr];"),
+            [(set Int64Regs:$state, (Intrin Int64Regs:$addr))]>,
+    Requires<[hasPTX70, hasSM80]>;
+}
+
+defm MBARRIER_ARRIVE : MBARRIER_ARRIVE<"", int_nvvm_mbarrier_arrive_b64>;
+defm MBARRIER_ARRIVE_SHARED :
+  MBARRIER_ARRIVE<".shared", int_nvvm_mbarrier_arrive_shared_b64>;
+
+multiclass MBARRIER_ARRIVE_NOCOMPLETE<string AddrSpace, Intrinsic Intrin> {
+  def _32 : NVPTXInst<(outs Int64Regs:$state),
+            (ins Int32Regs:$addr, Int32Regs:$count),
+            !strconcat("mbarrier.arrive.noComplete", AddrSpace,
+                       ".b64 $state, [$addr], $count;"),
+            [(set Int64Regs:$state, (Intrin Int32Regs:$addr, Int32Regs:$count))]>,
+    Requires<[hasPTX70, hasSM80]>;
+  def _64 : NVPTXInst<(outs Int64Regs:$state),
+            (ins Int64Regs:$addr, Int32Regs:$count),
+            !strconcat("mbarrier.arrive.noComplete", AddrSpace,
+                       ".b64 $state, [$addr], $count;"),
+            [(set Int64Regs:$state, (Intrin Int64Regs:$addr, Int32Regs:$count))]>,
+    Requires<[hasPTX70, hasSM80]>;
+}
+
+defm MBARRIER_ARRIVE_NOCOMPLETE :
+  MBARRIER_ARRIVE_NOCOMPLETE<"", int_nvvm_mbarrier_arrive_noComplete_b64>;
+defm MBARRIER_ARRIVE_NOCOMPLETE_SHARED :
+  MBARRIER_ARRIVE_NOCOMPLETE<".shared", int_nvvm_mbarrier_arrive_noComplete_shared_b64>;
+
+multiclass MBARRIER_ARRIVE_DROP<string AddrSpace, Intrinsic Intrin> {
+  def _32 : NVPTXInst<(outs Int64Regs:$state), (ins Int32Regs:$addr),
+            !strconcat("mbarrier.arrive_drop", AddrSpace,
+                       ".b64 $state, [$addr];"),
+            [(set Int64Regs:$state, (Intrin Int32Regs:$addr))]>,
+    Requires<[hasPTX70, hasSM80]>;
+  def _64 : NVPTXInst<(outs Int64Regs:$state), (ins Int64Regs:$addr),
+            !strconcat("mbarrier.arrive_drop", AddrSpace,
+                       ".b64 $state, [$addr];"),
+            [(set Int64Regs:$state, (Intrin Int64Regs:$addr))]>,
+    Requires<[hasPTX70, hasSM80]>;
+}
+
+defm MBARRIER_ARRIVE_DROP :
+  MBARRIER_ARRIVE_DROP<"", int_nvvm_mbarrier_arrive_drop_b64>;
+defm MBARRIER_ARRIVE_DROP_SHARED :
+  MBARRIER_ARRIVE_DROP<".shared", int_nvvm_mbarrier_arrive_drop_shared_b64>;
+
+multiclass MBARRIER_ARRIVE_DROP_NOCOMPLETE<string AddrSpace, Intrinsic Intrin> {
+  def _32 : NVPTXInst<(outs Int64Regs:$state),
+            (ins Int32Regs:$addr, Int32Regs:$count),
+            !strconcat("mbarrier.arrive_drop.noComplete", AddrSpace,
+                       ".b64 $state, [$addr], $count;"),
+            [(set Int64Regs:$state, (Intrin Int32Regs:$addr, Int32Regs:$count))]>,
+    Requires<[hasPTX70, hasSM80]>;
+  def _64 : NVPTXInst<(outs Int64Regs:$state),
+            (ins Int64Regs:$addr, Int32Regs:$count),
+            !strconcat("mbarrier.arrive_drop.noComplete", AddrSpace,
+                       ".b64 $state, [$addr], $count;"),
+            [(set Int64Regs:$state, (Intrin Int64Regs:$addr, Int32Regs:$count))]>,
+    Requires<[hasPTX70, hasSM80]>;
+}
+
+defm MBARRIER_ARRIVE_DROP_NOCOMPLETE :
+  MBARRIER_ARRIVE_DROP_NOCOMPLETE<"", int_nvvm_mbarrier_arrive_drop_noComplete_b64>;
+defm MBARRIER_ARRIVE_DROP_NOCOMPLETE_SHARED :
+  MBARRIER_ARRIVE_DROP_NOCOMPLETE<".shared",
+    int_nvvm_mbarrier_arrive_drop_noComplete_shared_b64>;
+
+multiclass MBARRIER_TEST_WAIT<string AddrSpace, Intrinsic Intrin> {
+  def _32 : NVPTXInst<(outs Int1Regs:$res), (ins Int32Regs:$addr, Int64Regs:$state),
+            !strconcat("mbarrier.test_wait", AddrSpace, ".b64 $res, [$addr], $state;"),
+            [(set Int1Regs:$res, (Intrin Int32Regs:$addr, Int64Regs:$state))]>,
+    Requires<[hasPTX70, hasSM80]>;
+  def _64 : NVPTXInst<(outs Int1Regs:$res), (ins Int64Regs:$addr, Int64Regs:$state),
+            !strconcat("mbarrier.test_wait", AddrSpace, ".b64 $res, [$addr], $state;"),
+            [(set Int1Regs:$res, (Intrin Int64Regs:$addr, Int64Regs:$state))]>,
+    Requires<[hasPTX70, hasSM80]>;
+}
+
+defm MBARRIER_TEST_WAIT :
+  MBARRIER_TEST_WAIT<"", int_nvvm_mbarrier_test_wait_b64>;
+defm MBARRIER_TEST_WAIT_SHARED :
+  MBARRIER_TEST_WAIT<".shared", int_nvvm_mbarrier_test_wait_shared_b64>;
+
+class MBARRIER_PENDING_COUNT<Intrinsic Intrin> :
+  NVPTXInst<(outs Int32Regs:$res), (ins Int64Regs:$state),
+            "mbarrier.pending_count.b64 $res, $state;",
+            [(set Int32Regs:$res, (Intrin Int64Regs:$state))]>,
+  Requires<[hasPTX70, hasSM80]>;
+
+def MBARRIER_PENDING_COUNT :
+  MBARRIER_PENDING_COUNT<int_nvvm_mbarrier_pending_count_b64>;
+
 //-----------------------------------
 // Math Functions
 //-----------------------------------
Index: llvm/test/CodeGen/NVPTX/async-copy.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/async-copy.ll
@@ -0,0 +1,101 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s -check-prefix=CHECK_PTX32
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s -check-prefix=CHECK_PTX64
+
+declare void @llvm.nvvm.cp.async.wait.group(i32)
+
+define void @asyncwaitgroup() {
+  ; CHECK_PTX32: cp.async.wait_group 8;
+  ; CHECK_PTX64: cp.async.wait_group 8;
+  tail call void @llvm.nvvm.cp.async.wait.group(i32 8)
+  ; CHECK_PTX32: cp.async.wait_group 0;
+  ; CHECK_PTX64: cp.async.wait_group 0;
+  tail call void @llvm.nvvm.cp.async.wait.group(i32 0)
+  ; CHECK_PTX32: cp.async.wait_group 16;
+  ; CHECK_PTX64: cp.async.wait_group 16;
+  tail call void @llvm.nvvm.cp.async.wait.group(i32 16)
+  ret void
+}
+
+declare void @llvm.nvvm.cp.async.wait.all()
+
+define void @asyncwaitall() {
+; CHECK_PTX32: cp.async.wait_all
+; CHECK_PTX64: cp.async.wait_all
+  tail call void @llvm.nvvm.cp.async.wait.all()
+  ret void
+}
+
+declare void @llvm.nvvm.cp.async.commit.group()
+
+define void @asynccommitgroup() {
+; CHECK_PTX32: cp.async.commit_group
+; CHECK_PTX64: cp.async.commit_group
+  tail call void @llvm.nvvm.cp.async.commit.group()
+  ret void
+}
+
+declare void @llvm.nvvm.cp.async.mbarrier.arrive.b64(i64* %a)
+declare void @llvm.nvvm.cp.async.mbarrier.arrive.shared.b64(i64 addrspace(3)* %a)
+declare void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.b64(i64* %a)
+declare void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared.b64(i64 addrspace(3)* %a)
+
+define void @asyncmbarrier(i64* %a) {
+; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%r{{[0-9]+}}];
+; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%rd{{[0-9]+}}];
+  tail call void @llvm.nvvm.cp.async.mbarrier.arrive.b64(i64* %a)
+  ret void
+}
+define void @asyncmbarriershared(i64 addrspace(3)* %a) {
+; CHECK_PTX32: cp.async.mbarrier.arrive.shared.b64 [%r{{[0-9]+}}];
+; CHECK_PTX64: cp.async.mbarrier.arrive.shared.b64 [%rd{{[0-9]+}}];
+  tail call void @llvm.nvvm.cp.async.mbarrier.arrive.shared.b64(i64 addrspace(3)* %a)
+  ret void
+}
+define void @asyncmbarriernoinc(i64* %a) {
+; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.b64 [%r{{[0-9]+}}];
+; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%rd{{[0-9]+}}];
+  tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.b64(i64* %a)
+  ret void
+}
+define void @asyncmbarriernoincshared(i64 addrspace(3)* %a) {
+; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.shared.b64 [%r{{[0-9]+}}];
+; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.shared.b64 [%rd{{[0-9]+}}];
+  tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared.b64(i64 addrspace(3)* %a)
+  ret void
+}
+
+declare void @llvm.nvvm.cp.async.ca.shared.global.4(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
+
+define void @asynccasharedglobal4i8(i8 addrspace(3)* %a, i8 addrspace(1)* %b) {
+; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 4;
+; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 4;
+  tail call void @llvm.nvvm.cp.async.ca.shared.global.4(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
+  ret void
+}
+
+declare void @llvm.nvvm.cp.async.ca.shared.global.8(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
+
+define void @asynccasharedglobal8i8(i8 addrspace(3)* %a, i8 addrspace(1)* %b) {
+; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 8;
+; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 8;
+  tail call void @llvm.nvvm.cp.async.ca.shared.global.8(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
+  ret void
+}
+
+declare void @llvm.nvvm.cp.async.ca.shared.global.16(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
+
+define void @asynccasharedglobal16i8(i8 addrspace(3)* %a, i8 addrspace(1)* %b) {
+; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 16;
+; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 16;
+  tail call void @llvm.nvvm.cp.async.ca.shared.global.16(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
+  ret void
+}
+
+declare void @llvm.nvvm.cp.async.cg.shared.global.16(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
+
+define void @asynccgsharedglobal16i8(i8 addrspace(3)* %a, i8 addrspace(1)* %b) {
+; CHECK_PTX32: cp.async.cg.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 16;
+; CHECK_PTX64: cp.async.cg.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 16;
+  tail call void @llvm.nvvm.cp.async.cg.shared.global.16(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
+  ret void
+}
Index: llvm/test/CodeGen/NVPTX/mbarrier.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/mbarrier.ll
@@ -0,0 +1,123 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_80 | FileCheck %s -check-prefix=x32
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 | FileCheck %s -check-prefix=x64
+
+declare void @llvm.nvvm.mbarrier.init.b64(i64* %a, i32 %b)
+declare void @llvm.nvvm.mbarrier.init.shared.b64(i64 addrspace(3)* %a, i32 %b)
+
+define void @barrierinit(i64* %a, i32 %b) {
+; x32: mbarrier.init.b64 [%r{{[0-9]+}}], %r{{[0-9]+}};
+; x64: mbarrier.init.b64 [%rd{{[0-9]+}}], %r{{[0-9]+}};
+  tail call void @llvm.nvvm.mbarrier.init.b64(i64* %a, i32 %b)
+  ret void
+}
+define void @barrierinitshared(i64 addrspace(3)* %a, i32 %b) {
+; x32: mbarrier.init.shared.b64 [%r{{[0-9]+}}], %r{{[0-9]+}};
+; x64: mbarrier.init.shared.b64 [%rd{{[0-9]+}}], %r{{[0-9]+}};
+  tail call void @llvm.nvvm.mbarrier.init.shared.b64(i64 addrspace(3)* %a, i32 %b)
+  ret void
+}
+
+declare void @llvm.nvvm.mbarrier.inval.b64(i64* %a)
+declare void @llvm.nvvm.mbarrier.inval.shared.b64(i64 addrspace(3)* %a)
+
+define void @barrierinval(i64* %a) {
+; x32: mbarrier.inval.b64 [%r{{[0-1]+}}];
+; x64: mbarrier.inval.b64 [%rd{{[0-1]+}}];
+  tail call void @llvm.nvvm.mbarrier.inval.b64(i64* %a)
+  ret void
+}
+define void @barrierinvalshared(i64 addrspace(3)* %a) {
+; x32: mbarrier.inval.shared.b64 [%r{{[0-1]+}}];
+; x64: mbarrier.inval.shared.b64 [%rd{{[0-1]+}}];
+  tail call void @llvm.nvvm.mbarrier.inval.shared.b64(i64 addrspace(3)* %a)
+  ret void
+}
+
+declare i64 @llvm.nvvm.mbarrier.arrive.b64(i64* %a)
+declare i64 @llvm.nvvm.mbarrier.arrive.shared.b64(i64 addrspace(3)* %a)
+
+define void @barrierarrive(i64* %a) {
+; x32: mbarrier.arrive.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}];
+; x64: mbarrier.arrive.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}];
+  %ret = tail call i64 @llvm.nvvm.mbarrier.arrive.b64(i64* %a)
+  ret void
+}
+define void @barrierarriveshared(i64 addrspace(3)* %a) {
+; x32: mbarrier.arrive.shared.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}];
+; x64: mbarrier.arrive.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}];
+  %ret = tail call i64 @llvm.nvvm.mbarrier.arrive.shared.b64(i64 addrspace(3)* %a)
+  ret void
+}
+
+declare i64 @llvm.nvvm.mbarrier.arrive.noComplete.b64(i64* %a, i32 %b)
+declare i64 @llvm.nvvm.mbarrier.arrive.noComplete.shared.b64(i64 addrspace(3)* %a, i32 %b)
+
+define void @barrierarrivenoComplete(i64* %a, i32 %b) {
+; x32: mbarrier.arrive.noComplete.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}], %r{{[0-9]+}};
+; x64: mbarrier.arrive.noComplete.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}], %r{{[0-9]+}};
+  %ret = tail call i64 @llvm.nvvm.mbarrier.arrive.noComplete.b64(i64* %a, i32 %b)
+  ret void
+}
+define void @barrierarrivenoCompleteshared(i64 addrspace(3)* %a, i32 %b) {
+; x32: mbarrier.arrive.noComplete.shared.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}], %r{{[0-9]+}};
+; x64: mbarrier.arrive.noComplete.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}], %r{{[0-9]+}};
+  %ret = tail call i64 @llvm.nvvm.mbarrier.arrive.noComplete.shared.b64(i64 addrspace(3)* %a, i32 %b)
+  ret void
+}
+
+declare i64 @llvm.nvvm.mbarrier.arrive.drop.b64(i64* %a)
+declare i64 @llvm.nvvm.mbarrier.arrive.drop.shared.b64(i64 addrspace(3)* %a)
+
+define void @barrierarrivedrop(i64* %a) {
+; x32: mbarrier.arrive_drop.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}];
+; x64: mbarrier.arrive_drop.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}];
+  %ret = tail call i64 @llvm.nvvm.mbarrier.arrive.drop.b64(i64* %a)
+  ret void
+}
+define void @barrierarrivedropshared(i64 addrspace(3)* %a) {
+; x32: mbarrier.arrive_drop.shared.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}];
+; x64: mbarrier.arrive_drop.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}];
+  %ret = tail call i64 @llvm.nvvm.mbarrier.arrive.drop.shared.b64(i64 addrspace(3)* %a)
+  ret void
+}
+
+declare i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete.b64(i64* %a, i32 %b)
+declare i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete.shared.b64(i64 addrspace(3)* %a, i32 %b)
+
+define void @barrierarrivedropnoComplete(i64* %a, i32 %b) {
+; x32: mbarrier.arrive_drop.noComplete.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}], %r{{[0-9]+}};
+; x64: mbarrier.arrive_drop.noComplete.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}], %r{{[0-9]+}};
+  %ret = tail call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete.b64(i64* %a, i32 %b)
+  ret void
+}
+define void @barrierarrivedropnoCompleteshared(i64 addrspace(3)* %a, i32 %b) {
+; x32: mbarrier.arrive_drop.noComplete.shared.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}], %r{{[0-9]+}};
+; x64: mbarrier.arrive_drop.noComplete.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}], %r{{[0-9]+}};
+  %ret = tail call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete.shared.b64(i64 addrspace(3)* %a, i32 %b)
+  ret void
+}
+
+declare i1 @llvm.nvvm.mbarrier.test.wait.b64(i64* %a, i64 %b)
+declare i1 @llvm.nvvm.mbarrier.test.wait.shared.b64(i64 addrspace(3)* %a, i64 %b)
+
+define void @barriertestwait(i64* %a, i64 %b) {
+; x32: mbarrier.test_wait.b64 %p{{[0-9]+}}, [%r{{[0-9]+}}], %rd{{[0-9]+}};
+; x64: mbarrier.test_wait.b64 %p{{[0-9]+}}, [%rd{{[0-9]+}}], %rd{{[0-9]+}};
+  %ret = tail call i1 @llvm.nvvm.mbarrier.test.wait.b64(i64* %a, i64 %b)
+  ret void
+}
+define void @barriertestwaitshared(i64 addrspace(3)* %a, i64 %b) {
+; x32: mbarrier.test_wait.shared.b64 %p{{[0-9]+}}, [%r{{[0-9]+}}], %rd{{[0-9]+}};
+; x64: mbarrier.test_wait.shared.b64 %p{{[0-9]+}}, [%rd{{[0-9]+}}], %rd{{[0-9]+}};
+  %ret = tail call i1 @llvm.nvvm.mbarrier.test.wait.shared.b64(i64 addrspace(3)* %a, i64 %b)
+  ret void
+}
+
+declare i32 @llvm.nvvm.mbarrier.pending.count.b64(i64 %b)
+
+define void @barrierpendingcount(i64* %a, i64 %b) {
+; x32: mbarrier.pending_count.b64 %r{{[0-9]+}}, %rd{{[0-9]+}};
+; x64: mbarrier.pending_count.b64 %r{{[0-9]+}}, %rd{{[0-9]+}};
+  %ret = tail call i32 @llvm.nvvm.mbarrier.pending.count.b64(i64 %b)
+  ret void
+}
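
For context, a short CUDA sketch (illustrative only, not part of the diff) of how the new builtins are meant to compose on sm_80: one thread initializes a shared-memory mbarrier, every thread issues a 16-byte cp.async whose completion feeds the barrier, arrives explicitly, then polls test_wait until the phase completes. The helper name copy_and_wait, the nthreads/tid parameters and the polling loop are assumptions made for the example; only the __nvvm_* builtins come from this patch.

// Illustrative sketch, assuming one 16-byte copy and one explicit arrival per
// thread (per-thread dst/src assumed to point at disjoint regions). The default
// (non-noinc) cp.async arrive balances its own arrival, so mbarrier.init only
// counts the explicit per-thread arrivals.
__device__ void copy_and_wait(__attribute__((address_space(3))) void *dst,
                              __attribute__((address_space(1))) const void *src,
                              __attribute__((address_space(3))) long long *bar,
                              int nthreads, int tid) {
  if (tid == 0)
    __nvvm_mbarrier_init_shared_b64(bar, nthreads);   // one expected arrival per thread
  __syncthreads();

  __nvvm_cp_async_ca_shared_global_16(dst, src);      // async 16-byte global->shared copy
  __nvvm_cp_async_mbarrier_arrive_shared_b64(bar);    // barrier tracks completion of the copy

  long long state = __nvvm_mbarrier_arrive_shared_b64(bar);  // this thread's own arrival
  while (!__nvvm_mbarrier_test_wait_shared_b64(bar, state))  // poll until the phase completes
    ;

  __syncthreads();
  if (tid == 0)
    __nvvm_mbarrier_inval_shared_b64(bar);            // barrier no longer needed
}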