Index: clang/include/clang/Basic/BuiltinsNVPTX.def
===================================================================
--- clang/include/clang/Basic/BuiltinsNVPTX.def
+++ clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -729,6 +729,21 @@
 TARGET_BUILTIN(__imma_m8n8k32_mma_u4, "vi*iC*iC*iC*IiIi", "", AND(SM_75,PTX63))
 TARGET_BUILTIN(__imma_m8n8k32_st_c_i32, "vi*iC*UiIi", "", AND(SM_75,PTX63))
 
+// Async Copy
+TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive_b64, "vv*", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive_shared_b64, "vv*", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive_noinc_b64, "vv*", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive_noinc_shared_b64, "vv*", "", AND(SM_80,PTX70))
+
+TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_4, "vv*vC*", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_8, "vv*vC*", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_16, "vv*vC*", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_cp_async_cg_shared_global_16, "vv*vC*", "", AND(SM_80,PTX70))
+
+TARGET_BUILTIN(__nvvm_cp_async_commit_group, "v", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_cp_async_wait_group, "vi", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_cp_async_wait_all, "v", "", AND(SM_80,PTX70))
+
 #undef BUILTIN
 #undef TARGET_BUILTIN
 #pragma pop_macro("AND")
Index: clang/test/CodeGen/builtins-nvptx.c
===================================================================
--- clang/test/CodeGen/builtins-nvptx.c
+++ clang/test/CodeGen/builtins-nvptx.c
@@ -1,4 +1,10 @@
 // REQUIRES: nvptx-registered-target
+// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
+// RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
+// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP32 %s
+// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
+// RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
+// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP64 %s
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_60 \
 // RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=LP32 %s
@@ -672,3 +678,33 @@
   __nvvm_vote_ballot(pred);
 // CHECK: ret void
 }
+
+__device__ void nvvm_async_copy(void* dst, const void* src, void* addr, int n) {
+  #if __CUDA_ARCH__ >= 800
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.b64
+  __nvvm_cp_async_mbarrier_arrive_b64(addr);
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.shared.b64
+  __nvvm_cp_async_mbarrier_arrive_shared_b64(addr);
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.b64
+  __nvvm_cp_async_mbarrier_arrive_noinc_b64(addr);
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared.b64
+  __nvvm_cp_async_mbarrier_arrive_noinc_shared_b64(addr);
+
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.4
+  __nvvm_cp_async_ca_shared_global_4(dst, src);
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.8
+  __nvvm_cp_async_ca_shared_global_8(dst, src);
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.16
+  __nvvm_cp_async_ca_shared_global_16(dst, src);
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.cg.shared.global.16
+  __nvvm_cp_async_cg_shared_global_16(dst, src);
+
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.commit.group
+  __nvvm_cp_async_commit_group();
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group
+  __nvvm_cp_async_wait_group(n);
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.all
+  __nvvm_cp_async_wait_all();
+  #endif
+  // CHECK: ret void
+}
Index: llvm/include/llvm/IR/IntrinsicsNVVM.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -31,7 +31,10 @@
 // * llvm.nvvm.max.ull            --> ibid.
 // * llvm.nvvm.h2f                --> llvm.convert.to.fp16.f32
 
+def llvm_i8ptr_ty : LLVMPointerType<llvm_i8_ty>;                // i8*
+def llvm_i64ptr_ty : LLVMPointerType<llvm_i64_ty>;              // i64*
 def llvm_anyi64ptr_ty : LLVMAnyPointerType<llvm_i64_ty>;        // (space)i64*
+def llvm_sharedi64ptr_ty : LLVMQualPointerType<llvm_i64_ty, 3>; // (shared)i64*
 
 //
 // MISC
@@ -1052,6 +1055,49 @@
 def int_nvvm_membar_sys : GCCBuiltin<"__nvvm_membar_sys">,
     Intrinsic<[], [], []>;
 
+// Async Copy
+def int_nvvm_cp_async_mbarrier_arrive_b64 :
+    GCCBuiltin<"__nvvm_cp_async_mbarrier_arrive_b64">,
+    Intrinsic<[],[llvm_i64ptr_ty],[IntrConvergent]>;
+def int_nvvm_cp_async_mbarrier_arrive_shared_b64 :
+    GCCBuiltin<"__nvvm_cp_async_mbarrier_arrive_shared_b64">,
+    Intrinsic<[],[llvm_sharedi64ptr_ty],[IntrConvergent]>;
+def int_nvvm_cp_async_mbarrier_arrive_noinc_b64 :
+    GCCBuiltin<"__nvvm_cp_async_mbarrier_arrive_noinc_b64">,
+    Intrinsic<[],[llvm_i64ptr_ty],[IntrConvergent]>;
+def int_nvvm_cp_async_mbarrier_arrive_noinc_shared_b64 :
+    GCCBuiltin<"__nvvm_cp_async_mbarrier_arrive_noinc_shared_b64">,
+    Intrinsic<[],[llvm_sharedi64ptr_ty],[IntrConvergent]>;
+
+def int_nvvm_cp_async_ca_shared_global_4 :
+    GCCBuiltin<"__nvvm_cp_async_ca_shared_global_4">,
+    Intrinsic<[],[llvm_i8ptr_ty, llvm_i8ptr_ty],[],
+    "llvm.nvvm.cp.async.ca.shared.global.4">;
+def int_nvvm_cp_async_ca_shared_global_8 :
+    GCCBuiltin<"__nvvm_cp_async_ca_shared_global_8">,
+    Intrinsic<[],[llvm_i8ptr_ty, llvm_i8ptr_ty],[],
+    "llvm.nvvm.cp.async.ca.shared.global.8">;
+def int_nvvm_cp_async_ca_shared_global_16 :
+    GCCBuiltin<"__nvvm_cp_async_ca_shared_global_16">,
+    Intrinsic<[],[llvm_i8ptr_ty, llvm_i8ptr_ty],[],
+    "llvm.nvvm.cp.async.ca.shared.global.16">;
+def int_nvvm_cp_async_cg_shared_global_16 :
+    GCCBuiltin<"__nvvm_cp_async_cg_shared_global_16">,
+    Intrinsic<[],[llvm_i8ptr_ty, llvm_i8ptr_ty],[],
+    "llvm.nvvm.cp.async.cg.shared.global.16">;
+
+def int_nvvm_cp_async_commit_group :
+    GCCBuiltin<"__nvvm_cp_async_commit_group">,
+    Intrinsic<[],[],[]>;
+
+def int_nvvm_cp_async_wait_group :
+    GCCBuiltin<"__nvvm_cp_async_wait_group">,
+    Intrinsic<[],[llvm_i32_ty],[]>;
+
+def int_nvvm_cp_async_wait_all :
+    GCCBuiltin<"__nvvm_cp_async_wait_all">,
+    Intrinsic<[],[],[]>;
+
 // Generated within nvvm. Use for ldu on sm_20 or later. Second arg is the
 // pointer's alignment.
 def int_nvvm_ldu_global_i : Intrinsic<[llvm_anyint_ty],
Index: llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
===================================================================
--- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -144,11 +144,13 @@
 def hasPTX61 : Predicate<"Subtarget->getPTXVersion() >= 61">;
 def hasPTX63 : Predicate<"Subtarget->getPTXVersion() >= 63">;
 def hasPTX64 : Predicate<"Subtarget->getPTXVersion() >= 64">;
+def hasPTX70 : Predicate<"Subtarget->getPTXVersion() >= 70">;
 
 def hasSM30 : Predicate<"Subtarget->getSmVersion() >= 30">;
 def hasSM70 : Predicate<"Subtarget->getSmVersion() >= 70">;
 def hasSM72 : Predicate<"Subtarget->getSmVersion() >= 72">;
 def hasSM75 : Predicate<"Subtarget->getSmVersion() >= 75">;
+def hasSM80 : Predicate<"Subtarget->getSmVersion() >= 80">;
 
 // non-sync shfl instructions are not available on sm_70+ in PTX6.4+
 def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70"
Index: llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
===================================================================
--- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -288,6 +288,78 @@
 def INT_MEMBAR_SYS : MEMBAR<"membar.sys;", int_nvvm_membar_sys>;
 
+//-----------------------------------
+// Async Copy Functions
+//-----------------------------------
+
+multiclass CP_ASYNC_MBARRIER_ARRIVE<string NoInc, string AddrSpace, Intrinsic Intrin> {
+  def _32 : NVPTXInst<(outs), (ins Int32Regs:$addr),
+            !strconcat("cp.async.mbarrier.arrive", NoInc, AddrSpace, ".b64 [$addr];"),
+            [(Intrin Int32Regs:$addr)]>,
+    Requires<[hasPTX70, hasSM80]>;
+  def _64 : NVPTXInst<(outs), (ins Int64Regs:$addr),
+            !strconcat("cp.async.mbarrier.arrive", NoInc, AddrSpace, ".b64 [$addr];"),
+            [(Intrin Int64Regs:$addr)]>,
+    Requires<[hasPTX70, hasSM80]>;
+}
+
+defm CP_ASYNC_MBARRIER_ARRIVE :
+  CP_ASYNC_MBARRIER_ARRIVE<"", "", int_nvvm_cp_async_mbarrier_arrive_b64>;
+defm CP_ASYNC_MBARRIER_ARRIVE_SHARED :
+  CP_ASYNC_MBARRIER_ARRIVE<"", ".shared", int_nvvm_cp_async_mbarrier_arrive_shared_b64>;
+defm CP_ASYNC_MBARRIER_ARRIVE_NOINC :
+  CP_ASYNC_MBARRIER_ARRIVE<".noinc", "", int_nvvm_cp_async_mbarrier_arrive_noinc_b64>;
+defm CP_ASYNC_MBARRIER_ARRIVE_NOINC_SHARED :
+  CP_ASYNC_MBARRIER_ARRIVE<".noinc", ".shared", int_nvvm_cp_async_mbarrier_arrive_noinc_shared_b64>;
+
+multiclass CP_ASYNC_CA_SHARED_GLOBAL_I<string cpsize, Intrinsic Intrin> {
+  def _32 : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src),
+            !strconcat("cp.async.ca.shared.global [$dst], [$src], ", cpsize, ";"),
+            [(Intrin Int32Regs:$dst, Int32Regs:$src)]>,
+    Requires<[hasPTX70, hasSM80]>;
+  def _64 : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src),
+            !strconcat("cp.async.ca.shared.global [$dst], [$src], ", cpsize, ";"),
+            [(Intrin Int64Regs:$dst, Int64Regs:$src)]>,
+    Requires<[hasPTX70, hasSM80]>;
+}
+
+defm CP_ASYNC_CA_SHARED_GLOBAL_4 :
+  CP_ASYNC_CA_SHARED_GLOBAL_I<"4", int_nvvm_cp_async_ca_shared_global_4>;
+
+defm CP_ASYNC_CA_SHARED_GLOBAL_8 :
+  CP_ASYNC_CA_SHARED_GLOBAL_I<"8", int_nvvm_cp_async_ca_shared_global_8>;
+
+defm CP_ASYNC_CA_SHARED_GLOBAL_16 :
+  CP_ASYNC_CA_SHARED_GLOBAL_I<"16", int_nvvm_cp_async_ca_shared_global_16>;
+
+multiclass CP_ASYNC_CG_SHARED_GLOBAL<string cpsize, Intrinsic Intrin> {
+  def _32 : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src),
+            !strconcat("cp.async.cg.shared.global [$dst], [$src], ", cpsize, ";"),
+            [(Intrin Int32Regs:$dst, Int32Regs:$src)]>,
+    Requires<[hasPTX70, hasSM80]>;
+  def _64 : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src),
+            !strconcat("cp.async.cg.shared.global [$dst], [$src], ", cpsize, ";"),
+            [(Intrin Int64Regs:$dst, Int64Regs:$src)]>,
+    Requires<[hasPTX70, hasSM80]>;
+}
+
+defm CP_ASYNC_CG_SHARED_GLOBAL_16 :
+  CP_ASYNC_CG_SHARED_GLOBAL<"16", int_nvvm_cp_async_cg_shared_global_16>;
+
+def CP_ASYNC_COMMIT_GROUP :
+  NVPTXInst<(outs), (ins), "cp.async.commit_group;", [(int_nvvm_cp_async_commit_group)]>,
+  Requires<[hasPTX70, hasSM80]>;
+
+def CP_ASYNC_WAIT_GROUP :
+  NVPTXInst<(outs), (ins Int32Regs:$n), "cp.async.wait_group $n;",
+  [(int_nvvm_cp_async_wait_group Int32Regs:$n)]>,
+  Requires<[hasPTX70, hasSM80]>;
+
+def CP_ASYNC_WAIT_ALL :
+  NVPTXInst<(outs), (ins), "cp.async.wait_all;",
+  [(int_nvvm_cp_async_wait_all)]>,
+  Requires<[hasPTX70, hasSM80]>;
+
 //-----------------------------------
 // Math Functions
 //-----------------------------------
Index: llvm/test/CodeGen/NVPTX/async-copy.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/async-copy.ll
@@ -0,0 +1,68 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s -check-prefix=x32
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s -check-prefix=x64
+
+declare void @llvm.nvvm.cp.async.mbarrier.arrive.b64(i64* %a)
+declare void @llvm.nvvm.cp.async.mbarrier.arrive.shared.b64(i64 addrspace(3)* %a)
+declare void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.b64(i64* %a)
+declare void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared.b64(i64 addrspace(3)* %a)
+
+define void @asyncmbarrier(i64* %a) {
+; x32: cp.async.mbarrier.arrive.b64 [%r{{[0-9]+}}];
+; x64: cp.async.mbarrier.arrive.b64 [%rd{{[0-9]+}}];
+  tail call void @llvm.nvvm.cp.async.mbarrier.arrive.b64(i64* %a)
+  ret void
+}
+define void @asyncmbarriershared(i64 addrspace(3)* %a) {
+; x32: cp.async.mbarrier.arrive.shared.b64 [%r{{[0-9]+}}];
+; x64: cp.async.mbarrier.arrive.shared.b64 [%rd{{[0-9]+}}];
+  tail call void @llvm.nvvm.cp.async.mbarrier.arrive.shared.b64(i64 addrspace(3)* %a)
+  ret void
+}
+define void @asyncmbarriernoinc(i64* %a) {
+; x32: cp.async.mbarrier.arrive.noinc.b64 [%r{{[0-9]+}}];
+; x64: cp.async.mbarrier.arrive.noinc.b64 [%rd{{[0-9]+}}];
+  tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.b64(i64* %a)
+  ret void
+}
+define void @asyncmbarriernoincshared(i64 addrspace(3)* %a) {
+; x32: cp.async.mbarrier.arrive.noinc.shared.b64 [%r{{[0-9]+}}];
+; x64: cp.async.mbarrier.arrive.noinc.shared.b64 [%rd{{[0-9]+}}];
+  tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared.b64(i64 addrspace(3)* %a)
+  ret void
+}
+
+declare void @llvm.nvvm.cp.async.ca.shared.global.4(i8* %a, i8* %b)
+
+define void @asynccasharedglobal4i8(i8* %a, i8* %b) {
+; x32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 4;
+; x64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 4;
+  tail call void @llvm.nvvm.cp.async.ca.shared.global.4(i8* %a, i8* %b)
+  ret void
+}
+
+declare void @llvm.nvvm.cp.async.ca.shared.global.8(i8* %a, i8* %b)
+
+define void @asynccasharedglobal8i8(i8* %a, i8* %b) {
+; x32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 8;
+; x64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 8;
+  tail call void @llvm.nvvm.cp.async.ca.shared.global.8(i8* %a, i8* %b)
+  ret void
+}
+
+declare void @llvm.nvvm.cp.async.ca.shared.global.16(i8* %a, i8* %b)
+
+define void @asynccasharedglobal16i8(i8* %a, i8* %b) {
+; x32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 16;
+; x64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 16;
+  tail call void @llvm.nvvm.cp.async.ca.shared.global.16(i8* %a, i8* %b)
+  ret void
+}
+
+declare void @llvm.nvvm.cp.async.cg.shared.global.16(i8* %a, i8* %b)
+
+define void @asynccgsharedglobal16i8(i8* %a, i8* %b) {
+; x32: cp.async.cg.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 16;
+; x64: cp.async.cg.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 16;
+  tail call void @llvm.nvvm.cp.async.cg.shared.global.16(i8* %a, i8* %b)
+  ret void
+}