diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -968,10 +968,10 @@ TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive_noinc, "vWi*", "", AND(SM_80,PTX70)) TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive_noinc_shared, "vWi*3", "", AND(SM_80,PTX70)) -TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_4, "vv*3vC*1", "", AND(SM_80,PTX70)) -TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_8, "vv*3vC*1", "", AND(SM_80,PTX70)) -TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_16, "vv*3vC*1", "", AND(SM_80,PTX70)) -TARGET_BUILTIN(__nvvm_cp_async_cg_shared_global_16, "vv*3vC*1", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_4, "vv*3vC*1.", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_8, "vv*3vC*1.", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_16, "vv*3vC*1.", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_cp_async_cg_shared_global_16, "vv*3vC*1.", "", AND(SM_80,PTX70)) TARGET_BUILTIN(__nvvm_cp_async_commit_group, "v", "", AND(SM_80,PTX70)) TARGET_BUILTIN(__nvvm_cp_async_wait_group, "vIi", "", AND(SM_80,PTX70)) diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -13564,6 +13564,8 @@ bool CheckWebAssemblyBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, CallExpr *TheCall); + bool CheckNVPTXBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, + CallExpr *TheCall); bool SemaBuiltinVAStart(unsigned BuiltinID, CallExpr *TheCall); bool SemaBuiltinVAStartARMMicrosoft(CallExpr *Call); diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18177,6 +18177,19 @@ {Ptr, CGF.EmitScalarExpr(E->getArg(1))}); } +static Value *MakeCpAsync(unsigned IntrinsicID, unsigned IntrinsicIDS, + CodeGenFunction &CGF, const CallExpr *E, + int SrcSize) { + return E->getNumArgs() == 3 + ? CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IntrinsicIDS), + {CGF.EmitScalarExpr(E->getArg(0)), + CGF.EmitScalarExpr(E->getArg(1)), + CGF.EmitScalarExpr(E->getArg(2))}) + : CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IntrinsicID), + {CGF.EmitScalarExpr(E->getArg(0)), + CGF.EmitScalarExpr(E->getArg(1))}); +} + static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID, const CallExpr *E, CodeGenFunction &CGF) { auto &C = CGF.CGM.getContext(); @@ -18840,6 +18853,22 @@ case NVPTX::BI__nvvm_ldu_h2: { return MakeHalfType(Intrinsic::nvvm_ldu_global_f, BuiltinID, E, *this); } + case NVPTX::BI__nvvm_cp_async_ca_shared_global_4: + return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_4, + Intrinsic::nvvm_cp_async_ca_shared_global_4_s, *this, E, + 4); + case NVPTX::BI__nvvm_cp_async_ca_shared_global_8: + return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_8, + Intrinsic::nvvm_cp_async_ca_shared_global_8_s, *this, E, + 8); + case NVPTX::BI__nvvm_cp_async_ca_shared_global_16: + return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_16, + Intrinsic::nvvm_cp_async_ca_shared_global_16_s, *this, E, + 16); + case NVPTX::BI__nvvm_cp_async_cg_shared_global_16: + return MakeCpAsync(Intrinsic::nvvm_cp_async_cg_shared_global_16, + Intrinsic::nvvm_cp_async_cg_shared_global_16_s, *this, E, + 16); default: return nullptr; } diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -2028,6 +2028,9 @@ case llvm::Triple::wasm32: case llvm::Triple::wasm64: return CheckWebAssemblyBuiltinFunctionCall(TI, BuiltinID, TheCall); + case llvm::Triple::nvptx: + case llvm::Triple::nvptx64: + return CheckNVPTXBuiltinFunctionCall(TI, BuiltinID, TheCall); } } @@ -4815,6 +4818,20 @@ return false; } +bool Sema::CheckNVPTXBuiltinFunctionCall(const TargetInfo &TI, + unsigned BuiltinID, + CallExpr *TheCall) { + switch (BuiltinID) { + case NVPTX::BI__nvvm_cp_async_ca_shared_global_4: + case NVPTX::BI__nvvm_cp_async_ca_shared_global_8: + case NVPTX::BI__nvvm_cp_async_ca_shared_global_16: + case NVPTX::BI__nvvm_cp_async_cg_shared_global_16: + return checkArgCountAtMost(*this, TheCall, 3); + } + + return false; +} + /// SemaBuiltinCpuSupports - Handle __builtin_cpu_supports(char *). /// This checks that the target supports __builtin_cpu_supports and /// that the string argument is constant and valid. diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -830,15 +830,24 @@ // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared __nvvm_cp_async_mbarrier_arrive_noinc_shared(sharedAddr); - // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.4 + // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.4( __nvvm_cp_async_ca_shared_global_4(dst, src); - // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.8 + // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.8( __nvvm_cp_async_ca_shared_global_8(dst, src); - // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.16 + // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.16( __nvvm_cp_async_ca_shared_global_16(dst, src); - // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.cg.shared.global.16 + // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.cg.shared.global.16( __nvvm_cp_async_cg_shared_global_16(dst, src); + // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.4.s({{.*}}, i32 2) + __nvvm_cp_async_ca_shared_global_4(dst, src, 2); + // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.8.s({{.*}}, i32 2) + __nvvm_cp_async_ca_shared_global_8(dst, src, 2); + // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.16.s({{.*}}, i32 2) + __nvvm_cp_async_ca_shared_global_16(dst, src, 2); + // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.cg.shared.global.16.s({{.*}}, i32 2) + __nvvm_cp_async_cg_shared_global_16(dst, src, 2); + // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.commit.group __nvvm_cp_async_commit_group(); // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 0) diff --git a/clang/test/SemaCUDA/builtins.cu b/clang/test/SemaCUDA/builtins.cu --- a/clang/test/SemaCUDA/builtins.cu +++ b/clang/test/SemaCUDA/builtins.cu @@ -10,6 +10,7 @@ // RUN: -fsyntax-only -verify=host %s // RUN: %clang_cc1 -triple nvptx64-unknown-cuda -fcuda-is-device \ // RUN: -aux-triple x86_64-unknown-unknown \ +// RUN: -target-cpu sm_80 -target-feature +ptx70 \ // RUN: -fsyntax-only -verify=dev %s #if !(defined(__amd64__) && defined(__PTX__)) @@ -28,3 +29,23 @@ int y = __builtin_ia32_rdtsc(); // dev-error {{reference to __host__ function '__builtin_ia32_rdtsc' in __device__ function}} x = __builtin_abs(1); } + +#if __CUDA_ARCH__ >= 800 +__attribute__((device)) void nvvm_async_copy(__attribute__((address_space(3))) void* dst, + __attribute__((address_space(1))) const void* src) { + __nvvm_cp_async_ca_shared_global_4(dst, src); + __nvvm_cp_async_ca_shared_global_8(dst, src); + __nvvm_cp_async_ca_shared_global_16(dst, src); + __nvvm_cp_async_cg_shared_global_16(dst, src); + + __nvvm_cp_async_ca_shared_global_4(dst, src, 2); + __nvvm_cp_async_ca_shared_global_8(dst, src, 2); + __nvvm_cp_async_ca_shared_global_16(dst, src, 2); + __nvvm_cp_async_cg_shared_global_16(dst, src, 2); + + __nvvm_cp_async_ca_shared_global_4(dst, src, 2, 3); // dev-error {{too many arguments to function call}} + __nvvm_cp_async_ca_shared_global_8(dst, src, 2, 3); // dev-error {{too many arguments to function call}} + __nvvm_cp_async_ca_shared_global_16(dst, src, 2, 3); // dev-error {{too many arguments to function call}} + __nvvm_cp_async_cg_shared_global_16(dst, src, 2, 3); // dev-error {{too many arguments to function call}} +} +#endif diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -1380,30 +1380,21 @@ ClangBuiltin<"__nvvm_cp_async_mbarrier_arrive_noinc_shared">, Intrinsic<[],[llvm_shared_i64ptr_ty],[IntrConvergent, IntrNoCallback]>; -def int_nvvm_cp_async_ca_shared_global_4 : - ClangBuiltin<"__nvvm_cp_async_ca_shared_global_4">, - Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty], - [IntrArgMemOnly, IntrNoCallback, NoAlias>, NoAlias>, - WriteOnly>, ReadOnly>], - "llvm.nvvm.cp.async.ca.shared.global.4">; -def int_nvvm_cp_async_ca_shared_global_8 : - ClangBuiltin<"__nvvm_cp_async_ca_shared_global_8">, - Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty], - [IntrArgMemOnly, IntrNoCallback, NoAlias>, NoAlias>, - WriteOnly>, ReadOnly>], - "llvm.nvvm.cp.async.ca.shared.global.8">; -def int_nvvm_cp_async_ca_shared_global_16 : - ClangBuiltin<"__nvvm_cp_async_ca_shared_global_16">, - Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty], - [IntrArgMemOnly, IntrNoCallback, NoAlias>, NoAlias>, - WriteOnly>, ReadOnly>], - "llvm.nvvm.cp.async.ca.shared.global.16">; -def int_nvvm_cp_async_cg_shared_global_16 : - ClangBuiltin<"__nvvm_cp_async_cg_shared_global_16">, - Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty], - [IntrArgMemOnly, IntrNoCallback, NoAlias>, NoAlias>, - WriteOnly>, ReadOnly>], - "llvm.nvvm.cp.async.cg.shared.global.16">; +multiclass CP_ASYNC_SHARED_GLOBAL { + def NAME: Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty], + [IntrArgMemOnly, IntrNoCallback, NoAlias>, NoAlias>, + WriteOnly>, ReadOnly>], + "llvm.nvvm.cp.async." # cc # ".shared.global." # n>; + def _s: Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty, llvm_i32_ty], + [IntrArgMemOnly, IntrNoCallback, NoAlias>, NoAlias>, + WriteOnly>, ReadOnly>], + "llvm.nvvm.cp.async." # cc # ".shared.global." # n # ".s">; +} + +defm int_nvvm_cp_async_ca_shared_global_4 : CP_ASYNC_SHARED_GLOBAL<"4", "ca">; +defm int_nvvm_cp_async_ca_shared_global_8 : CP_ASYNC_SHARED_GLOBAL<"8", "ca">; +defm int_nvvm_cp_async_ca_shared_global_16 : CP_ASYNC_SHARED_GLOBAL<"16", "ca">; +defm int_nvvm_cp_async_cg_shared_global_16 : CP_ASYNC_SHARED_GLOBAL<"16", "cg">; def int_nvvm_cp_async_commit_group : ClangBuiltin<"__nvvm_cp_async_commit_group">, diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -328,39 +328,49 @@ defm CP_ASYNC_MBARRIER_ARRIVE_NOINC_SHARED : CP_ASYNC_MBARRIER_ARRIVE<".noinc", ".shared", int_nvvm_cp_async_mbarrier_arrive_noinc_shared>; -multiclass CP_ASYNC_CA_SHARED_GLOBAL_I { +multiclass CP_ASYNC_SHARED_GLOBAL_I { def _32 : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src), - !strconcat("cp.async.ca.shared.global [$dst], [$src], ", cpsize, ";"), + !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ";"), [(Intrin Int32Regs:$dst, Int32Regs:$src)]>, Requires<[hasPTX70, hasSM80]>; def _64 : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src), - !strconcat("cp.async.ca.shared.global [$dst], [$src], ", cpsize, ";"), + !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ";"), [(Intrin Int64Regs:$dst, Int64Regs:$src)]>, Requires<[hasPTX70, hasSM80]>; + // Variant with src_size parameter + def _32s : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src, Int32Regs:$src_size), + !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"), + [(IntrinS Int32Regs:$dst, Int32Regs:$src, Int32Regs:$src_size)]>, + Requires<[hasPTX70, hasSM80]>; + def _32si: NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src, i32imm:$src_size), + !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"), + [(IntrinS Int32Regs:$dst, Int32Regs:$src, imm:$src_size)]>, + Requires<[hasPTX70, hasSM80]>; + def _64s : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src, Int32Regs:$src_size), + !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"), + [(IntrinS Int64Regs:$dst, Int64Regs:$src, Int32Regs:$src_size)]>, + Requires<[hasPTX70, hasSM80]>; + def _64si: NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src, i32imm:$src_size), + !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"), + [(IntrinS Int64Regs:$dst, Int64Regs:$src, imm:$src_size)]>, + Requires<[hasPTX70, hasSM80]>; } defm CP_ASYNC_CA_SHARED_GLOBAL_4 : - CP_ASYNC_CA_SHARED_GLOBAL_I<"4", int_nvvm_cp_async_ca_shared_global_4>; + CP_ASYNC_SHARED_GLOBAL_I<"ca", "4", int_nvvm_cp_async_ca_shared_global_4, + int_nvvm_cp_async_ca_shared_global_4_s>; defm CP_ASYNC_CA_SHARED_GLOBAL_8 : - CP_ASYNC_CA_SHARED_GLOBAL_I<"8", int_nvvm_cp_async_ca_shared_global_8>; + CP_ASYNC_SHARED_GLOBAL_I<"ca", "8", int_nvvm_cp_async_ca_shared_global_8, + int_nvvm_cp_async_ca_shared_global_8_s>; defm CP_ASYNC_CA_SHARED_GLOBAL_16 : - CP_ASYNC_CA_SHARED_GLOBAL_I<"16", int_nvvm_cp_async_ca_shared_global_16>; - -multiclass CP_ASYNC_CG_SHARED_GLOBAL { - def _32 : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src), - !strconcat("cp.async.cg.shared.global [$dst], [$src], ", cpsize, ";"), - [(Intrin Int32Regs:$dst, Int32Regs:$src)]>, - Requires<[hasPTX70, hasSM80]>; - def _64 : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src), - !strconcat("cp.async.cg.shared.global [$dst], [$src], ", cpsize, ";"), - [(Intrin Int64Regs:$dst, Int64Regs:$src)]>, - Requires<[hasPTX70, hasSM80]>; -} + CP_ASYNC_SHARED_GLOBAL_I<"ca", "16", int_nvvm_cp_async_ca_shared_global_16, + int_nvvm_cp_async_ca_shared_global_16_s>; defm CP_ASYNC_CG_SHARED_GLOBAL_16 : - CP_ASYNC_CG_SHARED_GLOBAL<"16", int_nvvm_cp_async_cg_shared_global_16>; + CP_ASYNC_SHARED_GLOBAL_I<"cg", "16", int_nvvm_cp_async_cg_shared_global_16, + int_nvvm_cp_async_cg_shared_global_16_s>; def CP_ASYNC_COMMIT_GROUP : NVPTXInst<(outs), (ins), "cp.async.commit_group;", [(int_nvvm_cp_async_commit_group)]>, diff --git a/llvm/test/CodeGen/NVPTX/async-copy.ll b/llvm/test/CodeGen/NVPTX/async-copy.ll --- a/llvm/test/CodeGen/NVPTX/async-copy.ll +++ b/llvm/test/CodeGen/NVPTX/async-copy.ll @@ -1,35 +1,35 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX32 %s -; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX64 %s +; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX32 %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX64 %s ; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} ; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} declare void @llvm.nvvm.cp.async.wait.group(i32) -; ALL-LABEL: asyncwaitgroup +; CHECK-LABEL: asyncwaitgroup define void @asyncwaitgroup() { - ; ALL: cp.async.wait_group 8; + ; CHECK: cp.async.wait_group 8; tail call void @llvm.nvvm.cp.async.wait.group(i32 8) - ; ALL: cp.async.wait_group 0; + ; CHECK: cp.async.wait_group 0; tail call void @llvm.nvvm.cp.async.wait.group(i32 0) - ; ALL: cp.async.wait_group 16; + ; CHECK: cp.async.wait_group 16; tail call void @llvm.nvvm.cp.async.wait.group(i32 16) ret void } declare void @llvm.nvvm.cp.async.wait.all() -; ALL-LABEL: asyncwaitall +; CHECK-LABEL: asyncwaitall define void @asyncwaitall() { -; ALL: cp.async.wait_all +; CHECK: cp.async.wait_all tail call void @llvm.nvvm.cp.async.wait.all() ret void } declare void @llvm.nvvm.cp.async.commit.group() -; ALL-LABEL: asynccommitgroup +; CHECK-LABEL: asynccommitgroup define void @asynccommitgroup() { -; ALL: cp.async.commit_group +; CHECK: cp.async.commit_group tail call void @llvm.nvvm.cp.async.commit.group() ret void } @@ -41,72 +41,87 @@ ; CHECK-LABEL: asyncmbarrier define void @asyncmbarrier(ptr %a) { -; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%r{{[0-9]+}}]; -; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%rd{{[0-9]+}}]; +; The distinction between PTX32/PTX64 here is only to capture pointer register type +; in R to be used in subsequent tests. +; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%[[R:r]]{{[0-9]+}}]; +; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%[[R:rd]]{{[0-9]+}}]; tail call void @llvm.nvvm.cp.async.mbarrier.arrive(ptr %a) ret void } ; CHECK-LABEL: asyncmbarriershared define void @asyncmbarriershared(ptr addrspace(3) %a) { -; CHECK_PTX32: cp.async.mbarrier.arrive.shared.b64 [%r{{[0-9]+}}]; -; CHECK_PTX64: cp.async.mbarrier.arrive.shared.b64 [%rd{{[0-9]+}}]; +; CHECK: cp.async.mbarrier.arrive.shared.b64 [%[[R]]{{[0-9]+}}]; tail call void @llvm.nvvm.cp.async.mbarrier.arrive.shared(ptr addrspace(3) %a) ret void } ; CHECK-LABEL: asyncmbarriernoinc define void @asyncmbarriernoinc(ptr %a) { -; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.b64 [%r{{[0-9]+}}]; -; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%rd{{[0-9]+}}]; +; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%[[R]]{{[0-9]+}}]; tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc(ptr %a) ret void } ; CHECK-LABEL: asyncmbarriernoincshared define void @asyncmbarriernoincshared(ptr addrspace(3) %a) { -; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.shared.b64 [%r{{[0-9]+}}]; -; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.shared.b64 [%rd{{[0-9]+}}]; +; CHECK: cp.async.mbarrier.arrive.noinc.shared.b64 [%[[R]]{{[0-9]+}}]; tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared(ptr addrspace(3) %a) ret void } declare void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b) +declare void @llvm.nvvm.cp.async.ca.shared.global.4.s(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) ; CHECK-LABEL: asynccasharedglobal4i8 -define void @asynccasharedglobal4i8(ptr addrspace(3) %a, ptr addrspace(1) %b) { -; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 4; -; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 4; +define void @asynccasharedglobal4i8(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) { +; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 4; +; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 4, %r{{[0-9]+}}; +; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 4, 1; tail call void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b) + tail call void @llvm.nvvm.cp.async.ca.shared.global.4.s(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) + tail call void @llvm.nvvm.cp.async.ca.shared.global.4.s(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 1) ret void } declare void @llvm.nvvm.cp.async.ca.shared.global.8(ptr addrspace(3) %a, ptr addrspace(1) %b) +declare void @llvm.nvvm.cp.async.ca.shared.global.8.s(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) ; CHECK-LABEL: asynccasharedglobal8i8 -define void @asynccasharedglobal8i8(ptr addrspace(3) %a, ptr addrspace(1) %b) { -; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 8; -; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 8; +define void @asynccasharedglobal8i8(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) { +; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 8; +; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 8, %r{{[0-9]+}}; +; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 8, 1; tail call void @llvm.nvvm.cp.async.ca.shared.global.8(ptr addrspace(3) %a, ptr addrspace(1) %b) + tail call void @llvm.nvvm.cp.async.ca.shared.global.8.s(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) + tail call void @llvm.nvvm.cp.async.ca.shared.global.8.s(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 1) ret void } declare void @llvm.nvvm.cp.async.ca.shared.global.16(ptr addrspace(3) %a, ptr addrspace(1) %b) +declare void @llvm.nvvm.cp.async.ca.shared.global.16.s(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) ; CHECK-LABEL: asynccasharedglobal16i8 -define void @asynccasharedglobal16i8(ptr addrspace(3) %a, ptr addrspace(1) %b) { -; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 16; -; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 16; +define void @asynccasharedglobal16i8(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) { +; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 16; +; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 16, %r{{[0-9]+}}; +; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 16, 1; tail call void @llvm.nvvm.cp.async.ca.shared.global.16(ptr addrspace(3) %a, ptr addrspace(1) %b) + tail call void @llvm.nvvm.cp.async.ca.shared.global.16.s(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) + tail call void @llvm.nvvm.cp.async.ca.shared.global.16.s(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 1) ret void } declare void @llvm.nvvm.cp.async.cg.shared.global.16(ptr addrspace(3) %a, ptr addrspace(1) %b) +declare void @llvm.nvvm.cp.async.cg.shared.global.16.s(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) ; CHECK-LABEL: asynccgsharedglobal16i8 -define void @asynccgsharedglobal16i8(ptr addrspace(3) %a, ptr addrspace(1) %b) { -; CHECK_PTX32: cp.async.cg.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 16; -; CHECK_PTX64: cp.async.cg.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 16; +define void @asynccgsharedglobal16i8(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) { +; CHECK: cp.async.cg.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 16; +; CHECK: cp.async.cg.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 16, %r{{[0-9]+}}; +; CHECK: cp.async.cg.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 16, 1; tail call void @llvm.nvvm.cp.async.cg.shared.global.16(ptr addrspace(3) %a, ptr addrspace(1) %b) + tail call void @llvm.nvvm.cp.async.cg.shared.global.16.s(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) + tail call void @llvm.nvvm.cp.async.cg.shared.global.16.s(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 1) ret void }