diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -1382,25 +1382,25 @@ def int_nvvm_cp_async_ca_shared_global_4 : ClangBuiltin<"__nvvm_cp_async_ca_shared_global_4">, - Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty], + Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty, llvm_i32_ty], [IntrArgMemOnly, IntrNoCallback, NoAlias>, NoAlias>, WriteOnly>, ReadOnly>], "llvm.nvvm.cp.async.ca.shared.global.4">; def int_nvvm_cp_async_ca_shared_global_8 : ClangBuiltin<"__nvvm_cp_async_ca_shared_global_8">, - Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty], + Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty, llvm_i32_ty], [IntrArgMemOnly, IntrNoCallback, NoAlias>, NoAlias>, WriteOnly>, ReadOnly>], "llvm.nvvm.cp.async.ca.shared.global.8">; def int_nvvm_cp_async_ca_shared_global_16 : ClangBuiltin<"__nvvm_cp_async_ca_shared_global_16">, - Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty], + Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty, llvm_i32_ty], [IntrArgMemOnly, IntrNoCallback, NoAlias>, NoAlias>, WriteOnly>, ReadOnly>], "llvm.nvvm.cp.async.ca.shared.global.16">; def int_nvvm_cp_async_cg_shared_global_16 : ClangBuiltin<"__nvvm_cp_async_cg_shared_global_16">, - Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty], + Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty, llvm_i32_ty], [IntrArgMemOnly, IntrNoCallback, NoAlias>, NoAlias>, WriteOnly>, ReadOnly>], "llvm.nvvm.cp.async.cg.shared.global.16">; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -328,39 +328,36 @@ defm CP_ASYNC_MBARRIER_ARRIVE_NOINC_SHARED : CP_ASYNC_MBARRIER_ARRIVE<".noinc", ".shared", int_nvvm_cp_async_mbarrier_arrive_noinc_shared>; -multiclass CP_ASYNC_CA_SHARED_GLOBAL_I { - def _32 : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src), - !strconcat("cp.async.ca.shared.global [$dst], [$src], ", cpsize, ";"), - [(Intrin Int32Regs:$dst, Int32Regs:$src)]>, +multiclass CP_ASYNC_SHARED_GLOBAL_I { + def _32 : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src, Int32Regs:$src_size), + !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"), + [(Intrin Int32Regs:$dst, Int32Regs:$src, Int32Regs:$src_size)]>, Requires<[hasPTX70, hasSM80]>; - def _64 : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src), - !strconcat("cp.async.ca.shared.global [$dst], [$src], ", cpsize, ";"), - [(Intrin Int64Regs:$dst, Int64Regs:$src)]>, + def _32i: NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src, i32imm:$src_size), + !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"), + [(Intrin Int32Regs:$dst, Int32Regs:$src, imm:$src_size)]>, + Requires<[hasPTX70, hasSM80]>; + def _64 : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src, Int32Regs:$src_size), + !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"), + [(Intrin Int64Regs:$dst, Int64Regs:$src, Int32Regs:$src_size)]>, + Requires<[hasPTX70, hasSM80]>; + def _64i: NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src, i32imm:$src_size), + !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"), + [(Intrin Int64Regs:$dst, Int64Regs:$src, imm:$src_size)]>, Requires<[hasPTX70, hasSM80]>; } defm CP_ASYNC_CA_SHARED_GLOBAL_4 : - CP_ASYNC_CA_SHARED_GLOBAL_I<"4", int_nvvm_cp_async_ca_shared_global_4>; + CP_ASYNC_SHARED_GLOBAL_I<"ca", "4", int_nvvm_cp_async_ca_shared_global_4>; defm CP_ASYNC_CA_SHARED_GLOBAL_8 : - CP_ASYNC_CA_SHARED_GLOBAL_I<"8", int_nvvm_cp_async_ca_shared_global_8>; + CP_ASYNC_SHARED_GLOBAL_I<"ca", "8", int_nvvm_cp_async_ca_shared_global_8>; defm CP_ASYNC_CA_SHARED_GLOBAL_16 : - CP_ASYNC_CA_SHARED_GLOBAL_I<"16", int_nvvm_cp_async_ca_shared_global_16>; - -multiclass CP_ASYNC_CG_SHARED_GLOBAL { - def _32 : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src), - !strconcat("cp.async.cg.shared.global [$dst], [$src], ", cpsize, ";"), - [(Intrin Int32Regs:$dst, Int32Regs:$src)]>, - Requires<[hasPTX70, hasSM80]>; - def _64 : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src), - !strconcat("cp.async.cg.shared.global [$dst], [$src], ", cpsize, ";"), - [(Intrin Int64Regs:$dst, Int64Regs:$src)]>, - Requires<[hasPTX70, hasSM80]>; -} + CP_ASYNC_SHARED_GLOBAL_I<"ca", "16", int_nvvm_cp_async_ca_shared_global_16>; defm CP_ASYNC_CG_SHARED_GLOBAL_16 : - CP_ASYNC_CG_SHARED_GLOBAL<"16", int_nvvm_cp_async_cg_shared_global_16>; + CP_ASYNC_SHARED_GLOBAL_I<"cg", "16", int_nvvm_cp_async_cg_shared_global_16>; def CP_ASYNC_COMMIT_GROUP : NVPTXInst<(outs), (ins), "cp.async.commit_group;", [(int_nvvm_cp_async_commit_group)]>, diff --git a/llvm/test/CodeGen/NVPTX/async-copy.ll b/llvm/test/CodeGen/NVPTX/async-copy.ll --- a/llvm/test/CodeGen/NVPTX/async-copy.ll +++ b/llvm/test/CodeGen/NVPTX/async-copy.ll @@ -1,35 +1,35 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX32 %s -; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX64 %s +; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX32 %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX64 %s ; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} ; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} declare void @llvm.nvvm.cp.async.wait.group(i32) -; ALL-LABEL: asyncwaitgroup +; CHECK-LABEL: asyncwaitgroup define void @asyncwaitgroup() { - ; ALL: cp.async.wait_group 8; + ; CHECK: cp.async.wait_group 8; tail call void @llvm.nvvm.cp.async.wait.group(i32 8) - ; ALL: cp.async.wait_group 0; + ; CHECK: cp.async.wait_group 0; tail call void @llvm.nvvm.cp.async.wait.group(i32 0) - ; ALL: cp.async.wait_group 16; + ; CHECK: cp.async.wait_group 16; tail call void @llvm.nvvm.cp.async.wait.group(i32 16) ret void } declare void @llvm.nvvm.cp.async.wait.all() -; ALL-LABEL: asyncwaitall +; CHECK-LABEL: asyncwaitall define void @asyncwaitall() { -; ALL: cp.async.wait_all +; CHECK: cp.async.wait_all tail call void @llvm.nvvm.cp.async.wait.all() ret void } declare void @llvm.nvvm.cp.async.commit.group() -; ALL-LABEL: asynccommitgroup +; CHECK-LABEL: asynccommitgroup define void @asynccommitgroup() { -; ALL: cp.async.commit_group +; CHECK: cp.async.commit_group tail call void @llvm.nvvm.cp.async.commit.group() ret void } @@ -41,72 +41,75 @@ ; CHECK-LABEL: asyncmbarrier define void @asyncmbarrier(ptr %a) { -; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%r{{[0-9]+}}]; -; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%rd{{[0-9]+}}]; +; The distinction between PTX32/PTX64 here is only to capture pointer register type +; in R to be used in subsequent tests. +; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%[[R:r]]{{[0-9]+}}]; +; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%[[R:rd]]{{[0-9]+}}]; tail call void @llvm.nvvm.cp.async.mbarrier.arrive(ptr %a) ret void } ; CHECK-LABEL: asyncmbarriershared define void @asyncmbarriershared(ptr addrspace(3) %a) { -; CHECK_PTX32: cp.async.mbarrier.arrive.shared.b64 [%r{{[0-9]+}}]; -; CHECK_PTX64: cp.async.mbarrier.arrive.shared.b64 [%rd{{[0-9]+}}]; +; CHECK: cp.async.mbarrier.arrive.shared.b64 [%[[R]]{{[0-9]+}}]; tail call void @llvm.nvvm.cp.async.mbarrier.arrive.shared(ptr addrspace(3) %a) ret void } ; CHECK-LABEL: asyncmbarriernoinc define void @asyncmbarriernoinc(ptr %a) { -; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.b64 [%r{{[0-9]+}}]; -; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%rd{{[0-9]+}}]; +; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%[[R]]{{[0-9]+}}]; tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc(ptr %a) ret void } ; CHECK-LABEL: asyncmbarriernoincshared define void @asyncmbarriernoincshared(ptr addrspace(3) %a) { -; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.shared.b64 [%r{{[0-9]+}}]; -; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.shared.b64 [%rd{{[0-9]+}}]; +; CHECK: cp.async.mbarrier.arrive.noinc.shared.b64 [%[[R]]{{[0-9]+}}]; tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared(ptr addrspace(3) %a) ret void } -declare void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b) +declare void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) ; CHECK-LABEL: asynccasharedglobal4i8 -define void @asynccasharedglobal4i8(ptr addrspace(3) %a, ptr addrspace(1) %b) { -; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 4; -; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 4; - tail call void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b) +define void @asynccasharedglobal4i8(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) { +; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 4, %r{{[0-9]+}}; +; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 4, 1; + tail call void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) + tail call void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 1) ret void } -declare void @llvm.nvvm.cp.async.ca.shared.global.8(ptr addrspace(3) %a, ptr addrspace(1) %b) +declare void @llvm.nvvm.cp.async.ca.shared.global.8(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) ; CHECK-LABEL: asynccasharedglobal8i8 -define void @asynccasharedglobal8i8(ptr addrspace(3) %a, ptr addrspace(1) %b) { -; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 8; -; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 8; - tail call void @llvm.nvvm.cp.async.ca.shared.global.8(ptr addrspace(3) %a, ptr addrspace(1) %b) +define void @asynccasharedglobal8i8(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) { +; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 8, %r{{[0-9]+}}; +; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 8, 1; + tail call void @llvm.nvvm.cp.async.ca.shared.global.8(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) + tail call void @llvm.nvvm.cp.async.ca.shared.global.8(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 1) ret void } -declare void @llvm.nvvm.cp.async.ca.shared.global.16(ptr addrspace(3) %a, ptr addrspace(1) %b) +declare void @llvm.nvvm.cp.async.ca.shared.global.16(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) ; CHECK-LABEL: asynccasharedglobal16i8 -define void @asynccasharedglobal16i8(ptr addrspace(3) %a, ptr addrspace(1) %b) { -; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 16; -; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 16; - tail call void @llvm.nvvm.cp.async.ca.shared.global.16(ptr addrspace(3) %a, ptr addrspace(1) %b) +define void @asynccasharedglobal16i8(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) { +; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 16, %r{{[0-9]+}}; +; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 16, 1; + tail call void @llvm.nvvm.cp.async.ca.shared.global.16(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) + tail call void @llvm.nvvm.cp.async.ca.shared.global.16(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 1) ret void } -declare void @llvm.nvvm.cp.async.cg.shared.global.16(ptr addrspace(3) %a, ptr addrspace(1) %b) +declare void @llvm.nvvm.cp.async.cg.shared.global.16(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) ; CHECK-LABEL: asynccgsharedglobal16i8 -define void @asynccgsharedglobal16i8(ptr addrspace(3) %a, ptr addrspace(1) %b) { -; CHECK_PTX32: cp.async.cg.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 16; -; CHECK_PTX64: cp.async.cg.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 16; - tail call void @llvm.nvvm.cp.async.cg.shared.global.16(ptr addrspace(3) %a, ptr addrspace(1) %b) +define void @asynccgsharedglobal16i8(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) { +; CHECK: cp.async.cg.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 16, %r{{[0-9]+}}; +; CHECK: cp.async.cg.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 16, 1; + tail call void @llvm.nvvm.cp.async.cg.shared.global.16(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) + tail call void @llvm.nvvm.cp.async.cg.shared.global.16(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 1) ret void }