Diff 392817

mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td

Show First 20 Lines • Show All 88 Lines • ▼ Show 20 Lines	class LLVM_PointerTo<Type pointee> : Type<
And<[LLVM_AnyPointer.predicate,		And<[LLVM_AnyPointer.predicate,
SubstLeaves<		SubstLeaves<
"$_self",		"$_self",
"$_self.cast<::mlir::LLVM::LLVMPointerType>().getElementType()",		"$_self.cast<::mlir::LLVM::LLVMPointerType>().getElementType()",
pointee.predicate>]>,		pointee.predicate>]>,
"LLVM pointer to " # pointee.summary>;		"LLVM pointer to " # pointee.summary>;

// Type constraints accepting LLVM pointer type to integer of a specific width.		// Type constraints accepting LLVM pointer type to integer of a specific width.
class LLVM_IntPtrBase<int width> : Type<		class LLVM_IntPtrBase<int width, int addressSpace = 0> : Type<
LLVM_PointerTo<I<width>>.predicate,		LLVM_PointerTo<I<width>>.predicate,
"LLVM pointer to " # I<width>.summary>,		"LLVM pointer to " # I<width>.summary>,
BuildableType<"::mlir::LLVM::LLVMPointerType::get("		BuildableType<"::mlir::LLVM::LLVMPointerType::get("
"::mlir::IntegerType::get($_builder.getContext(), "		"::mlir::IntegerType::get($_builder.getContext(), "
# width #"))">;		# width #"), "# addressSpace #")">;

def LLVM_i8Ptr : LLVM_IntPtrBase<8>;		def LLVM_i8Ptr : LLVM_IntPtrBase<8>;

// Type constraint accepting any LLVM structure type.		// Type constraint accepting any LLVM structure type.
def LLVM_AnyStruct : Type<CPred<"$_self.isa<::mlir::LLVM::LLVMStructType>()">,		def LLVM_AnyStruct : Type<CPred<"$_self.isa<::mlir::LLVM::LLVMStructType>()">,
"LLVM structure type">;		"LLVM structure type">;

// Type constraint accepting opaque LLVM structure type.		// Type constraint accepting opaque LLVM structure type.
▲ Show 20 Lines • Show All 248 Lines • Show Last 20 Lines

mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td

Show All 10 Lines

//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//

#ifndef NVVMIR_OPS #ifndef NVVMIR_OPS

#define NVVMIR_OPS #define NVVMIR_OPS

include "mlir/Dialect/LLVMIR/LLVMOpBase.td" include "mlir/Dialect/LLVMIR/LLVMOpBase.td"

include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/Interfaces/SideEffectInterfaces.td"

def LLVM_i8Ptr_global : LLVM_IntPtrBase<8, 1>;

def LLVM_i8Ptr_shared : LLVM_IntPtrBase<8, 3>;

//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//

// NVVM dialect definitions // NVVM dialect definitions

//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//

def NVVM_Dialect : Dialect { def NVVM_Dialect : Dialect {

let name = "nvvm"; let name = "nvvm";

let cppNamespace = "::mlir::NVVM"; let cppNamespace = "::mlir::NVVM";

let dependentDialects = ["LLVM::LLVMDialect"]; let dependentDialects = ["LLVM::LLVMDialect"];

▲ Show 20 Lines • Show All 125 Lines • ▼ Show 20 Lines def NVVM_VoteBallotOp :

string llvmBuilder = [{ string llvmBuilder = [{

$res = createIntrinsicCall(builder, $res = createIntrinsicCall(builder,

llvm::Intrinsic::nvvm_vote_ballot_sync, {$mask, $pred}); llvm::Intrinsic::nvvm_vote_ballot_sync, {$mask, $pred});

}]; }];

let parser = [{ return parseNVVMVoteBallotOp(parser, result); }]; let parser = [{ return parseNVVMVoteBallotOp(parser, result); }];

let printer = [{ printNVVMIntrinsicOp(p, this->getOperation()); }]; let printer = [{ printNVVMIntrinsicOp(p, this->getOperation()); }];

} }

// Op `nvvm.cp.async.shared.global`.
// Operands:
//   $dst  - i8 pointer constrained by LLVM_i8Ptr_shared (address space 3).
//   $src  - i8 pointer constrained by LLVM_i8Ptr_global (address space 1).
//   $size - I32 attribute; the verifier below accepts only 4, 8 or 16.
// The op declares no results.
def NVVM_CpAsyncOp : NVVM_Op<"cp.async.shared.global">,

Arguments<(ins LLVM_i8Ptr_shared:$dst,

LLVM_i8Ptr_global:$src,

I32Attr:$size)> {

// Translation to LLVM IR: selects the nvvm_cp_async_ca_shared_global_{4,8,16}
// intrinsic that matches $size and calls it with ($dst, $src). The `default`
// case is marked unreachable; the verifier below rejects any other size.
string llvmBuilder = [{

llvm::Intrinsic::ID id;

switch ($size) {

case 4:

id = llvm::Intrinsic::nvvm_cp_async_ca_shared_global_4;

break;

case 8:

id = llvm::Intrinsic::nvvm_cp_async_ca_shared_global_8;

break;

case 16:

id = llvm::Intrinsic::nvvm_cp_async_ca_shared_global_16;

break;

default:

llvm_unreachable("unsupported async copy size");

}

createIntrinsicCall(builder, id, {$dst, $src});

}];

// Rejects any $size other than 4, 8 or 16 with a diagnostic.
let verifier = [{

if (size() != 4 && size() != 8 && size() != 16)

return emitError("expected byte size to be either 4, 8 or 16.");

return success();

}];

// Textual form: `nvvm.cp.async.shared.global %dst, %src, <size>` followed by
// an optional attribute dictionary.
let assemblyFormat = "$dst `,` $src `,` $size attr-dict";

}

rriddleUnsubmitted

Done

return success();

}];

- let assemblyFormat = [{

- $dst `,` $src `,` $size attr-dict

- }];

+ let assemblyFormat = "$dst `,` $src `,` $size attr-dict";

}

def NVVM_CpAsyncCommitGroupOp : NVVM_Op<"cp.async.commit.group"> {

Prefer single line when possible.

rriddle: Prefer single line when possible.

// Op `nvvm.cp.async.commit.group`: declares no operands or results.
def NVVM_CpAsyncCommitGroupOp : NVVM_Op<"cp.async.commit.group"> {

// Translation to LLVM IR: a call to the nvvm_cp_async_commit_group intrinsic
// with no arguments.
string llvmBuilder = [{

createIntrinsicCall(builder, llvm::Intrinsic::nvvm_cp_async_commit_group);

}];

// Textual form: the op name followed by an optional attribute dictionary.
let assemblyFormat = "attr-dict";

}

def NVVM_CpAsyncWaitGroupOp : NVVM_Op<"cp.async.wait.group">,

mravishankarUnsubmitted

Not Done

Maybe add the wait_all variant as well when you are at it.

mravishankar: Maybe add the `wait_all` variant as well when you are at it.

ThomasRaouxAuthorUnsubmitted

Done

wait_all is strictly equivalent to wait 0 so I'm not sure in what case we would want to use it. I'll add it only if needed to avoid useless code.

ThomasRaoux: `wait_all` is strictly equivalent to `wait 0` so I'm not sure in what case we would want to use…

Arguments<(ins I32Attr:$n)> {

string llvmBuilder = [{

createIntrinsicCall(

builder,

llvm::Intrinsic::nvvm_cp_async_wait_group,

llvm::ConstantInt::get(

llvm::Type::getInt32Ty(moduleTranslation.getLLVMContext()),

$n));

}];

let assemblyFormat = "$n attr-dict";

}

def NVVM_MmaOp : def NVVM_MmaOp :

NVVM_Op<"mma.sync">, NVVM_Op<"mma.sync">,

rriddleUnsubmitted

Done

$n));

}];

- let assemblyFormat = [{

- $n attr-dict

- }];

+ let assemblyFormat = "$n attr-dict";

}

def NVVM_MmaOp :

rriddle:

Results<(outs LLVM_Type:$res)>, Results<(outs LLVM_Type:$res)>,

Arguments<(ins Variadic<LLVM_Type>:$args)> { Arguments<(ins Variadic<LLVM_Type>:$args)> {

string llvmBuilder = [{ string llvmBuilder = [{

$res = createIntrinsicCall( $res = createIntrinsicCall(

builder, llvm::Intrinsic::nvvm_mma_m8n8k4_row_col_f32_f32, $args); builder, llvm::Intrinsic::nvvm_mma_m8n8k4_row_col_f32_f32, $args);

}]; }];

let assemblyFormat = "$args attr-dict `:` functional-type($args, $res)"; let assemblyFormat = "$args attr-dict `:` functional-type($args, $res)";

let verifier = [{ return ::verify(*this); }]; let verifier = [{ return ::verify(*this); }];

▲ Show 20 Lines • Show All 430 Lines • Show Last 20 Lines

mlir/test/Dialect/LLVMIR/invalid.mlir

	Show First 20 Lines • Show All 1,220 Lines • ▼ Show 20 Lines

	// -----			// -----

	func @bitcast(%arg0: vector<2x3xf32>) {			func @bitcast(%arg0: vector<2x3xf32>) {
	// expected-error @below {{op operand #0 must be LLVM-compatible non-aggregate type}}			// expected-error @below {{op operand #0 must be LLVM-compatible non-aggregate type}}
	llvm.bitcast %arg0 : vector<2x3xf32> to vector<2x3xi32>			llvm.bitcast %arg0 : vector<2x3xf32> to vector<2x3xi32>
	return			return
	}			}

				// -----

				func @cp_async(%arg0: !llvm.ptr<i8, 3>, %arg1: !llvm.ptr<i8, 1>) {
				// expected-error @below {{expected byte size to be either 4, 8 or 16.}}
				nvvm.cp.async.shared.global %arg0, %arg1, 32
				return
				}

mlir/test/Dialect/LLVMIR/nvvm.mlir

Show First 20 Lines • Show All 89 Lines • ▼ Show 20 Lines	func @nvvm_wmma_mma(%0 : i32, %1 : i32, %2 : i32, %3 : i32, %4 : i32, %5 : i32,
// CHECK: nvvm.wmma.mma {{.*}} {eltypeA = "tf32", eltypeB = "f32", k = 8 : i32, layoutA = "row", layoutB = "row", m = 16 : i32, n = 16 : i32}		// CHECK: nvvm.wmma.mma {{.*}} {eltypeA = "tf32", eltypeB = "f32", k = 8 : i32, layoutA = "row", layoutB = "row", m = 16 : i32, n = 16 : i32}
%r = nvvm.wmma.mma %0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15		%r = nvvm.wmma.mma %0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15
{eltypeA = "tf32", eltypeB = "f32", k = 8 : i32, layoutA = "row", layoutB = "row", m = 16 : i32, n = 16 : i32}		{eltypeA = "tf32", eltypeB = "f32", k = 8 : i32, layoutA = "row", layoutB = "row", m = 16 : i32, n = 16 : i32}
: (i32, i32, i32, i32, i32, i32, i32, i32, f32, f32, f32, f32, f32, f32, f32, f32)		: (i32, i32, i32, i32, i32, i32, i32, i32, f32, f32, f32, f32, f32, f32, f32, f32)
-> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>		-> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
llvm.return %r : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>		llvm.return %r : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
}		}

		llvm.func @cp_async(%arg0: !llvm.ptr<i8, 3>, %arg1: !llvm.ptr<i8, 1>) {
		// CHECK: nvvm.cp.async.shared.global %{{.*}}, %{{.*}}, 16
		nvvm.cp.async.shared.global %arg0, %arg1, 16
		// CHECK: nvvm.cp.async.commit.group
		nvvm.cp.async.commit.group
		// CHECK: nvvm.cp.async.wait.group 0
		nvvm.cp.async.wait.group 0
		nicolasvasilacheUnsubmitted Not Done Reply Inline Actions
		`4` ? I find the example in the NVVM doc scary (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-asynchronous-copy):

		    // Example of .wait_all:
		    cp.async.ca.shared.global [shrd1], [gbl1], 4;
		    cp.async.cg.shared.global [shrd2], [gbl2], 16;
		    cp.async.wait_all; // waits for all prior cp.async to complete

		    // Example of .wait_group :
		    cp.async.ca.shared.global [shrd3], [gbl3], 8;
		    cp.async.commit_group; // End of group 1
		    cp.async.cg.shared.global [shrd4], [gbl4], 16;
		    cp.async.commit_group; // End of group 2
		    cp.async.cg.shared.global [shrd5], [gbl5], 16;
		    cp.async.commit_group; // End of group 3
		    cp.async.wait_group 1; // waits for group 1 and group 2 to complete

		In my mind, there must be an off-by-one error either in the code or in the comments. Should it be:
		`cp.async.wait_group 2; // waits for group 1 and group 2 to complete` (natural)
		or
		`cp.async.wait_group 1; // waits for group 0 and group 1 to complete` (weird but less broken)
		Or am I fundamentally misunderstanding something..?
		nicolasvasilache: `4` ? I find the example in the NVVM doc scary https://docs.nvidia.com/cuda/parallel-thread…
		ThomasRaouxAuthorUnsubmitted Done Reply Inline Actions
		`4` was just a random value. Here, if we want to wait on the previous `cp`, we would need `0`. I'll change it to `0`.
		In the example from the spec there is a `cp.async.wait_all` after the group `0`; then groups 1, 2 and 3 are committed, so `cp.async.wait_group 1` waits until at most 1 group is still pending, which means groups 1 and 2 are complete (i.e. all groups are complete except the most recent one).
		ThomasRaoux: `4` was just a random value. Here if we want to wait on the previous `cp` we would need `0`.
		llvm.return
		}

// -----		// -----

// expected-error@below {{attribute attached to unexpected op}}		// expected-error@below {{attribute attached to unexpected op}}
func private @expected_llvm_func() attributes { nvvm.kernel }		func private @expected_llvm_func() attributes { nvvm.kernel }

mlir/test/Target/LLVMIR/nvvmir.mlir

Show First 20 Lines • Show All 156 Lines • ▼ Show 20 Lines	llvm.func @nvvm_wmma_mma(%0 : i32, %1 : i32, %2 : i32, %3 : i32, %4 : i32, %5 : i32,
// CHECK: { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k8.mma.row.row.tf32(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}})		// CHECK: { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k8.mma.row.row.tf32(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}})
%r = nvvm.wmma.mma %0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15		%r = nvvm.wmma.mma %0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15
{eltypeA = "tf32", eltypeB = "f32", k = 8 : i32, layoutA = "row", layoutB = "row", m = 16 : i32, n = 16 : i32}		{eltypeA = "tf32", eltypeB = "f32", k = 8 : i32, layoutA = "row", layoutB = "row", m = 16 : i32, n = 16 : i32}
: (i32, i32, i32, i32, i32, i32, i32, i32, f32, f32, f32, f32, f32, f32, f32, f32)		: (i32, i32, i32, i32, i32, i32, i32, i32, f32, f32, f32, f32, f32, f32, f32, f32)
-> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>		-> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
llvm.return		llvm.return
}		}

		llvm.func @cp_async(%arg0: !llvm.ptr<i8, 3>, %arg1: !llvm.ptr<i8, 1>) {
		// CHECK: call void @llvm.nvvm.cp.async.ca.shared.global.4(i8 addrspace(3)* %{{.*}}, i8 addrspace(1)* %{{.*}})
		nvvm.cp.async.shared.global %arg0, %arg1, 4
		// CHECK: call void @llvm.nvvm.cp.async.ca.shared.global.8(i8 addrspace(3)* %{{.*}}, i8 addrspace(1)* %{{.*}})
		nvvm.cp.async.shared.global %arg0, %arg1, 8
		// CHECK: call void @llvm.nvvm.cp.async.ca.shared.global.16(i8 addrspace(3)* %{{.*}}, i8 addrspace(1)* %{{.*}})
		nvvm.cp.async.shared.global %arg0, %arg1, 16
		// CHECK: call void @llvm.nvvm.cp.async.commit.group()
		nvvm.cp.async.commit.group
		// CHECK: call void @llvm.nvvm.cp.async.wait.group(i32 0)
		nvvm.cp.async.wait.group 0
		llvm.return
		}

// This function has the "kernel" attribute attached and should appear in the		// This function has the "kernel" attribute attached and should appear in the
// NVVM annotations after conversion.		// NVVM annotations after conversion.
llvm.func @kernel_func() attributes {nvvm.kernel} {		llvm.func @kernel_func() attributes {nvvm.kernel} {
llvm.return		llvm.return
}		}

// CHECK: !nvvm.annotations =		// CHECK: !nvvm.annotations =
// CHECK-NOT: {i32 ()* @nvvm_special_regs, !"kernel", i32 1}		// CHECK-NOT: {i32 ()* @nvvm_special_regs, !"kernel", i32 1}
// CHECK: {void ()* @kernel_func, !"kernel", i32 1}		// CHECK: {void ()* @kernel_func, !"kernel", i32 1}

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][nvvm] Add async copy ops to nvvm dialect
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 392817

mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td

mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td

mlir/test/Dialect/LLVMIR/invalid.mlir

mlir/test/Dialect/LLVMIR/nvvm.mlir

mlir/test/Target/LLVMIR/nvvmir.mlir

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][nvvm] Add async copy ops to nvvm dialect
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 392817

mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td

mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td

mlir/test/Dialect/LLVMIR/invalid.mlir

mlir/test/Dialect/LLVMIR/nvvm.mlir

mlir/test/Target/LLVMIR/nvvmir.mlir

[mlir][nvvm] Add async copy ops to nvvm dialect
ClosedPublic