Diff 444070

mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td

Show First 20 Lines • Show All 75 Lines • ▼ Show 20 Lines	let arguments = (ins Arg<AnyMemRef, "", [MemRead]>:$srcMemref,
Variadic<Index>:$indices, BoolAttr:$transpose,		Variadic<Index>:$indices, BoolAttr:$transpose,
I32Attr:$numTiles);		I32Attr:$numTiles);
let results = (outs AnyVector:$res);		let results = (outs AnyVector:$res);
let assemblyFormat = [{		let assemblyFormat = [{
$srcMemref`[` $indices `]` attr-dict `:` type($srcMemref) `->` type($res)		$srcMemref`[` $indices `]` attr-dict `:` type($srcMemref) `->` type($res)
}];		}];
}		}

def NVGPU_MmaSyncOp : NVGPU_Op<"mma.sync", [NoSideEffect]> {		def NVGPU_MmaSyncOp : NVGPU_Op<"mma.sync", [
		NoSideEffect,
		PredOpTrait<"matrixA and matrixB have same element type", TCopVTEtIsSameAs<0, 1>>,
		]> {
let description = [{		let description = [{
The `nvgpu.mma.sync` op represents the distributed form of a collective		The `nvgpu.mma.sync` op represents the distributed form of a collective
matrix-multiply-and-accumulate (mma) operation that is compatible with		matrix-multiply-and-accumulate (mma) operation that is compatible with
`nvvm.mma.sync`. The operands and results are fragments of the full matrix		`nvvm.mma.sync`. The operands and results are fragments of the full matrix
operands. The full shape of the distributed mma operation is given by the		operands. The full shape of the distributed mma operation is given by the
`mmaShape` attribute in the form of a list of dimensions `[m, n, k]`.		`mmaShape` attribute in the form of a list of dimensions `[m, n, k]`.

This operation is meant to be lowered to the `nvvm.mma.sync` instruction, and		This operation is meant to be lowered to the `nvvm.mma.sync` instruction, and
Show All 14 Lines	let arguments = (ins AnyVector:$matrixA, AnyVector:$matrixB,
AnyVector:$matrixC, I64ArrayAttr:$mmaShape);		AnyVector:$matrixC, I64ArrayAttr:$mmaShape);

let results = (outs AnyVector:$res);		let results = (outs AnyVector:$res);

let assemblyFormat = [{		let assemblyFormat = [{
`(` $matrixA`,` $matrixB`,` $matrixC `)` attr-dict		`(` $matrixA`,` $matrixB`,` $matrixC `)` attr-dict
`:` `(` type($matrixA) `,` type($matrixB) `,` type($matrixC) `)` `->` type($res)		`:` `(` type($matrixA) `,` type($matrixB) `,` type($matrixC) `)` `->` type($res)
}];		}];

		let hasVerifier = 1;
}		}


def NVGPU_DeviceAsyncCopyOp : NVGPU_Op<"device_async_copy",		def NVGPU_DeviceAsyncCopyOp : NVGPU_Op<"device_async_copy",
[AttrSizedOperandSegments]> {		[AttrSizedOperandSegments]> {
let summary = "device-side asynchronous copy";		let summary = "device-side asynchronous copy";
let description = [{		let description = [{
The `nvgpu.device_async_copy` op initiates an asynchronous copy operation of		The `nvgpu.device_async_copy` op initiates an asynchronous copy operation of
▲ Show 20 Lines • Show All 101 Lines • Show Last 20 Lines

mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp

Show First 20 Lines • Show All 82 Lines • ▼ Show 20 Lines	return emitOpError() << "expected " << srcMemref.getRank()
<< " source indices, got " << getSrcIndices().size();		<< " source indices, got " << getSrcIndices().size();
if (size_t(dstMemref.getRank()) != getDstIndices().size())		if (size_t(dstMemref.getRank()) != getDstIndices().size())
return emitOpError() << "expected " << dstMemref.getRank()		return emitOpError() << "expected " << dstMemref.getRank()
<< " destination indices, got "		<< " destination indices, got "
<< getDstIndices().size();		<< getDstIndices().size();
return success();		return success();
}		}

		LogicalResult MmaSyncOp::verify() {

		// Fundamental tensor core mma.sync op
		// For F32 (TF32), F16, S8, and S4 data types, the fundamental tensor core
		// operation has shape 8-by-8-by-128b. F64 is an exception. The
		// verification for mma.sync covering various shapes and data types is based
		// on the fundamental tensor core operation.
		constexpr int kThreads = 32; // 32 threads per warp
		int64_t shapeM = 8;
		int64_t shapeN = 8;
		int64_t shapeK; // set based on data type (128b for all data types except F64)

		// Number of elements A, B, and C per thread per fundamental tensor core tile
		int64_t numElementA; // set based on data type (32b except F64)
		int64_t numElementB; // set based on data type (32b except F64)
		int64_t numElementC{2}; // two accumulator elements per fundamental tile

		// nvgpu.mma.sync vector operands (per thread)
		auto aVector = getMatrixA().getType().cast<VectorType>();
		auto bVector = getMatrixB().getType().cast<VectorType>();
		auto cVector = getMatrixC().getType().cast<VectorType>();

		// vector shapes
		ArrayRef<int64_t> aShape = aVector.getShape();
		ThomasRaouxUnsubmitted Done Reply Inline Actions Please spell out those types: https://llvm.org/docs/CodingStandards.html#use-auto-type-deduction-to-make-code-more-readable ThomasRaoux: Please spell out those types: https://llvm.org/docs/CodingStandards.html#use-auto-type…
		manishucsdAuthorUnsubmitted Done Reply Inline Actions Thanks for the review comments and sharing the link. I have made the appropriate changes. manishucsd: Thanks for the review comments and sharing the link. I have made the appropriate changes.
		ArrayRef<int64_t> bShape = bVector.getShape();
		ArrayRef<int64_t> cShape = cVector.getShape();

		// vector element type
		Type aType = aVector.getElementType();
		ThomasRaouxUnsubmitted Not Done Reply Inline Actions same here spell those out ThomasRaoux: same here spell those out

		// nvgpu.mma.sync shape (per 32 threads or per warp)
		int64_t m = getMmaShape()[0].cast<IntegerAttr>().getInt();
		int64_t n = getMmaShape()[1].cast<IntegerAttr>().getInt();
		int64_t k = getMmaShape()[2].cast<IntegerAttr>().getInt();

		if (aType.isF64()) {
		// exception to 8-by-8-128b fundamental tensor core tile size
		shapeK = 4;
		ThomasRaouxUnsubmitted Done Reply Inline Actions should be `aType == bType` Note that I think this can be enforced in the tablegen file instead which would be better. ThomasRaoux: should be `aType == bType` Note that I think this can be enforced in the tablegen file instead…
		numElementA = 1;
		numElementB = 1;
		} else if (aType.isF32() || aType.isBF16() || aType.isF16() ||
		aType.isInteger(8) || aType.isInteger(4)) {
		// 8-by-8-128b fundamental tensor core tile size
		int operandBitwidth = aType.getIntOrFloatBitWidth();
		shapeK = 128 / operandBitwidth; // 128b wide shapeK
		numElementA = 32 / operandBitwidth; // 32b wide operand A
		numElementB = 32 / operandBitwidth; // 32b wide operand B
		} else {
		return emitError() << "expected input data type (i4,i8,f16,bf16,tf32,f64) "
		"supported by nvgpu.mma.sync";
		}

		//
		// Basic verification
		//

		// verify warp-wide size for vector a
		ThomasRaouxUnsubmitted Done Reply Inline Actions I don't see i4 nor f64 in the condition ThomasRaoux: I don't see i4 nor f64 in the condition
		manishucsdAuthorUnsubmitted Done Reply Inline Actions Thanks for catching this. I added i4. manishucsd: Thanks for catching this. I added i4.
		if (aShape[0] * aShape[1] * kThreads != m * k)
		return emitOpError() << "expected " << m * k
		<< " warp-wide matrix A elements";

		// verify warp-wide size for vector b
		if (bShape[0] * bShape[1] * kThreads != k * n)
		return emitOpError() << "expected " << k * n
		<< " warp-wide matrix B elements";

		ThomasRaouxUnsubmitted Not Done Reply Inline Actions skip braces for single line `if`: https://llvm.org/docs/CodingStandards.html#don-t-use-braces-on-simple-single-statement-bodies-of-if-else-loop-statements ThomasRaoux: skip braces for single line `if`: https://llvm.org/docs/CodingStandards.html#don-t-use-braces…
		// verify warp-wide size for vector c
		if (cShape[0] * cShape[1] * kThreads != m * n)
		return emitOpError() << "expected " << m * n
		<< " warp-wide matrix C elements";

		//
		// Extended verification
		//

		// tiles of fundamental tensor core operations
		int64_t mTile = m / shapeM;
		int64_t nTile = n / shapeN;
		int64_t kTile = k / shapeK;

		// verify shape of aVector
		if (!((aShape[0] == mTile * kTile) && (aShape[1] == numElementA)))
		return emitOpError() << "expected matrix A to be shaped (" << mTile * kTile
		<< " x " << numElementA << ")";

		// verify shape of bVector
		if (!((bShape[0] == kTile * nTile) && (bShape[1] == numElementB)))
		return emitOpError() << "expected matrix B to be shaped (" << kTile * nTile
		<< " x " << numElementB << ")";

		// verify shape of cVector
		if (!((cShape[0] == mTile * nTile) && (cShape[1] == numElementC)))
		return emitOpError() << "expected matrix C to be shaped (" << mTile * nTile
		<< " x " << numElementC << ")";

		return success();
		}

#define GET_OP_CLASSES		#define GET_OP_CLASSES
#include "mlir/Dialect/NVGPU/IR/NVGPU.cpp.inc"		#include "mlir/Dialect/NVGPU/IR/NVGPU.cpp.inc"

mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir

Show First 20 Lines • Show All 199 Lines • ▼ Show 20 Lines	func.func @ldmatrix_x1(%arg0: memref<128x128xf16, 3>) -> vector<1x2xf16> {
// CHECK: llvm.bitcast		// CHECK: llvm.bitcast
// CHECK: llvm.insertvalue		// CHECK: llvm.insertvalue
return %a : vector<1x2xf16>		return %a : vector<1x2xf16>
}		}

// -----		// -----

// CHECK-LABEL: @m16n8k4_tf32		// CHECK-LABEL: @m16n8k4_tf32
func.func @m16n8k4_tf32(%arg0: vector<2x1xf32>, %arg1: vector<1x1xf32>, %arg2: vector<4x1xf32>) -> vector<4x1xf32> {		func.func @m16n8k4_tf32(%arg0: vector<2x1xf32>, %arg1: vector<1x1xf32>, %arg2: vector<2x2xf32>) -> vector<2x2xf32> {
// The A, B operand should be bitcast to i32		// The A, B operand should be bitcast to i32
// CHECK: llvm.extractvalue		// CHECK: llvm.extractvalue
// CHECK: llvm.bitcast {{.*}} : vector<1xf32> to i32		// CHECK: llvm.bitcast {{.*}} : vector<1xf32> to i32
// CHECK: llvm.extractvalue		// CHECK: llvm.extractvalue
// CHECK: llvm.bitcast {{.*}} : vector<1xf32> to i32		// CHECK: llvm.bitcast {{.*}} : vector<1xf32> to i32
// CHECK: llvm.extractvalue		// CHECK: llvm.extractvalue
// CHECK: llvm.bitcast {{.*}} : vector<1xf32> to i32		// CHECK: llvm.bitcast {{.*}} : vector<1xf32> to i32

// CHECK: [[d:%.+]] = nvvm.mma.sync A[{{%.+}}, {{%.+}}] B[{{%.+}}] C[{{%.+}}, {{%.+}}, {{%.+}}, {{%.+}}]		// CHECK: [[d:%.+]] = nvvm.mma.sync A[{{%.+}}, {{%.+}}] B[{{%.+}}] C[{{%.+}}, {{%.+}}, {{%.+}}, {{%.+}}]
// CHECK-SAME: multiplicandAPtxType = #nvvm.mma_type<tf32>		// CHECK-SAME: multiplicandAPtxType = #nvvm.mma_type<tf32>
// CHECK-SAME: multiplicandBPtxType = #nvvm.mma_type<tf32>		// CHECK-SAME: multiplicandBPtxType = #nvvm.mma_type<tf32>
// CHECK-SAME: shape = #nvvm.shape<m = 16, n = 8, k = 4>		// CHECK-SAME: shape = #nvvm.shape<m = 16, n = 8, k = 4>
// CHECK-SAME: -> !llvm.struct<(f32, f32, f32, f32)>		// CHECK-SAME: -> !llvm.struct<(f32, f32, f32, f32)>
%d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 4]} : (vector<2x1xf32>, vector<1x1xf32>, vector<4x1xf32>) -> vector<4x1xf32>		%d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 4]} : (vector<2x1xf32>, vector<1x1xf32>, vector<2x2xf32>) -> vector<2x2xf32>
// CHECK: [[el:%.+]] = llvm.extractvalue [[d]][0]		// CHECK: [[undef:%.+]] = llvm.mlir.undef : vector<2xf32>
// CHECK: llvm.bitcast [[el]] : f32 to vector<1xf32>		// CHECK-DAG: llvm.extractvalue [[d]][0] : !llvm.struct<(f32, f32, f32, f32)>
// CHECK: [[el:%.+]] = llvm.extractvalue [[d]][1]		// CHECK-DAG: llvm.extractvalue [[d]][1] : !llvm.struct<(f32, f32, f32, f32)>
// CHECK: llvm.bitcast [[el]] : f32 to vector<1xf32>		// CHECK: [[d00:%.+]] = llvm.insertelement {{%.+}}, [[undef]][{{.*}}] : vector<2xf32>
// CHECK: [[el:%.+]] = llvm.extractvalue [[d]][2]		// CHECK: [[d01:%.+]] = llvm.insertelement {{%.+}}, [[d00]][{{.*}}] : vector<2xf32>
// CHECK: llvm.bitcast [[el]] : f32 to vector<1xf32>
// CHECK: [[el:%.+]] = llvm.extractvalue [[d]][3]		// CHECK: [[undef:%.+]] = llvm.mlir.undef : vector<2xf32>
// CHECK: llvm.bitcast [[el]] : f32 to vector<1xf32>		// CHECK-DAG: llvm.extractvalue [[d]][2] : !llvm.struct<(f32, f32, f32, f32)>
// CHECK-COUNT-4: llvm.insertvalue {{.*}} : !llvm.array<4 x vector<1xf32>>		// CHECK-DAG: llvm.extractvalue [[d]][3] : !llvm.struct<(f32, f32, f32, f32)>
return %d : vector<4x1xf32>		// CHECK: [[d10:%.+]] = llvm.insertelement {{%.+}}, [[undef]][{{.*}}] : vector<2xf32>
		// CHECK: [[d11:%.+]] = llvm.insertelement {{%.+}}, [[d10]][{{.*}}] : vector<2xf32>

		// CHECK-DAG: llvm.insertvalue [[d01]], {{%.+}}[0] : !llvm.array<2 x vector<2xf32>>
		// CHECK-DAG: llvm.insertvalue [[d11]], {{%.+}}[1] : !llvm.array<2 x vector<2xf32>>
		return %d : vector<2x2xf32>
}		}

// -----		// -----

// CHECK-LABEL: @async_cp(		// CHECK-LABEL: @async_cp(
// CHECK-SAME: %[[IDX:[a-zA-Z0-9_]+]]: index)		// CHECK-SAME: %[[IDX:[a-zA-Z0-9_]+]]: index)
func.func @async_cp(		func.func @async_cp(
%src: memref<128x128xf32>, %dst: memref<3x16x128xf32, 3>, %i : index) {		%src: memref<128x128xf32>, %dst: memref<3x16x128xf32, 3>, %i : index) {
▲ Show 20 Lines • Show All 54 Lines • Show Last 20 Lines

mlir/test/Dialect/NVGPU/invalid.mlir

	// RUN: mlir-opt -split-input-file -verify-diagnostics %s			// RUN: mlir-opt -split-input-file -verify-diagnostics %s
				func.func @m16n8k16_fp16_vector_shape_a(%arg0: vector<4x4xf16>, %arg1: vector<2x2xf16>, %arg2: vector<2x2xf16>) -> vector<2x2xf16> {
				// expected-error @+1 {{expected 256 warp-wide matrix A elements}}
				%d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<4x4xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
				return %d : vector<2x2xf16>
				}
				// -----

				func.func @m16n8k16_fp16_vector_shape_b(%arg0: vector<4x2xf16>, %arg1: vector<2x4xf16>, %arg2: vector<2x2xf16>) -> vector<2x2xf16> {
				// expected-error @+1 {{expected 128 warp-wide matrix B elements}}
				%d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<4x2xf16>, vector<2x4xf16>, vector<2x2xf16>) -> vector<2x2xf16>
				return %d : vector<2x2xf16>
				}
				// -----

				func.func @m16n8k16_fp16_vector_shape_c(%arg0: vector<4x2xf16>, %arg1: vector<2x2xf16>, %arg2: vector<2x4xf16>) -> vector<2x4xf16> {
				// expected-error @+1 {{expected 128 warp-wide matrix C elements}}
				%d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<4x2xf16>, vector<2x2xf16>, vector<2x4xf16>) -> vector<2x4xf16>
				return %d : vector<2x4xf16>
				}
				// -----

				func.func @m16n8k16_fp16_vector_shape_a_extended(%arg0: vector<2x4xf16>, %arg1: vector<2x2xf16>, %arg2: vector<2x2xf16>) -> vector<2x2xf16> {
				// expected-error @+1 {{expected matrix A to be shaped (4 x 2)}}
				%d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<2x4xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
				return %d : vector<2x2xf16>
				}
				// -----

				func.func @m16n8k8_fp32_vector_shape_a(%arg0: vector<4x2xf32>, %arg1: vector<2x1xf32>, %arg2: vector<2x2xf32>) -> vector<2x2xf32> {
				// expected-error @+1 {{expected 128 warp-wide matrix A elements}}
				%d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 8]} : (vector<4x2xf32>, vector<2x1xf32>, vector<2x2xf32>) -> vector<2x2xf32>
				return %d : vector<2x2xf32>
				}
				// -----

				func.func @m16n8k8_fp32_vector_shape_a_extended(%arg0: vector<1x4xf32>, %arg1: vector<2x1xf32>, %arg2: vector<2x2xf32>) -> vector<2x2xf32> {
				// expected-error @+1 {{expected matrix A to be shaped (4 x 1)}}
				%d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 8]} : (vector<1x4xf32>, vector<2x1xf32>, vector<2x2xf32>) -> vector<2x2xf32>
				return %d : vector<2x2xf32>
				}
				// -----

				func.func @m8n8k4_fp64_vector_shape_a(%arg0: vector<1x2xf64>, %arg1: vector<1x1xf64>, %arg2: vector<1x2xf64>) -> vector<1x2xf64> {
				// expected-error @+1 {{expected 32 warp-wide matrix A elements}}
				%d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [8, 8, 4]} : (vector<1x2xf64>, vector<1x1xf64>, vector<1x2xf64>) -> vector<1x2xf64>
				return %d : vector<1x2xf64>
				}
				// -----

				func.func @m8n8k4_fp64_vector_shape_c_extended(%arg0: vector<1x1xf64>, %arg1: vector<1x1xf64>, %arg2: vector<2x1xf64>) -> vector<2x1xf64> {
				// expected-error @+1 {{expected matrix C to be shaped (1 x 2)}}
				%d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [8, 8, 4]} : (vector<1x1xf64>, vector<1x1xf64>, vector<2x1xf64>) -> vector<2x1xf64>
				return %d : vector<2x1xf64>
				}
				// -----

				func.func @m16n8k32_int8_vector_shape_b(%arg0: vector<4x4xi8>, %arg1: vector<4x4xi8>, %arg2: vector<2x2xi32>) -> vector<2x2xi32> {
				// expected-error @+1 {{expected 256 warp-wide matrix B elements}}
				%d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 32]} : (vector<4x4xi8>, vector<4x4xi8>, vector<2x2xi32>) -> vector<2x2xi32>
				return %d : vector<2x2xi32>
				}
				// -----

				func.func @m16n8k32_int32_datatype(%arg0: vector<4x4xi32>, %arg1: vector<2x4xi8>, %arg2: vector<2x2xi32>) -> vector<2x2xi32> {
				// expected-error @+1 {{op failed to verify that matrixA and matrixB have same element type}}
				%d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 32]} : (vector<4x4xi32>, vector<2x4xi8>, vector<2x2xi32>) -> vector<2x2xi32>
				return %d : vector<2x2xi32>
				}
				// -----

	func.func @async_cp_memory_space(%dst : memref<16xf32>, %src : memref<16xf32>, %i : index) -> () {			func.func @async_cp_memory_space(%dst : memref<16xf32>, %src : memref<16xf32>, %i : index) -> () {
	// expected-error @+1 {{destination memref must have memory space 3}}			// expected-error @+1 {{destination memref must have memory space 3}}
	nvgpu.device_async_copy %src[%i], %dst[%i], 16 : memref<16xf32> to memref<16xf32>			nvgpu.device_async_copy %src[%i], %dst[%i], 16 : memref<16xf32> to memref<16xf32>
	return			return
	}			}

	// -----			// -----
	▲ Show 20 Lines • Show All 46 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][NVGPU] Verifiers for nvgpu.mma.sync Op
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 444070

mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td

mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp

mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir

mlir/test/Dialect/NVGPU/invalid.mlir

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][NVGPU] Verifiers for nvgpu.mma.sync Op
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 444070

mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td

mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp

mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir

mlir/test/Dialect/NVGPU/invalid.mlir

[mlir][NVGPU] Verifiers for nvgpu.mma.sync Op
ClosedPublic