Diff 443359

mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td

Show First 20 Lines • Show All 106 Lines • ▼ Show 20 Lines	let arguments = (ins AnyVector:$matrixA, AnyVector:$matrixB,
AnyVector:$matrixC, I64ArrayAttr:$mmaShape);		AnyVector:$matrixC, I64ArrayAttr:$mmaShape);

let results = (outs AnyVector:$res);		let results = (outs AnyVector:$res);

let assemblyFormat = [{		let assemblyFormat = [{
`(` $matrixA`,` $matrixB`,` $matrixC `)` attr-dict		`(` $matrixA`,` $matrixB`,` $matrixC `)` attr-dict
`:` `(` type($matrixA) `,` type($matrixB) `,` type($matrixC) `)` `->` type($res)		`:` `(` type($matrixA) `,` type($matrixB) `,` type($matrixC) `)` `->` type($res)
}];		}];

		let hasVerifier = 1;
}		}


def NVGPU_DeviceAsyncCopyOp : NVGPU_Op<"device_async_copy",		def NVGPU_DeviceAsyncCopyOp : NVGPU_Op<"device_async_copy",
[AttrSizedOperandSegments]> {		[AttrSizedOperandSegments]> {
let summary = "device-side asynchronous copy";		let summary = "device-side asynchronous copy";
let description = [{		let description = [{
The `gpu.device_async_copy` op initiates an asynchronous copy operation of		The `gpu.device_async_copy` op initiates an asynchronous copy operation of
▲ Show 20 Lines • Show All 101 Lines • Show Last 20 Lines

mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp

Show All 11 Lines

#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"		#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"		#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/IR/Builders.h"		#include "mlir/IR/Builders.h"
#include "mlir/IR/DialectImplementation.h"		#include "mlir/IR/DialectImplementation.h"
#include "mlir/IR/OpImplementation.h"		#include "mlir/IR/OpImplementation.h"
#include "mlir/IR/TypeUtilities.h"		#include "mlir/IR/TypeUtilities.h"
#include "llvm/ADT/TypeSwitch.h"		#include "llvm/ADT/TypeSwitch.h"
		#include <iostream>

using namespace mlir;		using namespace mlir;
using namespace mlir::nvgpu;		using namespace mlir::nvgpu;

#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.cpp.inc"		#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.cpp.inc"

void nvgpu::NVGPUDialect::initialize() {		void nvgpu::NVGPUDialect::initialize() {
addTypes<DeviceAsyncTokenType>();		addTypes<DeviceAsyncTokenType>();
▲ Show 20 Lines • Show All 55 Lines • ▼ Show 20 Lines	return emitOpError() << "expected " << srcMemref.getRank()
<< " source indices, got " << getSrcIndices().size();		<< " source indices, got " << getSrcIndices().size();
if (size_t(dstMemref.getRank()) != getDstIndices().size())		if (size_t(dstMemref.getRank()) != getDstIndices().size())
return emitOpError() << "expected " << dstMemref.getRank()		return emitOpError() << "expected " << dstMemref.getRank()
<< " destination indices, got "		<< " destination indices, got "
<< getDstIndices().size();		<< getDstIndices().size();
return success();		return success();
}		}

		/// Verifies the operand vector types of nvgpu.mma.sync against its
		/// `mmaShape` attribute.
		///
		/// Checks, in order:
		///   1. the A/B element-type comparison,
		///   2. that the A element type is one of the handled types (which also
		///      fixes the K extent and per-thread element counts of one
		///      fundamental tile),
		///   3. that each operand's per-thread element count times 32 threads
		///      equals the corresponding warp-wide tile size (m*k, k*n, m*n),
		///   4. that each operand vector has exactly the 2-D shape derived from
		///      the number of fundamental tiles.
		///
		/// Returns success() when every check passes; otherwise returns the
		/// diagnostic emitted by the first failing check.
		LogicalResult MmaSyncOp::verify() {

		// Fundamental tensor core mma.sync op
		// For F32 (TF32), F16, S8, and S4 data types fundamental tensor core
		// operation is of shape: 8-by-8-by-128b. F64 is an exception. The
		// verification for mma.sync covering various shapes and data types is based
		// on the fundamental tensor core operation.
		constexpr int kThreads = 32; // 32 threads per warp
		int64_t shapeM = 8;
		int64_t shapeN = 8;
		int64_t shapeK; // set based on data type (128b for all data types except F64)

		// Number of elements A, B, and C per thread per fundamental tensor core tile
		int64_t numElementA; // set based on data type (32b except F64)
		int64_t numElementB; // set based on data type (32b except F64)
		int64_t numElementC{2}; // two accumulator elements per fundamental tile

		// nvgpu.mma.sync vector operands (per thread)
		auto aVector = getMatrixA().getType().cast<VectorType>();
		auto bVector = getMatrixB().getType().cast<VectorType>();
		auto cVector = getMatrixC().getType().cast<VectorType>();

		// vector shapes
		auto aShape = aVector.getShape();
		ThomasRaouxUnsubmitted Done Reply Inline Actions Please spell out those types: https://llvm.org/docs/CodingStandards.html#use-auto-type-deduction-to-make-code-more-readable ThomasRaoux: Please spell out those types: https://llvm.org/docs/CodingStandards.html#use-auto-type…
		manishucsdAuthorUnsubmitted Done Reply Inline Actions Thanks for the review comments and sharing the link. I have made the appropriate changes. manishucsd: Thanks for the review comments and sharing the link. I have made the appropriate changes.
		auto bShape = bVector.getShape();
		auto cShape = cVector.getShape();

		// vector element type
		auto aType = aVector.getElementType();
		ThomasRaouxUnsubmitted Not Done Reply Inline Actions same here spell those out ThomasRaoux: same here spell those out
		auto bType = bVector.getElementType();
		auto cType = cVector.getElementType();

		// nvgpu.mma.sync shape (per 32 threads or per warp)
		// NOTE(review): entries 0..2 of mmaShape are read without first checking
		// that the attribute holds three elements -- confirm this is enforced
		// elsewhere.
		int64_t m = getMmaShape()[0].cast<IntegerAttr>().getInt();
		int64_t n = getMmaShape()[1].cast<IntegerAttr>().getInt();
		int64_t k = getMmaShape()[2].cast<IntegerAttr>().getInt();

		// NOTE(review): this compares TypeIDs rather than the types themselves.
		// The i32/i8 test in invalid.mlir expects the "expected input data type"
		// diagnostic below, i.e. that mismatched pair is not rejected here.
		// This diagnostic also uses emitError() while the shape checks further
		// down use emitOpError().
		if (aType.getTypeID() != bType.getTypeID()) {
		ThomasRaouxUnsubmitted Done Reply Inline Actions should be `aType == bType` Note that I think this can be enforced in the tablegen file instead which would be better. ThomasRaoux: should be `aType == bType` Note that I think this can be enforced in the tablegen file instead…
		return emitError() << "expected same data type for matrix A and matrix B";
		}

		// Select the K extent of the fundamental tile and the per-thread element
		// counts for A and B from the element type of A.
		if (aType.isF64()) {
		// exception to 8-by-8-128b fundamental tensor core tile size
		shapeK = 4;
		numElementA = 1;
		numElementB = 1;
		}

		else if (aType.isF32() \|\| aType.isBF16() \|\| aType.isF16() \|\|
		aType.isInteger(8)) {
		// 8-by-8-128b fundamental tensor core tile size
		int operandBitwidth = aType.getIntOrFloatBitWidth();
		shapeK = 128 / operandBitwidth; // 128b wide shapeK
		numElementA = 32 / operandBitwidth; // 32b wide operand A
		numElementB = 32 / operandBitwidth; // 32b wide operand B
		} else {
		// NOTE(review): the message lists i4, but the condition above does not
		// accept a 4-bit integer element type, so i4 operands end up here.
		return emitError() << "expected input data type (i4,i8,f16,bf16,tf32,f64) "
		ThomasRaouxUnsubmitted Done Reply Inline Actions I don't see i4 nor f64 in the condition ThomasRaoux: I don't see i4 nor f64 in the condition
		manishucsdAuthorUnsubmitted Done Reply Inline Actions Thanks for catching this. I added i4. manishucsd: Thanks for catching this. I added i4.
		"supported by nvgpu.mma.sync";
		}

		//
		// Basic verification
		//
		// Each operand is a per-thread vector; its element count multiplied by the
		// 32 threads must equal the element count of the warp-wide tile.
		// NOTE(review): dimensions 0 and 1 of each shape are indexed without a
		// prior rank check -- presumably operands are always 2-D; confirm.

		// verify warp-wide size for vector a
		if (aShape[0] * aShape[1] * kThreads != m * k) {
		ThomasRaouxUnsubmitted Not Done Reply Inline Actions skip braces for single line `if`: https://llvm.org/docs/CodingStandards.html#don-t-use-braces-on-simple-single-statement-bodies-of-if-else-loop-statements ThomasRaoux: skip braces for single line `if`: https://llvm.org/docs/CodingStandards.html#don-t-use-braces…
		return emitOpError() << "expected " << m * k
		<< " warp-wide matrix A elements";
		}

		// verify warp-wide size for vector b
		if (bShape[0] * bShape[1] * kThreads != k * n) {
		return emitOpError() << "expected " << k * n
		<< " warp-wide matrix B elements";
		}

		// verify warp-wide size for vector c
		if (cShape[0] * cShape[1] * kThreads != m * n) {
		return emitOpError() << "expected " << m * n
		<< " warp-wide matrix C elements";
		}

		//
		// Extended verification
		//
		// The element-count checks above accept any 2-D factorization; the checks
		// below pin the exact shape: dimension 0 is the number of fundamental
		// tiles covered, dimension 1 is the per-thread element count per tile.

		// tiles of fundamental tensor core operations
		int64_t mTile = m / shapeM;
		int64_t nTile = n / shapeN;
		int64_t kTile = k / shapeK;

		// verify shape of aVector
		if (!((aShape[0] == mTile * kTile) && (aShape[1] == numElementA))) {
		return emitOpError() << "expected matrix A to be shaped (" << mTile * kTile
		<< " x " << numElementA << ")";
		}

		// verify shape of bVector
		if (!((bShape[0] == kTile * nTile) && (bShape[1] == numElementB))) {
		return emitOpError() << "expected matrix B to be shaped (" << kTile * nTile
		<< " x " << numElementB << ")";
		}

		// verify shape of cVector
		if (!((cShape[0] == mTile * nTile) && (cShape[1] == numElementC))) {
		return emitOpError() << "expected matrix C to be shaped (" << mTile * nTile
		<< " x " << numElementC << ")";
		}

		return success();
		}

#define GET_OP_CLASSES		#define GET_OP_CLASSES
#include "mlir/Dialect/NVGPU/IR/NVGPU.cpp.inc"		#include "mlir/Dialect/NVGPU/IR/NVGPU.cpp.inc"

mlir/test/Dialect/NVGPU/invalid.mlir

	// RUN: mlir-opt -split-input-file -verify-diagnostics %s			// RUN: mlir-opt -split-input-file -verify-diagnostics %s
				// Warp-wide size check on A: vector<4x4xf16> is 16 elements per thread,
				// 16 * 32 threads = 512, but m * k = 16 * 16 = 256.
				func.func @m16n8k16_fp16_vector_shape_a(%arg0: vector<4x4xf16>, %arg1: vector<2x2xf16>, %arg2: vector<2x2xf16>) -> vector<2x2xf16> {
				// expected-error @+1 {{expected 256 warp-wide matrix A elements}}
				%d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<4x4xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
				return %d : vector<2x2xf16>
				}
				// -----

				// Warp-wide size check on B: A passes (8 * 32 = 256 = m * k), but
				// vector<2x4xf16> gives 8 * 32 = 256 while k * n = 16 * 8 = 128.
				func.func @m16n8k16_fp16_vector_shape_b(%arg0: vector<4x2xf16>, %arg1: vector<2x4xf16>, %arg2: vector<2x2xf16>) -> vector<2x2xf16> {
				// expected-error @+1 {{expected 128 warp-wide matrix B elements}}
				%d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<4x2xf16>, vector<2x4xf16>, vector<2x2xf16>) -> vector<2x2xf16>
				return %d : vector<2x2xf16>
				}
				// -----

				// Warp-wide size check on C: A and B pass, but vector<2x4xf16> gives
				// 8 * 32 = 256 while m * n = 16 * 8 = 128.
				func.func @m16n8k16_fp16_vector_shape_c(%arg0: vector<4x2xf16>, %arg1: vector<2x2xf16>, %arg2: vector<2x4xf16>) -> vector<2x4xf16> {
				// expected-error @+1 {{expected 128 warp-wide matrix C elements}}
				%d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<4x2xf16>, vector<2x2xf16>, vector<2x4xf16>) -> vector<2x4xf16>
				return %d : vector<2x4xf16>
				}
				// -----

				// Exact-shape check on A: vector<2x4xf16> has the right element count
				// (8 * 32 = 256 = m * k) but the wrong layout. For f16 the fundamental
				// K is 128 / 16 = 8, so mTile * kTile = 2 * 2 = 4 and each tile holds
				// 32 / 16 = 2 elements per thread, i.e. the expected shape is 4 x 2.
				func.func @m16n8k16_fp16_vector_shape_a_extended(%arg0: vector<2x4xf16>, %arg1: vector<2x2xf16>, %arg2: vector<2x2xf16>) -> vector<2x2xf16> {
				// expected-error @+1 {{expected matrix A to be shaped (4 x 2)}}
				%d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<2x4xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
				return %d : vector<2x2xf16>
				}
				// -----

				// Warp-wide size check on A for f32 operands: vector<4x2xf32> gives
				// 8 * 32 = 256 while m * k = 16 * 8 = 128.
				func.func @m16n8k8_fp32_vector_shape_a(%arg0: vector<4x2xf32>, %arg1: vector<2x1xf32>, %arg2: vector<2x2xf32>) -> vector<2x2xf32> {
				// expected-error @+1 {{expected 128 warp-wide matrix A elements}}
				%d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 8]} : (vector<4x2xf32>, vector<2x1xf32>, vector<2x2xf32>) -> vector<2x2xf32>
				return %d : vector<2x2xf32>
				}
				// -----

				// Exact-shape check on A for f32 operands: vector<1x4xf32> has the right
				// element count (4 * 32 = 128 = m * k) but is transposed. For f32 the
				// fundamental K is 128 / 32 = 4, so mTile * kTile = 2 * 2 = 4 with
				// 32 / 32 = 1 element per thread per tile, i.e. the expected shape is 4 x 1.
				func.func @m16n8k8_fp32_vector_shape_a_extended(%arg0: vector<1x4xf32>, %arg1: vector<2x1xf32>, %arg2: vector<2x2xf32>) -> vector<2x2xf32> {
				// expected-error @+1 {{expected matrix A to be shaped (4 x 1)}}
				%d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 8]} : (vector<1x4xf32>, vector<2x1xf32>, vector<2x2xf32>) -> vector<2x2xf32>
				return %d : vector<2x2xf32>
				}
				// -----

				// Warp-wide size check on A for f64 operands: vector<1x2xf64> gives
				// 2 * 32 = 64 while m * k = 8 * 4 = 32.
				func.func @m8n8k4_fp64_vector_shape_a(%arg0: vector<1x2xf64>, %arg1: vector<1x1xf64>, %arg2: vector<1x2xf64>) -> vector<1x2xf64> {
				// expected-error @+1 {{expected 32 warp-wide matrix A elements}}
				%d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [8, 8, 4]} : (vector<1x2xf64>, vector<1x1xf64>, vector<1x2xf64>) -> vector<1x2xf64>
				return %d : vector<1x2xf64>
				}
				// -----

				// Exact-shape check on C for f64 operands: vector<2x1xf64> has the right
				// element count (2 * 32 = 64 = m * n) but is transposed. With one
				// fundamental tile in each of m and n and two accumulator elements per
				// thread per tile, the expected shape is 1 x 2.
				func.func @m8n8k4_fp64_vector_shape_c_extended(%arg0: vector<1x1xf64>, %arg1: vector<1x1xf64>, %arg2: vector<2x1xf64>) -> vector<2x1xf64> {
				// expected-error @+1 {{expected matrix C to be shaped (1 x 2)}}
				%d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [8, 8, 4]} : (vector<1x1xf64>, vector<1x1xf64>, vector<2x1xf64>) -> vector<2x1xf64>
				return %d : vector<2x1xf64>
				}
				// -----

				// Warp-wide size check on B for i8 operands: A passes
				// (16 * 32 = 512 = m * k), but vector<4x4xi8> for B gives
				// 16 * 32 = 512 while k * n = 32 * 8 = 256.
				func.func @m16n8k32_int8_vector_shape_b(%arg0: vector<4x4xi8>, %arg1: vector<4x4xi8>, %arg2: vector<2x2xi32>) -> vector<2x2xi32> {
				// expected-error @+1 {{expected 256 warp-wide matrix B elements}}
				%d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 32]} : (vector<4x4xi8>, vector<4x4xi8>, vector<2x2xi32>) -> vector<2x2xi32>
				return %d : vector<2x2xi32>
				}
				// -----
				// Unsupported element type: A is i32, which is not among the element
				// types the verifier handles. Note that A (i32) and B (i8) also differ,
				// yet the diagnostic expected here is the data-type one rather than
				// "expected same data type for matrix A and matrix B".
				func.func @m16n8k32_int32_datatype(%arg0: vector<4x4xi32>, %arg1: vector<2x4xi8>, %arg2: vector<2x2xi32>) -> vector<2x2xi32> {
				// expected-error @+1 {{expected input data type (i4,i8,f16,bf16,tf32,f64) supported by nvgpu.mma.sync}}
				%d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 32]} : (vector<4x4xi32>, vector<2x4xi8>, vector<2x2xi32>) -> vector<2x2xi32>
				return %d : vector<2x2xi32>
				}
				// -----

	func.func @async_cp_memory_space(%dst : memref<16xf32>, %src : memref<16xf32>, %i : index) -> () {			func.func @async_cp_memory_space(%dst : memref<16xf32>, %src : memref<16xf32>, %i : index) -> () {
	// expected-error @+1 {{destination memref must have memory space 3}}			// expected-error @+1 {{destination memref must have memory space 3}}
	nvgpu.device_async_copy %src[%i], %dst[%i], 16 : memref<16xf32> to memref<16xf32>			nvgpu.device_async_copy %src[%i], %dst[%i], 16 : memref<16xf32> to memref<16xf32>
	return			return
	}			}

	// -----			// -----
	▲ Show 20 Lines • Show All 46 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][NVGPU] Verifiers for nvgpu.mma.sync Op
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 443359

mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td

mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp

mlir/test/Dialect/NVGPU/invalid.mlir

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][NVGPU] Verifiers for nvgpu.mma.sync Op
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 443359

mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td

mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp

mlir/test/Dialect/NVGPU/invalid.mlir

[mlir][NVGPU] Verifiers for nvgpu.mma.sync Op
ClosedPublic