Diff 469087

mlir/include/mlir/Dialect/NVGPU/Utils/MMAUtils.h

//===-- MMAUtils.h - MLIR NVGPU dialect utilities for MMA operations-------===//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//===----------------------------------------------------------------------===//

// This file provides utilities to assist in the lowering of other dialects

// (e.g. Vector) to `nvgpu.mma.*` dialect operations.

//===----------------------------------------------------------------------===//

#ifndef MLIR_DIALECT_NVGPU_UTILS_MMAUTILS_H

#define MLIR_DIALECT_NVGPU_UTILS_MMAUTILS_H

#include "mlir/Dialect/LLVMIR/NVVMDialect.h"

#include "mlir/Dialect/Vector/IR/VectorOps.h"

#include "mlir/IR/PatternMatch.h"

#include "mlir/IR/Types.h"

namespace mlir {

namespace vector {

enum class IteratorType : uint32_t;

class ContractionOp;

} // namespace vector

namespace NVVM {

enum class MMALayout : uint32_t;

} // namespace NVVM

namespace nvgpu {

/// Represents the role of an operand in an MMA instruction:

/// `result := matmul(A, B) + C`

enum class MatMulOperandRole : int32_t { A = 0, B, C };

/// Returns the first user of the `op` that is vector.contract. If no

/// vector.contract user exists, return failure.

christopherbateUnsubmitted

Done

/// Returns the first user of the `op` that is vector.contract. If no

- /// vector.contract user are ther for the `op`, return failure.

+ /// vector.contract user exists, return failure.

FailureOr<vector::ContractionOp> getUserContract(Operation *op);

christopherbate:

FailureOr<vector::ContractionOp> getUserContract(Operation *op);

/// Collects information about a warp-level matrix operand represented by a

/// VectorType.

struct WarpMatrixInfo {

/// Warp-level vector type of the operand. `getWarpMatrixInfo` takes it from
/// the written vector of a `vector.transfer_write`, or from the type of
/// result 0 for the other operations it accepts.
VectorType vectorType;

/// Role the operand plays in `result := matmul(A, B) + C`.
/// `getWarpMatrixInfo` defaults this to `C` and switches it to `A` or `B`
/// only when the value is the lhs or rhs of a `vector.contract` user.
MatMulOperandRole operandRole;

};

/// If `op` is a `vector.transfer_write`, return the `WarpMatrixInfo` for the

▲ Show 20 Lines • Show All 72 Lines • Show Last 20 Lines

mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp

Show First 20 Lines • Show All 186 Lines • ▼ Show 20 Lines	convertElementwiseOpToMMA(Operation *op) {
return llvm::None;		return llvm::None;
}		}

/// Return true if the op is supported as elementwise op on MMAMatrix type.		/// Return true if the op is supported as elementwise op on MMAMatrix type.
static bool elementwiseSupportsMMAMatrixType(Operation *op) {		static bool elementwiseSupportsMMAMatrixType(Operation *op) {
return convertElementwiseOpToMMA(op).has_value();		return convertElementwiseOpToMMA(op).has_value();
}		}

		/// Returns true if the extract strided slice op is supported with `mma.sync`
		/// path.
		static bool
		christopherbateUnsubmitted Not Done Reply Inline Actions Maybe just return false where there are currently `assert` statements. What's going to happen on release builds if this occurs? The purpose of the function is to validate what is expected after all. I know we're kind of in a weird place here in terms of emitting diagnostic information. IMO going forward we should convert these validation functions to return LogicalResult and replace the asserts with return op.emitWarning(msg). Currently if a user is messing with mlir-opt and tries this pass with invalid input IR, they'll get no feedback on why it fails to convert or, in the worst case, a crash. For the `useNvGpu` parameter, it might be simpler to just omit that and `return useNvGpu && extractStridedSliceSupportsMMAMatrixType(...)` in the `supportsMMaMatrixType` function christopherbate: Maybe just return false where there are currently `assert` statements. What's going to happen…
		extractStridedSliceSupportsMMAMatrixType(vector::ExtractStridedSliceOp op) {

		FailureOr<nvgpu::WarpMatrixInfo> warpMatrixInfo =
		nvgpu::getWarpMatrixInfo(op);
		if (failed(warpMatrixInfo))
		return false;

		FailureOr<vector::ContractionOp> contractOp = nvgpu::getUserContract(op);
		if (failed(contractOp))
		return false;

		// Handle vector.extract_strided_slice on registers containing
		// matrixB and matrixC operands. vector.extract_strided_slice op
		// is not supported on registers containing matrixA operands.
		if (warpMatrixInfo->operandRole == nvgpu::MatMulOperandRole::B)
		return (op->getResult(0).getType().cast<VectorType>() ==
		(*contractOp).getRhs().getType().cast<VectorType>());
		else if (warpMatrixInfo->operandRole == nvgpu::MatMulOperandRole::C)
		return (op->getResult(0).getType().cast<VectorType>() ==
		ThomasRaouxUnsubmitted Done Reply Inline Actions nit: this comment is a bit confusing, as the code could reach here; it is just that this cannot be converted to simt code. I would remove it or rephrase it, saying we only handle matrixB and matrixC ThomasRaoux: nit: this comment is a bit confusing, as the code could reach here; it is just that this…
		(*contractOp).getAcc().getType().cast<VectorType>());

		return false;
		christopherbateUnsubmitted Done Reply Inline Actions Can we omit the case for operand A and simplify to the below? if(operandB) return op->getResult() == contractOp->getRhs(); if(operandC) return op->getResult() == contractOp->getAcc(); return false; christopherbate: Can we omit the case for operand A and simplify to the below? ``` if(operandB) return op…
		manishucsdAuthorUnsubmitted Done Reply Inline Actions Done! thanks for the suggestion. manishucsd: Done! thanks for the suggestion.
		}

static bool supportsMMaMatrixType(Operation *op, bool useNvGpu) {		static bool supportsMMaMatrixType(Operation *op, bool useNvGpu) {
if (isa<scf::ForOp, scf::YieldOp>(op))		if (isa<scf::ForOp, scf::YieldOp>(op))
return true;		return true;
if (auto transferRead = dyn_cast<vector::TransferReadOp>(op))		if (auto transferRead = dyn_cast<vector::TransferReadOp>(op))
return transferReadSupportsMMAMatrixType(transferRead, useNvGpu);		return transferReadSupportsMMAMatrixType(transferRead, useNvGpu);
if (auto transferWrite = dyn_cast<vector::TransferWriteOp>(op))		if (auto transferWrite = dyn_cast<vector::TransferWriteOp>(op))
return transferWriteSupportsMMAMatrixType(transferWrite);		return transferWriteSupportsMMAMatrixType(transferWrite);
		if (auto extractStridedSlice = dyn_cast<vector::ExtractStridedSliceOp>(op))
		return useNvGpu &&
		extractStridedSliceSupportsMMAMatrixType(extractStridedSlice);
if (auto contract = dyn_cast<vector::ContractionOp>(op))		if (auto contract = dyn_cast<vector::ContractionOp>(op))
return contractSupportsMMAMatrixType(contract, useNvGpu);		return contractSupportsMMAMatrixType(contract, useNvGpu);
if (auto constant = dyn_cast<arith::ConstantOp>(op))		if (auto constant = dyn_cast<arith::ConstantOp>(op))
return constantSupportsMMAMatrixType(constant);		return constantSupportsMMAMatrixType(constant);
if (auto broadcast = dyn_cast<vector::BroadcastOp>(op))		if (auto broadcast = dyn_cast<vector::BroadcastOp>(op))
return broadcastSupportsMMAMatrixType(broadcast);		return broadcastSupportsMMAMatrixType(broadcast);
return elementwiseSupportsMMAMatrixType(op);		return elementwiseSupportsMMAMatrixType(op);
}		}
▲ Show 20 Lines • Show All 123 Lines • ▼ Show 20 Lines	LogicalResult matchAndRewrite(vector::ContractionOp op,
rewriter.replaceOpWithNewOp<vector::ContractionOp>(		rewriter.replaceOpWithNewOp<vector::ContractionOp>(
op, lhs, rhs, res,		op, lhs, rhs, res,
rewriter.getAffineMapArrayAttr(infer({{m, k}, {k, n}, {m, n}})),		rewriter.getAffineMapArrayAttr(infer({{m, k}, {k, n}, {m, n}})),
op.getIteratorTypes());		op.getIteratorTypes());
return success();		return success();
}		}
};		};

// Merge transpose op into the transfer read op. Transpose are not supported on		// Fold transpose op into the transfer read op. Nvgpu mma.sync op only supports
// MMA types but MMA load can transpose the matrix when loading.		// row-, column-, and row-major layout for matrixA, matrixB, and matrixC,
		// respectively. We can fold the transpose operation when loading the data from
		// Shared Memory to registers.
struct CombineTransferReadOpTranspose final		struct CombineTransferReadOpTranspose final
: public OpRewritePattern<vector::TransposeOp> {		: public OpRewritePattern<vector::TransposeOp> {
using OpRewritePattern<vector::TransposeOp>::OpRewritePattern;		using OpRewritePattern<vector::TransposeOp>::OpRewritePattern;

LogicalResult matchAndRewrite(vector::TransposeOp op,		LogicalResult matchAndRewrite(vector::TransposeOp op,
PatternRewriter &rewriter) const override {		PatternRewriter &rewriter) const override {
auto transferReadOp =		auto transferReadOp =
op.getVector().getDefiningOp<vector::TransferReadOp>();		op.getVector().getDefiningOp<vector::TransferReadOp>();
▲ Show 20 Lines • Show All 264 Lines • ▼ Show 20 Lines	convertTransferReadToLoads(vector::TransferReadOp op,
bool isLdMatrixCompatible =		bool isLdMatrixCompatible =
op.getSource().getType().cast<MemRefType>().getMemorySpaceAsInt() == 3 &&		op.getSource().getType().cast<MemRefType>().getMemorySpaceAsInt() == 3 &&
nvgpu::inferTileWidthInBits(*warpMatrixInfo) == 128;		nvgpu::inferTileWidthInBits(*warpMatrixInfo) == 128;

VectorType vecTy = op.getVectorType();		VectorType vecTy = op.getVectorType();
int64_t bitWidth = vecTy.getElementType().getIntOrFloatBitWidth();		int64_t bitWidth = vecTy.getElementType().getIntOrFloatBitWidth();

// When we are transposing the B operand, ldmatrix will only work if we have		// When we are transposing the B operand, ldmatrix will only work if we have
// at least 8 rows to read and the width to read for the transpose is 128		// at least 8 rows to read and the width to read for the transpose is 128
// bits.		// bits.
if (!op.getPermutationMap().isMinorIdentity() &&		if (!op.getPermutationMap().isMinorIdentity() &&
(bitWidth != 16 \|\| vecTy.getDimSize(1) < 8 \|\|		(bitWidth != 16 \|\| vecTy.getDimSize(1) < 8 \|\|
vecTy.getDimSize(0) * bitWidth < 128))		vecTy.getDimSize(0) * bitWidth < 128))
isLdMatrixCompatible = false;		isLdMatrixCompatible = false;

if (!isLdMatrixCompatible)		if (!isLdMatrixCompatible)
return createNonLdMatrixLoads(op, b, valueMapping);		return createNonLdMatrixLoads(op, b, valueMapping);
Show All 34 Lines	for (unsigned i = 0; i < vectorType.getShape()[0]; i++) {
getXferIndices<vector::TransferWriteOp>(		getXferIndices<vector::TransferWriteOp>(
b, op, *coords, {laneId, logicalValueId}, newIndices);		b, op, *coords, {laneId, logicalValueId}, newIndices);
b.create<vector::StoreOp>(loc, el, op.getSource(), newIndices);		b.create<vector::StoreOp>(loc, el, op.getSource(), newIndices);
}		}
op->erase();		op->erase();
return success();		return success();
}		}

		/// Appends the integer value of every element of `arrayAttr` to `results`,
		/// preserving element order. Each element is cast to `IntegerAttr`.
		static void populateFromInt64AttrArray(ArrayAttr arrayAttr,
		                                       SmallVectorImpl<int64_t> &results) {
		  for (auto element : arrayAttr) {
		    auto integerAttr = element.cast<IntegerAttr>();
		    results.push_back(integerAttr.getInt());
		  }
		}

		static LogicalResult
		convertExtractStridedSlice(vector::ExtractStridedSliceOp op,
		llvm::DenseMap<Value, Value> &valueMapping) {

		OpBuilder b(op);
		Location loc = op->getLoc();

		FailureOr<nvgpu::WarpMatrixInfo> warpMatrixInfo =
		nvgpu::getWarpMatrixInfo(op);
		if (failed(warpMatrixInfo))
		return failure();

		FailureOr<nvgpu::FragmentElementInfo> mmaSyncFragmentInfo =
		nvgpu::getMmaSyncRegisterType(*warpMatrixInfo);
		if (failed(mmaSyncFragmentInfo))
		return failure();

		// Find the vector.transfer_read whose result vector is being sliced.
		auto transferReadOp = op.getVector().getDefiningOp<vector::TransferReadOp>();
		if (!transferReadOp)
		christopherbateUnsubmitted Done Reply Inline Actions `if(!transferReadOp) return failure();` christopherbate: `if(!transferReadOp) return failure();`
		return failure();

		warpMatrixInfo = nvgpu::getWarpMatrixInfo(transferReadOp);
		if (failed(warpMatrixInfo))
		return failure();

		FailureOr<nvgpu::FragmentElementInfo> ldFragmentInfo =
		nvgpu::getMmaSyncRegisterType(*warpMatrixInfo);
		if (failed(ldFragmentInfo))
		return failure();

		assert(
		(mmaSyncFragmentInfo->elementsPerRegister ==
		ldFragmentInfo->elementsPerRegister) &&
		"Number of elements per register should be same for load and mma.sync");

		// Create vector.extract_strided_slice op for thread-owned fragments.
		std::array<int64_t, 2> strides = {1,
		1}; // stride for extract slice is always 1.
		std::array<int64_t, 2> sliceShape = {
		mmaSyncFragmentInfo->numRegistersPerFragment,
		mmaSyncFragmentInfo->elementsPerRegister};
		auto sourceVector = valueMapping.find(transferReadOp)->second;

		// offset and sizes at warp-level of ownership.
		SmallVector<int64_t> offsets;
		populateFromInt64AttrArray(op.getOffsets(), offsets);

		SmallVector<int64_t> sizes;
		populateFromInt64AttrArray(op.getSizes(), sizes);
		ArrayRef<int64_t> warpVectorShape = op.getVectorType().getShape();

		// Compute offset in vector registers. Note that the mma.sync vector registers
		// are shaped as numberOfFragments x numberOfRegistersPerFragment. The vector
		// registers can only be sliced along numberOfFragments, i.e., sliceOffset[0].
		std::array<int64_t, 2> sliceOffset = {0, 0};

		if (offsets[0] && offsets[1])
		return op->emitError() << "Slicing fragments in 2D is not supported. ";
		else if (offsets[0])
		sliceOffset[0] = (warpVectorShape[0] / offsets[0]);
		else if (offsets[1])
		sliceOffset[0] = (warpVectorShape[1] / offsets[1]);

		Value newOp = b.create<vector::ExtractStridedSliceOp>(
		loc, sourceVector, sliceOffset, sliceShape, strides);

		valueMapping[op] = newOp;
		return success();
		}

static void convertContractOp(vector::ContractionOp op,		static void convertContractOp(vector::ContractionOp op,
llvm::DenseMap<Value, Value> &valueMapping) {		llvm::DenseMap<Value, Value> &valueMapping) {
OpBuilder b(op);		OpBuilder b(op);
Value opA = valueMapping.find(op.getLhs())->second;		Value opA = valueMapping.find(op.getLhs())->second;
Value opB = valueMapping.find(op.getRhs())->second;		Value opB = valueMapping.find(op.getRhs())->second;
Value opC = valueMapping.find(op.getAcc())->second;		Value opC = valueMapping.find(op.getAcc())->second;
Value matmul = b.create<gpu::SubgroupMmaComputeOp>(op.getLoc(), opC.getType(),		Value matmul = b.create<gpu::SubgroupMmaComputeOp>(op.getLoc(), opC.getType(),
opA, opB, opC);		opA, opB, opC);
▲ Show 20 Lines • Show All 171 Lines • ▼ Show 20 Lines	for (Operation *op : ops) {
if (llvm::TypeSwitch<Operation *, LogicalResult>(op)		if (llvm::TypeSwitch<Operation *, LogicalResult>(op)
.Case([&](vector::TransferReadOp transferReadOp) {		.Case([&](vector::TransferReadOp transferReadOp) {
return convertTransferReadToLoads(transferReadOp, valueMapping);		return convertTransferReadToLoads(transferReadOp, valueMapping);
})		})
.Case([&](vector::TransferWriteOp transferWriteOp) {		.Case([&](vector::TransferWriteOp transferWriteOp) {
return convertTransferWriteToStores(transferWriteOp,		return convertTransferWriteToStores(transferWriteOp,
valueMapping);		valueMapping);
})		})
		.Case([&](vector::ExtractStridedSliceOp extractStridedSliceOp) {
		return convertExtractStridedSlice(extractStridedSliceOp,
		valueMapping);
		})
.Case([&](vector::ContractionOp contractionOp) {		.Case([&](vector::ContractionOp contractionOp) {
return convertContractOpToMmaSync(contractionOp, valueMapping);		return convertContractOpToMmaSync(contractionOp, valueMapping);
})		})
.Case([&](scf::ForOp forOp) {		.Case([&](scf::ForOp forOp) {
convertForOp(forOp, valueMapping);		convertForOp(forOp, valueMapping);
return success();		return success();
})		})
.Case([&](scf::YieldOp yieldOp) {		.Case([&](scf::YieldOp yieldOp) {
▲ Show 20 Lines • Show All 48 Lines • Show Last 20 Lines

mlir/lib/Dialect/NVGPU/Utils/MMAUtils.cpp

Show All 39 Lines	static std::array<int64_t, 2> getTileShape(ArrayRef<int64_t> operandShape,
Type elementType,		Type elementType,
int64_t lineSizeBits) {		int64_t lineSizeBits) {
// For each 8x128bit square, a thread is responsible for one 32bit register.		// For each 8x128bit square, a thread is responsible for one 32bit register.
return {operandShape[0] / kNumRowsPerTile,		return {operandShape[0] / kNumRowsPerTile,
(operandShape[1] * elementType.getIntOrFloatBitWidth()) /		(operandShape[1] * elementType.getIntOrFloatBitWidth()) /
lineSizeBits};		lineSizeBits};
}		}

		/// Returns the first user of the `op` that is vector.contract. If no
		/// vector.contract user exists, return failure.
		FailureOr<vector::ContractionOp> nvgpu::getUserContract(Operation *op) {
		christopherbateUnsubmitted Not Done Reply Inline Actions Move to `VectorToGPU.cpp` as a static method? christopherbate: Move to `VectorToGPU.cpp` as a static method?
		manishucsdAuthorUnsubmitted Done Reply Inline Actions Can this helper function also be used in `getWarpMatrixInfo`? manishucsd: Can this helper function also be used in `getWarpMatrixInfo`?
		for (Operation *user : op->getUsers()) {
		if (auto contractOp = dyn_cast<vector::ContractionOp>(user))
		return contractOp;
		}
		return failure();
		}

FailureOr<WarpMatrixInfo> nvgpu::getWarpMatrixInfo(Operation *op) {		FailureOr<WarpMatrixInfo> nvgpu::getWarpMatrixInfo(Operation *op) {
WarpMatrixInfo info;		WarpMatrixInfo info;

// Determine the vector type.		// Determine the vector type at warp-level.
if (vector::TransferWriteOp writeOp = dyn_cast<vector::TransferWriteOp>(op)) {		if (vector::TransferWriteOp writeOp = dyn_cast<vector::TransferWriteOp>(op)) {
info.vectorType = writeOp.getVectorType();		info.vectorType = writeOp.getVectorType();
} else if (isa<vector::TransferReadOp, vector::ContractionOp,		} else if (isa<vector::TransferReadOp, vector::ContractionOp,
arith::ConstantOp>(op)) {		vector::ExtractStridedSliceOp, arith::ConstantOp>(op)) {
info.vectorType = op->getResult(0).getType().cast<VectorType>();		info.vectorType = op->getResult(0).getType().cast<VectorType>();
} else {		} else {
return op->emitError()		return op->emitError()
<< "unhandled operation type in nvgpu.mma.sync conversion path";		<< "unhandled operation type in nvgpu.mma.sync conversion path";
}		}

// Determine the operand role. We assume it is an accumulator/result unless it		// Determine the operand role. We assume it is an accumulator/result unless it
// is directly consumed by a `vector.contract` op.		// is directly consumed by a `vector.contract` op.
info.operandRole = MatMulOperandRole::C;		info.operandRole = MatMulOperandRole::C;
for (Operation *user : op->getUsers()) {		FailureOr<vector::ContractionOp> contractOp = getUserContract(op);
auto contract = dyn_cast<vector::ContractionOp>(user);		if (failed(contractOp))
if (!contract)		return info;
continue;
if (contract.getLhs() == op->getResult(0)) {		if ((*contractOp).getLhs() == op->getResult(0))
info.operandRole = MatMulOperandRole::A;		info.operandRole = MatMulOperandRole::A;
break;		else if ((*contractOp).getRhs() == op->getResult(0))
}
if (contract.getRhs() == op->getResult(0)) {
info.operandRole = MatMulOperandRole::B;		info.operandRole = MatMulOperandRole::B;
break;
}
}
return info;		return info;
}		}

int64_t nvgpu::inferTileWidthInBits(const WarpMatrixInfo &type) {		int64_t nvgpu::inferTileWidthInBits(const WarpMatrixInfo &type) {
bool isAcc = isAccumulatorOrResult(type.operandRole);		bool isAcc = isAccumulatorOrResult(type.operandRole);
Type elType = type.vectorType.getElementType();		Type elType = type.vectorType.getElementType();
if (isAcc && elType.getIntOrFloatBitWidth() == 32) {		if (isAcc && elType.getIntOrFloatBitWidth() == 32) {
return 256;		return 256;
▲ Show 20 Lines • Show All 240 Lines • Show Last 20 Lines

mlir/test/Conversion/VectorToGPU/vector-to-mma-ops-mma-sync.mlir

Show First 20 Lines • Show All 158 Lines • ▼ Show 20 Lines	func.func @m8n8k4_f64_row_row_row(%arg0: memref<128x128xf64>, %arg1: memref<128x128xf64>, %arg2: memref<128x128xf64>) {
// CHECK-DAG: [[col:%.+]] = affine.apply [[$colC0_map]]		// CHECK-DAG: [[col:%.+]] = affine.apply [[$colC0_map]]
// CHECK: vector.store {{%.+}}, %arg2[[[row]], [[col]]] : memref<128x128xf64>, vector<2xf64>		// CHECK: vector.store {{%.+}}, %arg2[[[row]], [[col]]] : memref<128x128xf64>, vector<2xf64>
vector.transfer_write %D, %arg2[%c49, %c40] {in_bounds = [true, true]} : vector<8x8xf64>, memref<128x128xf64>		vector.transfer_write %D, %arg2[%c49, %c40] {in_bounds = [true, true]} : vector<8x8xf64>, memref<128x128xf64>
return		return
}		}

// -----		// -----

//#########################################################		//#########################################################################
// FP16 row-row-row		// FP16 row-row-row (ldmatrix x4 for matrixA and ldmatrix x2 for matrixB)
//#########################################################		//#########################################################################

#map0 = affine_map<(d0, d1) -> (d1, d0)>		#map0 = affine_map<(d0, d1) -> (d1, d0)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>		#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>		#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>		#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>

// CHECK-DAG: [[$rowA_map:#.+]] = affine_map<()[s0] -> (s0 mod 16 + 1)>		// CHECK-DAG: [[$rowA_map:#.+]] = affine_map<()[s0] -> (s0 mod 16 + 1)>
// CHECK-DAG: [[$colA_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 16) * 8 + 3)>		// CHECK-DAG: [[$colA_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 16) * 8 + 3)>
Show All 20 Lines	func.func @m16n8k16_fp16_row_row_row(%arg0: memref<20x20xf16, 3>, %arg1: memref<20x20xf16, 3>, %arg2: memref<20x20xf16, 3>) {
%C = vector.transfer_read %arg2[%c0, %c0], %cst {in_bounds = [true, true]} : memref<20x20xf16, 3>, vector<16x8xf16>		%C = vector.transfer_read %arg2[%c0, %c0], %cst {in_bounds = [true, true]} : memref<20x20xf16, 3>, vector<16x8xf16>
%D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<16x16xf16>, vector<8x16xf16> into vector<16x8xf16>		%D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<16x16xf16>, vector<8x16xf16> into vector<16x8xf16>
vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<20x20xf16, 3>		vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<20x20xf16, 3>
return		return
}		}

// -----		// -----

		//#########################################################################
		// FP16 row-row-row (ldmatrix x4 for matrixA and ldmatrix x4 for matrixB)
		//#########################################################################

		// CHECK-DAG: [[$rowA_map:#.+]] = affine_map<()[s0] -> (s0 mod 16)>
		// CHECK-DAG: [[$colA_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 16) * 8)>
		// CHECK-DAG: [[$rowB_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 8) * 8 - ((s0 floordiv 8) floordiv 2) * 16)>
		// CHECK-DAG: [[$colB_map:#.+]] = affine_map<()[s0] -> (s0 mod 8 + ((s0 floordiv 8) floordiv 2) * 8)>

		#map0 = affine_map<(d0, d1) -> (d1, d0)>
		#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
		#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
		#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>

		// CHECK-LABEL: func @m16n16k16_mmasync16816_fp16_f16_row_row_row
		// Computes a 16x16x16 fp16 matmul as two native m16n8k16 mma.sync operations.
		// A, B and C are each loaded once as 16x16 warp-level tiles (ldmatrix x4);
		// B and C are then split with vector.extract_strided_slice into the two
		// halves consumed by each vector.contract.
		func.func @m16n16k16_mmasync16816_fp16_f16_row_row_row(%arg0: memref<42x32xf16, 3>, %arg1: memref<32x64xf16, 3>, %arg2: memref<42x64xf16, 3>) {
		%cst_0 = arith.constant dense<0.000000e+00> : vector<16x8xf16>
		%c0 = arith.constant 0 : index
		%c8 = arith.constant 8 : index
		%cst = arith.constant 0.000000e+00 : f16

		// CHECK-DAG: [[row:%.+]] = affine.apply [[$rowA_map]]
		// CHECK-DAG: [[col:%.+]] = affine.apply [[$colA_map]]
		// CHECK: [[fragmentA:%.+]] = nvgpu.ldmatrix %arg0[[[row]], [[col]]] {numTiles = 4 : i32, transpose = false}
		%A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : memref<42x32xf16, 3>, vector<16x16xf16>

		// CHECK-DAG: [[row:%.+]] = affine.apply [[$rowB_map]]
		// CHECK-DAG: [[col:%.+]] = affine.apply [[$colB_map]]
		// CHECK-DAG: [[fragmentB:%.+]] = nvgpu.ldmatrix %arg1[[[col]], [[row]]] {numTiles = 4 : i32, transpose = true}
		%B = vector.transfer_read %arg1[%c0, %c0], %cst {permutation_map = #map0, in_bounds = [true, true]} : memref<32x64xf16, 3>, vector<16x16xf16>

		// CHECK-DAG: [[row:%.+]] = affine.apply [[$rowA_map]]
		// CHECK-DAG: [[col:%.+]] = affine.apply [[$colA_map]]
		// CHECK-DAG: [[fragmentC:%.*]] = nvgpu.ldmatrix %arg2[[[row]], [[col]]] {numTiles = 4 : i32, transpose = false}
		%C = vector.transfer_read %arg2[%c0, %c0], %cst {in_bounds = [true, true]} : memref<42x64xf16, 3>, vector<16x16xf16>

		// CHECK-DAG: [[fragmentB0:%.+]] = vector.extract_strided_slice [[fragmentB]] {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x2xf16> to vector<2x2xf16>
		// CHECK-DAG: [[fragmentC0:%.+]] = vector.extract_strided_slice [[fragmentC]] {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x2xf16> to vector<2x2xf16>
		// CHECK: nvgpu.mma.sync([[fragmentA]], [[fragmentB0]], [[fragmentC0]]) {mmaShape = [16, 8, 16]} : (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
		%B0 = vector.extract_strided_slice %B {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16>
		%C0 = vector.extract_strided_slice %C {offsets = [0, 0], sizes = [16, 8], strides = [1, 1]} : vector<16x16xf16> to vector<16x8xf16>
		%D0 = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B0, %C0 : vector<16x16xf16>, vector<8x16xf16> into vector<16x8xf16>
		vector.transfer_write %D0, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<42x64xf16, 3>

		// CHECK-DAG: [[fragmentB1:%.+]] = vector.extract_strided_slice [[fragmentB]] {offsets = [2, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x2xf16> to vector<2x2xf16>
		// CHECK-DAG: [[fragmentC1:%.+]] = vector.extract_strided_slice [[fragmentC]] {offsets = [2, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x2xf16> to vector<2x2xf16>
		// CHECK: nvgpu.mma.sync([[fragmentA]], [[fragmentB1]], [[fragmentC1]]) {mmaShape = [16, 8, 16]} : (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
		%B1 = vector.extract_strided_slice %B {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16>
		%C1 = vector.extract_strided_slice %C {offsets = [0, 8], sizes = [16, 8], strides = [1, 1]} : vector<16x16xf16> to vector<16x8xf16>
		%D1 = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B1, %C1 : vector<16x16xf16>, vector<8x16xf16> into vector<16x8xf16>
		// %D1 accumulates into the slice of %C taken at column offset 8, so it is
		// stored back at column 8 instead of overwriting the %D0 tile at column 0.
		vector.transfer_write %D1, %arg2[%c0, %c8] {in_bounds = [true, true]} : vector<16x8xf16>, memref<42x64xf16, 3>

		return
		}
		// -----

// CHECK-DAG: [[$Arow_map:#.+]] = affine_map<()[s0] -> (s0 mod 16 + 1)>		// CHECK-DAG: [[$Arow_map:#.+]] = affine_map<()[s0] -> (s0 mod 16 + 1)>
// CHECK-DAG: [[$Acol_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 16) * 8 + 3)>		// CHECK-DAG: [[$Acol_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 16) * 8 + 3)>
// CHECK-DAG: [[$Bcol_map:#.+]] = affine_map<() -> (3)>		// CHECK-DAG: [[$Bcol_map:#.+]] = affine_map<() -> (3)>
// CHECK-DAG: [[$Brow_map:#.+]] = affine_map<()[s0] -> (s0 + 3)>		// CHECK-DAG: [[$Brow_map:#.+]] = affine_map<()[s0] -> (s0 + 3)>

#map0 = affine_map<(d0, d1, d2) -> (d2, d1)>		#map0 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>		#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>		#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
▲ Show 20 Lines • Show All 390 Lines • ▼ Show 20 Lines	func.func @m16n8k8_tf32_f32_col_col_row(%arg0: memref<20x20xf32, 3>, %arg1: memref<20x20xf32, 3>, %arg2: memref<20x20xf32>) {
// CHECK: vector.store		// CHECK: vector.store
// CHECK: vector.extract [[d_frag]][1] : vector<2x2xf32>		// CHECK: vector.extract [[d_frag]][1] : vector<2x2xf32>
// CHECK: affine.apply [[$rowC8_map]]		// CHECK: affine.apply [[$rowC8_map]]
// CHECK: affine.apply [[$colC_map]]		// CHECK: affine.apply [[$colC_map]]
// CHECK: vector.store		// CHECK: vector.store
vector.transfer_write %D, %arg2[%c16, %c8] {in_bounds = [true, true]} : vector<16x8xf32>, memref<20x20xf32>		vector.transfer_write %D, %arg2[%c16, %c8] {in_bounds = [true, true]} : vector<16x8xf32>, memref<20x20xf32>
return		return
}		}
No newline at end of file		No newline at end of file

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][nvgpu] Handle Native mma.sync sizes and ldmatrix(x4) for matrixB
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 469087

mlir/include/mlir/Dialect/NVGPU/Utils/MMAUtils.h

mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp

mlir/lib/Dialect/NVGPU/Utils/MMAUtils.cpp

mlir/test/Conversion/VectorToGPU/vector-to-mma-ops-mma-sync.mlir

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][nvgpu] Handle Native mma.sync sizes and ldmatrix(x4) for matrixBClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 469087

mlir/include/mlir/Dialect/NVGPU/Utils/MMAUtils.h

mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp

mlir/lib/Dialect/NVGPU/Utils/MMAUtils.cpp

mlir/test/Conversion/VectorToGPU/vector-to-mma-ops-mma-sync.mlir

[mlir][nvgpu] Handle Native mma.sync sizes and ldmatrix(x4) for matrixB
ClosedPublic