This is an archive of the discontinued LLVM Phabricator instance.

[MLIR][GPU] Add NvGpu mma.sync path to the VectorToGPU pass
ClosedPublic

Authored by christopherbate on Apr 1 2022, 2:11 PM.

Details

Summary

This change adds an option to lower to NvGpu dialect ops during the
VectorToGPU conversion pass. Because this transformation reuses
existing VectorToGPU logic, a separate VectorToNvGpu conversion pass is
not created; instead, the option use-nvgpu is added to the VectorToGPU
pass. When it is true, the pass attempts to convert slices rooted at
vector.contract operations into nvgpu.mma.sync ops, and
vector.transfer_read ops are converted to either nvgpu.ldmatrix or
one or more vector.load operations. The specific data loaded by each
thread depends on its thread id within a subgroup (warp). These index
calculations depend on the data type and shape of the MMA op, as
specified by the downstream PTX documentation. The code supporting
these details is separated into NvGpuSupport.cpp|h.
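
For orientation, here is a minimal sketch of the kind of IR this path produces. The fragment shapes, numTiles values, and memref types below follow the m16n8k16 f16 case and are illustrative assumptions, not excerpts from the patch or its tests:

    // Per-thread operand fragments loaded from shared memory (address space 3).
    %a = nvgpu.ldmatrix %A[%c0, %c0] {numTiles = 4 : i32, transpose = false}
         : memref<16x16xf16, 3> -> vector<4x2xf16>
    %b = nvgpu.ldmatrix %B[%c0, %c0] {numTiles = 2 : i32, transpose = true}
         : memref<16x8xf16, 3> -> vector<2x2xf16>
    // %c is the accumulator fragment held by this thread (vector<2x2xf16>).
    // The vector.contract at the root of the slice becomes a single mma.sync.
    %d = nvgpu.mma.sync(%a, %b, %c) {mmaShape = [16, 8, 16]}
         : (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>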

Diff Detail

Event Timeline

christopherbate created this revision. Apr 1 2022, 2:11 PM
Herald added a project: Restricted Project.
christopherbate requested review of this revision. Apr 1 2022, 2:11 PM

Removed dead code

Fix ldmatrix use condition to check address space

Thanks for contributing these. Can you separate out the VectorToGPU changes into another revision?

mlir/include/mlir/Conversion/VectorToGPU/VectorToGPU.h
20–24

Can you please document useWmma here?

mlir/include/mlir/Dialect/GPU/GPUDialect.h
172 (On Diff #420382)

Doc comment here.

Thanks for contributing these. Can you separate out the VectorToGPU changes into another revision?

OK, I will separate this into two different revisions

Need to rebase this on the recent move of the mma.sync and ldmatrix ops from the gpu dialect to the nvgpu dialect. Should have this updated tomorrow morning.

Rebased on top-of-tree. The first part of this patch was already merged
(the ldmatrix and mma.sync ops in the nvgpu dialect). What remains are the
lowerings from vector to nvgpu.

christopherbate added a comment (edited). Apr 26 2022, 8:02 PM

I introduced an option use-nvgpu to the VectorToGPU pass rather than introducing a VectorToNvGPU pass. The reason is that a fair amount of code can be shared. I think we should still break this conversion into its own pass (VectorToNvGPU) and factor out the common utilities that can be shared by both. Pending discussion/input from others, I'm happy to do that in this patch, or land this patch and follow up with the creation of the new pass (or keep it as is/something else).
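
For reference, enabling the new path looks something like the following RUN line, assuming the standard mlir-opt pass-option syntax (the exact invocation in the committed test may differ):

    // RUN: mlir-opt %s -convert-vector-to-gpu="use-nvgpu=true" -canonicalize | FileCheck %s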

Add documentation / address comments.

mlir/include/mlir/Conversion/VectorToGPU/VectorToGPU.h
20–24

Done. I can add more detail if required, but basically the canonical forms of the contractions differ between the two paths, which is why this is required.
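
Concretely, the difference comes down to which indexing maps each path treats as canonical for the vector.contract. A minimal sketch (the maps below are the usual conventions, not copied from the patch):

    // "row-row-row" (wmma canonical form): B is indexed as (k, n).
    indexing_maps = [affine_map<(m, n, k) -> (m, k)>,
                     affine_map<(m, n, k) -> (k, n)>,
                     affine_map<(m, n, k) -> (m, n)>]

    // "row-col-row" (mma.sync canonical form): B is indexed as (n, k).
    indexing_maps = [affine_map<(m, n, k) -> (m, k)>,
                     affine_map<(m, n, k) -> (n, k)>,
                     affine_map<(m, n, k) -> (m, n)>]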

christopherbate edited the summary of this revision. Apr 26 2022, 8:14 PM
nirvedhmeshram added inline comments.
mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
83

Why is this ordering different from the wmma case?

nirvedhmeshram added inline comments. May 2 2022, 1:46 PM
mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
826

Isn't this always false, so the nvgpu sub-path is never used?

nirvedhmeshram added inline comments. May 2 2022, 2:11 PM
mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
826

I see that for the nvgpu path another function is provided. Sorry about the noise.

christopherbate marked 3 inline comments as done. May 5 2022, 12:36 PM
christopherbate added inline comments.
mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
83

The wmma code treats the canonical layout as "row-row-row", whereas we need "row-col-row", because "row-col-row" is what the actual mma.sync instructions expect. If the user provides data as "row-row-row", we need to ensure that a transpose operation is manifested in the IR (and merged into the vector.transfer_read). This then serves as an indicator that the elements should be transposed during loading, either by passing the "transpose = true" attribute to nvgpu.ldmatrix or by loading the elements individually in a transposed manner.
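
As a sketch of what that looks like in IR (a hypothetical B-operand read; the shapes, types, and numTiles value are illustrative and not taken from the patch):

    // Row-major data read through a transposing permutation map.
    %b = vector.transfer_read %B[%c0, %c0], %cst
           {in_bounds = [true, true],
            permutation_map = affine_map<(d0, d1) -> (d1, d0)>}
           : memref<16x8xf16, 3>, vector<8x16xf16>

    // The transpose folded into the transfer_read signals that the load itself
    // should transpose the elements:
    %frag = nvgpu.ldmatrix %B[%c0, %c0] {numTiles = 2 : i32, transpose = true}
            : memref<16x8xf16, 3> -> vector<2x2xf16>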

This looks good to me, thanks for adding this! A few minor comments.

mlir/include/mlir/Conversion/VectorToGPU/NvGpuSupport.h
1 (On Diff #425402)

Is this header meant to be public? If not, I think it would be better to move it into lib/Conversion/VectorToGPU. There are precedents for such cases in lib/Conversion if you want to see examples.

30 (On Diff #425402)

I would remove this namespace, as it isn't consistent with how other dialect conversions are written.

mlir/lib/Conversion/VectorToGPU/NvGpuSupport.cpp
31

Could we avoid using gpu::MMAMatrixType at all during conversion to nvgpu ops? This type is meant to be used with the wmma operations, and adding a dependency is not ideal. I think we could just use a simple structure to represent the matrix kind we want.

105

You don't really need a builder just to create an AffineMap, as it is not an operation. It seems like the builder is only used to get the context; you can get that from elementType instead.

216

nit: spell out type

mlir/test/Conversion/VectorToGPU/vector-to-mma-ops-mma-sync.mlir
287

nit: missing newline

ThomasRaoux added inline comments. May 11 2022, 6:23 AM
mlir/test/Conversion/VectorToGPU/vector-to-mma-ops-mma-sync.mlir
8

nit: Note that the trick to make CHECK-LABEL work with map variables is to do:

// CHECK-DAG: [[$rowA0_map:#.+]] = affine_map<()[s0] -> (s0 mod 16 + 1)>

This way the variables are not reset by CHECK-LABEL.
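
For instance (the function name and the affine.apply use below are made up for illustration), a $-prefixed variable defined before the label remains usable after it, because FileCheck treats $-prefixed variables as global and only clears the non-global ones at each CHECK-LABEL:

    // CHECK-DAG: [[$rowA0_map:#.+]] = affine_map<()[s0] -> (s0 mod 16 + 1)>

    // CHECK-LABEL: func @m16n8k16_fp16
    //       CHECK: affine.apply [[$rowA0_map]]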

237

should be [[C0]]

christopherbate marked an inline comment as done.

Small functional changes and additional tests based on TF32 and Int8 testing.

christopherbate retitled this revision from [MLIR][GPU] Add VectorToGPU, GPU to nvvm.mma.sync lowering path to [MLIR][GPU] Add NvGpu mma.sync path to the VectorToGPU pass. May 13 2022, 9:11 PM
christopherbate edited the summary of this revision.

I think this should complete any functional updates. I still need to address Thomas' comments.

christopherbate marked an inline comment as done.
christopherbate retitled this revision from [MLIR][GPU] Add NvGpu mma.sync path to the VectorToGPU pass to [MLIR][GPU] Add NvGpu mma.sync path to the VectorToGPU pass.

Address reviewer comments:
Remove use of GPU Dialect's WarpMatrixFragmentType in NvGpu conversion path.
Fix variable naming in FileCheck tests to allow for CHECK-LABEL.
Cleanup misc typos/small fixes.

christopherbate marked 5 inline comments as done and an inline comment as not done. May 17 2022, 4:59 PM

Addressed Thomas' comments.

mlir/lib/Conversion/VectorToGPU/NvGpuSupport.cpp
216

I think I fixed this; let me know if there's still an issue.

This revision is now accepted and ready to land. May 18 2022, 3:10 PM
This revision was automatically updated to reflect the committed changes.
christopherbate marked an inline comment as done.