This is an archive of the discontinued LLVM Phabricator instance.

[mlir][VectorToGPU] First step to convert vector ops to GPU MMA ops
ClosedPublic

Authored by ThomasRaoux on May 21 2021, 6:25 PM.

Details

Summary

This is the first step in converting vector ops to MMA operations in order to target GPU tensor core ops. It currently only supports simple cases; transpose and element-wise operations will be added later.

Diff Detail

Event Timeline

ThomasRaoux created this revision. May 21 2021, 6:25 PM
ThomasRaoux requested review of this revision. May 21 2021, 6:25 PM
Herald added a project: Restricted Project.
nicolasvasilache added inline comments.
mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
114

Nit: Analyze?

118

You should be able to walk only vector::ContractionOp ops and save some logic below.
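For reference, a minimal sketch of what that could look like (the helper name is made up and header paths vary across MLIR revisions):

#include "mlir/Dialect/Vector/VectorOps.h"  // path differs in newer MLIR
#include "mlir/IR/BuiltinOps.h"
#include "llvm/ADT/SmallVector.h"

using namespace mlir;

// Walk only the vector.contract ops instead of every operation, so the
// post-walk filtering below becomes unnecessary.
static void collectContractionOps(FuncOp funcOp,
                                  SmallVectorImpl<vector::ContractionOp> &ops) {
  funcOp.walk([&](vector::ContractionOp contractOp) { ops.push_back(contractOp); });
}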

122
auto hasVectorDest = [](Operation *op) {
  return op->getNumResults() == 0 ||
         llvm::any_of(op->getResultTypes(),
                      [](Type t) { return t.isa<VectorType>(); });
};

Also, do you want "any_of" or "all_of" behavior?

I'd also lift it at the top of the function to avoid interleaving logic and simplify the reading.

132
if (llvm::any_of(dependentOp, [](){ return !supports...}))
  return;
148

Can a proper subset of the vector matmul lowering pattern be exposed and plugged here (maybe with some extra lambda)?
This looks like a lot of code to duplicate.

208

We should already have a pattern that rewrites vector.transfer + vector.transpose into the proper memref.transpose + vector.transfer; can we reuse it?
@bkramer for visibility.

264

You need to check for static stride (and bail if not).
In the general case of dynamic stride, we will need some additional abstractions as the stride is currently kept opaque from std.
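For concreteness, a minimal sketch of that check, assuming the strides are recovered with getStridesAndOffset (the helper name is made up, and the exact dynamic-stride query depends on the MLIR revision):

#include "mlir/IR/BuiltinTypes.h"
#include "mlir/Support/LogicalResult.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

using namespace mlir;

// Return true only if every stride of the memref is a static constant; bail
// out on non-strided layouts or any dynamic stride.
static bool hasAllStaticStrides(MemRefType memrefType) {
  int64_t offset;
  SmallVector<int64_t, 4> strides;
  if (failed(getStridesAndOffset(memrefType, strides, offset)))
    return false;
  return llvm::none_of(strides, [](int64_t stride) {
    return stride == MemRefType::getDynamicStrideOrOffset();
  });
}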

283

You need to check for static stride (and bail if not).
In the general case of dynamic stride, we will need some additional abstractions as the stride is currently kept opaque from std.

329

I would split anything related to scf.for + scf.yield into a separate CL and discuss it there; in particular, there are quite a few rewrite and canonicalization patterns that may simplify some of this code.

mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
1071 (On Diff #347175)

nit: "are laid out" / "have the same layout"

bondhugula added inline comments.
mlir/include/mlir/Conversion/VectorToGPU/VectorToGPU.h
22

"Pass to test convertion" -> "Convert"

I guess this is not a test pass but the conversion itself.

Address review comments

ThomasRaoux marked 3 inline comments as done.
ThomasRaoux marked an inline comment as done. Jun 7 2021, 9:55 AM
ThomasRaoux added inline comments.
mlir/include/mlir/Conversion/VectorToGPU/VectorToGPU.h
22

Done. You're right, I was originally thinking of having this only for testing, but at this point it is really a conversion pass.

mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
148

I couldn't get to anything that makes sense: the matmul lowering tries to put the contract in the form (k, m), (k, n), (m, n), while this code transforms it to (m, k), (k, n), (m, n), so most of the logic is different. The only thing I could move into a common function was the dim binding, but that tends to make the code more complicated since the matmul lowering also handles vector * mat.
Let me know if you have any suggestions to improve that.
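To make the two forms concrete, this is roughly what they look like written out as vector.contract indexing maps (just an illustration, not code from the patch):

#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"

using namespace mlir;

// Iterator order is (m, n, k); the rhs and accumulator maps are the same in
// both forms, only the lhs map differs.
static void illustrateContractionForms(MLIRContext *ctx) {
  AffineExpr m, n, k;
  bindDims(ctx, m, n, k);
  // Form the existing vector matmul lowering targets: (k, m), (k, n), (m, n).
  AffineMap lhsTransposed = AffineMap::get(3, 0, {k, m}, ctx);
  // Row-major form this conversion normalizes to: (m, k), (k, n), (m, n).
  AffineMap lhsRowMajor = AffineMap::get(3, 0, {m, k}, ctx);
  AffineMap rhs = AffineMap::get(3, 0, {k, n}, ctx);
  AffineMap acc = AffineMap::get(3, 0, {m, n}, ctx);
  (void)lhsTransposed; (void)lhsRowMajor; (void)rhs; (void)acc;
}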

208

I can't find this pattern, and I'm not sure how it would help in this case. Ideally I want a transfer with a transpose that can be lowered directly to an mma.load op. How would memref.transpose help here?
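For what it's worth, in the 2-D case the transpose I have in mind is carried entirely by the transfer_read permutation map, roughly like this (a sketch; the helper name is made up and the accessor spelling depends on the MLIR revision):

#include "mlir/Dialect/Vector/VectorOps.h"  // path differs in newer MLIR

using namespace mlir;

// A 2-D transposed read is a transfer_read whose permutation map is
// (d0, d1) -> (d1, d0); that is the form I'd like to lower directly to an
// mma.load instead of materializing a transpose.
static bool isTransposedTransferRead(vector::TransferReadOp readOp) {
  AffineMap transposeMap =
      AffineMap::getPermutationMap(ArrayRef<unsigned>{1, 0}, readOp.getContext());
  return readOp.permutation_map() == transposeMap;
}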

264

Good point, I moved this into a helper function.

283

ditto

329

I removed all the code related to scf; I'll send a separate patch for it once this one lands.

Add missing comment.

clang-format

nicolasvasilache accepted this revision. Jun 11 2021, 5:30 AM

Very cool!

mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
51

nit: matches

117

nit: trivial braces here and below

148

That's sad, these are all so close.
I guess we are reaching the point where we want to cast some of these manipulations as finding a permutation and then doing something custom with it.
This could reduce this explicit 8-case enumeration to just 3 cases + a few swaps.

Not for this CL obviously, thanks for trying!
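For concreteness, the "permutation + a few swaps" idea could look something like the sketch below (direction only, nothing for this CL; the helper name is made up):

#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"

using namespace mlir;

// Classify the lhs indexing map against the canonical row-major form once and
// record whether a swap/transpose is needed, instead of enumerating every
// combination of maps explicitly.
static bool lhsNeedsTranspose(AffineMap lhsMap, MLIRContext *ctx) {
  AffineExpr m, n, k;
  bindDims(ctx, m, n, k);
  // Canonical row-major lhs is (m, k); (k, m) means the operand needs a swap.
  return lhsMap == AffineMap::get(3, 0, {k, m}, ctx);
}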

208

I forgot part of the thinking, but IIRC the idea was that since mma.load seems richer, it may be possible to fold these transposes into the vector.transfer indexing logic and propagate that to the mma ops, rather than performing actual transposes.

It's not clear whether this is really possible; I may look into it a little more in the future.

248

+1. I wonder whether this is related to my rambling about transpose and vector.transfer?

This revision is now accepted and ready to land. Jun 11 2021, 5:30 AM

Rebase and address review feedback

ThomasRaoux marked 2 inline comments as done. Jun 11 2021, 7:46 AM

Thanks Nicolas!

mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
148

Correct, we can reduce the number of cases here and in the matmul lowering; I'll try to do it in the next CL.

208

I'm a bit confused: the pattern here does merge the transpose into the transfer_read indexing logic, so I don't understand why we would want a memref.transpose. If there is already a pattern doing transpose + transfer_read -> transfer_read with an affine map, I can use it, but I couldn't find one.

248

I don't think it is directly related. The transfer op should already have the transpose indexing merged at this point.

This revision was landed with ongoing or failed builds. Jun 11 2021, 7:53 AM
This revision was automatically updated to reflect the committed changes.
mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
208

You're right, I confirmed with @bkramer offline that this actually did not land; I just reviewed it a few months back.
It was the motivation for moving linalg.transpose to memref in this CL: https://reviews.llvm.org/D88651.

I imagine we will want to revive the vector.transfer + transpose -> vector.transfer + strided_memref pattern?
In Ben's case he wanted to remove the strided_memref, but I imagine very similar code will be useful for our purpose too.

ThomasRaoux added inline comments. Jun 11 2021, 8:05 AM
mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
208

Sounds good, thanks for checking. I can help move those patterns to a more generic place when it makes sense.