Diff 419861

mlir/include/mlir/Conversion/Passes.td

	Show First 20 Lines • Show All 758 Lines • ▼ Show 20 Lines
	// VectorToGPU			// VectorToGPU
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	def ConvertVectorToGPU : Pass<"convert-vector-to-gpu"> {			def ConvertVectorToGPU : Pass<"convert-vector-to-gpu"> {
	let summary = "Lower the operations from the vector dialect into the GPU "			let summary = "Lower the operations from the vector dialect into the GPU "
	"dialect";			"dialect";
	let constructor = "mlir::createConvertVectorToGPUPass()";			let constructor = "mlir::createConvertVectorToGPUPass()";
	let dependentDialects = [			let dependentDialects = [
	"memref::MemRefDialect",			"memref::MemRefDialect", "gpu::GPUDialect", "AffineDialect",
	"gpu::GPUDialect"			"vector::VectorDialect"
				];

				let options = [
				// Selects which MMA lowering path the pass uses. Defaults to the WMMA path
				// so existing users of the pass keep their behavior.
				Option<"useWmma", "use-wmma", "bool", /*default=*/"true",
				       "Lower to WMMA-style MMA matrix ops; if false, lower to ops "
				       "compatible with the `nvvm.mma.sync` path">
	];			];
	}			}

	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	// VectorToSCF			// VectorToSCF
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	def ConvertVectorToSCF : Pass<"convert-vector-to-scf"> {			def ConvertVectorToSCF : Pass<"convert-vector-to-scf"> {
	▲ Show 20 Lines • Show All 90 Lines • Show Last 20 Lines

mlir/include/mlir/Conversion/VectorToGPU/NvvmMMASupport.h

This file was added.

				//===- NvvmMMASupport.h - MLIR Vector to GPU lowering support --------===//
				//
				// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
				// See https://llvm.org/LICENSE.txt for license information.
				// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
				//
				//===----------------------------------------------------------------------===//
				//
				// This file provides utilities to assist in the lowering of Vector operations
				// to GPU dialect MMA operations.
				//
				//===----------------------------------------------------------------------===//
				#ifndef MLIR_CONVERSION_VECTORTOGPU_MMASUPPORT_H
				#define MLIR_CONVERSION_VECTORTOGPU_MMASUPPORT_H

				#include "mlir/Dialect/Affine/IR/AffineOps.h"
				#include "mlir/Dialect/GPU/GPUDialect.h"
				#include "mlir/Dialect/LLVMIR/LLVMTypes.h"
				#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
				#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
				#include "mlir/Dialect/Vector/IR/VectorOps.h"
				#include "mlir/IR/PatternMatch.h"
				#include "mlir/IR/Types.h"

				namespace mlir {
				namespace gpu {

				/// Helps to calculate the offsets within the tile for any NVVM/PTX MMA operand
				/// that has a base tile size of 8 elements x [128|256|512] bits.
				namespace NvvmMmaOperandBaseTileOperand8x128 {

				/// Returns the number of bits in a single tile row. It is either 128, 256, or
				/// 512 bits depending on the data type and whether the operand is an
				/// accumulator.
				int64_t inferTileWidthInBits(Type elementType, bool isAcc);

				/// Specifies information about the registers which compose a matrix fragment
				/// according to the PTX documentation.
				/// NOTE(review): the per-field notes below are inferred from the field names
				/// only; confirm them against the implementation of `getRegisterType`.
				struct FragmentElementInfo {
				/// Presumably the LLVM type of a single register of the fragment.
				Type registerLLVMType;
				/// Presumably the number of matrix elements packed into one register.
				int64_t elementsPerRegister;
				/// Presumably the width of one register, in bits.
				int64_t registerWidthBits;
				/// Presumably the number of registers each thread holds for the fragment.
				int64_t numRegistersPerFragment;
				};

				/// Returns a FragmentElementInfo struct describing the register types for the
				/// given matrix fragment type.
				FailureOr<FragmentElementInfo> getRegisterType(MMAMatrixType type);

				/// Returns an AffineMap which maps two dimensions representing (laneId,
				/// logicalValueId) and returns two results representing offsets within a
				/// matrix operand. The offsets point to the values the thread is responsible
				/// for (AKA the matrix fragment values) during a warp-collective matrix
				/// operation. For a visual reference of this LaneId -> (row, col) mapping,
				/// please see NVIDIA's PTX documentation:
				/// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-for-mma
				FailureOr<AffineMap>
				getLaneIdAndValueIdToOperandCoord(Location loc, OpBuilder &builder,
				MMAMatrixType fragmentType);

				/// Bundles the parameters consumed by `getLaneIdToLdMatrixMatrixCoord` when
				/// computing the pointer locations passed to the ldmatrix instruction.
				/// NOTE(review): the per-field notes below are inferred from the field names
				/// only; confirm them against the implementation of `getLdMatrixParams`.
				struct LdMatrixParams {
				/// The matrix fragment type being loaded.
				MMAMatrixType fragmentType;
				/// Presumably the number of tiles loaded by one ldmatrix instruction.
				int64_t numTiles;
				/// Presumably the iterator type of the dimension that is contiguous in
				/// memory.
				IteratorType contiguousDimType;
				/// Presumably the layout (row/col) requested from the ldmatrix instruction.
				NVVM::MMALayout targetLayout;
				};

				/// Returns the LdMatrixParams for the fragment type `fragType`, or failure if
				/// they cannot be derived (the failure conditions are defined by the
				/// implementation, which is not part of this header).
				/// NOTE(review): `transpose` presumably indicates that the fragment is loaded
				/// transposed -- confirm against the implementation.
				FailureOr<LdMatrixParams> getLdMatrixParams(MMAMatrixType fragType,
				bool transpose);
				/// Returns an AffineMap which maps a single dimension representing the laneId
				/// to two results representing offsets within the matrix operand that should
				/// be the pointer locations a thread should pass to the ldmatrix instruction.
				FailureOr<AffineMap>
				getLaneIdToLdMatrixMatrixCoord(Location loc, OpBuilder &builder,
				const LdMatrixParams &params);

				} // namespace NvvmMmaOperandBaseTileOperand8x128
				} // namespace gpu
				} // namespace mlir

				#endif // MLIR_CONVERSION_VECTORTOGPU_MMASUPPORT_H

mlir/include/mlir/Conversion/VectorToGPU/VectorToGPU.h

	Show All 12 Lines

	namespace mlir {			namespace mlir {
	class MLIRContext;			class MLIRContext;
	class Pass;			class Pass;
	class RewritePatternSet;			class RewritePatternSet;

	/// Patterns to transform vector ops into a canonical form to convert to MMA			/// Patterns to transform vector ops into a canonical form to convert to MMA
	/// matrix operations.			/// matrix operations.
	void populatePrepareVectorToMMAPatterns(RewritePatternSet &patterns);			void populatePrepareVectorToMMAPatterns(RewritePatternSet &patterns,
				bool useWmma);
				bondhugulaUnsubmitted Not Done Reply Inline Actions Can you please document `useWmma` here? bondhugula: Can you please document `useWmma` here?
				christopherbateAuthorUnsubmitted Done Reply Inline Actions Done. I can add more detail if required, but basically the canonical forms of the contractions differ, so that is why this is required. christopherbate: Done. I can add more detail if required, but basically the canonical forms of the contractions…

	/// Convert vector ops to MMA matrix operations nested under `rootOp`. This will			/// Convert vector ops to MMA matrix operations nested under `rootOp`. This will
	/// convert slice of operations that can be legally converted to MMA operations.			/// convert slice of operations that can be legally converted to MMA operations.
	/// The rest of the vector operations are left untouched.			/// The rest of the vector operations are left untouched.
	void convertVectorToMMAOps(Operation *rootOp);			void convertVectorToMMAOps(Operation *rootOp);

				/// Convert vector ops nested under `rootOp` to vector and GPU operations
				/// compatible with the `nvvm.mma.sync` lowering path. This will convert a slice
				/// of operations that can be legally lowered on this path while the rest of
				/// the vector operations are left untouched.
				LogicalResult convertVectorToNVVMCompatibleMMASync(Operation *rootOp);

	/// Convert from vector to GPU ops.			/// Convert from vector to GPU ops.
	std::unique_ptr<Pass> createConvertVectorToGPUPass();			std::unique_ptr<Pass> createConvertVectorToGPUPass(bool useWmma = true);

	} // namespace mlir			} // namespace mlir

	#endif // MLIR_CONVERSION_VECTORTOGPU_VECTORTOGPU_H			#endif // MLIR_CONVERSION_VECTORTOGPU_VECTORTOGPU_H

mlir/include/mlir/Dialect/GPU/GPUDialect.h

Show First 20 Lines • Show All 159 Lines • ▼ Show 20 Lines	public:

/// Get elementType of a single element.		/// Get elementType of a single element.
Type getElementType() const;		Type getElementType() const;

/// The general form of operation this type supports is given by the equation		/// The general form of operation this type supports is given by the equation
/// C += A*B. This function returns which operand in the given equation is		/// C += A*B. This function returns which operand in the given equation is
/// held by this type. String returned can be one of"AOp", "BOp" and "COp".		/// held by this type. String returned can be one of"AOp", "BOp" and "COp".
StringRef getOperand() const;		StringRef getOperand() const;

		/// Returns whether this operand represents an accumulator or result type.
		bool isAccOrResult() const { return getOperand() == "COp"; }

		int64_t getElementTypeBitWidth() const {
		bondhugulaUnsubmitted Not Done Reply Inline Actions Doc comment here. bondhugula: Doc comment here.
		return getElementType().getIntOrFloatBitWidth();
		}
};		};

// Adds a `gpu.async.token` to the front of the argument list.		// Adds a `gpu.async.token` to the front of the argument list.
void addAsyncDependency(Operation *op, Value token);		void addAsyncDependency(Operation *op, Value token);

} // namespace gpu		} // namespace gpu
} // namespace mlir		} // namespace mlir

Show All 13 Lines

mlir/include/mlir/Dialect/GPU/GPUOps.td

Show First 20 Lines • Show All 90 Lines • ▼ Show 20 Lines	let description = [{
Example:		Example:

```mlir		```mlir
%tIdX = gpu.thread_id x		%tIdX = gpu.thread_id x
```		```
}];		}];
}		}

		// Op returning the id of the calling lane within its subgroup. It takes no
		// operands and produces a single `index` result.
		def GPU_LaneIdOp : GPU_Op<"lane_id", [NoSideEffect]>,
		    Arguments<(ins)>, Results<(outs Index:$result)> {
		  let description = [{
		    Returns the lane id within the subgroup (warp/wave).

		    Example:
		    ```mlir
		    %laneId = gpu.lane_id : index
		    ```
		  }];
		  let assemblyFormat = "attr-dict `:` type($result)";
		}

def GPU_SubgroupIdOp : GPU_Op<"subgroup_id", [NoSideEffect]>,		def GPU_SubgroupIdOp : GPU_Op<"subgroup_id", [NoSideEffect]>,
Arguments<(ins)>, Results<(outs Index:$result)> {		Arguments<(ins)>, Results<(outs Index:$result)> {
let description = [{		let description = [{
Returns the subgroup id, i.e. the index of the current subgroup within the		Returns the subgroup id, i.e. the index of the current subgroup within the
workgroup.		workgroup.

Example:		Example:

▲ Show 20 Lines • Show All 1,242 Lines • ▼ Show 20 Lines	def GPU_DeviceAsyncWaitOp : GPU_Op<"device_async_wait", []> {
}];		}];
let arguments = (ins GPU_DeviceAsyncToken:$asyncDependencies,		let arguments = (ins GPU_DeviceAsyncToken:$asyncDependencies,
OptionalAttr<I32Attr>:$numGroups);		OptionalAttr<I32Attr>:$numGroups);
let assemblyFormat = [{		let assemblyFormat = [{
$asyncDependencies attr-dict		$asyncDependencies attr-dict
}];		}];
}		}

		// Distributed (per-thread) load of a matrix fragment; lowered to
		// `nvvm.ldmatrix` by the GPU-to-NVVM conversion.
		def GPU_MmaLdMatrixOp : GPU_Op<"mma.ldmatrix",
		                               [MemoryEffects<[MemRead]>]> {
		  let description = [{
		    The `gpu.mma.ldmatrix` op represents loading a matrix fragment from
		    memory. The load source and result type must be compatible with lowering
		    to the `nvvm.ldmatrix` instruction. This op is meant to represent
		    the distributed version of a `vector.transfer_read` as an intermediate
		    step between lowering from `vector.transfer_read` to `nvvm.ldmatrix`.

		    The `transpose` attribute selects the layout requested from
		    `nvvm.ldmatrix` (column layout when true, row layout otherwise) and
		    `numTiles` is forwarded as the number of tiles to load. Both attributes
		    are required and are printed in the attribute dictionary.

		    Example:

		    ```mlir
		    %frag = gpu.mma.ldmatrix %shm_buffer[%c0, %c0]
		        {numTiles = 4 : i32, transpose = false}
		        : memref<16x16xf16, 3> -> vector<4x2xf16>
		    ```
		  }];

		  let arguments = (ins Arg<AnyMemRef, "", [MemRead]>:$srcMemref,
		                       Variadic<Index>:$indices, BoolAttr:$transpose,
		                       I32Attr:$numTiles);
		  let results = (outs AnyVector:$res);
		  let assemblyFormat = [{
		    $srcMemref`[` $indices `]` attr-dict `:` type($srcMemref) `->` type($res)
		  }];
		}

		// Distributed (per-thread) form of a warp-collective MMA; lowered to
		// `nvvm.mma.sync` by the GPU-to-NVVM conversion.
		def GPU_MmaSyncOp : GPU_Op<"mma.sync", [NoSideEffect]> {
		  let description = [{
		    The `gpu.mma.sync` op represents the distributed form of a collective
		    matrix-multiply-and-accumulate (mma) operation that is compatible with
		    `nvvm.mma.sync`. The operands and results are fragments of the full matrix
		    operands. The full shape of the distributed mma operation is given by the
		    `mmaShape` attribute in the form of a list of dimensions `[m, n, k]`.

		    This operation is meant to be lowered to the `nvvm.mma.sync` instruction, and
		    is an intermediate point between lowering from `vector.contract` to
		    `nvvm.mma.sync`.

		    The `mmaShape` attribute is required and is printed in the attribute
		    dictionary.

		    Example:

		    ```mlir
		    %d = gpu.mma.sync (%a, %b, %c) {mmaShape = [16, 8, 16]}
		        : (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
		    ```
		  }];
		  let arguments = (ins AnyVector:$matrixA, AnyVector:$matrixB, AnyVector:$matrixC,
		                       I64ArrayAttr:$mmaShape);

		  let results = (outs AnyVector:$res);

		  let assemblyFormat = [{
		    `(` $matrixA`,` $matrixB`,` $matrixC `)` attr-dict
		    `:` `(` type($matrixA) `,` type($matrixB) `,` type($matrixC) `)` `->` type($res)
		  }];
		}

#endif // GPU_OPS		#endif // GPU_OPS

mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h

Show First 20 Lines • Show All 56 Lines • ▼ Show 20 Lines	if (indexBitwidth > 32) {
loc, IntegerType::get(context, indexBitwidth), newOp);		loc, IntegerType::get(context, indexBitwidth), newOp);
}		}

rewriter.replaceOp(op, {newOp});		rewriter.replaceOp(op, {newOp});
return success();		return success();
}		}
};		};

		/// Replaces `SrcOp` with the intrinsic `TargetOp`, which is created with an
		/// `i32` result. The i32 value is then sign-extended or truncated to the
		/// index bitwidth configured on the LLVMTypeConverter, and left as is when
		/// that bitwidth is exactly 32.
		template <typename SrcOp, typename TargetOp>
		struct GPULaneIdIntrinsicOpLowering : ConvertOpToLLVMPattern<SrcOp> {
		private:
		  /// Bitwidth of the `index` type, captured from the type converter.
		  unsigned indexBitwidth;

		public:
		  explicit GPULaneIdIntrinsicOpLowering(LLVMTypeConverter &typeConverter)
		      : ConvertOpToLLVMPattern<SrcOp>(typeConverter),
		        indexBitwidth(typeConverter.getIndexTypeBitwidth()) {}

		  /// Emits the intrinsic, adjusts its width to the index bitwidth and
		  /// replaces `op` with the resulting value. Always succeeds.
		  LogicalResult
		  matchAndRewrite(SrcOp op, typename SrcOp::Adaptor adaptor,
		                  ConversionPatternRewriter &rewriter) const override {
		    Location opLoc = op->getLoc();
		    Value laneId = rewriter.create<TargetOp>(opLoc, rewriter.getI32Type());

		    // Only materialize the index-wide integer type when a width change is
		    // actually required.
		    if (indexBitwidth != 32) {
		      Type indexWideTy =
		          IntegerType::get(rewriter.getContext(), indexBitwidth);
		      if (indexBitwidth > 32)
		        laneId = rewriter.create<LLVM::SExtOp>(opLoc, indexWideTy, laneId);
		      else
		        laneId = rewriter.create<LLVM::TruncOp>(opLoc, indexWideTy, laneId);
		    }

		    rewriter.replaceOp(op, {laneId});
		    return success();
		  }
		};

} // namespace mlir		} // namespace mlir

#endif // MLIR_CONVERSION_GPUCOMMON_INDEXINTRINSICSOPLOWERING_H_		#endif // MLIR_CONVERSION_GPUCOMMON_INDEXINTRINSICSOPLOWERING_H_

mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp

Show All 12 Lines

#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"		#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"

#include "mlir/Conversion/ArithmeticToLLVM/ArithmeticToLLVM.h"		#include "mlir/Conversion/ArithmeticToLLVM/ArithmeticToLLVM.h"
#include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h"		#include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h"
#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h"		#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h"
#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"		#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
#include "mlir/Conversion/LLVMCommon/LoweringOptions.h"		#include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
		#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"		#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"		#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
		#include "mlir/Conversion/VectorToGPU/NvvmMMASupport.h"
		#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"		#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"		#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"		#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/GPUDialect.h"		#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/Passes.h"		#include "mlir/Dialect/GPU/Passes.h"
		#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
		#include "mlir/Dialect/LLVMIR/LLVMTypes.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"		#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/Dialect/Math/IR/Math.h"		#include "mlir/Dialect/Math/IR/Math.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"		#include "mlir/Dialect/MemRef/IR/MemRef.h"
		#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/BlockAndValueMapping.h"		#include "mlir/IR/BlockAndValueMapping.h"
		#include "mlir/IR/BuiltinAttributes.h"
		#include "mlir/IR/BuiltinOps.h"
		#include "mlir/IR/BuiltinTypes.h"
		#include "mlir/IR/MLIRContext.h"
		#include "mlir/IR/PatternMatch.h"
#include "mlir/Transforms/DialectConversion.h"		#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"		#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/Support/FormatVariadic.h"		#include "llvm/Support/FormatVariadic.h"

#include "../GPUCommon/GPUOpsLowering.h"		#include "../GPUCommon/GPUOpsLowering.h"
#include "../GPUCommon/IndexIntrinsicsOpLowering.h"		#include "../GPUCommon/IndexIntrinsicsOpLowering.h"
#include "../GPUCommon/OpToFuncCallLowering.h"		#include "../GPUCommon/OpToFuncCallLowering.h"
#include "../PassDetail.h"		#include "../PassDetail.h"
▲ Show 20 Lines • Show All 163 Lines • ▼ Show 20 Lines	matchAndRewrite(gpu::DeviceAsyncWaitOp op, OpAdaptor adaptor,
// If numGroup is not present pick 0 as a conservative correct value.		// If numGroup is not present pick 0 as a conservative correct value.
int32_t numGroups = adaptor.numGroups() ? *adaptor.numGroups() : 0;		int32_t numGroups = adaptor.numGroups() ? *adaptor.numGroups() : 0;
rewriter.create<NVVM::CpAsyncWaitGroupOp>(op.getLoc(), numGroups);		rewriter.create<NVVM::CpAsyncWaitGroupOp>(op.getLoc(), numGroups);
rewriter.eraseOp(op);		rewriter.eraseOp(op);
return success();		return success();
}		}
};		};

		/// Lowers `gpu.mma.ldmatrix` to `nvvm.ldmatrix`.
		///
		/// The NVVM op returns a struct of i32 registers, whereas the GPU op returns a
		/// 2-D vector of shape (numRegisters, elementsPerRegister) in which each row
		/// is 32 bits wide. Each i32 register is therefore extracted, bitcast to the
		/// row vector type and inserted into the converted (LLVM array) result.
		struct MmaLdMatrixOpToNVVM : public ConvertOpToLLVMPattern<gpu::MmaLdMatrixOp> {
		  using ConvertOpToLLVMPattern<gpu::MmaLdMatrixOp>::ConvertOpToLLVMPattern;

		  LogicalResult
		  matchAndRewrite(gpu::MmaLdMatrixOp op, OpAdaptor adaptor,
		                  ConversionPatternRewriter &rewriter) const override {
		    MLIRContext *ctx = getContext();
		    Location loc = op->getLoc();

		    // The result type of ldmatrix will always be a struct of 32bit integer
		    // registers. The result type of the GPU operation is always a vector of
		    // shape (NumRegisters, VectorRegister) where VectorRegister is the vector
		    // type of the result and always 32 bits long. We bitcast the result of the
		    // NVVM::LdMatrix to this vector type.
		    auto vectorResultType = op->getResultTypes()[0].dyn_cast<VectorType>();
		    if (!vectorResultType)
		      return failure();

		    // Validate the shape before indexing into it: the op definition only
		    // constrains the result to be some vector type.
		    if (vectorResultType.getRank() != 2)
		      return rewriter.notifyMatchFailure(
		          op, "expected a 2-D vector result of shape (numRegisters, "
		              "elementsPerRegister)");
		    const int64_t numRegisters = vectorResultType.getShape()[0];
		    const int64_t elementsPerRegister = vectorResultType.getShape()[1];

		    // Each row is produced by bitcasting one i32 register, so it has to be
		    // exactly 32 bits wide for the bitcast to be valid.
		    Type elementType = vectorResultType.getElementType();
		    if (!elementType.isIntOrFloat() ||
		        elementType.getIntOrFloatBitWidth() * elementsPerRegister != 32)
		      return rewriter.notifyMatchFailure(
		          op, "expected each row of the result vector to be 32 bits wide");

		    Type finalResultType = typeConverter->convertType(vectorResultType);
		    if (!finalResultType)
		      return rewriter.notifyMatchFailure(op,
		                                         "failed to convert the result type");

		    Type innerVectorType =
		        LLVM::getFixedVectorType(elementType, elementsPerRegister);

		    LLVM::LLVMStructType ldMatrixResultType = LLVM::LLVMStructType::getLiteral(
		        ctx, SmallVector<Type>(numRegisters, rewriter.getI32Type()));

		    auto srcMemrefType = op.srcMemref().getType().cast<MemRefType>();
		    Value srcPtr = getStridedElementPtr(loc, srcMemrefType, adaptor.srcMemref(),
		                                        adaptor.indices(), rewriter);
		    Value ldMatrixResult = rewriter.create<NVVM::LdMatrixOp>(
		        loc, ldMatrixResultType, srcPtr,
		        /*num=*/op.numTiles(),
		        /*layout=*/op.transpose() ? NVVM::MMALayout::col
		                                  : NVVM::MMALayout::row);

		    // Repack the struct of i32 registers into the converted vector type.
		    Value result = rewriter.create<LLVM::UndefOp>(loc, finalResultType);
		    for (int64_t i = 0; i < numRegisters; ++i) {
		      Value i32Register = rewriter.create<LLVM::ExtractValueOp>(
		          loc, ldMatrixResultType.getBody()[i], ldMatrixResult,
		          rewriter.getI64ArrayAttr(i));
		      Value casted =
		          rewriter.create<LLVM::BitcastOp>(loc, innerVectorType, i32Register);
		      result = rewriter.create<LLVM::InsertValueOp>(
		          loc, finalResultType, result, casted, rewriter.getI64ArrayAttr(i));
		    }

		    rewriter.replaceOp(op, result);
		    return success();
		  }
		};

		/// Lowers `gpu.mma.sync` to `nvvm.mma.sync`.
		///
		/// The converted operands (LLVM arrays of small vectors) are unpacked into the
		/// flat list of registers the intrinsic expects, the intrinsic is emitted with
		/// an LLVM struct result, and that struct is repacked into the LLVM array type
		/// the converted result of the GPU op must have.
		struct MmaSyncOptoNVVM : public ConvertOpToLLVMPattern<gpu::MmaSyncOp> {
		  using ConvertOpToLLVMPattern<gpu::MmaSyncOp>::ConvertOpToLLVMPattern;

		private:
		  /// Checks if all the operands of the op being lowered are of LLVM Types. The
		  /// types are expected to be converted by the `LLVMTypeConverter` before the
		  /// op is actually lowered. If the type of an operand is not already
		  /// converted it hints a missing typeConversion and failure is returned in
		  /// that case.
		  static LogicalResult areAllLLVMTypes(Operation *op, ValueRange operands,
		                                       ConversionPatternRewriter &rewriter) {
		    if (!llvm::all_of(operands, [](Value value) {
		          return LLVM::isCompatibleType(value.getType());
		        })) {
		      return rewriter.notifyMatchFailure(
		          op, "cannot convert if operands aren't of LLVM type.");
		    }

		    return success();
		  }

		  /// Returns the type for the intrinsic given the (already converted)
		  /// vectorResultType of the `gpu.mma.sync` operation. Arrays of 2xf16, 2xi32
		  /// and 2xf64 vectors map to literal LLVM structs; any other type is
		  /// returned unchanged.
		  static Type inferIntrinsicResultType(Type vectorResultType) {
		    MLIRContext *ctx = vectorResultType.getContext();
		    // Anything that is not an LLVM array is passed through instead of
		    // asserting in a hard cast.
		    auto a = vectorResultType.dyn_cast<LLVM::LLVMArrayType>();
		    if (!a)
		      return vectorResultType;
		    auto f16x2Ty = LLVM::getFixedVectorType(Float16Type::get(ctx), 2);
		    auto i32Ty = IntegerType::get(ctx, 32);
		    auto i32x2Ty = LLVM::getFixedVectorType(i32Ty, 2);
		    Type f64Ty = Float64Type::get(ctx);
		    Type f64x2Ty = LLVM::getFixedVectorType(f64Ty, 2);
		    if (a.getElementType() == f16x2Ty) {
		      return LLVM::LLVMStructType::getLiteral(
		          ctx, SmallVector<Type>(a.getNumElements(), f16x2Ty));
		    }
		    if (a.getElementType() == i32x2Ty) {
		      return LLVM::LLVMStructType::getLiteral(
		          ctx, SmallVector<Type>(static_cast<size_t>(a.getNumElements()) * 2,
		                                 i32Ty));
		    }
		    if (a.getElementType() == f64x2Ty) {
		      return LLVM::LLVMStructType::getLiteral(ctx, {f64Ty, f64Ty});
		    }
		    return vectorResultType;
		  }

		  /// Convert the SSA result of the NVVM intrinsic `nvvm.mma.sync` (which is
		  /// always an LLVM struct) into a fragment that is compatible with the vector
		  /// type of this operation. This involves extracting elements from the struct
		  /// and inserting them into an LLVM array. These extra data-movement
		  /// operations should be canonicalized away by the LLVM backend.
		  ///
		  /// If `intrinsicResultType` is not a struct or `resultType` is not an LLVM
		  /// array, no repacking is possible and `intrinsicResult` is returned as is.
		  static Value convertIntrinsicResult(Location loc, Type intrinsicResultType,
		                                      Type resultType, Value intrinsicResult,
		                                      RewriterBase &rewriter) {
		    MLIRContext *ctx = rewriter.getContext();
		    auto structType = intrinsicResultType.dyn_cast<LLVM::LLVMStructType>();
		    auto arrayType = resultType.dyn_cast<LLVM::LLVMArrayType>();
		    // Without a struct there is nothing to unpack; returning the value itself
		    // avoids replacing the op with an undef array.
		    if (!structType || !arrayType)
		      return intrinsicResult;

		    Type i32Ty = rewriter.getI32Type();
		    Type f64Ty = rewriter.getF64Type();
		    Type f16x2Ty = LLVM::getFixedVectorType(rewriter.getF16Type(), 2);
		    Type i32x2Ty = LLVM::getFixedVectorType(i32Ty, 2);
		    Type f64x2Ty = LLVM::getFixedVectorType(f64Ty, 2);
		    Type elementType = arrayType.getElementType();
		    ArrayRef<Type> structBody = structType.getBody();

		    auto makeConst = [&](int32_t index) -> Value {
		      return rewriter.create<LLVM::ConstantOp>(
		          loc, IntegerType::get(ctx, 32), rewriter.getI32IntegerAttr(index));
		    };
		    auto extractField = [&](int64_t index) -> Value {
		      return rewriter.create<LLVM::ExtractValueOp>(
		          loc, structBody[index], intrinsicResult,
		          rewriter.getI64ArrayAttr(index));
		    };

		    SmallVector<Value, 4> elements;
		    if (elementType == f16x2Ty) {
		      // f16 results are already returned as packed 2xf16 vectors.
		      for (int64_t i = 0, e = structBody.size(); i < e; ++i)
		        elements.push_back(extractField(i));
		    } else if (elementType == i32x2Ty || elementType == f64x2Ty) {
		      // The intrinsic returns i32 and f64 values as individual scalars. We
		      // need to extract them from the struct and pack them into vectors. A
		      // fresh vector is built and recorded for every pair of scalars so that
		      // each row of the result array receives its own values.
		      for (int64_t i = 0, e = structBody.size() / 2; i < e; ++i) {
		        Value vec = rewriter.create<LLVM::UndefOp>(loc, elementType);
		        Value x1 = extractField(i * 2);
		        Value x2 = extractField(i * 2 + 1);
		        vec = rewriter.create<LLVM::InsertElementOp>(loc, elementType, vec, x1,
		                                                     makeConst(0));
		        vec = rewriter.create<LLVM::InsertElementOp>(loc, elementType, vec, x2,
		                                                     makeConst(1));
		        elements.push_back(vec);
		      }
		    }

		    // Create the final vectorized result.
		    Value result = rewriter.create<LLVM::UndefOp>(loc, arrayType);
		    for (auto el : llvm::enumerate(elements)) {
		      result = rewriter.create<LLVM::InsertValueOp>(
		          loc, arrayType, result, el.value(),
		          rewriter.getI64ArrayAttr(el.index()));
		    }
		    return result;
		  }

		  /// Flattens `operand`, which must be of LLVM array type, into the list of
		  /// values passed to the intrinsic: 4xi8 rows are bitcast to i32, rows of
		  /// i32 or f64 vectors are split into scalars, and every other row is passed
		  /// through unchanged.
		  static SmallVector<Value, 4>
		  unpackOperandVector(RewriterBase &rewriter, Location loc, Value operand) {
		    SmallVector<Value, 4> result;
		    Type i32Ty = rewriter.getI32Type();
		    Type f64Ty = rewriter.getF64Type();
		    Type i8x4Ty = LLVM::getFixedVectorType(rewriter.getI8Type(), 4);
		    auto arrayTy = operand.getType().cast<LLVM::LLVMArrayType>();
		    Type rowTy = arrayTy.getElementType();
		    for (int64_t i = 0, e = arrayTy.getNumElements(); i < e; ++i) {
		      Value toUse = rewriter.create<LLVM::ExtractValueOp>(
		          loc, rowTy, operand, rewriter.getI64ArrayAttr(i));

		      // For 4xi8 vectors, the intrinsic expects these to be provided as i32
		      // scalar types.
		      if (rowTy == i8x4Ty) {
		        result.push_back(
		            rewriter.create<LLVM::BitcastOp>(loc, i32Ty, toUse));
		        continue;
		      }

		      // For some element types (i32, f64), we need to unpack the inner
		      // vector/array type as well because the intrinsic expects individual
		      // scalars to be provided.
		      VectorType innerArrayTy = rowTy.dyn_cast<VectorType>();
		      if (innerArrayTy && (innerArrayTy.getElementType() == i32Ty ||
		                           innerArrayTy.getElementType() == f64Ty)) {
		        for (int64_t idx = 0, innerE = innerArrayTy.getNumElements();
		             idx < innerE; ++idx) {
		          result.push_back(rewriter.create<LLVM::ExtractElementOp>(
		              loc, toUse,
		              rewriter.create<LLVM::ConstantOp>(
		                  loc, rewriter.getI64Type(),
		                  rewriter.getI64IntegerAttr(idx))));
		        }
		        continue;
		      }
		      result.push_back(toUse);
		    }
		    return result;
		  }

		public:
		  LogicalResult
		  matchAndRewrite(gpu::MmaSyncOp op, OpAdaptor adaptor,
		                  ConversionPatternRewriter &rewriter) const override {
		    Location loc = op->getLoc();

		    if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter))) {
		      return failure();
		    }

		    // `unpackOperandVector` requires LLVM array operands; reject anything else
		    // here instead of asserting in a hard cast.
		    for (Value operand : adaptor.getOperands()) {
		      if (!operand.getType().isa<LLVM::LLVMArrayType>())
		        return rewriter.notifyMatchFailure(
		            op, "expected operands to be converted to LLVM array types");
		    }

		    // The `mmaShape` attribute is only constrained to be an i64 array, so
		    // check its length before indexing into it.
		    ArrayAttr mmaShape = op.mmaShape();
		    if (mmaShape.size() != 3)
		      return rewriter.notifyMatchFailure(
		          op, "expected `mmaShape` to have exactly three entries [m, n, k]");
		    int64_t m = mmaShape[0].cast<IntegerAttr>().getInt();
		    int64_t n = mmaShape[1].cast<IntegerAttr>().getInt();
		    int64_t k = mmaShape[2].cast<IntegerAttr>().getInt();
		    std::array<int64_t, 3> gemmShape{m, n, k};

		    // Deduce the PTX operand types from the element type of the A operand.
		    // This is done before any IR is created so that the failure path leaves
		    // nothing behind.
		    auto aType = op.matrixA().getType().cast<VectorType>();
		    NVVM::MMATypes ptxTypeA;
		    NVVM::MMATypes ptxTypeB;
		    Optional<NVVM::MMAIntOverflow> overflow(llvm::None);
		    if (aType.getElementType().isInteger(8)) {
		      ptxTypeA = NVVM::MMATypes::s8;
		      ptxTypeB = NVVM::MMATypes::s8;
		      overflow = NVVM::MMAIntOverflow::satfinite;
		    } else if (aType.getElementType().isF16()) {
		      ptxTypeA = NVVM::MMATypes::f16;
		      ptxTypeB = NVVM::MMATypes::f16;
		    } else if (aType.getElementType().isF64()) {
		      ptxTypeA = NVVM::MMATypes::f64;
		      ptxTypeB = NVVM::MMATypes::f64;
		    } else {
		      return op->emitError("could not deduce operand PTX types");
		    }

		    Type desiredRetTy = typeConverter->convertType(op->getResultTypes()[0]);
		    if (!desiredRetTy)
		      return rewriter.notifyMatchFailure(op,
		                                         "failed to convert the result type");
		    Type intrinsicResTy = inferIntrinsicResultType(desiredRetTy);

		    auto matA = unpackOperandVector(rewriter, loc, adaptor.matrixA());
		    auto matB = unpackOperandVector(rewriter, loc, adaptor.matrixB());
		    auto matC = unpackOperandVector(rewriter, loc, adaptor.matrixC());

		    Value intrinsicResult = rewriter.create<NVVM::MmaOp>(
		        loc, intrinsicResTy, matA, matB, matC,
		        /*shape=*/gemmShape,
		        /*b1Op=*/llvm::None,
		        /*intOverflow=*/overflow,
		        /*multiplicandPtxTypes=*/
		        std::array<NVVM::MMATypes, 2>{ptxTypeA, ptxTypeB},
		        /*multiplicandLayouts=*/
		        std::array<NVVM::MMALayout, 2>{NVVM::MMALayout::row,
		                                       NVVM::MMALayout::col});
		    rewriter.replaceOp(op, convertIntrinsicResult(loc, intrinsicResTy,
		                                                  desiredRetTy, intrinsicResult,
		                                                  rewriter));
		    return success();
		  }
		};

/// Import the GPU Ops to NVVM Patterns.		/// Import the GPU Ops to NVVM Patterns.
#include "GPUToNVVM.cpp.inc"		#include "GPUToNVVM.cpp.inc"

/// A pass that replaces all occurrences of GPU device operations with their		/// A pass that replaces all occurrences of GPU device operations with their
/// corresponding NVVM equivalent.		/// corresponding NVVM equivalent.
///		///
/// This pass only handles device code and is not meant to be run on GPU host		/// This pass only handles device code and is not meant to be run on GPU host
/// code.		/// code.
▲ Show 20 Lines • Show All 52 Lines • ▼ Show 20 Lines	void runOnOperation() override {
populateGpuWMMAToNVVMConversionPatterns(converter, llvmPatterns);		populateGpuWMMAToNVVMConversionPatterns(converter, llvmPatterns);
LLVMConversionTarget target(getContext());		LLVMConversionTarget target(getContext());
configureGpuToNVVMConversionLegality(target);		configureGpuToNVVMConversionLegality(target);
if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))		if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
signalPassFailure();		signalPassFailure();
}		}
};		};

		/// Lowers `gpu.lane_id` to `nvvm.read.ptx.sreg.laneid` (NVVM::LaneIdOp).
		///
		/// The intrinsic yields an i32, which is sign-extended or truncated to the
		/// index bitwidth configured on the LLVMTypeConverter. This mirrors
		/// GPULaneIdIntrinsicOpLowering<gpu::LaneIdOp, NVVM::LaneIdOp>, which is the
		/// pattern registered in the pattern list below. A conversion pattern must
		/// not report success without rewriting the op, so the rewrite is performed
		/// here as well.
		struct GPULaneIdOpLowering : public ConvertOpToLLVMPattern<gpu::LaneIdOp> {
		  using ConvertOpToLLVMPattern<gpu::LaneIdOp>::ConvertOpToLLVMPattern;
		  LogicalResult
		  matchAndRewrite(gpu::LaneIdOp op, OpAdaptor adaptor,
		                  ConversionPatternRewriter &rewriter) const override {
		    Location loc = op->getLoc();
		    MLIRContext *context = rewriter.getContext();
		    Value newOp = rewriter.create<NVVM::LaneIdOp>(loc, rewriter.getI32Type());

		    // Match the width of the converted `index` type.
		    const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
		    if (indexBitwidth > 32) {
		      newOp = rewriter.create<LLVM::SExtOp>(
		          loc, IntegerType::get(context, indexBitwidth), newOp);
		    } else if (indexBitwidth < 32) {
		      newOp = rewriter.create<LLVM::TruncOp>(
		          loc, IntegerType::get(context, indexBitwidth), newOp);
		    }

		    rewriter.replaceOp(op, {newOp});
		    return success();
		  }
		};

} // namespace		} // namespace

void mlir::configureGpuToNVVMConversionLegality(ConversionTarget &target) {		void mlir::configureGpuToNVVMConversionLegality(ConversionTarget &target) {
target.addIllegalOp<FuncOp>();		target.addIllegalOp<FuncOp>();
target.addLegalDialect<::mlir::LLVM::LLVMDialect>();		target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
target.addLegalDialect<::mlir::NVVM::NVVMDialect>();		target.addLegalDialect<::mlir::NVVM::NVVMDialect>();
target.addIllegalDialect<gpu::GPUDialect>();		target.addIllegalDialect<gpu::GPUDialect>();
target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FAbsOp,		target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FAbsOp,
Show All 11 Lines	patterns
.add<GPUIndexIntrinsicOpLowering<gpu::ThreadIdOp, NVVM::ThreadIdXOp,		.add<GPUIndexIntrinsicOpLowering<gpu::ThreadIdOp, NVVM::ThreadIdXOp,
NVVM::ThreadIdYOp, NVVM::ThreadIdZOp>,		NVVM::ThreadIdYOp, NVVM::ThreadIdZOp>,
GPUIndexIntrinsicOpLowering<gpu::BlockDimOp, NVVM::BlockDimXOp,		GPUIndexIntrinsicOpLowering<gpu::BlockDimOp, NVVM::BlockDimXOp,
NVVM::BlockDimYOp, NVVM::BlockDimZOp>,		NVVM::BlockDimYOp, NVVM::BlockDimZOp>,
GPUIndexIntrinsicOpLowering<gpu::BlockIdOp, NVVM::BlockIdXOp,		GPUIndexIntrinsicOpLowering<gpu::BlockIdOp, NVVM::BlockIdXOp,
NVVM::BlockIdYOp, NVVM::BlockIdZOp>,		NVVM::BlockIdYOp, NVVM::BlockIdZOp>,
GPUIndexIntrinsicOpLowering<gpu::GridDimOp, NVVM::GridDimXOp,		GPUIndexIntrinsicOpLowering<gpu::GridDimOp, NVVM::GridDimXOp,
NVVM::GridDimYOp, NVVM::GridDimZOp>,		NVVM::GridDimYOp, NVVM::GridDimZOp>,
GPUShuffleOpLowering, GPUReturnOpLowering>(converter);		GPULaneIdIntrinsicOpLowering<gpu::LaneIdOp, NVVM::LaneIdOp>,
		GPUShuffleOpLowering, GPUReturnOpLowering, MmaSyncOptoNVVM,
		MmaLdMatrixOpToNVVM>(converter);

// Explicitly drop memory space when lowering private memory		// Explicitly drop memory space when lowering private memory
// attributions since NVVM models it as `alloca`s in the default		// attributions since NVVM models it as `alloca`s in the default
// memory space and does not support `alloca`s with addrspace(5).		// memory space and does not support `alloca`s with addrspace(5).
patterns.add<GPUFuncOpLowering>(		patterns.add<GPUFuncOpLowering>(
converter, /allocaAddrSpace=/0,		converter, /allocaAddrSpace=/0,
StringAttr::get(&converter.getContext(),		StringAttr::get(&converter.getContext(),
NVVM::NVVMDialect::getKernelFuncAttrName()));		NVVM::NVVMDialect::getKernelFuncAttrName()));
▲ Show 20 Lines • Show All 45 Lines • Show Last 20 Lines

mlir/lib/Conversion/VectorToGPU/CMakeLists.txt

	add_mlir_conversion_library(MLIRVectorToGPU			add_mlir_conversion_library(
				MLIRVectorToGPU
	VectorToGPU.cpp			VectorToGPU.cpp
				NvvmMMASupport.cpp
	ADDITIONAL_HEADER_DIRS			ADDITIONAL_HEADER_DIRS
	${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/VectorToGPU			${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/VectorToGPU

	LINK_COMPONENTS			LINK_COMPONENTS
	Core			Core
				LINK_LIBS
	LINK_LIBS PUBLIC			PUBLIC
	MLIRArithmetic			MLIRArithmetic
	MLIRGPUOps			MLIRGPUOps
	MLIRLLVMIR			MLIRLLVMIR
	MLIRMemRef			MLIRMemRef
	MLIRTransforms			MLIRTransforms
	MLIRVector			MLIRVector
	MLIRVectorUtils			MLIRVectorUtils)
	)

mlir/lib/Conversion/VectorToGPU/NvvmMMASupport.cpp

This file was added.

				//===- NvvmMMASupport.cpp - MLIR Vector to GPU lowering support --------===//
				//
				// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
				// See https://llvm.org/LICENSE.txt for license information.
				// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
				//
				//===----------------------------------------------------------------------===//
				//
				// This file provides utilities to assist in the lowering of Vector operations
				// to GPU dialect MMA operations.
				//
				//===----------------------------------------------------------------------===//

				#include "mlir/Conversion/VectorToGPU/NvvmMMASupport.h"

				namespace mlir {
				namespace gpu {
				namespace NvvmMmaOperandBaseTileOperand8x128 {

				namespace {

				/// There are always 4 threads per [128|256|512] bit row.
				constexpr int64_t kThreadsPerRow = 4;

				/// Number of rows in one base tile. The helpers in this file divide a row
				/// (or column) extent by this value to obtain a tile count.
				constexpr int64_t kNumRowsPerTile = 8;

				/// Computes how many registers a single thread holds for one matrix fragment
				/// of the given type: the number of row tiles multiplied by the bits in one
				/// row, divided by the tile width reported by `inferTileWidthInBits`.
				int64_t inferNumRegistersPerMatrixFragment(MMAMatrixType type) {
				  const ArrayRef<int64_t> dims = type.getShape();
				  const int64_t rowTiles = dims[0] / kNumRowsPerTile;
				  const int64_t bitsPerRow = dims[1] * type.getElementTypeBitWidth();
				  const int64_t tileWidthBits =
				      inferTileWidthInBits(type.getElementType(), type.isAccOrResult());
				  // Multiply first, then divide, so the integer division happens once on
				  // the full product.
				  return (rowTiles * bitsPerRow) / tileWidthBits;
				}

				/// Returns, as {tiles along dim 0, tiles along dim 1}, how many
				/// 8 x [128|256|512] bit tiles make up an operand of shape `operandShape`.
				/// `lineSizeBits` is the width in bits of one tile row.
				std::array<int64_t, 2> getTileShape(ArrayRef<int64_t> operandShape,
				                                    Type elementType, int64_t lineSizeBits) {
				  // For each 8x128bit square, a thread is responsible for one 32bit register.
				  const int64_t tilesAlongRows = operandShape[0] / kNumRowsPerTile;
				  const int64_t tilesAlongCols =
				      (operandShape[1] * elementType.getIntOrFloatBitWidth()) / lineSizeBits;
				  return {tilesAlongRows, tilesAlongCols};
				}

				} // namespace

				/// Returns the tile row width in bits for an operand with the given element
				/// type:
				///   - 64-bit elements: 512 for an accumulator, 256 otherwise;
				///   - 32-bit accumulator elements: 256;
				///   - everything else: 128.
				int64_t inferTileWidthInBits(Type elementType, bool isAcc) {
				  const unsigned bitWidth = elementType.getIntOrFloatBitWidth();
				  if (bitWidth == 64)
				    return isAcc ? 512 : 256;
				  if (isAcc && bitWidth == 32)
				    return 256;
				  return 128;
				}

				/// Describes the register used to hold a fragment of `type`. Each entry is
				/// {register type, elements per register, register width in bits, number of
				/// registers per fragment}. Supported element types are f16, f64 (with
				/// different registers for accumulator/result and for plain operands), i8
				/// and i32; any other element type yields failure.
				FailureOr<FragmentElementInfo> getRegisterType(MMAMatrixType type) {
				  MLIRContext *ctx = type.getContext();
				  const Type elementType = type.getElementType();

				  // f16: two elements packed in a 32-bit register.
				  if (elementType.isF16())
				    return FragmentElementInfo{
				        LLVM::getFixedVectorType(Float16Type::get(ctx), 2), 2, 32,
				        inferNumRegistersPerMatrixFragment(type)};

				  if (elementType.isF64()) {
				    Type f64Ty = Float64Type::get(ctx);
				    // f64 accumulator/result: a pair of f64 values (128 bits).
				    if (type.isAccOrResult())
				      return FragmentElementInfo{LLVM::getFixedVectorType(f64Ty, 2), 2, 128,
				                                 inferNumRegistersPerMatrixFragment(type)};
				    // f64 operand: a single scalar f64 (64 bits).
				    return FragmentElementInfo{f64Ty, 1, 64,
				                               inferNumRegistersPerMatrixFragment(type)};
				  }

				  // int8 operand: four elements packed in a 32-bit register.
				  if (elementType.isInteger(8))
				    return FragmentElementInfo{
				        LLVM::getFixedVectorType(IntegerType::get(ctx, 8), 4), 4, 32,
				        inferNumRegistersPerMatrixFragment(type)};

				  // 32-bit integer accumulator operands: a pair of i32 values (64 bits).
				  if (elementType.isInteger(32))
				    return FragmentElementInfo{
				        LLVM::getFixedVectorType(IntegerType::get(ctx, 32), 2), 2, 64,
				        inferNumRegistersPerMatrixFragment(type)};

				  return failure();
				}

				/// Builds a two-dimensional affine map whose results are the (row, column)
				/// element offset of the tile addressed by a register. The register index is
				/// `logicalValueId floordiv elementsPerRegister`; registers are laid out
				/// across the tiles along dim 0 first, then along dim 1.
				static AffineMap getRegisterIndexToTileOffsetMap(OpBuilder &base,
				                                                 Type elementType,
				                                                 ArrayRef<int64_t> operandShape,
				                                                 bool isAccumulator,
				                                                 int64_t elementsPerRegister,
				                                                 AffineExpr logicalValueId) {
				  const int64_t tileWidthBits =
				      inferTileWidthInBits(elementType, isAccumulator);
				  const int64_t elementsPerTileRow =
				      tileWidthBits / elementType.getIntOrFloatBitWidth();
				  const std::array<int64_t, 2> tileCounts =
				      getTileShape(operandShape, elementType, tileWidthBits);

				  AffineExpr registerIndex = logicalValueId.floorDiv(elementsPerRegister);
				  // Row offset: position among the row tiles times the tile height.
				  AffineExpr rowOffset = (registerIndex % tileCounts[0]) * kNumRowsPerTile;
				  // Column offset: position among the column tiles times the tile width
				  // measured in elements.
				  AffineExpr colOffset =
				      registerIndex.floorDiv(tileCounts[0]) * elementsPerTileRow;
				  return AffineMap::get(2, 0, {rowOffset, colOffset}, base.getContext());
				}

				/// Returns an affine map (laneId, logicalValueId) -> (row, col) giving the
				/// operand coordinate of a value held by a lane. Fails when
				/// `getRegisterType` does not support `fragmentType`.
				FailureOr<AffineMap>
				getLaneIdAndValueIdToOperandCoord(Location loc, OpBuilder &builder,
				                                  MMAMatrixType fragmentType) {
				  FailureOr<FragmentElementInfo> regInfo = getRegisterType(fragmentType);
				  if (failed(regInfo))
				    return failure();

				  MLIRContext *ctx = builder.getContext();
				  const int64_t elementBitWidth = fragmentType.getElementTypeBitWidth();
				  const int64_t elementsPerRegister =
				      regInfo->registerWidthBits / elementBitWidth;

				  AffineExpr laneId, logicalValueIdDim;
				  bindDims(ctx, laneId, logicalValueIdDim);

				  // The register that holds `logicalValueId` selects the tile; this map
				  // yields the element offset of that tile within the operand.
				  AffineMap tileOffsets = getRegisterIndexToTileOffsetMap(
				      builder, fragmentType.getElementType(), fragmentType.getShape(),
				      fragmentType.isAccOrResult(), elementsPerRegister, logicalValueIdDim);

				  // Inside a tile, each group of `kThreadsPerRow` lanes shares a row; a
				  // lane's column is its position in the group times the register width in
				  // elements, plus the value's position inside the register.
				  AffineExpr row =
				      tileOffsets.getResult(0) + laneId.floorDiv(kThreadsPerRow);
				  AffineExpr col = tileOffsets.getResult(1) +
				                   (laneId % kThreadsPerRow) * elementsPerRegister +
				                   (logicalValueIdDim % elementsPerRegister);
				  return AffineMap::get(2, 0, {row, col}, ctx);
				}

				/// Fills in the parameters for loading the fragment `fragType`:
				///   - `targetLayout` is `row` for "AOp"/"COp" fragments and `col` otherwise;
				///   - `contiguousDimType` is `Parallel` when `transpose` is set and
				///     `Reduction` otherwise;
				///   - `numTiles` is the number of 8-row x 128-bit tiles in the fragment,
				///     counting rows along dim 0 for the `row` layout and along dim 1 for
				///     the `col` layout.
				/// The visible code path always succeeds.
				FailureOr<LdMatrixParams> getLdMatrixParams(MMAMatrixType fragType,
				                                            bool transpose) {
				  LdMatrixParams params;
				  params.fragmentType = fragType;

				  const bool isRowLayout =
				      fragType.getOperand() == "AOp" || fragType.getOperand() == "COp";
				  params.targetLayout =
				      isRowLayout ? NVVM::MMALayout::row : NVVM::MMALayout::col;
				  params.contiguousDimType =
				      transpose ? IteratorType::Parallel : IteratorType::Reduction;

				  // The two layouts use the same formula with the roles of the two
				  // dimensions exchanged.
				  const ArrayRef<int64_t> shape = fragType.getShape();
				  const int64_t tiledExtent = isRowLayout ? shape[0] : shape[1];
				  const int64_t contiguousExtent = isRowLayout ? shape[1] : shape[0];
				  params.numTiles =
				      (tiledExtent / kNumRowsPerTile) *
				      ((contiguousExtent * fragType.getElementTypeBitWidth()) / 128);

				  return params;
				}

				/// Returns a one-dimensional affine map laneId -> (coord0, coord1) giving the
				/// matrix coordinate each lane supplies for the load described by `params`.
				/// Fails if `params.contiguousDimType` is neither `Reduction` nor `Parallel`.
				FailureOr<AffineMap>
				getLaneIdToLdMatrixMatrixCoord(Location loc, OpBuilder &builder,
				                               const LdMatrixParams &params) {
				  MLIRContext *ctx = builder.getContext();
				  const ArrayRef<int64_t> shape = params.fragmentType.getShape();
				  const int elementBits =
				      static_cast<int>(params.fragmentType.getElementTypeBitWidth());
				  const int elementsPer128b = 128 / elementBits;
				  AffineExpr lane = getAffineDimExpr(0, ctx);

				  // Row-major A|C or col-major B operands: lanes walk down dim 0 first and
				  // then advance by one 128-bit group of elements.
				  if (params.contiguousDimType == IteratorType::Reduction) {
				    AffineExpr row = lane % shape[0];
				    AffineExpr col = lane.floorDiv(shape[0]) * elementsPer128b;
				    return AffineMap::get(1, 0, {row, col}, ctx);
				  }

				  // Col-major A|C or row-major B operands. The operand shape is already
				  // pre-transposed (e.g. 8x16 = KxN).
				  if (params.contiguousDimType == IteratorType::Parallel) {
				    // One thread per 128b row, so a tile is served by as many lanes as it
				    // has rows.
				    const int64_t lanesPerTile = kNumRowsPerTile;
				    const int64_t tilesAcross = (shape[0] * elementBits) / 128;
				    // Lanes are assigned in groups of 8, first across columns and then to
				    // rows. This is the transpose of what `ldmatrix` expects, but once
				    // `ldmatrix` gets the `.trans` qualifier the final effect is to
				    // transpose just the blocks.
				    AffineExpr group = lane.floorDiv(lanesPerTile);
				    AffineExpr tileCol = group % tilesAcross;
				    AffineExpr tileRow = group.floorDiv(tilesAcross);
				    AffineExpr first = tileCol * elementsPer128b;
				    AffineExpr second =
				        tileRow * kNumRowsPerTile + (lane % kNumRowsPerTile);
				    return AffineMap::get(1, 0, {first, second}, ctx);
				  }

				  return failure();
				}

				} // namespace NvvmMmaOperandBaseTileOperand8x128
				} // namespace gpu
				} // namespace mlir
				No newline at end of file

mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp

//===- VectorToGPU.cpp - Convert vector to GPU dialect ----------- C++ --===//		//===- VectorToGPU.cpp - Convert vector to GPU dialect ----------- C++ --===//
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
//		//
// This file implements lowering of vector operations to GPU dialect ops.		// This file implements lowering of vector operations to GPU dialect ops.
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include <type_traits>		#include <type_traits>

		#include "mlir/Conversion/VectorToGPU/NvvmMMASupport.h"
#include "mlir/Conversion/VectorToGPU/VectorToGPU.h"		#include "mlir/Conversion/VectorToGPU/VectorToGPU.h"

#include "../PassDetail.h"		#include "../PassDetail.h"
#include "mlir/Analysis/SliceAnalysis.h"		#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"		#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/GPU/GPUDialect.h"		#include "mlir/Dialect/GPU/GPUDialect.h"
		#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"		#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/SCF.h"		#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/Utils/StructuredOpsUtils.h"		#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"		#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/Vector/Utils/VectorUtils.h"		#include "mlir/Dialect/Vector/Utils/VectorUtils.h"
#include "mlir/IR/Builders.h"		#include "mlir/IR/Builders.h"
		#include "mlir/IR/BuiltinTypes.h"
		#include "mlir/IR/MLIRContext.h"
		#include "mlir/IR/PatternMatch.h"
#include "mlir/Pass/Pass.h"		#include "mlir/Pass/Pass.h"
		#include "mlir/Support/LogicalResult.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"		#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/Passes.h"		#include "mlir/Transforms/Passes.h"
		#include "llvm/ADT/TypeSwitch.h"

using namespace mlir;		namespace mlir {
		namespace {

		/// Appends the indices of the transfer op `xferOp` to `indices` (expected to
		/// be empty on entry). Then, for every result of the op's permutation map
		/// that is a plain dimension, replaces the index at that dimension with
		/// "current index + next result of `offsetMap`". Results of `offsetMap` are
		/// consumed in order, one per dimension result. `dimValues` supplies the
		/// values for the dimension placeholders of `offsetMap`.
		template <typename TransferOpType>
		void getXferIndices(OpBuilder &b, TransferOpType xferOp, AffineMap offsetMap,
		                    ArrayRef<Value> dimValues, SmallVector<Value, 4> &indices) {
		  auto originalIndices = xferOp.getIndices();
		  indices.append(originalIndices.begin(), originalIndices.end());

		  Location loc = xferOp.getLoc();
		  unsigned nextOffsetResult = 0;
		  for (auto result : xferOp.getPermutationMap().getResults()) {
		    auto dimExpr = result.template dyn_cast<AffineDimExpr>();
		    // Only plain dimension results receive an offset.
		    if (!dimExpr)
		      continue;

		    const unsigned position = dimExpr.getPosition();
		    // Operands of the composed apply: the caller's dim values followed by
		    // the current index, which binds to the extra trailing dimension.
		    SmallVector<Value, 3> applyOperands(dimValues.begin(), dimValues.end());
		    applyOperands.push_back(indices[position]);
		    AffineExpr currentIndexDim = b.getAffineDimExpr(offsetMap.getNumDims());
		    indices[position] = makeComposedAffineApply(
		        b, loc, currentIndexDim + offsetMap.getResult(nextOffsetResult++),
		        applyOperands);
		  }
		}
		} // namespace

// Return true if the contract op can be convert to MMA matmul.		// Return true if the contract op can be convert to MMA matmul.
static bool contractSupportsMMAMatrixType(vector::ContractionOp contract) {		static bool contractSupportsMMAMatrixType(vector::ContractionOp contract,
		bool useWmma) {
if (llvm::size(contract.getMasks()) != 0)		if (llvm::size(contract.getMasks()) != 0)
return false;		return false;

using MapList = ArrayRef<ArrayRef<AffineExpr>>;		using MapList = ArrayRef<ArrayRef<AffineExpr>>;
auto infer = [](MapList m) { return AffineMap::inferFromExprList(m); };		auto infer = [](MapList m) { return AffineMap::inferFromExprList(m); };
AffineExpr m, n, k;		AffineExpr m, n, k;
bindDims(contract.getContext(), m, n, k);		bindDims(contract.getContext(), m, n, k);
auto iteratorTypes = contract.getIteratorTypes().getValue();		auto iteratorTypes = contract.getIteratorTypes().getValue();
if (!(isParallelIterator(iteratorTypes[0]) &&		if (!(isParallelIterator(iteratorTypes[0]) &&
isParallelIterator(iteratorTypes[1]) &&		isParallelIterator(iteratorTypes[1]) &&
isReductionIterator(iteratorTypes[2])))		isReductionIterator(iteratorTypes[2])))
return false;		return false;

// The contract needs to represent a matmul to be able to convert to		// The contract needs to represent a matmul to be able to convert to
// MMAMatrix matmul.		// MMAMatrix matmul.
if (contract.getIndexingMaps() != infer({{m, k}, {k, n}, {m, n}}))		if (useWmma && contract.getIndexingMaps() != infer({{m, k}, {k, n}, {m, n}}))
		return false;
		if (!useWmma && contract.getIndexingMaps() != infer({{m, k}, {n, k}, {m, n}}))
return false;		return false;
		nirvedhmeshram: Why is this ordering different than the wmma case?
		christopherbate (author): The wmma code treats the canonical layout as "row-row-row", whereas we need "row-col-row", because "row-col-row" is what the actual `mma.sync` instructions ask for. If the user provides data as "row-row-row", we need to ensure that a transpose operation is manifested in the IR (and merged into the `vector.transfer_read`). This then serves as an indicator that the elements should be transposed during loading, either by passing the "transpose=true" attribute to `nvgpu.ldmatrix` or by loading the elements individually in a transposed manner.

return true;		return true;
}		}

// Return the stide for the dimension 0 of \|type\| if it is a memref and has a		// Return the stide for the dimension 0 of \|type\| if it is a memref and has a
// constant stride.		// constant stride.
// NOTE(review): the `return false;` below is converted to an engaged
// llvm::Optional<int64_t> holding 0, not llvm::None, so a non-memref type is
// reported as having stride 0 and is not rejected by callers that test the
// result with `!` -- confirm whether llvm::None was intended.
static llvm::Optional<int64_t>		static llvm::Optional<int64_t>
getMemrefConstantHorizontalStride(ShapedType type) {		getMemrefConstantHorizontalStride(ShapedType type) {
auto memrefType = type.dyn_cast<MemRefType>();		auto memrefType = type.dyn_cast<MemRefType>();
if (!memrefType)		if (!memrefType)
return false;		return false;
// If the memref is 0 or 1D the horizontal stride is 0.		// If the memref is 0 or 1D the horizontal stride is 0.
if(memrefType.getRank() < 2)		if (memrefType.getRank() < 2)
return 0;		return 0;
int64_t offset = 0;		int64_t offset = 0;
SmallVector<int64_t, 2> strides;		SmallVector<int64_t, 2> strides;
if (failed(getStridesAndOffset(memrefType, strides, offset)) \|\|		if (failed(getStridesAndOffset(memrefType, strides, offset)) \|\|
strides.back() != 1)		strides.back() != 1)
return llvm::None;		return llvm::None;
int64_t stride = strides[strides.size() - 2];		int64_t stride = strides[strides.size() - 2];
if (stride == ShapedType::kDynamicStrideOrOffset)		if (stride == ShapedType::kDynamicStrideOrOffset)
return llvm::None;		return llvm::None;
return stride;		return stride;
}		}

// Return true if the transfer op can be converted to a MMA matrix load.		// Return true if the transfer op can be converted to a MMA matrix load.
static bool transferReadSupportsMMAMatrixType(vector::TransferReadOp readOp) {		static bool transferReadSupportsMMAMatrixType(vector::TransferReadOp readOp,
		bool useWmma) {
if (readOp.getMask() \|\| readOp.hasOutOfBoundsDim() \|\|		if (readOp.getMask() \|\| readOp.hasOutOfBoundsDim() \|\|
readOp.getVectorType().getRank() != 2)		readOp.getVectorType().getRank() != 2)
return false;		return false;
if (!getMemrefConstantHorizontalStride(readOp.getShapedType()))		if (!getMemrefConstantHorizontalStride(readOp.getShapedType()))
return false;		return false;
AffineMap map = readOp.getPermutationMap();		AffineMap map = readOp.getPermutationMap();
OpBuilder b(readOp.getContext());		OpBuilder b(readOp.getContext());
AffineExpr innerDim = b.getAffineDimExpr(map.getNumDims() - 1);		AffineExpr innerDim = b.getAffineDimExpr(map.getNumDims() - 1);
AffineExpr zero = b.getAffineConstantExpr(0);		AffineExpr zero = b.getAffineConstantExpr(0);
auto broadcastInnerDim = AffineMap::get(map.getNumDims(), 0, {zero, innerDim},		auto broadcastInnerDim = AffineMap::get(map.getNumDims(), 0, {zero, innerDim},
readOp.getContext());		readOp.getContext());

		if (useWmma) {
// TODO: Support transpose once it is added to GPU dialect ops.		// TODO: Support transpose once it is added to GPU dialect ops.
// For now we only support (d0, d1) -> (d0, d1) and (d0, d1) -> (0, d1).		// For now we only support (d0, d1) -> (d0, d1) and (d0, d1) -> (0, d1).
return !(!map.isMinorIdentity() && map != broadcastInnerDim);		return map.isMinorIdentity() \|\| map == broadcastInnerDim;
		}

		return true;
}		}

// Return true if the transfer op can be converted to a MMA matrix store.		// Return true if the transfer op can be converted to a MMA matrix store.
static bool		static bool
transferWriteSupportsMMAMatrixType(vector::TransferWriteOp writeOp) {		transferWriteSupportsMMAMatrixType(vector::TransferWriteOp writeOp) {
// TODO: support 0-d corner case.		// TODO: support 0-d corner case.
if (writeOp.getTransferRank() == 0)		if (writeOp.getTransferRank() == 0)
return false;		return false;
▲ Show 20 Lines • Show All 41 Lines • ▼ Show 20 Lines	convertElementwiseOpToMMA(Operation *op) {
return llvm::None;		return llvm::None;
}		}

/// Return true if the op is supported as elementwise op on MMAMatrix type.		/// Return true if the op is supported as elementwise op on MMAMatrix type.
static bool elementwiseSupportsMMAMatrixType(Operation *op) {		static bool elementwiseSupportsMMAMatrixType(Operation *op) {
return convertElementwiseOpToMMA(op).hasValue();		return convertElementwiseOpToMMA(op).hasValue();
}		}

static bool supportsMMaMatrixType(Operation *op) {		static bool supportsMMaMatrixType(Operation *op, bool useWmma) {
if (isa<scf::ForOp, scf::YieldOp>(op))		if (isa<scf::ForOp, scf::YieldOp>(op))
return true;		return true;
if (auto transferRead = dyn_cast<vector::TransferReadOp>(op))		if (auto transferRead = dyn_cast<vector::TransferReadOp>(op))
return transferReadSupportsMMAMatrixType(transferRead);		return transferReadSupportsMMAMatrixType(transferRead, useWmma);
if (auto transferWrite = dyn_cast<vector::TransferWriteOp>(op))		if (auto transferWrite = dyn_cast<vector::TransferWriteOp>(op))
return transferWriteSupportsMMAMatrixType(transferWrite);		return transferWriteSupportsMMAMatrixType(transferWrite);
if (auto contract = dyn_cast<vector::ContractionOp>(op))		if (auto contract = dyn_cast<vector::ContractionOp>(op))
return contractSupportsMMAMatrixType(contract);		return contractSupportsMMAMatrixType(contract, useWmma);
if (auto constant = dyn_cast<arith::ConstantOp>(op))		if (auto constant = dyn_cast<arith::ConstantOp>(op))
return constantSupportsMMAMatrixType(constant);		return constantSupportsMMAMatrixType(constant);
if (auto broadcast = dyn_cast<vector::BroadcastOp>(op))		if (auto broadcast = dyn_cast<vector::BroadcastOp>(op))
return broadcastSupportsMMAMatrixType(broadcast);		return broadcastSupportsMMAMatrixType(broadcast);
return elementwiseSupportsMMAMatrixType(op);		return elementwiseSupportsMMAMatrixType(op);
}		}

/// Return an unsorted slice handling scf.for region differently than		/// Return an unsorted slice handling scf.for region differently than
Show All 31 Lines	while (currentIndex != slice.size()) {
slice.insert(forwardSlice.begin(), forwardSlice.end());		slice.insert(forwardSlice.begin(), forwardSlice.end());
++currentIndex;		++currentIndex;
}		}
return slice;		return slice;
}		}

// Analyze slice of operations based on convert op to figure out if the whole		// Analyze slice of operations based on convert op to figure out if the whole
// slice can be converted to MMA operations.		// slice can be converted to MMA operations.
static SetVector<Operation > getOpToConvert(mlir::Operation op) {		static SetVector<Operation > getOpToConvert(mlir::Operation op,
		bool useWmma) {
auto hasVectorDest = [](Operation *op) {		auto hasVectorDest = [](Operation *op) {
return llvm::any_of(op->getResultTypes(),		return llvm::any_of(op->getResultTypes(),
[](Type t) { return t.isa<VectorType>(); });		[](Type t) { return t.isa<VectorType>(); });
};		};
auto hasVectorSrc = [](Operation *op) {		auto hasVectorSrc = [](Operation *op) {
return llvm::any_of(op->getOperandTypes(),		return llvm::any_of(op->getOperandTypes(),
[](Type t) { return t.isa<VectorType>(); });		[](Type t) { return t.isa<VectorType>(); });
};		};
SetVector<Operation *> opToConvert;		SetVector<Operation *> opToConvert;
op->walk([&](vector::ContractionOp contract) {		op->walk([&](vector::ContractionOp contract) {
if (opToConvert.contains(contract.getOperation()))		if (opToConvert.contains(contract.getOperation()))
return;		return;
SetVector<Operation *> dependentOps =		SetVector<Operation *> dependentOps =
getSliceContract(contract, hasVectorDest, hasVectorSrc);		getSliceContract(contract, hasVectorDest, hasVectorSrc);
// If any instruction cannot use MMA matrix type drop the whole		// If any instruction cannot use MMA matrix type drop the whole
// chain. MMA matrix are stored in an opaque type so they cannot be used		// chain. MMA matrix are stored in an opaque type so they cannot be used
// by all operations.		// by all operations.
if (llvm::any_of(dependentOps,		if (llvm::any_of(dependentOps, [useWmma](Operation *op) {
[](Operation *op) { return !supportsMMaMatrixType(op); }))		return !supportsMMaMatrixType(op, useWmma);
		}))
return;		return;
opToConvert.insert(dependentOps.begin(), dependentOps.end());		opToConvert.insert(dependentOps.begin(), dependentOps.end());
});		});
// Sort the operations so that we can convert them in topological order.		// Sort the operations so that we can convert them in topological order.
return topologicalSort(opToConvert);		return topologicalSort(opToConvert);
}		}

namespace {		namespace {
// Transform contract into (m, k)x(k, n)x(m, n) form so that it can be converted		// Transform contract into (m, k)x(k, n)x(m, n) form so that it can be converted
// to MMA matmul.		// to MMA matmul.
		/// Rewrites a `vector.contract` whose iterators are (parallel, parallel,
		/// reduction) into the canonical "TNT" form (m, k) x (n, k) -> (m, n), i.e.
		/// A row-major, B col-major, C row-major, by inserting `vector.transpose` ops
		/// on the operands and/or exchanging lhs and rhs. Fails (no rewrite) when
		/// the op is already canonical, when the iterator types differ, or when the
		/// indexing maps match none of the handled layouts.
		struct PrepareContractToGPUMMASync
		    : public OpRewritePattern<vector::ContractionOp> {
		  using OpRewritePattern<vector::ContractionOp>::OpRewritePattern;

		  LogicalResult matchAndRewrite(vector::ContractionOp op,
		                                PatternRewriter &rewriter) const override {
		    Location loc = op.getLoc();
		    Value lhs = op.getLhs(), rhs = op.getRhs(), res = op.getAcc();

		    // Set up the parallel/reduction structure in right form.
		    using MapList = ArrayRef<ArrayRef<AffineExpr>>;
		    auto infer = [](MapList m) { return AffineMap::inferFromExprList(m); };
		    AffineExpr m, n, k;
		    bindDims(rewriter.getContext(), m, n, k);
		    static constexpr std::array<int64_t, 2> perm = {1, 0};
		    auto iteratorTypes = op.getIteratorTypes().getValue();
		    SmallVector<AffineMap, 4> maps = op.getIndexingMaps();
		    if (!(isParallelIterator(iteratorTypes[0]) &&
		          isParallelIterator(iteratorTypes[1]) &&
		          isReductionIterator(iteratorTypes[2])))
		      return failure();

		    // The canonical form is "TNT" = A row-major, B col-major, C row-major.
		    const auto canonicalForm = infer({{m, k}, {n, k}, {m, n}});
		    if (maps == canonicalForm)
		      return failure();

		    // Result laid out as (m, n): transpose whichever operand is not yet in
		    // (m, k) / (n, k) form.
		    if (maps == infer({{m, k}, {k, n}, {m, n}})) {
		      rhs = rewriter.create<vector::TransposeOp>(loc, rhs, perm);
		    } else if (maps == infer({{k, m}, {n, k}, {m, n}})) {
		      // Only the lhs is transposed here. The rhs must already be (n, k);
		      // a (k, n) rhs is handled by the next branch.
		      lhs = rewriter.create<vector::TransposeOp>(loc, lhs, perm);
		    } else if (maps == infer({{k, m}, {k, n}, {m, n}})) {
		      rhs = rewriter.create<vector::TransposeOp>(loc, rhs, perm);
		      lhs = rewriter.create<vector::TransposeOp>(loc, lhs, perm);
		    }
		    // Result laid out as (n, m): exchange the operands so that the roles of
		    // m and n swap, then transpose as needed.
		    else if (maps == infer({{k, m}, {k, n}, {n, m}})) {
		      std::swap(rhs, lhs);
		      rhs = rewriter.create<vector::TransposeOp>(loc, rhs, perm);
		      lhs = rewriter.create<vector::TransposeOp>(loc, lhs, perm);
		    } else if (maps == infer({{k, m}, {n, k}, {n, m}})) {
		      std::swap(rhs, lhs);
		      rhs = rewriter.create<vector::TransposeOp>(loc, rhs, perm);
		    } else if (maps == infer({{m, k}, {k, n}, {n, m}})) {
		      std::swap(lhs, rhs);
		      lhs = rewriter.create<vector::TransposeOp>(loc, lhs, perm);
		    } else if (maps == infer({{m, k}, {n, k}, {n, m}})) {
		      std::swap(lhs, rhs);
		    } else {
		      return failure();
		    }

		    rewriter.replaceOpWithNewOp<vector::ContractionOp>(
		        op, lhs, rhs, res, rewriter.getAffineMapArrayAttr(canonicalForm),
		        op.getIteratorTypes());
		    return success();
		  }
		};

struct PrepareContractToGPUMMA		struct PrepareContractToGPUMMA
: public OpRewritePattern<vector::ContractionOp> {		: public OpRewritePattern<vector::ContractionOp> {
using OpRewritePattern<vector::ContractionOp>::OpRewritePattern;		using OpRewritePattern<vector::ContractionOp>::OpRewritePattern;

LogicalResult matchAndRewrite(vector::ContractionOp op,		LogicalResult matchAndRewrite(vector::ContractionOp op,
PatternRewriter &rewriter) const override {		PatternRewriter &rewriter) const override {
Location loc = op.getLoc();		Location loc = op.getLoc();
Value lhs = op.getLhs(), rhs = op.getRhs(), res = op.getAcc();		Value lhs = op.getLhs(), rhs = op.getRhs(), res = op.getAcc();
▲ Show 20 Lines • Show All 102 Lines • ▼ Show 20 Lines	if (contract.getRhs() == op.getResult())
return "BOp";		return "BOp";
}		}
return "COp";		return "COp";
}		}

static void convertTransferReadOp(vector::TransferReadOp op,		static void convertTransferReadOp(vector::TransferReadOp op,
llvm::DenseMap<Value, Value> &valueMapping) {		llvm::DenseMap<Value, Value> &valueMapping) {
assert(op.getTransferRank() > 0 && "unexpected 0-d transfer");		assert(op.getTransferRank() > 0 && "unexpected 0-d transfer");
assert(transferReadSupportsMMAMatrixType(op));		assert(transferReadSupportsMMAMatrixType(op, /useWmma=/true));
Optional<int64_t> stride =		Optional<int64_t> stride =
getMemrefConstantHorizontalStride(op.getShapedType());		getMemrefConstantHorizontalStride(op.getShapedType());
AffineMap map = op.getPermutationMap();		AffineMap map = op.getPermutationMap();
// Handle broadcast by setting the stride to 0.		// Handle broadcast by setting the stride to 0.
if (map.getResult(0).isa<AffineConstantExpr>()) {		if (map.getResult(0).isa<AffineConstantExpr>()) {
assert(map.getResult(0).cast<AffineConstantExpr>().getValue() == 0);		assert(map.getResult(0).cast<AffineConstantExpr>().getValue() == 0);
stride = 0;		stride = 0;
}		}
Show All 18 Lines	static void convertTransferWriteOp(vector::TransferWriteOp op,
OpBuilder b(op);		OpBuilder b(op);
Value matrix = valueMapping.find(op.getVector())->second;		Value matrix = valueMapping.find(op.getVector())->second;
b.create<gpu::SubgroupMmaStoreMatrixOp>(op.getLoc(), matrix, op.getSource(),		b.create<gpu::SubgroupMmaStoreMatrixOp>(op.getLoc(), matrix, op.getSource(),
op.getIndices(),		op.getIndices(),
b.getIndexAttr(*stride));		b.getIndexAttr(*stride));
op.erase();		op.erase();
}		}

		/// Builds the 2-D vector type that represents a matrix fragment. Its shape is
		/// `numRegistersPerFragment x elementsPerRegister` and its element type is
		/// `regInfo.registerLLVMType`, unwrapped to its element type when that type is
		/// itself a vector. `fragType` is not consulted.
		static VectorType getMmaSyncVectorOperandType(
		    gpu::MMAMatrixType fragType,
		    const gpu::NvvmMmaOperandBaseTileOperand8x128::FragmentElementInfo
		        &regInfo) {
		  Type scalarType = regInfo.registerLLVMType;
		  if (scalarType.isa<VectorType>())
		    scalarType = scalarType.cast<VectorType>().getElementType();

		  SmallVector<int64_t> fragmentShape{regInfo.numRegistersPerFragment,
		                                     regInfo.elementsPerRegister};
		  return VectorType::get(fragmentShape, scalarType);
		}

		/// Replaces the transfer_read `op` with a single `gpu::MmaLdMatrixOp` reading
		/// from the same source and records the new value for `op`'s result in
		/// `valueMapping`.
		///
		/// The load is marked as transposing when `op`'s permutation map is not a minor
		/// identity. Returns failure when the register type, the ldmatrix parameters or
		/// the lane-id-to-coordinate map cannot be computed for `fragType`.
		static LogicalResult
		creatLdMatrixCompatibleLoads(vector::TransferReadOp op, OpBuilder &builder,
		                             gpu::MMAMatrixType fragType,
		                             llvm::DenseMap<Value, Value> &valueMapping) {
		  Location loc = op->getLoc();
		  // Computed once; used both to pick the ldmatrix parameters and to set the
		  // transpose flag on the created op.
		  const bool isTranspose = !op.getPermutationMap().isMinorIdentity();

		  FailureOr<gpu::NvvmMmaOperandBaseTileOperand8x128::FragmentElementInfo>
		      regInfo =
		          gpu::NvvmMmaOperandBaseTileOperand8x128::getRegisterType(fragType);
		  if (failed(regInfo))
		    return failure();

		  auto params = gpu::NvvmMmaOperandBaseTileOperand8x128::getLdMatrixParams(
		      fragType, /*transpose=*/isTranspose);
		  if (failed(params))
		    return failure();

		  // Adjust the load offset.
		  auto laneId = builder.create<gpu::LaneIdOp>(loc);
		  FailureOr<AffineMap> offsets =
		      gpu::NvvmMmaOperandBaseTileOperand8x128::getLaneIdToLdMatrixMatrixCoord(
		          loc, builder, *params);
		  if (failed(offsets))
		    return failure();

		  VectorType vectorType = getMmaSyncVectorOperandType(fragType, *regInfo);

		  SmallVector<Value, 4> indices;
		  getXferIndices<vector::TransferReadOp>(builder, op, *offsets, {laneId},
		                                         indices);
		  gpu::MmaLdMatrixOp newOp = builder.create<gpu::MmaLdMatrixOp>(
		      loc, vectorType, op.getSource(), indices, isTranspose,
		      params->numTiles);
		  valueMapping[op.getResult()] = newOp->getResult(0);
		  return success();
		}

		/// Replaces the transfer_read `op` with explicit loads that assemble a vector
		/// of the type returned by `getMmaSyncVectorOperandType`, and records that
		/// vector for `op`'s result in `valueMapping`.
		///
		/// Two cases are handled:
		///   * non-transposed read into a row-layout fragment: one `vector.load` per
		///     row of the result vector;
		///   * transposed read into a column-layout ("BOp") fragment: one
		///     `memref.load` per element.
		/// Any other combination returns failure, as does a failure to compute the
		/// register type or the operand coordinates for `fragmentType`.
		static LogicalResult
		createNonLdMatrixLoads(vector::TransferReadOp op, OpBuilder &builder,
		                       gpu::MMAMatrixType fragmentType,
		                       llvm::DenseMap<Value, Value> &valueMapping) {
		  Location loc = op.getLoc();
		  FailureOr<gpu::NvvmMmaOperandBaseTileOperand8x128::FragmentElementInfo>
		      regInfo = gpu::NvvmMmaOperandBaseTileOperand8x128::getRegisterType(
		          fragmentType);
		  if (failed(regInfo))
		    return failure();

		  NVVM::MMALayout targetLayout = fragmentType.getOperand() == "BOp"
		                                     ? NVVM::MMALayout::col
		                                     : NVVM::MMALayout::row;
		  bool isTransposeLoad = !op.getPermutationMap().isMinorIdentity();

		  // Reject unsupported combinations before any op is created, so that a
		  // failed conversion does not leave unused ops behind.
		  const bool isVectorizedRowLoad =
		      !isTransposeLoad && targetLayout == NVVM::MMALayout::row;
		  const bool isElementwiseColLoad =
		      isTransposeLoad && targetLayout == NVVM::MMALayout::col;
		  if (!isVectorizedRowLoad && !isElementwiseColLoad)
		    return failure();

		  Value laneId = builder.create<gpu::LaneIdOp>(loc);

		  // This is the individual element type.
		  Type loadedElType = regInfo->registerLLVMType;
		  VectorType vectorType = getMmaSyncVectorOperandType(fragmentType, *regInfo);

		  // Start from a zero-filled vector and insert the loaded values into it.
		  Value fill = builder.create<arith::ConstantOp>(
		      loc, fragmentType.getElementType(),
		      builder.getZeroAttr(fragmentType.getElementType()));
		  Value result = builder.create<vector::SplatOp>(loc, fill, vectorType);

		  if (isVectorizedRowLoad) {
		    // Vectorized loads: each row of the result is loaded as one vector.
		    if (!loadedElType.isa<VectorType>())
		      loadedElType = VectorType::get({1}, loadedElType);

		    for (int64_t i = 0, e = vectorType.getShape()[0]; i < e; ++i) {
		      FailureOr<AffineMap> coords = gpu::NvvmMmaOperandBaseTileOperand8x128::
		          getLaneIdAndValueIdToOperandCoord(loc, builder, fragmentType);
		      if (failed(coords))
		        return failure();
		      Value logicalValueId = builder.create<arith::ConstantOp>(
		          loc, builder.getIndexType(),
		          builder.getIndexAttr(i * regInfo->elementsPerRegister));
		      SmallVector<Value, 4> newIndices;
		      getXferIndices<vector::TransferReadOp>(
		          builder, op, *coords, {laneId, logicalValueId}, newIndices);

		      Value el = builder.create<vector::LoadOp>(loc, loadedElType,
		                                                op.getSource(), newIndices);
		      result = builder.create<vector::InsertOp>(loc, el, result,
		                                                builder.getI64ArrayAttr(i));
		    }
		  } else {
		    // Transposed column-layout load: load each element individually.
		    if (auto vecType = loadedElType.dyn_cast<VectorType>())
		      loadedElType = vecType.getElementType();

		    for (int64_t i = 0, e = vectorType.getShape()[0]; i < e; ++i) {
		      for (int64_t innerIdx = 0, innerEnd = vectorType.getShape()[1];
		           innerIdx < innerEnd; ++innerIdx) {
		        Value logicalValueId = builder.create<arith::ConstantOp>(
		            loc, builder.getIndexType(),
		            builder.getIndexAttr(i * regInfo->elementsPerRegister + innerIdx));
		        FailureOr<AffineMap> coords = gpu::NvvmMmaOperandBaseTileOperand8x128::
		            getLaneIdAndValueIdToOperandCoord(loc, builder, fragmentType);
		        if (failed(coords))
		          return failure();

		        SmallVector<Value, 4> newIndices;
		        getXferIndices<vector::TransferReadOp>(
		            builder, op, *coords, {laneId, logicalValueId}, newIndices);
		        Value el = builder.create<memref::LoadOp>(loc, loadedElType,
		                                                  op.getSource(), newIndices);
		        result = builder.create<vector::InsertOp>(
		            loc, el, result, builder.getI64ArrayAttr({i, innerIdx}));
		      }
		    }
		  }

		  valueMapping[op.getResult()] = result;
		  return success();
		}

		/// Lowers the transfer_read `op` to loads: an ldmatrix-based load when the
		/// fragment qualifies, and explicit loads otherwise.
		///
		/// The fragment type is built from `op`'s vector type and the operand kind
		/// returned by `inferFragType`. The ldmatrix path is taken only if the inferred
		/// tile width is 128 bits and the fragment is neither a transposed "BOp" nor a
		/// "COp" with elements narrower than 16 bits.
		LogicalResult
		convertTransferReadToLoads(vector::TransferReadOp op,
		                           llvm::DenseMap<Value, Value> &valueMapping) {
		  OpBuilder b(op);
		  const char *operandKind = inferFragType(op);
		  gpu::MMAMatrixType type =
		      gpu::MMAMatrixType::get(op.getVectorType().getShape(),
		                              op.getVectorType().getElementType(), operandKind);

		  const bool tileIsNot128Bits =
		      gpu::NvvmMmaOperandBaseTileOperand8x128::inferTileWidthInBits(
		          type.getElementType(), type.isAccOrResult()) != 128;
		  // Evaluated lazily so the bit width is queried only in the cases that need it.
		  auto hasSubHalfElements = [&]() {
		    return type.getElementTypeBitWidth() < 16;
		  };
		  const bool isNarrowTransposedB =
		      !op.getPermutationMap().isMinorIdentity() &&
		      (type.getOperand() == "BOp") && hasSubHalfElements();
		  const bool isNarrowAccumulator =
		      (type.getOperand() == "COp") && hasSubHalfElements();

		  if (tileIsNot128Bits || isNarrowTransposedB || isNarrowAccumulator)
		    return createNonLdMatrixLoads(op, b, type, valueMapping);
		  return creatLdMatrixCompatibleLoads(op, b, type, valueMapping);
		}

		/// Replaces the transfer_write `op` with one `vector.store` per row of the
		/// converted value recorded for the written vector in `valueMapping`, then
		/// erases `op`.
		///
		/// The written vector is treated as a "COp" fragment. Returns failure when the
		/// written vector has no entry in `valueMapping`, or when the register type or
		/// the operand coordinates cannot be computed.
		LogicalResult
		convertTransferWriteToStores(vector::TransferWriteOp op,
		                             llvm::DenseMap<Value, Value> &valueMapping) {
		  // `lookup` returns a null Value for a missing key; bail out instead of
		  // dereferencing the end iterator.
		  Value matrix = valueMapping.lookup(op.getVector());
		  if (!matrix)
		    return failure();

		  OpBuilder b(op);
		  Location loc = op->getLoc();

		  gpu::MMAMatrixType matType =
		      gpu::MMAMatrixType::get(op.getVectorType().getShape(),
		                              op.getVectorType().getElementType(), "COp");
		  FailureOr<gpu::NvvmMmaOperandBaseTileOperand8x128::FragmentElementInfo>
		      regInfo =
		          gpu::NvvmMmaOperandBaseTileOperand8x128::getRegisterType(matType);
		  if (failed(regInfo))
		    return failure();

		  VectorType vectorType = getMmaSyncVectorOperandType(matType, *regInfo);
		  Value laneId = b.create<gpu::LaneIdOp>(loc);

		  for (int64_t i = 0, e = vectorType.getShape()[0]; i < e; ++i) {
		    Value logicalValueId = b.create<arith::ConstantOp>(
		        loc, b.getIndexType(),
		        b.getIndexAttr(i * regInfo->elementsPerRegister));
		    FailureOr<AffineMap> coords = gpu::NvvmMmaOperandBaseTileOperand8x128::
		        getLaneIdAndValueIdToOperandCoord(loc, b, matType);
		    if (failed(coords))
		      return failure();

		    // Store row `i` of the converted value at the computed coordinates.
		    Value el = b.create<vector::ExtractOp>(loc, matrix, ArrayRef<int64_t>{i});
		    SmallVector<Value, 4> newIndices;
		    getXferIndices<vector::TransferWriteOp>(
		        b, op, *coords, {laneId, logicalValueId}, newIndices);
		    b.create<vector::StoreOp>(loc, el, op.getSource(), newIndices);
		  }
		  op->erase();
		  return success();
		}

/// Create a SubgroupMmaComputeOp from the already-converted operands of `op`.		/// Create a SubgroupMmaComputeOp from the already-converted operands of `op`.
static void convertContractOp(vector::ContractionOp op,		static void convertContractOp(vector::ContractionOp op,
llvm::DenseMap<Value, Value> &valueMapping) {		llvm::DenseMap<Value, Value> &valueMapping) {
OpBuilder b(op);		OpBuilder b(op);
// NOTE(review): the lookups below assume every operand is already mapped.		// NOTE(review): the lookups below assume every operand is already mapped.
Value opA = valueMapping.find(op.getLhs())->second;		Value opA = valueMapping.find(op.getLhs())->second;
Value opB = valueMapping.find(op.getRhs())->second;		Value opB = valueMapping.find(op.getRhs())->second;
Value opC = valueMapping.find(op.getAcc())->second;		Value opC = valueMapping.find(op.getAcc())->second;
Value matmul = b.create<gpu::SubgroupMmaComputeOp>(op.getLoc(), opC.getType(),		Value matmul = b.create<gpu::SubgroupMmaComputeOp>(op.getLoc(), opC.getType(),
opA, opB, opC);		opA, opB, opC);
// Record the new value so users of the contraction result can find it.		// Record the new value so users of the contraction result can find it.
valueMapping[op.getResult()] = matmul;		valueMapping[op.getResult()] = matmul;
}		}

		/// Replaces the contraction `op` with a `gpu::MmaSyncOp` built from the
		/// converted operands recorded in `valueMapping`, and records the new value
		/// for `op`'s result.
		///
		/// The `mmaShape` attribute is `[m, n, k]`, where `m` and `k` are dimensions 0
		/// and 1 of the original LHS vector type and `n` is dimension 0 of the original
		/// RHS vector type. Returns failure when any operand has no entry in
		/// `valueMapping`.
		LogicalResult
		convertContractOpToMmaSync(vector::ContractionOp op,
		                           llvm::DenseMap<Value, Value> &valueMapping) {
		  // `lookup` returns a null Value for a missing key; bail out instead of
		  // dereferencing the end iterator.
		  Value opA = valueMapping.lookup(op.getLhs());
		  Value opB = valueMapping.lookup(op.getRhs());
		  Value opC = valueMapping.lookup(op.getAcc());
		  if (!opA || !opB || !opC)
		    return failure();

		  OpBuilder b(op);
		  ArrayRef<int64_t> lhsShape =
		      op.getLhs().getType().cast<VectorType>().getShape();
		  ArrayRef<int64_t> rhsShape =
		      op.getRhs().getType().cast<VectorType>().getShape();
		  int64_t m = lhsShape[0];
		  int64_t n = rhsShape[0];
		  int64_t k = lhsShape[1];
		  Value matmul = b.create<gpu::MmaSyncOp>(op.getLoc(), opC.getType(), opA, opB,
		                                          opC, b.getI64ArrayAttr({m, n, k}));
		  valueMapping[op.getResult()] = matmul;
		  return success();
		}

/// Convert a 2D splat ConstantOp to a SubgroupMmaConstantMatrix op.		/// Convert a 2D splat ConstantOp to a SubgroupMmaConstantMatrix op.
static void convertConstantOp(arith::ConstantOp op,		static void convertConstantOp(arith::ConstantOp op,
llvm::DenseMap<Value, Value> &valueMapping) {		llvm::DenseMap<Value, Value> &valueMapping) {
assert(constantSupportsMMAMatrixType(op));		assert(constantSupportsMMAMatrixType(op));
OpBuilder b(op);		OpBuilder b(op);
Attribute splat =		Attribute splat =
op.getValue().cast<SplatElementsAttr>().getSplatValue<Attribute>();		op.getValue().cast<SplatElementsAttr>().getSplatValue<Attribute>();
auto scalarConstant =		auto scalarConstant =
▲ Show 20 Lines • Show All 96 Lines • ▼ Show 20 Lines	static void convertElementwiseOp(Operation *op, gpu::MMAElementwiseOp opType,
SmallVector<Value> matrixOperands;		SmallVector<Value> matrixOperands;
for (Value operand : op->getOperands())		for (Value operand : op->getOperands())
matrixOperands.push_back(valueMapping.find(operand)->second);		matrixOperands.push_back(valueMapping.find(operand)->second);
Value newOp = b.create<gpu::SubgroupMmaElementwiseOp>(		Value newOp = b.create<gpu::SubgroupMmaElementwiseOp>(
op->getLoc(), matrixOperands[0].getType(), matrixOperands, opType);		op->getLoc(), matrixOperands[0].getType(), matrixOperands, opType);
valueMapping[op->getResult(0)] = newOp;		valueMapping[op->getResult(0)] = newOp;
}		}

/// Adds the contraction-preparation pattern and CombineTransferReadOpTranspose.		/// Adds the contraction-preparation pattern and CombineTransferReadOpTranspose.
void mlir::populatePrepareVectorToMMAPatterns(RewritePatternSet &patterns) {		void populatePrepareVectorToMMAPatterns(RewritePatternSet &patterns,
		bool useWmma) {
		// `useWmma` selects PrepareContractToGPUMMA; otherwise the
		// PrepareContractToGPUMMASync variant is registered.
		if (useWmma) {
patterns.add<PrepareContractToGPUMMA, CombineTransferReadOpTranspose>(		patterns.add<PrepareContractToGPUMMA, CombineTransferReadOpTranspose>(
patterns.getContext());		patterns.getContext());
		return;
		}
		patterns.add<PrepareContractToGPUMMASync, CombineTransferReadOpTranspose>(
		patterns.getContext());
}		}

/// Converts each op collected by getOpToConvert under `rootOp`, by op kind.		/// Converts each op collected by getOpToConvert under `rootOp`, by op kind.
void mlir::convertVectorToMMAOps(Operation *rootOp) {		void convertVectorToMMAOps(Operation *rootOp) {
SetVector<Operation *> ops = getOpToConvert(rootOp);		SetVector<Operation > ops = getOpToConvert(rootOp, /useWmma=*/true);
		nirvedhmeshramUnsubmitted Done Reply Inline Actions Isnt this always false so nvgpu sub-path is never used? nirvedhmeshram: Isnt this always false so nvgpu sub-path is never used?
		nirvedhmeshramUnsubmitted Done Reply Inline Actions I see that for the nvgpu path another function is provided. Sorry about the noise. nirvedhmeshram: I see that for the nvgpu path another function is provided. Sorry about the noise.
// Shared by all converters: maps original values to their replacements.		// Shared by all converters: maps original values to their replacements.
llvm::DenseMap<Value, Value> valueMapping;		llvm::DenseMap<Value, Value> valueMapping;
for (Operation *op : ops) {		for (Operation *op : ops) {
if (auto transferRead = dyn_cast<vector::TransferReadOp>(op)) {		if (auto transferRead = dyn_cast<vector::TransferReadOp>(op)) {
convertTransferReadOp(transferRead, valueMapping);		convertTransferReadOp(transferRead, valueMapping);
} else if (auto transferWrite = dyn_cast<vector::TransferWriteOp>(op)) {		} else if (auto transferWrite = dyn_cast<vector::TransferWriteOp>(op)) {
convertTransferWriteOp(transferWrite, valueMapping);		convertTransferWriteOp(transferWrite, valueMapping);
} else if (auto contractOp = dyn_cast<vector::ContractionOp>(op)) {		} else if (auto contractOp = dyn_cast<vector::ContractionOp>(op)) {
convertContractOp(contractOp, valueMapping);		convertContractOp(contractOp, valueMapping);
} else if (auto constantOp = dyn_cast<arith::ConstantOp>(op)) {		} else if (auto constantOp = dyn_cast<arith::ConstantOp>(op)) {
convertConstantOp(constantOp, valueMapping);		convertConstantOp(constantOp, valueMapping);
} else if (auto broadcastOp = dyn_cast<vector::BroadcastOp>(op)) {		} else if (auto broadcastOp = dyn_cast<vector::BroadcastOp>(op)) {
convertBroadcastOp(broadcastOp, valueMapping);		convertBroadcastOp(broadcastOp, valueMapping);
} else if (auto forOp = dyn_cast<scf::ForOp>(op)) {		} else if (auto forOp = dyn_cast<scf::ForOp>(op)) {
convertForOp(forOp, valueMapping);		convertForOp(forOp, valueMapping);
} else if (auto yiledOp = dyn_cast<scf::YieldOp>(op)) {		} else if (auto yiledOp = dyn_cast<scf::YieldOp>(op)) {
convertYieldOp(yiledOp, valueMapping);		convertYieldOp(yiledOp, valueMapping);
// Ops matching none of the cases are left untouched.		// Ops matching none of the cases are left untouched.
} else if (auto elementwiseType = convertElementwiseOpToMMA(op)) {		} else if (auto elementwiseType = convertElementwiseOpToMMA(op)) {
convertElementwiseOp(op, *elementwiseType, valueMapping);		convertElementwiseOp(op, *elementwiseType, valueMapping);
}		}
}		}
}		}

		/// Converts the ops collected by `getOpToConvert(rootOp, /*useWmma=*/false)`:
		/// transfer reads become loads, transfer writes become stores, and
		/// contractions become mma.sync ops.
		///
		/// `arith.constant` ops are accepted but not rewritten, and no entry is added
		/// to the value mapping for them. Any other op kind, or a failing conversion,
		/// makes the whole function return failure; ops converted before that point
		/// are not rolled back.
		LogicalResult convertVectorToNVVMCompatibleMMASync(Operation *rootOp) {
		  SetVector<Operation *> ops = getOpToConvert(rootOp, /*useWmma=*/false);
		  llvm::DenseMap<Value, Value> valueMapping;
		  for (Operation *op : ops) {
		    LogicalResult status =
		        llvm::TypeSwitch<Operation *, LogicalResult>(op)
		            .Case([&](vector::TransferReadOp transferReadOp) {
		              return convertTransferReadToLoads(transferReadOp, valueMapping);
		            })
		            .Case([&](vector::TransferWriteOp transferWriteOp) {
		              return convertTransferWriteToStores(transferWriteOp,
		                                                  valueMapping);
		            })
		            .Case([&](vector::ContractionOp contractionOp) {
		              return convertContractOpToMmaSync(contractionOp, valueMapping);
		            })
		            .Case([&](arith::ConstantOp) { return success(); })
		            .Default([&](Operation *) { return failure(); });
		    if (failed(status))
		      return failure();
		  }
		  return success();
		}

namespace {		namespace {

/// Pass that prepares vector contractions and converts vector ops to GPU MMA ops.		/// Pass that prepares vector contractions and converts vector ops to GPU MMA ops.
struct ConvertVectorToGPUPass		struct ConvertVectorToGPUPass
: public ConvertVectorToGPUBase<ConvertVectorToGPUPass> {		: public ConvertVectorToGPUBase<ConvertVectorToGPUPass> {

		/// Stores `_useWmma` into the pass's `useWmma` option.
		explicit ConvertVectorToGPUPass(bool _useWmma) { useWmma.setValue(_useWmma); }

/// Runs the preparation patterns greedily, then the conversion itself.		/// Runs the preparation patterns greedily, then the conversion itself.
void runOnOperation() override {		void runOnOperation() override {
RewritePatternSet patterns(&getContext());		RewritePatternSet patterns(&getContext());
populatePrepareVectorToMMAPatterns(patterns);		populatePrepareVectorToMMAPatterns(patterns, useWmma.getValue());
// The result of the greedy driver is deliberately ignored.		// The result of the greedy driver is deliberately ignored.
(void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));		(void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));

		// With `useWmma` set, run the conversion that cannot report failure.
		if (useWmma.getValue()) {
convertVectorToMMAOps(getOperation());		convertVectorToMMAOps(getOperation());
		return;
		}

		// Otherwise run the mma.sync conversion and fail the pass if it fails.
		if (failed(convertVectorToNVVMCompatibleMMASync(getOperation())))
		return signalPassFailure();
}		}
};		};

} // namespace		} // namespace

/// Creates a new instance of ConvertVectorToGPUPass.		/// Creates a new instance of ConvertVectorToGPUPass.
std::unique_ptr<Pass> mlir::createConvertVectorToGPUPass() {		std::unique_ptr<Pass> createConvertVectorToGPUPass(bool useWmma) {
return std::make_unique<ConvertVectorToGPUPass>();		return std::make_unique<ConvertVectorToGPUPass>(useWmma);
}		}

		} // namespace mlir

mlir/lib/Dialect/GPU/IR/GPUDialect.cpp

Show First 20 Lines • Show All 54 Lines • ▼ Show 20 Lines	ArrayRef<int64_t> MMAMatrixType::getShape() const {
return getImpl()->getShape();		return getImpl()->getShape();
}		}

/// Returns the element type held by this type's storage.		/// Returns the element type held by this type's storage.
Type MMAMatrixType::getElementType() const { return getImpl()->elementType; }		Type MMAMatrixType::getElementType() const { return getImpl()->elementType; }

/// Returns the operand string provided by this type's storage.		/// Returns the operand string provided by this type's storage.
StringRef MMAMatrixType::getOperand() const { return getImpl()->getOperand(); }		StringRef MMAMatrixType::getOperand() const { return getImpl()->getOperand(); }

/// Returns true if `elementType` is accepted as an MMAMatrixType element type.		/// Returns true if `elementType` is accepted as an MMAMatrixType element type.
bool MMAMatrixType::isValidElementType(Type elementType) {		bool MMAMatrixType::isValidElementType(Type elementType) {
		// Accepts f16, f32 and f64, plus integer types of width 8 and 32.
return elementType.isF16() \|\| elementType.isF32();		return elementType.isF16() \|\| elementType.isF32() \|\| elementType.isF64() \|\|
		elementType.isInteger(8) \|\| elementType.isInteger(32);
}		}

LogicalResult		LogicalResult
MMAMatrixType::verify(function_ref<InFlightDiagnostic()> emitError,		MMAMatrixType::verify(function_ref<InFlightDiagnostic()> emitError,
ArrayRef<int64_t> shape, Type elementType,		ArrayRef<int64_t> shape, Type elementType,
StringRef operand) {		StringRef operand) {
if (!operand.equals("AOp") && !operand.equals("BOp") &&		if (!operand.equals("AOp") && !operand.equals("BOp") &&
!operand.equals("COp"))		!operand.equals("COp"))
▲ Show 20 Lines • Show All 1,207 Lines • Show Last 20 Lines

mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir

// RUN: mlir-opt %s -convert-gpu-to-nvvm -split-input-file \| FileCheck %s		// RUN: mlir-opt %s -convert-gpu-to-nvvm -split-input-file \| FileCheck %s
// RUN: mlir-opt %s -convert-gpu-to-nvvm='index-bitwidth=32' -split-input-file \| FileCheck --check-prefix=CHECK32 %s		// RUN: mlir-opt %s -convert-gpu-to-nvvm='index-bitwidth=32' -split-input-file \| FileCheck --check-prefix=CHECK32 %s

gpu.module @test_module {		gpu.module @test_module {
// CHECK-LABEL: func @gpu_index_ops()		// CHECK-LABEL: func @gpu_index_ops()
// CHECK32-LABEL: func @gpu_index_ops()		// CHECK32-LABEL: func @gpu_index_ops()
func.func @gpu_index_ops()		func.func @gpu_index_ops()
-> (index, index, index, index, index, index,		-> (index, index, index, index, index, index,
index, index, index, index, index, index) {		index, index, index, index, index, index,
		index) {
// CHECK32-NOT: = llvm.sext %{{.*}} : i32 to i64		// CHECK32-NOT: = llvm.sext %{{.*}} : i32 to i64

// CHECK: = nvvm.read.ptx.sreg.tid.x : i32		// CHECK: = nvvm.read.ptx.sreg.tid.x : i32
// CHECK: = llvm.sext %{{.*}} : i32 to i64		// CHECK: = llvm.sext %{{.*}} : i32 to i64
%tIdX = gpu.thread_id x		%tIdX = gpu.thread_id x
// CHECK: = nvvm.read.ptx.sreg.tid.y : i32		// CHECK: = nvvm.read.ptx.sreg.tid.y : i32
// CHECK: = llvm.sext %{{.*}} : i32 to i64		// CHECK: = llvm.sext %{{.*}} : i32 to i64
%tIdY = gpu.thread_id y		%tIdY = gpu.thread_id y
Show All 26 Lines	func.func @gpu_index_ops()
%gDimX = gpu.grid_dim x		%gDimX = gpu.grid_dim x
// CHECK: = nvvm.read.ptx.sreg.nctaid.y : i32		// CHECK: = nvvm.read.ptx.sreg.nctaid.y : i32
// CHECK: = llvm.sext %{{.*}} : i32 to i64		// CHECK: = llvm.sext %{{.*}} : i32 to i64
%gDimY = gpu.grid_dim y		%gDimY = gpu.grid_dim y
// CHECK: = nvvm.read.ptx.sreg.nctaid.z : i32		// CHECK: = nvvm.read.ptx.sreg.nctaid.z : i32
// CHECK: = llvm.sext %{{.*}} : i32 to i64		// CHECK: = llvm.sext %{{.*}} : i32 to i64
%gDimZ = gpu.grid_dim z		%gDimZ = gpu.grid_dim z


		// CHECK: = nvvm.read.ptx.sreg.laneid : i32
		// CHECK: = llvm.sext %{{.*}} : i32 to i64
		%laneId = gpu.lane_id : index

func.return %tIdX, %tIdY, %tIdZ, %bDimX, %bDimY, %bDimZ,		func.return %tIdX, %tIdY, %tIdZ, %bDimX, %bDimY, %bDimZ,
%bIdX, %bIdY, %bIdZ, %gDimX, %gDimY, %gDimZ		%bIdX, %bIdY, %bIdZ, %gDimX, %gDimY, %gDimZ,
		%laneId
: index, index, index, index, index, index,		: index, index, index, index, index, index,
index, index, index, index, index, index		index, index, index, index, index, index,
		index
}		}
}		}

// -----		// -----

gpu.module @test_module {		gpu.module @test_module {
// CHECK-LABEL: func @gpu_index_comp		// CHECK-LABEL: func @gpu_index_comp
// CHECK32-LABEL: func @gpu_index_comp		// CHECK32-LABEL: func @gpu_index_comp
▲ Show 20 Lines • Show All 451 Lines • Show Last 20 Lines

mlir/test/Conversion/GPUToNVVM/mma-sync-to-nvvm.mlir

This file was added.

				// RUN: mlir-opt --convert-gpu-to-nvvm --split-input-file %s \| FileCheck %s

				gpu.module @test_module {
				// Lowering of an m16n8k16 f16 gpu.mma.sync: every operand register is
				// extracted from its array, and the two result registers are repacked.
				// CHECK-LABEL: @m16n8k16_fp16
				func @m16n8k16_fp16(%arg0: vector<4x2xf16>, %arg1: vector<2x2xf16>, %arg2: vector<2x2xf16>) -> vector<2x2xf16> {
				// CHECK: llvm.extractvalue %arg0[0] : !llvm.array<4 x vector<2xf16>>
				// CHECK: llvm.extractvalue %arg0[1] : !llvm.array<4 x vector<2xf16>>
				// CHECK: llvm.extractvalue %arg0[2] : !llvm.array<4 x vector<2xf16>>
				// CHECK: llvm.extractvalue %arg0[3] : !llvm.array<4 x vector<2xf16>>

				// CHECK: llvm.extractvalue %arg1[0] : !llvm.array<2 x vector<2xf16>>
				// CHECK: llvm.extractvalue %arg1[1] : !llvm.array<2 x vector<2xf16>>

				// CHECK: llvm.extractvalue %arg2[0] : !llvm.array<2 x vector<2xf16>>
				// CHECK: llvm.extractvalue %arg2[1] : !llvm.array<2 x vector<2xf16>>
				// CHECK-NOT: llvm.extractvalue
				// CHECK: [[d:%.+]] = nvvm.mma.sync
				// CHECK-SAME: shape = {k = 16 : i32, m = 16 : i32, n = 8 : i32}
				%d = "gpu.mma.sync"(%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
				// CHECK-DAG: llvm.extractvalue [[d]][0] : !llvm.struct<(vector<2xf16>, vector<2xf16>)>
				// CHECK-DAG: llvm.extractvalue [[d]][1] : !llvm.struct<(vector<2xf16>, vector<2xf16>)>
				// CHECK: llvm.mlir.undef : !llvm.array<2 x vector<2xf16>>
				// CHECK-DAG: llvm.insertvalue {{%.+}}, {{%.+}}[0] : !llvm.array<2 x vector<2xf16>>
				// CHECK-DAG: llvm.insertvalue {{%.+}}, {{%.+}}[1] : !llvm.array<2 x vector<2xf16>>
				// CHECK: llvm.return {{%.+}} : !llvm.array<2 x vector<2xf16>>
				return %d : vector<2x2xf16>
				}

				// Lowering of an m16n8k32 int8 gpu.mma.sync: each vector<4xi8> register of
				// the A and B operands is expected to be bitcast to i32 before the
				// nvvm.mma.sync, which must carry the s8 operand types and satfinite
				// overflow behavior.
				// CHECK-LABEL: @m16n8k32_int8
				func @m16n8k32_int8(%arg0: vector<4x4xi8>, %arg1: vector<2x4xi8>, %arg2: vector<2x2xi32>) -> vector<2x2xi32> {

				// CHECK: [[el:%.+]] = llvm.extractvalue %arg0[{{.*}}] : !llvm.array<4 x vector<4xi8>>
				// CHECK: llvm.bitcast [[el]] : vector<4xi8> to i32
				// CHECK: [[el:%.+]] = llvm.extractvalue %arg0[{{.*}}] : !llvm.array<4 x vector<4xi8>>
				// CHECK: llvm.bitcast [[el]] : vector<4xi8> to i32
				// CHECK: [[el:%.+]] = llvm.extractvalue %arg0[{{.*}}] : !llvm.array<4 x vector<4xi8>>
				// CHECK: llvm.bitcast [[el]] : vector<4xi8> to i32
				// CHECK: [[el:%.+]] = llvm.extractvalue %arg0[{{.*}}] : !llvm.array<4 x vector<4xi8>>
				// CHECK: llvm.bitcast [[el]] : vector<4xi8> to i32

				// CHECK: [[el:%.+]] = llvm.extractvalue %arg1[{{.*}}] : !llvm.array<2 x vector<4xi8>>
				// CHECK: llvm.bitcast [[el]] : vector<4xi8> to i32
				// CHECK: [[el:%.+]] = llvm.extractvalue %arg1[{{.*}}] : !llvm.array<2 x vector<4xi8>>
				// CHECK: llvm.bitcast [[el]] : vector<4xi8> to i32

				// The accumulator registers are extracted without a bitcast check.
				// CHECK: [[el:%.+]] = llvm.extractvalue %arg2[{{.*}}] : !llvm.array<2 x vector<2xi32>>
				// CHECK: [[el:%.+]] = llvm.extractvalue %arg2[{{.*}}] : !llvm.array<2 x vector<2xi32>>

				// CHECK: [[d:%.+]] = nvvm.mma.sync
				// CHECK-SAME: intOverflowBehavior = #nvvm.mma_int_overflow<satfinite>
				// CHECK-SAME: multiplicandAPtxType = #nvvm.mma_type<s8>
				// CHECK-SAME: multiplicandBPtxType = #nvvm.mma_type<s8>
				// CHECK-SAME: shape = {k = 32 : i32, m = 16 : i32, n = 8 : i32}
				%d = "gpu.mma.sync"(%arg0, %arg1, %arg2) {mmaShape = [16, 8, 32]} : (vector<4x4xi8>, vector<2x4xi8>, vector<2x2xi32>) -> vector<2x2xi32>

				// CHECK: llvm.return {{%.+}} : !llvm.array<2 x vector<2xi32>>
				return %d : vector<2x2xi32>
				}

				// Lowering of an m8n8k4 f64 gpu.mma.sync: the two f64 results of the
				// nvvm.mma.sync struct are expected to be inserted into a vector<2xf64>,
				// which is then placed into the returned array.
				// CHECK-LABEL: @m8n8k4_f64
				func @m8n8k4_f64(%arg0: vector<1x1xf64>, %arg1: vector<1x1xf64>, %arg2: vector<1x2xf64>) -> vector<1x2xf64> {
				// CHECK: llvm.extractvalue %arg0
				// CHECK: llvm.extractvalue %arg1
				// CHECK: llvm.extractvalue %arg2

				// CHECK: [[d:%.+]] = nvvm.mma.sync A[{{%.+}}] B[{{%.+}}] C[{{%.+}}, {{%.+}}]
				// CHECK-SAME: shape = {k = 4 : i32, m = 8 : i32, n = 8 : i32}
				%d = "gpu.mma.sync"(%arg0, %arg1, %arg2) {mmaShape = [8, 8, 4]} : (vector<1x1xf64>, vector<1x1xf64>, vector<1x2xf64>) -> vector<1x2xf64>
				// CHECK: llvm.mlir.undef : vector<2xf64>
				// CHECK-DAG: llvm.extractvalue [[d]][0] : !llvm.struct<(f64, f64)>
				// CHECK-DAG: llvm.extractvalue [[d]][1] : !llvm.struct<(f64, f64)>
				// CHECK-COUNT-2: llvm.insertelement {{.*}} : vector<2xf64>
				// CHECK-DAG: llvm.insertvalue {{%.+}}, {{%.+}}[0] : !llvm.array<1 x vector<2xf64>>
				// CHECK: llvm.return {{%.+}} : !llvm.array<1 x vector<2xf64>>
				return %d : vector<1x2xf64>
				}

				// Lowering of gpu.mma.ldmatrix with four tiles: each of the four i32
				// results is extracted, bitcast and inserted into the returned array.
				// CHECK-LABEL: @ldmatrix
				func @ldmatrix(%arg0: memref<128x128xf16, 3>) -> vector<4x2xf16> {
				%c0 = arith.constant 0 : index
				// CHECK: nvvm.ldmatrix {{%.+}} {layout = #nvvm.mma_layout<row>, num = 4 : i32} {{.*}} -> !llvm.struct<(i32, i32, i32, i32)
				%a = gpu.mma.ldmatrix %arg0[%c0, %c0] {transpose = false, numTiles = 4 : i32} : memref<128x128xf16, 3> -> vector<4x2xf16>
				// CHECK: llvm.extractvalue
				// CHECK: llvm.bitcast
				// CHECK: llvm.insertvalue
				// CHECK: llvm.extractvalue
				// CHECK: llvm.bitcast
				// CHECK: llvm.insertvalue
				// CHECK: llvm.extractvalue
				// CHECK: llvm.bitcast
				// CHECK: llvm.insertvalue
				// CHECK: llvm.extractvalue
				// CHECK: llvm.bitcast
				// CHECK: llvm.insertvalue
				return %a : vector<4x2xf16>
				}
				}

mlir/test/Conversion/VectorToGPU/vector-to-mma-ops-mma-sync.mlir

This file was added.

				// RUN: mlir-opt %s -split-input-file -pass-pipeline="func.func(convert-vector-to-gpu{use-wmma=false})" \| FileCheck %s

				//#########################################################
				// INT8 row-row-row
				//#########################################################

				// CHECK-DAG: [[rowA0_map:#.+]] = affine_map<()[s0] -> (s0 mod 16 + 1)>
				// CHECK-DAG: [[colA0_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 16) * 16 + 1)>
				ThomasRaouxUnsubmitted Done Reply Inline Actions nit: Note that the trick to make CHECK-LABEL work with map variables is to do: // CHECK-DAG: [[$rowA0_map:#.+]] = affine_map<()[s0] -> (s0 mod 16 + 1)> This way the variables are not reset by CHECK-LABEL ThomasRaoux: nit: Note that the trick to make CHECK-LABEL work with map variables is to do: ``` // CHECK-DAG…

				// CHECK-DAG: [[rowB0_map:#.+]] = affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 39)>
				// CHECK-DAG: [[colB0_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 40)>
				// CHECK-DAG: [[rowB1_map:#.+]] = affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 40)>
				// CHECK-DAG: [[rowB2_map:#.+]] = affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 41)>
				// CHECK-DAG: [[rowB3_map:#.+]] = affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 42)>
				// CHECK-DAG: [[rowB4_map:#.+]] = affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 55)>
				// CHECK-DAG: [[rowB5_map:#.+]] = affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 56)>
				// CHECK-DAG: [[rowB6_map:#.+]] = affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 57)>
				// CHECK-DAG: [[rowB7_map:#.+]] = affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 58)>

				// CHECK-DAG: [[rowC0_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 49)>
				// CHECK-DAG: [[colC0_map:#.+]] = affine_map<()[s0] -> (s0 * 2 - (s0 floordiv 4) * 8 + 40)>
				// CHECK-DAG: [[rowC8_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 57)>


				#map0 = affine_map<(d0, d1) -> (d1, d0)>
				#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
				#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
				#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>

				// CHECK: func @m16n8k32_int8_row_row_row
				func @m16n8k32_int8_row_row_row(%arg0: memref<128x128xi8>, %arg1: memref<128x128xi8>, %arg2: memref<128x128xi32>) {
				%cst_0 = arith.constant dense<0> : vector<32x8xi8>
				%c0 = arith.constant 0 : index
				%c1 = arith.constant 1 : index
				%c17 = arith.constant 17 : index
				%c39 = arith.constant 39 : index
				%c40 = arith.constant 40 : index
				%c49 = arith.constant 49 : index
				%c50 = arith.constant 50 : index
				%cst = arith.constant 0 : i8
				%cst0 = arith.constant 0 : i32

				// Verify that the operand A is distributed to loads correctly.

				// CHECK: [[row:%.+]] = affine.apply [[rowA0_map]]()[{{%.+}}]
				// CHECK: [[col:%.+]] = affine.apply [[colA0_map]]()[{{%.+}}]
				// CHECK: gpu.mma.ldmatrix %arg0[[[row]], [[col]]] {numTiles = 4 : i32, transpose = false} : memref<128x128xi8> -> vector<4x4xi8>

				// Verify that the operand B is distributed to loads correctly. Its elements
				// must be loaded in a non-vectorized manner to do the transpose.

				// CHECK-DAG: [[row:%.+]] = affine.apply [[rowB0_map]]()[{{%.+}}]
				// CHECK-DAG: [[col:%.+]] = affine.apply [[colB0_map]]()[{{%.+}}]
				// CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8>
				// CHECK-DAG: [[row:%.+]] = affine.apply [[rowB1_map]]()[{{%.+}}]
				// CHECK-DAG: [[col:%.+]] = affine.apply [[colB0_map]]()[{{%.+}}]
				// CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8>
				// CHECK-DAG: [[row:%.+]] = affine.apply [[rowB2_map]]()[{{%.+}}]
				// CHECK-DAG: [[col:%.+]] = affine.apply [[colB0_map]]()[{{%.+}}]
				// CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8>
				// CHECK-DAG: [[row:%.+]] = affine.apply [[rowB3_map]]()[{{%.+}}]
				// CHECK-DAG: [[col:%.+]] = affine.apply [[colB0_map]]()[{{%.+}}]
				// CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8>
				// CHECK-DAG: [[col:%.+]] = affine.apply [[colB0_map]]()[{{%.+}}]
				// CHECK-DAG: [[row:%.+]] = affine.apply [[rowB4_map]]()[{{%.+}}]
				// CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8>
				// CHECK-DAG: [[row:%.+]] = affine.apply [[rowB5_map]]()[{{%.+}}]
				// CHECK-DAG: [[col:%.+]] = affine.apply [[colB0_map]]()[{{%.+}}]
				// CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8>
				// CHECK-DAG: [[row:%.+]] = affine.apply [[rowB6_map]]()[{{%.+}}]
				// CHECK-DAG: [[col:%.+]] = affine.apply [[colB0_map]]()[{{%.+}}]
				// CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8>
				// CHECK-DAG: [[row:%.+]] = affine.apply [[rowB7_map]]()[{{%.+}}]
				// CHECK-DAG: [[col:%.+]] = affine.apply [[colB0_map]]()[{{%.+}}]
				// CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8>
				// CHECK-NOT: memref.load %arg1

				// Verify that the operand C is distributed to loads correctly.
				// CHECK: [[row:%.+]] = affine.apply [[rowC0_map]]()[{{%.+}}]
				// CHECK: [[col:%.+]] = affine.apply [[colC0_map]]()[{{%.+}}]
				// CHECK: vector.load %arg2[[[row]], [[col]]] : memref<128x128xi32>, vector<2xi32>
				// CHECK: [[row:%.+]] = affine.apply [[rowC8_map]]()[{{%.+}}]
				// CHECK: [[col:%.+]] = affine.apply [[colC0_map]]()[{{%.+}}]
				// CHECK: vector.load %arg2[[[row]], [[col]]] : memref<128x128xi32>, vector<2xi32>
				// CHECK-NOT: vector.load %arg2{{.*}}

				%A = vector.transfer_read %arg0[%c1, %c1], %cst {in_bounds = [true, true]} : memref<128x128xi8>, vector<16x32xi8>
				%B = vector.transfer_read %arg1[%c39, %c40], %cst {in_bounds = [true, true], permutation_map = #map0} : memref<128x128xi8>, vector<8x32xi8>
				%C = vector.transfer_read %arg2[%c49, %c40], %cst0 {in_bounds = [true, true]} : memref<128x128xi32>, vector<16x8xi32>
				// CHECK: [[d:%.+]] = gpu.mma.sync({{.*}}) {mmaShape = [16, 8, 32]} : (vector<4x4xi8>, vector<2x4xi8>, vector<2x2xi32>) -> vector<2x2xi32>
				%D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<16x32xi8>, vector<8x32xi8> into vector<16x8xi32>

				// CHECK: [[row:%.+]] = affine.apply [[rowC0_map]]()[{{%.+}}]
				// CHECK: [[col:%.+]] = affine.apply [[colC0_map]]()[{{%.+}}]
				// CHECK: vector.store {{%.+}}, %arg2[[[row]], [[col]]] : memref<128x128xi32>, vector<2xi32>
				// CHECK: [[row:%.+]] = affine.apply [[rowC8_map]]()[{{%.+}}]
				// CHECK: [[col:%.+]] = affine.apply [[colC0_map]]()[{{%.+}}]
				// CHECK: vector.store {{%.+}}, %arg2[[[row]], [[col]]] : memref<128x128xi32>, vector<2xi32>
				vector.transfer_write %D, %arg2[%c49, %c40] {in_bounds = [true, true]} : vector<16x8xi32>, memref<128x128xi32>
				return
				}

				// -----

				//#########################################################
				// f64 row-row-row
				//#########################################################
				// CHECK-DAG: [[rowA0_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 1)>
				// CHECK-DAG: [[colA0_map:#.+]] = affine_map<()[s0] -> (s0 mod 4 + 1)>

				// CHECK-DAG: [[rowb0_map:#.+]] = affine_map<()[s0] -> (s0 mod 4 + 39)>
				// CHECK-DAG: [[colb0_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 40)>

				// CHECK-DAG: [[rowC0_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 49)>
				// CHECK-DAG: [[colC0_map:#.+]] = affine_map<()[s0] -> (s0 * 2 - (s0 floordiv 4) * 8 + 40)>

				#map0 = affine_map<(d0, d1) -> (d1, d0)>
				#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
				#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
				#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>

				// CHECK: func @m8n8k4_f64_row_row_row
				// Test case: an 8x8x4 f64 contraction. A and C are read with the default
				// layout, while B is read through the transposing permutation map #map0.
				// The FileCheck directives below expect loads at indices produced by
				// affine.apply, one gpu.mma.sync op with mmaShape = [8, 8, 4], and a
				// vector.store of the result back into %arg2.
				// NOTE(review): %cst_0, %c0, %c17 and %c50 are not referenced in this body.
				func @m8n8k4_f64_row_row_row(%arg0: memref<128x128xf64>, %arg1: memref<128x128xf64>, %arg2: memref<128x128xf64>) {
				%cst_0 = arith.constant dense<0.0> : vector<4x8xf64>
				%c0 = arith.constant 0 : index
				%c1 = arith.constant 1 : index
				%c17 = arith.constant 17 : index
				%c39 = arith.constant 39 : index
				%c40 = arith.constant 40 : index
				%c49 = arith.constant 49 : index
				%c50 = arith.constant 50 : index
				%cst = arith.constant 0.0 : f64
				%cst0 = arith.constant 0.0 : f64

				// Verify that the operand A is distributed to loads correctly.

				// CHECK-DAG: [[row:%.+]] = affine.apply [[rowA0_map]]
				// CHECK-DAG: [[col:%.+]] = affine.apply [[colA0_map]]
				// CHECK: vector.load %arg0[[[row]], [[col]]] : memref<128x128xf64>, vector<1xf64>

				// Verify that the operand B is distributed to loads correctly. Its elements
				// must be loaded in a non-vectorized manner to do the transpose.

				// CHECK-DAG: [[row:%.+]] = affine.apply [[rowb0_map]]
				// CHECK-DAG: [[col:%.+]] = affine.apply [[colb0_map]]
				// CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xf64>

				// Verify that the operand C is distributed to loads correctly.

				// CHECK-DAG: [[row:%.+]] = affine.apply [[rowC0_map]]
				// CHECK-DAG: [[col:%.+]] = affine.apply [[colC0_map]]
				// CHECK: vector.load %arg2[[[row]], [[col]]] : memref<128x128xf64>, vector<2xf64>

				%A = vector.transfer_read %arg0[%c1, %c1], %cst {in_bounds = [true, true]} : memref<128x128xf64>, vector<8x4xf64>
				%B = vector.transfer_read %arg1[%c39, %c40], %cst {in_bounds = [true, true], permutation_map = #map0} : memref<128x128xf64>, vector<8x4xf64>
				%C = vector.transfer_read %arg2[%c49, %c40], %cst0 {in_bounds = [true, true]} : memref<128x128xf64>, vector<8x8xf64>
				// The contraction itself is expected to become a single mma.sync op.
				// CHECK: [[d:%.+]] = gpu.mma.sync({{.*}}) {mmaShape = [8, 8, 4]} : (vector<1x1xf64>, vector<1x1xf64>, vector<1x2xf64>) -> vector<1x2xf64>
				%D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<8x4xf64>, vector<8x4xf64> into vector<8x8xf64>

				// The result is expected to be stored using the same index maps as the C load.
				// CHECK-DAG: [[row:%.+]] = affine.apply [[rowC0_map]]
				// CHECK-DAG: [[col:%.+]] = affine.apply [[colC0_map]]
				// CHECK: vector.store {{%.+}}, %arg2[[[row]], [[col]]] : memref<128x128xf64>, vector<2xf64>
				vector.transfer_write %D, %arg2[%c49, %c40] {in_bounds = [true, true]} : vector<8x8xf64>, memref<128x128xf64>
				return
				}

				// -----

				//#########################################################
				// FP16 row-row-row
				//#########################################################

				#map0 = affine_map<(d0, d1) -> (d1, d0)>
				#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
				#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
				#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>

				// CHECK-DAG: [[rowA_map:#.+]] = affine_map<()[s0] -> (s0 mod 16 + 1)>
				// CHECK-DAG: [[colA_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 16) * 8 + 3)>

				// CHECK-DAG: [[rowB_map:#.+]] = affine_map<()[s0] -> (s0 + 3)>
				// CHECK-DAG: [[colB_map:#.+]] = affine_map<() -> (3)>

				// CHECK: func @m16n8k16_fp16_row_row_row
				// Test case: a 16x8x16 f16 contraction where B is read through the
				// transposing permutation map #map0. Only the operand A and B loads are
				// verified here: both are expected to become gpu.mma.ldmatrix ops, with
				// transpose = true for B.
				// NOTE(review): %cst_0 is not referenced in this body.
				func @m16n8k16_fp16_row_row_row(%arg0: memref<20x20xf16>, %arg1: memref<20x20xf16>, %arg2: memref<20x20xf16>) {
				%cst_0 = arith.constant dense<0.000000e+00> : vector<16x8xf16>
				%c0 = arith.constant 0 : index
				%c1 = arith.constant 1 : index
				%c3 = arith.constant 3 : index
				%cst = arith.constant 0.000000e+00 : f16
				// Operand A: expected as a 4-tile, non-transposed ldmatrix.
				// CHECK-DAG: [[row:%.+]] = affine.apply [[rowA_map]]
				// CHECK-DAG: [[col:%.+]] = affine.apply [[colA_map]]
				// CHECK: gpu.mma.ldmatrix %arg0[[[row]], [[col]]] {numTiles = 4 : i32, transpose = false}

				// Operand B: expected as a 2-tile, transposed ldmatrix.
				// CHECK-DAG: [[row:%.+]] = affine.apply [[rowB_map]]
				// CHECK-DAG: [[col:%.+]] = affine.apply [[colB_map]]
				// CHECK: gpu.mma.ldmatrix %arg1[[[row]], [[col]]] {numTiles = 2 : i32, transpose = true}
				%A = vector.transfer_read %arg0[%c1, %c3], %cst {in_bounds = [true, true]} : memref<20x20xf16>, vector<16x16xf16>
				%B = vector.transfer_read %arg1[%c3, %c3], %cst {permutation_map = #map0, in_bounds = [true, true]} : memref<20x20xf16>, vector<8x16xf16>
				%C = vector.transfer_read %arg2[%c0, %c0], %cst {in_bounds = [true, true]} : memref<20x20xf16>, vector<16x8xf16>
				%D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<16x16xf16>, vector<8x16xf16> into vector<16x8xf16>
				vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<20x20xf16>
				return
				}

				// -----

				// CHECK-DAG: [[Arow_map:#.+]] = affine_map<()[s0] -> (s0 mod 16 + 1)>
				// CHECK-DAG: [[Acol_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 16) * 8 + 3)>
				// CHECK-DAG: [[Bcol_map:#.+]] = affine_map<() -> (3)>
				// CHECK-DAG: [[Brow_map:#.+]] = affine_map<()[s0] -> (s0 + 3)>

				#map0 = affine_map<(d0, d1, d2) -> (d2, d1)>
				#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
				#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
				#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>

				// Test case: the 16x8x16 f16 contraction applied to rank-3 memrefs with a
				// leading batch dimension. Every operand is read at batch index 0 and is
				// expected to become a gpu.mma.ldmatrix op whose leading index is that
				// constant. The constant is captured as C0 below instead of being matched
				// by its printed SSA name, so the test does not depend on value naming.
				// CHECK: func @batch_m16n8k16_fp16_row_row_row
				func @batch_m16n8k16_fp16_row_row_row(%arg0: memref<2x20x20xf16>, %arg1: memref<2x20x20xf16>, %arg2: memref<2x20x20xf16>) {
				%cst_0 = arith.constant dense<0.000000e+00> : vector<20x20xf16>
				// CHECK: [[C0:%.+]] = arith.constant 0 : index
				%c0 = arith.constant 0 : index
				%c1 = arith.constant 1 : index
				%c3 = arith.constant 3 : index
				%cst = arith.constant 0.000000e+00 : f16

				// Operand A: expected as a 4-tile, non-transposed ldmatrix.
				// CHECK-DAG: [[row:%.+]] = affine.apply [[Arow_map]]
				// CHECK-DAG: [[col:%.+]] = affine.apply [[Acol_map]]
				// CHECK: gpu.mma.ldmatrix %arg0[[[C0]], [[row]], [[col]]] {numTiles = 4 : i32, transpose = false} : memref<2x20x20xf16> -> vector<4x2xf16>
				%A = vector.transfer_read %arg0[%c0, %c1, %c3], %cst {in_bounds = [true, true]} : memref<2x20x20xf16>, vector<16x16xf16>

				// Operand B: read through permutation map #map0, expected as a 2-tile,
				// transposed ldmatrix.
				// CHECK-DAG: [[row:%.+]] = affine.apply [[Brow_map]]
				// CHECK-DAG: [[col:%.+]] = affine.apply [[Bcol_map]]
				// CHECK: gpu.mma.ldmatrix %arg1[[[C0]], [[row]], [[col]]] {numTiles = 2 : i32, transpose = true} : memref<2x20x20xf16> -> vector<2x2xf16>
				%B = vector.transfer_read %arg1[%c0, %c3, %c3], %cst {permutation_map = #map0, in_bounds = [true, true]} : memref<2x20x20xf16>, vector<8x16xf16>

				// Operand C: expected as a 2-tile, non-transposed ldmatrix that reuses the
				// operand A index maps.
				// CHECK-DAG: [[row:%.+]] = affine.apply [[Arow_map]]
				// CHECK-DAG: [[col:%.+]] = affine.apply [[Acol_map]]
				// CHECK: gpu.mma.ldmatrix %arg2[[[C0]], [[row]], [[col]]] {numTiles = 2 : i32, transpose = false} : memref<2x20x20xf16> -> vector<2x2xf16>
				%C = vector.transfer_read %arg2[%c0, %c1, %c3], %cst {in_bounds = [true, true]} : memref<2x20x20xf16>, vector<16x8xf16>
				%D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<16x16xf16>, vector<8x16xf16> into vector<16x8xf16>
				vector.transfer_write %D, %arg2[%c0, %c1, %c3] {in_bounds = [true, true]} : vector<16x8xf16>, memref<2x20x20xf16>
				return
				}

				// -----

				//#########################################################
				// FP16 row-col-row
				//#########################################################

				#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
				#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
				#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>

				// CHECK: [[rowA_map:#.+]] = affine_map<()[s0] -> (s0 mod 16 + 1)>
				// CHECK: [[colA_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 16) * 8 + 3)>

				// CHECK: [[rowB_map:#.+]] = affine_map<()[s0] -> (s0 mod 8 + 1)>
				// CHECK: [[colB_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 8) * 8 + 3)>

				// CHECK: func @m16n8k16_fp16_row_row_row
				// Test case: a 16x8x16 f16 contraction where none of the operand reads uses
				// a permutation map. All three operands are expected to become
				// gpu.mma.ldmatrix ops with transpose = false.
				// NOTE(review): the function name repeats the row_row_row test above although
				// the section banner says "row-col-row"; the name looks like a copy-paste
				// leftover. Confirm and rename together with its matching func directive.
				// NOTE(review): %cst_0 is not referenced in this body.
				func @m16n8k16_fp16_row_row_row(%arg0: memref<20x20xf16>, %arg1: memref<20x20xf16>, %arg2: memref<20x20xf16>) {
				%cst_0 = arith.constant dense<0.000000e+00> : vector<16x8xf16>
				%c0 = arith.constant 0 : index
				%c1 = arith.constant 1 : index
				%c3 = arith.constant 3 : index
				%cst = arith.constant 0.000000e+00 : f16
				// Operand A: expected as a 4-tile, non-transposed ldmatrix.
				// CHECK-DAG: [[row:%.+]] = affine.apply [[rowA_map]]
				// CHECK-DAG: [[col:%.+]] = affine.apply [[colA_map]]
				// CHECK: gpu.mma.ldmatrix %arg0[[[row]], [[col]]] {numTiles = 4 : i32
				// CHECK-SAME: transpose = false

				// Operand B: expected as a 2-tile, non-transposed ldmatrix.
				// CHECK-DAG: [[row:%.+]] = affine.apply [[rowB_map]]
				// CHECK-DAG: [[col:%.+]] = affine.apply [[colB_map]]
				// CHECK: gpu.mma.ldmatrix %arg1[[[row]], [[col]]] {numTiles = 2 : i32
				// CHECK-SAME: transpose = false

				// Operand C: expected as a 2-tile, non-transposed ldmatrix that reuses the
				// operand A index maps.
				// CHECK-DAG: [[row:%.+]] = affine.apply [[rowA_map]]
				// CHECK-DAG: [[col:%.+]] = affine.apply [[colA_map]]
				// CHECK: gpu.mma.ldmatrix %arg2[[[row]], [[col]]] {numTiles = 2 : i32
				// CHECK-SAME: transpose = false
				%A = vector.transfer_read %arg0[%c1, %c3], %cst {in_bounds = [true, true]} : memref<20x20xf16>, vector<16x16xf16>
				%B = vector.transfer_read %arg1[%c1, %c3], %cst {in_bounds = [true, true]} : memref<20x20xf16>, vector<8x16xf16>
				%C = vector.transfer_read %arg2[%c1, %c3], %cst {in_bounds = [true, true]} : memref<20x20xf16>, vector<16x8xf16>
				%D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<16x16xf16>, vector<8x16xf16> into vector<16x8xf16>
				vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<20x20xf16>
				return
				}
				No newline at end of file
				ThomasRaouxUnsubmitted Done Reply Inline Actions nit: missing newline ThomasRaoux: nit: missing newline

mlir/test/Dialect/GPU/invalid.mlir

Show First 20 Lines • Show All 470 Lines • ▼ Show 20 Lines	func @mmamatrix_operand_type(){
%i = arith.constant 16 : index		%i = arith.constant 16 : index
// expected-error @+1 {{operand expected to be one of AOp, BOp or COp}}		// expected-error @+1 {{operand expected to be one of AOp, BOp or COp}}
%0 = gpu.subgroup_mma_load_matrix %wg[%i, %i] {leadDimension = 32 : index} : memref<32x32xf16, 3> -> !gpu.mma_matrix<16x16xf16, "EOp">		%0 = gpu.subgroup_mma_load_matrix %wg[%i, %i] {leadDimension = 32 : index} : memref<32x32xf16, 3> -> !gpu.mma_matrix<16x16xf16, "EOp">
return		return
}		}

// -----		// -----

// Negative test: loading into a !gpu.mma_matrix whose element type is i32 is
// expected to be rejected with the diagnostic quoted in the expected-error
// annotation below.
func @mmamatrix_invalid_element_type(){
%wg = memref.alloca() {alignment = 32} : memref<32x32xf16, 3>
%i = arith.constant 16 : index
// expected-error @+1 {{MMAMatrixType elements must be F16 or F32}}
%0 = gpu.subgroup_mma_load_matrix %wg[%i, %i] {leadDimension = 32 : index} : memref<32x32xf16, 3> -> !gpu.mma_matrix<16x16xi32, "AOp">
return
}

// -----

#layout_map_col_major = affine_map<(i, j) -> (j, i)>		#layout_map_col_major = affine_map<(i, j) -> (j, i)>

func @mmaLoadOp_identity_layout(){		func @mmaLoadOp_identity_layout(){
%wg = memref.alloca() {alignment = 32} : memref<32x32xf16, #layout_map_col_major, 3>		%wg = memref.alloca() {alignment = 32} : memref<32x32xf16, #layout_map_col_major, 3>
%i = arith.constant 16 : index		%i = arith.constant 16 : index
// expected-error @+1 {{expected source memref most minor dim must have unit stride}}		// expected-error @+1 {{expected source memref most minor dim must have unit stride}}
%0 = gpu.subgroup_mma_load_matrix %wg[%i, %i] {leadDimension = 32 : index} : memref<32x32xf16, #layout_map_col_major, 3> -> !gpu.mma_matrix<16x16xf16, "AOp">		%0 = gpu.subgroup_mma_load_matrix %wg[%i, %i] {leadDimension = 32 : index} : memref<32x32xf16, #layout_map_col_major, 3> -> !gpu.mma_matrix<16x16xf16, "AOp">
return		return
▲ Show 20 Lines • Show All 157 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[MLIR][GPU] Add NvGpu mma.sync path to the VectorToGPU pass
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 419861

mlir/include/mlir/Conversion/Passes.td

mlir/include/mlir/Conversion/VectorToGPU/NvvmMMASupport.h

mlir/include/mlir/Conversion/VectorToGPU/VectorToGPU.h

mlir/include/mlir/Dialect/GPU/GPUDialect.h

mlir/include/mlir/Dialect/GPU/GPUOps.td

mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h

mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp

mlir/lib/Conversion/VectorToGPU/CMakeLists.txt

mlir/lib/Conversion/VectorToGPU/NvvmMMASupport.cpp

mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp

mlir/lib/Dialect/GPU/IR/GPUDialect.cpp

mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir

mlir/test/Conversion/GPUToNVVM/mma-sync-to-nvvm.mlir

mlir/test/Conversion/VectorToGPU/vector-to-mma-ops-mma-sync.mlir

mlir/test/Dialect/GPU/invalid.mlir

This is an archive of the discontinued LLVM Phabricator instance.

[MLIR][GPU] Add NvGpu mma.sync path to the VectorToGPU pass
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 419861

mlir/include/mlir/Conversion/Passes.td

mlir/include/mlir/Conversion/VectorToGPU/NvvmMMASupport.h

mlir/include/mlir/Conversion/VectorToGPU/VectorToGPU.h

mlir/include/mlir/Dialect/GPU/GPUDialect.h

mlir/include/mlir/Dialect/GPU/GPUOps.td

mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h

mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp

mlir/lib/Conversion/VectorToGPU/CMakeLists.txt

mlir/lib/Conversion/VectorToGPU/NvvmMMASupport.cpp

mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp

mlir/lib/Dialect/GPU/IR/GPUDialect.cpp

mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir

mlir/test/Conversion/GPUToNVVM/mma-sync-to-nvvm.mlir

mlir/test/Conversion/VectorToGPU/vector-to-mma-ops-mma-sync.mlir

mlir/test/Dialect/GPU/invalid.mlir

[MLIR][GPU] Add NvGpu mma.sync path to the VectorToGPU pass
ClosedPublic