Diff 495650

mlir/include/mlir/Dialect/GPU/IR/GPUBase.td

	Show First 20 Lines • Show All 95 Lines • ▼ Show 20 Lines

	// Predicate to check if type is gpu::MMAMatrixType.			// Predicate to check if type is gpu::MMAMatrixType.
	def IsMMAMatrixTypePred : CPred<"$_self.isa<::mlir::gpu::MMAMatrixType>()">;			def IsMMAMatrixTypePred : CPred<"$_self.isa<::mlir::gpu::MMAMatrixType>()">;

	def GPU_MMAMatrix : DialectType<			def GPU_MMAMatrix : DialectType<
	GPU_Dialect, IsMMAMatrixTypePred, "MMAMatrix type">;			GPU_Dialect, IsMMAMatrixTypePred, "MMAMatrix type">;

	// Memref type acceptable to gpu.subgroup_mma_{load|store}_matrix ops.			// Memref type acceptable to gpu.subgroup_mma_{load|store}_matrix ops.
	def GPU_MMAMemRef : MemRefOf<[F16, F32, VectorOfRankAndType<[1], [F16, F32]>]>;			def GPU_MMAMemRef : MemRefOf<[I8, I32, F16, F32, VectorOfRankAndType<[1], [I8, I32, F16, F32]>]>;

	class MMAMatrixOf<list<Type> allowedTypes> :			class MMAMatrixOf<list<Type> allowedTypes> :
	ContainerType<AnyTypeOf<allowedTypes>, IsMMAMatrixTypePred,			ContainerType<AnyTypeOf<allowedTypes>, IsMMAMatrixTypePred,
	"$_self.cast<::mlir::gpu::MMAMatrixType>().getElementType()",			"$_self.cast<::mlir::gpu::MMAMatrixType>().getElementType()",
	"gpu.mma_matrix", "::mlir::gpu::MMAMatrixType">;			"gpu.mma_matrix", "::mlir::gpu::MMAMatrixType">;

	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	// GPU Interfaces.			// GPU Interfaces.
	▲ Show 20 Lines • Show All 55 Lines • Show Last 20 Lines

mlir/include/mlir/Dialect/GPU/IR/GPUOps.td

Show First 20 Lines • Show All 1,144 Lines • ▼ Show 20 Lines	let description = [{
This operation takes a memref as its first operand: it is the source matrix		This operation takes a memref as its first operand: it is the source matrix
from which data is to be loaded. The op returns a `!gpu.mma_matrix`. The		from which data is to be loaded. The op returns a `!gpu.mma_matrix`. The
source memref can be in global memory or shared memory. The load address is		source memref can be in global memory or shared memory. The load address is
determined using `indices`. The matrix being loaded into is the result. The		determined using `indices`. The matrix being loaded into is the result. The
`leadDimension` attribute specifies the leading dimension size of the source		`leadDimension` attribute specifies the leading dimension size of the source
matrix which eventually allows the lowering to determine the size of each		matrix which eventually allows the lowering to determine the size of each
row. If the `transpose` attribute is present then the op does a transposed load.		row. If the `transpose` attribute is present then the op does a transposed load.

		For integer types, the resulting `!gpu.mma_matrix` type needs to specify the
		signedness of the data if the matrix type is an `A` or `B` operand for
		`gpu.subgroup_mma_compute`.

This op is often meant to be used along with `gpu.subgroup_mma_store_matrix` and		This op is often meant to be used along with `gpu.subgroup_mma_store_matrix` and
`gpu.subgroup_mma_compute`.		`gpu.subgroup_mma_compute`.

Example:		Example:

```mlir		```mlir
%0 = gpu.subgroup_mma_load_matrix src[%i,%j] : {leadDimension = 32 : i32}		%0 = gpu.subgroup_mma_load_matrix src[%i,%j] : {leadDimension = 32 : i32}
: memref<32x32xf16, 3>, !gpu.mma_matrix<16x16xf16, "AOp">		: memref<32x32xf16, 3>, !gpu.mma_matrix<16x16xf16, "AOp">
Show All 35 Lines	let description = [{
Example:		Example:

```mlir		```mlir
gpu.subgroup_mma_store_matrix %D, %sg[%i,%j] : { leadDimension = 32 : i32}		gpu.subgroup_mma_store_matrix %D, %sg[%i,%j] : { leadDimension = 32 : i32}
: !gpu.mma_matrix<16x16xf16, "COp">, memref<32x32xf16, 3>		: !gpu.mma_matrix<16x16xf16, "COp">, memref<32x32xf16, 3>
```		```
}];		}];

let arguments = (ins Arg<MMAMatrixOf<[F16, F32]>>:$src,		let arguments = (ins Arg<MMAMatrixOf<[SI8, UI8, I32, F16, F32]>>:$src,
Arg<GPU_MMAMemRef, "",[MemWrite]>:$dstMemref,		Arg<GPU_MMAMemRef, "",[MemWrite]>:$dstMemref,
Variadic<Index>:$indices,		Variadic<Index>:$indices,
IndexAttr:$leadDimension,		IndexAttr:$leadDimension,
OptionalAttr<UnitAttr>:$transpose);		OptionalAttr<UnitAttr>:$transpose);

let assemblyFormat = [{		let assemblyFormat = [{
$src`,` $dstMemref`[`$indices`]` attr-dict `:` type($src)`,` type($dstMemref)		$src`,` $dstMemref`[`$indices`]` attr-dict `:` type($src)`,` type($dstMemref)
}];		}];
Show All 9 Lines	let description = [{
The `gpu.subgroup_mma_compute` operation performs a matrix-multiply accumulate (mma)		The `gpu.subgroup_mma_compute` operation performs a matrix-multiply accumulate (mma)
operation using all the threads in a subgroup.		operation using all the threads in a subgroup.

This operation takes three `!gpu.mma_matrix`s as arguments: these hold `A`,		This operation takes three `!gpu.mma_matrix`s as arguments: these hold `A`,
`B` and `C` operands for the mma operation. The operation performed is represented		`B` and `C` operands for the mma operation. The operation performed is represented
as `C += A * B`. The op returns a `!gpu.mma_matrix` which contains the result of		as `C += A * B`. The op returns a `!gpu.mma_matrix` which contains the result of
the operation held by all threads in a subgroup. `a_transpose` or		the operation held by all threads in a subgroup. `a_transpose` or
`b_transpose` if present, signify that the respective operand was loaded in a		`b_transpose` if present, signify that the respective operand was loaded in a
transposed manner. The transpose opernads are required to map to correct		transposed manner. The transpose operands are required to map to correct
underlying intrinsics but they currently do not seem to affect correctness		underlying intrinsics but they currently do not seem to affect correctness
even if they are absent given that the operands were loaded correctly using		even if they are absent given that the operands were loaded correctly using
the `transpose` attribute in `gpu.subgroup_mma_load_matrix` op.		the `transpose` attribute in `gpu.subgroup_mma_load_matrix` op.

		For integer types, the `A` and `B` matrices carry their signedness with their
		types. The accumulator type is expected to be signless and imply a signed integer
		with a greater width than the other two operands.

This op is meant to be used along with `gpu.subgroup_mma_store_matrix` and		This op is meant to be used along with `gpu.subgroup_mma_store_matrix` and
`gpu.subgroup_mma_load_matrix` ops.		`gpu.subgroup_mma_load_matrix` ops.

Example:		Example:

```mlir		```mlir
%D = gpu.subgroup_mma_compute %A, %B, %C :		%D = gpu.subgroup_mma_compute %A, %B, %C :
!gpu.mma_matrix<16x16xf16, "AOp">, !gpu.mma_matrix<16x16xf16, "BOp">		!gpu.mma_matrix<16x16xf16, "AOp">, !gpu.mma_matrix<16x16xf16, "BOp">
-> !gpu.mma_matrix<16x16xf16, "COp">		-> !gpu.mma_matrix<16x16xf16, "COp">
```		```
}];		}];

let arguments = (ins Arg<MMAMatrixOf<[F16, F32]>>:$opA,		let arguments = (ins Arg<MMAMatrixOf<[SI8, UI8, F16, F32]>>:$opA,
Arg<MMAMatrixOf<[F16, F32]>>:$opB,		Arg<MMAMatrixOf<[SI8, UI8, F16, F32]>>:$opB,
Arg<MMAMatrixOf<[F16, F32]>>:$opC,		Arg<MMAMatrixOf<[I32, F16, F32]>>:$opC,
OptionalAttr<UnitAttr>:$a_transpose,		OptionalAttr<UnitAttr>:$a_transpose,
OptionalAttr<UnitAttr>:$b_transpose);		OptionalAttr<UnitAttr>:$b_transpose);

let results = (outs GPU_MMAMatrix : $res);		let results = (outs GPU_MMAMatrix : $res);

let assemblyFormat = [{		let assemblyFormat = [{
$opA`,` $opB`,` $opC attr-dict `:` type($opA)`,` type($opB) `->` type($res)		$opA`,` $opB`,` $opC attr-dict `:` type($opA)`,` type($opB) `->` type($res)
}];		}];
Show All 25 Lines	let description = [{
```mlir		```mlir
%0 = gpu.subgroup_mma_constant_matrix %a :		%0 = gpu.subgroup_mma_constant_matrix %a :
!gpu.mma_matrix<16x16xf16, "AOp">		!gpu.mma_matrix<16x16xf16, "AOp">
%1 = gpu.subgroup_mma_constant_matrix %b :		%1 = gpu.subgroup_mma_constant_matrix %b :
!gpu.mma_matrix<16x16xf32, "COp">		!gpu.mma_matrix<16x16xf32, "COp">
```		```
}];		}];

let arguments = (ins AnyTypeOf<[F16, F32]>:$value);		let arguments = (ins AnyTypeOf<[SI8, UI8, I32, F16, F32]>:$value);

let results = (outs GPU_MMAMatrix:$res);		let results = (outs GPU_MMAMatrix:$res);

let extraClassDeclaration = [{		let extraClassDeclaration = [{
gpu::MMAMatrixType getType() {		gpu::MMAMatrixType getType() {
return getRes().getType().cast<gpu::MMAMatrixType>();		return getRes().getType().cast<gpu::MMAMatrixType>();
}		}
}];		}];
▲ Show 20 Lines • Show All 84 Lines • Show Last 20 Lines

mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h

Show All 31 Lines	enum NVVMMemorySpace {
/// Shared memory space identifier.		/// Shared memory space identifier.
kSharedMemorySpace = 3		kSharedMemorySpace = 3
};		};

/// Return the element type and number of elements associated with a wmma matrix		/// Return the element type and number of elements associated with a wmma matrix
/// of given characteristics. This matches the logic in IntrinsicsNVVM.td		/// of given characteristics. This matches the logic in IntrinsicsNVVM.td
/// WMMA_REGS structure.		/// WMMA_REGS structure.
std::pair<mlir::Type, unsigned> inferMMAType(mlir::NVVM::MMATypes type,		std::pair<mlir::Type, unsigned> inferMMAType(mlir::NVVM::MMATypes type,
mlir::NVVM::MMAFrag frag,		mlir::NVVM::MMAFrag frag, int nRow,
		int nCol,
mlir::MLIRContext *context);		mlir::MLIRContext *context);
} // namespace NVVM		} // namespace NVVM
} // namespace mlir		} // namespace mlir

///// Ops /////		///// Ops /////
#define GET_ATTRDEF_CLASSES		#define GET_ATTRDEF_CLASSES
#include "mlir/Dialect/LLVMIR/NVVMOpsAttributes.h.inc"		#include "mlir/Dialect/LLVMIR/NVVMOpsAttributes.h.inc"

#define GET_OP_CLASSES		#define GET_OP_CLASSES
#include "mlir/Dialect/LLVMIR/NVVMOps.h.inc"		#include "mlir/Dialect/LLVMIR/NVVMOps.h.inc"

#include "mlir/Dialect/LLVMIR/NVVMOpsDialect.h.inc"		#include "mlir/Dialect/LLVMIR/NVVMOpsDialect.h.inc"

#endif /* MLIR_DIALECT_LLVMIR_NVVMDIALECT_H_ */		#endif /* MLIR_DIALECT_LLVMIR_NVVMDIALECT_H_ */

mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td

	Show First 20 Lines • Show All 379 Lines • ▼ Show 20 Lines
	class NVVM_MMA_OPS {			class NVVM_MMA_OPS {
	// "wmma" operations			// "wmma" operations
	list<list<WMMA_REGS>> tf32_wmma_ops = MMA_OPS<			list<list<WMMA_REGS>> tf32_wmma_ops = MMA_OPS<
	[GEOM<16, 16, 8>],			[GEOM<16, 16, 8>],
	["tf32"], [], ["f32"], []>.ret;			["tf32"], [], ["f32"], []>.ret;
	list<list<WMMA_REGS>> fp_wmma_ops = MMA_OPS<			list<list<WMMA_REGS>> fp_wmma_ops = MMA_OPS<
	[GEOM<16, 16, 16>, GEOM<32, 8, 16>, GEOM<8, 32, 16>],			[GEOM<16, 16, 16>, GEOM<32, 8, 16>, GEOM<8, 32, 16>],
	["f16"], [], ["f16", "f32"], []>.ret;			["f16"], [], ["f16", "f32"], []>.ret;
				list<list<WMMA_REGS>> i8_wmma_ops = MMA_OPS<
				[GEOM<16, 16, 16>, GEOM<32, 8, 16>, GEOM<8, 32, 16>],
				["s8","u8"], [], ["s32"], []>.ret;
	list<list<WMMA_REGS>> all_wmma_ops = !listconcat(			list<list<WMMA_REGS>> all_wmma_ops = !listconcat(
	tf32_wmma_ops,			tf32_wmma_ops,
	fp_wmma_ops);			fp_wmma_ops,
				i8_wmma_ops);

	list<WMMA_REGS> ldst_ab_ops = MMA_LDST_OPS<			list<WMMA_REGS> ldst_ab_ops = MMA_LDST_OPS<
	[GEOM<16, 16, 16>, GEOM<32, 8, 16>, GEOM<8, 32, 16>],			[GEOM<16, 16, 16>, GEOM<32, 8, 16>, GEOM<8, 32, 16>],
	["a", "b"], ["f16"]>.ret;			["a", "b"], ["f16","s8","u8"]>.ret;
	list<WMMA_REGS> ldst_cd_ops = MMA_LDST_OPS<			list<WMMA_REGS> ldst_cd_ops = MMA_LDST_OPS<
	[GEOM<16, 16, 16>, GEOM<32, 8, 16>, GEOM<8, 32, 16>],			[GEOM<16, 16, 16>, GEOM<32, 8, 16>, GEOM<8, 32, 16>],
	["c", "d"], ["f16", "f32"]>.ret;			["c", "d"], ["f16", "f32","s32"]>.ret;
	list<WMMA_REGS> ldst_tf32_ab_ops = MMA_LDST_OPS<			list<WMMA_REGS> ldst_tf32_ab_ops = MMA_LDST_OPS<
	[GEOM<16, 16, 8>],			[GEOM<16, 16, 8>],
	["a", "b"], ["tf32"]>.ret;			["a", "b"], ["tf32"]>.ret;
	list<WMMA_REGS> ldst_tf32_cd_ops = MMA_LDST_OPS<			list<WMMA_REGS> ldst_tf32_cd_ops = MMA_LDST_OPS<
	[GEOM<16, 16, 8>],			[GEOM<16, 16, 8>],
	["c", "d"], ["f32"]>.ret;			["c", "d"], ["f32"]>.ret;
	list<WMMA_REGS> all_ldst_ops = !listconcat(ldst_ab_ops, ldst_cd_ops,			list<WMMA_REGS> all_ldst_ops = !listconcat(ldst_ab_ops, ldst_cd_ops,
	ldst_tf32_ab_ops,			ldst_tf32_ab_ops,
	▲ Show 20 Lines • Show All 678 Lines • Show Last 20 Lines

mlir/lib/Conversion/GPUToNVVM/WmmaOpsToNvvm.cpp

Show First 20 Lines • Show All 51 Lines • ▼ Show 20 Lines
}		}

static NVVM::MMATypes getElementType(gpu::MMAMatrixType type) {		static NVVM::MMATypes getElementType(gpu::MMAMatrixType type) {
if (type.getElementType().isF16())		if (type.getElementType().isF16())
return NVVM::MMATypes::f16;		return NVVM::MMATypes::f16;
if (type.getElementType().isF32())		if (type.getElementType().isF32())
return type.getOperand().equals("COp") ? NVVM::MMATypes::f32		return type.getOperand().equals("COp") ? NVVM::MMATypes::f32
: NVVM::MMATypes::tf32;		: NVVM::MMATypes::tf32;

		if (type.getElementType().isSignedInteger(8))
		return NVVM::MMATypes::s8;
		// Accumulator type is signless and implies signed.
		if (type.getElementType().isInteger(32))
		return NVVM::MMATypes::s32;
llvm_unreachable("Unsupported type");		llvm_unreachable("Unsupported type");
}		}

/// This class implements the conversion of GPU MMA loadOp to wmma.load op		/// This class implements the conversion of GPU MMA loadOp to wmma.load op
/// in the NVVM dialect. The conversion not only emits the NVVM op but also		/// in the NVVM dialect. The conversion not only emits the NVVM op but also
/// emits code that is necessary to store the data in the destination memref		/// emits code that is necessary to store the data in the destination memref
/// after it has been loaded.		/// after it has been loaded.
struct WmmaLoadOpToNVVMLowering		struct WmmaLoadOpToNVVMLowering
Show All 33 Lines	if (retType.getOperand().equals("AOp")) {
m = NVVM::WMMALoadOp::inferMDimension(k, n, eltype);		m = NVVM::WMMALoadOp::inferMDimension(k, n, eltype);
} else if (retType.getOperand().equals("COp")) {		} else if (retType.getOperand().equals("COp")) {
m = retTypeShape[0];		m = retTypeShape[0];
n = retTypeShape[1];		n = retTypeShape[1];
k = NVVM::WMMALoadOp::inferKDimension(m, n, eltype);		k = NVVM::WMMALoadOp::inferKDimension(m, n, eltype);
}		}
NVVM::MMAFrag frag = convertOperand(retType.getOperand());		NVVM::MMAFrag frag = convertOperand(retType.getOperand());
// Check that there is an existing instruction for the combination we need.		// Check that there is an existing instruction for the combination we need.
if (NVVM::WMMALoadOp::getIntrinsicID(m, n, k, layout, eltype, frag) == 0)		if (NVVM::WMMALoadOp::getIntrinsicID(m, n, k, layout, eltype, frag) == 0) {
		llvm::errs() << "No matching intrinsic " << m << " " << n << " " << k
		<< "\n";
return rewriter.notifyMatchFailure(op, kInvalidCaseStr);		return rewriter.notifyMatchFailure(op, kInvalidCaseStr);
		}

Type resType = convertMMAToLLVMType(retType);		Type resType = convertMMAToLLVMType(retType);
Location loc = op->getLoc();		Location loc = op->getLoc();

// Create nvvm.mma_load op according to the operand types.		// Create nvvm.mma_load op according to the operand types.
Value dataPtr = getStridedElementPtr(		Value dataPtr = getStridedElementPtr(
loc,		loc,
subgroupMmaLoadMatrixOp.getSrcMemref().getType().cast<MemRefType>(),		subgroupMmaLoadMatrixOp.getSrcMemref().getType().cast<MemRefType>(),
▲ Show 20 Lines • Show All 242 Lines • ▼ Show 20 Lines
};		};

} // namespace		} // namespace

/// Return the LLVMStructureType corresponding to the MMAMatrixType `type`.		/// Return the LLVMStructureType corresponding to the MMAMatrixType `type`.
LLVM::LLVMStructType mlir::convertMMAToLLVMType(gpu::MMAMatrixType type) {		LLVM::LLVMStructType mlir::convertMMAToLLVMType(gpu::MMAMatrixType type) {
NVVM::MMAFrag frag = convertOperand(type.getOperand());		NVVM::MMAFrag frag = convertOperand(type.getOperand());
NVVM::MMATypes eltType = getElementType(type);		NVVM::MMATypes eltType = getElementType(type);
		auto nRow = type.getShape()[0];
		auto nCol = type.getShape()[1];
std::pair<Type, unsigned> typeInfo =		std::pair<Type, unsigned> typeInfo =
NVVM::inferMMAType(eltType, frag, type.getContext());		NVVM::inferMMAType(eltType, frag, nRow, nCol, type.getContext());
return LLVM::LLVMStructType::getLiteral(		return LLVM::LLVMStructType::getLiteral(
type.getContext(), SmallVector<Type, 8>(typeInfo.second, typeInfo.first));		type.getContext(), SmallVector<Type, 8>(typeInfo.second, typeInfo.first));
}		}

void mlir::populateGpuWMMAToNVVMConversionPatterns(		void mlir::populateGpuWMMAToNVVMConversionPatterns(
LLVMTypeConverter &converter, RewritePatternSet &patterns) {		LLVMTypeConverter &converter, RewritePatternSet &patterns) {
patterns.add<WmmaLoadOpToNVVMLowering, WmmaMmaOpToNVVMLowering,		patterns.add<WmmaLoadOpToNVVMLowering, WmmaMmaOpToNVVMLowering,
WmmaStoreOpToNVVMLowering, WmmaConstantOpToNVVMLowering,		WmmaStoreOpToNVVMLowering, WmmaConstantOpToNVVMLowering,
WmmaElementwiseOpToNVVMLowering>(converter);		WmmaElementwiseOpToNVVMLowering>(converter);
}		}

mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp

Show First 20 Lines • Show All 134 Lines • ▼ Show 20 Lines
// Return true if the transfer op can be converted to a MMA matrix load.		// Return true if the transfer op can be converted to a MMA matrix load.
static bool transferReadSupportsMMAMatrixType(vector::TransferReadOp readOp,		static bool transferReadSupportsMMAMatrixType(vector::TransferReadOp readOp,
bool useNvGpu) {		bool useNvGpu) {
if (readOp.getMask() \|\| readOp.hasOutOfBoundsDim() \|\|		if (readOp.getMask() \|\| readOp.hasOutOfBoundsDim() \|\|
readOp.getVectorType().getRank() != 2)		readOp.getVectorType().getRank() != 2)
return false;		return false;
if (!getMemrefConstantHorizontalStride(readOp.getShapedType()))		if (!getMemrefConstantHorizontalStride(readOp.getShapedType()))
return false;		return false;

		// Only allow integer types if the signedness can be inferred.
		if (!useNvGpu && readOp.getVectorType().getElementType().isInteger(8))
		if (!readOp->hasOneUse() \|\| !isa<arith::ExtSIOp>(*readOp->user_begin()))
		return false;

AffineMap map = readOp.getPermutationMap();		AffineMap map = readOp.getPermutationMap();
OpBuilder b(readOp.getContext());		OpBuilder b(readOp.getContext());
AffineExpr innerDim = b.getAffineDimExpr(map.getNumDims() - 1);		AffineExpr innerDim = b.getAffineDimExpr(map.getNumDims() - 1);
AffineExpr zero = b.getAffineConstantExpr(0);		AffineExpr zero = b.getAffineConstantExpr(0);
auto broadcastInnerDim = AffineMap::get(map.getNumDims(), 0, {zero, innerDim},		auto broadcastInnerDim = AffineMap::get(map.getNumDims(), 0, {zero, innerDim},
readOp.getContext());		readOp.getContext());

if (!useNvGpu) {		if (!useNvGpu) {
Show All 29 Lines	static bool constantSupportsMMAMatrixType(arith::ConstantOp constantOp) {
auto vecType = constantOp.getType().dyn_cast<VectorType>();		auto vecType = constantOp.getType().dyn_cast<VectorType>();
if (!vecType \|\| vecType.getRank() != 2)		if (!vecType \|\| vecType.getRank() != 2)
return false;		return false;
return constantOp.getValue().isa<SplatElementsAttr>();		return constantOp.getValue().isa<SplatElementsAttr>();
}		}

/// Return true if this is a broadcast from scalar to a 2D vector.		/// Return true if this is a broadcast from scalar to a 2D vector.
static bool broadcastSupportsMMAMatrixType(vector::BroadcastOp broadcastOp) {		static bool broadcastSupportsMMAMatrixType(vector::BroadcastOp broadcastOp) {
return broadcastOp.getVectorType().getRank() == 2 &&		return broadcastOp.getVectorType().getRank() == 2;
broadcastOp.getSource().getType().isa<FloatType>();		}

		/// Return true if this signed extend op can be folded into a contract op.
		static bool signedExtendSupportsMMAMatrixType(arith::ExtSIOp extOp) {
		if (!isa<vector::TransferReadOp>(extOp.getOperand().getDefiningOp()))
		return false;
		return llvm::all_of(extOp->getUsers(), [](Operation *user) {
		return isa<vector::ContractionOp>(user);
		[Inline comment — ThomasRaoux (not done)]: Do we need this?
		[Inline reply — qedawkins, author (done)]: My thinking was that if there isn't a contraction op to match against, then we can't safely fuse the signed extend, but looking at this now there is no reason to actually have a check like this :P
		});
}		}

/// Return the MMA elementwise enum associated with `op` if it is supported.		/// Return the MMA elementwise enum associated with `op` if it is supported.
/// Return `std::nullopt` otherwise.		/// Return `std::nullopt` otherwise.
static std::optional<gpu::MMAElementwiseOp>		static std::optional<gpu::MMAElementwiseOp>
convertElementwiseOpToMMA(Operation *op) {		convertElementwiseOpToMMA(Operation *op) {
if (isa<arith::AddFOp>(op))		if (isa<arith::AddFOp>(op))
return gpu::MMAElementwiseOp::ADDF;		return gpu::MMAElementwiseOp::ADDF;
▲ Show 20 Lines • Show All 65 Lines • ▼ Show 20 Lines	if (auto extractStridedSlice = dyn_cast<vector::ExtractStridedSliceOp>(op))
return useNvGpu &&		return useNvGpu &&
extractStridedSliceSupportsMMAMatrixType(extractStridedSlice);		extractStridedSliceSupportsMMAMatrixType(extractStridedSlice);
if (auto contract = dyn_cast<vector::ContractionOp>(op))		if (auto contract = dyn_cast<vector::ContractionOp>(op))
return contractSupportsMMAMatrixType(contract, useNvGpu);		return contractSupportsMMAMatrixType(contract, useNvGpu);
if (auto constant = dyn_cast<arith::ConstantOp>(op))		if (auto constant = dyn_cast<arith::ConstantOp>(op))
return constantSupportsMMAMatrixType(constant);		return constantSupportsMMAMatrixType(constant);
if (auto broadcast = dyn_cast<vector::BroadcastOp>(op))		if (auto broadcast = dyn_cast<vector::BroadcastOp>(op))
return broadcastSupportsMMAMatrixType(broadcast);		return broadcastSupportsMMAMatrixType(broadcast);
		if (auto extend = dyn_cast<arith::ExtSIOp>(op))
		return signedExtendSupportsMMAMatrixType(extend);
return elementwiseSupportsMMAMatrixType(op);		return elementwiseSupportsMMAMatrixType(op);
}		}

/// Return an unsorted slice handling scf.for region differently than		/// Return an unsorted slice handling scf.for region differently than
/// `getSlice`. In scf.for we only want to include as part of the slice elements		/// `getSlice`. In scf.for we only want to include as part of the slice elements
/// that are part of the use/def chain.		/// that are part of the use/def chain.
static SetVector<Operation > getSliceContract(Operation op,		static SetVector<Operation > getSliceContract(Operation op,
TransitiveFilter backwardFilter,		TransitiveFilter backwardFilter,
▲ Show 20 Lines • Show All 127 Lines • ▼ Show 20 Lines
// respectively. We can fold the transpose operation when loading the data from		// respectively. We can fold the transpose operation when loading the data from
// Shared Memory to registers.		// Shared Memory to registers.
struct CombineTransferReadOpTranspose final		struct CombineTransferReadOpTranspose final
: public OpRewritePattern<vector::TransposeOp> {		: public OpRewritePattern<vector::TransposeOp> {
using OpRewritePattern<vector::TransposeOp>::OpRewritePattern;		using OpRewritePattern<vector::TransposeOp>::OpRewritePattern;

LogicalResult matchAndRewrite(vector::TransposeOp op,		LogicalResult matchAndRewrite(vector::TransposeOp op,
PatternRewriter &rewriter) const override {		PatternRewriter &rewriter) const override {
auto transferReadOp =		// Look through integer extend ops.
op.getVector().getDefiningOp<vector::TransferReadOp>();		Value source = op.getVector();
		auto extOp = source.getDefiningOp<arith::ExtSIOp>();
		auto resultType = op.getVectorType();
		if (extOp) {
		source = extOp.getOperand();
		resultType =
		VectorType::get(resultType.getShape(),
		source.getType().cast<VectorType>().getElementType());
		}

		auto transferReadOp = source.getDefiningOp<vector::TransferReadOp>();
if (!transferReadOp)		if (!transferReadOp)
return failure();		return failure();

// TODO: support 0-d corner case.		// TODO: support 0-d corner case.
if (transferReadOp.getTransferRank() == 0)		if (transferReadOp.getTransferRank() == 0)
return failure();		return failure();

if (transferReadOp.getMask() \|\| transferReadOp.hasOutOfBoundsDim())		if (transferReadOp.getMask() \|\| transferReadOp.hasOutOfBoundsDim())
return failure();		return failure();
SmallVector<int64_t, 2> perm;		SmallVector<int64_t, 2> perm;
op.getTransp(perm);		op.getTransp(perm);
SmallVector<unsigned, 2> permU;		SmallVector<unsigned, 2> permU;
for (int64_t o : perm)		for (int64_t o : perm)
permU.push_back(unsigned(o));		permU.push_back(unsigned(o));
AffineMap permutationMap =		AffineMap permutationMap =
AffineMap::getPermutationMap(permU, op.getContext());		AffineMap::getPermutationMap(permU, op.getContext());
AffineMap newMap =		AffineMap newMap =
permutationMap.compose(transferReadOp.getPermutationMap());		permutationMap.compose(transferReadOp.getPermutationMap());
rewriter.replaceOpWithNewOp<vector::TransferReadOp>(
op, op.getType(), transferReadOp.getSource(),		auto loc = op.getLoc();
		Value result =
		rewriter
		.create<vector::TransferReadOp>(
		loc, resultType, transferReadOp.getSource(),
transferReadOp.getIndices(), AffineMapAttr::get(newMap),		transferReadOp.getIndices(), AffineMapAttr::get(newMap),
transferReadOp.getPadding(), transferReadOp.getMask(),		transferReadOp.getPadding(), transferReadOp.getMask(),
transferReadOp.getInBoundsAttr());		transferReadOp.getInBoundsAttr())
		.getResult();

		// Fuse through the integer extend op.
		if (extOp)
		result = rewriter.create<arith::ExtSIOp>(loc, op.getType(), result)
		.getResult();

		rewriter.replaceOp(op, result);
return success();		return success();
}		}
};		};

} // namespace		} // namespace

// MMA types have different layout based on how they are used in matmul ops.		// MMA types have different layout based on how they are used in matmul ops.
// Figure the right layout to use by looking at op uses.		// Figure the right layout to use by looking at op uses.
Show All 27 Lines	static void convertTransferReadOp(vector::TransferReadOp op,

// Handle broadcast by setting the stride to 0.		// Handle broadcast by setting the stride to 0.
if (auto cstExpr =		if (auto cstExpr =
map.getResult(isTranspose).dyn_cast<AffineConstantExpr>()) {		map.getResult(isTranspose).dyn_cast<AffineConstantExpr>()) {
assert(cstExpr.getValue() == 0);		assert(cstExpr.getValue() == 0);
stride = 0;		stride = 0;
}		}
assert(stride);		assert(stride);
		Value mappingResult = op.getResult();
		auto elType = op.getVectorType().getElementType();
const char *fragType = inferFragType(op);		const char *fragType = inferFragType(op);
		if (op->hasOneUse()) {
		auto extOp = dyn_cast<arith::ExtSIOp>(*op->user_begin());
		// Infer the signedness of the mma type from the signed extend.
		if (extOp) {
		elType = IntegerType::get(op.getContext(),
		elType.cast<IntegerType>().getWidth(),
		IntegerType::Signed);
		mappingResult = extOp.getResult();
		fragType = inferFragType(extOp);
		}
		}
gpu::MMAMatrixType type =		gpu::MMAMatrixType type =
gpu::MMAMatrixType::get(op.getVectorType().getShape(),		gpu::MMAMatrixType::get(op.getVectorType().getShape(), elType, fragType);
op.getVectorType().getElementType(), fragType);
Value load = b.create<gpu::SubgroupMmaLoadMatrixOp>(		Value load = b.create<gpu::SubgroupMmaLoadMatrixOp>(
op.getLoc(), type, op.getSource(), op.getIndices(),		op.getLoc(), type, op.getSource(), op.getIndices(),
b.getIndexAttr(*stride), isTranspose ? b.getUnitAttr() : UnitAttr());		b.getIndexAttr(*stride), isTranspose ? b.getUnitAttr() : UnitAttr());
valueMapping[op.getResult()] = load;		valueMapping[mappingResult] = load;
}		}

static void convertTransferWriteOp(vector::TransferWriteOp op,		static void convertTransferWriteOp(vector::TransferWriteOp op,
llvm::DenseMap<Value, Value> &valueMapping) {		llvm::DenseMap<Value, Value> &valueMapping) {
assert(transferWriteSupportsMMAMatrixType(op));		assert(transferWriteSupportsMMAMatrixType(op));
std::optional<int64_t> stride =		std::optional<int64_t> stride =
getMemrefConstantHorizontalStride(op.getShapedType());		getMemrefConstantHorizontalStride(op.getShapedType());
assert(stride);		assert(stride);
▲ Show 20 Lines • Show All 581 Lines • Show Last 20 Lines

mlir/lib/Dialect/GPU/IR/GPUDialect.cpp

Show First 20 Lines • Show All 72 Lines • ▼ Show 20 Lines	ArrayRef<int64_t> MMAMatrixType::getShape() const {
return getImpl()->getShape();		return getImpl()->getShape();
}		}

Type MMAMatrixType::getElementType() const { return getImpl()->elementType; }		Type MMAMatrixType::getElementType() const { return getImpl()->elementType; }

StringRef MMAMatrixType::getOperand() const { return getImpl()->getOperand(); }		StringRef MMAMatrixType::getOperand() const { return getImpl()->getOperand(); }

bool MMAMatrixType::isValidElementType(Type elementType) {		bool MMAMatrixType::isValidElementType(Type elementType) {
return elementType.isF16() \|\| elementType.isF32();		return elementType.isF16() \|\| elementType.isF32() \|\|
		elementType.isUnsignedInteger(8) \|\| elementType.isSignedInteger(8) \|\|
		elementType.isInteger(32);
}		}

LogicalResult		LogicalResult
MMAMatrixType::verify(function_ref<InFlightDiagnostic()> emitError,		MMAMatrixType::verify(function_ref<InFlightDiagnostic()> emitError,
ArrayRef<int64_t> shape, Type elementType,		ArrayRef<int64_t> shape, Type elementType,
StringRef operand) {		StringRef operand) {
if (!operand.equals("AOp") && !operand.equals("BOp") &&		if (!operand.equals("AOp") && !operand.equals("BOp") &&
!operand.equals("COp"))		!operand.equals("COp"))
return emitError() << "operand expected to be one of AOp, BOp or COp";		return emitError() << "operand expected to be one of AOp, BOp or COp";

if (shape.size() != 2)		if (shape.size() != 2)
return emitError() << "MMAMatrixType must have exactly two dimensions";		return emitError() << "MMAMatrixType must have exactly two dimensions";

if (!MMAMatrixType::isValidElementType(elementType))		if (!MMAMatrixType::isValidElementType(elementType))
return emitError() << "MMAMatrixType elements must be F16 or F32";		return emitError()
		<< "MMAMatrixType elements must be SI8, UI8, I32, F16, or F32";

return success();		return success();
}		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// GPUDialect		// GPUDialect
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

▲ Show 20 Lines • Show All 1,340 Lines • Show Last 20 Lines

mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp

Show First 20 Lines • Show All 531 Lines • ▼ Show 20 Lines	auto elementType = (type && type.getBody().size() == 2)
: nullptr;		: nullptr;
if (!elementType \|\| elementType.getWidth() != 1)		if (!elementType \|\| elementType.getWidth() != 1)
return emitError("expected return type to be a two-element struct with "		return emitError("expected return type to be a two-element struct with "
"i1 as the second element");		"i1 as the second element");
return success();		return success();
}		}

std::pair<mlir::Type, unsigned> NVVM::inferMMAType(NVVM::MMATypes type,		std::pair<mlir::Type, unsigned> NVVM::inferMMAType(NVVM::MMATypes type,
NVVM::MMAFrag frag,		NVVM::MMAFrag frag, int nRow,
		int nCol,
MLIRContext *context) {		MLIRContext *context) {
unsigned numberElements = 0;		unsigned numberElements = 0;
Type elementType;		Type elementType;
OpBuilder builder(context);		OpBuilder builder(context);
Type f16x2 = VectorType::get(2, builder.getF16Type());		Type f16x2 = VectorType::get(2, builder.getF16Type());
if (type == NVVM::MMATypes::f16) {		if (type == NVVM::MMATypes::f16) {
elementType = f16x2;		elementType = f16x2;
if (frag == NVVM::MMAFrag::a || frag == NVVM::MMAFrag::b)		if (frag == NVVM::MMAFrag::a || frag == NVVM::MMAFrag::b)
numberElements = 8;		numberElements = 8;
else		else
numberElements = 4;		numberElements = 4;
} else if (type == NVVM::MMATypes::f32) {		} else if (type == NVVM::MMATypes::f32) {
elementType = builder.getF32Type();		elementType = builder.getF32Type();
numberElements = 8;		numberElements = 8;
} else if (type == NVVM::MMATypes::tf32) {		} else if (type == NVVM::MMATypes::tf32) {
elementType = builder.getI32Type();		elementType = builder.getI32Type();
numberElements = 4;		numberElements = 4;
		} else if (type == NVVM::MMATypes::s8 || type == NVVM::MMATypes::u8) {
		elementType = builder.getI32Type();
		int parallelSize = 0;
		if (frag == NVVM::MMAFrag::a)
		parallelSize = nRow;
		if (frag == NVVM::MMAFrag::b)
		parallelSize = nCol;

		// m == 16 && n == 16 && k == 16
		if (parallelSize == 16)
		numberElements = 2;
		// m == 8 && n == 32 && k == 16 or m == 32 && n == 8 && k == 16
		else if (parallelSize == 8)
		numberElements = 1;
		else if (parallelSize == 32)
		numberElements = 4;
		} else if (type == NVVM::MMATypes::s32) {
		elementType = builder.getI32Type();
		numberElements = 8;
}		}
assert(numberElements != 0 && elementType != nullptr);		assert(numberElements != 0 && elementType != nullptr);
return std::make_pair(elementType, numberElements);		return std::make_pair(elementType, numberElements);
}		}

		static std::pair<mlir::Type, unsigned>
		inferMMATypeFromMNK(NVVM::MMATypes type, NVVM::MMAFrag frag, int m, int n,
		int k, MLIRContext *context) {
		int nRow, nCol;
		if (frag == NVVM::MMAFrag::a) {
		nRow = m;
		nCol = k;
		} else if (frag == NVVM::MMAFrag::b) {
		nRow = k;
		nCol = n;
		} else {
		nRow = m;
		nCol = n;
		}
		assert(nRow && nCol);
		return inferMMAType(type, frag, nRow, nCol, context);
		}

LogicalResult NVVM::WMMALoadOp::verify() {		LogicalResult NVVM::WMMALoadOp::verify() {
unsigned addressSpace =		unsigned addressSpace =
getPtr().getType().cast<LLVM::LLVMPointerType>().getAddressSpace();		getPtr().getType().cast<LLVM::LLVMPointerType>().getAddressSpace();
if (addressSpace != 0 && addressSpace != 1 && addressSpace != 3)		if (addressSpace != 0 && addressSpace != 1 && addressSpace != 3)
return emitOpError("expected source pointer in memory "		return emitOpError("expected source pointer in memory "
"space 0, 1, 3");		"space 0, 1, 3");

if (NVVM::WMMALoadOp::getIntrinsicID(getM(), getN(), getK(), getLayout(),		if (NVVM::WMMALoadOp::getIntrinsicID(getM(), getN(), getK(), getLayout(),
getEltype(), getFrag()) == 0)		getEltype(), getFrag()) == 0)
return emitOpError() << "invalid attribute combination";		return emitOpError() << "invalid attribute combination";
std::pair<Type, unsigned> typeInfo =		std::pair<Type, unsigned> typeInfo = inferMMATypeFromMNK(
inferMMAType(getEltype(), getFrag(), getContext());		getEltype(), getFrag(), getM(), getN(), getK(), getContext());
Type dstType = LLVM::LLVMStructType::getLiteral(		Type dstType = LLVM::LLVMStructType::getLiteral(
getContext(), SmallVector<Type, 8>(typeInfo.second, typeInfo.first));		getContext(), SmallVector<Type, 8>(typeInfo.second, typeInfo.first));
if (getType() != dstType)		if (getType() != dstType)
return emitOpError("expected destination type is a structure of ")		return emitOpError("expected destination type is a structure of ")
<< typeInfo.second << " elements of type " << typeInfo.first;		<< typeInfo.second << " elements of type " << typeInfo.first;
return success();		return success();
}		}

LogicalResult NVVM::WMMAStoreOp::verify() {		LogicalResult NVVM::WMMAStoreOp::verify() {
unsigned addressSpace =		unsigned addressSpace =
getPtr().getType().cast<LLVM::LLVMPointerType>().getAddressSpace();		getPtr().getType().cast<LLVM::LLVMPointerType>().getAddressSpace();
if (addressSpace != 0 && addressSpace != 1 && addressSpace != 3)		if (addressSpace != 0 && addressSpace != 1 && addressSpace != 3)
return emitOpError("expected operands to be a source pointer in memory "		return emitOpError("expected operands to be a source pointer in memory "
"space 0, 1, 3");		"space 0, 1, 3");

if (NVVM::WMMAStoreOp::getIntrinsicID(getM(), getN(), getK(), getLayout(),		if (NVVM::WMMAStoreOp::getIntrinsicID(getM(), getN(), getK(), getLayout(),
getEltype()) == 0)		getEltype()) == 0)
return emitOpError() << "invalid attribute combination";		return emitOpError() << "invalid attribute combination";
std::pair<Type, unsigned> typeInfo =		std::pair<Type, unsigned> typeInfo = inferMMATypeFromMNK(
inferMMAType(getEltype(), NVVM::MMAFrag::c, getContext());		getEltype(), NVVM::MMAFrag::c, getM(), getN(), getK(), getContext());
if (getArgs().size() != typeInfo.second)		if (getArgs().size() != typeInfo.second)
return emitOpError() << "expected " << typeInfo.second << " data operands";		return emitOpError() << "expected " << typeInfo.second << " data operands";
if (llvm::any_of(getArgs(), [&typeInfo](Value operands) {		if (llvm::any_of(getArgs(), [&typeInfo](Value operands) {
return operands.getType() != typeInfo.first;		return operands.getType() != typeInfo.first;
}))		}))
return emitOpError() << "expected data operands of type " << typeInfo.first;		return emitOpError() << "expected data operands of type " << typeInfo.first;
return success();		return success();
}		}

LogicalResult NVVM::WMMAMmaOp::verify() {		LogicalResult NVVM::WMMAMmaOp::verify() {
if (NVVM::WMMAMmaOp::getIntrinsicID(getM(), getN(), getK(), getLayoutA(),		if (NVVM::WMMAMmaOp::getIntrinsicID(getM(), getN(), getK(), getLayoutA(),
getLayoutB(), getEltypeA(),		getLayoutB(), getEltypeA(),
getEltypeB()) == 0)		getEltypeB()) == 0)
return emitOpError() << "invalid attribute combination";		return emitOpError() << "invalid attribute combination";
std::pair<Type, unsigned> typeInfoA =		std::pair<Type, unsigned> typeInfoA = inferMMATypeFromMNK(
inferMMAType(getEltypeA(), NVVM::MMAFrag::a, getContext());		getEltypeA(), NVVM::MMAFrag::a, getM(), getN(), getK(), getContext());
std::pair<Type, unsigned> typeInfoB =		std::pair<Type, unsigned> typeInfoB = inferMMATypeFromMNK(
inferMMAType(getEltypeA(), NVVM::MMAFrag::b, getContext());		getEltypeA(), NVVM::MMAFrag::b, getM(), getN(), getK(), getContext());
std::pair<Type, unsigned> typeInfoC =		std::pair<Type, unsigned> typeInfoC = inferMMATypeFromMNK(
inferMMAType(getEltypeB(), NVVM::MMAFrag::c, getContext());		getEltypeB(), NVVM::MMAFrag::c, getM(), getN(), getK(), getContext());
SmallVector<Type, 32> arguments;		SmallVector<Type, 32> arguments;
arguments.append(typeInfoA.second, typeInfoA.first);		arguments.append(typeInfoA.second, typeInfoA.first);
arguments.append(typeInfoB.second, typeInfoB.first);		arguments.append(typeInfoB.second, typeInfoB.first);
arguments.append(typeInfoC.second, typeInfoC.first);		arguments.append(typeInfoC.second, typeInfoC.first);
unsigned numArgs = arguments.size();		unsigned numArgs = arguments.size();
if (getArgs().size() != numArgs)		if (getArgs().size() != numArgs)
return emitOpError() << "expected " << numArgs << " arguments";		return emitOpError() << "expected " << numArgs << " arguments";
for (unsigned i = 0; i < numArgs; i++) {		for (unsigned i = 0; i < numArgs; i++) {
▲ Show 20 Lines • Show All 95 Lines • Show Last 20 Lines

mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir

Show All 36 Lines	func.func @gpu_wmma_load_op() -> (!gpu.mma_matrix<16x16xf16, "AOp">) {
return %0 : !gpu.mma_matrix<16x16xf16, "AOp">		return %0 : !gpu.mma_matrix<16x16xf16, "AOp">
}		}
}		}

// -----		// -----

gpu.module @test_module {		gpu.module @test_module {

		// CHECK-LABEL: func @gpu_wmma_int8_load_op() ->
		// CHECK-SAME: !llvm.struct<(i32, i32)>
		// CHECK32-LABEL: func @gpu_wmma_int8_load_op() ->
		func.func @gpu_wmma_int8_load_op() -> (!gpu.mma_matrix<16x16xsi8, "AOp">) {
		%wg = memref.alloca() {alignment = 32} : memref<32x32xi8, 3>
		%i = arith.constant 16 : index
		%j = arith.constant 16 : index
		%0 = gpu.subgroup_mma_load_matrix %wg[%i, %j] {leadDimension = 32 : index, transpose} : memref<32x32xi8, 3> -> !gpu.mma_matrix<16x16xsi8, "AOp">
		// CHECK: %[[INX:.*]] = llvm.mlir.constant(16 : index) : i64
		// CHECK: %{{.*}} = llvm.insertvalue %{{.*}}, %{{.*}}[{{.*}}, {{.*}}]
		// CHECK: %[[BASE:.*]] = llvm.extractvalue %{{.*}}[1] : !llvm.struct<(ptr<i8, 3>, ptr<i8, 3>, i64, array<2 x i64>, array<2 x i64>)>
		// CHECK: %[[LDM:.*]] = llvm.mlir.constant(32 : index) : i64
		// CHECK: %[[LI:.*]] = llvm.mul %[[INX]], %[[LDM]] : i64
		// CHECK: %[[LIJ:.*]] = llvm.add %[[LI]], %[[INX]] : i64
		// CHECK: %[[ADDRESS:.*]] = llvm.getelementptr %[[BASE]][%[[LIJ]]] : (!llvm.ptr<i8, 3>, i64) -> !llvm.ptr<i8, 3>
		// CHECK: %[[LDM32:.*]] = llvm.mlir.constant(32 : index) : i32
		// CHECK: %[[FRAG:.*]] = nvvm.wmma.load %[[ADDRESS]], %[[LDM32]]
		// CHECK-SAME: {eltype = #nvvm.mma_type<s8>, frag = #nvvm.mma_frag<a>, k = 16 : i32, layout = #nvvm.mma_layout<col>, m = 16 : i32, n = 16 : i32} : (!llvm.ptr<i8, 3>) -> !llvm.struct<(i32, i32)>
		// CHECK: llvm.return %[[FRAG]] : !llvm.struct<(i32, i32)>

		// CHECK32: %[[INX:.*]] = llvm.mlir.constant(16 : index) : i32
		// CHECK32: %{{.*}} = llvm.insertvalue %{{.*}}, %{{.*}}[{{.*}}, {{.*}}]
		// CHECK32: %[[BASE:.*]] = llvm.extractvalue %{{.*}}[1] : !llvm.struct<(ptr<i8, 3>, ptr<i8, 3>, i32, array<2 x i32>, array<2 x i32>)>
		// CHECK32: %[[LDM:.*]] = llvm.mlir.constant(32 : index) : i32
		// CHECK32: %[[LI:.*]] = llvm.mul %[[INX]], %[[LDM]] : i32
		// CHECK32: %[[LIJ:.*]] = llvm.add %[[LI]], %[[INX]] : i32
		// CHECK32: %[[ADDRESS:.*]] = llvm.getelementptr %[[BASE]][%[[LIJ]]] : (!llvm.ptr<i8, 3>, i32) -> !llvm.ptr<i8, 3>
		// CHECK32: %[[LDM32:.*]] = llvm.mlir.constant(32 : index) : i32
		// CHECK32: %[[FRAG:.*]] = nvvm.wmma.load %[[ADDRESS]], %[[LDM32]]
		// CHECK32-SAME: {eltype = #nvvm.mma_type<s8>, frag = #nvvm.mma_frag<a>, k = 16 : i32, layout = #nvvm.mma_layout<col>, m = 16 : i32, n = 16 : i32} : (!llvm.ptr<i8, 3>) -> !llvm.struct<(i32, i32)>
		// CHECK32: llvm.return %[[FRAG]] : !llvm.struct<(i32, i32)>
		return %0 : !gpu.mma_matrix<16x16xsi8, "AOp">
		}
		}

		// -----

		gpu.module @test_module {

// CHECK-LABEL: func @gpu_wmma_store_op		// CHECK-LABEL: func @gpu_wmma_store_op
// CHECK-SAME: (%[[D:.*]]: !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>)		// CHECK-SAME: (%[[D:.*]]: !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>)
// CHECK32-LABEL: func @gpu_wmma_store_op		// CHECK32-LABEL: func @gpu_wmma_store_op
// CHECK32-SAME: (%[[D:.*]]: !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>)		// CHECK32-SAME: (%[[D:.*]]: !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>)
func.func @gpu_wmma_store_op(%arg0 : !gpu.mma_matrix<16x16xf16, "COp">) -> () {		func.func @gpu_wmma_store_op(%arg0 : !gpu.mma_matrix<16x16xf16, "COp">) -> () {
%sg = memref.alloca(){alignment = 32} : memref<32x32xf16, 3>		%sg = memref.alloca(){alignment = 32} : memref<32x32xf16, 3>
%i = arith.constant 16 : index		%i = arith.constant 16 : index
%j = arith.constant 16 : index		%j = arith.constant 16 : index
▲ Show 20 Lines • Show All 68 Lines • ▼ Show 20 Lines	func.func @gpu_wmma_mma_op(%A : !gpu.mma_matrix<16x16xf16, "AOp">, %B : !gpu.mma_matrix<16x16xf16, "BOp">, %C : !gpu.mma_matrix<16x16xf16, "COp">) -> (!gpu.mma_matrix<16x16xf16, "COp">) {
return %D : !gpu.mma_matrix<16x16xf16, "COp">		return %D : !gpu.mma_matrix<16x16xf16, "COp">
}		}
}		}

// -----		// -----

gpu.module @test_module {		gpu.module @test_module {

		// CHECK-LABEL: func @gpu_wmma_mma_int8_op
		// CHECK-SAME: (%[[A:.*]]: !llvm.struct<(i32, i32, i32, i32)>, %[[B:.*]]: !llvm.struct<(i32)>, %[[C:.*]]: !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)
		func.func @gpu_wmma_mma_int8_op(%A : !gpu.mma_matrix<32x16xsi8, "AOp">, %B : !gpu.mma_matrix<16x8xsi8, "BOp">, %C : !gpu.mma_matrix<32x8xi32, "COp">) -> (!gpu.mma_matrix<32x8xi32, "COp">) {
		%D = gpu.subgroup_mma_compute %A, %B, %C {a_transpose} : !gpu.mma_matrix<32x16xsi8, "AOp">, !gpu.mma_matrix<16x8xsi8, "BOp"> -> !gpu.mma_matrix<32x8xi32, "COp">
		// CHECK: %[[A1:.*]] = llvm.extractvalue %[[A]][0] : !llvm.struct<(i32, i32, i32, i32)>
		// CHECK: %[[A2:.*]] = llvm.extractvalue %[[A]][1] : !llvm.struct<(i32, i32, i32, i32)>
		// CHECK: %[[A3:.*]] = llvm.extractvalue %[[A]][2] : !llvm.struct<(i32, i32, i32, i32)>
		// CHECK: %[[A4:.*]] = llvm.extractvalue %[[A]][3] : !llvm.struct<(i32, i32, i32, i32)>
		// CHECK: %[[B1:.*]] = llvm.extractvalue %[[B]][0] : !llvm.struct<(i32)>
		// CHECK: %[[C1:.*]] = llvm.extractvalue %[[C]][0] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
		// CHECK: %[[C2:.*]] = llvm.extractvalue %[[C]][1] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
		// CHECK: %[[C3:.*]] = llvm.extractvalue %[[C]][2] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
		// CHECK: %[[C4:.*]] = llvm.extractvalue %[[C]][3] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
		// CHECK: %[[C5:.*]] = llvm.extractvalue %[[C]][4] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
		// CHECK: %[[C6:.*]] = llvm.extractvalue %[[C]][5] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
		// CHECK: %[[C7:.*]] = llvm.extractvalue %[[C]][6] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
		// CHECK: %[[C8:.*]] = llvm.extractvalue %[[C]][7] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
		// CHECK: %[[RES:.*]] = nvvm.wmma.mma %[[A1]], %[[A2]], %[[A3]], %[[A4]], %[[B1]], %[[C1]], %[[C2]], %[[C3]], %[[C4]], %[[C5]], %[[C6]], %[[C7]], %[[C8]]
		// CHECK-SAME: {eltypeA = #nvvm.mma_type<s8>, eltypeB = #nvvm.mma_type<s32>, k = 16 : i32, layoutA = #nvvm.mma_layout<col>, layoutB = #nvvm.mma_layout<row>, m = 32 : i32, n = 8 : i32} : (
		// CHECK-SAME: i32, {{.*}}) -> !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
		// CHECK: llvm.return %[[RES]] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
		return %D : !gpu.mma_matrix<32x8xi32, "COp">
		}
		}

		// -----

		gpu.module @test_module {

// CHECK-LABEL: func @gpu_wmma_mma_loop_op		// CHECK-LABEL: func @gpu_wmma_mma_loop_op
// CHECK: %[[C:.+]] = nvvm.wmma.load %{{.*}}, %{{.*}} {eltype = #nvvm.mma_type<f16>, frag = #nvvm.mma_frag<c>, k = 16 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32} : (!llvm.ptr<f16>) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>		// CHECK: %[[C:.+]] = nvvm.wmma.load %{{.*}}, %{{.*}} {eltype = #nvvm.mma_type<f16>, frag = #nvvm.mma_frag<c>, k = 16 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32} : (!llvm.ptr<f16>) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
// CHECK: llvm.br ^bb1(%{{.*}}, %[[C]] : i64, !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>)		// CHECK: llvm.br ^bb1(%{{.*}}, %[[C]] : i64, !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>)
// CHECK: ^bb1(%{{.*}}: i64, %[[ACC:.+]]: !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>): // 2 preds: ^bb0, ^bb2		// CHECK: ^bb1(%{{.*}}: i64, %[[ACC:.+]]: !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>): // 2 preds: ^bb0, ^bb2
// CHECK: llvm.cond_br %{{.*}}, ^bb2, ^bb3		// CHECK: llvm.cond_br %{{.*}}, ^bb2, ^bb3
// CHECK: ^bb2: // pred: ^bb1		// CHECK: ^bb2: // pred: ^bb1
// CHECK: %[[A:.+]] = nvvm.wmma.load %{{.*}}, %{{.*}} {eltype = #nvvm.mma_type<f16>, frag = #nvvm.mma_frag<a>, k = 16 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32} : (!llvm.ptr<f16>) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>		// CHECK: %[[A:.+]] = nvvm.wmma.load %{{.*}}, %{{.*}} {eltype = #nvvm.mma_type<f16>, frag = #nvvm.mma_frag<a>, k = 16 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32} : (!llvm.ptr<f16>) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
// CHECK: %[[B:.+]] = nvvm.wmma.load %{{.*}}, %{{.*}} {eltype = #nvvm.mma_type<f16>, frag = #nvvm.mma_frag<b>, k = 16 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32} : (!llvm.ptr<f16>) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>		// CHECK: %[[B:.+]] = nvvm.wmma.load %{{.*}}, %{{.*}} {eltype = #nvvm.mma_type<f16>, frag = #nvvm.mma_frag<b>, k = 16 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32} : (!llvm.ptr<f16>) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
▲ Show 20 Lines • Show All 139 Lines • Show Last 20 Lines

mlir/test/Conversion/VectorToGPU/vector-to-mma-ops.mlir

Show First 20 Lines • Show All 219 Lines • ▼ Show 20 Lines	func.func @matmul_transposed_broadcasted_2d(%arg0: memref<32x32xf16>, %arg1: memref<32x32xf16>, %arg2: memref<16x16xf16>) {
%cst = arith.constant 0.000000e+00 : f16		%cst = arith.constant 0.000000e+00 : f16
%A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true], permutation_map = affine_map<(d0, d1) -> (d1, 0)>} : memref<32x32xf16>, vector<16x16xf16>		%A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true], permutation_map = affine_map<(d0, d1) -> (d1, 0)>} : memref<32x32xf16>, vector<16x16xf16>
%B = vector.transfer_read %arg1[%c0, %c0], %cst {in_bounds = [true, true], permutation_map = affine_map<(d0, d1) -> (d1, 0)>} : memref<32x32xf16>, vector<16x16xf16>		%B = vector.transfer_read %arg1[%c0, %c0], %cst {in_bounds = [true, true], permutation_map = affine_map<(d0, d1) -> (d1, 0)>} : memref<32x32xf16>, vector<16x16xf16>
%C = vector.transfer_read %arg2[%c0, %c0], %cst {in_bounds = [true, true]} : memref<16x16xf16>, vector<16x16xf16>		%C = vector.transfer_read %arg2[%c0, %c0], %cst {in_bounds = [true, true]} : memref<16x16xf16>, vector<16x16xf16>
%D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<16x16xf16>, vector<16x16xf16> into vector<16x16xf16>		%D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<16x16xf16>, vector<16x16xf16> into vector<16x16xf16>
vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x16xf16>, memref<16x16xf16>		vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x16xf16>, memref<16x16xf16>
return		return
}		}

		// Do not convert to subgroup_mma ops with integer types if signedness cannot be inferred.
		// CHECK-LABEL: func @matmul_no_extend_int8
		// CHECK-DAG: %[[A:.+]] = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [true, true]} : memref<16x16xi8>, vector<16x16xi8>
		// CHECK-DAG: %[[B:.+]] = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [true, true]} : memref<16x16xi8>, vector<16x16xi8>
		// CHECK-DAG: %[[C:.+]] = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [true, true]} : memref<16x16xi32>, vector<16x16xi32>
		// CHECK: %[[D:.+]] = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %[[A]], %[[B]], %[[C]] : vector<16x16xi8>, vector<16x16xi8> into vector<16x16xi32>
		// CHECK: vector.transfer_write %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] {in_bounds = [true, true]} : vector<16x16xi32>, memref<16x16xi32>
		func.func @matmul_no_extend_int8(%arg0: memref<16x16xi8>, %arg1: memref<16x16xi8>, %arg2: memref<16x16xi32>) {
		%cst_0 = arith.constant dense<0> : vector<16x16xi8>
		%c0 = arith.constant 0 : index
		%cst_i8 = arith.constant 0 : i8
		%cst_i32 = arith.constant 0 : i32
		%A = vector.transfer_read %arg0[%c0, %c0], %cst_i8 {in_bounds = [true, true]} : memref<16x16xi8>, vector<16x16xi8>
		%B = vector.transfer_read %arg1[%c0, %c0], %cst_i8 {permutation_map = #map0, in_bounds = [true, true]} : memref<16x16xi8>, vector<16x16xi8>
		%C = vector.transfer_read %arg2[%c0, %c0], %cst_i32 {in_bounds = [true, true]} : memref<16x16xi32>, vector<16x16xi32>
		%D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<16x16xi8>, vector<16x16xi8> into vector<16x16xi32>
		vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x16xi32>, memref<16x16xi32>
		return
		}

		// CHECK-LABEL: func @matmul_int8
		// CHECK-DAG: %[[A:.+]] = gpu.subgroup_mma_load_matrix %{{.*}}[%{{.*}}, %{{.*}}] {leadDimension = 16 : index} : memref<16x16xi8> -> !gpu.mma_matrix<16x16xsi8, "AOp">
		// CHECK-DAG: %[[B:.+]] = gpu.subgroup_mma_load_matrix %{{.*}}[%{{.*}}, %{{.*}}] {leadDimension = 16 : index} : memref<16x16xi8> -> !gpu.mma_matrix<16x16xsi8, "BOp">
		// CHECK-DAG: %[[C:.+]] = gpu.subgroup_mma_load_matrix %{{.*}}[%{{.*}}, %{{.*}}] {leadDimension = 16 : index} : memref<16x16xi32> -> !gpu.mma_matrix<16x16xi32, "COp">
		// CHECK: %[[D:.+]] = gpu.subgroup_mma_compute %[[A]], %[[B]], %[[C]] : !gpu.mma_matrix<16x16xsi8, "AOp">, !gpu.mma_matrix<16x16xsi8, "BOp"> -> !gpu.mma_matrix<16x16xi32, "COp">
		// CHECK: gpu.subgroup_mma_store_matrix %[[D]], %{{.*}}[%{{.*}}, %{{.*}}] {leadDimension = 16 : index} : !gpu.mma_matrix<16x16xi32, "COp">, memref<16x16xi32>
		func.func @matmul_int8(%arg0: memref<16x16xi8>, %arg1: memref<16x16xi8>, %arg2: memref<16x16xi32>) {
		%cst_0 = arith.constant dense<0> : vector<16x16xi8>
		%c0 = arith.constant 0 : index
		%cst_i8 = arith.constant 0 : i8
		%cst_i32 = arith.constant 0 : i32
		%Ar = vector.transfer_read %arg0[%c0, %c0], %cst_i8 {in_bounds = [true, true]} : memref<16x16xi8>, vector<16x16xi8>
		%Br = vector.transfer_read %arg1[%c0, %c0], %cst_i8 {permutation_map = #map0, in_bounds = [true, true]} : memref<16x16xi8>, vector<16x16xi8>
		%C = vector.transfer_read %arg2[%c0, %c0], %cst_i32 {in_bounds = [true, true]} : memref<16x16xi32>, vector<16x16xi32>
		%Ae = arith.extsi %Ar : vector<16x16xi8> to vector<16x16xi32>
		%Be = arith.extsi %Br : vector<16x16xi8> to vector<16x16xi32>
		%D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %Ae, %Be, %C : vector<16x16xi32>, vector<16x16xi32> into vector<16x16xi32>
		vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x16xi32>, memref<16x16xi32>
		return
		}

mlir/test/Dialect/GPU/invalid.mlir

Show First 20 Lines • Show All 479 Lines • ▼ Show 20 Lines	func.func @mmamatrix_operand_type(){
return		return
}		}

// -----		// -----

func.func @mmamatrix_invalid_element_type(){		func.func @mmamatrix_invalid_element_type(){
%wg = memref.alloca() {alignment = 32} : memref<32x32xf16, 3>		%wg = memref.alloca() {alignment = 32} : memref<32x32xf16, 3>
%i = arith.constant 16 : index		%i = arith.constant 16 : index
// expected-error @+1 {{MMAMatrixType elements must be F16 or F32}}		// expected-error @+1 {{MMAMatrixType elements must be SI8, UI8, I32, F16, or F32}}
%0 = gpu.subgroup_mma_load_matrix %wg[%i, %i] {leadDimension = 32 : index} : memref<32x32xf16, 3> -> !gpu.mma_matrix<16x16xi32, "AOp">		%0 = gpu.subgroup_mma_load_matrix %wg[%i, %i] {leadDimension = 32 : index} : memref<32x32xf16, 3> -> !gpu.mma_matrix<16x16xbf16, "AOp">
return		return
}		}

// -----		// -----

#layout_map_col_major = affine_map<(i, j) -> (j, i)>		#layout_map_col_major = affine_map<(i, j) -> (j, i)>

func.func @mmaLoadOp_identity_layout(){		func.func @mmaLoadOp_identity_layout(){
%wg = memref.alloca() {alignment = 32} : memref<32x32xf16, #layout_map_col_major, 3>		%wg = memref.alloca() {alignment = 32} : memref<32x32xf16, #layout_map_col_major, 3>
%i = arith.constant 16 : index		%i = arith.constant 16 : index
// expected-error @+1 {{expected source memref most minor dim must have unit stride}}		// expected-error @+1 {{expected source memref most minor dim must have unit stride}}
%0 = gpu.subgroup_mma_load_matrix %wg[%i, %i] {leadDimension = 32 : index} : memref<32x32xf16, #layout_map_col_major, 3> -> !gpu.mma_matrix<16x16xf16, "AOp">		%0 = gpu.subgroup_mma_load_matrix %wg[%i, %i] {leadDimension = 32 : index} : memref<32x32xf16, #layout_map_col_major, 3> -> !gpu.mma_matrix<16x16xf16, "AOp">
return		return
}		}

// -----		// -----

func.func @mma_invalid_memref_type(%src: memref<32x4xvector<4x8xf32>>, %i: index) {		func.func @mma_invalid_memref_type(%src: memref<32x4xvector<4x8xf32>>, %i: index) {
// expected-error @+1 {{operand #0 must be memref of 16-bit float or 32-bit float or vector of 16-bit float or 32-bit float values of ranks 1 values}}		// expected-error @+1 {{operand #0 must be memref of 8-bit signless integer or 32-bit signless integer or 16-bit float or 32-bit float or vector of 8-bit signless integer or 32-bit signless integer or 16-bit float or 32-bit float values of ranks 1 values}}
%0 = gpu.subgroup_mma_load_matrix %src[%i, %i] {leadDimension = 4 : index} : memref<32x4xvector<4x8xf32>> -> !gpu.mma_matrix<16x16xf16, "AOp">		%0 = gpu.subgroup_mma_load_matrix %src[%i, %i] {leadDimension = 4 : index} : memref<32x4xvector<4x8xf32>> -> !gpu.mma_matrix<16x16xf16, "AOp">
return		return
}		}

// -----		// -----

#layout_map_col_major = affine_map<(i, j) -> (j, i)>		#layout_map_col_major = affine_map<(i, j) -> (j, i)>

▲ Show 20 Lines • Show All 96 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][gpu] Add support for integer types in gpu.subgroup_mma ops
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 495650

mlir/include/mlir/Dialect/GPU/IR/GPUBase.td

mlir/include/mlir/Dialect/GPU/IR/GPUOps.td

mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h

mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td

mlir/lib/Conversion/GPUToNVVM/WmmaOpsToNvvm.cpp

mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp

mlir/lib/Dialect/GPU/IR/GPUDialect.cpp

mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp

mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir

mlir/test/Conversion/VectorToGPU/vector-to-mma-ops.mlir

mlir/test/Dialect/GPU/invalid.mlir

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][gpu] Add support for integer types in gpu.subgroup_mma opsClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 495650

mlir/include/mlir/Dialect/GPU/IR/GPUBase.td

mlir/include/mlir/Dialect/GPU/IR/GPUOps.td

mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h

mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td

mlir/lib/Conversion/GPUToNVVM/WmmaOpsToNvvm.cpp

mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp

mlir/lib/Dialect/GPU/IR/GPUDialect.cpp

mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp

mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir

mlir/test/Conversion/VectorToGPU/vector-to-mma-ops.mlir

mlir/test/Dialect/GPU/invalid.mlir

[mlir][gpu] Add support for integer types in gpu.subgroup_mma ops
ClosedPublic