Differential D138770
[mlir][gpu] Adding support for transposed mma_load_matrix
Closed, Public. Authored by qedawkins on Nov 27 2022, 3:50 PM.
Summary

Enables transposed gpu.subgroup_mma_load_matrix and updates the lowerings in Vector to GPU and GPU to SPIRV. This is needed so that matmuls with a transposed B operand can lower to wmma ops.

Taken over from author: stanley-nod <stanley@nod-labs.com>
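For context, a minimal sketch of what a transposed load could look like after this patch. The transpose unit attribute is the new piece; the function name, shapes, and leading dimension below are illustrative, not taken from the patch:

  // Hypothetical example: loading the B operand of a wmma matmul in
  // transposed form. Without the `transpose` attribute the op behaves as
  // before.
  func.func @load_b_transposed(%mem: memref<32x32xf16>, %i: index, %j: index)
      -> !gpu.mma_matrix<16x16xf16, "BOp"> {
    %b = gpu.subgroup_mma_load_matrix %mem[%i, %j]
        {leadDimension = 32 : index, transpose}
        : memref<32x32xf16> -> !gpu.mma_matrix<16x16xf16, "BOp">
    return %b : !gpu.mma_matrix<16x16xf16, "BOp">
  }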
Event Timeline

Reviewer comment: You also need to change the lowering to LLVM; otherwise the transpose attribute is silently ignored, which would cause a miscompile. You don't need to implement the full lowering (I wouldn't, at least not in this patch), but you need to at least fail the lowering pattern.
Author update: Switched to OptionalAttr<UnitAttr> and updated the tests accordingly. Additionally, the check for a transposed vector.transfer_read now directly checks for a 2-D transpose. Nov 28 2022, 11:23 AM
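As an illustration of the kind of input that check is meant to recognize (a sketch, not taken from the patch's tests; the function name and shapes are made up), a vector.transfer_read whose permutation map swaps the two dimensions is a pure 2-D transpose:

  // Hypothetical example: a transfer_read with a (d0, d1) -> (d1, d0)
  // permutation map. With this patch, VectorToGPU can map such a read to a
  // gpu.subgroup_mma_load_matrix carrying the transpose attribute instead of
  // bailing out.
  func.func @transposed_b_read(%mem: memref<32x32xf16>) -> vector<16x16xf16> {
    %c0 = arith.constant 0 : index
    %pad = arith.constant 0.0 : f16
    %b = vector.transfer_read %mem[%c0, %c0], %pad
        {in_bounds = [true, true],
         permutation_map = affine_map<(d0, d1) -> (d1, d0)>}
        : memref<32x32xf16>, vector<16x16xf16>
    return %b : vector<16x16xf16>
  }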
This revision is now accepted and ready to land. Nov 28 2022, 1:36 PM

qedawkins marked an inline comment as done.

Closed by commit rGc0321edc26a7: [mlir][gpu] Adding support for transposed mma_load_matrix (authored by qedawkins, committed by ThomasRaoux). Nov 28 2022, 7:36 PM

This revision was automatically updated to reflect the committed changes.
Revision Contents

Diff 478441

mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
mlir/lib/Conversion/GPUToNVVM/WmmaOpsToNvvm.cpp
mlir/lib/Conversion/GPUToSPIRV/WmmaOpsToSPIRV.cpp
mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
mlir/test/Conversion/VectorToGPU/vector-to-mma-ops.mlir
Inline comment on the diff: Use the OptionalAttr<UnitAttr> type instead; this will make the syntax and the code simpler.