This change removes the requirement that the row stride be statically known when
converting vector.transfer_read and vector.transfer_write to distributed
SIMT operations in the nvgpu lowering path. It also adds a check to verify
that the last dimension of the source memref is statically known to have stride
1, since the conversion logic assumes this. No other changes should be required,
because the generated vector.load operations only ever span the last
dimension. The precondition checks for vector.transfer_read/write are moved
into the nvgpu utilities.
The change is NFC with respect to the GPU dialect lowering path.
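The relaxed precondition can be illustrated with a minimal Python sketch. This is a hypothetical model, not the actual MLIR C++ implementation; `None` stands in for a dynamic stride, and the helper name is invented for illustration:

```python
# Hypothetical sketch of the relaxed stride precondition; not the
# actual MLIR C++ code. A stride of None models a dynamic stride.

def transfer_preconditions_hold(strides):
    """Return True if a memref with these strides may be lowered.

    Only the innermost (last) stride must be statically known to be 1;
    outer (row) strides may now be dynamic, because the generated
    vector.load ops only ever span the last dimension.
    """
    if not strides:
        return False
    # The last stride must be static (not None) and exactly 1.
    return strides[-1] == 1

# Dynamic row stride is now accepted.
assert transfer_preconditions_hold([None, 1])
# A dynamic or non-unit innermost stride is still rejected.
assert not transfer_preconditions_hold([None, None])
assert not transfer_preconditions_hold([8, 2])
```

Before this change, the sketch would also have required every outer stride to be static; only the innermost-stride requirement remains.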