Diff 410981

mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h

	Show First 20 Lines • Show All 53 Lines • ▼ Show 20 Lines
	/// were decomposed previously.			/// were decomposed previously.
	void populateConvolutionVectorizationPatterns(RewritePatternSet &patterns,			void populateConvolutionVectorizationPatterns(RewritePatternSet &patterns,
	PatternBenefit benefit = 1);			PatternBenefit benefit = 1);

	/// Populate patterns that convert `ElementwiseMappable` ops to linalg			/// Populate patterns that convert `ElementwiseMappable` ops to linalg
	/// parallel loops.			/// parallel loops.
	void populateElementwiseToLinalgConversionPatterns(RewritePatternSet &patterns);			void populateElementwiseToLinalgConversionPatterns(RewritePatternSet &patterns);

				/// Populate patterns that are only useful in the context of sparse tensors.
				void populateSparseTensorRewriting(RewritePatternSet &patterns);

	/// Function type which is used to control when to stop fusion. It is expected			/// Function type which is used to control when to stop fusion. It is expected
	/// that OpOperand is not modified in the callback. The OpOperand is not marked			/// that OpOperand is not modified in the callback. The OpOperand is not marked
	/// as const to allow callers to use non-const methods.			/// as const to allow callers to use non-const methods.
	using ControlElementwiseOpsFusionFn =			using ControlElementwiseOpsFusionFn =
	std::function<bool(const OpResult &producer, OpOperand &consumer)>;			std::function<bool(const OpResult &producer, OpOperand &consumer)>;

	/// Patterns to fold an expanding (collapsing) tensor_reshape operation with its			/// Patterns to fold an expanding (collapsing) tensor_reshape operation with its
	/// producer (consumer) generic operation by expanding the dimensionality of the			/// producer (consumer) generic operation by expanding the dimensionality of the
	▲ Show 20 Lines • Show All 1,408 Lines • Show Last 20 Lines

mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt

Show All 14 Lines	add_mlir_dialect_library(MLIRLinalgTransforms
HoistPadding.cpp		HoistPadding.cpp
InlineScalarOperands.cpp		InlineScalarOperands.cpp
Interchange.cpp		Interchange.cpp
Loops.cpp		Loops.cpp
LinalgStrategyPasses.cpp		LinalgStrategyPasses.cpp
NamedOpConversions.cpp		NamedOpConversions.cpp
PadOpInterchange.cpp		PadOpInterchange.cpp
Promotion.cpp		Promotion.cpp
		SparseTensorRewriting.cpp
Tiling.cpp		Tiling.cpp
Transforms.cpp		Transforms.cpp
Vectorization.cpp		Vectorization.cpp

ADDITIONAL_HEADER_DIRS		ADDITIONAL_HEADER_DIRS
${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Linalg		${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Linalg

DEPENDS		DEPENDS
Show All 37 Lines

mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp

Show First 20 Lines • Show All 43 Lines • ▼ Show 20 Lines	static AffineMap getIndexingMapOfProducerOperandsInCoordinatesOfFusedOp(
// the indexing map to be computed is a map from consumer loop -> producer		// the indexing map to be computed is a map from consumer loop -> producer
// arg tensor index.		// arg tensor index.
// producerResultIndexMap is a map from producer loop -> tensor index.		// producerResultIndexMap is a map from producer loop -> tensor index.
// Compute the inverse to get map from tensor index -> producer loop.		// Compute the inverse to get map from tensor index -> producer loop.
// The inverse is a map from producer result tensor index -> producer loop.		// The inverse is a map from producer result tensor index -> producer loop.
AffineMap invProducerResultIndexMap =		AffineMap invProducerResultIndexMap =
inversePermutation(producerResultIndexMap);		inversePermutation(producerResultIndexMap);
assert(invProducerResultIndexMap &&		assert(invProducerResultIndexMap &&
"expected producer result indexig map to be invertible");		"expected producer result indexing map to be invertible");

LinalgOp producer = cast<LinalgOp>(producerOpOperand->getOwner());		LinalgOp producer = cast<LinalgOp>(producerOpOperand->getOwner());
// argMap is a map from producer loop -> producer arg tensor index.		// argMap is a map from producer loop -> producer arg tensor index.
AffineMap argMap = producer.getTiedIndexingMap(producerOpOperand);		AffineMap argMap = producer.getTiedIndexingMap(producerOpOperand);

// Compose argMap with invProducerResultIndexMap to get a map from		// Compose argMap with invProducerResultIndexMap to get a map from
// producer result tensor index -> producer arg tensor index.		// producer result tensor index -> producer arg tensor index.
AffineMap t1 = argMap.compose(invProducerResultIndexMap);		AffineMap t1 = argMap.compose(invProducerResultIndexMap);
▲ Show 20 Lines • Show All 2,185 Lines • ▼ Show 20 Lines	void mlir::linalg::populateFoldReshapeOpsByExpansionPatterns(
const ControlElementwiseOpsFusionFn &controlFoldingReshapes) {		const ControlElementwiseOpsFusionFn &controlFoldingReshapes) {
patterns.add<FoldReshapeWithGenericOpByExpansion>(patterns.getContext(),		patterns.add<FoldReshapeWithGenericOpByExpansion>(patterns.getContext(),
controlFoldingReshapes);		controlFoldingReshapes);
patterns.add<FoldWithProducerReshapeOpByExpansion>(patterns.getContext(),		patterns.add<FoldWithProducerReshapeOpByExpansion>(patterns.getContext(),
controlFoldingReshapes);		controlFoldingReshapes);
}		}

void mlir::linalg::populateFoldReshapeOpsByCollapsingPatterns(		void mlir::linalg::populateFoldReshapeOpsByCollapsingPatterns(
RewritePatternSet &patterns,		RewritePatternSet &patterns,
		mravishankarUnsubmitted Done Reply Inline Actions This should always be a yield op. So you can just use `cast<YieldOp>` mravishankar: This should always be a yield op. So you can just use `cast<YieldOp>`
const ControlElementwiseOpsFusionFn &controlFoldingReshapes) {		const ControlElementwiseOpsFusionFn &controlFoldingReshapes) {
patterns.add<FoldWithProducerReshapeOpByCollapsing>(patterns.getContext(),		patterns.add<FoldWithProducerReshapeOpByCollapsing>(patterns.getContext(),
controlFoldingReshapes);		controlFoldingReshapes);
}		}

void mlir::linalg::populateElementwiseOpsFusionPatterns(		void mlir::linalg::populateElementwiseOpsFusionPatterns(
RewritePatternSet &patterns, LinalgElementwiseFusionOptions options) {		RewritePatternSet &patterns, LinalgElementwiseFusionOptions options) {
auto *context = patterns.getContext();		auto *context = patterns.getContext();
patterns.add<FuseElementwiseOps, FoldScalarOrSplatConstant,		patterns.add<FuseElementwiseOps, FoldScalarOrSplatConstant,
FoldConstantTranspose>(context,		FoldConstantTranspose>(context,
options.controlElementwiseOpsFusionFn);		options.controlElementwiseOpsFusionFn);
patterns.add<RemoveOutsDependency>(context);		patterns.add<RemoveOutsDependency>(context);
		populateSparseTensorRewriting(patterns);
populateFoldReshapeOpsByExpansionPatterns(patterns,		populateFoldReshapeOpsByExpansionPatterns(patterns,
options.controlFoldingReshapesFn);		options.controlFoldingReshapesFn);
AffineApplyOp::getCanonicalizationPatterns(patterns, context);		AffineApplyOp::getCanonicalizationPatterns(patterns, context);
GenericOp::getCanonicalizationPatterns(patterns, context);		GenericOp::getCanonicalizationPatterns(patterns, context);
tensor::ExpandShapeOp::getCanonicalizationPatterns(patterns, context);		tensor::ExpandShapeOp::getCanonicalizationPatterns(patterns, context);
tensor::CollapseShapeOp::getCanonicalizationPatterns(patterns, context);		tensor::CollapseShapeOp::getCanonicalizationPatterns(patterns, context);
context->getLoadedDialect<LinalgDialect>()->getCanonicalizationPatterns(		context->getLoadedDialect<LinalgDialect>()->getCanonicalizationPatterns(
patterns);		patterns);
▲ Show 20 Lines • Show All 73 Lines • Show Last 20 Lines

mlir/lib/Dialect/Linalg/Transforms/SparseTensorRewriting.cpp

This file was added.

//===- SparseTensorRewriting.cpp - Sparse tensor rewriting rules ----------===//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//===----------------------------------------------------------------------===//

// This file implements linalg dialect rewriting specific to sparse tensors.

// Sparsity should be mostly transparent to the linalg dialect optimizations

// (i.e., the dense and sparse take the same path). However, in some cases,

// optimizations only make sense in the context of sparse tensors. This file

// implements such sparsity specific rewriting rules.

//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Linalg/IR/Linalg.h"

#include "mlir/Dialect/Linalg/Transforms/Transforms.h"

#include "mlir/Dialect/SparseTensor/IR/SparseTensor.h"

#include "mlir/IR/AffineMap.h"

#include "mlir/IR/Matchers.h"

#include "mlir/Support/LLVM.h"

using namespace mlir;

using namespace mlir::linalg;

using namespace mlir::sparse_tensor;

//===---------------------------------------------------------------------===//

// Helper methods for the actual rewriting rules.

//===---------------------------------------------------------------------===//

// Returns true when the type of the operand's value carries a sparse tensor
// encoding in which at least one dimension level type is `Compressed`.
// Operands without an encoding, or with no compressed dimension, yield false.
static bool isSparseTensor(OpOperand *op) {
  auto encoding = getSparseTensorEncoding(op->get().getType());
  if (!encoding)
    return false;
  for (auto levelType : encoding.getDimLevelType())
    if (levelType == SparseTensorEncodingAttr::DimLevelType::Compressed)
      return true; // at least one compressed
  return false;
}

// Helper method to find zero or empty initialization: the operand's value
// either matches `m_Zero()` or `m_AnyZeroFloat()`, or is defined by an
// `InitTensorOp` or an `InitOp`. The checks short-circuit in that order.
static bool isEmptyInit(OpOperand *op) {
  Value init = op->get();
  return matchPattern(init, m_Zero()) ||
         matchPattern(init, m_AnyZeroFloat()) ||
         init.getDefiningOp<InitTensorOp>() || init.getDefiningOp<InitOp>();
}

wrengrUnsubmitted

Not Done

Value val = op->get();

- if (matchPattern(val, m_Zero()))

- return true;

- if (matchPattern(val, m_AnyZeroFloat()))

- return true;

- if (val.getDefiningOp<InitTensorOp>())

- return true;

- if (val.getDefiningOp<InitOp>())

- return true;

- return false;

- }

+ return matchPattern(val, m_Zero())

+ || matchPattern(val, m_AnyZeroFloat())

+ || val.getDefiningOp<InitTensorOp>()

+ || val.getDefiningOp<InitOp>();}

// Helper to detect sampling operation.

Why not simplify this to the suggested edit?

wrengr: Why not simplify this to the suggested edit?

// Helper to detect sampling operation.
//
// Returns true when the value yielded by the body of `op` is produced by an
// `arith::MulFOp` or `arith::MulIOp` whose two operands are exactly the first
// two block arguments of the body, in either order. Returns false when the
// yielded value has no defining op (e.g. it is a block argument) or is
// produced by any other operation.
static bool isSampling(GenericOp op) {
  auto yieldOp = cast<linalg::YieldOp>(op.region().front().getTerminator());
  Operation *def = yieldOp.getOperand(0).getDefiningOp();
  if (!def || !isa<arith::MulFOp, arith::MulIOp>(def))
    return false;
  // Both scalar input arguments used exactly once.
  Value s1 = op.getBlock()->getArgument(0);
  Value s2 = op.getBlock()->getArgument(1);
  return (def->getOperand(0) == s1 && def->getOperand(1) == s2) ||
         (def->getOperand(1) == s1 && def->getOperand(0) == s2);
}

// Helper to detect chain of multiplications that do not involve x.

// Returns true when `val` is a block argument other than `x`, or the result

// of an `arith::MulFOp`/`arith::MulIOp` whose two operands both satisfy the

// same condition recursively. Any other value yields false.

static bool isMulChain(Value val, Value x) {

// A block argument ends the recursion: accepted unless it is x itself.

if (auto arg = val.dyn_cast<BlockArgument>())

return arg != x;

// Otherwise only a multiplication can continue the chain.

if (auto def = val.getDefiningOp()) {

if (isa<arith::MulFOp>(def) || isa<arith::MulIOp>(def))

return isMulChain(def->getOperand(0), x) &&

mravishankarUnsubmitted

Done

Its not required for this patch, but would be really cool to abstract out the distribution pattern here. Effectively make the operations involved here, i.e. the multiply and the add ops, (template?) parameters of the pattern. Then this generalizes to any set of ops where distributive property holds. Thats just a wish though, I dont expect that to be implemented :P .

mravishankar: Its not required for this patch, but would be really cool to abstract out the distribution…

aartbikAuthorUnsubmitted

Done

I agree, but when I was trying to reuse code from the elt-wise fusion module, none of them did exactly what I needed ;-) So I am afraid I need to see one or two more examples before getting a good sense on what could be factored out here.

aartbik: I agree, but when I was trying to reuse code from the elt-wise fusion module, none of them did…

isMulChain(def->getOperand(1), x);

}

// Values defined by any other operation are not part of a chain.

return false;

}

// Helper to detect x = x + <multiplications>.
//
// `x` is the last block argument of the body of `op`. Returns true when the
// yielded value is produced by an `arith::AddFOp` or `arith::AddIOp` that has
// `x` as one operand and, as the other operand, the result of an operation
// that forms a multiplication chain not involving `x` (see isMulChain).
static bool isSumOfMul(GenericOp op) {
  auto yieldOp = cast<linalg::YieldOp>(op.region().front().getTerminator());
  Operation *def = yieldOp.getOperand(0).getDefiningOp();
  if (!def || !isa<arith::AddFOp, arith::AddIOp>(def))
    return false;
  Value x = op.getBlock()->getArguments().back();
  // Identify which operand is the accumulator; the remaining one is the
  // candidate product.
  Value lhs = def->getOperand(0);
  Value rhs = def->getOperand(1);
  Value addend;
  if (lhs == x)
    addend = rhs;
  else if (rhs == x)
    addend = lhs;
  else
    return false;
  // isMulChain also accepts a bare block argument. Reject that case here:
  // the rewriting rule takes the last cloned producer op as the product, so
  // a body that contains nothing but the accumulation leaves it with no op
  // to use.
  return addend.getDefiningOp() && isMulChain(addend, x);
}

//===---------------------------------------------------------------------===//

// The actual sparse tensor rewriting rules.

//===---------------------------------------------------------------------===//

namespace {

/// Rewriting rule that converts two kernels:

///

/// T(i,j) = SUM(k, A(i,j,k) * B(i,j,k) * ... )

/// X(i,j) = S(i,j) * T(i,j)

///

/// into a single kernel, using distributive law:

///

/// X(i,j) = SUM(k, S(i,j) * A(i,j,k) * B(i,j,k) * ... )

///

/// This kind of fusion (merging two ops into one but using arithmetic

/// equalities that may not hold for floating-point computations) would

/// be undesirable in the dense case, since we distribute the multiplication

/// into the reduction loop. However, for sparse sampling tensor S, such

/// a fusion may actually reduce the asymptotic complexity of the kernel,

/// since intermediate results may be nullified.

mravishankarUnsubmitted

Not Done

I actually don't see this as a fusion pattern. This is changing the order of operations, and satisfies equality only for integers. For floats this would be correct within a tolerance. So this is an arithmetic transformation. I tend to think of fusion as cases that are not changing the order of operations, but rather taking operations from different iteration spaces and giving them a different schedule while still maintaining the same dependences (the length of the dependence vector might change, but all dependences are preserved).
I'd suggest renaming this as DistributeMultiplyOverAddPattern (or something more succinct if you can come up with it)

mravishankar: I actually dont see this as a fusion pattern. This is changing order of operations, and…

aartbikAuthorUnsubmitted

Done

Interesting. Is this nomenclature commonplace? To me, we have two kernels before and one kernel after, so we "fused" the kernels into one ;-)

But I am okay with a name that reflects the arithmetic reordering better. See if you like the new name.

aartbik: Interesting. It this nomenclature common place? To me, we have two kernels before and one…

// Rewrite pattern on linalg GenericOp: when `op` is a sampling consumer and

// one of its inputs is produced by a sum-of-multiplications GenericOp, both

// are replaced by a single new GenericOp built from the two bodies.

struct FuseSparseMultiplyOverAdd : public OpRewritePattern<GenericOp> {

using OpRewritePattern<GenericOp>::OpRewritePattern;

LogicalResult matchAndRewrite(GenericOp op,

PatternRewriter &rewriter) const override {

// Check consumer.

if (!op.hasTensorSemantics() || op.getNumInputs() != 2 ||

op.getNumResults() != 1)

return failure();

if (op.getNumParallelLoops() != op.getNumLoops())

return failure();

if (!op.getTiedIndexingMap(op.getOutputOperand(0)).isIdentity() ||

!op.getTiedIndexingMap(op.getInputOperand(0)).isIdentity() ||

!op.getTiedIndexingMap(op.getInputOperand(1)).isIdentity())

return failure();

// Find consuming OP2(sparse, other) or OP2(other, sparse). The other

wrengrUnsubmitted

Not Done

Why not combine these ifs via ||?

wrengr: Why not combine these `if`s via `||`?

aartbikAuthorUnsubmitted

Done

I will send out a follow up CL with this, Wren. Thanks!

aartbik: I will send out a follow up CL with this, Wren. Thanks!

// operand can be sparse or dense, since the point of this rewriting rule

mravishankarUnsubmitted

Done

Do we need to actually need them to be identity? This would work as well if they are projected permutations?

mravishankar: Do we need to actually need them to be identity? This would work as well if they are projected…

aartbikAuthorUnsubmitted

Done

In this first instance yes, because I just do "mimic" at L162.
We could, if needed, account for permutations.

aartbik: In this first instance yes, because I just do "mimic" at L162. We could, if needed, account for…

// is detecting a situation in which *more* sparsity is introduced into

// a computation, be it already sparse or still dense.

// `other` indexes the consumer input that is expected to come from the

// producer: 1 when input 0 is sparse, 0 when only input 1 is sparse.

unsigned other = 0;

if (isSparseTensor(op.getInputOperand(0)))

other = 1;

else if (!isSparseTensor(op.getInputOperand(1)))

return failure();

// Check producer.

auto prod = dyn_cast_or_null<GenericOp>(

op.getInputOperand(other)->get().getDefiningOp());

if (!prod || !prod.hasTensorSemantics() || prod.getNumResults() != 1)

return failure();

// The producer result must have no use other than this consumer.

if (!prod.getResult(0).hasOneUse())

return failure();

// Sampling consumer and sum of multiplication chain producer.

wrengrUnsubmitted

Not Done

ditto

wrengr: ditto

if (!isEmptyInit(op.getOutputOperand(0)) ||

!isEmptyInit(prod.getOutputOperand(0)))

return failure();

if (!isSampling(op) || !isSumOfMul(prod))

return failure();

// Modify operand structure of producer and consumer.

mravishankarUnsubmitted

Done

Nit: Change this to

if (!condition) return failure();
<current body>

mravishankar: Nit: Change this to ``` if (!condition) return failure(); <current body> ```

wrengrUnsubmitted

Not Done

ditto

wrengr: ditto

Location loc = prod.getLoc();

SmallVector<Value> inputOps = prod.getInputOperands();

SmallVector<Value> outputOps = op.getOutputOperands();

SmallVector<AffineMap> fusedIndexMaps = prod.getIndexingMaps();

// The consumer input that does not come from the producer becomes an

// additional input of the new op.

inputOps.push_back(op.getInputOperand(1 - other)->get());

// Duplicates the last indexing map of the producer for the added input.

fusedIndexMaps.push_back(fusedIndexMaps.back()); // mimic other

// Fuse producer and consumer into a new generic op.

auto fusedOp = rewriter.create<GenericOp>(

loc, op.getResult(0).getType(), inputOps, outputOps,

rewriter.getAffineMapArrayAttr(fusedIndexMaps), prod.iterator_types(),

/*doc=*/nullptr, /*library_call=*/nullptr);

Block &prodBlock = prod.region().front();

Block &consBlock = op.region().front();

BlockAndValueMapping mapper;

Block *fusedBlock = new Block();

fusedOp.region().push_back(fusedBlock);

// Argument order of the new block: every producer argument except the

// last, then the consumer argument for the added input, then the last

// producer argument.

unsigned num = prodBlock.getNumArguments();

for (unsigned i = 0; i < num - 1; i++)

addArg(mapper, fusedBlock, prodBlock.getArgument(i));

addArg(mapper, fusedBlock, consBlock.getArgument(1 - other));

addArg(mapper, fusedBlock, prodBlock.getArgument(num - 1));

// Clone bodies of the producer and consumer in new evaluation order.

auto acc = prodBlock.getTerminator()->getOperand(0).getDefiningOp();

mravishankarUnsubmitted

Done

Nit: I am not sure how much this indirection is buying in terms of code clarity. Its calling a method that is also a one-liner.

mravishankar: Nit: I am not sure how much this indirection is buying in terms of code clarity. Its calling a…

aartbikAuthorUnsubmitted

Done

Well, if you "inline" it, it actually becomes two/three lines each, and requires braces on the loop. The trick is that assigning to the parameter saves the line for the declaration ;-)

aartbik: Well, if you "inline" it, it actually becomes two/three lines each, and requires braces on the…

auto sampler = consBlock.getTerminator()->getOperand(0).getDefiningOp();

rewriter.setInsertionPointToStart(fusedBlock);

// Clone every producer op except the accumulation `acc`; `last` records

// the original result of the final op cloned. Note that the loop variable

// `op` shadows the consumer `op` of the enclosing scope.

Value last;

for (auto &op : prodBlock.without_terminator())

if (&op != acc) {

last = op.getResult(0);

rewriter.clone(op, mapper);

}

// The consumer argument that stood for the producer result now maps to

// the result of the last op in the new block.

// NOTE(review): assumes the loop above cloned at least one op; otherwise

// the new block is still empty here and `last` is unset -- TODO confirm.

mapper.map(consBlock.getArgument(other), fusedBlock->back().getResult(0));

// Redirect `last` to the cloned sampler so that the accumulation cloned

// next consumes the sampler result in its place.

mapper.map(last, rewriter.clone(*sampler, mapper)->getResult(0));

last = rewriter.clone(*acc, mapper)->getResult(0);

rewriter.create<linalg::YieldOp>(loc, last);

// Replace consumer with fused operation. Old producer

// and consumer ops will be removed by DCE.

rewriter.replaceOp(op, fusedOp->getResults());

return success();

}

private:

// Helper to add argument and record the mapping.

static void addArg(BlockAndValueMapping &mapper, Block *b, BlockArgument a) {

mapper.map(a, b->addArgument(a.getType(), a.getLoc()));

}

};

} // namespace

//===---------------------------------------------------------------------===//

// Methods that add patterns described in this file to a pattern list.

//===---------------------------------------------------------------------===//

void mlir::linalg::populateSparseTensorRewriting(RewritePatternSet &patterns) {
  // Register the sparse-tensor specific rewriting rule of this file, using
  // the context owned by the pattern set.
  patterns.add<FuseSparseMultiplyOverAdd>(patterns.getContext());
}

mravishankarUnsubmitted

Done

If you agree this is just an arithmetic transformation, I'd rename this to populateDistributeMultiplyOverAdditionPattern (or something more succinct - naming is hard :P )

mravishankar: If you agree this is just an arithmetic transformation, I'd rename this to…

aartbikAuthorUnsubmitted

Done

I anticipate this file to grow with more sparse tensor rewriting rules, so I kept the name very vague: "populateSparseTensorRewriting".

aartbik: I anticipate this file to grow with more sparse tensor rewriting rules, so kept it very vague…

mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sampled_mm_fusion.mlir

// RUN: mlir-opt %s --sparse-compiler \| \		// RUN: mlir-opt %s --sparse-compiler \| \
// RUN: mlir-cpu-runner -e entry -entry-point-result=void \		// RUN: mlir-cpu-runner -e entry -entry-point-result=void \
// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext \| \		// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext \| \
// RUN: FileCheck %s		// RUN: FileCheck %s
//		//
// Do the same run, but now with SIMDization as well. This should not change the outcome.		// Do the same run, but now with SIMDization as well. This should not change the outcome.
//		//
// RUN: mlir-opt %s -sparse-compiler="vectorization-strategy=2 vl=8" \| \		// RUN: mlir-opt %s --sparse-compiler="vectorization-strategy=2 vl=8" \| \
// RUN: mlir-cpu-runner -e entry -entry-point-result=void \		// RUN: mlir-cpu-runner -e entry -entry-point-result=void \
// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext \| \		// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext \| \
// RUN: FileCheck %s		// RUN: FileCheck %s

#SM = #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>		#SM = #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>

#trait_sampled_dense_dense = {		#trait_sampled_dense_dense = {
indexing_maps = [		indexing_maps = [
Show All 24 Lines	#trait_scale = {
iterator_types = ["parallel", "parallel"]		iterator_types = ["parallel", "parallel"]
}		}

//		//
// Integration test for sampled dense dense matmul fusion.		// Integration test for sampled dense dense matmul fusion.
//		//
module {		module {
//		//
// A kernel that computes a direct sampled matrix matrix multiplication.		// A kernel that computes a direct sampled matrix matrix multiplication
		// (with dense result).
//		//
func @sampled_dd(%args: tensor<8x8xf64, #SM>,		func @sampled_dd(%args: tensor<8x8xf64, #SM>,
%arga: tensor<8x8xf64>,		%arga: tensor<8x8xf64>,
%argb: tensor<8x8xf64>) -> tensor<8x8xf64> {		%argb: tensor<8x8xf64>) -> tensor<8x8xf64> {
%1 = arith.constant dense<0.0> : tensor<8x8xf64>		%1 = arith.constant dense<0.0> : tensor<8x8xf64>
%2 = linalg.generic #trait_sampled_dense_dense		%2 = linalg.generic #trait_sampled_dense_dense
ins(%args, %arga, %argb: tensor<8x8xf64, #SM>,		ins(%args, %arga, %argb: tensor<8x8xf64, #SM>,
tensor<8x8xf64>, tensor<8x8xf64>)		tensor<8x8xf64>, tensor<8x8xf64>)
outs(%1: tensor<8x8xf64>) {		outs(%1: tensor<8x8xf64>) {
^bb(%s: f64, %a: f64, %b: f64, %x: f64):		^bb(%s: f64, %a: f64, %b: f64, %x: f64):
%p = arith.mulf %a, %b : f64		%p = arith.mulf %a, %b : f64
%q = arith.mulf %s, %p : f64		%q = arith.mulf %s, %p : f64
%r = arith.addf %x, %q : f64		%r = arith.addf %x, %q : f64
linalg.yield %r : f64		linalg.yield %r : f64
} -> tensor<8x8xf64>		} -> tensor<8x8xf64>
return %2 : tensor<8x8xf64>		return %2 : tensor<8x8xf64>
}		}

//		//
// A kernel that computes an unfused sampled matrix matrix multiplication.		// A kernel that computes an unfused sampled matrix matrix multiplication
		// (with dense result).
//		//
func @sampled_dd_unfused(%args: tensor<8x8xf64, #SM>,		func @sampled_dd_unfused(%args: tensor<8x8xf64, #SM>,
%arga: tensor<8x8xf64>,		%arga: tensor<8x8xf64>,
%argb: tensor<8x8xf64>) -> (tensor<8x8xf64>, tensor<8x8xf64>) {		%argb: tensor<8x8xf64>) -> tensor<8x8xf64> {
		// Perform dense-dense matrix matrix multiplication.
%1 = arith.constant dense<0.0> : tensor<8x8xf64>		%1 = arith.constant dense<0.0> : tensor<8x8xf64>
%2 = linalg.generic #trait_matmul		%2 = linalg.generic #trait_matmul
ins(%arga, %argb : tensor<8x8xf64>, tensor<8x8xf64>)		ins(%arga, %argb : tensor<8x8xf64>, tensor<8x8xf64>)
outs(%1 : tensor<8x8xf64>) {		outs(%1 : tensor<8x8xf64>) {
^bb0(%a: f64, %b: f64, %x: f64):		^bb0(%a: f64, %b: f64, %x: f64):
%p = arith.mulf %a, %b : f64		%p = arith.mulf %a, %b : f64
%q = arith.addf %x, %p : f64		%q = arith.addf %x, %p : f64
linalg.yield %q : f64		linalg.yield %q : f64
} -> tensor<8x8xf64>		} -> tensor<8x8xf64>
		// Sample the result with elements-wise multiplication with sparse matrix.
%3 = arith.constant dense<0.0> : tensor<8x8xf64>		%3 = linalg.generic #trait_scale
%4 = linalg.generic #trait_scale
ins(%2, %args : tensor<8x8xf64>, tensor<8x8xf64, #SM>)		ins(%2, %args : tensor<8x8xf64>, tensor<8x8xf64, #SM>)
outs(%3 : tensor<8x8xf64>) {		outs(%1 : tensor<8x8xf64>) {
^bb0(%t: f64, %s: f64, %x: f64):		^bb0(%t: f64, %s: f64, %x: f64):
%r = arith.mulf %t, %s : f64		%r = arith.mulf %t, %s : f64
linalg.yield %r : f64		linalg.yield %r : f64
} -> tensor<8x8xf64>		} -> tensor<8x8xf64>
		return %3 : tensor<8x8xf64>
		}

		//
		// A kernel that computes a direct sampled matrix matrix multiplication
		// (with sparse result).
		//
		func @sparse_sampled_dd(%args: tensor<8x8xf64, #SM>,
		%arga: tensor<8x8xf64>,
		%argb: tensor<8x8xf64>) -> tensor<8x8xf64, #SM> {
		%c8 = arith.constant 8 : index
		%1 = sparse_tensor.init [%c8, %c8] : tensor<8x8xf64, #SM>
		%2 = linalg.generic #trait_sampled_dense_dense
		ins(%args, %arga, %argb: tensor<8x8xf64, #SM>,
		tensor<8x8xf64>, tensor<8x8xf64>)
		outs(%1: tensor<8x8xf64, #SM>) {
		^bb(%s: f64, %a: f64, %b: f64, %x: f64):
		%p = arith.mulf %a, %b : f64
		%q = arith.mulf %s, %p : f64
		%r = arith.addf %x, %q : f64
		linalg.yield %r : f64
		} -> tensor<8x8xf64, #SM>
		return %2 : tensor<8x8xf64, #SM>
		}

return %4, %2 : tensor<8x8xf64>, tensor<8x8xf64>		//
		// A kernel that computes an unfused sampled matrix matrix multiplication
		// (with sparse result).
		//
		func @sparse_sampled_dd_unfused(
		%args: tensor<8x8xf64, #SM>,
		%arga: tensor<8x8xf64>,
		%argb: tensor<8x8xf64>) -> tensor<8x8xf64, #SM> {
		// Perform dense-dense matrix matrix multiplication.
		%1 = arith.constant dense<0.0> : tensor<8x8xf64>
		%2 = linalg.generic #trait_matmul
		ins(%arga, %argb : tensor<8x8xf64>, tensor<8x8xf64>)
		outs(%1 : tensor<8x8xf64>) {
		^bb0(%a: f64, %b: f64, %x: f64):
		%p = arith.mulf %a, %b : f64
		%q = arith.addf %x, %p : f64
		linalg.yield %q : f64
		} -> tensor<8x8xf64>
		// Sample the result with elements-wise multiplication with sparse matrix.
		%c8 = arith.constant 8 : index
		%3 = sparse_tensor.init [%c8, %c8] : tensor<8x8xf64, #SM>
		%4 = linalg.generic #trait_scale
		ins(%2, %args : tensor<8x8xf64>, tensor<8x8xf64, #SM>)
		outs(%3 : tensor<8x8xf64, #SM>) {
		^bb0(%t: f64, %s: f64, %x: f64):
		%r = arith.mulf %t, %s : f64
		linalg.yield %r : f64
		} -> tensor<8x8xf64, #SM>
		return %4 : tensor<8x8xf64, #SM>
}		}

//		//
// Main driver.		// Main driver.
//		//
func @entry() {		func @entry() {
%d0 = arith.constant 0.0 : f64		%d0 = arith.constant 0.0 : f64
%c0 = arith.constant 0 : index		%c0 = arith.constant 0 : index

%t = arith.constant sparse<[[0, 0], [7,7]], [1.0, 2.0]>		%t = arith.constant sparse<[[0, 0], [7,7]], [1.0, 2.0]>
: tensor<8x8xf64>		: tensor<8x8xf64>
%s = sparse_tensor.convert %t		%s = sparse_tensor.convert %t
: tensor<8x8xf64> to tensor<8x8xf64, #SM>		: tensor<8x8xf64> to tensor<8x8xf64, #SM>

%a = arith.constant dense<3.0> : tensor<8x8xf64>		%a = arith.constant dense<3.0> : tensor<8x8xf64>
%b = arith.constant dense<4.0> : tensor<8x8xf64>		%b = arith.constant dense<4.0> : tensor<8x8xf64>

// Call the kernels.		// Call the kernels.
%0 = call @sampled_dd(%s, %a, %b)		%0 = call @sampled_dd(%s, %a, %b)
: (tensor<8x8xf64, #SM>,		: (tensor<8x8xf64, #SM>,
tensor<8x8xf64>, tensor<8x8xf64>) -> tensor<8x8xf64>		tensor<8x8xf64>, tensor<8x8xf64>) -> tensor<8x8xf64>
%1, %2 = call @sampled_dd_unfused(%s, %a, %b)		%1 = call @sampled_dd_unfused(%s, %a, %b)
: (tensor<8x8xf64, #SM>,		: (tensor<8x8xf64, #SM>,
tensor<8x8xf64>, tensor<8x8xf64>) -> (tensor<8x8xf64>, tensor<8x8xf64>)		tensor<8x8xf64>, tensor<8x8xf64>) -> tensor<8x8xf64>
		%2 = call @sparse_sampled_dd(%s, %a, %b)
		: (tensor<8x8xf64, #SM>,
		tensor<8x8xf64>, tensor<8x8xf64>) -> tensor<8x8xf64, #SM>
		%3 = call @sparse_sampled_dd_unfused(%s, %a, %b)
		: (tensor<8x8xf64, #SM>,
		tensor<8x8xf64>, tensor<8x8xf64>) -> tensor<8x8xf64, #SM>

// Verify the outputs.		// Verify the outputs.
//		//
// CHECK: ( ( 96, 0, 0, 0, 0, 0, 0, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0 ),		// CHECK: ( ( 96, 0, 0, 0, 0, 0, 0, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0 ),
// CHECK-SAME: ( 0, 0, 0, 0, 0, 0, 0, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0 ),		// CHECK-SAME: ( 0, 0, 0, 0, 0, 0, 0, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0 ),
// CHECK-SAME: ( 0, 0, 0, 0, 0, 0, 0, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0 ),		// CHECK-SAME: ( 0, 0, 0, 0, 0, 0, 0, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0 ),
// CHECK-SAME: ( 0, 0, 0, 0, 0, 0, 0, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 192 ) )		// CHECK-SAME: ( 0, 0, 0, 0, 0, 0, 0, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 192 ) )
//		//
// CHECK: ( ( 96, 0, 0, 0, 0, 0, 0, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0 ),		// CHECK: ( ( 96, 0, 0, 0, 0, 0, 0, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0 ),
// CHECK-SAME: ( 0, 0, 0, 0, 0, 0, 0, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0 ),		// CHECK-SAME: ( 0, 0, 0, 0, 0, 0, 0, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0 ),
// CHECK-SAME: ( 0, 0, 0, 0, 0, 0, 0, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0 ),		// CHECK-SAME: ( 0, 0, 0, 0, 0, 0, 0, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0 ),
// CHECK-SAME: ( 0, 0, 0, 0, 0, 0, 0, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 192 ) )		// CHECK-SAME: ( 0, 0, 0, 0, 0, 0, 0, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 192 ) )
//		//
		// CHECK-NEXT: ( 96, 192, 0, 0 )
		//
		// CHECK-NEXT: ( 96, 192, 0, 0 )
		//
%m0 = bufferization.to_memref %0 : memref<8x8xf64>		%m0 = bufferization.to_memref %0 : memref<8x8xf64>
%m1 = bufferization.to_memref %1 : memref<8x8xf64>		%m1 = bufferization.to_memref %1 : memref<8x8xf64>
%m2 = bufferization.to_memref %2 : memref<8x8xf64>		%m2 = sparse_tensor.values %2 : tensor<8x8xf64, #SM> to memref<?xf64>
		%m3 = sparse_tensor.values %3 : tensor<8x8xf64, #SM> to memref<?xf64>
%v0 = vector.transfer_read %m0[%c0, %c0], %d0		%v0 = vector.transfer_read %m0[%c0, %c0], %d0
: memref<8x8xf64>, vector<8x8xf64>		: memref<8x8xf64>, vector<8x8xf64>
%v1 = vector.transfer_read %m1[%c0, %c0], %d0		%v1 = vector.transfer_read %m1[%c0, %c0], %d0
: memref<8x8xf64>, vector<8x8xf64>		: memref<8x8xf64>, vector<8x8xf64>
		%v2 = vector.transfer_read %m2[%c0], %d0 : memref<?xf64>, vector<4xf64>
		%v3 = vector.transfer_read %m3[%c0], %d0 : memref<?xf64>, vector<4xf64>
vector.print %v0 : vector<8x8xf64>		vector.print %v0 : vector<8x8xf64>
vector.print %v1 : vector<8x8xf64>		vector.print %v1 : vector<8x8xf64>
		vector.print %v2 : vector<4xf64>
		vector.print %v3 : vector<4xf64>

// Release the resources.		// Release the resources.
sparse_tensor.release %s : tensor<8x8xf64, #SM>		sparse_tensor.release %s : tensor<8x8xf64, #SM>
memref.dealloc %m0 : memref<8x8xf64>		memref.dealloc %m0 : memref<8x8xf64>
memref.dealloc %m1 : memref<8x8xf64>		memref.dealloc %m1 : memref<8x8xf64>
memref.dealloc %m2 : memref<8x8xf64>		sparse_tensor.release %2 : tensor<8x8xf64, #SM>
		sparse_tensor.release %3 : tensor<8x8xf64, #SM>

return		return
}		}
}		}

mlir/test/Integration/Dialect/SparseTensor/taco/test_SDDMM.py

	Show All 27 Lines
	# Define the SDDMM kernel. Since this performs the reduction as			# Define the SDDMM kernel. Since this performs the reduction as
	# sum(k, S[i, j] * A[i, k] * B[k, j])			# sum(k, S[i, j] * A[i, k] * B[k, j])
	# we only compute the intermediate dense matrix product that are actually			# we only compute the intermediate dense matrix product that are actually
	# needed to compute the result, with proper asymptotic complexity.			# needed to compute the result, with proper asymptotic complexity.
	X[i, j] = S[i, j] * A[i, k] * B[k, j]			X[i, j] = S[i, j] * A[i, k] * B[k, j]

	# Alternative way to define SDDMM kernel. Since this performs the reduction as			# Alternative way to define SDDMM kernel. Since this performs the reduction as
	# sum(k, A[i, k] * B[k, j]) * S[i, j]			# sum(k, A[i, k] * B[k, j]) * S[i, j]
	# the MLIR lowering results in two separate tensor index expressions that			# the MLIR lowering results in two separate tensor index expressions that are
	# need to be fused properly to guarantee proper asymptotic complexity.			# fused prior to running the sparse compiler in order to guarantee proper
				# asymptotic complexity.
	Y[i, j] = A[i, k] * B[k, j] * S[i, j]			Y[i, j] = A[i, k] * B[k, j] * S[i, j]

	expected = """; extended FROSTT format			expected = """; extended FROSTT format
	2 1			2 1
	8 8			8 8
	1 8 2016			1 8 2016
	"""			"""

	Show All 12 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][sparse][linalg] add linalg rewriting specific to sparse tensors
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 410981

mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h

mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt

mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp

mlir/lib/Dialect/Linalg/Transforms/SparseTensorRewriting.cpp

mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sampled_mm_fusion.mlir

mlir/test/Integration/Dialect/SparseTensor/taco/test_SDDMM.py

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][sparse][linalg] add linalg rewriting specific to sparse tensorsClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 410981

mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h

mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt

mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp

mlir/lib/Dialect/Linalg/Transforms/SparseTensorRewriting.cpp

mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sampled_mm_fusion.mlir

mlir/test/Integration/Dialect/SparseTensor/taco/test_SDDMM.py

[mlir][sparse][linalg] add linalg rewriting specific to sparse tensors
ClosedPublic