Diff 312639

mlir/lib/Dialect/Linalg/Transforms/Sparsification.cpp

Show First 20 Lines • Show All 46 Lines • ▼ Show 20 Lines
#include "mlir/Dialect/SCF/SCF.h"		#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"		#include "mlir/Dialect/StandardOps/IR/Ops.h"

using namespace mlir;		using namespace mlir;

namespace {		namespace {

enum class Kind { kTensor, kInvariant, kMulF, kMulI, kAddF, kAddI };		enum class Kind { kTensor, kInvariant, kMulF, kMulI, kAddF, kAddI };
		enum class Dim { kSparse, kDense, kUndef };
		penpornk [Done]: It's great that we are switching to enum. In the format extension paper there are 6 types [1]. :) [1] http://tensor-compiler.org/chou-oopsla18-taco-formats.pdf -- page 123:7
		aartbik (author) [Done]: Yes, the initial goal was to make "not-used" a bit more apparent (rather than sweeping them under the dense carpet ;-), but you are right, this also prepares for future extension!

/// Tensor expression. Represents a MLIR expression in tensor index notation.		/// Tensor expression. Represents a MLIR expression in tensor index notation.
/// For tensors, e0 denotes the tensor index. For invariants, the IR value is		/// For tensors, e0 denotes the tensor index. For invariants, the IR value is
/// stored directly. For binary operations, e0 and e1 denote the index of the		/// stored directly. For binary operations, e0 and e1 denote the index of the
/// children tensor expressions.		/// children tensor expressions.
struct TensorExp {		struct TensorExp {
TensorExp(Kind k, unsigned x, unsigned y, Value v)		TensorExp(Kind k, unsigned x, unsigned y, Value v)
: kind(k), e0(x), e1(y), val(v) {		: kind(k), e0(x), e1(y), val(v) {
Show All 13 Lines
/// Lattice point. Each lattice point consists of a conjunction of tensor		/// Lattice point. Each lattice point consists of a conjunction of tensor
/// loop indices (encoded in a bitvector) and the index of the corresponding		/// loop indices (encoded in a bitvector) and the index of the corresponding
/// tensor expression.		/// tensor expression.
struct LatPoint {		struct LatPoint {
LatPoint(unsigned n, unsigned e, unsigned b) : bits(n, false), exp(e) {		LatPoint(unsigned n, unsigned e, unsigned b) : bits(n, false), exp(e) {
bits.set(b);		bits.set(b);
}		}
LatPoint(const llvm::BitVector &b, unsigned e) : bits(b), exp(e) {}		LatPoint(const llvm::BitVector &b, unsigned e) : bits(b), exp(e) {}
/// Conjunction of tensor loop indices as bitvector.		/// Conjunction of tensor loop indices as bitvector. This represents
		/// all indices involved in the tensor expression.
llvm::BitVector bits;		llvm::BitVector bits;
		/// Simplified conjunction of tensor loop indices as bitvector. This
		/// represents a simplified condition under which this tensor expression
		/// must execute. Pre-computed during codegen to avoid repeated eval.
		llvm::BitVector simple;
/// Index of the tensor expression.		/// Index of the tensor expression.
unsigned exp;		unsigned exp;
};		};

/// A class to handle all iteration lattice operations. This class abstracts		/// A class to handle all iteration lattice operations. This class abstracts
/// away from some implementation details of storing iteration lattices and		/// away from some implementation details of storing iteration lattices and
/// tensor expressions. This allows for fine-tuning performance characteristics		/// tensor expressions. This allows for fine-tuning performance characteristics
/// independently from the basic algorithm if bottlenecks are identified.		/// independently from the basic algorithm if bottlenecks are identified.
class Merger {		class Merger {
public:		public:
		/// Constructs a merger for the given number of tensors and loops. The
		/// user supplies the number of tensors involved in the kernel, with the
		penpornk [Done]: Should "last tensor" be "2nd-to-last tensor" or "last non-artificial tensor"? (This sounds verbose, but it technically isn't the last tensor anymore, so we should be clear.)
		aartbik (author) [Done]: Yes, "last" was relative to the given #, but I agree this could be phrased better. Reworded.
		/// last tensor in this set denoting the output tensor. The merger adds an
		penpornk [Done]: Nit: Maybe add "at the end" after "non-existing tensor"?
		/// additional synthetic tensor at the end of this set to represent all
		/// invariant expressions in the kernel.
Merger(unsigned t, unsigned l)		Merger(unsigned t, unsigned l)
: numTensors(t), numLoops(l), isSparse(t, std::vector<bool>(l, false)) {}		: outTensor(t - 1), numTensors(t + 1), numLoops(l),
		dims(t + 1, std::vector<Dim>(l, Dim::kUndef)) {}

/// Adds a tensor expression. Returns its index.		/// Adds a tensor expression. Returns its index.
unsigned addExp(Kind k, unsigned e0, unsigned e1 = -1u, Value v = Value()) {		unsigned addExp(Kind k, unsigned e0, unsigned e1 = -1u, Value v = Value()) {
unsigned e = tensorExps.size();		unsigned e = tensorExps.size();
tensorExps.push_back(TensorExp(k, e0, e1, v));		tensorExps.push_back(TensorExp(k, e0, e1, v));
return e;		return e;
}		}
unsigned addExp(Kind k, Value v) { return addExp(k, -1u, -1u, v); }		unsigned addExp(Kind k, Value v) { return addExp(k, -1u, -1u, v); }
Show All 21 Lines	unsigned conjLatPoint(Kind kind, unsigned p0, unsigned p1) {
unsigned p = latPoints.size();		unsigned p = latPoints.size();
llvm::BitVector nb = llvm::BitVector(latPoints[p0].bits);		llvm::BitVector nb = llvm::BitVector(latPoints[p0].bits);
nb \|= latPoints[p1].bits;		nb \|= latPoints[p1].bits;
unsigned e = addExp(kind, latPoints[p0].exp, latPoints[p1].exp);		unsigned e = addExp(kind, latPoints[p0].exp, latPoints[p1].exp);
latPoints.push_back(LatPoint(nb, e));		latPoints.push_back(LatPoint(nb, e));
return p;		return p;
}		}

/// Conjunctive merge of L1 and L2 is conjunction of cartesian product.		/// Conjunctive merge of two lattice sets L0 and L1 is conjunction of
/// Returns the index of the new set.		/// cartesian product. Returns the index of the new set.
unsigned takeConj(Kind kind, unsigned s0, unsigned s1) {		unsigned takeConj(Kind kind, unsigned s0, unsigned s1) {
unsigned s = addSet();		unsigned s = addSet();
for (unsigned p0 : latSets[s0])		for (unsigned p0 : latSets[s0])
for (unsigned p1 : latSets[s1])		for (unsigned p1 : latSets[s1])
latSets[s].push_back(conjLatPoint(kind, p0, p1));		latSets[s].push_back(conjLatPoint(kind, p0, p1));
return s;		return s;
}		}

/// Disjunctive merge of L0 and L1 is (L0 /\_op L1, L0, L1).		/// Disjunctive merge of two lattice sets L0 and L1 is (L0 /\_op L1, L0, L1).
		penpornk [Done]: What does `/\_op` mean? Do you mean to draw disjunction `\/`?
		aartbik (author) [Done]: No, it really is the conjunction operator. This may be confusing at first, but check Fred's paper, section 5.1.
/// Returns the index of the new set.		/// Returns the index of the new set.
unsigned takeDisj(Kind kind, unsigned s0, unsigned s1) {		unsigned takeDisj(Kind kind, unsigned s0, unsigned s1) {
unsigned s = takeConj(kind, s0, s1);		unsigned s = takeConj(kind, s0, s1);
for (unsigned p : latSets[s0])		for (unsigned p : latSets[s0])
latSets[s].push_back(p);		latSets[s].push_back(p);
for (unsigned p : latSets[s1])		for (unsigned p : latSets[s1])
latSets[s].push_back(p);		latSets[s].push_back(p);
return s;		return s;
}		}

/// Optimizes the iteration lattice points in the given set. This		/// Optimizes the iteration lattice points in the given set. This
/// method should be called right before code generation to avoid		/// method should be called right before code generation to avoid
/// generating redundant loops and conditions.		/// generating redundant loops and conditions.
unsigned optimize(unsigned s0) {		unsigned optimizeSet(unsigned s0) {
unsigned s = addSet();		unsigned s = addSet();
assert(latSets[s0].size() != 0);		assert(latSets[s0].size() != 0);
unsigned p0 = latSets[s0][0];		unsigned p0 = latSets[s0][0];
for (unsigned p1 : latSets[s0]) {		for (unsigned p1 : latSets[s0]) {
bool add = true;		bool add = true;
		llvm::BitVector simple = simplifyCond(s0, p1);
if (p0 != p1) {		if (p0 != p1) {
// Is this a straightforward copy?		// Is this a straightforward copy?
unsigned e = latPoints[p1].exp;		unsigned e = latPoints[p1].exp;
if (exp(e).kind == Kind::kTensor && exp(e).e0 == numTensors - 1)		if (exp(e).kind == Kind::kTensor && exp(e).e0 == outTensor)
continue;		continue;
// Is any dense index exhausted?		// Only dense exhausted?
llvm::BitVector tmp = latPoints[p1].bits;		llvm::BitVector tmp = latPoints[p1].bits;
tmp ^= latPoints[p0].bits;		tmp ^= latPoints[p0].bits;
if (hasAnyOf(tmp, false))		if (!hasAnyDimOf(tmp, Dim::kSparse))
continue;		continue;
// Is this a direct duplication of an earlier conjunction?		// Duplication of an earlier conjunction?
for (unsigned p2 : latSets[s]) {		for (unsigned p2 : latSets[s]) {
tmp = latPoints[p1].bits;		tmp = simple;
tmp ^= latPoints[p2].bits;		tmp ^= latPoints[p2].simple;
if (tmp.count() == 0) {		if (tmp.count() == 0) {
add = false;		add = false;
break;		break;
}		}
}		}
assert(!add \|\| latGT(p0, p1));		assert(!add \|\| latGT(p0, p1));
}		}
if (add)		if (add) {
latSets[s].push_back(p1);		latSets[s].push_back(p1);
		latPoints[latSets[s].back()].simple = simple;
		}
}		}
return s;		return s;
}		}

// Returns true if Li > Lj.		/// Simplifies the conditions in a conjunction of a given lattice point
		/// within the given set using just two basic rules:
		/// (1) multiple dense conditions are reduced to single dense, and
		/// (2) a singleton sparse/dense is reduced to sparse/random access.
		llvm::BitVector simplifyCond(unsigned s, unsigned p0) {
		// First determine if this lattice point is a singleton, i.e.,
		// the last point in a lattice, no other is less than this one.
		bool isSingleton = true;
		for (unsigned p1 : latSets[s]) {
		if (p0 != p1 && latGT(p0, p1)) {
		penpornk [Not Done]: Should the quick check `p0 != p1` be in latGT instead? Or do you want to avoid the function call? (It's small enough and is likely inlined though.)
		aartbik (author) [Done]: No, because I really want LT, and not LE, in this case. I could add a latEQ if you think that is better, and then combine the two into a latLE?
		penpornk [Done]: I think this is fine. Thank you for the clarification! :)
		unsigned e = latPoints[p1].exp;
		if (exp(e).kind == Kind::kTensor && exp(e).e0 == outTensor)
		continue;
		llvm::BitVector tmp = latPoints[p1].bits;
		tmp ^= latPoints[p0].bits;
		if (hasAnyDimOf(tmp, Dim::kSparse)) {
		isSingleton = false;
		break;
		}
		}
		}
		// Now apply the two basic rules.
		llvm::BitVector simple = latPoints[p0].bits;
		bool reset = isSingleton && hasAnyDimOf(simple, Dim::kSparse);
		for (unsigned b = 0, be = simple.size(); b < be; b++) {
		if (simple[b] && !isDim(b, Dim::kSparse)) {
		if (reset)
		simple.reset(b);
		reset = true;
		}
		}
		return simple;
		}

		/// Returns true if Li > Lj.
bool latGT(unsigned i, unsigned j) const {		bool latGT(unsigned i, unsigned j) const {
const llvm::BitVector &bitsi = latPoints[i].bits;		const llvm::BitVector &bitsi = latPoints[i].bits;
const llvm::BitVector &bitsj = latPoints[j].bits;		const llvm::BitVector &bitsj = latPoints[j].bits;
assert(bitsi.size() == bitsj.size());		assert(bitsi.size() == bitsj.size());
if (bitsi.count() > bitsj.count()) {		if (bitsi.count() > bitsj.count()) {
for (unsigned b = 0, be = bitsj.size(); b < be; b++)		for (unsigned b = 0, be = bitsj.size(); b < be; b++)
if (bitsj[b] && !bitsi[b])		if (bitsj[b] && !bitsi[b])
return false;		return false;
return true;		return true;
}		}
return false;		return false;
}		}

// Bit translation.		/// Bit translation.
unsigned tensor(unsigned b) const { return b % numTensors; }		unsigned tensor(unsigned b) const { return b % numTensors; }
unsigned index(unsigned b) const { return b / numTensors; }		unsigned index(unsigned b) const { return b / numTensors; }

// Returns true if bit corresponds to sparse access.		/// Returns true if bit corresponds to queried dim.
bool isSparseBit(unsigned b) const {		bool isDim(unsigned b, Dim d) const { return isDim(tensor(b), index(b), d); }
return isSparseAccess(tensor(b), index(b));
}

// Returns true if tensor access at given index is sparse.		/// Returns true if tensor access at given index has queried dim.
bool isSparseAccess(unsigned t, unsigned i) const {		bool isDim(unsigned t, unsigned i, Dim d) const {
assert(t < numTensors && i < numLoops);		assert(t < numTensors && i < numLoops);
return isSparse[t][i];		return dims[t][i] == d;
}		}

// Returns true if any set bit corresponds to sparse/dense access.		/// Returns true if any set bit corresponds to queried dim.
bool hasAnyOf(const llvm::BitVector &bits, bool sparse) const {		bool hasAnyDimOf(const llvm::BitVector &bits, Dim d) const {
for (unsigned b = 0, be = bits.size(); b < be; b++)		for (unsigned b = 0, be = bits.size(); b < be; b++)
if (bits[b] && isSparseBit(b) == sparse)		if (bits[b] && isDim(b, d))
return true;		return true;
return false;		return false;
}		}

// Getters.		// Setter
std::vector<std::vector<bool>> &sparse() { return isSparse; }		void setDim(unsigned t, unsigned i, Dim d) { dims[t][i] = d; }

		/// Getters.
TensorExp &exp(unsigned e) { return tensorExps[e]; }		TensorExp &exp(unsigned e) { return tensorExps[e]; }
LatPoint &lat(unsigned l) { return latPoints[l]; }		LatPoint &lat(unsigned l) { return latPoints[l]; }
SmallVector<unsigned, 16> &set(unsigned s) { return latSets[s]; }		SmallVector<unsigned, 16> &set(unsigned s) { return latSets[s]; }

private:		private:
		const unsigned outTensor;
const unsigned numTensors;		const unsigned numTensors;
const unsigned numLoops;		const unsigned numLoops;

std::vector<std::vector<bool>> isSparse;		std::vector<std::vector<Dim>> dims;
llvm::SmallVector<TensorExp, 32> tensorExps;		llvm::SmallVector<TensorExp, 32> tensorExps;
llvm::SmallVector<LatPoint, 16> latPoints;		llvm::SmallVector<LatPoint, 16> latPoints;
llvm::SmallVector<SmallVector<unsigned, 16>, 8> latSets;		llvm::SmallVector<SmallVector<unsigned, 16>, 8> latSets;
};		};

// Code generation.		// Code generation.
struct CodeGen {		struct CodeGen {
CodeGen(linalg::SparsificationOptions o, unsigned numTensors,		CodeGen(linalg::SparsificationOptions o, unsigned numTensors,
unsigned numLoops)		unsigned numLoops)
: options(o), loops(numLoops), sizes(numLoops), buffers(numTensors),		: options(o), loops(numLoops), sizes(numLoops), buffers(numTensors),
pointers(numTensors, std::vector<Value>(numLoops)),		pointers(numTensors, std::vector<Value>(numLoops)),
indices(numTensors, std::vector<Value>(numLoops)),		indices(numTensors, std::vector<Value>(numLoops)),
highs(numTensors, std::vector<Value>(numLoops)),		highs(numTensors, std::vector<Value>(numLoops)),
pidxs(numTensors, std::vector<Value>(numLoops)),		pidxs(numTensors, std::vector<Value>(numLoops)),
idxs(numTensors, std::vector<Value>(numLoops)) {}		idxs(numTensors, std::vector<Value>(numLoops)), redExp(-1u), redVal() {}
// Sparsification options.		/// Sparsification options.
linalg::SparsificationOptions options;		linalg::SparsificationOptions options;
// Universal dense indices and upper bounds (by index). The loops array		/// Universal dense indices and upper bounds (by index). The loops array
// is updated with the value of the universal dense index in the current		/// is updated with the value of the universal dense index in the current
// loop. The sizes array is set once with the inferred dimension sizes.		/// loop. The sizes array is set once with the inferred dimension sizes.
std::vector<Value> loops;		std::vector<Value> loops;
std::vector<Value> sizes;		std::vector<Value> sizes;
// Buffers for storing dense and sparse numerical values (by tensor).		/// Buffers for storing dense and sparse numerical values (by tensor).
// This array is set once during bufferization of all tensors.		/// This array is set once during bufferization of all tensors.
std::vector<Value> buffers;		std::vector<Value> buffers;
// Sparse storage schemes (1-D): pointers and indices (by tensor and index).		/// Sparse storage schemes (1-D): pointers and indices (by tensor and index).
// This array is set once during bufferization of all sparse tensors.		/// This array is set once during bufferization of all sparse tensors.
std::vector<std::vector<Value>> pointers;		std::vector<std::vector<Value>> pointers;
std::vector<std::vector<Value>> indices;		std::vector<std::vector<Value>> indices;
// Sparse iteration information (by tensor and index). These arrays		/// Sparse iteration information (by tensor and index). These arrays
// are updated to remain current within the current loop.		/// are updated to remain current within the current loop.
std::vector<std::vector<Value>> highs;		std::vector<std::vector<Value>> highs;
std::vector<std::vector<Value>> pidxs;		std::vector<std::vector<Value>> pidxs;
std::vector<std::vector<Value>> idxs;		std::vector<std::vector<Value>> idxs;
		/// Current reduction, updated during code generation. When indices of a
		/// reduction are exhausted, all inner loops can "scalarize" the reduction.
		// TODO: currently only done for (a chain of) innermost for-loops, where it
		// is most effective; we could generalize to more outer and while-loops.
		unsigned redExp;
		Value redVal;
};		};

} // namespace		} // namespace

/// Helper method to inspect sparse annotations in the linalg operation.		/// Helper method to inspect sparse annotations in the linalg operation.
/// Fills the per-dimension sparsity information for all tensors.		/// Fills the per-dimension sparsity information for all tensors.
static void findSparseAnnotations(linalg::GenericOp op,		static void findSparseAnnotations(Merger &merger, linalg::GenericOp op) {
std::vector<std::vector<bool>> &isSparse) {
unsigned numTensors = op.getNumInputsAndOutputs();		unsigned numTensors = op.getNumInputsAndOutputs();
ArrayAttr sparseAttr = op.sparseAttr();		ArrayAttr sparseAttr = op.sparseAttr();
for (unsigned t = 0; t < numTensors; t++) {		for (unsigned t = 0; t < numTensors; t++) {
auto map = op.getIndexingMap(t);		auto map = op.getIndexingMap(t);
auto dimAttr = sparseAttr[t].cast<ArrayAttr>();		auto dimAttr = sparseAttr[t].cast<ArrayAttr>();
// For each tensor, we accept a per-dimension Sparse or Dense annotation.		// For each tensor, we accept a per-dimension Sparse or Dense annotation.
// This is translated to the loop index that indexes that dimension.		// This is translated to the loop index that indexes that dimension.
unsigned rank = op.getShapedType(t).getRank();		unsigned rank = op.getShapedType(t).getRank();
for (unsigned d = 0; d < rank; d++)		for (unsigned d = 0; d < rank; d++) {
if (isSparseDim(dimAttr[d])) {
unsigned idx = map.getDimPosition(d);		unsigned idx = map.getDimPosition(d);
isSparse[t][idx] = true;		if (isSparseDim(dimAttr[d])) {
		merger.setDim(t, idx, Dim::kSparse);
} else {		} else {
assert(isDenseDim(dimAttr[d]));		assert(isDenseDim(dimAttr[d]));
		merger.setDim(t, idx, Dim::kDense);
		}
}		}
}		}
}		}

/// A DFS helper to compute a topological sort. Note that recursion is		/// A DFS helper to compute a topological sort. Note that recursion is
/// bounded by the number of implicit loops, which is always small.		/// bounded by the number of implicit loops, which is always small.
/// Returns false when a cycle is detected.		/// Returns false when a cycle is detected.
static bool topSortDFS(unsigned i, std::vector<unsigned> &visit,		static bool topSortDFS(unsigned i, std::vector<unsigned> &visit,
std::vector<unsigned> &topSort,		std::vector<unsigned> &topSort,
▲ Show 20 Lines • Show All 96 Lines • ▼ Show 20 Lines

/// Builds the iteration lattices in a bottom-up traversal given the remaining		/// Builds the iteration lattices in a bottom-up traversal given the remaining
/// tensor (sub)expression and the next loop index in the iteration graph.		/// tensor (sub)expression and the next loop index in the iteration graph.
static unsigned buildLattices(Merger &merger, linalg::GenericOp op,		static unsigned buildLattices(Merger &merger, linalg::GenericOp op,
unsigned exp, unsigned idx) {		unsigned exp, unsigned idx) {
Kind kind = merger.exp(exp).kind;		Kind kind = merger.exp(exp).kind;
if (kind == Kind::kTensor \|\| kind == Kind::kInvariant) {		if (kind == Kind::kTensor \|\| kind == Kind::kInvariant) {
// Either the index is really used in the tensor expression, or it is		// Either the index is really used in the tensor expression, or it is
// set to the "non-existing dense index" in that dimension. Invariant		// set to the undefined index in that dimension. An invariant expression
// expressions borrow the output tensor indices.		// is set to a synthetic tensor with undefined indices only.
unsigned s = merger.addSet();		unsigned s = merger.addSet();
unsigned t = kind == Kind::kTensor ? merger.exp(exp).e0		unsigned t = kind == Kind::kTensor ? merger.exp(exp).e0
: op.getNumInputsAndOutputs() - 1;		: op.getNumInputsAndOutputs();
merger.set(s).push_back(merger.addLat(t, idx, exp));		merger.set(s).push_back(merger.addLat(t, idx, exp));
return s;		return s;
}		}
unsigned s0 = buildLattices(merger, op, merger.exp(exp).e0, idx);		unsigned s0 = buildLattices(merger, op, merger.exp(exp).e0, idx);
unsigned s1 = buildLattices(merger, op, merger.exp(exp).e1, idx);		unsigned s1 = buildLattices(merger, op, merger.exp(exp).e1, idx);
switch (kind) {		switch (kind) {
case Kind::kTensor:		case Kind::kTensor:
case Kind::kInvariant:		case Kind::kInvariant:
▲ Show 20 Lines • Show All 41 Lines • ▼ Show 20 Lines	for (unsigned t = 0; t < numTensors; t++) {
auto shape = tensorType.getShape();		auto shape = tensorType.getShape();
auto map = op.getIndexingMap(t);		auto map = op.getIndexingMap(t);
// Scan all dimensions of current tensor.		// Scan all dimensions of current tensor.
bool allDense = true;		bool allDense = true;
args.clear();		args.clear();
for (unsigned d = 0, rank = shape.size(); d < rank; d++) {		for (unsigned d = 0, rank = shape.size(); d < rank; d++) {
unsigned i = map.getDimPosition(d);		unsigned i = map.getDimPosition(d);
// Handle sparse storage schemes.		// Handle sparse storage schemes.
if (merger.isSparseAccess(t, i)) {		if (merger.isDim(t, i, Dim::kSparse)) {
allDense = false;		allDense = false;
auto dynShape = {ShapedType::kDynamicSize};		auto dynShape = {ShapedType::kDynamicSize};
auto ptrTp = MemRefType::get(		auto ptrTp = MemRefType::get(
dynShape, genIntType(rewriter, codegen.options.ptrType));		dynShape, genIntType(rewriter, codegen.options.ptrType));
auto indTp = MemRefType::get(		auto indTp = MemRefType::get(
dynShape, genIntType(rewriter, codegen.options.indType));		dynShape, genIntType(rewriter, codegen.options.indType));
codegen.pointers[t][i] = rewriter.create<AllocaOp>(loc, ptrTp, unknown);		codegen.pointers[t][i] = rewriter.create<AllocaOp>(loc, ptrTp, unknown);
codegen.indices[t][i] = rewriter.create<AllocaOp>(loc, indTp, unknown);		codegen.indices[t][i] = rewriter.create<AllocaOp>(loc, indTp, unknown);
Show All 29 Lines
}		}

/// Generates a load on a dense or sparse tensor.		/// Generates a load on a dense or sparse tensor.
static Value genTensorLoad(Merger &merger, CodeGen &codegen,		static Value genTensorLoad(Merger &merger, CodeGen &codegen,
PatternRewriter &rewriter, linalg::GenericOp op,		PatternRewriter &rewriter, linalg::GenericOp op,
unsigned exp) {		unsigned exp) {
// Test if the load was hoisted to a higher loop nest.		// Test if the load was hoisted to a higher loop nest.
Value val = merger.exp(exp).val;		Value val = merger.exp(exp).val;
if (val) {		if (val)
merger.exp(exp).val = Value(); // reset
return val;		return val;
}
// Actual load.		// Actual load.
SmallVector<Value, 4> args;		SmallVector<Value, 4> args;
unsigned tensor = merger.exp(exp).e0;		unsigned tensor = merger.exp(exp).e0;
auto map = op.getIndexingMap(tensor);		auto map = op.getIndexingMap(tensor);
bool sparse = false;		bool sparse = false;
for (unsigned i = 0, m = map.getNumResults(); i < m; ++i) {		for (unsigned i = 0, m = map.getNumResults(); i < m; ++i) {
unsigned idx = map.getDimPosition(i);		unsigned idx = map.getDimPosition(i);
args.push_back(codegen.loops[idx]); // universal dense index		args.push_back(codegen.loops[idx]); // universal dense index
if (sparse \|\| merger.isSparseAccess(tensor, idx)) {		if (sparse \|\| merger.isDim(tensor, idx, Dim::kSparse)) {
sparse = true;		sparse = true;
args.clear();		args.clear();
args.push_back(codegen.pidxs[tensor][idx]); // position index		args.push_back(codegen.pidxs[tensor][idx]); // position index
}		}
}		}
Location loc = op.getLoc();		Location loc = op.getLoc();
Value ptr = codegen.buffers[tensor];		Value ptr = codegen.buffers[tensor];
return rewriter.create<LoadOp>(loc, ptr, args);		return rewriter.create<LoadOp>(loc, ptr, args);
}		}

/// Generates a store on a dense tensor.		/// Generates a store on a dense tensor.
static void genTensorStore(Merger &merger, CodeGen &codegen,		static void genTensorStore(Merger &merger, CodeGen &codegen,
PatternRewriter &rewriter, linalg::GenericOp op,		PatternRewriter &rewriter, linalg::GenericOp op,
unsigned tensor, Value rhs) {		unsigned tensor, Value rhs) {
		// Test if this is a scalarized reduction.
		unsigned lhs = op.getNumInputsAndOutputs() - 1;
		if (lhs == tensor && codegen.redVal) {
		codegen.redVal = rhs;
		return;
		}
		// Actual store.
SmallVector<Value, 4> args;		SmallVector<Value, 4> args;
auto map = op.getIndexingMap(tensor);		auto map = op.getIndexingMap(tensor);
for (unsigned i = 0, m = map.getNumResults(); i < m; ++i) {		for (unsigned i = 0, m = map.getNumResults(); i < m; ++i) {
unsigned idx = map.getDimPosition(i);		unsigned idx = map.getDimPosition(i);
args.push_back(codegen.loops[idx]); // universal dense index		args.push_back(codegen.loops[idx]); // universal dense index
}		}
Location loc = op.getLoc();		Location loc = op.getLoc();
Value ptr = codegen.buffers[tensor];		Value ptr = codegen.buffers[tensor];
Show All 37 Lines	static Value genExp(Merger &merger, CodeGen &codegen, PatternRewriter &rewriter,
case Kind::kAddI:		case Kind::kAddI:
return rewriter.create<AddIOp>(op.getLoc(), v0, v1);		return rewriter.create<AddIOp>(op.getLoc(), v0, v1);
}		}
}		}

/// Hoists loop invariant tensor loads for which indices have been exhausted.		/// Hoists loop invariant tensor loads for which indices have been exhausted.
static void genInvariants(Merger &merger, CodeGen &codegen,		static void genInvariants(Merger &merger, CodeGen &codegen,
PatternRewriter &rewriter, linalg::GenericOp op,		PatternRewriter &rewriter, linalg::GenericOp op,
unsigned exp) {		unsigned exp, unsigned ldx, bool hoist) {
if (merger.exp(exp).kind == Kind::kTensor) {		if (merger.exp(exp).kind == Kind::kTensor) {
unsigned lhs = op.getNumInputsAndOutputs() - 1;		// Inspect tensor indices.
		bool atLevel = ldx == -1u;
unsigned tensor = merger.exp(exp).e0;		unsigned tensor = merger.exp(exp).e0;
if (tensor == lhs)
return; // TODO: scalarize reduction as well (using scf.yield)
auto map = op.getIndexingMap(tensor);		auto map = op.getIndexingMap(tensor);
for (unsigned i = 0, m = map.getNumResults(); i < m; ++i) {		for (unsigned i = 0, m = map.getNumResults(); i < m; ++i) {
unsigned idx = map.getDimPosition(i);		unsigned idx = map.getDimPosition(i);
if (!codegen.loops[idx])		if (!codegen.loops[idx])
return; // still in play		return; // still in play
		else if (idx == ldx)
		atLevel = true;
		}
		// All exhausted at this level (atLevel denotes exactly at this level).
		unsigned lhs = op.getNumInputsAndOutputs() - 1;
		if (lhs == tensor) {
		codegen.redExp = hoist ? exp : -1u;
		} else if (atLevel) {
		merger.exp(exp).val =
		hoist ? genTensorLoad(merger, codegen, rewriter, op, exp) : Value();
}		}
// All exhausted at this level.
merger.exp(exp).val = genTensorLoad(merger, codegen, rewriter, op, exp);

} else if (merger.exp(exp).kind != Kind::kInvariant) {		} else if (merger.exp(exp).kind != Kind::kInvariant) {
// Traverse into the binary operations. Note that we only hoist		// Traverse into the binary operations. Note that we only hoist
// tensor loads, since subsequent MLIR/LLVM passes know how to		// tensor loads, since subsequent MLIR/LLVM passes know how to
// deal with all other kinds of derived loop invariants.		// deal with all other kinds of derived loop invariants.
genInvariants(merger, codegen, rewriter, op, merger.exp(exp).e0);		unsigned e0 = merger.exp(exp).e0;
genInvariants(merger, codegen, rewriter, op, merger.exp(exp).e1);		unsigned e1 = merger.exp(exp).e1;
		genInvariants(merger, codegen, rewriter, op, e0, ldx, hoist);
		genInvariants(merger, codegen, rewriter, op, e1, ldx, hoist);
}		}
}		}

/// Generates initialization code for the subsequent loop sequence at		/// Generates initialization code for the subsequent loop sequence at
/// current index level. Returns true if the loop sequence needs to		/// current index level. Returns true if the loop sequence needs to
/// maintain the universal index.		/// maintain the universal index.
static bool genInit(Merger &merger, CodeGen &codegen, PatternRewriter &rewriter,		static bool genInit(Merger &merger, CodeGen &codegen, PatternRewriter &rewriter,
linalg::GenericOp op, std::vector<unsigned> &topSort,		linalg::GenericOp op, std::vector<unsigned> &topSort,
unsigned at, llvm::BitVector &inits) {		unsigned at, llvm::BitVector &inits) {
bool needsUniv = false;		bool needsUniv = false;
Location loc = op.getLoc();		Location loc = op.getLoc();
unsigned idx = topSort[at];		unsigned idx = topSort[at];

// Initialize sparse positions.		// Initialize sparse positions.
for (unsigned b = 0, be = inits.size(); b < be; b++) {		for (unsigned b = 0, be = inits.size(); b < be; b++) {
if (inits[b]) {		if (inits[b]) {
unsigned tensor = merger.tensor(b);		unsigned tensor = merger.tensor(b);
assert(idx == merger.index(b));		assert(idx == merger.index(b));
if (merger.isSparseBit(b)) {		if (merger.isDim(b, Dim::kSparse)) {
// Initialize sparse index.		// Initialize sparse index.
unsigned pat = at;		unsigned pat = at;
for (; pat != 0; pat--) {		for (; pat != 0; pat--) {
if (codegen.pidxs[tensor][topSort[pat - 1]])		if (codegen.pidxs[tensor][topSort[pat - 1]])
break;		break;
}		}
Value ptr = codegen.pointers[tensor][idx];		Value ptr = codegen.pointers[tensor][idx];
Value one = rewriter.create<ConstantIndexOp>(loc, 1);		Value one = rewriter.create<ConstantIndexOp>(loc, 1);
Show All 22 Lines	static Operation *genFor(Merger &merger, CodeGen &codegen,
unsigned fb = indices.find_first();		unsigned fb = indices.find_first();
unsigned tensor = merger.tensor(fb);		unsigned tensor = merger.tensor(fb);
assert(idx == merger.index(fb));		assert(idx == merger.index(fb));

// Parallelization strategy. Any implicit loop in the Linalg operation that		// Parallelization strategy. Any implicit loop in the Linalg operation that
// is marked "parallel" is a candidate. Whether it is actually converted to		// is marked "parallel" is a candidate. Whether it is actually converted to
// a parallel operation depends on the requested strategy.		// a parallel operation depends on the requested strategy.
auto iteratorTypes = op.iterator_types().getValue();		auto iteratorTypes = op.iterator_types().getValue();
bool isSparse = merger.isSparseBit(fb);		bool isSparse = merger.isDim(fb, Dim::kSparse);
bool isParallel = linalg::isParallelIteratorType(iteratorTypes[idx]);		bool isParallel = linalg::isParallelIteratorType(iteratorTypes[idx]);
switch (codegen.options.parallelizationStrategy) {		switch (codegen.options.parallelizationStrategy) {
case linalg::SparseParallelizationStrategy::kNone:		case linalg::SparseParallelizationStrategy::kNone:
isParallel = false;		isParallel = false;
break;		break;
case linalg::SparseParallelizationStrategy::kDenseOuterLoop:		case linalg::SparseParallelizationStrategy::kDenseOuterLoop:
isParallel &= isOuter && !isSparse;		isParallel &= isOuter && !isSparse;
break;		break;
Show All 27 Lines	if (isParallel) {
if (isSparse)		if (isSparse)
codegen.pidxs[tensor][idx] = parOp.getInductionVars()[0];		codegen.pidxs[tensor][idx] = parOp.getInductionVars()[0];
else		else
codegen.loops[idx] = parOp.getInductionVars()[0];		codegen.loops[idx] = parOp.getInductionVars()[0];
rewriter.setInsertionPointToStart(parOp.getBody());		rewriter.setInsertionPointToStart(parOp.getBody());
return parOp;		return parOp;
}		}

// Emit a sequential loop.		// Emit a sequential loop, potentially with a scalarized reduction.
scf::ForOp forOp = rewriter.create<scf::ForOp>(loc, lo, hi, step);		bool scalarRed = isInner && codegen.redExp != -1u;
		SmallVector<Value, 4> operands;
		if (scalarRed) {
		Value load =
		codegen.redVal
		? codegen.redVal // chained with previous for-loop
		: genTensorLoad(merger, codegen, rewriter, op, codegen.redExp);
		operands.push_back(load);
		}
		scf::ForOp forOp = rewriter.create<scf::ForOp>(loc, lo, hi, step, operands);
		if (scalarRed) {
		codegen.redVal = merger.exp(codegen.redExp).val =
		forOp.getRegionIterArgs().front();
		}
		// Assign induction variable to sparse or dense index.
if (isSparse)		if (isSparse)
codegen.pidxs[tensor][idx] = forOp.getInductionVar();		codegen.pidxs[tensor][idx] = forOp.getInductionVar();
else		else
codegen.loops[idx] = forOp.getInductionVar();		codegen.loops[idx] = forOp.getInductionVar();
rewriter.setInsertionPointToStart(forOp.getBody());		rewriter.setInsertionPointToStart(forOp.getBody());
return forOp;		return forOp;
}		}

/// Emit a while-loop for co-iteration over multiple indices.		/// Emit a while-loop for co-iteration over multiple indices.
static Operation *genWhile(Merger &merger, CodeGen &codegen,		static Operation *genWhile(Merger &merger, CodeGen &codegen,
PatternRewriter &rewriter, linalg::GenericOp op,		PatternRewriter &rewriter, linalg::GenericOp op,
unsigned idx, bool needsUniv,		unsigned idx, bool needsUniv,
llvm::BitVector &indices) {		llvm::BitVector &indices) {
SmallVector<Type, 4> types;		SmallVector<Type, 4> types;
SmallVector<Value, 4> operands;		SmallVector<Value, 4> operands;
// Construct the while-loop with a parameter for each index.		// Construct the while-loop with a parameter for each index.
Type indexType = rewriter.getIndexType();		Type indexType = rewriter.getIndexType();
for (unsigned b = 0, be = indices.size(); b < be; b++) {		for (unsigned b = 0, be = indices.size(); b < be; b++) {
if (indices[b] && merger.isSparseBit(b)) {		if (indices[b] && merger.isDim(b, Dim::kSparse)) {
unsigned tensor = merger.tensor(b);		unsigned tensor = merger.tensor(b);
assert(idx == merger.index(b));		assert(idx == merger.index(b));
types.push_back(indexType);		types.push_back(indexType);
operands.push_back(codegen.pidxs[tensor][idx]);		operands.push_back(codegen.pidxs[tensor][idx]);
}		}
}		}
if (needsUniv) {		if (needsUniv) {
types.push_back(indexType);		types.push_back(indexType);
operands.push_back(codegen.loops[idx]);		operands.push_back(codegen.loops[idx]);
}		}
Location loc = op.getLoc();		Location loc = op.getLoc();
scf::WhileOp whileOp = rewriter.create<scf::WhileOp>(loc, types, operands);		scf::WhileOp whileOp = rewriter.create<scf::WhileOp>(loc, types, operands);
Block *before = rewriter.createBlock(&whileOp.before(), {}, types);		Block *before = rewriter.createBlock(&whileOp.before(), {}, types);
Block *after = rewriter.createBlock(&whileOp.after(), {}, types);		Block *after = rewriter.createBlock(&whileOp.after(), {}, types);

// Build the "before" region, which effectively consists		// Build the "before" region, which effectively consists
// of a conjunction of "i < upper" tests on all induction.		// of a conjunction of "i < upper" tests on all induction.
rewriter.setInsertionPointToStart(&whileOp.before().front());		rewriter.setInsertionPointToStart(&whileOp.before().front());
Value cond;		Value cond;
unsigned o = 0;		unsigned o = 0;
for (unsigned b = 0, be = indices.size(); b < be; b++) {		for (unsigned b = 0, be = indices.size(); b < be; b++) {
if (indices[b] && merger.isSparseBit(b)) {		if (indices[b] && merger.isDim(b, Dim::kSparse)) {
unsigned tensor = merger.tensor(b);		unsigned tensor = merger.tensor(b);
assert(idx == merger.index(b));		assert(idx == merger.index(b));
Value op1 = before->getArgument(o);		Value op1 = before->getArgument(o);
Value op2 = codegen.highs[tensor][idx];		Value op2 = codegen.highs[tensor][idx];
Value opc = rewriter.create<CmpIOp>(loc, CmpIPredicate::ult, op1, op2);		Value opc = rewriter.create<CmpIOp>(loc, CmpIPredicate::ult, op1, op2);
cond = cond ? rewriter.create<AndOp>(loc, cond, opc) : opc;		cond = cond ? rewriter.create<AndOp>(loc, cond, opc) : opc;
codegen.pidxs[tensor][idx] = after->getArgument(o++);		codegen.pidxs[tensor][idx] = after->getArgument(o++);
}		}
Show All 29 Lines	static void genLocals(Merger &merger, CodeGen &codegen,
std::vector<unsigned> &topSort, unsigned at,		std::vector<unsigned> &topSort, unsigned at,
bool needsUniv, llvm::BitVector &locals) {		bool needsUniv, llvm::BitVector &locals) {
Location loc = op.getLoc();		Location loc = op.getLoc();
unsigned idx = topSort[at];		unsigned idx = topSort[at];

// Initialize sparse indices.		// Initialize sparse indices.
Value min;		Value min;
for (unsigned b = 0, be = locals.size(); b < be; b++) {		for (unsigned b = 0, be = locals.size(); b < be; b++) {
if (locals[b] && merger.isSparseBit(b)) {		if (locals[b] && merger.isDim(b, Dim::kSparse)) {
unsigned tensor = merger.tensor(b);		unsigned tensor = merger.tensor(b);
assert(idx == merger.index(b));		assert(idx == merger.index(b));
Value ptr = codegen.indices[tensor][idx];		Value ptr = codegen.indices[tensor][idx];
Value s = codegen.pidxs[tensor][idx];		Value s = codegen.pidxs[tensor][idx];
Value load = genLoad(rewriter, loc, ptr, s);		Value load = genLoad(rewriter, loc, ptr, s);
codegen.idxs[tensor][idx] = load;		codegen.idxs[tensor][idx] = load;
if (!needsUniv) {		if (!needsUniv) {
if (min) {		if (min) {
Show All 10 Lines	static void genLocals(Merger &merger, CodeGen &codegen,
// Merge dense universal index over minimum.		// Merge dense universal index over minimum.
if (min) {		if (min) {
assert(!needsUniv);		assert(!needsUniv);
codegen.loops[idx] = min;		codegen.loops[idx] = min;
}		}

// Initialize dense positions.		// Initialize dense positions.
for (unsigned b = 0, be = locals.size(); b < be; b++) {		for (unsigned b = 0, be = locals.size(); b < be; b++) {
if (locals[b] && !merger.isSparseBit(b)) {		if (locals[b] && merger.isDim(b, Dim::kDense)) {
unsigned tensor = merger.tensor(b);		unsigned tensor = merger.tensor(b);
assert(idx == merger.index(b));		assert(idx == merger.index(b));
if (!codegen.highs[tensor][idx])
continue; // unused dimension
unsigned pat = at;		unsigned pat = at;
for (; pat != 0; pat--)		for (; pat != 0; pat--)
if (codegen.pidxs[tensor][topSort[pat - 1]])		if (codegen.pidxs[tensor][topSort[pat - 1]])
break;		break;
Value p = (pat == 0) ? rewriter.create<ConstantIndexOp>(loc, 0)		Value p = (pat == 0) ? rewriter.create<ConstantIndexOp>(loc, 0)
: codegen.pidxs[tensor][topSort[pat - 1]];		: codegen.pidxs[tensor][topSort[pat - 1]];
Value m = rewriter.create<MulIOp>(loc, codegen.sizes[idx], p);		Value m = rewriter.create<MulIOp>(loc, codegen.sizes[idx], p);
codegen.pidxs[tensor][idx] =		codegen.pidxs[tensor][idx] =
rewriter.create<AddIOp>(loc, m, codegen.loops[idx]);		rewriter.create<AddIOp>(loc, m, codegen.loops[idx]);
}		}
}		}
}		}

/// Generates the induction structure for a while-loop.		/// Generates the induction structure for a while-loop.
static void genWhileInduction(Merger &merger, CodeGen &codegen,		static void genWhileInduction(Merger &merger, CodeGen &codegen,
PatternRewriter &rewriter, linalg::GenericOp op,		PatternRewriter &rewriter, linalg::GenericOp op,
unsigned idx, bool needsUniv,		unsigned idx, bool needsUniv,
llvm::BitVector &induction, ResultRange results) {		llvm::BitVector &induction, ResultRange results) {
Location loc = op.getLoc();		Location loc = op.getLoc();
unsigned o = 0;		unsigned o = 0;
SmallVector<Value, 4> operands;		SmallVector<Value, 4> operands;
Value one = rewriter.create<ConstantIndexOp>(loc, 1);		Value one = rewriter.create<ConstantIndexOp>(loc, 1);
for (unsigned b = 0, be = induction.size(); b < be; b++)		for (unsigned b = 0, be = induction.size(); b < be; b++) {
if (induction[b] && merger.isSparseBit(b)) {		if (induction[b] && merger.isDim(b, Dim::kSparse)) {
unsigned tensor = merger.tensor(b);		unsigned tensor = merger.tensor(b);
assert(idx == merger.index(b));		assert(idx == merger.index(b));
Value op1 = codegen.idxs[tensor][idx];		Value op1 = codegen.idxs[tensor][idx];
Value op2 = codegen.loops[idx];		Value op2 = codegen.loops[idx];
Value op3 = codegen.pidxs[tensor][idx];		Value op3 = codegen.pidxs[tensor][idx];
Value cmp = rewriter.create<CmpIOp>(loc, CmpIPredicate::eq, op1, op2);		Value cmp = rewriter.create<CmpIOp>(loc, CmpIPredicate::eq, op1, op2);
Value add = rewriter.create<AddIOp>(loc, op3, one);		Value add = rewriter.create<AddIOp>(loc, op3, one);
operands.push_back(rewriter.create<SelectOp>(loc, cmp, add, op3));		operands.push_back(rewriter.create<SelectOp>(loc, cmp, add, op3));
codegen.pidxs[tensor][idx] = results[o++];		codegen.pidxs[tensor][idx] = results[o++];
}		}
		}
if (needsUniv) {		if (needsUniv) {
operands.push_back(rewriter.create<AddIOp>(loc, codegen.loops[idx], one));		operands.push_back(rewriter.create<AddIOp>(loc, codegen.loops[idx], one));
codegen.loops[idx] = results[o++];		codegen.loops[idx] = results[o++];
}		}
assert(o == operands.size());		assert(o == operands.size());
rewriter.create<scf::YieldOp>(loc, operands);		rewriter.create<scf::YieldOp>(loc, operands);
}		}

/// Generates a single if-statement within a while-loop.		/// Generates a single if-statement within a while-loop.
static void genIf(Merger &merger, CodeGen &codegen, PatternRewriter &rewriter,		static scf::IfOp genIf(Merger &merger, CodeGen &codegen,
linalg::GenericOp op, unsigned idx,		PatternRewriter &rewriter, linalg::GenericOp op,
llvm::BitVector &conditions, scf::IfOp &ifOp) {		unsigned idx, llvm::BitVector &conditions) {
Location loc = op.getLoc();		Location loc = op.getLoc();
if (ifOp)
rewriter.setInsertionPointToStart(&ifOp.elseRegion().front());
Value cond;		Value cond;
for (unsigned b = 0, be = conditions.size(); b < be; b++) {		for (unsigned b = 0, be = conditions.size(); b < be; b++) {
if (conditions[b]) {		if (conditions[b]) {
unsigned tensor = merger.tensor(b);		unsigned tensor = merger.tensor(b);
assert(idx == merger.index(b));		assert(idx == merger.index(b));
Value clause;		Value clause;
if (merger.isSparseBit(b)) {		if (merger.isDim(b, Dim::kSparse)) {
Value op1 = codegen.idxs[tensor][idx];		Value op1 = codegen.idxs[tensor][idx];
Value op2 = codegen.loops[idx];		Value op2 = codegen.loops[idx];
clause = rewriter.create<CmpIOp>(loc, CmpIPredicate::eq, op1, op2);		clause = rewriter.create<CmpIOp>(loc, CmpIPredicate::eq, op1, op2);
} else {		} else {
clause = rewriter.create<ConstantIntOp>(loc, 1, 1); // true		clause = rewriter.create<ConstantIntOp>(loc, 1, 1); // true
}		}
cond = cond ? rewriter.create<AndOp>(loc, cond, clause) : clause;		cond = cond ? rewriter.create<AndOp>(loc, cond, clause) : clause;
}		}
}		}
ifOp = rewriter.create<scf::IfOp>(loc, cond, /*else*/ true);		scf::IfOp ifOp = rewriter.create<scf::IfOp>(loc, cond, /*else*/ true);
rewriter.setInsertionPointToStart(&ifOp.thenRegion().front());		rewriter.setInsertionPointToStart(&ifOp.thenRegion().front());
}		return ifOp;

/// Optimize the loop indices of Li with two rules:
/// (1) convert multiple dense to single dense, and
/// (2) convert singleton sparse/dense to sparse/random access.
static void optimizeIndices(Merger merger, unsigned lsize,
llvm::BitVector &indices) {
if (merger.hasAnyOf(indices, false)) {
bool reset = lsize == 1 && merger.hasAnyOf(indices, true);
for (unsigned b = 0, be = indices.size(); b < be; b++) {
if (indices[b] && !merger.isSparseBit(b)) {
if (reset)
indices.reset(b);
reset = true;
}
}
}
}		}

/// Recursively generates code while computing iteration lattices in order		/// Recursively generates code while computing iteration lattices in order
/// to manage the complexity of implementing co-iteration over unions		/// to manage the complexity of implementing co-iteration over unions
/// and intersections of sparse iteration spaces.		/// and intersections of sparse iteration spaces.
static void genStmt(Merger &merger, CodeGen &codegen, PatternRewriter &rewriter,		static void genStmt(Merger &merger, CodeGen &codegen, PatternRewriter &rewriter,
linalg::GenericOp op, std::vector<unsigned> &topSort,		linalg::GenericOp op, std::vector<unsigned> &topSort,
unsigned exp, unsigned at) {		unsigned exp, unsigned at) {
// At each leaf, assign remaining tensor (sub)expression to output tensor.		// At each leaf, assign remaining tensor (sub)expression to output tensor.
if (at == topSort.size()) {		if (at == topSort.size()) {
unsigned lhs = op.getNumInputsAndOutputs() - 1;		unsigned lhs = op.getNumInputsAndOutputs() - 1;
Value rhs = genExp(merger, codegen, rewriter, op, exp);		Value rhs = genExp(merger, codegen, rewriter, op, exp);
genTensorStore(merger, codegen, rewriter, op, lhs, rhs);		genTensorStore(merger, codegen, rewriter, op, lhs, rhs);
return;		return;
}		}

// Construct iteration lattices for current loop index, with L0 at top.		// Construct iteration lattices for current loop index, with L0 at top.
// Then emit initialization code for the loop sequence at this level.		// Then emit initialization code for the loop sequence at this level.
// We maintain the universal dense index if dense indices are still		// We maintain the universal dense index if dense indices are still
// in play for a non-singleton loop sequence.		// in play for a non-singleton loop sequence.
		// Location loc = op.getLoc();
unsigned idx = topSort[at];		unsigned idx = topSort[at];
unsigned lts = merger.optimize(buildLattices(merger, op, exp, idx));		unsigned lts = merger.optimizeSet(buildLattices(merger, op, exp, idx));
unsigned lsize = merger.set(lts).size();		unsigned lsize = merger.set(lts).size();
assert(lsize != 0);		assert(lsize != 0);
unsigned l0 = merger.set(lts)[0];		unsigned l0 = merger.set(lts)[0];
LatPoint lat0 = merger.lat(l0);		unsigned ldx = at == 0 ? -1u : topSort[at - 1];
genInvariants(merger, codegen, rewriter, op, exp);		genInvariants(merger, codegen, rewriter, op, exp, ldx, /*hoist=*/true);
bool needsUniv =		bool needsUniv = genInit(merger, codegen, rewriter, op, topSort, at,
genInit(merger, codegen, rewriter, op, topSort, at, lat0.bits) &&		merger.lat(l0).bits) &&
lsize > 1;		lsize > 1;

// Emit a loop for every lattice point L0 >= Li.		// Emit a loop for every lattice point L0 >= Li.
for (unsigned li : merger.set(lts)) {		for (unsigned i = 0; i < lsize; i++) {
LatPoint lati = merger.lat(li);		unsigned li = merger.set(lts)[i];

// Emit loop.		// Emit loop.
llvm::BitVector indices = lati.bits;		llvm::BitVector indices = merger.lat(li).simple;
optimizeIndices(merger, lsize, indices);
Operation *loop =		Operation *loop =
genLoop(merger, codegen, rewriter, op, topSort, at, needsUniv, indices);		genLoop(merger, codegen, rewriter, op, topSort, at, needsUniv, indices);
genLocals(merger, codegen, rewriter, op, topSort, at, needsUniv, lati.bits);		genLocals(merger, codegen, rewriter, op, topSort, at, needsUniv,
		merger.lat(li).bits);

// Visit all lattice points with Li >= Lj to generate the		// Visit all lattice points with Li >= Lj to generate the
// loop-body, possibly with if statements for coiteration.		// loop-body, possibly with if statements for coiteration.
bool isWhile = dyn_cast<scf::WhileOp>(loop) != nullptr;		bool isWhile = dyn_cast<scf::WhileOp>(loop) != nullptr;
scf::IfOp ifOp;		for (unsigned j = 0; j < lsize; j++) {
for (unsigned lj : merger.set(lts)) {		unsigned lj = merger.set(lts)[j];
		unsigned ej = merger.lat(lj).exp;
if (li == lj || merger.latGT(li, lj)) {		if (li == lj || merger.latGT(li, lj)) {
LatPoint latj = merger.lat(lj);		if (li != lj) {
llvm::BitVector tmp = latj.bits;		llvm::BitVector tmp = merger.lat(lj).bits;
tmp ^= lati.bits;		tmp ^= merger.lat(li).bits;
if (merger.hasAnyOf(tmp, false))		if (!merger.hasAnyDimOf(tmp, Dim::kSparse))
continue; // dense exhausted within if/else		continue; // only dense exhausted within if/else
		}
// Recurse into body of each branch.		// Recurse into body of each branch.
if (isWhile)		if (isWhile) {
genIf(merger, codegen, rewriter, op, idx, latj.bits, ifOp);		scf::IfOp ifOp =
genStmt(merger, codegen, rewriter, op, topSort, latj.exp, at + 1);		genIf(merger, codegen, rewriter, op, idx, merger.lat(lj).simple);
		genStmt(merger, codegen, rewriter, op, topSort, ej, at + 1);
		rewriter.setInsertionPointToStart(&ifOp.elseRegion().front());
		} else {
		genStmt(merger, codegen, rewriter, op, topSort, ej, at + 1);
		}
}		}
}		}

// Wrap-up induction and restore insertion point.		// Wrap-up induction and restore insertion point.
if (isWhile) {		if (isWhile) {
scf::WhileOp whileOp = cast<scf::WhileOp>(loop);		scf::WhileOp whileOp = cast<scf::WhileOp>(loop);
rewriter.setInsertionPointToEnd(&whileOp.after().front());		rewriter.setInsertionPointToEnd(&whileOp.after().front());
genWhileInduction(merger, codegen, rewriter, op, idx, needsUniv,		genWhileInduction(merger, codegen, rewriter, op, idx, needsUniv,
lati.bits, whileOp.results());		merger.lat(li).bits, whileOp.results());
} else {		} else {
needsUniv = false;		needsUniv = false;
		if (codegen.redVal) {
		rewriter.create<scf::YieldOp>(op.getLoc(), codegen.redVal);
		codegen.redVal = loop->getResult(0);
		}
}		}
rewriter.setInsertionPointAfter(loop);		rewriter.setInsertionPointAfter(loop);
}		}

		// Wrap-up loop sequence.
		Value red = codegen.redVal;
		if (red) {
		codegen.redVal = merger.exp(codegen.redExp).val = Value(); // end chain
		unsigned lhs = op.getNumInputsAndOutputs() - 1;
		genTensorStore(merger, codegen, rewriter, op, lhs, red);
		}
codegen.loops[idx] = Value();		codegen.loops[idx] = Value();
		genInvariants(merger, codegen, rewriter, op, exp, ldx, /*hoist=*/false);
}		}

namespace {		namespace {

/// Sparse rewriting rule for generic Linalg operation.		/// Sparse rewriting rule for generic Linalg operation.
struct GenericOpSparsifier : public OpRewritePattern<linalg::GenericOp> {		struct GenericOpSparsifier : public OpRewritePattern<linalg::GenericOp> {
public:		public:
GenericOpSparsifier(MLIRContext *context, linalg::SparsificationOptions o)		GenericOpSparsifier(MLIRContext *context, linalg::SparsificationOptions o)
: OpRewritePattern<linalg::GenericOp>(context), options(o) {}		: OpRewritePattern<linalg::GenericOp>(context), options(o) {}

LogicalResult matchAndRewrite(linalg::GenericOp op,		LogicalResult matchAndRewrite(linalg::GenericOp op,
PatternRewriter &rewriter) const override {		PatternRewriter &rewriter) const override {
// Detects sparse annotations and translate the per-dimension sparsity		// Detects sparse annotations and translate the per-dimension sparsity
// information for all tensors to loop indices in the kernel.		// information for all tensors to loop indices in the kernel.
if (!op.hasSparseSemantics())		if (!op.hasSparseSemantics())
return failure();		return failure();
assert(op.getNumOutputs() == 1);		assert(op.getNumOutputs() == 1);
unsigned numTensors = op.getNumInputsAndOutputs();		unsigned numTensors = op.getNumInputsAndOutputs();
unsigned numLoops = op.iterator_types().getValue().size();		unsigned numLoops = op.iterator_types().getValue().size();
Merger merger(numTensors, numLoops);		Merger merger(numTensors, numLoops);
findSparseAnnotations(op, merger.sparse());		findSparseAnnotations(merger, op);

// Computes a topologically sorted iteration graph to ensure		// Computes a topologically sorted iteration graph to ensure
// tensors are visited in natural index order. Fails on cycles.		// tensors are visited in natural index order. Fails on cycles.
// This assumes that higher-level passes have already put the		// This assumes that higher-level passes have already put the
// tensors in each tensor expression in a feasible order.		// tensors in each tensor expression in a feasible order.
// TODO: try again without dense constraints on failure or		// TODO: try again without dense constraints on failure or
// even try to insert sparse reorderings to resolve cycles		// even try to insert sparse reorderings to resolve cycles
std::vector<unsigned> topSort;		std::vector<unsigned> topSort;
Show All 34 Lines

mlir/test/Dialect/Linalg/sparse_1d.mlir

Show First 20 Lines • Show All 630 Lines • ▼ Show 20 Lines	%0 = linalg.generic #trait_ss
ins(%arga, %argb: tensor<32xf32>, tensor<32xf32>) {		ins(%arga, %argb: tensor<32xf32>, tensor<32xf32>) {
^bb(%a: f32, %b: f32):		^bb(%a: f32, %b: f32):
%0 = mulf %a, %b : f32		%0 = mulf %a, %b : f32
linalg.yield %0 : f32		linalg.yield %0 : f32
} -> tensor<32xf32>		} -> tensor<32xf32>
return %0 : tensor<32xf32>		return %0 : tensor<32xf32>
}		}

		#trait_two_way_inv = {
		indexing_maps = [
		affine_map<(i) -> (i)>, // a
		affine_map<(i) -> (i)>, // b
		affine_map<(i) -> (i)> // x (out)
		],
		sparse = [
		[ "S" ], // a
		[ "S" ], // b
		[ "D" ] // x
		],
		iterator_types = ["parallel"],
		doc = "x(i) = a(i) * c + b(i) * c"
		}

		// CHECK-LABEL: func @two_way_inv(
		// CHECK-SAME: %[[VAL_0:.*0]]: tensor<16xf32>,
		// CHECK-SAME: %[[VAL_1:.*1]]: tensor<16xf32>,
		// CHECK-SAME: %[[VAL_2:.*2]]: f32) -> tensor<16xf32> {
		// CHECK: %[[VAL_3:.*]] = constant 999 : index
		// CHECK: %[[VAL_4:.*]] = constant 0 : index
		// CHECK: %[[VAL_5:.*]] = constant 1 : index
		// CHECK: %[[VAL_6:.*]] = alloca(%[[VAL_3]]) : memref<?xindex>
		// CHECK: %[[VAL_7:.*]] = alloca(%[[VAL_3]]) : memref<?xindex>
		// CHECK: %[[VAL_8:.*]] = alloca(%[[VAL_3]]) : memref<?xf32>
		// CHECK: %[[VAL_9:.*]] = alloca(%[[VAL_3]]) : memref<?xindex>
		// CHECK: %[[VAL_10:.*]] = alloca(%[[VAL_3]]) : memref<?xindex>
		// CHECK: %[[VAL_11:.*]] = alloca(%[[VAL_3]]) : memref<?xf32>
		// CHECK: %[[VAL_12:.*]] = alloca() : memref<16xf32>
		// CHECK: %[[VAL_13:.*]] = load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
		// CHECK: %[[VAL_14:.*]] = load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
		// CHECK: %[[VAL_15:.*]] = load %[[VAL_9]]{{\[}}%[[VAL_4]]] : memref<?xindex>
		// CHECK: %[[VAL_16:.*]] = load %[[VAL_9]]{{\[}}%[[VAL_5]]] : memref<?xindex>
		// CHECK: %[[VAL_17:.*]]:3 = scf.while (%[[VAL_18:.*]] = %[[VAL_13]], %[[VAL_19:.*]] = %[[VAL_15]], %[[VAL_20:.*]] = %[[VAL_4]]) : (index, index, index) -> (index, index, index) {
		// CHECK: %[[VAL_21:.*]] = cmpi "ult", %[[VAL_18]], %[[VAL_14]] : index
		// CHECK: %[[VAL_22:.*]] = cmpi "ult", %[[VAL_19]], %[[VAL_16]] : index
		// CHECK: %[[VAL_23:.*]] = and %[[VAL_21]], %[[VAL_22]] : i1
		// CHECK: scf.condition(%[[VAL_23]]) %[[VAL_18]], %[[VAL_19]], %[[VAL_20]] : index, index, index
		// CHECK: } do {
		// CHECK: ^bb0(%[[VAL_24:.*]]: index, %[[VAL_25:.*]]: index, %[[VAL_26:.*]]: index):
		// CHECK: %[[VAL_27:.*]] = load %[[VAL_7]]{{\[}}%[[VAL_24]]] : memref<?xindex>
		// CHECK: %[[VAL_28:.*]] = load %[[VAL_10]]{{\[}}%[[VAL_25]]] : memref<?xindex>
		// CHECK: %[[VAL_29:.*]] = cmpi "eq", %[[VAL_27]], %[[VAL_26]] : index
		// CHECK: %[[VAL_30:.*]] = cmpi "eq", %[[VAL_28]], %[[VAL_26]] : index
		// CHECK: %[[VAL_31:.*]] = and %[[VAL_29]], %[[VAL_30]] : i1
		// CHECK: scf.if %[[VAL_31]] {
		// CHECK: %[[VAL_32:.*]] = load %[[VAL_8]]{{\[}}%[[VAL_24]]] : memref<?xf32>
		// CHECK: %[[VAL_33:.*]] = mulf %[[VAL_32]], %[[VAL_2]] : f32
		// CHECK: %[[VAL_34:.*]] = load %[[VAL_11]]{{\[}}%[[VAL_25]]] : memref<?xf32>
		// CHECK: %[[VAL_35:.*]] = mulf %[[VAL_34]], %[[VAL_2]] : f32
		// CHECK: %[[VAL_36:.*]] = addf %[[VAL_33]], %[[VAL_35]] : f32
		// CHECK: store %[[VAL_36]], %[[VAL_12]]{{\[}}%[[VAL_26]]] : memref<16xf32>
		// CHECK: } else {
		// CHECK: %[[VAL_37:.*]] = cmpi "eq", %[[VAL_27]], %[[VAL_26]] : index
		// CHECK: scf.if %[[VAL_37]] {
		// CHECK: %[[VAL_38:.*]] = load %[[VAL_8]]{{\[}}%[[VAL_24]]] : memref<?xf32>
		// CHECK: %[[VAL_39:.*]] = mulf %[[VAL_38]], %[[VAL_2]] : f32
		// CHECK: store %[[VAL_39]], %[[VAL_12]]{{\[}}%[[VAL_26]]] : memref<16xf32>
		// CHECK: } else {
		// CHECK: %[[VAL_40:.*]] = cmpi "eq", %[[VAL_28]], %[[VAL_26]] : index
		// CHECK: scf.if %[[VAL_40]] {
		// CHECK: %[[VAL_41:.*]] = load %[[VAL_11]]{{\[}}%[[VAL_25]]] : memref<?xf32>
		// CHECK: %[[VAL_42:.*]] = mulf %[[VAL_41]], %[[VAL_2]] : f32
		// CHECK: store %[[VAL_42]], %[[VAL_12]]{{\[}}%[[VAL_26]]] : memref<16xf32>
		// CHECK: } else {
		// CHECK: }
		// CHECK: }
		// CHECK: }
		// CHECK: %[[VAL_43:.*]] = cmpi "eq", %[[VAL_27]], %[[VAL_26]] : index
		// CHECK: %[[VAL_44:.*]] = addi %[[VAL_24]], %[[VAL_5]] : index
		// CHECK: %[[VAL_45:.*]] = select %[[VAL_43]], %[[VAL_44]], %[[VAL_24]] : index
		// CHECK: %[[VAL_46:.*]] = cmpi "eq", %[[VAL_28]], %[[VAL_26]] : index
		// CHECK: %[[VAL_47:.*]] = addi %[[VAL_25]], %[[VAL_5]] : index
		// CHECK: %[[VAL_48:.*]] = select %[[VAL_46]], %[[VAL_47]], %[[VAL_25]] : index
		// CHECK: %[[VAL_49:.*]] = addi %[[VAL_26]], %[[VAL_5]] : index
		// CHECK: scf.yield %[[VAL_45]], %[[VAL_48]], %[[VAL_49]] : index, index, index
		// CHECK: }
		// CHECK: scf.for %[[VAL_50:.*]] = %[[VAL_51:.*]]#0 to %[[VAL_14]] step %[[VAL_5]] {
		// CHECK: %[[VAL_52:.*]] = load %[[VAL_8]]{{\[}}%[[VAL_50]]] : memref<?xf32>
		// CHECK: %[[VAL_53:.*]] = mulf %[[VAL_52]], %[[VAL_2]] : f32
		// CHECK: store %[[VAL_53]], %[[VAL_12]]{{\[}}%[[VAL_51]]#2] : memref<16xf32>
		// CHECK: }
		// CHECK: scf.for %[[VAL_54:.*]] = %[[VAL_55:.*]]#1 to %[[VAL_16]] step %[[VAL_5]] {
		// CHECK: %[[VAL_56:.*]] = load %[[VAL_10]]{{\[}}%[[VAL_54]]] : memref<?xindex>
		// CHECK: %[[VAL_57:.*]] = load %[[VAL_11]]{{\[}}%[[VAL_54]]] : memref<?xf32>
		// CHECK: %[[VAL_58:.*]] = mulf %[[VAL_57]], %[[VAL_2]] : f32
		// CHECK: store %[[VAL_58]], %[[VAL_12]]{{\[}}%[[VAL_56]]] : memref<16xf32>
		// CHECK: }
		// CHECK: %[[VAL_59:.*]] = tensor_load %[[VAL_12]] : memref<16xf32>
		// CHECK: return %[[VAL_59]] : tensor<16xf32>
		// CHECK: }
		func @two_way_inv(%arga: tensor<16xf32>,
		%argb: tensor<16xf32>, %argc: f32) -> tensor<16xf32> {
		%0 = linalg.generic #trait_two_way_inv
		ins(%arga, %argb : tensor<16xf32>, tensor<16xf32>) {
		^bb(%a : f32, %b : f32):
		%0 = mulf %a, %argc : f32
		%1 = mulf %b, %argc : f32
		%2 = addf %0, %1 : f32
		linalg.yield %2: f32
		} -> tensor<16xf32>
		return %0 : tensor<16xf32>
		}

		// CHECK-LABEL: func @two_way_inv_alt(
		// CHECK-SAME: %[[VAL_0:.*0]]: tensor<16xf32>,
		// CHECK-SAME: %[[VAL_1:.*1]]: tensor<16xf32>,
		// CHECK-SAME: %[[VAL_2:.*2]]: f32) -> tensor<16xf32> {
		// CHECK: %[[VAL_3:.*]] = constant 999 : index
		// CHECK: %[[VAL_4:.*]] = constant 0 : index
		// CHECK: %[[VAL_5:.*]] = constant 1 : index
		// CHECK: %[[VAL_6:.*]] = alloca(%[[VAL_3]]) : memref<?xindex>
		// CHECK: %[[VAL_7:.*]] = alloca(%[[VAL_3]]) : memref<?xindex>
		// CHECK: %[[VAL_8:.*]] = alloca(%[[VAL_3]]) : memref<?xf32>
		// CHECK: %[[VAL_9:.*]] = alloca(%[[VAL_3]]) : memref<?xindex>
		// CHECK: %[[VAL_10:.*]] = alloca(%[[VAL_3]]) : memref<?xindex>
		// CHECK: %[[VAL_11:.*]] = alloca(%[[VAL_3]]) : memref<?xf32>
		// CHECK: %[[VAL_12:.*]] = alloca() : memref<16xf32>
		// CHECK: %[[VAL_13:.*]] = load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
		// CHECK: %[[VAL_14:.*]] = load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
		// CHECK: %[[VAL_15:.*]] = load %[[VAL_9]]{{\[}}%[[VAL_4]]] : memref<?xindex>
		// CHECK: %[[VAL_16:.*]] = load %[[VAL_9]]{{\[}}%[[VAL_5]]] : memref<?xindex>
		// CHECK: %[[VAL_17:.*]]:3 = scf.while (%[[VAL_18:.*]] = %[[VAL_13]], %[[VAL_19:.*]] = %[[VAL_15]], %[[VAL_20:.*]] = %[[VAL_4]]) : (index, index, index) -> (index, index, index) {
		// CHECK: %[[VAL_21:.*]] = cmpi "ult", %[[VAL_18]], %[[VAL_14]] : index
		// CHECK: %[[VAL_22:.*]] = cmpi "ult", %[[VAL_19]], %[[VAL_16]] : index
		// CHECK: %[[VAL_23:.*]] = and %[[VAL_21]], %[[VAL_22]] : i1
		// CHECK: scf.condition(%[[VAL_23]]) %[[VAL_18]], %[[VAL_19]], %[[VAL_20]] : index, index, index
		// CHECK: } do {
		// CHECK: ^bb0(%[[VAL_24:.*]]: index, %[[VAL_25:.*]]: index, %[[VAL_26:.*]]: index):
		// CHECK: %[[VAL_27:.*]] = load %[[VAL_7]]{{\[}}%[[VAL_24]]] : memref<?xindex>
		// CHECK: %[[VAL_28:.*]] = load %[[VAL_10]]{{\[}}%[[VAL_25]]] : memref<?xindex>
		// CHECK: %[[VAL_29:.*]] = cmpi "eq", %[[VAL_27]], %[[VAL_26]] : index
		// CHECK: %[[VAL_30:.*]] = cmpi "eq", %[[VAL_28]], %[[VAL_26]] : index
		// CHECK: %[[VAL_31:.*]] = and %[[VAL_29]], %[[VAL_30]] : i1
		// CHECK: scf.if %[[VAL_31]] {
		// CHECK: %[[VAL_32:.*]] = load %[[VAL_8]]{{\[}}%[[VAL_24]]] : memref<?xf32>
		// CHECK: %[[VAL_33:.*]] = load %[[VAL_11]]{{\[}}%[[VAL_25]]] : memref<?xf32>
		// CHECK: %[[VAL_34:.*]] = addf %[[VAL_32]], %[[VAL_33]] : f32
		// CHECK: %[[VAL_35:.*]] = mulf %[[VAL_34]], %[[VAL_2]] : f32
		// CHECK: store %[[VAL_35]], %[[VAL_12]]{{\[}}%[[VAL_26]]] : memref<16xf32>
		// CHECK: } else {
		// CHECK: %[[VAL_36:.*]] = cmpi "eq", %[[VAL_27]], %[[VAL_26]] : index
		// CHECK: scf.if %[[VAL_36]] {
		// CHECK: %[[VAL_37:.*]] = load %[[VAL_8]]{{\[}}%[[VAL_24]]] : memref<?xf32>
		// CHECK: %[[VAL_38:.*]] = mulf %[[VAL_37]], %[[VAL_2]] : f32
		// CHECK: store %[[VAL_38]], %[[VAL_12]]{{\[}}%[[VAL_26]]] : memref<16xf32>
		// CHECK: } else {
		// CHECK: %[[VAL_39:.*]] = cmpi "eq", %[[VAL_28]], %[[VAL_26]] : index
		// CHECK: scf.if %[[VAL_39]] {
		// CHECK: %[[VAL_40:.*]] = load %[[VAL_11]]{{\[}}%[[VAL_25]]] : memref<?xf32>
		// CHECK: %[[VAL_41:.*]] = mulf %[[VAL_40]], %[[VAL_2]] : f32
		// CHECK: store %[[VAL_41]], %[[VAL_12]]{{\[}}%[[VAL_26]]] : memref<16xf32>
		// CHECK: } else {
		// CHECK: }
		// CHECK: }
		// CHECK: }
		// CHECK: %[[VAL_42:.*]] = cmpi "eq", %[[VAL_27]], %[[VAL_26]] : index
		// CHECK: %[[VAL_43:.*]] = addi %[[VAL_24]], %[[VAL_5]] : index
		// CHECK: %[[VAL_44:.*]] = select %[[VAL_42]], %[[VAL_43]], %[[VAL_24]] : index
		// CHECK: %[[VAL_45:.*]] = cmpi "eq", %[[VAL_28]], %[[VAL_26]] : index
		// CHECK: %[[VAL_46:.*]] = addi %[[VAL_25]], %[[VAL_5]] : index
		// CHECK: %[[VAL_47:.*]] = select %[[VAL_45]], %[[VAL_46]], %[[VAL_25]] : index
		// CHECK: %[[VAL_48:.*]] = addi %[[VAL_26]], %[[VAL_5]] : index
		// CHECK: scf.yield %[[VAL_44]], %[[VAL_47]], %[[VAL_48]] : index, index, index
		// CHECK: }
		// CHECK: scf.for %[[VAL_49:.*]] = %[[VAL_50:.*]]#0 to %[[VAL_14]] step %[[VAL_5]] {
		// CHECK: %[[VAL_51:.*]] = load %[[VAL_8]]{{\[}}%[[VAL_49]]] : memref<?xf32>
		// CHECK: %[[VAL_52:.*]] = mulf %[[VAL_51]], %[[VAL_2]] : f32
		// CHECK: store %[[VAL_52]], %[[VAL_12]]{{\[}}%[[VAL_50]]#2] : memref<16xf32>
		// CHECK: }
		// CHECK: scf.for %[[VAL_53:.*]] = %[[VAL_54:.*]]#1 to %[[VAL_16]] step %[[VAL_5]] {
		// CHECK: %[[VAL_55:.*]] = load %[[VAL_10]]{{\[}}%[[VAL_53]]] : memref<?xindex>
		// CHECK: %[[VAL_56:.*]] = load %[[VAL_11]]{{\[}}%[[VAL_53]]] : memref<?xf32>
		// CHECK: %[[VAL_57:.*]] = mulf %[[VAL_56]], %[[VAL_2]] : f32
		// CHECK: store %[[VAL_57]], %[[VAL_12]]{{\[}}%[[VAL_55]]] : memref<16xf32>
		// CHECK: }
		// CHECK: %[[VAL_58:.*]] = tensor_load %[[VAL_12]] : memref<16xf32>
		// CHECK: return %[[VAL_58]] : tensor<16xf32>
		// CHECK: }
		func @two_way_inv_alt(%arga: tensor<16xf32>,
		%argb: tensor<16xf32>, %argc: f32) -> tensor<16xf32> {
		// Same kernel, but now expressed as "x(i) = (a(i) + b(i)) * c".
		%0 = linalg.generic #trait_two_way_inv
		ins(%arga, %argb : tensor<16xf32>, tensor<16xf32>) {
		^bb(%a : f32, %b : f32):
		%0 = addf %a, %b : f32
		%1 = mulf %0, %argc : f32
		linalg.yield %1: f32
		} -> tensor<16xf32>
		return %0 : tensor<16xf32>
		}

#trait_sum_reduction = {		#trait_sum_reduction = {
indexing_maps = [		indexing_maps = [
affine_map<(i) -> (i)>, // a		affine_map<(i) -> (i)>, // a
affine_map<(i) -> ()> // x (scalar out)		affine_map<(i) -> ()> // x (scalar out)
],		],
sparse = [		sparse = [
[ "S" ], // a		[ "S" ], // a
[ ] // x		[ ] // x
],		],
iterator_types = ["reduction"],		iterator_types = ["reduction"],
doc = "x = SUM_i a(i)"		doc = "x += SUM_i a(i)"
}		}

// CHECK-LABEL: func @sum_reduction(		// CHECK-LABEL: func @sum_reduction(
// CHECK-SAME: %[[VAL_0:.*]]: tensor<?xf32>,		// CHECK-SAME: %[[VAL_0:.*]]: tensor<?xf32>,
// CHECK-SAME: %[[VAL_1:.*]]: tensor<f32>) -> tensor<f32> {		// CHECK-SAME: %[[VAL_1:.*]]: tensor<f32>) -> tensor<f32> {
// CHECK: %[[VAL_2:.*]] = constant 999 : index		// CHECK: %[[VAL_2:.*]] = constant 999 : index
// CHECK: %[[VAL_3:.*]] = constant 0 : index		// CHECK: %[[VAL_3:.*]] = constant 0 : index
// CHECK: %[[VAL_4:.*]] = constant 1 : index		// CHECK: %[[VAL_4:.*]] = constant 1 : index
// CHECK: %[[VAL_5:.*]] = alloca(%[[VAL_2]]) : memref<?xindex>		// CHECK: %[[VAL_5:.*]] = alloca(%[[VAL_2]]) : memref<?xindex>
// CHECK: %[[VAL_6:.*]] = alloca(%[[VAL_2]]) : memref<?xindex>		// CHECK: %[[VAL_6:.*]] = alloca(%[[VAL_2]]) : memref<?xindex>
// CHECK: %[[VAL_7:.*]] = alloca(%[[VAL_2]]) : memref<?xf32>		// CHECK: %[[VAL_7:.*]] = alloca(%[[VAL_2]]) : memref<?xf32>
// CHECK: %[[VAL_8:.*]] = alloca() : memref<f32>		// CHECK: %[[VAL_8:.*]] = alloca() : memref<f32>
// CHECK: %[[VAL_9:.*]] = load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>		// CHECK: %[[VAL_9:.*]] = load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
// CHECK: %[[VAL_10:.*]] = load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>		// CHECK: %[[VAL_10:.*]] = load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK: scf.for %[[VAL_11:.*]] = %[[VAL_9]] to %[[VAL_10]] step %[[VAL_4]] {		// CHECK: %[[VAL_11:.*]] = load %[[VAL_8]][] : memref<f32>
// CHECK: %[[VAL_12:.*]] = load %[[VAL_8]][] : memref<f32>		// CHECK: %[[VAL_12:.*]] = scf.for %[[VAL_13:.*]] = %[[VAL_9]] to %[[VAL_10]] step %[[VAL_4]] iter_args(%[[VAL_14:.*]] = %[[VAL_11]]) -> (f32) {
// CHECK: %[[VAL_13:.*]] = load %[[VAL_7]]{{\[}}%[[VAL_11]]] : memref<?xf32>		// CHECK: %[[VAL_15:.*]] = load %[[VAL_7]]{{\[}}%[[VAL_13]]] : memref<?xf32>
// CHECK: %[[VAL_14:.*]] = addf %[[VAL_12]], %[[VAL_13]] : f32		// CHECK: %[[VAL_16:.*]] = addf %[[VAL_14]], %[[VAL_15]] : f32
// CHECK: store %[[VAL_14]], %[[VAL_8]][] : memref<f32>		// CHECK: scf.yield %[[VAL_16]] : f32
// CHECK: }		// CHECK: }
// CHECK: %[[VAL_15:.*]] = tensor_load %[[VAL_8]] : memref<f32>		// CHECK: store %[[VAL_17:.*]], %[[VAL_8]][] : memref<f32>
// CHECK: return %[[VAL_15]] : tensor<f32>		// CHECK: %[[VAL_18:.*]] = tensor_load %[[VAL_8]] : memref<f32>
		// CHECK: return %[[VAL_18]] : tensor<f32>
// CHECK: }		// CHECK: }
func @sum_reduction(%arga: tensor<?xf32>, %argx: tensor<f32>) -> tensor<f32> {		func @sum_reduction(%arga: tensor<?xf32>, %argx: tensor<f32>) -> tensor<f32> {
%0 = linalg.generic #trait_sum_reduction		%0 = linalg.generic #trait_sum_reduction
ins(%arga : tensor<?xf32>)		ins(%arga : tensor<?xf32>)
init(%argx : tensor<f32>) {		init(%argx : tensor<f32>) {
^bb(%a : f32, %x : f32):		^bb(%a : f32, %x : f32):
%0 = addf %x, %a : f32		%0 = addf %x, %a : f32
linalg.yield %0: f32		linalg.yield %0: f32
} -> tensor<f32>		} -> tensor<f32>
return %0 : tensor<f32>		return %0 : tensor<f32>
}		}

		#trait_sum_reduction_ss = {
		indexing_maps = [
		affine_map<(i) -> (i)>, // a
		affine_map<(i) -> (i)>, // b
		affine_map<(i)-> ()> // x (scalar out)
		],
		sparse = [
		[ "S" ], // a
		[ "S" ], // b
		[ ] // x
		],
		iterator_types = ["reduction"],
		doc = "x += SUM_i a(i) + b(i)"
		}

		// CHECK-LABEL: func @sum_reduction_ss(
		// CHECK-SAME: %[[VAL_0:.*0]]: tensor<16xf32>,
		// CHECK-SAME: %[[VAL_1:.*1]]: tensor<16xf32>,
		// CHECK-SAME: %[[VAL_2:.*2]]: tensor<f32>) -> tensor<f32> {
		// CHECK: %[[VAL_3:.*]] = constant 999 : index
		// CHECK: %[[VAL_4:.*]] = constant 0 : index
		// CHECK: %[[VAL_5:.*]] = constant 1 : index
		// CHECK: %[[VAL_6:.*]] = alloca(%[[VAL_3]]) : memref<?xindex>
		// CHECK: %[[VAL_7:.*]] = alloca(%[[VAL_3]]) : memref<?xindex>
		// CHECK: %[[VAL_8:.*]] = alloca(%[[VAL_3]]) : memref<?xf32>
		// CHECK: %[[VAL_9:.*]] = alloca(%[[VAL_3]]) : memref<?xindex>
		// CHECK: %[[VAL_10:.*]] = alloca(%[[VAL_3]]) : memref<?xindex>
		// CHECK: %[[VAL_11:.*]] = alloca(%[[VAL_3]]) : memref<?xf32>
		// CHECK: %[[VAL_12:.*]] = alloca() : memref<f32>
		// CHECK: %[[VAL_13:.*]] = load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
		// CHECK: %[[VAL_14:.*]] = load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
		// CHECK: %[[VAL_15:.*]] = load %[[VAL_9]]{{\[}}%[[VAL_4]]] : memref<?xindex>
		// CHECK: %[[VAL_16:.*]] = load %[[VAL_9]]{{\[}}%[[VAL_5]]] : memref<?xindex>
		// CHECK: %[[VAL_17:.*]]:3 = scf.while (%[[VAL_18:.*]] = %[[VAL_13]], %[[VAL_19:.*]] = %[[VAL_15]], %[[VAL_20:.*]] = %[[VAL_4]]) : (index, index, index) -> (index, index, index) {
		// CHECK: %[[VAL_21:.*]] = cmpi "ult", %[[VAL_18]], %[[VAL_14]] : index
		// CHECK: %[[VAL_22:.*]] = cmpi "ult", %[[VAL_19]], %[[VAL_16]] : index
		// CHECK: %[[VAL_23:.*]] = and %[[VAL_21]], %[[VAL_22]] : i1
		// CHECK: scf.condition(%[[VAL_23]]) %[[VAL_18]], %[[VAL_19]], %[[VAL_20]] : index, index, index
		// CHECK: } do {
		// CHECK: ^bb0(%[[VAL_24:.*]]: index, %[[VAL_25:.*]]: index, %[[VAL_26:.*]]: index):
		// CHECK: %[[VAL_27:.*]] = load %[[VAL_7]]{{\[}}%[[VAL_24]]] : memref<?xindex>
		// CHECK: %[[VAL_28:.*]] = load %[[VAL_10]]{{\[}}%[[VAL_25]]] : memref<?xindex>
		// CHECK: %[[VAL_29:.*]] = cmpi "eq", %[[VAL_27]], %[[VAL_26]] : index
		// CHECK: %[[VAL_30:.*]] = cmpi "eq", %[[VAL_28]], %[[VAL_26]] : index
		// CHECK: %[[VAL_31:.*]] = and %[[VAL_29]], %[[VAL_30]] : i1
		// CHECK: scf.if %[[VAL_31]] {
		// CHECK: %[[VAL_32:.*]] = load %[[VAL_12]][] : memref<f32>
		// CHECK: %[[VAL_33:.*]] = load %[[VAL_8]]{{\[}}%[[VAL_24]]] : memref<?xf32>
		// CHECK: %[[VAL_34:.*]] = load %[[VAL_11]]{{\[}}%[[VAL_25]]] : memref<?xf32>
		// CHECK: %[[VAL_35:.*]] = addf %[[VAL_33]], %[[VAL_34]] : f32
		// CHECK: %[[VAL_36:.*]] = addf %[[VAL_32]], %[[VAL_35]] : f32
		// CHECK: store %[[VAL_36]], %[[VAL_12]][] : memref<f32>
		// CHECK: } else {
		// CHECK: %[[VAL_37:.*]] = cmpi "eq", %[[VAL_27]], %[[VAL_26]] : index
		// CHECK: scf.if %[[VAL_37]] {
		// CHECK: %[[VAL_38:.*]] = load %[[VAL_12]][] : memref<f32>
		// CHECK: %[[VAL_39:.*]] = load %[[VAL_8]]{{\[}}%[[VAL_24]]] : memref<?xf32>
		// CHECK: %[[VAL_40:.*]] = addf %[[VAL_38]], %[[VAL_39]] : f32
		// CHECK: store %[[VAL_40]], %[[VAL_12]][] : memref<f32>
		// CHECK: } else {
		// CHECK: %[[VAL_41:.*]] = cmpi "eq", %[[VAL_28]], %[[VAL_26]] : index
		// CHECK: scf.if %[[VAL_41]] {
		// CHECK: %[[VAL_42:.*]] = load %[[VAL_12]][] : memref<f32>
		// CHECK: %[[VAL_43:.*]] = load %[[VAL_11]]{{\[}}%[[VAL_25]]] : memref<?xf32>
		// CHECK: %[[VAL_44:.*]] = addf %[[VAL_42]], %[[VAL_43]] : f32
		// CHECK: store %[[VAL_44]], %[[VAL_12]][] : memref<f32>
		// CHECK: } else {
		// CHECK: }
		// CHECK: }
		// CHECK: }
		// CHECK: %[[VAL_45:.*]] = cmpi "eq", %[[VAL_27]], %[[VAL_26]] : index
		// CHECK: %[[VAL_46:.*]] = addi %[[VAL_24]], %[[VAL_5]] : index
		// CHECK: %[[VAL_47:.*]] = select %[[VAL_45]], %[[VAL_46]], %[[VAL_24]] : index
		// CHECK: %[[VAL_48:.*]] = cmpi "eq", %[[VAL_28]], %[[VAL_26]] : index
		// CHECK: %[[VAL_49:.*]] = addi %[[VAL_25]], %[[VAL_5]] : index
		// CHECK: %[[VAL_50:.*]] = select %[[VAL_48]], %[[VAL_49]], %[[VAL_25]] : index
		// CHECK: %[[VAL_51:.*]] = addi %[[VAL_26]], %[[VAL_5]] : index
		// CHECK: scf.yield %[[VAL_47]], %[[VAL_50]], %[[VAL_51]] : index, index, index
		// CHECK: }
		// CHECK: %[[VAL_52:.*]] = load %[[VAL_12]][] : memref<f32>
		// CHECK: %[[VAL_53:.*]] = scf.for %[[VAL_54:.*]] = %[[VAL_55:.*]]#0 to %[[VAL_14]] step %[[VAL_5]] iter_args(%[[VAL_56:.*]] = %[[VAL_52]]) -> (f32) {
		// CHECK: %[[VAL_57:.*]] = load %[[VAL_8]]{{\[}}%[[VAL_54]]] : memref<?xf32>
		// CHECK: %[[VAL_58:.*]] = addf %[[VAL_56]], %[[VAL_57]] : f32
		// CHECK: scf.yield %[[VAL_58]] : f32
		// CHECK: }
		// CHECK: %[[VAL_59:.*]] = scf.for %[[VAL_60:.*]] = %[[VAL_61:.*]]#1 to %[[VAL_16]] step %[[VAL_5]] iter_args(%[[VAL_62:.*]] = %[[VAL_63:.*]]) -> (f32) {
		// CHECK: %[[VAL_64:.*]] = load %[[VAL_11]]{{\[}}%[[VAL_60]]] : memref<?xf32>
		// CHECK: %[[VAL_65:.*]] = addf %[[VAL_62]], %[[VAL_64]] : f32
		// CHECK: scf.yield %[[VAL_65]] : f32
		// CHECK: }
		// CHECK: store %[[VAL_66:.*]], %[[VAL_12]][] : memref<f32>
		// CHECK: %[[VAL_67:.*]] = tensor_load %[[VAL_12]] : memref<f32>
		// CHECK: return %[[VAL_67]] : tensor<f32>
		// CHECK: }
		func @sum_reduction_ss(%arga: tensor<16xf32>,
		%argb: tensor<16xf32>,
		%argx: tensor<f32>) -> tensor<f32> {
		// Just for testing. This case would be better expressed
		// as two separate reductions kernels.
		%0 = linalg.generic #trait_sum_reduction_ss
		ins(%arga, %argb: tensor<16xf32>, tensor<16xf32>)
		init(%argx : tensor<f32>) {
		^bb(%a : f32, %b : f32, %x : f32):
		%0 = addf %a, %b : f32
		%1 = addf %x, %0 : f32
		linalg.yield %1: f32
		} -> tensor<f32>
		return %0 : tensor<f32>
		}

		#trait_sum_reduction_inv_ss = {
		indexing_maps = [
		affine_map<(i) -> (i)>, // a
		affine_map<(i) -> ()>, // b
		affine_map<(i) -> (i)>, // c
		affine_map<(i) -> ()> // x (out)
		],
		sparse = [
		[ "S" ], // a
		[ ], // b
		[ "S" ], // c
		[ ] // x
		],
		iterator_types = ["reduction"],
		doc = "x += SUM_i a(i) * b + c(i)"
		}

		// CHECK-LABEL: func @sum_reduction_inv(
		// CHECK-SAME: %[[VAL_0:.*0]]: tensor<16xf32>,
		// CHECK-SAME: %[[VAL_1:.*1]]: tensor<f32>,
		// CHECK-SAME: %[[VAL_2:.*2]]: tensor<16xf32>,
		// CHECK-SAME: %[[VAL_3:.*3]]: tensor<f32>) -> tensor<f32> {
		// CHECK: %[[VAL_4:.*]] = constant 999 : index
		// CHECK: %[[VAL_5:.*]] = constant 0 : index
		// CHECK: %[[VAL_6:.*]] = constant 1 : index
		// CHECK: %[[VAL_7:.*]] = alloca(%[[VAL_4]]) : memref<?xindex>
		// CHECK: %[[VAL_8:.*]] = alloca(%[[VAL_4]]) : memref<?xindex>
		// CHECK: %[[VAL_9:.*]] = alloca(%[[VAL_4]]) : memref<?xf32>
		// CHECK: %[[VAL_10:.*]] = alloca() : memref<f32>
		// CHECK: %[[VAL_11:.*]] = alloca(%[[VAL_4]]) : memref<?xindex>
		// CHECK: %[[VAL_12:.*]] = alloca(%[[VAL_4]]) : memref<?xindex>
		// CHECK: %[[VAL_13:.*]] = alloca(%[[VAL_4]]) : memref<?xf32>
		// CHECK: %[[VAL_14:.*]] = alloca() : memref<f32>
		// CHECK: %[[VAL_15:.*]] = load %[[VAL_10]][] : memref<f32>
		// CHECK: %[[VAL_16:.*]] = load %[[VAL_7]]{{\[}}%[[VAL_5]]] : memref<?xindex>
		// CHECK: %[[VAL_17:.*]] = load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref<?xindex>
		// CHECK: %[[VAL_18:.*]] = load %[[VAL_11]]{{\[}}%[[VAL_5]]] : memref<?xindex>
		// CHECK: %[[VAL_19:.*]] = load %[[VAL_11]]{{\[}}%[[VAL_6]]] : memref<?xindex>
		// CHECK: %[[VAL_20:.*]]:3 = scf.while (%[[VAL_21:.*]] = %[[VAL_16]], %[[VAL_22:.*]] = %[[VAL_18]], %[[VAL_23:.*]] = %[[VAL_5]]) : (index, index, index) -> (index, index, index) {
		// CHECK: %[[VAL_24:.*]] = cmpi "ult", %[[VAL_21]], %[[VAL_17]] : index
		// CHECK: %[[VAL_25:.*]] = cmpi "ult", %[[VAL_22]], %[[VAL_19]] : index
		// CHECK: %[[VAL_26:.*]] = and %[[VAL_24]], %[[VAL_25]] : i1
		// CHECK: scf.condition(%[[VAL_26]]) %[[VAL_21]], %[[VAL_22]], %[[VAL_23]] : index, index, index
		// CHECK: } do {
		// CHECK: ^bb0(%[[VAL_27:.*]]: index, %[[VAL_28:.*]]: index, %[[VAL_29:.*]]: index):
		// CHECK: %[[VAL_30:.*]] = load %[[VAL_8]]{{\[}}%[[VAL_27]]] : memref<?xindex>
		// CHECK: %[[VAL_31:.*]] = load %[[VAL_12]]{{\[}}%[[VAL_28]]] : memref<?xindex>
		// CHECK: %[[VAL_32:.*]] = cmpi "eq", %[[VAL_30]], %[[VAL_29]] : index
		// CHECK: %[[VAL_33:.*]] = cmpi "eq", %[[VAL_31]], %[[VAL_29]] : index
		// CHECK: %[[VAL_34:.*]] = and %[[VAL_32]], %[[VAL_33]] : i1
		// CHECK: scf.if %[[VAL_34]] {
		// CHECK: %[[VAL_35:.*]] = load %[[VAL_14]][] : memref<f32>
		// CHECK: %[[VAL_36:.*]] = load %[[VAL_9]]{{\[}}%[[VAL_27]]] : memref<?xf32>
		// CHECK: %[[VAL_37:.*]] = mulf %[[VAL_36]], %[[VAL_15]] : f32
		// CHECK: %[[VAL_38:.*]] = load %[[VAL_13]]{{\[}}%[[VAL_28]]] : memref<?xf32>
		// CHECK: %[[VAL_39:.*]] = addf %[[VAL_37]], %[[VAL_38]] : f32
		// CHECK: %[[VAL_40:.*]] = addf %[[VAL_35]], %[[VAL_39]] : f32
		// CHECK: store %[[VAL_40]], %[[VAL_14]][] : memref<f32>
		// CHECK: } else {
		// CHECK: %[[VAL_41:.*]] = cmpi "eq", %[[VAL_30]], %[[VAL_29]] : index
		// CHECK: scf.if %[[VAL_41]] {
		// CHECK: %[[VAL_42:.*]] = load %[[VAL_14]][] : memref<f32>
		// CHECK: %[[VAL_43:.*]] = load %[[VAL_9]]{{\[}}%[[VAL_27]]] : memref<?xf32>
		// CHECK: %[[VAL_44:.*]] = mulf %[[VAL_43]], %[[VAL_15]] : f32
		// CHECK: %[[VAL_45:.*]] = addf %[[VAL_42]], %[[VAL_44]] : f32
		// CHECK: store %[[VAL_45]], %[[VAL_14]][] : memref<f32>
		// CHECK: } else {
		// CHECK: %[[VAL_46:.*]] = cmpi "eq", %[[VAL_31]], %[[VAL_29]] : index
		// CHECK: scf.if %[[VAL_46]] {
		// CHECK: %[[VAL_47:.*]] = load %[[VAL_14]][] : memref<f32>
		// CHECK: %[[VAL_48:.*]] = load %[[VAL_13]]{{\[}}%[[VAL_28]]] : memref<?xf32>
		// CHECK: %[[VAL_49:.*]] = addf %[[VAL_47]], %[[VAL_48]] : f32
		// CHECK: store %[[VAL_49]], %[[VAL_14]][] : memref<f32>
		// CHECK: } else {
		// CHECK: }
		// CHECK: }
		// CHECK: }
		// CHECK: %[[VAL_50:.*]] = cmpi "eq", %[[VAL_30]], %[[VAL_29]] : index
		// CHECK: %[[VAL_51:.*]] = addi %[[VAL_27]], %[[VAL_6]] : index
		// CHECK: %[[VAL_52:.*]] = select %[[VAL_50]], %[[VAL_51]], %[[VAL_27]] : index
		// CHECK: %[[VAL_53:.*]] = cmpi "eq", %[[VAL_31]], %[[VAL_29]] : index
		// CHECK: %[[VAL_54:.*]] = addi %[[VAL_28]], %[[VAL_6]] : index
		// CHECK: %[[VAL_55:.*]] = select %[[VAL_53]], %[[VAL_54]], %[[VAL_28]] : index
		// CHECK: %[[VAL_56:.*]] = addi %[[VAL_29]], %[[VAL_6]] : index
		// CHECK: scf.yield %[[VAL_52]], %[[VAL_55]], %[[VAL_56]] : index, index, index
		// CHECK: }
		// CHECK: %[[VAL_57:.*]] = load %[[VAL_14]][] : memref<f32>
		// CHECK: %[[VAL_58:.*]] = scf.for %[[VAL_59:.*]] = %[[VAL_60:.*]]#0 to %[[VAL_17]] step %[[VAL_6]] iter_args(%[[VAL_61:.*]] = %[[VAL_57]]) -> (f32) {
		// CHECK: %[[VAL_62:.*]] = load %[[VAL_9]]{{\[}}%[[VAL_59]]] : memref<?xf32>
		// CHECK: %[[VAL_63:.*]] = mulf %[[VAL_62]], %[[VAL_15]] : f32
		// CHECK: %[[VAL_64:.*]] = addf %[[VAL_61]], %[[VAL_63]] : f32
		// CHECK: scf.yield %[[VAL_64]] : f32
		// CHECK: }
		// CHECK: %[[VAL_65:.*]] = scf.for %[[VAL_66:.*]] = %[[VAL_67:.*]]#1 to %[[VAL_19]] step %[[VAL_6]] iter_args(%[[VAL_68:.*]] = %[[VAL_69:.*]]) -> (f32) {
		// CHECK: %[[VAL_70:.*]] = load %[[VAL_13]]{{\[}}%[[VAL_66]]] : memref<?xf32>
		// CHECK: %[[VAL_71:.*]] = addf %[[VAL_68]], %[[VAL_70]] : f32
		// CHECK: scf.yield %[[VAL_71]] : f32
		// CHECK: }
		// CHECK: store %[[VAL_72:.*]], %[[VAL_14]][] : memref<f32>
		// CHECK: %[[VAL_73:.*]] = tensor_load %[[VAL_14]] : memref<f32>
		// CHECK: return %[[VAL_73]] : tensor<f32>
		// CHECK: }
		func @sum_reduction_inv(%arga: tensor<16xf32>,
		%argb: tensor<f32>,
		%argc: tensor<16xf32>,
		%argx: tensor<f32>) -> tensor<f32> {
		// Just for testing. This case would be better expressed
		// as two separate reductions kernels.
		%0 = linalg.generic #trait_sum_reduction_inv_ss
		ins(%arga, %argb, %argc : tensor<16xf32>, tensor<f32>, tensor<16xf32>)
		init(%argx : tensor<f32>) {
		^bb(%a : f32, %b : f32, %c : f32, %x : f32):
		%0 = mulf %a, %b : f32
		%1 = addf %0, %c : f32
		%2 = addf %x, %1 : f32
		linalg.yield %2: f32
		} -> tensor<f32>
		return %0 : tensor<f32>
		}

mlir/test/Dialect/Linalg/sparse_2d.mlir

Show First 20 Lines • Show All 1,006 Lines • ▼ Show 20 Lines	indexing_maps = [
affine_map<(i,j) -> (i)> // x (out)		affine_map<(i,j) -> (i)> // x (out)
],		],
sparse = [		sparse = [
[ "D", "S" ], // A		[ "D", "S" ], // A
[ "D" ], // b		[ "D" ], // b
[ "D" ] // x		[ "D" ] // x
],		],
iterator_types = ["parallel", "reduction"],		iterator_types = ["parallel", "reduction"],
doc = "x(i) += A(i,j) * b(j)"		doc = "x(i) += SUM_j A(i,j) * b(j)"
}		}

// CHECK-LABEL: func @matvec(		// CHECK-LABEL: func @matvec(
// CHECK-SAME: %[[VAL_0:.*]]: tensor<16x32xf32>,		// CHECK-SAME: %[[VAL_0:.*]]: tensor<16x32xf32>,
// CHECK-SAME: %[[VAL_1:.*]]: tensor<32xf32>,		// CHECK-SAME: %[[VAL_1:.*]]: tensor<32xf32>,
// CHECK-SAME: %[[VAL_2:.*]]: tensor<16xf32>) -> tensor<16xf32> {		// CHECK-SAME: %[[VAL_2:.*]]: tensor<16xf32>) -> tensor<16xf32> {
// CHECK: %[[VAL_3:.*]] = constant 999 : index		// CHECK: %[[VAL_3:.*]] = constant 999 : index
// CHECK: %[[VAL_4:.*]] = constant 16 : index		// CHECK: %[[VAL_4:.*]] = constant 16 : index
// CHECK: %[[VAL_5:.*]] = constant 0 : index		// CHECK: %[[VAL_5:.*]] = constant 0 : index
// CHECK: %[[VAL_6:.*]] = constant 1 : index		// CHECK: %[[VAL_6:.*]] = constant 1 : index
// CHECK: %[[VAL_7:.*]] = alloca(%[[VAL_3]]) : memref<?xindex>		// CHECK: %[[VAL_7:.*]] = alloca(%[[VAL_3]]) : memref<?xindex>
// CHECK: %[[VAL_8:.*]] = alloca(%[[VAL_3]]) : memref<?xindex>		// CHECK: %[[VAL_8:.*]] = alloca(%[[VAL_3]]) : memref<?xindex>
// CHECK: %[[VAL_9:.*]] = alloca(%[[VAL_3]]) : memref<?xf32>		// CHECK: %[[VAL_9:.*]] = alloca(%[[VAL_3]]) : memref<?xf32>
// CHECK: %[[VAL_10:.*]] = alloca() : memref<32xf32>		// CHECK: %[[VAL_10:.*]] = alloca() : memref<32xf32>
// CHECK: %[[VAL_11:.*]] = alloca() : memref<16xf32>		// CHECK: %[[VAL_11:.*]] = alloca() : memref<16xf32>
// CHECK: scf.for %[[VAL_12:.*]] = %[[VAL_5]] to %[[VAL_4]] step %[[VAL_6]] {		// CHECK: scf.for %[[VAL_12:.*]] = %[[VAL_5]] to %[[VAL_4]] step %[[VAL_6]] {
// CHECK: %[[VAL_13:.*]] = load %[[VAL_7]]{{\[}}%[[VAL_12]]] : memref<?xindex>		// CHECK: %[[VAL_13:.*]] = load %[[VAL_7]]{{\[}}%[[VAL_12]]] : memref<?xindex>
// CHECK: %[[VAL_14:.*]] = addi %[[VAL_12]], %[[VAL_6]] : index		// CHECK: %[[VAL_14:.*]] = addi %[[VAL_12]], %[[VAL_6]] : index
// CHECK: %[[VAL_15:.*]] = load %[[VAL_7]]{{\[}}%[[VAL_14]]] : memref<?xindex>		// CHECK: %[[VAL_15:.*]] = load %[[VAL_7]]{{\[}}%[[VAL_14]]] : memref<?xindex>
// CHECK: scf.for %[[VAL_16:.*]] = %[[VAL_13]] to %[[VAL_15]] step %[[VAL_6]] {		// CHECK: %[[VAL_16:.*]] = load %[[VAL_11]]{{\[}}%[[VAL_12]]] : memref<16xf32>
// CHECK: %[[VAL_17:.*]] = load %[[VAL_8]]{{\[}}%[[VAL_16]]] : memref<?xindex>		// CHECK: %[[VAL_17:.*]] = scf.for %[[VAL_18:.*]] = %[[VAL_13]] to %[[VAL_15]] step %[[VAL_6]] iter_args(%[[VAL_19:.*]] = %[[VAL_16]]) -> (f32) {
// CHECK: %[[VAL_18:.*]] = load %[[VAL_9]]{{\[}}%[[VAL_16]]] : memref<?xf32>		// CHECK: %[[VAL_20:.*]] = load %[[VAL_8]]{{\[}}%[[VAL_18]]] : memref<?xindex>
// CHECK: %[[VAL_19:.*]] = load %[[VAL_10]]{{\[}}%[[VAL_17]]] : memref<32xf32>		// CHECK: %[[VAL_21:.*]] = load %[[VAL_9]]{{\[}}%[[VAL_18]]] : memref<?xf32>
// CHECK: %[[VAL_20:.*]] = mulf %[[VAL_18]], %[[VAL_19]] : f32		// CHECK: %[[VAL_22:.*]] = load %[[VAL_10]]{{\[}}%[[VAL_20]]] : memref<32xf32>
// CHECK: %[[VAL_21:.*]] = load %[[VAL_11]]{{\[}}%[[VAL_12]]] : memref<16xf32>		// CHECK: %[[VAL_23:.*]] = mulf %[[VAL_21]], %[[VAL_22]] : f32
// CHECK: %[[VAL_22:.*]] = addf %[[VAL_20]], %[[VAL_21]] : f32		// CHECK: %[[VAL_24:.*]] = addf %[[VAL_23]], %[[VAL_19]] : f32
// CHECK: store %[[VAL_22]], %[[VAL_11]]{{\[}}%[[VAL_12]]] : memref<16xf32>		// CHECK: scf.yield %[[VAL_24]] : f32
// CHECK: }		// CHECK: }
		// CHECK: store %[[VAL_25:.*]], %[[VAL_11]]{{\[}}%[[VAL_12]]] : memref<16xf32>
// CHECK: }		// CHECK: }
// CHECK: %[[VAL_23:.*]] = tensor_load %[[VAL_11]] : memref<16xf32>		// CHECK: %[[VAL_26:.*]] = tensor_load %[[VAL_11]] : memref<16xf32>
// CHECK: return %[[VAL_23]] : tensor<16xf32>		// CHECK: return %[[VAL_26]] : tensor<16xf32>
// CHECK: }		// CHECK: }
func @matvec(%argA: tensor<16x32xf32>, %argb: tensor<32xf32>, %argx: tensor<16xf32>) -> tensor<16xf32> {		func @matvec(%argA: tensor<16x32xf32>, %argb: tensor<32xf32>, %argx: tensor<16xf32>) -> tensor<16xf32> {
%0 = linalg.generic #trait_matvec		%0 = linalg.generic #trait_matvec
ins(%argA, %argb : tensor<16x32xf32>, tensor<32xf32>)		ins(%argA, %argb : tensor<16x32xf32>, tensor<32xf32>)
init(%argx : tensor<16xf32>) {		init(%argx : tensor<16xf32>) {
^bb(%A: f32, %b: f32, %x: f32):		^bb(%A: f32, %b: f32, %x: f32):
%0 = mulf %A, %b : f32		%0 = mulf %A, %b : f32
%1 = addf %0, %x : f32		%1 = addf %0, %x : f32
linalg.yield %1 : f32		linalg.yield %1 : f32
} -> tensor<16xf32>		} -> tensor<16xf32>
return %0 : tensor<16xf32>		return %0 : tensor<16xf32>
}		}

#trait_sum_reduction = {		#trait_sum_reduction = {
indexing_maps = [		indexing_maps = [
affine_map<(i,j) -> (i,j)>, // a		affine_map<(i,j) -> (i,j)>, // A
affine_map<(i,j) -> ()> // x (scalar out)		affine_map<(i,j) -> ()> // x (scalar out)
],		],
sparse = [		sparse = [
[ "D","S" ], // a		[ "D", "S" ], // A
[ ] // x		[ ] // x
],		],
iterator_types = ["reduction", "reduction"],		iterator_types = ["reduction", "reduction"],
doc = "x = SUM_ij a(i,j)"		doc = "x += SUM_ij A(i,j)"
}		}

// CHECK-LABEL: func @sum_reduction(		// CHECK-LABEL: func @sum_reduction(
// CHECK-SAME: %[[VAL_0:.*0]]: tensor<10x20xf32>,		// CHECK-SAME: %[[VAL_0:.*]]: tensor<10x20xf32>,
// CHECK-SAME: %[[VAL_1:.*1]]: tensor<f32>) -> tensor<f32> {		// CHECK-SAME: %[[VAL_1:.*]]: tensor<f32>) -> tensor<f32> {
// CHECK: %[[VAL_2:.*]] = constant 999 : index		// CHECK: %[[VAL_2:.*]] = constant 999 : index
// CHECK: %[[VAL_3:.*]] = constant 10 : index		// CHECK: %[[VAL_3:.*]] = constant 10 : index
// CHECK: %[[VAL_4:.*]] = constant 0 : index		// CHECK: %[[VAL_4:.*]] = constant 0 : index
// CHECK: %[[VAL_5:.*]] = constant 1 : index		// CHECK: %[[VAL_5:.*]] = constant 1 : index
// CHECK: %[[VAL_6:.*]] = alloca(%[[VAL_2]]) : memref<?xindex>		// CHECK: %[[VAL_6:.*]] = alloca(%[[VAL_2]]) : memref<?xindex>
// CHECK: %[[VAL_7:.*]] = alloca(%[[VAL_2]]) : memref<?xindex>		// CHECK: %[[VAL_7:.*]] = alloca(%[[VAL_2]]) : memref<?xindex>
// CHECK: %[[VAL_8:.*]] = alloca(%[[VAL_2]]) : memref<?xf32>		// CHECK: %[[VAL_8:.*]] = alloca(%[[VAL_2]]) : memref<?xf32>
// CHECK: %[[VAL_9:.*]] = alloca() : memref<f32>		// CHECK: %[[VAL_9:.*]] = alloca() : memref<f32>
// CHECK: scf.for %[[VAL_10:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] {		// CHECK: scf.for %[[VAL_10:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] {
// CHECK: %[[VAL_11:.*]] = load %[[VAL_6]]{{\[}}%[[VAL_10]]] : memref<?xindex>		// CHECK: %[[VAL_11:.*]] = load %[[VAL_6]]{{\[}}%[[VAL_10]]] : memref<?xindex>
// CHECK: %[[VAL_12:.*]] = addi %[[VAL_10]], %[[VAL_5]] : index		// CHECK: %[[VAL_12:.*]] = addi %[[VAL_10]], %[[VAL_5]] : index
// CHECK: %[[VAL_13:.*]] = load %[[VAL_6]]{{\[}}%[[VAL_12]]] : memref<?xindex>		// CHECK: %[[VAL_13:.*]] = load %[[VAL_6]]{{\[}}%[[VAL_12]]] : memref<?xindex>
// CHECK: scf.for %[[VAL_14:.*]] = %[[VAL_11]] to %[[VAL_13]] step %[[VAL_5]] {		// CHECK: %[[VAL_14:.*]] = load %[[VAL_9]][] : memref<f32>
// CHECK: %[[VAL_15:.*]] = load %[[VAL_9]][] : memref<f32>		// CHECK: %[[VAL_15:.*]] = scf.for %[[VAL_16:.*]] = %[[VAL_11]] to %[[VAL_13]] step %[[VAL_5]] iter_args(%[[VAL_17:.*]] = %[[VAL_14]]) -> (f32) {
// CHECK: %[[VAL_16:.*]] = load %[[VAL_8]]{{\[}}%[[VAL_14]]] : memref<?xf32>		// CHECK: %[[VAL_18:.*]] = load %[[VAL_8]]{{\[}}%[[VAL_16]]] : memref<?xf32>
// CHECK: %[[VAL_17:.*]] = addf %[[VAL_15]], %[[VAL_16]] : f32		// CHECK: %[[VAL_19:.*]] = addf %[[VAL_17]], %[[VAL_18]] : f32
// CHECK: store %[[VAL_17]], %[[VAL_9]][] : memref<f32>		// CHECK: scf.yield %[[VAL_19]] : f32
// CHECK: }		// CHECK: }
		// CHECK: store %[[VAL_20:.*]], %[[VAL_9]][] : memref<f32>
// CHECK: }		// CHECK: }
// CHECK: %[[VAL_18:.*]] = tensor_load %[[VAL_9]] : memref<f32>		// CHECK: %[[VAL_21:.*]] = tensor_load %[[VAL_9]] : memref<f32>
// CHECK: return %[[VAL_18]] : tensor<f32>		// CHECK: return %[[VAL_21]] : tensor<f32>
// CHECK: }		// CHECK: }
func @sum_reduction(%arga: tensor<10x20xf32>, %argx: tensor<f32>) -> tensor<f32> {		func @sum_reduction(%arga: tensor<10x20xf32>, %argx: tensor<f32>) -> tensor<f32> {
%0 = linalg.generic #trait_sum_reduction		%0 = linalg.generic #trait_sum_reduction
ins(%arga : tensor<10x20xf32>)		ins(%arga : tensor<10x20xf32>)
init(%argx : tensor<f32>) {		init(%argx : tensor<f32>) {
^bb(%a : f32, %x : f32):		^bb(%a : f32, %x : f32):
%0 = addf %x, %a : f32		%0 = addf %x, %a : f32
linalg.yield %0: f32		linalg.yield %0: f32
▲ Show 20 Lines • Show All 60 Lines • ▼ Show 20 Lines	#trait_sampled_dense_dense = {
],		],
sparse = [		sparse = [
[ "S", "S" ], // S		[ "S", "S" ], // S
[ "D", "D" ], // A		[ "D", "D" ], // A
[ "D", "D" ], // B		[ "D", "D" ], // B
[ "D", "D" ] // X		[ "D", "D" ] // X
],		],
iterator_types = ["parallel", "parallel", "reduction"],		iterator_types = ["parallel", "parallel", "reduction"],
doc = "X(i,j) = S(i,j) SUM_k A(i,k) B(k,j)"		doc = "X(i,j) += S(i,j) SUM_k A(i,k) B(k,j)"
}		}

// CHECK-LABEL: func @sampled_dense_dense(		// CHECK-LABEL: func @sampled_dense_dense(
// CHECK-SAME: %[[VAL_0:.*0]]: tensor<?x?xf32>,		// CHECK-SAME: %[[VAL_0:.*0]]: tensor<?x?xf32>,
// CHECK-SAME: %[[VAL_1:.*1]]: tensor<?x?xf32>,		// CHECK-SAME: %[[VAL_1:.*1]]: tensor<?x?xf32>,
// CHECK-SAME: %[[VAL_2:.*2]]: tensor<?x?xf32>,		// CHECK-SAME: %[[VAL_2:.*2]]: tensor<?x?xf32>,
// CHECK-SAME: %[[VAL_3:.*3]]: tensor<?x?xf32>) -> tensor<?x?xf32> {		// CHECK-SAME: %[[VAL_3:.*3]]: tensor<?x?xf32>) -> tensor<?x?xf32> {
// CHECK: %[[VAL_4:.*]] = constant 999 : index		// CHECK: %[[VAL_4:.*]] = constant 999 : index
▲ Show 20 Lines • Show All 47 Lines • ▼ Show 20 Lines	init(%argx : tensor<?x?xf32>) {
^bb(%s : f32, %a : f32, %b : f32, %x : f32):		^bb(%s : f32, %a : f32, %b : f32, %x : f32):
%0 = mulf %a, %b : f32		%0 = mulf %a, %b : f32
%1 = mulf %s, %0 : f32		%1 = mulf %s, %0 : f32
%2 = addf %x, %1 : f32		%2 = addf %x, %1 : f32
linalg.yield %2: f32		linalg.yield %2: f32
} -> tensor<?x?xf32>		} -> tensor<?x?xf32>
return %0 : tensor<?x?xf32>		return %0 : tensor<?x?xf32>
}		}

		#trait_sum_kernel_with_inv = {
		indexing_maps = [
		affine_map<(i,j) -> (i,j)>, // A
		affine_map<(i,j) -> (i,j)>, // B
		affine_map<(i,j) -> (i,j)>, // C
		affine_map<(i,j) -> (i)>, // d
		affine_map<(i,j) -> ()>, // e
		affine_map<(i,j) -> (i)> // x (out)
		],
		sparse = [
		[ "S", "S" ], // A
		[ "D", "S" ], // B
		[ "D", "S" ], // C
		[ "D" ], // d
		[ ], // e
		[ "D" ] // x
		],
		iterator_types = ["parallel", "reduction"],
		doc = "x(i) = SUM_j A(i,j) * B(i,j) * d(i) * e + C(i,j)"
		}

		// CHECK-LABEL: func @sum_kernel_with_inv(
		// CHECK-SAME: %[[VAL_0:.*0]]: tensor<?x?xf32>,
		// CHECK-SAME: %[[VAL_1:.*1]]: tensor<?x?xf32>,
		// CHECK-SAME: %[[VAL_2:.*2]]: tensor<?x?xf32>,
		// CHECK-SAME: %[[VAL_3:.*3]]: tensor<?xf32>,
		// CHECK-SAME: %[[VAL_4:.*4]]: tensor<f32>,
		// CHECK-SAME: %[[VAL_5:.*5]]: tensor<?xf32>) -> tensor<?xf32> {
		// CHECK: %[[VAL_6:.*]] = constant 999 : index
		// CHECK: %[[VAL_7:.*]] = constant 0 : index
		// CHECK: %[[VAL_8:.*]] = constant true
		// CHECK: %[[VAL_9:.*]] = constant 1 : index
		// CHECK: %[[VAL_10:.*]] = alloca(%[[VAL_6]]) : memref<?xindex>
		// CHECK: %[[VAL_11:.*]] = alloca(%[[VAL_6]]) : memref<?xindex>
		// CHECK: %[[VAL_12:.*]] = alloca(%[[VAL_6]]) : memref<?xindex>
		// CHECK: %[[VAL_13:.*]] = alloca(%[[VAL_6]]) : memref<?xindex>
		// CHECK: %[[VAL_14:.*]] = alloca(%[[VAL_6]]) : memref<?xf32>
		// CHECK: %[[VAL_15:.*]] = alloca(%[[VAL_6]]) : memref<?xindex>
		// CHECK: %[[VAL_16:.*]] = alloca(%[[VAL_6]]) : memref<?xindex>
		// CHECK: %[[VAL_17:.*]] = alloca(%[[VAL_6]]) : memref<?xf32>
		// CHECK: %[[VAL_18:.*]] = alloca(%[[VAL_6]]) : memref<?xindex>
		// CHECK: %[[VAL_19:.*]] = alloca(%[[VAL_6]]) : memref<?xindex>
		// CHECK: %[[VAL_20:.*]] = alloca(%[[VAL_6]]) : memref<?xf32>
		// CHECK: %[[VAL_21:.*]] = dim %[[VAL_3]], %[[VAL_7]] : tensor<?xf32>
		// CHECK: %[[VAL_22:.*]] = alloca(%[[VAL_21]]) : memref<?xf32>
		// CHECK: %[[VAL_23:.*]] = alloca() : memref<f32>
		// CHECK: %[[VAL_24:.*]] = dim %[[VAL_5]], %[[VAL_7]] : tensor<?xf32>
		// CHECK: %[[VAL_25:.*]] = alloca(%[[VAL_24]]) : memref<?xf32>
		// CHECK: %[[VAL_26:.*]] = load %[[VAL_23]][] : memref<f32>
		// CHECK: %[[VAL_27:.*]] = load %[[VAL_10]]{{\[}}%[[VAL_7]]] : memref<?xindex>
		// CHECK: %[[VAL_28:.*]] = load %[[VAL_10]]{{\[}}%[[VAL_9]]] : memref<?xindex>
		// CHECK: %[[VAL_29:.*]]:2 = scf.while (%[[VAL_30:.*]] = %[[VAL_27]], %[[VAL_31:.*]] = %[[VAL_7]]) : (index, index) -> (index, index) {
		// CHECK: %[[VAL_32:.*]] = cmpi "ult", %[[VAL_30]], %[[VAL_28]] : index
		// CHECK: scf.condition(%[[VAL_32]]) %[[VAL_30]], %[[VAL_31]] : index, index
		// CHECK: } do {
		// CHECK: ^bb0(%[[VAL_33:.*]]: index, %[[VAL_34:.*]]: index):
		// CHECK: %[[VAL_35:.*]] = load %[[VAL_11]]{{\[}}%[[VAL_33]]] : memref<?xindex>
		// CHECK: %[[VAL_36:.*]] = cmpi "eq", %[[VAL_35]], %[[VAL_34]] : index
		// CHECK: scf.if %[[VAL_36]] {
		// CHECK: %[[VAL_37:.*]] = load %[[VAL_22]]{{\[}}%[[VAL_34]]] : memref<?xf32>
		// CHECK: %[[VAL_38:.*]] = load %[[VAL_12]]{{\[}}%[[VAL_33]]] : memref<?xindex>
		// CHECK: %[[VAL_39:.*]] = addi %[[VAL_33]], %[[VAL_9]] : index
		// CHECK: %[[VAL_40:.*]] = load %[[VAL_12]]{{\[}}%[[VAL_39]]] : memref<?xindex>
		// CHECK: %[[VAL_41:.*]] = load %[[VAL_15]]{{\[}}%[[VAL_34]]] : memref<?xindex>
		// CHECK: %[[VAL_42:.*]] = addi %[[VAL_34]], %[[VAL_9]] : index
		// CHECK: %[[VAL_43:.*]] = load %[[VAL_15]]{{\[}}%[[VAL_42]]] : memref<?xindex>
		// CHECK: %[[VAL_44:.*]] = load %[[VAL_18]]{{\[}}%[[VAL_34]]] : memref<?xindex>
		// CHECK: %[[VAL_45:.*]] = addi %[[VAL_34]], %[[VAL_9]] : index
		// CHECK: %[[VAL_46:.*]] = load %[[VAL_18]]{{\[}}%[[VAL_45]]] : memref<?xindex>
		// CHECK: %[[VAL_47:.*]]:4 = scf.while (%[[VAL_48:.*]] = %[[VAL_38]], %[[VAL_49:.*]] = %[[VAL_41]], %[[VAL_50:.*]] = %[[VAL_44]], %[[VAL_51:.*]] = %[[VAL_7]]) : (index, index, index, index) -> (index, index, index, index) {
		// CHECK: %[[VAL_52:.*]] = cmpi "ult", %[[VAL_48]], %[[VAL_40]] : index
		// CHECK: %[[VAL_53:.*]] = cmpi "ult", %[[VAL_49]], %[[VAL_43]] : index
		// CHECK: %[[VAL_54:.*]] = and %[[VAL_52]], %[[VAL_53]] : i1
		// CHECK: %[[VAL_55:.*]] = cmpi "ult", %[[VAL_50]], %[[VAL_46]] : index
		// CHECK: %[[VAL_56:.*]] = and %[[VAL_54]], %[[VAL_55]] : i1
		// CHECK: scf.condition(%[[VAL_56]]) %[[VAL_48]], %[[VAL_49]], %[[VAL_50]], %[[VAL_51]] : index, index, index, index
		// CHECK: } do {
		// CHECK: ^bb0(%[[VAL_57:.*]]: index, %[[VAL_58:.*]]: index, %[[VAL_59:.*]]: index, %[[VAL_60:.*]]: index):
		// CHECK: %[[VAL_61:.*]] = load %[[VAL_13]]{{\[}}%[[VAL_57]]] : memref<?xindex>
		// CHECK: %[[VAL_62:.*]] = load %[[VAL_16]]{{\[}}%[[VAL_58]]] : memref<?xindex>
		// CHECK: %[[VAL_63:.*]] = load %[[VAL_19]]{{\[}}%[[VAL_59]]] : memref<?xindex>
		// CHECK: %[[VAL_64:.*]] = cmpi "eq", %[[VAL_61]], %[[VAL_60]] : index
		// CHECK: %[[VAL_65:.*]] = cmpi "eq", %[[VAL_62]], %[[VAL_60]] : index
		// CHECK: %[[VAL_66:.*]] = and %[[VAL_64]], %[[VAL_65]] : i1
		// CHECK: %[[VAL_67:.*]] = cmpi "eq", %[[VAL_63]], %[[VAL_60]] : index
		// CHECK: %[[VAL_68:.*]] = and %[[VAL_66]], %[[VAL_67]] : i1
		// CHECK: scf.if %[[VAL_68]] {
		// CHECK: %[[VAL_69:.*]] = load %[[VAL_25]]{{\[}}%[[VAL_34]]] : memref<?xf32>
		// CHECK: %[[VAL_70:.*]] = load %[[VAL_14]]{{\[}}%[[VAL_57]]] : memref<?xf32>
		// CHECK: %[[VAL_71:.*]] = load %[[VAL_17]]{{\[}}%[[VAL_58]]] : memref<?xf32>
		// CHECK: %[[VAL_72:.*]] = mulf %[[VAL_70]], %[[VAL_71]] : f32
		// CHECK: %[[VAL_73:.*]] = mulf %[[VAL_72]], %[[VAL_37]] : f32
		// CHECK: %[[VAL_74:.*]] = mulf %[[VAL_73]], %[[VAL_26]] : f32
		// CHECK: %[[VAL_75:.*]] = load %[[VAL_20]]{{\[}}%[[VAL_59]]] : memref<?xf32>
		// CHECK: %[[VAL_76:.*]] = addf %[[VAL_74]], %[[VAL_75]] : f32
		// CHECK: %[[VAL_77:.*]] = addf %[[VAL_69]], %[[VAL_76]] : f32
		// CHECK: store %[[VAL_77]], %[[VAL_25]]{{\[}}%[[VAL_34]]] : memref<?xf32>
		// CHECK: } else {
		// CHECK: %[[VAL_78:.*]] = cmpi "eq", %[[VAL_61]], %[[VAL_60]] : index
		// CHECK: %[[VAL_79:.*]] = cmpi "eq", %[[VAL_62]], %[[VAL_60]] : index
		// CHECK: %[[VAL_80:.*]] = and %[[VAL_78]], %[[VAL_79]] : i1
		// CHECK: scf.if %[[VAL_80]] {
		// CHECK: %[[VAL_81:.*]] = load %[[VAL_25]]{{\[}}%[[VAL_34]]] : memref<?xf32>
		// CHECK: %[[VAL_82:.*]] = load %[[VAL_14]]{{\[}}%[[VAL_57]]] : memref<?xf32>
		// CHECK: %[[VAL_83:.*]] = load %[[VAL_17]]{{\[}}%[[VAL_58]]] : memref<?xf32>
		// CHECK: %[[VAL_84:.*]] = mulf %[[VAL_82]], %[[VAL_83]] : f32
		// CHECK: %[[VAL_85:.*]] = mulf %[[VAL_84]], %[[VAL_37]] : f32
		// CHECK: %[[VAL_86:.*]] = mulf %[[VAL_85]], %[[VAL_26]] : f32
		// CHECK: %[[VAL_87:.*]] = addf %[[VAL_81]], %[[VAL_86]] : f32
		// CHECK: store %[[VAL_87]], %[[VAL_25]]{{\[}}%[[VAL_34]]] : memref<?xf32>
		// CHECK: } else {
		// CHECK: %[[VAL_88:.*]] = cmpi "eq", %[[VAL_63]], %[[VAL_60]] : index
		// CHECK: scf.if %[[VAL_88]] {
		// CHECK: %[[VAL_89:.*]] = load %[[VAL_25]]{{\[}}%[[VAL_34]]] : memref<?xf32>
		// CHECK: %[[VAL_90:.*]] = load %[[VAL_20]]{{\[}}%[[VAL_59]]] : memref<?xf32>
		// CHECK: %[[VAL_91:.*]] = addf %[[VAL_89]], %[[VAL_90]] : f32
		// CHECK: store %[[VAL_91]], %[[VAL_25]]{{\[}}%[[VAL_34]]] : memref<?xf32>
		// CHECK: } else {
		// CHECK: }
		// CHECK: }
		// CHECK: }
		// CHECK: %[[VAL_92:.*]] = cmpi "eq", %[[VAL_61]], %[[VAL_60]] : index
		// CHECK: %[[VAL_93:.*]] = addi %[[VAL_57]], %[[VAL_9]] : index
		// CHECK: %[[VAL_94:.*]] = select %[[VAL_92]], %[[VAL_93]], %[[VAL_57]] : index
		// CHECK: %[[VAL_95:.*]] = cmpi "eq", %[[VAL_62]], %[[VAL_60]] : index
		// CHECK: %[[VAL_96:.*]] = addi %[[VAL_58]], %[[VAL_9]] : index
		// CHECK: %[[VAL_97:.*]] = select %[[VAL_95]], %[[VAL_96]], %[[VAL_58]] : index
		// CHECK: %[[VAL_98:.*]] = cmpi "eq", %[[VAL_63]], %[[VAL_60]] : index
		// CHECK: %[[VAL_99:.*]] = addi %[[VAL_59]], %[[VAL_9]] : index
		// CHECK: %[[VAL_100:.*]] = select %[[VAL_98]], %[[VAL_99]], %[[VAL_59]] : index
		// CHECK: %[[VAL_101:.*]] = addi %[[VAL_60]], %[[VAL_9]] : index
		// CHECK: scf.yield %[[VAL_94]], %[[VAL_97]], %[[VAL_100]], %[[VAL_101]] : index, index, index, index
		// CHECK: }
		// CHECK: %[[VAL_102:.*]]:3 = scf.while (%[[VAL_103:.*]] = %[[VAL_104:.*]]#0, %[[VAL_105:.*]] = %[[VAL_104]]#1, %[[VAL_106:.*]] = %[[VAL_104]]#3) : (index, index, index) -> (index, index, index) {
		// CHECK: %[[VAL_107:.*]] = cmpi "ult", %[[VAL_103]], %[[VAL_40]] : index
		// CHECK: %[[VAL_108:.*]] = cmpi "ult", %[[VAL_105]], %[[VAL_43]] : index
		// CHECK: %[[VAL_109:.*]] = and %[[VAL_107]], %[[VAL_108]] : i1
		// CHECK: scf.condition(%[[VAL_109]]) %[[VAL_103]], %[[VAL_105]], %[[VAL_106]] : index, index, index
		// CHECK: } do {
		// CHECK: ^bb0(%[[VAL_110:.*]]: index, %[[VAL_111:.*]]: index, %[[VAL_112:.*]]: index):
		// CHECK: %[[VAL_113:.*]] = load %[[VAL_13]]{{\[}}%[[VAL_110]]] : memref<?xindex>
		// CHECK: %[[VAL_114:.*]] = load %[[VAL_16]]{{\[}}%[[VAL_111]]] : memref<?xindex>
		// CHECK: %[[VAL_115:.*]] = cmpi "eq", %[[VAL_113]], %[[VAL_112]] : index
		// CHECK: %[[VAL_116:.*]] = cmpi "eq", %[[VAL_114]], %[[VAL_112]] : index
		// CHECK: %[[VAL_117:.*]] = and %[[VAL_115]], %[[VAL_116]] : i1
		// CHECK: scf.if %[[VAL_117]] {
		// CHECK: %[[VAL_118:.*]] = load %[[VAL_25]]{{\[}}%[[VAL_34]]] : memref<?xf32>
		// CHECK: %[[VAL_119:.*]] = load %[[VAL_14]]{{\[}}%[[VAL_110]]] : memref<?xf32>
		// CHECK: %[[VAL_120:.*]] = load %[[VAL_17]]{{\[}}%[[VAL_111]]] : memref<?xf32>
		// CHECK: %[[VAL_121:.*]] = mulf %[[VAL_119]], %[[VAL_120]] : f32
		// CHECK: %[[VAL_122:.*]] = mulf %[[VAL_121]], %[[VAL_37]] : f32
		// CHECK: %[[VAL_123:.*]] = mulf %[[VAL_122]], %[[VAL_26]] : f32
		// CHECK: %[[VAL_124:.*]] = addf %[[VAL_118]], %[[VAL_123]] : f32
		// CHECK: store %[[VAL_124]], %[[VAL_25]]{{\[}}%[[VAL_34]]] : memref<?xf32>
		// CHECK: } else {
		// CHECK: }
		// CHECK: %[[VAL_125:.*]] = cmpi "eq", %[[VAL_113]], %[[VAL_112]] : index
		// CHECK: %[[VAL_126:.*]] = addi %[[VAL_110]], %[[VAL_9]] : index
		// CHECK: %[[VAL_127:.*]] = select %[[VAL_125]], %[[VAL_126]], %[[VAL_110]] : index
		// CHECK: %[[VAL_128:.*]] = cmpi "eq", %[[VAL_114]], %[[VAL_112]] : index
		// CHECK: %[[VAL_129:.*]] = addi %[[VAL_111]], %[[VAL_9]] : index
		// CHECK: %[[VAL_130:.*]] = select %[[VAL_128]], %[[VAL_129]], %[[VAL_111]] : index
		// CHECK: %[[VAL_131:.*]] = addi %[[VAL_112]], %[[VAL_9]] : index
		// CHECK: scf.yield %[[VAL_127]], %[[VAL_130]], %[[VAL_131]] : index, index, index
		// CHECK: }
		// CHECK: %[[VAL_132:.*]] = load %[[VAL_25]]{{\[}}%[[VAL_34]]] : memref<?xf32>
		// CHECK: %[[VAL_133:.*]] = scf.for %[[VAL_134:.*]] = %[[VAL_135:.*]]#2 to %[[VAL_46]] step %[[VAL_9]] iter_args(%[[VAL_136:.*]] = %[[VAL_132]]) -> (f32) {
		// CHECK: %[[VAL_137:.*]] = load %[[VAL_20]]{{\[}}%[[VAL_134]]] : memref<?xf32>
		// CHECK: %[[VAL_138:.*]] = addf %[[VAL_136]], %[[VAL_137]] : f32
		// CHECK: scf.yield %[[VAL_138]] : f32
		// CHECK: }
		// CHECK: store %[[VAL_139:.*]], %[[VAL_25]]{{\[}}%[[VAL_34]]] : memref<?xf32>
		// CHECK: } else {
		// CHECK: scf.if %[[VAL_8]] {
		// CHECK: %[[VAL_140:.*]] = load %[[VAL_18]]{{\[}}%[[VAL_34]]] : memref<?xindex>
		// CHECK: %[[VAL_141:.*]] = addi %[[VAL_34]], %[[VAL_9]] : index
		// CHECK: %[[VAL_142:.*]] = load %[[VAL_18]]{{\[}}%[[VAL_141]]] : memref<?xindex>
		// CHECK: %[[VAL_143:.*]] = load %[[VAL_25]]{{\[}}%[[VAL_34]]] : memref<?xf32>
		// CHECK: %[[VAL_144:.*]] = scf.for %[[VAL_145:.*]] = %[[VAL_140]] to %[[VAL_142]] step %[[VAL_9]] iter_args(%[[VAL_146:.*]] = %[[VAL_143]]) -> (f32) {
		// CHECK: %[[VAL_147:.*]] = load %[[VAL_20]]{{\[}}%[[VAL_145]]] : memref<?xf32>
		// CHECK: %[[VAL_148:.*]] = addf %[[VAL_146]], %[[VAL_147]] : f32
		// CHECK: scf.yield %[[VAL_148]] : f32
		// CHECK: }
		// CHECK: store %[[VAL_149:.*]], %[[VAL_25]]{{\[}}%[[VAL_34]]] : memref<?xf32>
		// CHECK: } else {
		// CHECK: }
		// CHECK: }
		// CHECK: %[[VAL_150:.*]] = cmpi "eq", %[[VAL_35]], %[[VAL_34]] : index
		// CHECK: %[[VAL_151:.*]] = addi %[[VAL_33]], %[[VAL_9]] : index
		// CHECK: %[[VAL_152:.*]] = select %[[VAL_150]], %[[VAL_151]], %[[VAL_33]] : index
		// CHECK: %[[VAL_153:.*]] = addi %[[VAL_34]], %[[VAL_9]] : index
		// CHECK: scf.yield %[[VAL_152]], %[[VAL_153]] : index, index
		// CHECK: }
		// CHECK: scf.for %[[VAL_154:.*]] = %[[VAL_155:.*]]#1 to %[[VAL_24]] step %[[VAL_9]] {
		// CHECK: %[[VAL_156:.*]] = load %[[VAL_18]]{{\[}}%[[VAL_154]]] : memref<?xindex>
		// CHECK: %[[VAL_157:.*]] = addi %[[VAL_154]], %[[VAL_9]] : index
		// CHECK: %[[VAL_158:.*]] = load %[[VAL_18]]{{\[}}%[[VAL_157]]] : memref<?xindex>
		// CHECK: %[[VAL_159:.*]] = load %[[VAL_25]]{{\[}}%[[VAL_154]]] : memref<?xf32>
		// CHECK: %[[VAL_160:.*]] = scf.for %[[VAL_161:.*]] = %[[VAL_156]] to %[[VAL_158]] step %[[VAL_9]] iter_args(%[[VAL_162:.*]] = %[[VAL_159]]) -> (f32) {
		// CHECK: %[[VAL_163:.*]] = load %[[VAL_20]]{{\[}}%[[VAL_161]]] : memref<?xf32>
		// CHECK: %[[VAL_164:.*]] = addf %[[VAL_162]], %[[VAL_163]] : f32
		// CHECK: scf.yield %[[VAL_164]] : f32
		// CHECK: }
		// CHECK: store %[[VAL_165:.*]], %[[VAL_25]]{{\[}}%[[VAL_154]]] : memref<?xf32>
		// CHECK: }
		// CHECK: %[[VAL_166:.*]] = tensor_load %[[VAL_25]] : memref<?xf32>
		// CHECK: return %[[VAL_166]] : tensor<?xf32>
		// CHECK: }
		// Test input matched by the FileCheck expectations that precede it.
		// Feeds five input tensors (%arga..%arge) and the init/output tensor %argx
		// into a single linalg.generic using #trait_sum_kernel_with_inv (the trait
		// is defined earlier in the file and is not visible in this excerpt).
		// Per element the region yields: x + ((((a * b) * d) * e) + c).
		func @sum_kernel_with_inv(%arga: tensor<?x?xf32>,
		%argb: tensor<?x?xf32>,
		%argc: tensor<?x?xf32>,
		%argd: tensor<?xf32>,
		%arge: tensor<f32>,
		%argx: tensor<?xf32>) -> tensor<?xf32> {
		%0 = linalg.generic #trait_sum_kernel_with_inv
		ins(%arga, %argb, %argc, %argd, %arge : tensor<?x?xf32>,
		tensor<?x?xf32>,
		tensor<?x?xf32>,
		tensor<?xf32>,
		tensor<f32>)
		init(%argx : tensor<?xf32>) {
		// Block arguments follow operand order: a, b, c, d, e are the element
		// values of the five inputs; x is the element of the init tensor.
		^bb(%a : f32, %b : f32, %c : f32, %d : f32, %e : f32, %x : f32):
		// Product chain a * b * d * e.
		%0 = mulf %a, %b : f32
		%1 = mulf %0, %d : f32
		%2 = mulf %1, %e : f32
		// Add c to the product, then accumulate into x.
		%3 = addf %2, %c : f32
		%4 = addf %x, %3 : f32
		linalg.yield %4: f32
		} -> tensor<?xf32>
		// Returns the tensor result of linalg.generic (the region-local %0 is an f32).
		return %0 : tensor<?xf32>
		}

mlir/test/Dialect/Linalg/sparse_3d.mlir

Show First 20 Lines • Show All 1,154 Lines • ▼ Show 20 Lines	#trait_kernel_3d = {
],		],
sparse = [		sparse = [
[ "D", "D", "S" ], // B		[ "D", "D", "S" ], // B
[ "D", "D" ], // C		[ "D", "D" ], // C
[ "D", "D" ], // D		[ "D", "D" ], // D
[ "D", "D" ] // A		[ "D", "D" ] // A
],		],
iterator_types = ["parallel", "parallel", "reduction", "reduction"],		iterator_types = ["parallel", "parallel", "reduction", "reduction"],
doc = "A(i,j) = SUM_k,l B(i,k,l) * C(k,j) * D(l,j)"		doc = "A(i,j) += SUM_k,l B(i,k,l) * C(k,j) * D(l,j)"
}		}

// CHECK-LABEL: func @kernel_3d(		// CHECK-LABEL: func @kernel_3d(
// CHECK-SAME: %[[VAL_0:.*0]]: tensor<?x?xf32>,		// CHECK-SAME: %[[VAL_0:.*0]]: tensor<?x?xf32>,
// CHECK-SAME: %[[VAL_1:.*1]]: tensor<?x?x?xf32>,		// CHECK-SAME: %[[VAL_1:.*1]]: tensor<?x?x?xf32>,
// CHECK-SAME: %[[VAL_2:.*2]]: tensor<?x?xf32>,		// CHECK-SAME: %[[VAL_2:.*2]]: tensor<?x?xf32>,
// CHECK-SAME: %[[VAL_3:.*3]]: tensor<?x?xf32>) -> tensor<?x?xf32> {		// CHECK-SAME: %[[VAL_3:.*3]]: tensor<?x?xf32>) -> tensor<?x?xf32> {
// CHECK: %[[VAL_4:.*]] = constant 999 : index		// CHECK: %[[VAL_4:.*]] = constant 999 : index
▲ Show 20 Lines • Show All 46 Lines • ▼ Show 20 Lines	%0 = linalg.generic #trait_kernel_3d
^bb(%b: f32, %c: f32, %d : f32, %a : f32):		^bb(%b: f32, %c: f32, %d : f32, %a : f32):
%0 = mulf %b, %c : f32		%0 = mulf %b, %c : f32
%1 = mulf %0, %d : f32		%1 = mulf %0, %d : f32
%2 = addf %1, %a : f32		%2 = addf %1, %a : f32
linalg.yield %2 : f32		linalg.yield %2 : f32
} -> tensor<?x?xf32>		} -> tensor<?x?xf32>
return %0 : tensor<?x?xf32>		return %0 : tensor<?x?xf32>
}		}

#trait_sum_reduction = {		#trait_sum_reduction = {
indexing_maps = [		indexing_maps = [
affine_map<(i,j,k) -> (i,j,k)>, // a		affine_map<(i,j,k) -> (i,j,k)>, // A
affine_map<(i,j,k) -> ()> // x (scalar out)		affine_map<(i,j,k) -> ()> // x (scalar out)
],		],
sparse = [		sparse = [
[ "S", "S", "S" ], // a		[ "S", "S", "S" ], // A
[ ] // x		[ ] // x
],		],
iterator_types = ["reduction", "reduction", "reduction"],		iterator_types = ["reduction", "reduction", "reduction"],
doc = "x = SUM_ijk a(i,j,k)"		doc = "x += SUM_ijk A(i,j,k)"
}		}

// CHECK-LABEL: func @sum_reduction(		// CHECK-LABEL: func @sum_reduction(
// CHECK-SAME: %[[VAL_0:.*]]: tensor<10x20x30xf32>,		// CHECK-SAME: %[[VAL_0:.*]]: tensor<10x20x30xf32>,
// CHECK-SAME: %[[VAL_1:.*]]: tensor<f32>) -> tensor<f32> {		// CHECK-SAME: %[[VAL_1:.*]]: tensor<f32>) -> tensor<f32> {
// CHECK: %[[VAL_2:.*]] = constant 999 : index		// CHECK: %[[VAL_2:.*]] = constant 999 : index
// CHECK: %[[VAL_3:.*]] = constant 0 : index		// CHECK: %[[VAL_3:.*]] = constant 0 : index
// CHECK: %[[VAL_4:.*]] = constant 1 : index		// CHECK: %[[VAL_4:.*]] = constant 1 : index
Show All 10 Lines
// CHECK: scf.for %[[VAL_15:.*]] = %[[VAL_13]] to %[[VAL_14]] step %[[VAL_4]] {		// CHECK: scf.for %[[VAL_15:.*]] = %[[VAL_13]] to %[[VAL_14]] step %[[VAL_4]] {
// CHECK: %[[VAL_16:.*]] = load %[[VAL_7]]{{\[}}%[[VAL_15]]] : memref<?xindex>		// CHECK: %[[VAL_16:.*]] = load %[[VAL_7]]{{\[}}%[[VAL_15]]] : memref<?xindex>
// CHECK: %[[VAL_17:.*]] = addi %[[VAL_15]], %[[VAL_4]] : index		// CHECK: %[[VAL_17:.*]] = addi %[[VAL_15]], %[[VAL_4]] : index
// CHECK: %[[VAL_18:.*]] = load %[[VAL_7]]{{\[}}%[[VAL_17]]] : memref<?xindex>		// CHECK: %[[VAL_18:.*]] = load %[[VAL_7]]{{\[}}%[[VAL_17]]] : memref<?xindex>
// CHECK: scf.for %[[VAL_19:.*]] = %[[VAL_16]] to %[[VAL_18]] step %[[VAL_4]] {		// CHECK: scf.for %[[VAL_19:.*]] = %[[VAL_16]] to %[[VAL_18]] step %[[VAL_4]] {
// CHECK: %[[VAL_20:.*]] = load %[[VAL_9]]{{\[}}%[[VAL_19]]] : memref<?xindex>		// CHECK: %[[VAL_20:.*]] = load %[[VAL_9]]{{\[}}%[[VAL_19]]] : memref<?xindex>
// CHECK: %[[VAL_21:.*]] = addi %[[VAL_19]], %[[VAL_4]] : index		// CHECK: %[[VAL_21:.*]] = addi %[[VAL_19]], %[[VAL_4]] : index
// CHECK: %[[VAL_22:.*]] = load %[[VAL_9]]{{\[}}%[[VAL_21]]] : memref<?xindex>		// CHECK: %[[VAL_22:.*]] = load %[[VAL_9]]{{\[}}%[[VAL_21]]] : memref<?xindex>
// CHECK: scf.for %[[VAL_23:.*]] = %[[VAL_20]] to %[[VAL_22]] step %[[VAL_4]] {		// CHECK: %[[VAL_23:.*]] = load %[[VAL_12]][] : memref<f32>
// CHECK: %[[VAL_24:.*]] = load %[[VAL_12]][] : memref<f32>		// CHECK: %[[VAL_24:.*]] = scf.for %[[VAL_25:.*]] = %[[VAL_20]] to %[[VAL_22]] step %[[VAL_4]] iter_args(%[[VAL_26:.*]] = %[[VAL_23]]) -> (f32) {
// CHECK: %[[VAL_25:.*]] = load %[[VAL_11]]{{\[}}%[[VAL_23]]] : memref<?xf32>		// CHECK: %[[VAL_27:.*]] = load %[[VAL_11]]{{\[}}%[[VAL_25]]] : memref<?xf32>
// CHECK: %[[VAL_26:.*]] = addf %[[VAL_24]], %[[VAL_25]] : f32		// CHECK: %[[VAL_28:.*]] = addf %[[VAL_26]], %[[VAL_27]] : f32
// CHECK: store %[[VAL_26]], %[[VAL_12]][] : memref<f32>		// CHECK: scf.yield %[[VAL_28]] : f32
// CHECK: }		// CHECK: }
		// CHECK: store %[[VAL_29:.*]], %[[VAL_12]][] : memref<f32>
// CHECK: }		// CHECK: }
// CHECK: }		// CHECK: }
// CHECK: %[[VAL_27:.*]] = tensor_load %[[VAL_12]] : memref<f32>		// CHECK: %[[VAL_30:.*]] = tensor_load %[[VAL_12]] : memref<f32>
// CHECK: return %[[VAL_27]] : tensor<f32>		// CHECK: return %[[VAL_30]] : tensor<f32>
// CHECK: }		// CHECK: }
func @sum_reduction(%arga: tensor<10x20x30xf32>, %argx: tensor<f32>) -> tensor<f32> {		func @sum_reduction(%arga: tensor<10x20x30xf32>, %argx: tensor<f32>) -> tensor<f32> {
%0 = linalg.generic #trait_sum_reduction		%0 = linalg.generic #trait_sum_reduction
ins(%arga : tensor<10x20x30xf32>)		ins(%arga : tensor<10x20x30xf32>)
init(%argx : tensor<f32>) {		init(%argx : tensor<f32>) {
^bb(%a : f32, %x : f32):		^bb(%a : f32, %x : f32):
%0 = addf %x, %a : f32		%0 = addf %x, %a : f32
linalg.yield %0: f32		linalg.yield %0: f32
} -> tensor<f32>		} -> tensor<f32>
return %0 : tensor<f32>		return %0 : tensor<f32>
}		}

		#trait_sum_reduction_inv = {
		indexing_maps = [
		affine_map<(i,j,k) -> (i,j,k)>, // A
		affine_map<(i,j,k) -> (i)>, // b
		affine_map<(i,j,k) -> ()> // x (scalar out)
		],
		sparse = [
		[ "D", "D", "D" ], // A
		[ "D" ], // b
		[ ] // x
		],
		iterator_types = ["reduction", "reduction", "reduction"],
		doc = "x += SUM_i A(i,j,k) * b(i)"
		}

		// CHECK-LABEL: func @sum_reduction_inv(
		// CHECK-SAME: %[[VAL_0:.*]]: tensor<?x?x?xf32>,
		// CHECK-SAME: %[[VAL_1:.*]]: tensor<?xf32>,
		// CHECK-SAME: %[[VAL_2:.*]]: tensor<f32>) -> tensor<f32> {
		// CHECK: %[[VAL_3:.*]] = constant 2 : index
		// CHECK: %[[VAL_4:.*]] = constant 0 : index
		// CHECK: %[[VAL_5:.*]] = constant 1 : index
		// CHECK: %[[VAL_6:.*]] = dim %[[VAL_0]], %[[VAL_4]] : tensor<?x?x?xf32>
		// CHECK: %[[VAL_7:.*]] = dim %[[VAL_0]], %[[VAL_5]] : tensor<?x?x?xf32>
		// CHECK: %[[VAL_8:.*]] = dim %[[VAL_0]], %[[VAL_3]] : tensor<?x?x?xf32>
		// CHECK: %[[VAL_9:.*]] = alloca(%[[VAL_6]], %[[VAL_7]], %[[VAL_8]]) : memref<?x?x?xf32>
		// CHECK: %[[VAL_10:.*]] = dim %[[VAL_1]], %[[VAL_4]] : tensor<?xf32>
		// CHECK: %[[VAL_11:.*]] = alloca(%[[VAL_10]]) : memref<?xf32>
		// CHECK: %[[VAL_12:.*]] = alloca() : memref<f32>
		// CHECK: scf.for %[[VAL_13:.*]] = %[[VAL_4]] to %[[VAL_10]] step %[[VAL_5]] {
		// CHECK: %[[VAL_14:.*]] = load %[[VAL_11]]{{\[}}%[[VAL_13]]] : memref<?xf32>
		// CHECK: scf.for %[[VAL_15:.*]] = %[[VAL_4]] to %[[VAL_7]] step %[[VAL_5]] {
		// CHECK: %[[VAL_16:.*]] = load %[[VAL_12]][] : memref<f32>
		// CHECK: %[[VAL_17:.*]] = scf.for %[[VAL_18:.*]] = %[[VAL_4]] to %[[VAL_8]] step %[[VAL_5]] iter_args(%[[VAL_19:.*]] = %[[VAL_16]]) -> (f32) {
		// CHECK: %[[VAL_20:.*]] = load %[[VAL_9]]{{\[}}%[[VAL_13]], %[[VAL_15]], %[[VAL_18]]] : memref<?x?x?xf32>
		// CHECK: %[[VAL_21:.*]] = mulf %[[VAL_20]], %[[VAL_14]] : f32
		// CHECK: %[[VAL_22:.*]] = addf %[[VAL_19]], %[[VAL_21]] : f32
		// CHECK: scf.yield %[[VAL_22]] : f32
		// CHECK: }
		// CHECK: store %[[VAL_23:.*]], %[[VAL_12]][] : memref<f32>
		// CHECK: }
		// CHECK: }
		// CHECK: %[[VAL_24:.*]] = tensor_load %[[VAL_12]] : memref<f32>
		// CHECK: return %[[VAL_24]] : tensor<f32>
		// CHECK: }
		// Test input matched by the FileCheck expectations that precede it.
		// Applies #trait_sum_reduction_inv to a 3-d tensor %arga and a 1-d tensor
		// %argb, accumulating into the rank-0 init/output tensor %argx.
		// Per element the region yields: x + (a * b).
		// NOTE(review): the trait's doc string reads "SUM_i", while its
		// iterator_types list three "reduction" iterators -- confirm whether
		// "SUM_ijk" was intended.
		func @sum_reduction_inv(%arga: tensor<?x?x?xf32>,
		%argb: tensor<?xf32>,
		%argx: tensor<f32>) -> tensor<f32> {
		%0 = linalg.generic #trait_sum_reduction_inv
		ins(%arga, %argb : tensor<?x?x?xf32>, tensor<?xf32>)
		init(%argx : tensor<f32>) {
		// a and b are the elements of %arga and %argb; x is the element of %argx.
		^bb(%a : f32, %b : f32, %x : f32):
		%0 = mulf %a, %b : f32
		%1 = addf %x, %0 : f32
		linalg.yield %1: f32
		} -> tensor<f32>
		// Returns the tensor result of linalg.generic (the region-local %0 is an f32).
		return %0 : tensor<f32>
		}

#trait_invariants = {		#trait_invariants = {
indexing_maps = [		indexing_maps = [
affine_map<(i,j,k) -> (i)>, // a		affine_map<(i,j,k) -> (i)>, // a
affine_map<(i,j,k) -> (j)>, // b		affine_map<(i,j,k) -> (j)>, // b
affine_map<(i,j,k) -> (k)>, // c		affine_map<(i,j,k) -> (k)>, // c
affine_map<(i,j,k) -> (i,j,k)> // x		affine_map<(i,j,k) -> (i,j,k)> // X (out)
],		],
sparse = [		sparse = [
[ "D" ], // a		[ "D" ], // a
[ "D" ], // b		[ "D" ], // b
[ "D" ], // c		[ "D" ], // c
[ "D", "D", "D" ] // x		[ "D", "D", "D" ] // X
],		],
iterator_types = ["parallel", "parallel", "parallel"],		iterator_types = ["parallel", "parallel", "parallel"],
doc = "x(i,j,k) = a(i) * b(j) * c(k)"		doc = "X(i,j,k) = a(i) * b(j) * c(k)"
}		}

// CHECK-LABEL: func @invariants(		// CHECK-LABEL: func @invariants(
// CHECK-SAME: %[[VAL_0:.*]]: tensor<10xf32>,		// CHECK-SAME: %[[VAL_0:.*]]: tensor<10xf32>,
// CHECK-SAME: %[[VAL_1:.*]]: tensor<20xf32>,		// CHECK-SAME: %[[VAL_1:.*]]: tensor<20xf32>,
// CHECK-SAME: %[[VAL_2:.*]]: tensor<30xf32>) -> tensor<10x20x30xf32> {		// CHECK-SAME: %[[VAL_2:.*]]: tensor<30xf32>) -> tensor<10x20x30xf32> {
// CHECK: %[[VAL_3:.*]] = constant 10 : index		// CHECK: %[[VAL_3:.*]] = constant 10 : index
// CHECK: %[[VAL_4:.*]] = constant 20 : index		// CHECK: %[[VAL_4:.*]] = constant 20 : index
Show All 34 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][sparse] scalarize reductions in for-loops during sparse codegen
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 312639

mlir/lib/Dialect/Linalg/Transforms/Sparsification.cpp

mlir/test/Dialect/Linalg/sparse_1d.mlir

mlir/test/Dialect/Linalg/sparse_2d.mlir

mlir/test/Dialect/Linalg/sparse_3d.mlir

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][sparse] scalarize reductions in for-loops during sparse codegen
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 312639

mlir/lib/Dialect/Linalg/Transforms/Sparsification.cpp

mlir/test/Dialect/Linalg/sparse_1d.mlir

mlir/test/Dialect/Linalg/sparse_2d.mlir

mlir/test/Dialect/Linalg/sparse_3d.mlir

[mlir][sparse] scalarize reductions in for-loops during sparse codegen
ClosedPublic