This is an archive of the discontinued LLVM Phabricator instance.

[mlir][sparse] Make sparse compiler more admissible.
ClosedPublic

Authored by Peiming on Sep 12 2022, 5:00 PM.

Details

Summary

Previously, the iteration graph was computed without priorities. This patch adds a heuristic to the iteration-graph computation: the topological sort starts with the reduction iterators, which makes reduction iterators (likely) appear as late as possible in the sorted array.

The current sparse compiler also fails to compile the newly added test case.

Diff Detail

Event Timeline

Peiming created this revision.Sep 12 2022, 5:00 PM
Herald added a project: Restricted Project. · View Herald Transcript
Peiming requested review of this revision.Sep 12 2022, 5:00 PM
Peiming edited the summary of this revision. (Show Details)
Peiming edited the summary of this revision. (Show Details)Sep 12 2022, 5:07 PM
Peiming updated this revision to Diff 459597.Sep 12 2022, 5:14 PM

fix comments

Peiming added inline comments.Sep 12 2022, 5:26 PM
mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
350

An optimal solution could be to compute all possible topological sort results and pick the best one, but I am afraid that would be very inefficient. Is there another known algorithm, that I am not aware of, that could be applied here?

Peiming updated this revision to Diff 459604.Sep 12 2022, 5:42 PM

update comments

Peiming added inline comments.Sep 12 2022, 5:46 PM
mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
350

An optimal solution could be to compute all possible topological sort results and pick the best one, but I am afraid that would be very inefficient. Is there another known algorithm, that I am not aware of, that could be applied here?

A priority queue could probably help to get an O(N log N) solution; would that be more desirable?

aartbik added inline comments.Sep 12 2022, 6:19 PM
mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
350

I actually have an allTopSorts() implementation in my local workspace that I used in the past to see which version to pick if there are several. I think n is usually small enough not to worry too much, but also note that we are actually still in the phase where we separate *policy* from *mechanism*, i.e. we make sure we can generate code but assume something else will tell us what choices to make, rather than introducing very advanced heuristics.

Perhaps we can do the same trick as we do for L1822, with an extra case that takes isAdmissible into account?
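The allTopSorts() code mentioned above is not part of this revision; as a rough illustration of the idea only (all names here are hypothetical, not the actual workspace code), enumerating every topological order of an iteration graph can be done with a standard backtracking search over in-degree-zero nodes:

```cpp
#include <cassert>
#include <vector>

// Hypothetical sketch of an allTopSorts()-style enumeration: at each step,
// pick any node whose in-degree is currently zero, recurse, then undo the
// choice. Worst case is exponential in n, but the number of loops n in a
// linalg op is usually small enough not to worry.
static void enumerate(const std::vector<std::vector<int>> &adj,
                      std::vector<int> &inDegree, std::vector<bool> &used,
                      std::vector<int> &current,
                      std::vector<std::vector<int>> &result) {
  const int n = static_cast<int>(adj.size());
  if (static_cast<int>(current.size()) == n) {
    result.push_back(current); // one complete topological order
    return;
  }
  for (int u = 0; u < n; ++u) {
    if (used[u] || inDegree[u] != 0)
      continue;
    used[u] = true;
    for (int v : adj[u])
      --inDegree[v];
    current.push_back(u);
    enumerate(adj, inDegree, used, current, result);
    // Undo, so the next candidate starts from the same state.
    current.pop_back();
    for (int v : adj[u])
      ++inDegree[v];
    used[u] = false;
  }
}

// Returns every topological order of the graph given as adjacency lists,
// where an edge u -> v means loop u must appear before loop v.
std::vector<std::vector<int>> allTopoSorts(
    const std::vector<std::vector<int>> &adj) {
  const int n = static_cast<int>(adj.size());
  std::vector<int> inDegree(n, 0);
  for (int u = 0; u < n; ++u)
    for (int v : adj[u])
      ++inDegree[v];
  std::vector<bool> used(n, false);
  std::vector<int> current;
  std::vector<std::vector<int>> result;
  enumerate(adj, inDegree, used, current, result);
  return result;
}
```

For example, with three loops and a single constraint 0 -> 1 this yields the three admissible orders [0,1,2], [0,2,1], and [2,0,1], from which a cost model could then pick the best one.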

Peiming updated this revision to Diff 459622.Sep 12 2022, 7:03 PM

Replace heuristics with the optimal solution.

Peiming added inline comments.Sep 12 2022, 7:05 PM
mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
235

We do not really need a priority queue in our case; since we only have a limited set of values to sort, this bucket-sort-like approach is faster and more straightforward.
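The bucket idea can be sketched as a Kahn-style worklist topological sort with one bucket per iterator kind, draining parallel iterators before reduction iterators so that reductions end up as late as the constraints allow. This is only an illustration of the approach under discussion, not the actual Sparsification.cpp code; the names are made up:

```cpp
#include <cassert>
#include <queue>
#include <vector>

enum class IterKind { kParallel, kReduction };

// Kahn's algorithm over the iteration graph (edge u -> v: loop u must come
// before loop v). Among the currently available (in-degree zero) nodes,
// parallel iterators are always drained before reduction iterators, which
// pushes reductions as late in the order as the constraints allow. With only
// two iterator kinds, two plain queues replace a priority queue, giving O(N+E)
// instead of O(N log N).
std::vector<int> topoSortReductionsLast(
    const std::vector<std::vector<int>> &adj,
    const std::vector<IterKind> &kind) {
  const int n = static_cast<int>(adj.size());
  std::vector<int> inDegree(n, 0);
  for (int u = 0; u < n; ++u)
    for (int v : adj[u])
      ++inDegree[v];
  // Bucket 0: parallel iterators, bucket 1: reduction iterators.
  std::queue<int> buckets[2];
  for (int u = 0; u < n; ++u)
    if (inDegree[u] == 0)
      buckets[kind[u] == IterKind::kReduction].push(u);
  std::vector<int> order;
  while (!buckets[0].empty() || !buckets[1].empty()) {
    // Prefer a parallel iterator whenever one is available.
    std::queue<int> &q = buckets[0].empty() ? buckets[1] : buckets[0];
    int u = q.front();
    q.pop();
    order.push_back(u);
    for (int v : adj[u])
      if (--inDegree[v] == 0)
        buckets[kind[v] == IterKind::kReduction].push(v);
  }
  return order; // fewer than n entries means the graph has a cycle
}
```

For instance, with no ordering constraints at all, a reduction iterator is emitted after every parallel one; a constraint forcing the reduction early is still honored, since the buckets only break ties among currently available nodes.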

Peiming updated this revision to Diff 459802.Sep 13 2022, 10:41 AM

+ try a less strict constraint for topo sort if the op is inadmissible

Peiming added inline comments.Sep 13 2022, 10:54 AM
mlir/test/Dialect/SparseTensor/sparse_sddmm.mlir
162

It should have better spatial locality here compared to the previously generated code, right?

Peiming added inline comments.Sep 13 2022, 10:59 AM
mlir/test/Dialect/SparseTensor/sparse_sddmm.mlir
161–162

These two accesses have worse spatial locality: one is accessed by row and the other by column.

aartbik added inline comments.Sep 13 2022, 2:07 PM
mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
228

interator -> iterator

230

A nice side-effect of using the "worklist" is that you also removed the recursion, which will make a lot of people happy ;-)

251

in-degree

276

make this

if (b) {

} else if (!adjM...) {

}

304–305

in-degree

339

Could we have achieved a similar heuristic by adding one extra "mask" layer in which we add parallel -> reduce edges?
It is not completely the same, but it feels like we are using heuristics at various levels now.

Conversely, with the new heuristic could we get rid of this "unrelated" step?

1821

Keep the original comment for this block

// Builds the tensor expression for the Linalg operation in SSA form.

1842

Make the nested if a && on conditions for shorter code

mlir/test/Dialect/SparseTensor/sparse_sddmm.mlir
162

We actually end up with better code here, since the expand/compress is better for keeping the resulting matrix sparse.
However, ironically, we end up with a par-red-par loop, whereas the original was par-par-red.

Peiming updated this revision to Diff 459872.Sep 13 2022, 2:25 PM
Peiming marked 8 inline comments as done.

Address comments

mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
339

I think it might be hard, because what we really care about is only the *first* reduction iterator.

Also, I am not sure when and how the Undef level is used here; maybe you are the better person to answer that ;-)

mlir/test/Dialect/SparseTensor/sparse_sddmm.mlir
162

Lol, yeah. But we picked a stricter (better) set of ordering constraints.

Peiming marked an inline comment as done.Sep 13 2022, 3:01 PM
aartbik accepted this revision.Sep 13 2022, 3:25 PM
aartbik added inline comments.
mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
339

Yeah, I think we should try removing this and keep the heuristics simple (dense -> sparse with mask), and then apply your much better (parallel < reduction) ordering for each invocation. But we can try removing that after this revision goes in....

This revision is now accepted and ready to land.Sep 13 2022, 3:25 PM
This revision was landed with ongoing or failed builds.Sep 14 2022, 8:59 AM
This revision was automatically updated to reflect the committed changes.