Diff 316481

mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h

Show First 20 Lines • Show All 861 Lines • ▼ Show 20 Lines

enum class SparseIntType { kNative, kI64, kI32 };

/// Sparsification options.

struct SparsificationOptions {

SparsificationOptions(SparseParallelizationStrategy p,

SparseVectorizationStrategy v, unsigned vl,

SparseIntType pt, SparseIntType it)

: parallelizationStrategy(p), vectorizationStrategy(v), vectorLength(vl),

ptrType(pt), indType(it) {}

ptrType(pt), indType(it) {

// TODO: remove restriction when vectors with index elements are supported

assert((v != SparseVectorizationStrategy::kAnyStorageInnerLoop ||

(ptrType != SparseIntType::kNative &&

indType != SparseIntType::kNative)) &&

"This combination requires support for vectors with index elements");

penpornkUnsubmitted

Done

indType != SparseIntType::kNative)) &&

- "this combination requires support for vectors with index elements");

+ "This combination requires support for vectors with index elements");

}

SparsificationOptions()

Nit: Upper-case T?

penpornk: Nit: Upper-case T?

}

SparsificationOptions()

: SparsificationOptions(SparseParallelizationStrategy::kNone,

SparseVectorizationStrategy::kNone, 1u,

SparseIntType::kNative, SparseIntType::kNative) {}

SparseParallelizationStrategy parallelizationStrategy;

SparseVectorizationStrategy vectorizationStrategy;

unsigned vectorLength;

SparseIntType ptrType;

Show All 12 Lines

mlir/lib/Dialect/Linalg/Transforms/Sparsification.cpp

Show All 40 Lines

// variable lengths and std::vector for vectors with fixed lengths. // variable lengths and std::vector for vectors with fixed lengths.

//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//

#include "mlir/Dialect/Linalg/IR/LinalgOps.h" #include "mlir/Dialect/Linalg/IR/LinalgOps.h"

#include "mlir/Dialect/Linalg/Transforms/Transforms.h" #include "mlir/Dialect/Linalg/Transforms/Transforms.h"

#include "mlir/Dialect/Linalg/Utils/Utils.h" #include "mlir/Dialect/Linalg/Utils/Utils.h"

#include "mlir/Dialect/SCF/SCF.h" #include "mlir/Dialect/SCF/SCF.h"

#include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/Dialect/StandardOps/IR/Ops.h"

#include "mlir/IR/Matchers.h"

using namespace mlir; using namespace mlir;

namespace { namespace {

enum class Kind { kTensor, kInvariant, kMulF, kMulI, kAddF, kAddI }; enum class Kind { kTensor, kInvariant, kMulF, kMulI, kAddF, kAddI };

enum class Dim { kSparse, kDense, kUndef }; enum class Dim { kSparse, kDense, kUndef };

▲ Show 20 Lines • Show All 239 Lines • ▼ Show 20 Lines

struct CodeGen { struct CodeGen {

CodeGen(linalg::SparsificationOptions o, unsigned numTensors, CodeGen(linalg::SparsificationOptions o, unsigned numTensors,

unsigned numLoops) unsigned numLoops)

: options(o), loops(numLoops), sizes(numLoops), buffers(numTensors), : options(o), loops(numLoops), sizes(numLoops), buffers(numTensors),

pointers(numTensors, std::vector<Value>(numLoops)), pointers(numTensors, std::vector<Value>(numLoops)),

indices(numTensors, std::vector<Value>(numLoops)), indices(numTensors, std::vector<Value>(numLoops)),

highs(numTensors, std::vector<Value>(numLoops)), highs(numTensors, std::vector<Value>(numLoops)),

pidxs(numTensors, std::vector<Value>(numLoops)), pidxs(numTensors, std::vector<Value>(numLoops)),

idxs(numTensors, std::vector<Value>(numLoops)), redExp(-1u), redVal() {} idxs(numTensors, std::vector<Value>(numLoops)), redExp(-1u), redVal(),

curVecLength(1), curVecMask() {}

/// Sparsification options. /// Sparsification options.

linalg::SparsificationOptions options; linalg::SparsificationOptions options;

/// Universal dense indices and upper bounds (by index). The loops array /// Universal dense indices and upper bounds (by index). The loops array

/// is updated with the value of the universal dense index in the current /// is updated with the value of the universal dense index in the current

/// loop. The sizes array is set once with the inferred dimension sizes. /// loop. The sizes array is set once with the inferred dimension sizes.

std::vector<Value> loops; std::vector<Value> loops;

std::vector<Value> sizes; std::vector<Value> sizes;

/// Buffers for storing dense and sparse numerical values (by tensor). /// Buffers for storing dense and sparse numerical values (by tensor).

Show All 9 Lines struct CodeGen {

std::vector<std::vector<Value>> pidxs; std::vector<std::vector<Value>> pidxs;

std::vector<std::vector<Value>> idxs; std::vector<std::vector<Value>> idxs;

/// Current reduction, updated during code generation. When indices of a /// Current reduction, updated during code generation. When indices of a

/// reduction are exhausted, all inner loops can "scalarize" the reduction. /// reduction are exhausted, all inner loops can "scalarize" the reduction.

// TODO: currently only done for (a chain of) innermost for-loops, where it // TODO: currently only done for (a chain of) innermost for-loops, where it

// is most effective; we could generalize to more outer and while-loops. // is most effective; we could generalize to more outer and while-loops.

unsigned redExp; unsigned redExp;

Value redVal; Value redVal;

// Current vector length and mask.

unsigned curVecLength;

penpornkUnsubmitted

Done

Nit: Should we make the name more meaningful, e.g., vLen, etc.?

penpornk: Nit: Should we make the name more meaningful, e.g., vLen, etc.?

ftynseUnsubmitted

Done

MLIR has a distinct preference for long verbose names, even vLen is frowned upon. Something like currentVectorLength, or curVecLength in the worst case. (We also tend to talk about vector size rather than length).

ftynse: MLIR has a distinct preference for long verbose names, even `vLen` is frowned upon. Something…

Value curVecMask;

}; };

} // namespace } // namespace

/// Helper method to inspect sparse annotations in the linalg operation. /// Helper method to inspect sparse annotations in the linalg operation.

/// Fills the per-dimension sparsity information for all tensors. /// Fills the per-dimension sparsity information for all tensors.

static void findSparseAnnotations(Merger &merger, linalg::GenericOp op) { static void findSparseAnnotations(Merger &merger, linalg::GenericOp op) {

unsigned numTensors = op.getNumShapedOperands(); unsigned numTensors = op.getNumShapedOperands();

▲ Show 20 Lines • Show All 215 Lines • ▼ Show 20 Lines for (unsigned t = 0; t < numTensors; t++) {

} else { } else {

auto sparseTp = MemRefType::get({ShapedType::kDynamicSize}, auto sparseTp = MemRefType::get({ShapedType::kDynamicSize},

tensorType.getElementType()); tensorType.getElementType());

codegen.buffers[t] = rewriter.create<AllocaOp>(loc, sparseTp, unknown); codegen.buffers[t] = rewriter.create<AllocaOp>(loc, sparseTp, unknown);

} }

/// Builds the vector type used for vectorized accesses through `ptr`: a
/// vector with `codegen.curVecLength` elements whose element type is taken
/// from the memref type of `ptr`.
static VectorType vectorType(CodeGen &codegen, Value ptr) {
  auto memrefTp = ptr.getType().cast<MemRefType>();
  return VectorType::get(codegen.curVecLength, memrefTp.getElementType());
}

/// Constructs vector iteration mask.

static Value genVectorMask(CodeGen &codegen, PatternRewriter &rewriter,

Value iv, Value lo, Value hi, Value step) {

Location loc = iv.getLoc();

VectorType mtp =

VectorType::get(codegen.curVecLength, rewriter.getIntegerType(1));

// Special case if the vector length evenly divides the trip count (for

// example, "for i = 0, 128, 16"). A constant all-true mask is generated

// so that all subsequent masked memory operations are immediately folded

// into unconditional memory operations.

IntegerAttr loInt, hiInt, stepInt;

if (matchPattern(lo, m_Constant(&loInt)) &&

matchPattern(hi, m_Constant(&hiInt)) &&

matchPattern(step, m_Constant(&stepInt))) {

if (((hiInt.getInt() - loInt.getInt()) % stepInt.getInt()) == 0)

penpornkUnsubmitted

Done

Nit: int64_t hi here shares the name with the parameter Value hi. They are in different scopes so it works, but it bugs me a bit. Can we rename?

penpornk: Nit: `int64_t hi` here shares the name with the parameter `Value hi`. They are in different…

return rewriter.create<vector::ConstantMaskOp>(

ftynseUnsubmitted

Done

This can use matchPattern(value, m_Constant(attr))

ftynse: This can use `matchPattern(value, m_Constant(attr))`

aartbikAuthorUnsubmitted

Done

Ah, I was looking for something like that. Great!

aartbik: Ah, I was looking for something like that. Great!

loc, mtp, rewriter.getI64ArrayAttr(codegen.curVecLength));

}

// Otherwise, generate a vector mask that avoids overrunning the upperbound

// during vector execution. Here we rely on subsequent loop optimizations to

// avoid executing the mask in all iterations, for example, by splitting the

// loop into an unconditional vector loop and a scalar cleanup loop.

Value end = rewriter.create<SubIOp>(loc, hi, iv);

return rewriter.create<vector::CreateMaskOp>(loc, mtp, end);

penpornkUnsubmitted

Done

// avoid executing the mask in all iterations, for example, by splitting the

- // loop into an unconditional vector lopo and a scalar cleanup loop.

+ // loop into an unconditional vector loop and a scalar cleanup loop.

Value end = rewriter.create<SubIOp>(loc, hi, iv);

Typo.

penpornk: Typo.

}

/// Generates a vectorized load lhs = a[ind[lo:hi]] or lhs = a[lo:hi].

static Value genVectorLoad(CodeGen &codegen, PatternRewriter &rewriter,

Value ptr, ArrayRef<Value> args) {

penpornkUnsubmitted

Done

Why the .. around rhs?

penpornk: Why the `..` around rhs?

penpornkUnsubmitted

Done

Nit: At first glance, I interpreted / as a division and that confused me. Maybe change it to "or"?

penpornk: Nit: At first glance, I interpreted `/` as a division and that confused me. Maybe change it to…

Location loc = ptr.getLoc();

VectorType vtp = vectorType(codegen, ptr);

Value pass = rewriter.create<ConstantOp>(loc, vtp, rewriter.getZeroAttr(vtp));

if (args.back().getType().isa<VectorType>())

return rewriter.create<vector::GatherOp>(loc, vtp, ptr, args.back(),

codegen.curVecMask, pass);

return rewriter.create<vector::MaskedLoadOp>(loc, vtp, ptr, args,

codegen.curVecMask, pass);

}

/// Generates a vectorized store a[ind[lo:hi]] = rhs or a[lo:hi] = rhs.

static void genVectorStore(CodeGen &codegen, PatternRewriter &rewriter,

Value rhs, Value ptr, ArrayRef<Value> args) {

Location loc = ptr.getLoc();

penpornkUnsubmitted

Done

codegen.vMask, pass_thru);

}

- /// Generates a vectorized store a[ind[lo:hi] / a[lo:hi] = rhs.

+ /// Generates a vectorized store a[ind[lo:hi]] or a[lo:hi] = rhs.

static void genVectorStore(CodeGen &codegen, PatternRewriter &rewriter,

Missing a closing bracket. Also, same comment on /.

penpornk: Missing a closing bracket. Also, same comment on `/`.

if (args.back().getType().isa<VectorType>())

rewriter.create<vector::ScatterOp>(loc, ptr, args.back(),

codegen.curVecMask, rhs);

else

rewriter.create<vector::MaskedStoreOp>(loc, ptr, args, codegen.curVecMask,

rhs);

}

/// Broadcasts the scalar `val` into a vector with `codegen.curVecLength`
/// elements of the same type. Subsequent loop optimizations are relied upon
/// to hoist this invariant broadcast out of the vector loop.
static Value genVectorInvariantValue(CodeGen &codegen,
                                     PatternRewriter &rewriter, Value val) {
  Location loc = val.getLoc();
  Type scalarTp = val.getType();
  VectorType vtp = VectorType::get(codegen.curVecLength, scalarTp);
  return rewriter.create<vector::BroadcastOp>(loc, vtp, val);
}

/// Generates a load on a dense or sparse tensor. /// Generates a load on a dense or sparse tensor.

static Value genTensorLoad(Merger &merger, CodeGen &codegen, static Value genTensorLoad(Merger &merger, CodeGen &codegen,

PatternRewriter &rewriter, linalg::GenericOp op, PatternRewriter &rewriter, linalg::GenericOp op,

unsigned exp) { unsigned exp) {

// Test if the load was hoisted to a higher loop nest. // Test if the load was hoisted to a higher loop nest.

Value val = merger.exp(exp).val; Value val = merger.exp(exp).val;

if (val) if (val)

return val; return val;

// Actual load. // Actual load.

SmallVector<Value, 4> args; SmallVector<Value, 4> args;

unsigned tensor = merger.exp(exp).e0; unsigned tensor = merger.exp(exp).e0;

auto map = op.getIndexingMap(tensor); auto map = op.getIndexingMap(tensor);

bool sparse = false; bool sparse = false;

for (unsigned i = 0, m = map.getNumResults(); i < m; ++i) { for (unsigned i = 0, m = map.getNumResults(); i < m; ++i) {

unsigned idx = map.getDimPosition(i); unsigned idx = map.getDimPosition(i);

args.push_back(codegen.loops[idx]); // universal dense index args.push_back(codegen.loops[idx]); // universal dense index

if (sparse || merger.isDim(tensor, idx, Dim::kSparse)) { if (sparse || merger.isDim(tensor, idx, Dim::kSparse)) {

sparse = true; sparse = true;

args.clear(); args.clear();

args.push_back(codegen.pidxs[tensor][idx]); // position index args.push_back(codegen.pidxs[tensor][idx]); // position index

} }

Location loc = op.getLoc(); Location loc = op.getLoc();

Value ptr = codegen.buffers[tensor]; Value ptr = codegen.buffers[tensor];

if (codegen.curVecLength > 1)

return genVectorLoad(codegen, rewriter, ptr, args);

return rewriter.create<LoadOp>(loc, ptr, args); return rewriter.create<LoadOp>(loc, ptr, args);

} }

/// Generates a store on a dense tensor. /// Generates a store on a dense tensor.

static void genTensorStore(Merger &merger, CodeGen &codegen, static void genTensorStore(Merger &merger, CodeGen &codegen,

PatternRewriter &rewriter, linalg::GenericOp op, PatternRewriter &rewriter, linalg::GenericOp op,

unsigned tensor, Value rhs) { unsigned tensor, Value rhs) {

// Test if this is a scalarized reduction. // Test if this is a scalarized reduction.

unsigned lhs = op.getNumShapedOperands() - 1; unsigned lhs = op.getNumShapedOperands() - 1;

if (lhs == tensor && codegen.redVal) { if (lhs == tensor && codegen.redVal) {

codegen.redVal = rhs; codegen.redVal = rhs;

return; return;

} }

// Actual load. // Actual store.

SmallVector<Value, 4> args; SmallVector<Value, 4> args;

auto map = op.getIndexingMap(tensor); auto map = op.getIndexingMap(tensor);

for (unsigned i = 0, m = map.getNumResults(); i < m; ++i) { for (unsigned i = 0, m = map.getNumResults(); i < m; ++i) {

unsigned idx = map.getDimPosition(i); unsigned idx = map.getDimPosition(i);

args.push_back(codegen.loops[idx]); // universal dense index args.push_back(codegen.loops[idx]); // universal dense index

} }

Location loc = op.getLoc(); Location loc = op.getLoc();

Value ptr = codegen.buffers[tensor]; Value ptr = codegen.buffers[tensor];

if (codegen.curVecLength > 1)

genVectorStore(codegen, rewriter, rhs, ptr, args);

else

rewriter.create<StoreOp>(loc, rhs, ptr, args); rewriter.create<StoreOp>(loc, rhs, ptr, args);

} }

/// Generates a pointer/index load from the sparse storage scheme. /// Generates a pointer/index load from the sparse storage scheme.

static Value genLoad(PatternRewriter &rewriter, Location loc, Value ptr, static Value genLoad(CodeGen &codegen, PatternRewriter &rewriter, Location loc,

Value s) { Value ptr, Value s) {

if (codegen.curVecLength > 1)

return genVectorLoad(codegen, rewriter, ptr, {s});

Value load = rewriter.create<LoadOp>(loc, ptr, s); Value load = rewriter.create<LoadOp>(loc, ptr, s);

return load.getType().isa<IndexType>() return load.getType().isa<IndexType>()

? load ? load

: rewriter.create<IndexCastOp>(loc, load, rewriter.getIndexType()); : rewriter.create<IndexCastOp>(loc, load, rewriter.getIndexType());

} }

/// Generates an invariant value. /// Generates an invariant value.

static Value genInvariantValue(Merger &merger, CodeGen &codegen, static Value genInvariantValue(Merger &merger, CodeGen &codegen,

PatternRewriter &rewriter, unsigned exp) { PatternRewriter &rewriter, unsigned exp) {

return merger.exp(exp).val; Value val = merger.exp(exp).val;

if (codegen.curVecLength > 1)

return genVectorInvariantValue(codegen, rewriter, val);

return val;

} }

/// Recursively generates tensor expression. /// Recursively generates tensor expression.

static Value genExp(Merger &merger, CodeGen &codegen, PatternRewriter &rewriter, static Value genExp(Merger &merger, CodeGen &codegen, PatternRewriter &rewriter,

linalg::GenericOp op, unsigned exp) { linalg::GenericOp op, unsigned exp) {

if (merger.exp(exp).kind == Kind::kTensor) if (merger.exp(exp).kind == Kind::kTensor)

return genTensorLoad(merger, codegen, rewriter, op, exp); return genTensorLoad(merger, codegen, rewriter, op, exp);

else if (merger.exp(exp).kind == Kind::kInvariant) else if (merger.exp(exp).kind == Kind::kInvariant)

▲ Show 20 Lines • Show All 71 Lines • ▼ Show 20 Lines if (inits[b]) {

for (; pat != 0; pat--) { for (; pat != 0; pat--) {

if (codegen.pidxs[tensor][topSort[pat - 1]]) if (codegen.pidxs[tensor][topSort[pat - 1]])

break; break;

} }

Value ptr = codegen.pointers[tensor][idx]; Value ptr = codegen.pointers[tensor][idx];

Value one = rewriter.create<ConstantIndexOp>(loc, 1); Value one = rewriter.create<ConstantIndexOp>(loc, 1);

Value p0 = (pat == 0) ? rewriter.create<ConstantIndexOp>(loc, 0) Value p0 = (pat == 0) ? rewriter.create<ConstantIndexOp>(loc, 0)

: codegen.pidxs[tensor][topSort[pat - 1]]; : codegen.pidxs[tensor][topSort[pat - 1]];

codegen.pidxs[tensor][idx] = genLoad(rewriter, loc, ptr, p0); codegen.pidxs[tensor][idx] = genLoad(codegen, rewriter, loc, ptr, p0);

Value p1 = rewriter.create<AddIOp>(loc, p0, one); Value p1 = rewriter.create<AddIOp>(loc, p0, one);

codegen.highs[tensor][idx] = genLoad(rewriter, loc, ptr, p1); codegen.highs[tensor][idx] = genLoad(codegen, rewriter, loc, ptr, p1);

} else { } else {

// Dense index still in play. // Dense index still in play.

needsUniv = true; needsUniv = true;

} }

// Initialize the universal dense index. // Initialize the universal dense index.

codegen.loops[idx] = rewriter.create<ConstantIndexOp>(loc, 0); codegen.loops[idx] = rewriter.create<ConstantIndexOp>(loc, 0);

return needsUniv; return needsUniv;

} }

/// Returns true if the loop under consideration must be vectorized. Any
/// implicit inner loop in the Linalg operation is a candidate. Whether it is
/// actually converted to SIMD code depends on the requested strategy:
///   - kNone:                never vectorize;
///   - kDenseInnerLoop:      vectorize inner loops that are not sparse;
///   - kAnyStorageInnerLoop: vectorize every inner loop.
static bool isVectorFor(CodeGen &codegen, bool isInner, bool isSparse) {
  switch (codegen.options.vectorizationStrategy) {
  case linalg::SparseVectorizationStrategy::kNone:
    return false;
  case linalg::SparseVectorizationStrategy::kDenseInnerLoop:
    return isInner && !isSparse;
  case linalg::SparseVectorizationStrategy::kAnyStorageInnerLoop:
    return isInner;
  }
  // The switch above covers every enumerator and has no default label;
  // mark the fall-through path unreachable so that control cannot run off
  // the end of a non-void function.
  llvm_unreachable("unexpected vectorization strategy");
}

/// Returns parallelization strategy. Any implicit loop in the Linalg operation

/// that is marked "parallel" is a candidate. Whether it is actually converted

/// to a parallel operation depends on the requested strategy.

static bool isParallelFor(CodeGen &codegen, bool isOuter, bool isReduction,

penpornkUnsubmitted

Done

Nit: Forall doesn't really imply parallelization. How about isParallelFor?

penpornk: Nit: `Forall` doesn't really imply parallelization. How about `isParallelFor`?

bool isSparse, bool isVector) {

switch (codegen.options.parallelizationStrategy) {

case linalg::SparseParallelizationStrategy::kNone:

return false;

case linalg::SparseParallelizationStrategy::kDenseOuterLoop:

return isOuter && !isSparse && !isReduction && !isVector;

case linalg::SparseParallelizationStrategy::kAnyStorageOuterLoop:

return isOuter && !isReduction && !isVector;

case linalg::SparseParallelizationStrategy::kDenseAnyLoop:

return !isSparse && !isReduction && !isVector;

case linalg::SparseParallelizationStrategy::kAnyStorageAnyLoop:

return !isReduction && !isVector;

penpornkUnsubmitted

Done

A loop can be both parallelized and vectorized. Is the !isVector to simplify things for now? (I'm mostly thinking of a non-reduction loop -- I remember that we'll keep parallelizing reduction loops for later.) Should we mention this in the comment?

penpornk: A loop can be both parallelized and vectorized. Is the `!isVector` to simplify things for now?

aartbikAuthorUnsubmitted

Done

Ah, for now *same* loop parallelization/vectorization is not supported (we could, I agree). So this logic makes sure that is not changed until proper support is added.

aartbik: Ah, for now *same* loop parallelization/vectorization is not supported (we could, I agree). So…

}

/// Generates a for-loop on a single index. /// Generates a for-loop on a single index.

static Operation *genFor(Merger &merger, CodeGen &codegen, static Operation *genFor(Merger &merger, CodeGen &codegen,

PatternRewriter &rewriter, linalg::GenericOp op, PatternRewriter &rewriter, linalg::GenericOp op,

bool isOuter, bool isInner, unsigned idx, bool isOuter, bool isInner, unsigned idx,

llvm::BitVector &indices) { llvm::BitVector &indices) {

unsigned fb = indices.find_first(); unsigned fb = indices.find_first();

unsigned tensor = merger.tensor(fb); unsigned tensor = merger.tensor(fb);

assert(idx == merger.index(fb)); assert(idx == merger.index(fb));

// Parallelization strategy. Any implicit loop in the Linalg operation that

// is marked "parallel" is a candidate. Whether it is actually converted to

// a parallel operation depends on the requested strategy.

auto iteratorTypes = op.iterator_types().getValue(); auto iteratorTypes = op.iterator_types().getValue();

bool isReduction = linalg::isReductionIteratorType(iteratorTypes[idx]);

bool isSparse = merger.isDim(fb, Dim::kSparse); bool isSparse = merger.isDim(fb, Dim::kSparse);

bool isParallel = linalg::isParallelIteratorType(iteratorTypes[idx]); bool isVector = isVectorFor(codegen, isInner, isSparse);

switch (codegen.options.parallelizationStrategy) { bool isParallel =

case linalg::SparseParallelizationStrategy::kNone: isParallelFor(codegen, isOuter, isReduction, isSparse, isVector);

isParallel = false;

break; // Prepare vector length.

case linalg::SparseParallelizationStrategy::kDenseOuterLoop: if (isVector)

isParallel &= isOuter && !isSparse; codegen.curVecLength = codegen.options.vectorLength;

break;

case linalg::SparseParallelizationStrategy::kAnyStorageOuterLoop:

isParallel &= isOuter;

break;

case linalg::SparseParallelizationStrategy::kDenseAnyLoop:

isParallel &= !isSparse;

break;

case linalg::SparseParallelizationStrategy::kAnyStorageAnyLoop:

break;

}

// Loop bounds and increment. // Loop bounds and increment.

Location loc = op.getLoc(); Location loc = op.getLoc();

Value lo; Value lo = isSparse ? codegen.pidxs[tensor][idx] : codegen.loops[idx];

Value hi; Value hi = isSparse ? codegen.highs[tensor][idx] : codegen.sizes[idx];

Value step = rewriter.create<ConstantIndexOp>(loc, 1); Value step = rewriter.create<ConstantIndexOp>(loc, codegen.curVecLength);

Value index;

if (isSparse) {

lo = codegen.pidxs[tensor][idx];

hi = codegen.highs[tensor][idx];

} else {

lo = codegen.loops[idx];

hi = codegen.sizes[idx];

}

// Emit a parallel loop. // Emit a parallel loop.

if (isParallel) { if (isParallel) {

assert(!isVector);

scf::ParallelOp parOp = rewriter.create<scf::ParallelOp>(loc, lo, hi, step); scf::ParallelOp parOp = rewriter.create<scf::ParallelOp>(loc, lo, hi, step);

if (isSparse) if (isSparse)

codegen.pidxs[tensor][idx] = parOp.getInductionVars()[0]; codegen.pidxs[tensor][idx] = parOp.getInductionVars()[0];

else else

codegen.loops[idx] = parOp.getInductionVars()[0]; codegen.loops[idx] = parOp.getInductionVars()[0];

rewriter.setInsertionPointToStart(parOp.getBody()); rewriter.setInsertionPointToStart(parOp.getBody());

return parOp; return parOp;

} }

// Emit a sequential loop, potentially with a scalarized reduction. // Emit a sequential loop, potentially with a scalarized reduction.

bool scalarRed = isInner && codegen.redExp != -1u; bool scalarRed = isInner && codegen.redExp != -1u;

SmallVector<Value, 4> operands; SmallVector<Value, 4> operands;

if (scalarRed) { if (scalarRed) {

Value load = Value load;

codegen.redVal if (codegen.redVal) {

? codegen.redVal // chained with previous for-loop load = codegen.redVal; // chained with previous for-loop

: genTensorLoad(merger, codegen, rewriter, op, codegen.redExp); } else if (isVector) {

// TODO: assumes + reductions for now

VectorType vtp = vectorType(codegen, codegen.buffers[codegen.redExp]);

load = rewriter.create<ConstantOp>(loc, vtp, rewriter.getZeroAttr(vtp));

} else {

load = genTensorLoad(merger, codegen, rewriter, op, codegen.redExp);

}

operands.push_back(load); operands.push_back(load);

} }

scf::ForOp forOp = rewriter.create<scf::ForOp>(loc, lo, hi, step, operands); scf::ForOp forOp = rewriter.create<scf::ForOp>(loc, lo, hi, step, operands);

if (scalarRed) { if (scalarRed) {

codegen.redVal = merger.exp(codegen.redExp).val = codegen.redVal = merger.exp(codegen.redExp).val =

forOp.getRegionIterArgs().front(); forOp.getRegionIterArgs().front();

} }

// Assign induction variable to sparse or dense index. // Assign induction variable to sparse or dense index.

Value iv = forOp.getInductionVar();

if (isSparse) if (isSparse)

codegen.pidxs[tensor][idx] = forOp.getInductionVar(); codegen.pidxs[tensor][idx] = iv;

else else

codegen.loops[idx] = forOp.getInductionVar(); codegen.loops[idx] = iv;

rewriter.setInsertionPointToStart(forOp.getBody()); rewriter.setInsertionPointToStart(forOp.getBody());

// Share vector iteration mask between all subsequent loads/stores.

if (isVector)

codegen.curVecMask = genVectorMask(codegen, rewriter, iv, lo, hi, step);

return forOp; return forOp;

} }

/// Emit a while-loop for co-iteration over multiple indices. /// Emit a while-loop for co-iteration over multiple indices.

static Operation *genWhile(Merger &merger, CodeGen &codegen, static Operation *genWhile(Merger &merger, CodeGen &codegen,

PatternRewriter &rewriter, linalg::GenericOp op, PatternRewriter &rewriter, linalg::GenericOp op,

unsigned idx, bool needsUniv, unsigned idx, bool needsUniv,

llvm::BitVector &indices) { llvm::BitVector &indices) {

▲ Show 20 Lines • Show All 70 Lines • ▼ Show 20 Lines static void genLocals(Merger &merger, CodeGen &codegen,

// Initialize sparse indices. // Initialize sparse indices.

Value min; Value min;

for (unsigned b = 0, be = locals.size(); b < be; b++) { for (unsigned b = 0, be = locals.size(); b < be; b++) {

if (locals[b] && merger.isDim(b, Dim::kSparse)) { if (locals[b] && merger.isDim(b, Dim::kSparse)) {

unsigned tensor = merger.tensor(b); unsigned tensor = merger.tensor(b);

assert(idx == merger.index(b)); assert(idx == merger.index(b));

Value ptr = codegen.indices[tensor][idx]; Value ptr = codegen.indices[tensor][idx];

Value s = codegen.pidxs[tensor][idx]; Value s = codegen.pidxs[tensor][idx];

Value load = genLoad(rewriter, loc, ptr, s); Value load = genLoad(codegen, rewriter, loc, ptr, s);

codegen.idxs[tensor][idx] = load; codegen.idxs[tensor][idx] = load;

if (!needsUniv) { if (!needsUniv) {

if (min) { if (min) {

Value cmp = Value cmp =

rewriter.create<CmpIOp>(loc, CmpIPredicate::ult, load, min); rewriter.create<CmpIOp>(loc, CmpIPredicate::ult, load, min);

min = rewriter.create<SelectOp>(loc, cmp, load, min); min = rewriter.create<SelectOp>(loc, cmp, load, min);

} else { } else {

min = load; min = load;

▲ Show 20 Lines • Show All 95 Lines • ▼ Show 20 Lines if (at == topSort.size()) {

genTensorStore(merger, codegen, rewriter, op, lhs, rhs); genTensorStore(merger, codegen, rewriter, op, lhs, rhs);

return; return;

} }

// Construct iteration lattices for current loop index, with L0 at top. // Construct iteration lattices for current loop index, with L0 at top.

// Then emit initialization code for the loop sequence at this level. // Then emit initialization code for the loop sequence at this level.

// We maintain the universal dense index if dense indices are still // We maintain the universal dense index if dense indices are still

// in play for a non-singleton loop sequence. // in play for a non-singleton loop sequence.

// Location loc = op.getLoc(); Location loc = op.getLoc();

unsigned idx = topSort[at]; unsigned idx = topSort[at];

unsigned lts = merger.optimizeSet(buildLattices(merger, op, exp, idx)); unsigned lts = merger.optimizeSet(buildLattices(merger, op, exp, idx));

unsigned lsize = merger.set(lts).size(); unsigned lsize = merger.set(lts).size();

assert(lsize != 0); assert(lsize != 0);

unsigned l0 = merger.set(lts)[0]; unsigned l0 = merger.set(lts)[0];

unsigned ldx = at == 0 ? -1u : topSort[at - 1]; unsigned ldx = at == 0 ? -1u : topSort[at - 1];

genInvariants(merger, codegen, rewriter, op, exp, ldx, /*hoist=*/true); genInvariants(merger, codegen, rewriter, op, exp, ldx, /*hoist=*/true);

bool needsUniv = genInit(merger, codegen, rewriter, op, topSort, at, bool needsUniv = genInit(merger, codegen, rewriter, op, topSort, at,

merger.lat(l0).bits) && merger.lat(l0).bits) &&

lsize > 1; lsize > 1;

// Emit a loop for every lattice point L0 >= Li. // Emit a loop for every lattice point L0 >= Li.

for (unsigned i = 0; i < lsize; i++) { for (unsigned i = 0; i < lsize; i++) {

unsigned li = merger.set(lts)[i]; unsigned li = merger.set(lts)[i];

// Emit loop. // Emit loop.

codegen.curVecLength = 1;

llvm::BitVector indices = merger.lat(li).simple; llvm::BitVector indices = merger.lat(li).simple;

Operation *loop = Operation *loop =

genLoop(merger, codegen, rewriter, op, topSort, at, needsUniv, indices); genLoop(merger, codegen, rewriter, op, topSort, at, needsUniv, indices);

genLocals(merger, codegen, rewriter, op, topSort, at, needsUniv, genLocals(merger, codegen, rewriter, op, topSort, at, needsUniv,

merger.lat(li).bits); merger.lat(li).bits);

// Visit all lattices points with Li >= Lj to generate the // Visit all lattices points with Li >= Lj to generate the

// loop-body, possibly with if statements for coiteration. // loop-body, possibly with if statements for coiteration.

Show All 24 Lines for (unsigned i = 0; i < lsize; i++) {

if (isWhile) { if (isWhile) {

scf::WhileOp whileOp = cast<scf::WhileOp>(loop); scf::WhileOp whileOp = cast<scf::WhileOp>(loop);

rewriter.setInsertionPointToEnd(&whileOp.after().front()); rewriter.setInsertionPointToEnd(&whileOp.after().front());

genWhileInduction(merger, codegen, rewriter, op, idx, needsUniv, genWhileInduction(merger, codegen, rewriter, op, idx, needsUniv,

merger.lat(li).bits, whileOp.results()); merger.lat(li).bits, whileOp.results());

} else { } else {

needsUniv = false; needsUniv = false;

if (codegen.redVal) { if (codegen.redVal) {

rewriter.create<scf::YieldOp>(op.getLoc(), codegen.redVal); rewriter.create<scf::YieldOp>(loc, codegen.redVal);

codegen.redVal = loop->getResult(0); codegen.redVal = loop->getResult(0);

} }

rewriter.setInsertionPointAfter(loop); rewriter.setInsertionPointAfter(loop);

} }

// Wrap-up loop sequence. // Wrap-up loop sequence.

Value red = codegen.redVal; Value red = codegen.redVal;

if (red) { if (red) {

codegen.redVal = merger.exp(codegen.redExp).val = Value(); // end chain codegen.redVal = merger.exp(codegen.redExp).val = Value(); // end chain

unsigned lhs = op.getNumShapedOperands() - 1; unsigned lhs = op.getNumShapedOperands() - 1;

if (codegen.curVecLength > 1) {

codegen.curVecLength = 1;

Value ld = genTensorLoad(merger, codegen, rewriter, op, codegen.redExp);

red = rewriter.create<vector::ReductionOp>(

loc, ld.getType(), rewriter.getStringAttr("add"), red, ld);

}

genTensorStore(merger, codegen, rewriter, op, lhs, red); genTensorStore(merger, codegen, rewriter, op, lhs, red);

} }

codegen.loops[idx] = Value();

genInvariants(merger, codegen, rewriter, op, exp, ldx, /*hoist=*/false); genInvariants(merger, codegen, rewriter, op, exp, ldx, /*hoist=*/false);

codegen.loops[idx] = Value();

} }

namespace { namespace {

/// Sparse rewriting rule for generic Linalg operation. /// Sparse rewriting rule for generic Linalg operation.

struct GenericOpSparsifier : public OpRewritePattern<linalg::GenericOp> { struct GenericOpSparsifier : public OpRewritePattern<linalg::GenericOp> {

public: public:

GenericOpSparsifier(MLIRContext *context, linalg::SparsificationOptions o) GenericOpSparsifier(MLIRContext *context, linalg::SparsificationOptions o)

▲ Show 20 Lines • Show All 55 Lines • Show Last 20 Lines

mlir/test/Dialect/Linalg/sparse_vector.mlir

This file was added.

				// RUN: mlir-opt %s -test-sparsification="vectorization-strategy=0 ptr-type=2 ind-type=2 vl=16" | \
				// RUN: FileCheck %s --check-prefix=CHECK-VEC0
				// RUN: mlir-opt %s -test-sparsification="vectorization-strategy=1 ptr-type=2 ind-type=2 vl=16" | \
				// RUN: FileCheck %s --check-prefix=CHECK-VEC1
				// RUN: mlir-opt %s -test-sparsification="vectorization-strategy=2 ptr-type=2 ind-type=2 vl=16" | \
				// RUN: FileCheck %s --check-prefix=CHECK-VEC2

				#trait_scale_d = {
				indexing_maps = [
				affine_map<(i) -> (i)>, // a
				affine_map<(i) -> (i)> // x (out)
				],
				sparse = [
				[ "D" ], // a
				[ "D" ] // x
				],
				iterator_types = ["parallel"],
				doc = "x(i) = a(i) * b"
				}

				//
				// CHECK-VEC0-LABEL: func @scale_d
				// CHECK-VEC0-DAG: %[[c0:.*]] = constant 0 : index
				// CHECK-VEC0-DAG: %[[c1:.*]] = constant 1 : index
				penpornkUnsubmitted Done Reply Inline Actions Thank you for manually naming the captures! \dTvTb/ penpornk: Thank you for manually naming the captures! \dTvTb/
				// CHECK-VEC0-DAG: %[[c1024:.*]] = constant 1024 : index
				// CHECK-VEC0: scf.for %[[i:.*]] = %[[c0]] to %[[c1024]] step %[[c1]] {
				// CHECK-VEC0: %[[l:.*]] = load %{{.*}}[%[[i]]] : memref<1024xf32>
				// CHECK-VEC0: %[[m:.*]] = mulf %[[l]], %{{.*}} : f32
				// CHECK-VEC0: store %[[m]], %{{.*}}[%[[i]]] : memref<1024xf32>
				// CHECK-VEC0: }
				// CHECK-VEC0: return
				//
				// CHECK-VEC1-LABEL: func @scale_d
				// CHECK-VEC1-DAG: %[[c0:.*]] = constant 0 : index
				// CHECK-VEC1-DAG: %[[c16:.*]] = constant 16 : index
				// CHECK-VEC1-DAG: %[[c1024:.*]] = constant 1024 : index
				// CHECK-VEC1: scf.for %[[i:.*]] = %[[c0]] to %[[c1024]] step %[[c16]] {
				// CHECK-VEC1: %[[r:.*]] = vector.transfer_read %{{.*}}[%[[i]]], %{{.*}} {masked = [false]} : memref<1024xf32>, vector<16xf32>
				// CHECK-VEC1: %[[b:.*]] = vector.broadcast %{{.*}} : f32 to vector<16xf32>
				// CHECK-VEC1: %[[m:.*]] = mulf %[[r]], %[[b]] : vector<16xf32>
				// CHECK-VEC1: vector.transfer_write %[[m]], %{{.*}}[%[[i]]] {masked = [false]} : vector<16xf32>, memref<1024xf32>
				// CHECK-VEC1: }
				// CHECK-VEC1: return
				//
				// CHECK-VEC2-LABEL: func @scale_d
				// CHECK-VEC2-DAG: %[[c0:.*]] = constant 0 : index
				// CHECK-VEC2-DAG: %[[c16:.*]] = constant 16 : index
				// CHECK-VEC2-DAG: %[[c1024:.*]] = constant 1024 : index
				// CHECK-VEC2: scf.for %[[i:.*]] = %[[c0]] to %[[c1024]] step %[[c16]] {
				// CHECK-VEC2: %[[r:.*]] = vector.transfer_read %{{.*}}[%[[i]]], %{{.*}} {masked = [false]} : memref<1024xf32>, vector<16xf32>
				// CHECK-VEC2: %[[b:.*]] = vector.broadcast %{{.*}} : f32 to vector<16xf32>
				// CHECK-VEC2: %[[m:.*]] = mulf %[[r]], %[[b]] : vector<16xf32>
				// CHECK-VEC2: vector.transfer_write %[[m]], %{{.*}}[%[[i]]] {masked = [false]} : vector<16xf32>, memref<1024xf32>
				// CHECK-VEC2: }
				// CHECK-VEC2: return
				//
				func @scale_d(%arga: tensor<1024xf32>, %scale: f32) -> tensor<1024xf32> {
				%0 = linalg.generic #trait_scale_d
				ins(%arga: tensor<1024xf32>)
				outs(%arga: tensor<1024xf32>) {
				^bb(%a: f32, %s : f32):
				%0 = mulf %a, %scale : f32
				linalg.yield %0 : f32
				} -> tensor<1024xf32>
				return %0 : tensor<1024xf32>
				}

				#trait_mul_s = {
				indexing_maps = [
				affine_map<(i) -> (i)>, // a
				affine_map<(i) -> (i)>, // b
				affine_map<(i) -> (i)> // x (out)
				],
				sparse = [
				[ "S" ], // a
				[ "D" ], // b
				[ "D" ] // x
				],
				iterator_types = ["parallel"],
				doc = "x(i) = a(i) * b(i)"
				}

				//
				// CHECK-VEC0-LABEL: func @mul_s
				// CHECK-VEC0-DAG: %[[c0:.*]] = constant 0 : index
				// CHECK-VEC0-DAG: %[[c1:.*]] = constant 1 : index
				// CHECK-VEC0: %[[p:.*]] = load %{{.*}}[%[[c0]]] : memref<?xi32>
				// CHECK-VEC0: %[[q:.*]] = index_cast %[[p]] : i32 to index
				// CHECK-VEC0: %[[r:.*]] = load %{{.*}}[%[[c1]]] : memref<?xi32>
				// CHECK-VEC0: %[[s:.*]] = index_cast %[[r]] : i32 to index
				// CHECK-VEC0: scf.for %[[i:.*]] = %[[q]] to %[[s]] step %[[c1]] {
				// CHECK-VEC0: %[[li:.*]] = load %{{.*}}[%[[i]]] : memref<?xi32>
				// CHECK-VEC0: %[[ci:.*]] = index_cast %[[li]] : i32 to index
				// CHECK-VEC0: %[[la:.*]] = load %{{.*}}[%[[i]]] : memref<?xf32>
				// CHECK-VEC0: %[[lb:.*]] = load %{{.*}}[%[[ci]]] : memref<1024xf32>
				// CHECK-VEC0: %[[m:.*]] = mulf %[[la]], %[[lb]] : f32
				penpornkUnsubmitted Not Done Reply Inline Actions Just for my information, why do we have 2 and 3 after `load %{{.*}}`? penpornk: Just for my information, why do we have 2 and 3 after `load %{{.*}}`?
				aartbikAuthorUnsubmitted Done Reply Inline Actions The "l" was for load, and then numbering up. But I renamed without numbering, but referring back to the tensor names, hopefully more clear. aartbik: The "l" was for load, and then numbering up. But I renamed without numbering, but referring…
				aartbikAuthorUnsubmitted Done Reply Inline Actions Oh, I see now you did not refer to the label name. I will remove this too. aartbik: Oh, I see now you did not refer to the label name. I will remove this too.
				// CHECK-VEC0: store %[[m]], %{{.*}}[%[[ci]]] : memref<1024xf32>
				// CHECK-VEC0: }
				// CHECK-VEC0: return
				//
				// CHECK-VEC1-LABEL: func @mul_s
				// CHECK-VEC1-DAG: %[[c0:.*]] = constant 0 : index
				// CHECK-VEC1-DAG: %[[c1:.*]] = constant 1 : index
				// CHECK-VEC1: %[[p:.*]] = load %{{.*}}[%[[c0]]] : memref<?xi32>
				// CHECK-VEC1: %[[q:.*]] = index_cast %[[p]] : i32 to index
				// CHECK-VEC1: %[[r:.*]] = load %{{.*}}[%[[c1]]] : memref<?xi32>
				// CHECK-VEC1: %[[s:.*]] = index_cast %[[r]] : i32 to index
				// CHECK-VEC1: scf.for %[[i:.*]] = %[[q]] to %[[s]] step %[[c1]] {
				// CHECK-VEC1: %[[li:.*]] = load %{{.*}}[%[[i]]] : memref<?xi32>
				// CHECK-VEC1: %[[ci:.*]] = index_cast %[[li]] : i32 to index
				// CHECK-VEC1: %[[la:.*]] = load %{{.*}}[%[[i]]] : memref<?xf32>
				// CHECK-VEC1: %[[lb:.*]] = load %{{.*}}[%[[ci]]] : memref<1024xf32>
				// CHECK-VEC1: %[[m:.*]] = mulf %[[la]], %[[lb]] : f32
				// CHECK-VEC1: store %[[m]], %{{.*}}[%[[ci]]] : memref<1024xf32>
				// CHECK-VEC1: }
				// CHECK-VEC1: return
				//
				// CHECK-VEC2-LABEL: func @mul_s
				// CHECK-VEC2-DAG: %[[c0:.*]] = constant 0 : index
				// CHECK-VEC2-DAG: %[[c1:.*]] = constant 1 : index
				// CHECK-VEC2-DAG: %[[c16:.*]] = constant 16 : index
				// CHECK-VEC2: %[[p:.*]] = load %{{.*}}[%[[c0]]] : memref<?xi32>
				// CHECK-VEC2: %[[q:.*]] = index_cast %[[p]] : i32 to index
				// CHECK-VEC2: %[[r:.*]] = load %{{.*}}[%[[c1]]] : memref<?xi32>
				// CHECK-VEC2: %[[s:.*]] = index_cast %[[r]] : i32 to index
				// CHECK-VEC2: scf.for %[[i:.*]] = %[[q]] to %[[s]] step %[[c16]] {
				// CHECK-VEC2: %[[sub:.*]] = subi %[[s]], %[[i]] : index
				// CHECK-VEC2: %[[mask:.*]] = vector.create_mask %[[sub]] : vector<16xi1>
				// CHECK-VEC2: %[[li:.*]] = vector.maskedload %{{.*}}[%[[i]]], %[[mask]], %{{.*}} : memref<?xi32>, vector<16xi1>, vector<16xi32> into vector<16xi32>
				// CHECK-VEC2: %[[la:.*]] = vector.maskedload %{{.*}}[%[[i]]], %[[mask]], %{{.*}} : memref<?xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
				// CHECK-VEC2: %[[lb:.*]] = vector.gather %{{.*}}[%[[li]]], %[[mask]], %{{.*}} : memref<1024xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
				// CHECK-VEC2: %[[m:.*]] = mulf %[[la]], %[[lb]] : vector<16xf32>
				// CHECK-VEC2: vector.scatter %{{.*}}[%[[li]]], %[[mask]], %[[m]] : memref<1024xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32>
				// CHECK-VEC2: }
				// CHECK-VEC2: return
				//
				func @mul_s(%arga: tensor<1024xf32>, %argb: tensor<1024xf32>) -> tensor<1024xf32> {
				%0 = linalg.generic #trait_mul_s
				ins(%arga, %argb: tensor<1024xf32>, tensor<1024xf32>)
				outs(%arga: tensor<1024xf32>) {
				^bb(%a: f32, %b: f32, %s : f32):
				%0 = mulf %a, %b : f32
				linalg.yield %0 : f32
				} -> tensor<1024xf32>
				return %0 : tensor<1024xf32>
				}

				#trait_reduction_d = {
				indexing_maps = [
				affine_map<(i) -> (i)>, // a
				affine_map<(i) -> (i)>, // b
				affine_map<(i) -> ()> // x (out)
				],
				sparse = [
				[ "D" ], // a
				[ "D" ], // b
				[ ] // x
				],
				iterator_types = ["reduction"],
				doc = "x += a(i) * b(i)"
				}

				//
				// CHECK-VEC0-LABEL: func @reduction_d
				// CHECK-VEC0-DAG: %[[c0:.*]] = constant 0 : index
				// CHECK-VEC0-DAG: %[[c1:.*]] = constant 1 : index
				// CHECK-VEC0-DAG: %[[c1024:.*]] = constant 1024 : index
				// CHECK-VEC0: %[[red:.*]] = scf.for %[[i:.*]] = %[[c0]] to %[[c1024]] step %[[c1]] iter_args(%[[red_in:.*]] = %{{.*}}) -> (f32) {
				// CHECK-VEC0: %[[la:.*]] = load %{{.*}}[%[[i]]] : memref<1024xf32>
				penpornkUnsubmitted Done Reply Inline Actions Do we need a `%[[v0:.*]] = constant 0 : f32` so the `%{{.*}}` in `iter_args` can be `%[[v0]]`? penpornk: Do we need a `%[[v0:.*]] = constant 0 : f32` so the `%{{.*}}` in `iter_args` can be `%[[v0]]`?
				aartbikAuthorUnsubmitted Done Reply Inline Actions In this case we actually load the "tensor" scalar as initial value, and I did not want to commit to that pattern to rigidity yet. aartbik: In this case we actually load the "tensor" scalar as initial value, and I did not want to…
				// CHECK-VEC0: %[[lb:.*]] = load %{{.*}}[%[[i]]] : memref<1024xf32>
				// CHECK-VEC0: %[[m:.*]] = mulf %[[la]], %[[lb]] : f32
				// CHECK-VEC0: %[[a:.*]] = addf %[[red_in]], %[[m]] : f32
				// CHECK-VEC0: scf.yield %[[a]] : f32
				// CHECK-VEC0: }
				// CHECK-VEC0: return
				//
				// CHECK-VEC1-LABEL: func @reduction_d
				// CHECK-VEC1-DAG: %[[c0:.*]] = constant 0 : index
				// CHECK-VEC1-DAG: %[[c16:.*]] = constant 16 : index
				// CHECK-VEC1-DAG: %[[c1024:.*]] = constant 1024 : index
				// CHECK-VEC1-DAG: %[[v0:.*]] = constant dense<0.000000e+00> : vector<16xf32>
				// CHECK-VEC1: %[[red:.*]] = scf.for %[[i:.*]] = %[[c0]] to %[[c1024]] step %[[c16]] iter_args(%[[red_in:.*]] = %[[v0]]) -> (vector<16xf32>) {
				// CHECK-VEC1: %[[la:.*]] = vector.transfer_read %{{.*}}[%[[i]]], %cst_0 {masked = [false]} : memref<1024xf32>, vector<16xf32>
				// CHECK-VEC1: %[[lb:.*]] = vector.transfer_read %{{.*}}[%[[i]]], %cst_0 {masked = [false]} : memref<1024xf32>, vector<16xf32>
				// CHECK-VEC1: %[[m:.*]] = mulf %[[la]], %[[lb]] : vector<16xf32>
				// CHECK-VEC1: %[[a:.*]] = addf %[[red_in]], %[[m]] : vector<16xf32>
				// CHECK-VEC1: scf.yield %[[a]] : vector<16xf32>
				// CHECK-VEC1: }
				// CHECK-VEC1: %{{.*}} = vector.reduction "add", %[[red]], %{{.*}} : vector<16xf32> into f32
				// CHECK-VEC1: return
				//
				// CHECK-VEC2-LABEL: func @reduction_d
				// CHECK-VEC2-DAG: %[[c0:.*]] = constant 0 : index
				// CHECK-VEC2-DAG: %[[c16:.*]] = constant 16 : index
				// CHECK-VEC2-DAG: %[[c1024:.*]] = constant 1024 : index
				// CHECK-VEC2-DAG: %[[v0:.*]] = constant dense<0.000000e+00> : vector<16xf32>
				// CHECK-VEC2: %[[red:.*]] = scf.for %[[i:.*]] = %[[c0]] to %[[c1024]] step %[[c16]] iter_args(%[[red_in:.*]] = %[[v0]]) -> (vector<16xf32>) {
				// CHECK-VEC2: %[[la:.*]] = vector.transfer_read %{{.*}}[%[[i]]], %cst_0 {masked = [false]} : memref<1024xf32>, vector<16xf32>
				// CHECK-VEC2: %[[lb:.*]] = vector.transfer_read %{{.*}}[%[[i]]], %cst_0 {masked = [false]} : memref<1024xf32>, vector<16xf32>
				// CHECK-VEC2: %[[m:.*]] = mulf %[[la]], %[[lb]] : vector<16xf32>
				// CHECK-VEC2: %[[a:.*]] = addf %[[red_in]], %[[m]] : vector<16xf32>
				// CHECK-VEC2: scf.yield %[[a]] : vector<16xf32>
				// CHECK-VEC2: }
				// CHECK-VEC2: %{{.*}} = vector.reduction "add", %[[red]], %{{.*}} : vector<16xf32> into f32
				// CHECK-VEC2: return
				//
				func @reduction_d(%arga: tensor<1024xf32>, %argb: tensor<1024xf32>, %argx: tensor<f32>) -> tensor<f32> {
				%0 = linalg.generic #trait_reduction_d
				ins(%arga, %argb: tensor<1024xf32>, tensor<1024xf32>)
				outs(%argx: tensor<f32>) {
				^bb(%a: f32, %b : f32, %x : f32):
				%0 = mulf %a, %b : f32
				%1 = addf %x, %0 : f32
				linalg.yield %1 : f32
				} -> tensor<f32>
				return %0 : tensor<f32>
				}

				#trait_mul_ds = {
				indexing_maps = [
				affine_map<(i,j) -> (i,j)>, // a
				affine_map<(i,j) -> (i,j)>, // b
				affine_map<(i,j) -> (i,j)> // x (out)
				],
				sparse = [
				[ "D", "S" ], // a
				[ "D", "D" ], // b
				[ "D", "D" ] // x
				],
				iterator_types = ["parallel", "parallel"],
				doc = "x(i,j) = a(i,j) * b(i,j)"
				}

				//
				// CHECK-VEC0-LABEL: func @mul_ds
				// CHECK-VEC0-DAG: %[[c0:.*]] = constant 0 : index
				// CHECK-VEC0-DAG: %[[c1:.*]] = constant 1 : index
				// CHECK-VEC0-DAG: %[[c512:.*]] = constant 512 : index
				// CHECK-VEC0: scf.for %[[i:.*]] = %[[c0]] to %[[c512]] step %[[c1]] {
				// CHECK-VEC0: %[[p:.*]] = load %{{.*}}[%[[i]]] : memref<?xi32>
				// CHECK-VEC0: %[[q:.*]] = index_cast %[[p]] : i32 to index
				// CHECK-VEC0: %[[a:.*]] = addi %[[i]], %[[c1]] : index
				// CHECK-VEC0: %[[r:.*]] = load %{{.*}}[%[[a]]] : memref<?xi32>
				// CHECK-VEC0: %[[s:.*]] = index_cast %[[r]] : i32 to index
				// CHECK-VEC0: scf.for %[[j:.*]] = %[[q]] to %[[s]] step %[[c1]] {
				// CHECK-VEC0: %[[lj:.*]] = load %{{.*}}[%[[j]]] : memref<?xi32>
				// CHECK-VEC0: %[[cj:.*]] = index_cast %[[lj]] : i32 to index
				// CHECK-VEC0: %[[la:.*]] = load %{{.*}}[%[[j]]] : memref<?xf32>
				// CHECK-VEC0: %[[lb:.*]] = load %{{.*}}[%[[i]], %[[cj]]] : memref<512x1024xf32>
				// CHECK-VEC0: %[[m:.*]] = mulf %[[la]], %[[lb]] : f32
				// CHECK-VEC0: store %[[m]], %{{.*}}[%[[i]], %[[cj]]] : memref<512x1024xf32>
				// CHECK-VEC0: }
				// CHECK-VEC0: }
				// CHECK-VEC0: return
				//
				// CHECK-VEC1-LABEL: func @mul_ds
				// CHECK-VEC1-DAG: %[[c0:.*]] = constant 0 : index
				// CHECK-VEC1-DAG: %[[c1:.*]] = constant 1 : index
				// CHECK-VEC1-DAG: %[[c512:.*]] = constant 512 : index
				// CHECK-VEC1: scf.for %[[i:.*]] = %[[c0]] to %[[c512]] step %[[c1]] {
				// CHECK-VEC1: %[[p:.*]] = load %{{.*}}[%[[i]]] : memref<?xi32>
				// CHECK-VEC1: %[[q:.*]] = index_cast %[[p]] : i32 to index
				// CHECK-VEC1: %[[a:.*]] = addi %[[i]], %[[c1]] : index
				// CHECK-VEC1: %[[r:.*]] = load %{{.*}}[%[[a]]] : memref<?xi32>
				// CHECK-VEC1: %[[s:.*]] = index_cast %[[r]] : i32 to index
				// CHECK-VEC1: scf.for %[[j:.*]] = %[[q]] to %[[s]] step %[[c1]] {
				// CHECK-VEC1: %[[lj:.*]] = load %{{.*}}[%[[j]]] : memref<?xi32>
				// CHECK-VEC1: %[[cj:.*]] = index_cast %[[lj]] : i32 to index
				// CHECK-VEC1: %[[la:.*]] = load %{{.*}}[%[[j]]] : memref<?xf32>
				// CHECK-VEC1: %[[lb:.*]] = load %{{.*}}[%[[i]], %[[cj]]] : memref<512x1024xf32>
				// CHECK-VEC1: %[[m:.*]] = mulf %[[la]], %[[lb]] : f32
				// CHECK-VEC1: store %[[m]], %{{.*}}[%[[i]], %[[cj]]] : memref<512x1024xf32>
				// CHECK-VEC1: }
				// CHECK-VEC1: }
				// CHECK-VEC1: return
				//
				// CHECK-VEC2-LABEL: func @mul_ds
				// CHECK-VEC2-DAG: %[[c0:.*]] = constant 0 : index
				// CHECK-VEC2-DAG: %[[c1:.*]] = constant 1 : index
				// CHECK-VEC2-DAG: %[[c16:.*]] = constant 16 : index
				// CHECK-VEC2-DAG: %[[c512:.*]] = constant 512 : index
				// CHECK-VEC2: scf.for %[[i:.*]] = %[[c0]] to %[[c512]] step %[[c1]] {
				// CHECK-VEC2: %[[p:.*]] = load %{{.*}}[%[[i]]] : memref<?xi32>
				// CHECK-VEC2: %[[q:.*]] = index_cast %[[p]] : i32 to index
				// CHECK-VEC2: %[[a:.*]] = addi %[[i]], %[[c1]] : index
				// CHECK-VEC2: %[[r:.*]] = load %{{.*}}[%[[a]]] : memref<?xi32>
				// CHECK-VEC2: %[[s:.*]] = index_cast %[[r]] : i32 to index
				// CHECK-VEC2: scf.for %[[j:.*]] = %[[q]] to %[[s]] step %[[c16]] {
				// CHECK-VEC2: %[[sub:.*]] = subi %[[s]], %[[j]] : index
				// CHECK-VEC2: %[[mask:.*]] = vector.create_mask %[[sub]] : vector<16xi1>
				// CHECK-VEC2: %[[lj:.*]] = vector.maskedload %{{.*}}[%arg3], %[[mask]], %{{.*}} : memref<?xi32>, vector<16xi1>, vector<16xi32> into vector<16xi32>
				// CHECK-VEC2: %[[la:.*]] = vector.maskedload %{{.*}}[%arg3], %[[mask]], %{{.*}} : memref<?xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
				// CHECK-VEC2: %[[lb:.*]] = vector.gather %{{.*}}[%[[lj]]], %[[mask]], %{{.*}} : memref<512x1024xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
				// CHECK-VEC2: %[[m:.*]] = mulf %[[la]], %[[lb]] : vector<16xf32>
				// CHECK-VEC2: vector.scatter %{{.*}}[%[[lj]]], %[[mask]], %[[m]] : memref<512x1024xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32>
				// CHECK-VEC2: }
				// CHECK-VEC2: }
				// CHECK-VEC2: return
				//
				func @mul_ds(%arga: tensor<512x1024xf32>, %argb: tensor<512x1024xf32>) -> tensor<512x1024xf32> {
				%0 = linalg.generic #trait_mul_ds
				ins(%arga, %argb: tensor<512x1024xf32>, tensor<512x1024xf32>)
				outs(%arga: tensor<512x1024xf32>) {
				^bb(%a: f32, %b: f32, %s : f32):
				%0 = mulf %a, %b : f32
				linalg.yield %0 : f32
				} -> tensor<512x1024xf32>
				return %0 : tensor<512x1024xf32>
				}

mlir/test/lib/Transforms/TestSparsification.cpp

//===- TestSparsification.cpp - Test sparsification of tensors ------------===//		//===- TestSparsification.cpp - Test sparsification of tensors ------------===//
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Linalg/Transforms/Transforms.h"		#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
		#include "mlir/Dialect/Vector/VectorOps.h"
#include "mlir/Pass/Pass.h"		#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"		#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

using namespace mlir;		using namespace mlir;

namespace {		namespace {

struct TestSparsification		struct TestSparsification
▲ Show 20 Lines • Show All 71 Lines • ▼ Show 20 Lines	void runOnFunction() override {
auto *ctx = &getContext();		auto *ctx = &getContext();
OwningRewritePatternList patterns;		OwningRewritePatternList patterns;
// Translate strategy flags to strategy options.		// Translate strategy flags to strategy options.
linalg::SparsificationOptions options(parallelOption(), vectorOption(),		linalg::SparsificationOptions options(parallelOption(), vectorOption(),
vectorLength, typeOption(ptrType),		vectorLength, typeOption(ptrType),
typeOption(indType));		typeOption(indType));
// Apply rewriting.		// Apply rewriting.
linalg::populateSparsificationPatterns(ctx, patterns, options);		linalg::populateSparsificationPatterns(ctx, patterns, options);
		vector::populateVectorToVectorCanonicalizationPatterns(patterns, ctx);
applyPatternsAndFoldGreedily(getFunction(), std::move(patterns));		applyPatternsAndFoldGreedily(getFunction(), std::move(patterns));
}		}
};		};

} // end anonymous namespace		} // end anonymous namespace

namespace mlir {		namespace mlir {
namespace test {		namespace test {
Show All 9 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][sparse] add vectorization strategies to sparse compiler
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 316481

mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h

mlir/lib/Dialect/Linalg/Transforms/Sparsification.cpp

mlir/test/Dialect/Linalg/sparse_vector.mlir

mlir/test/lib/Transforms/TestSparsification.cpp

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][sparse] add vectorization strategies to sparse compilerClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 316481

mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h

mlir/lib/Dialect/Linalg/Transforms/Sparsification.cpp

mlir/test/Dialect/Linalg/sparse_vector.mlir

mlir/test/lib/Transforms/TestSparsification.cpp

[mlir][sparse] add vectorization strategies to sparse compiler
ClosedPublic