Diff 255614

mlir/include/mlir/Analysis/AffineStructures.h

Show First 20 Lines • Show All 475 Lines • ▼ Show 20 Lines	public:
/// Returns the constant lower bound for the pos^th identifier if there is		/// Returns the constant lower bound for the pos^th identifier if there is
/// one; None otherwise.		/// one; None otherwise.
Optional<int64_t> getConstantLowerBound(unsigned pos) const;		Optional<int64_t> getConstantLowerBound(unsigned pos) const;

/// Returns the constant upper bound for the pos^th identifier if there is		/// Returns the constant upper bound for the pos^th identifier if there is
/// one; None otherwise.		/// one; None otherwise.
Optional<int64_t> getConstantUpperBound(unsigned pos) const;		Optional<int64_t> getConstantUpperBound(unsigned pos) const;

/// Gets the lower and upper bound of the pos^th identifier treating		/// Gets the lower and upper bound of the `offset` + `pos`th identifier
/// [0, offset) U [offset + num, symStartPos) as dimensions and		/// treating [0, offset) U [offset + num, symStartPos) as dimensions and
/// [symStartPos, getNumDimAndSymbolIds) as symbols. The returned		/// [symStartPos, getNumDimAndSymbolIds) as symbols, and `pos` lies in
/// multi-dimensional maps in the pair represent the max and min of		/// [0, num). The multi-dimensional maps in the returned pair represent the
/// potentially multiple affine expressions. The upper bound is exclusive.		/// max and min of potentially multiple affine expressions. The upper bound is
/// 'localExprs' holds pre-computed AffineExpr's for all local identifiers in		/// exclusive. `localExprs` holds pre-computed AffineExpr's for all local
/// the system.		/// identifiers in the system.
std::pair<AffineMap, AffineMap>		std::pair<AffineMap, AffineMap>
getLowerAndUpperBound(unsigned pos, unsigned offset, unsigned num,		getLowerAndUpperBound(unsigned pos, unsigned offset, unsigned num,
unsigned symStartPos, ArrayRef<AffineExpr> localExprs,		unsigned symStartPos, ArrayRef<AffineExpr> localExprs,
MLIRContext *context) const;		MLIRContext *context) const;

/// Gather positions of all lower and upper bounds of the identifier at `pos`,		/// Gather positions of all lower and upper bounds of the identifier at `pos`,
/// and optionally any equalities on it. In addition, the bounds are to be		/// and optionally any equalities on it. In addition, the bounds are to be
/// independent of identifiers in position range [`offset`, `offset` + `num`).		/// independent of identifiers in position range [`offset`, `offset` + `num`).
▲ Show 20 Lines • Show All 161 Lines • Show Last 20 Lines

mlir/include/mlir/Analysis/Utils.h

Show First 20 Lines • Show All 214 Lines • ▼ Show 20 Lines	struct MemRefRegion {
void setWrite(bool flag) { write = flag; }		void setWrite(bool flag) { write = flag; }

/// Returns a constant upper bound on the number of elements in this region if		/// Returns a constant upper bound on the number of elements in this region if
/// bounded by a known constant (always possible for static shapes), None		/// bounded by a known constant (always possible for static shapes), None
/// otherwise. Note that the symbols of the region are treated specially,		/// otherwise. Note that the symbols of the region are treated specially,
/// i.e., the returned bounding constant holds for any given value of the		/// i.e., the returned bounding constant holds for any given value of the
/// symbol identifiers. The 'shape' vector is set to the corresponding		/// symbol identifiers. The 'shape' vector is set to the corresponding
/// dimension-wise bounds major to minor. We use int64_t instead of uint64_t		/// dimension-wise bounds major to minor. We use int64_t instead of uint64_t
/// since index types can be at most int64_t.		/// since index types can be at most int64_t. `lbs` are set to the lower
		/// bounds for each of the rank dimensions, and lbDivisors contains the
		/// corresponding denominators for floorDivs.
Optional<int64_t> getConstantBoundingSizeAndShape(		Optional<int64_t> getConstantBoundingSizeAndShape(
SmallVectorImpl<int64_t> *shape = nullptr,		SmallVectorImpl<int64_t> *shape = nullptr,
std::vector<SmallVector<int64_t, 4>> *lbs = nullptr,		std::vector<SmallVector<int64_t, 4>> *lbs = nullptr,
SmallVectorImpl<int64_t> *lbDivisors = nullptr) const;		SmallVectorImpl<int64_t> *lbDivisors = nullptr) const;

		/// Gets the lower and upper bound map for the dimensional identifier at
		/// `pos`.
		void getLowerAndUpperBound(unsigned pos, AffineMap &lbMap,
		AffineMap &ubMap) const;

/// A wrapper around FlatAffineConstraints::getConstantBoundOnDimSize(). 'pos'		/// A wrapper around FlatAffineConstraints::getConstantBoundOnDimSize(). 'pos'
/// corresponds to the position of the memref shape's dimension (major to		/// corresponds to the position of the memref shape's dimension (major to
/// minor) which matches 1:1 with the dimensional identifier positions in		/// minor) which matches 1:1 with the dimensional identifier positions in
//'cst'.		//'cst'.
Optional<int64_t>		Optional<int64_t>
getConstantBoundOnDimSize(unsigned pos,		getConstantBoundOnDimSize(unsigned pos,
SmallVectorImpl<int64_t> *lb = nullptr,		SmallVectorImpl<int64_t> *lb = nullptr,
int64_t *lbFloorDivisor = nullptr) const {		int64_t *lbFloorDivisor = nullptr) const {
▲ Show 20 Lines • Show All 65 Lines • Show Last 20 Lines

mlir/lib/Analysis/AffineStructures.cpp

Show First 20 Lines • Show All 1,389 Lines • ▼ Show 20 Lines
std::pair<AffineMap, AffineMap> FlatAffineConstraints::getLowerAndUpperBound(		std::pair<AffineMap, AffineMap> FlatAffineConstraints::getLowerAndUpperBound(
unsigned pos, unsigned offset, unsigned num, unsigned symStartPos,		unsigned pos, unsigned offset, unsigned num, unsigned symStartPos,
ArrayRef<AffineExpr> localExprs, MLIRContext *context) const {		ArrayRef<AffineExpr> localExprs, MLIRContext *context) const {
assert(pos + offset < getNumDimIds() && "invalid dim start pos");		assert(pos + offset < getNumDimIds() && "invalid dim start pos");
assert(symStartPos >= (pos + offset) && "invalid sym start pos");		assert(symStartPos >= (pos + offset) && "invalid sym start pos");
assert(getNumLocalIds() == localExprs.size() &&		assert(getNumLocalIds() == localExprs.size() &&
"incorrect local exprs count");		"incorrect local exprs count");

SmallVector<unsigned, 4> lbIndices, ubIndices;		SmallVector<unsigned, 4> lbIndices, ubIndices, eqIndices;
getLowerAndUpperBoundIndices(pos + offset, &lbIndices, &ubIndices);		getLowerAndUpperBoundIndices(pos + offset, &lbIndices, &ubIndices, &eqIndices,
		offset, num);

/// Add to 'b' from 'a' in set [0, offset) U [offset + num, symbStartPos).		/// Add to 'b' from 'a' in set [0, offset) U [offset + num, symbStartPos).
auto addCoeffs = [&](ArrayRef<int64_t> a, SmallVectorImpl<int64_t> &b) {		auto addCoeffs = [&](ArrayRef<int64_t> a, SmallVectorImpl<int64_t> &b) {
b.clear();		b.clear();
for (unsigned i = 0, e = a.size(); i < e; ++i) {		for (unsigned i = 0, e = a.size(); i < e; ++i) {
if (i < offset \|\| i >= offset + num)		if (i < offset \|\| i >= offset + num)
b.push_back(a[i]);		b.push_back(a[i]);
}		}
};		};

SmallVector<int64_t, 8> lb, ub;		SmallVector<int64_t, 8> lb, ub;
SmallVector<AffineExpr, 4> exprs;		SmallVector<AffineExpr, 4> lbExprs;
unsigned dimCount = symStartPos - num;		unsigned dimCount = symStartPos - num;
unsigned symCount = getNumDimAndSymbolIds() - symStartPos;		unsigned symCount = getNumDimAndSymbolIds() - symStartPos;
exprs.reserve(lbIndices.size());		lbExprs.reserve(lbIndices.size() + eqIndices.size());
// Lower bound expressions.		// Lower bound expressions.
for (auto idx : lbIndices) {		for (auto idx : lbIndices) {
auto ineq = getInequality(idx);		auto ineq = getInequality(idx);
// Extract the lower bound (in terms of other coeff's + const), i.e., if		// Extract the lower bound (in terms of other coeff's + const), i.e., if
// i - j + 1 >= 0 is the constraint, 'pos' is for i the lower bound is j		// i - j + 1 >= 0 is the constraint, 'pos' is for i the lower bound is j
// - 1.		// - 1.
addCoeffs(ineq, lb);		addCoeffs(ineq, lb);
std::transform(lb.begin(), lb.end(), lb.begin(), std::negate<int64_t>());		std::transform(lb.begin(), lb.end(), lb.begin(), std::negate<int64_t>());
auto expr =		auto expr =
getAffineExprFromFlatForm(lb, dimCount, symCount, localExprs, context);		getAffineExprFromFlatForm(lb, dimCount, symCount, localExprs, context);
exprs.push_back(expr);		// expr ceildiv divisor is (expr + divisor - 1) floordiv divisor
		int64_t divisor = std::abs(ineq[pos + offset]);
		expr = (expr + divisor - 1).floorDiv(divisor);
		lbExprs.push_back(expr);
}		}
auto lbMap =
exprs.empty() ? AffineMap() : AffineMap::get(dimCount, symCount, exprs);

exprs.clear();		SmallVector<AffineExpr, 4> ubExprs;
exprs.reserve(ubIndices.size());		ubExprs.reserve(ubIndices.size() + eqIndices.size());
// Upper bound expressions.		// Upper bound expressions.
for (auto idx : ubIndices) {		for (auto idx : ubIndices) {
auto ineq = getInequality(idx);		auto ineq = getInequality(idx);
// Extract the upper bound (in terms of other coeff's + const).		// Extract the upper bound (in terms of other coeff's + const).
addCoeffs(ineq, ub);		addCoeffs(ineq, ub);
auto expr =		auto expr =
getAffineExprFromFlatForm(ub, dimCount, symCount, localExprs, context);		getAffineExprFromFlatForm(ub, dimCount, symCount, localExprs, context);
		expr = expr.floorDiv(std::abs(ineq[pos + offset]));
		// Upper bound is exclusive.
		ubExprs.push_back(expr + 1);
		}

		// Equalities. Each one is both a lower and an upper bound.
		SmallVector<int64_t, 4> b;
		for (auto idx : eqIndices) {
		auto eq = getEquality(idx);
		addCoeffs(eq, b);
		if (eq[pos + offset] > 0)
		std::transform(b.begin(), b.end(), b.begin(), std::negate<int64_t>());

		// Extract the upper bound (in terms of other coeff's + const).
		auto expr =
		getAffineExprFromFlatForm(b, dimCount, symCount, localExprs, context);
		expr = expr.floorDiv(std::abs(eq[pos + offset]));
// Upper bound is exclusive.		// Upper bound is exclusive.
exprs.push_back(expr + 1);		ubExprs.push_back(expr + 1);
		// Lower bound.
		expr =
		getAffineExprFromFlatForm(b, dimCount, symCount, localExprs, context);
		expr = expr.ceilDiv(std::abs(eq[pos + offset]));
		lbExprs.push_back(expr);
}		}
auto ubMap =
exprs.empty() ? AffineMap() : AffineMap::get(dimCount, symCount, exprs);		auto lbMap = lbExprs.empty() ? AffineMap()
		: AffineMap::get(dimCount, symCount, lbExprs);

		auto ubMap = ubExprs.empty() ? AffineMap()
		: AffineMap::get(dimCount, symCount, ubExprs);

return {lbMap, ubMap};		return {lbMap, ubMap};
}		}

/// Computes the lower and upper bounds of the first 'num' dimensional		/// Computes the lower and upper bounds of the first 'num' dimensional
/// identifiers (starting at 'offset') as affine maps of the remaining		/// identifiers (starting at 'offset') as affine maps of the remaining
/// identifiers (dimensional and symbolic identifiers). Local identifiers are		/// identifiers (dimensional and symbolic identifiers). Local identifiers are
/// themselves explicitly computed as affine functions of other identifiers in		/// themselves explicitly computed as affine functions of other identifiers in
▲ Show 20 Lines • Show All 126 Lines • ▼ Show 20 Lines	if (expr) {
// Work on a copy so that we don't update this constraint system.		// Work on a copy so that we don't update this constraint system.
if (!tmpClone) {		if (!tmpClone) {
tmpClone.emplace(FlatAffineConstraints(*this));		tmpClone.emplace(FlatAffineConstraints(*this));
// Removing redundant inequalities is necessary so that we don't get		// Removing redundant inequalities is necessary so that we don't get
// redundant loop bounds.		// redundant loop bounds.
tmpClone->removeRedundantInequalities();		tmpClone->removeRedundantInequalities();
}		}
std::tie(lbMap, ubMap) = tmpClone->getLowerAndUpperBound(		std::tie(lbMap, ubMap) = tmpClone->getLowerAndUpperBound(
pos, offset, num, getNumDimIds(), {}, context);		pos, offset, num, getNumDimIds(), /localExprs=/{}, context);
}		}

// If the above fails, we'll just use the constant lower bound and the		// If the above fails, we'll just use the constant lower bound and the
// constant upper bound (if they exist) as the slice bounds.		// constant upper bound (if they exist) as the slice bounds.
// TODO(b/126426796): being conservative for the moment in cases that		// TODO(b/126426796): being conservative for the moment in cases that
// lead to multiple bounds - until getConstDifference in LoopFusion.cpp is		// lead to multiple bounds - until getConstDifference in LoopFusion.cpp is
// fixed (b/126426796).		// fixed (b/126426796).
if (!lbMap \|\| lbMap.getNumResults() > 1) {		if (!lbMap \|\| lbMap.getNumResults() > 1) {
▲ Show 20 Lines • Show All 1,027 Lines • ▼ Show 20 Lines	static BoundCmpResult compareBounds(ArrayRef<int64_t> a, ArrayRef<int64_t> b) {

if (a.back() == b.back())		if (a.back() == b.back())
return Equal;		return Equal;

return a.back() < b.back() ? Less : Greater;		return a.back() < b.back() ? Less : Greater;
}		}
} // namespace		} // namespace

		// Populates `C` with the constraints of `A` that also occur in `B`. `C` is
		// first reset to A's dimensional, symbolic and local identifier counts. Rows
		// are matched using `==` on the values returned by getInequality() and
		// getEquality(); inequalities are only matched against inequalities and
		// equalities only against equalities.
		static void getCommonConstraints(const FlatAffineConstraints &A,
		                                 const FlatAffineConstraints &B,
		                                 FlatAffineConstraints &C) {
		  C.reset(A.getNumDimIds(), A.getNumSymbolIds(), A.getNumLocalIds());

		  // A plain pairwise (quadratic) scan is used here; it is considered
		  // sufficient for the input sizes this helper is given.
		  const unsigned numIneqA = A.getNumInequalities();
		  const unsigned numIneqB = B.getNumInequalities();
		  for (unsigned i = 0; i != numIneqA; ++i) {
		    bool shared = false;
		    // Stop scanning B at the first match so a row of A is added at most once.
		    for (unsigned j = 0; j != numIneqB && !shared; ++j)
		      shared = A.getInequality(i) == B.getInequality(j);
		    if (shared)
		      C.addInequality(A.getInequality(i));
		  }

		  const unsigned numEqA = A.getNumEqualities();
		  const unsigned numEqB = B.getNumEqualities();
		  for (unsigned i = 0; i != numEqA; ++i) {
		    bool shared = false;
		    for (unsigned j = 0; j != numEqB && !shared; ++j)
		      shared = A.getEquality(i) == B.getEquality(j);
		    if (shared)
		      C.addEquality(A.getEquality(i));
		  }
		}

// Computes the bounding box with respect to 'other' by finding the min of the		// Computes the bounding box with respect to 'other' by finding the min of the
// lower bounds and the max of the upper bounds along each of the dimensions.		// lower bounds and the max of the upper bounds along each of the dimensions.
LogicalResult		LogicalResult
FlatAffineConstraints::unionBoundingBox(const FlatAffineConstraints &otherCst) {		FlatAffineConstraints::unionBoundingBox(const FlatAffineConstraints &otherCst) {
assert(otherCst.getNumDimIds() == numDims && "dims mismatch");		assert(otherCst.getNumDimIds() == numDims && "dims mismatch");
assert(otherCst.getIds()		assert(otherCst.getIds()
.slice(0, getNumDimIds())		.slice(0, getNumDimIds())
.equals(getIds().slice(0, getNumDimIds())) &&		.equals(getIds().slice(0, getNumDimIds())) &&
"dim values mismatch");		"dim values mismatch");
assert(otherCst.getNumLocalIds() == 0 && "local ids not supported here");		assert(otherCst.getNumLocalIds() == 0 && "local ids not supported here");
assert(getNumLocalIds() == 0 && "local ids not supported yet here");		assert(getNumLocalIds() == 0 && "local ids not supported yet here");

		// Align `other` to this.
Optional<FlatAffineConstraints> otherCopy;		Optional<FlatAffineConstraints> otherCopy;
if (!areIdsAligned(*this, otherCst)) {		if (!areIdsAligned(*this, otherCst)) {
otherCopy.emplace(FlatAffineConstraints(otherCst));		otherCopy.emplace(FlatAffineConstraints(otherCst));
mergeAndAlignIds(/offset=/numDims, this, &otherCopy.getValue());		mergeAndAlignIds(/offset=/numDims, this, &otherCopy.getValue());
}		}

const auto &other = otherCopy ? *otherCopy : otherCst;		const auto &otherAligned = otherCopy ? *otherCopy : otherCst;

		// Get the constraints common to both systems; these will be added as is to
		// the union.
		FlatAffineConstraints commonCst;
		getCommonConstraints(*this, otherAligned, commonCst);

std::vector<SmallVector<int64_t, 8>> boundingLbs;		std::vector<SmallVector<int64_t, 8>> boundingLbs;
std::vector<SmallVector<int64_t, 8>> boundingUbs;		std::vector<SmallVector<int64_t, 8>> boundingUbs;
boundingLbs.reserve(2 * getNumDimIds());		boundingLbs.reserve(2 * getNumDimIds());
boundingUbs.reserve(2 * getNumDimIds());		boundingUbs.reserve(2 * getNumDimIds());

// To hold lower and upper bounds for each dimension.		// To hold lower and upper bounds for each dimension.
SmallVector<int64_t, 4> lb, otherLb, ub, otherUb;		SmallVector<int64_t, 4> lb, otherLb, ub, otherUb;
// To compute min of lower bounds and max of upper bounds for each dimension.		// To compute min of lower bounds and max of upper bounds for each dimension.
SmallVector<int64_t, 4> minLb(getNumSymbolIds() + 1);		SmallVector<int64_t, 4> minLb(getNumSymbolIds() + 1);
SmallVector<int64_t, 4> maxUb(getNumSymbolIds() + 1);		SmallVector<int64_t, 4> maxUb(getNumSymbolIds() + 1);
// To compute final new lower and upper bounds for the union.		// To compute final new lower and upper bounds for the union.
SmallVector<int64_t, 8> newLb(getNumCols()), newUb(getNumCols());		SmallVector<int64_t, 8> newLb(getNumCols()), newUb(getNumCols());

int64_t lbFloorDivisor, otherLbFloorDivisor;		int64_t lbFloorDivisor, otherLbFloorDivisor;
for (unsigned d = 0, e = getNumDimIds(); d < e; ++d) {		for (unsigned d = 0, e = getNumDimIds(); d < e; ++d) {
auto extent = getConstantBoundOnDimSize(d, &lb, &lbFloorDivisor, &ub);		auto extent = getConstantBoundOnDimSize(d, &lb, &lbFloorDivisor, &ub);
if (!extent.hasValue())		if (!extent.hasValue())
// TODO(bondhugula): symbolic extents when necessary.		// TODO(bondhugula): symbolic extents when necessary.
// TODO(bondhugula): handle union if a dimension is unbounded.		// TODO(bondhugula): handle union if a dimension is unbounded.
return failure();		return failure();

auto otherExtent = other.getConstantBoundOnDimSize(		auto otherExtent = otherAligned.getConstantBoundOnDimSize(
d, &otherLb, &otherLbFloorDivisor, &otherUb);		d, &otherLb, &otherLbFloorDivisor, &otherUb);
if (!otherExtent.hasValue() \|\| lbFloorDivisor != otherLbFloorDivisor)		if (!otherExtent.hasValue() \|\| lbFloorDivisor != otherLbFloorDivisor)
// TODO(bondhugula): symbolic extents when necessary.		// TODO(bondhugula): symbolic extents when necessary.
return failure();		return failure();

assert(lbFloorDivisor > 0 && "divisor always expected to be positive");		assert(lbFloorDivisor > 0 && "divisor always expected to be positive");

auto res = compareBounds(lb, otherLb);		auto res = compareBounds(lb, otherLb);
// Identify min.		// Identify min.
if (res == BoundCmpResult::Less \|\| res == BoundCmpResult::Equal) {		if (res == BoundCmpResult::Less \|\| res == BoundCmpResult::Equal) {
minLb = lb;		minLb = lb;
// Since the divisor is for a floordiv, we need to convert to ceildiv,		// Since the divisor is for a floordiv, we need to convert to ceildiv,
// i.e., i >= expr floordiv div <=> i >= (expr - div + 1) ceildiv div <=>		// i.e., i >= expr floordiv div <=> i >= (expr - div + 1) ceildiv div <=>
// div * i >= expr - div + 1.		// div * i >= expr - div + 1.
minLb.back() -= lbFloorDivisor - 1;		minLb.back() -= lbFloorDivisor - 1;
} else if (res == BoundCmpResult::Greater) {		} else if (res == BoundCmpResult::Greater) {
minLb = otherLb;		minLb = otherLb;
minLb.back() -= otherLbFloorDivisor - 1;		minLb.back() -= otherLbFloorDivisor - 1;
} else {		} else {
// Uncomparable - check for constant lower/upper bounds.		// Uncomparable - check for constant lower/upper bounds.
auto constLb = getConstantLowerBound(d);		auto constLb = getConstantLowerBound(d);
auto constOtherLb = other.getConstantLowerBound(d);		auto constOtherLb = otherAligned.getConstantLowerBound(d);
if (!constLb.hasValue() \|\| !constOtherLb.hasValue())		if (!constLb.hasValue() \|\| !constOtherLb.hasValue())
return failure();		return failure();
std::fill(minLb.begin(), minLb.end(), 0);		std::fill(minLb.begin(), minLb.end(), 0);
minLb.back() = std::min(constLb.getValue(), constOtherLb.getValue());		minLb.back() = std::min(constLb.getValue(), constOtherLb.getValue());
}		}

// Do the same for ub's but max of upper bounds. Identify max.		// Do the same for ub's but max of upper bounds. Identify max.
auto uRes = compareBounds(ub, otherUb);		auto uRes = compareBounds(ub, otherUb);
if (uRes == BoundCmpResult::Greater \|\| uRes == BoundCmpResult::Equal) {		if (uRes == BoundCmpResult::Greater \|\| uRes == BoundCmpResult::Equal) {
maxUb = ub;		maxUb = ub;
} else if (uRes == BoundCmpResult::Less) {		} else if (uRes == BoundCmpResult::Less) {
maxUb = otherUb;		maxUb = otherUb;
} else {		} else {
// Uncomparable - check for constant lower/upper bounds.		// Uncomparable - check for constant lower/upper bounds.
auto constUb = getConstantUpperBound(d);		auto constUb = getConstantUpperBound(d);
auto constOtherUb = other.getConstantUpperBound(d);		auto constOtherUb = otherAligned.getConstantUpperBound(d);
if (!constUb.hasValue() \|\| !constOtherUb.hasValue())		if (!constUb.hasValue() \|\| !constOtherUb.hasValue())
return failure();		return failure();
std::fill(maxUb.begin(), maxUb.end(), 0);		std::fill(maxUb.begin(), maxUb.end(), 0);
maxUb.back() = std::max(constUb.getValue(), constOtherUb.getValue());		maxUb.back() = std::max(constUb.getValue(), constOtherUb.getValue());
}		}

std::fill(newLb.begin(), newLb.end(), 0);		std::fill(newLb.begin(), newLb.end(), 0);
std::fill(newUb.begin(), newUb.end(), 0);		std::fill(newUb.begin(), newUb.end(), 0);
Show All 13 Lines	FlatAffineConstraints::unionBoundingBox(const FlatAffineConstraints &otherCst) {
}		}

// Clear all constraints and add the lower/upper bounds for the bounding box.		// Clear all constraints and add the lower/upper bounds for the bounding box.
clearConstraints();		clearConstraints();
for (unsigned d = 0, e = getNumDimIds(); d < e; ++d) {		for (unsigned d = 0, e = getNumDimIds(); d < e; ++d) {
addInequality(boundingLbs[d]);		addInequality(boundingLbs[d]);
addInequality(boundingUbs[d]);		addInequality(boundingUbs[d]);
}		}

		// Add the constraints that were common to both systems.
		append(commonCst);
		removeTrivialRedundancy();

// TODO(mlir-team): copy over pure symbolic constraints from this and 'other'		// TODO(mlir-team): copy over pure symbolic constraints from this and 'other'
// over to the union (since the above are just the union along dimensions); we		// over to the union (since the above are just the union along dimensions); we
// shouldn't be discarding any other constraints on the symbols.		// shouldn't be discarding any other constraints on the symbols.

return success();		return success();
}		}

/// Compute an explicit representation for local vars. For all systems coming		/// Compute an explicit representation for local vars. For all systems coming
▲ Show 20 Lines • Show All 178 Lines • Show Last 20 Lines

mlir/lib/Analysis/Utils.cpp

Show First 20 Lines • Show All 58 Lines • ▼ Show 20 Lines	ComputationSliceState::getAsConstraints(FlatAffineConstraints *cst) {
cst->reset(numDims, numSymbols, 0, values);		cst->reset(numDims, numSymbols, 0, values);

// Add loop bound constraints for values which are loop IVs and equality		// Add loop bound constraints for values which are loop IVs and equality
// constraints for symbols which are constants.		// constraints for symbols which are constants.
for (const auto &value : values) {		for (const auto &value : values) {
assert(cst->containsId(value) && "value expected to be present");		assert(cst->containsId(value) && "value expected to be present");
if (isValidSymbol(value)) {		if (isValidSymbol(value)) {
// Check if the symbol is a constant.		// Check if the symbol is a constant.

if (auto cOp = dyn_cast_or_null<ConstantIndexOp>(value.getDefiningOp()))		if (auto cOp = dyn_cast_or_null<ConstantIndexOp>(value.getDefiningOp()))
cst->setIdToConstant(value, cOp.getValue());		cst->setIdToConstant(value, cOp.getValue());
} else if (auto loop = getForInductionVarOwner(value)) {		} else if (auto loop = getForInductionVarOwner(value)) {
if (failed(cst->addAffineForOpDomain(loop)))		if (failed(cst->addAffineForOpDomain(loop)))
return failure();		return failure();
}		}
}		}

Show All 22 Lines	Optional<int64_t> MemRefRegion::getConstantBoundingSizeAndShape(
SmallVectorImpl<int64_t> *lbDivisors) const {		SmallVectorImpl<int64_t> *lbDivisors) const {
auto memRefType = memref.getType().cast<MemRefType>();		auto memRefType = memref.getType().cast<MemRefType>();
unsigned rank = memRefType.getRank();		unsigned rank = memRefType.getRank();
if (shape)		if (shape)
shape->reserve(rank);		shape->reserve(rank);

assert(rank == cst.getNumDimIds() && "inconsistent memref region");		assert(rank == cst.getNumDimIds() && "inconsistent memref region");

		// Use a copy of the region constraints that has upper/lower bounds for each
		// memref dimension with static size added to guard against potential
		andydavis1Unsubmitted Done Reply Inline Actions Have you run into over-approximations again? andydavis1: Have you run into over-approximations again?
		bondhugulaAuthorUnsubmitted Done Reply Inline Actions It's the same over-approximation that existed - but I've changed affine data copy generate to use /*addMemRefDimBounds=*/false with MemRefRegion::compute to prevent redundant bounds from being added for the common case. So, instead, this is adding them when getting the constant bounding size and shape, but not when we do getLowerAndUpperBound on that region to get the range for the copy loops. This basically means the code that does the copying now risks going out of bounds when there is over-approximation. Ultimately, we shouldn't be using approximation-based projection at all for region computation, and instead work with the equalities/local expressions to keep the bounds accurate -- if that's not possible (due to yet unimplemented detection or complex cases we may not be interested in), the region computation should just fail and bail out. We have a similar over-approximation with unionBoundingBox. This approximation shouldn't be done for write regions; we should bail out if we can't be exact in those cases. For this patch, we have two options: (1) we could keep it like this (use addMemRefDimBounds = false with region compute) and then work on getting rid of the use of project in region compute. Once that's done, we don't need to add memref dim bounds anywhere; (2) we use addMemRefDimBounds = true for region computation and update test cases because there'd be some redundant bounds. This still means we would later need to get rid of the over-approximation (and fail instead) to avoid extra writes (which impact correctness) and extra reads (which may only impact performance). Let me know which one you prefer. bondhugula: It's the same over-approximation that existed - but I've changed affine data copy generate to…
		andydavis1Unsubmitted Done Reply Inline Actions OK thanks. Yes, let's go with option (1). Do you need to make additional changes to this revision for option (1)? andydavis1: OK thanks. Yes, let's go with option (1). Do you need to make additional changes to this…
		bondhugulaAuthorUnsubmitted Done Reply Inline Actions This revision already does option (1). No more additional changes needed for it. bondhugula: This revision already does option (1). No more additional changes needed for it.
		// over-approximation from projection or union bounding box. We may not add
		// this on the region itself since they might just be redundant constraints
		// that will need non-trivial means to eliminate.
		FlatAffineConstraints cstWithShapeBounds(cst);
		for (unsigned r = 0; r < rank; r++) {
		cstWithShapeBounds.addConstantLowerBound(r, 0);
		int64_t dimSize = memRefType.getDimSize(r);
		if (ShapedType::isDynamic(dimSize))
		continue;
		cstWithShapeBounds.addConstantUpperBound(r, dimSize - 1);
		}

// Find a constant upper bound on the extent of this memref region along each		// Find a constant upper bound on the extent of this memref region along each
// dimension.		// dimension.
int64_t numElements = 1;		int64_t numElements = 1;
int64_t diffConstant;		int64_t diffConstant;
int64_t lbDivisor;		int64_t lbDivisor;
for (unsigned d = 0; d < rank; d++) {		for (unsigned d = 0; d < rank; d++) {
SmallVector<int64_t, 4> lb;		SmallVector<int64_t, 4> lb;
Optional<int64_t> diff = cst.getConstantBoundOnDimSize(d, &lb, &lbDivisor);		Optional<int64_t> diff =
		cstWithShapeBounds.getConstantBoundOnDimSize(d, &lb, &lbDivisor);
if (diff.hasValue()) {		if (diff.hasValue()) {
diffConstant = diff.getValue();		diffConstant = diff.getValue();
assert(lbDivisor > 0);		assert(lbDivisor > 0);
} else {		} else {
// If no constant bound is found, then it can always be bound by the		// If no constant bound is found, then it can always be bound by the
// memref's dim size if the latter has a constant size along this dim.		// memref's dim size if the latter has a constant size along this dim.
auto dimSize = memRefType.getDimSize(d);		auto dimSize = memRefType.getDimSize(d);
if (dimSize == -1)		if (dimSize == -1)
return None;		return None;
diffConstant = dimSize;		diffConstant = dimSize;
// Lower bound becomes 0.		// Lower bound becomes 0.
lb.resize(cst.getNumSymbolIds() + 1, 0);		lb.resize(cstWithShapeBounds.getNumSymbolIds() + 1, 0);
lbDivisor = 1;		lbDivisor = 1;
}		}
numElements *= diffConstant;		numElements *= diffConstant;
if (lbs) {		if (lbs) {
lbs->push_back(lb);		lbs->push_back(lb);
assert(lbDivisors && "both lbs and lbDivisor or none");		assert(lbDivisors && "both lbs and lbDivisor or none");
lbDivisors->push_back(lbDivisor);		lbDivisors->push_back(lbDivisor);
}		}
if (shape) {		if (shape) {
shape->push_back(diffConstant);		shape->push_back(diffConstant);
}		}
}		}
return numElements;		return numElements;
}		}

		/// Computes the lower and upper bound maps for the dimensional identifier at
		/// `pos` of this region's constraint system `cst`.
		///
		/// All `rank` memref dimensions are passed as the [offset, offset + num)
		/// range (offset 0, num = rank) to FlatAffineConstraints::getLowerAndUpperBound,
		/// with the symbol start position set to cst.getNumDimAndSymbolIds() and no
		/// local expressions. On return, `lbMap` and `ubMap` hold the first and
		/// second map of the returned pair respectively. Both maps are asserted to
		/// be non-null and to take cst.getNumDimAndSymbolIds() - rank inputs.
		void MemRefRegion::getLowerAndUpperBound(unsigned pos, AffineMap &lbMap,
		                                         AffineMap &ubMap) const {
		  assert(pos < cst.getNumDimIds() && "invalid position");
		  auto memRefType = memref.getType().cast<MemRefType>();
		  unsigned rank = memRefType.getRank();

		  // The region is expected to have exactly one dimensional identifier per
		  // memref dimension.
		  assert(rank == cst.getNumDimIds() && "inconsistent memref region");

		  auto boundPairs = cst.getLowerAndUpperBound(
		      pos, /*offset=*/0, /*num=*/rank, cst.getNumDimAndSymbolIds(),
		      /*localExprs=*/{}, memRefType.getContext());
		  lbMap = boundPairs.first;
		  ubMap = boundPairs.second;
		  assert(lbMap && "lower bound for a region must exist");
		  assert(ubMap && "upper bound for a region must exist");
		  assert(lbMap.getNumInputs() == cst.getNumDimAndSymbolIds() - rank);
		  assert(ubMap.getNumInputs() == cst.getNumDimAndSymbolIds() - rank);
		}

LogicalResult MemRefRegion::unionBoundingBox(const MemRefRegion &other) {		LogicalResult MemRefRegion::unionBoundingBox(const MemRefRegion &other) {
assert(memref == other.memref);		assert(memref == other.memref);
return cst.unionBoundingBox(*other.getConstraints());		return cst.unionBoundingBox(*other.getConstraints());
}		}

/// Computes the memory region accessed by this memref with the region		/// Computes the memory region accessed by this memref with the region
/// represented as constraints symbolic/parametric in 'loopDepth' loops		/// represented as constraints symbolic/parametric in 'loopDepth' loops
/// surrounding opInst and any additional Function symbols.		/// surrounding opInst and any additional Function symbols.
▲ Show 20 Lines • Show All 150 Lines • ▼ Show 20 Lines	if (addMemRefDimBounds) {
for (unsigned r = 0; r < rank; r++) {		for (unsigned r = 0; r < rank; r++) {
cst.addConstantLowerBound(r, 0);		cst.addConstantLowerBound(r, 0);
int64_t dimSize = memRefType.getDimSize(r);		int64_t dimSize = memRefType.getDimSize(r);
if (ShapedType::isDynamic(dimSize))		if (ShapedType::isDynamic(dimSize))
continue;		continue;
cst.addConstantUpperBound(r, dimSize - 1);		cst.addConstantUpperBound(r, dimSize - 1);
}		}
}		}
		cst.removeTrivialRedundancy();

LLVM_DEBUG(llvm::dbgs() << "Memory region:\n");		LLVM_DEBUG(llvm::dbgs() << "Memory region:\n");
LLVM_DEBUG(cst.dump());		LLVM_DEBUG(cst.dump());
return success();		return success();
}		}

// TODO(mlir-team): improve/complete this when we have target data.		// TODO(mlir-team): improve/complete this when we have target data.
static unsigned getMemRefEltSizeInBytes(MemRefType memRefType) {		static unsigned getMemRefEltSizeInBytes(MemRefType memRefType) {
▲ Show 20 Lines • Show All 708 Lines • Show Last 20 Lines

mlir/lib/Dialect/Affine/Transforms/AffineDataCopyGeneration.cpp

Show All 15 Lines
// necessary for hardware that explicitly managed levels in the memory		// necessary for hardware that explicitly managed levels in the memory
// hierarchy, and where DMAs may have to be used. This optimization is often		// hierarchy, and where DMAs may have to be used. This optimization is often
// performed on already tiled code.		// performed on already tiled code.
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "mlir/Analysis/Utils.h"		#include "mlir/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"		#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/Dialect/Affine/Passes.h"		#include "mlir/Dialect/Affine/Passes.h"
		#include "mlir/Dialect/StandardOps/IR/Ops.h"
		#include "mlir/IR/PatternMatch.h"
#include "mlir/Pass/Pass.h"		#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopUtils.h"		#include "mlir/Transforms/LoopUtils.h"
#include "llvm/ADT/MapVector.h"		#include "llvm/ADT/MapVector.h"
#include "llvm/Support/CommandLine.h"		#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"		#include "llvm/Support/Debug.h"
#include <algorithm>		#include <algorithm>

#define DEBUG_TYPE "affine-data-copy-generate"		#define DEBUG_TYPE "affine-data-copy-generate"
▲ Show 20 Lines • Show All 225 Lines • ▼ Show 20 Lines	void AffineDataCopyGeneration::runOnFunction() {

// Clear recorded copy nests.		// Clear recorded copy nests.
copyNests.clear();		copyNests.clear();

for (auto &block : f)		for (auto &block : f)
runOnBlock(&block, copyNests);		runOnBlock(&block, copyNests);

// Promote any single iteration loops in the copy nests.		// Promote any single iteration loops in the copy nests.
for (auto nest : copyNests) {		for (auto nest : copyNests)
nest->walk([](AffineForOp forOp) { promoteIfSingleIteration(forOp); });		nest->walk([](AffineForOp forOp) { promoteIfSingleIteration(forOp); });

		// Promoting single iteration loops could lead to simplification of
		// load's/store's. We will run canonicalization patterns on load/stores.
		// TODO: this whole function load/store canonicalization should be replaced by
		// canonicalization that is limited to only the load/store ops
		// introduced/touched by this pass (those inside 'copyNests'). This would be
		// possible once the necessary support is available in the pattern rewriter.
		if (!copyNests.empty()) {
		OwningRewritePatternList patterns;
		AffineLoadOp::getCanonicalizationPatterns(patterns, &getContext());
		AffineStoreOp::getCanonicalizationPatterns(patterns, &getContext());
		applyPatternsGreedily(f, std::move(patterns));
}		}
}		}
		dcaballeUnsubmitted Done Reply Inline Actions I'm not asking for any changes now but just wondering if it would make sense in the future to do all of these "clean-up" optimizations in a separate pass(es) that we can invoke as needed, maybe after running a bunch of optimizations instead of trying to optimize right after each one if it's not absolutely necessary. I guess that could reduce compile time and avoid duplicating this clean-up per pass. IIRC, loop fusion also performed some optimization around temporary tensors after fusion. Not sure if those optimizations would also fit into this category. dcaballe: I'm not asking for any changes now but just wondering if it would make sense in the future to…
		bondhugulaAuthorUnsubmitted Done Reply Inline Actions Yes, this is an issue common to several passes - as is whether we want to do lightweight cleanup at the end. If it's really simple canonicalizations, it should really have no impact on compile time (so long as you are doing only the necessary stuff). Its real benefit is that it makes the output of the pass more intuitive to read and test cases easier to write / more readable. One issue here is that the current greedy pattern rewriter would run folding and DCE on all ops irrespective of the patterns, and so we get all sorts of unexpected simplifications from the pass and in the test cases. I'm sending out a patch/proposal to add a flag to applyPatternsGreedily that makes it only run the supplied patterns and not do any folding/DCE. This is also needed when entering the pass/utility when you want to canonicalize things by selectively applying some patterns (instead of requiring the client to do it). It's not always feasible to check whether it's already in the canonical form - would require a lot of extra code. bondhugula: Yes, this is an issue common to several passes - as is whether we want to do lightweight…
		mehdi_aminiUnsubmitted Done Reply Inline Actions It is common that a pass would clean-up behind itself when it knows exactly what to cleanup: for example while you're promoting a single iteration loop you know that you may have specific code to clean in the promoted block and you perform these directly. This is very targeted and "cheap". Here it seems borderline though: it applies some canonicalization patterns unconditionally at the function scope level. mehdi_amini: It is common that a pass would clean-up behind itself when it knows exactly what to cleanup…
		bondhugulaAuthorUnsubmitted Done Reply Inline Actions That's right. I'd like to ideally avoid doing function scope canonicalizations and only restrict this cleanup to load/store op's we touched (which can be easily collected and they are few in number - as many as the memrefs packed/copied) - but the current pattern rewriter doesn't support that. bondhugula: That's right. I'd like to ideally avoid doing function scope canonicalizations and only…
		dcaballeUnsubmitted Done Reply Inline Actions Agree! I like the idea of doing some trivial simplifications (1-it loop) as long as it is on code that the pass is generating/modifying and it's not too involved to do so. However, "trivial" and "involved" are somewhat subjective terms and I think this is one of those things that will get convoluted over time if we don't keep it really low profile. My personal opinion is that between simplicity and convenience, I lean towards the former and I would do this only for really trivial cases. I would say that bookkeeping operations over the algorithm to be simplified at the end for convenience is a bit borderline for me. Of course, this is my personal opinion, totally arguable :). I think we should also not simplify anything if the pass doesn't do any core transformation. Otherwise, that might create a dependency between the simplifications done by the pass and subsequent ones. Should we limit the load/store simplification to cases where there is at least one copyNest? dcaballe: Agree! I like the idea of doing some trivial simplifications (1-it loop) as long as it is on…
		mehdi_aminiUnsubmitted Done Reply Inline Actions "I would say that bookkeeping operations over the algorithm to be simplified at the end for convenience is a bit borderline for me." I would say it has to be balanced with having to schedule a complete run of canonicalize before the following pass, I suspect this is gonna be case-by-case. Sometimes the context changes over time as well, for example this complex technique in LLVM had more impact when it was introduced and now will get removed based on more recent benchmarks: http://lists.llvm.org/pipermail/llvm-dev/2020-April/140542.html "I think we should also not simplify anything if the pass doesn't do any core transformation. Otherwise, that might create a dependency between the simplifications done by the pass and subsequent ones." +1 on this. But I suspect Uday would like to make it even more targeted when it'll be possible to. mehdi_amini: > I would say that bookkeeping operations over the algorithm to be simplified at the end for…
		bondhugulaAuthorUnsubmitted Done Reply Inline Actions @dcaballe @mehdi_amini - all of these are really good points, and I strongly agree. For this patch, I'd like to change the whole function load/store canonicalization happening here, but that depends on when another pending line of revisions lands. I could remove this and update test cases making the test case checks more clumsy or have a TODO and make the canonicalization more focussed when the pattern rewriter support improves. bondhugula: @dcaballe @mehdi_amini - all of these are really good points, and I strongly agree. For this…
		dcaballeUnsubmitted Done Reply Inline Actions I'm ok with the TODO + later revisit approach and protecting the load/store canonicalization with `if (!copyNests.empty())`, if that makes sense. dcaballe: I'm ok with the TODO + later revisit approach and protecting the load/store canonicalization…
		mehdi_aminiUnsubmitted Done Reply Inline Actions I don't expect you to hold this revision, coming back and improve later is perfectly fine with me (in case it wasn't clear). mehdi_amini: I don't expect you to hold this revision, coming back and improve later is perfectly fine with…

mlir/lib/Transforms/Utils/LoopUtils.cpp

Show First 20 Lines • Show All 1,452 Lines • ▼ Show 20 Lines	for (int d = bufferShape.size() - 1; d >= 1; d--) {
// next major dimension.		// next major dimension.
if (bufferShape[d] < dimSize && bufferShape[d - 1] > 1) {		if (bufferShape[d] < dimSize && bufferShape[d - 1] > 1) {
strideInfos->push_back({stride, numEltPerStride});		strideInfos->push_back({stride, numEltPerStride});
}		}
}		}
}		}

/// Generates a point-wise copy from/to `memref' to/from `fastMemRef' and		/// Generates a point-wise copy from/to `memref' to/from `fastMemRef' and
/// returns the outermost AffineForOp of the copy loop nest. `memIndicesStart'		/// returns the outermost AffineForOp of the copy loop nest. `lbMaps` and
/// holds the lower coordinates of the region in the original memref to copy		/// `ubMaps` along with `lbOperands` and `ubOperands` hold the lower and upper
/// in/out. If `copyOut' is true, generates a copy-out; otherwise a copy-in.		/// bound information for the copy loop nest. `fastBufOffsets` contain the
static AffineForOp generatePointWiseCopy(Location loc, Value memref,		/// expressions to be subtracted out from the respective copy loop iterators in
Value fastMemRef,		/// order to index the fast buffer. If `copyOut' is true, generates a copy-out;
AffineMap memAffineMap,		/// otherwise a copy-in. Builder `b` should be set to the point the copy nest is
ArrayRef<Value> memIndicesStart,		/// inserted.
ArrayRef<int64_t> fastBufferShape,		//
bool isCopyOut, OpBuilder b) {		/// The copy-in nest is generated as follows as an example for a 2-d region:
assert(!memIndicesStart.empty() && "only 1-d or more memrefs");		/// for x = ...
		/// for y = ...
// The copy-in nest is generated as follows as an example for a 2-d region:		/// fast_buf[x - offset_x][y - offset_y] = memref[x][y]
// for x = ...		///
// for y = ...		static AffineForOp
// fast_buf[x][y] = buf[mem_x + x][mem_y + y]		generatePointWiseCopy(Location loc, Value memref, Value fastMemRef,
		ArrayRef<AffineMap> lbMaps, ArrayRef<Value> lbOperands,
		ArrayRef<AffineMap> ubMaps, ArrayRef<Value> ubOperands,
		ArrayRef<AffineExpr> fastBufOffsets, bool isCopyOut,
		OpBuilder b) {
		assert(llvm::all_of(lbMaps, [&](AffineMap lbMap) {
		return lbMap.getNumInputs() == lbOperands.size();
		}));
		assert(llvm::all_of(ubMaps, [&](AffineMap ubMap) {
		return ubMap.getNumInputs() == ubOperands.size();
		}));

		unsigned rank = memref.getType().cast<MemRefType>().getRank();
		assert(lbMaps.size() == rank && "wrong number of lb maps");
		dcaballeUnsubmitted Done Reply Inline Actions used in line 1496 dcaballe: used in line 1496
		bondhugulaAuthorUnsubmitted Done Reply Inline Actions Thanks! bondhugula: Thanks!
		assert(ubMaps.size() == rank && "wrong number of ub maps");

SmallVector<Value, 4> fastBufIndices, memIndices;		SmallVector<Value, 4> memIndices;
		SmallVector<AffineExpr, 4> fastBufExprs;
		SmallVector<Value, 4> fastBufMapOperands;
AffineForOp copyNestRoot;		AffineForOp copyNestRoot;
for (unsigned d = 0, e = fastBufferShape.size(); d < e; ++d) {		for (unsigned d = 0; d < rank; ++d) {
auto forOp = b.create<AffineForOp>(loc, 0, fastBufferShape[d]);		auto forOp = createCanonicalizedAffineForOp(b, loc, lbOperands, lbMaps[d],
		ubOperands, ubMaps[d]);
if (d == 0)		if (d == 0)
copyNestRoot = forOp;		copyNestRoot = forOp;

b = forOp.getBodyBuilder();		b = forOp.getBodyBuilder();
fastBufIndices.push_back(forOp.getInductionVar());

Value memBase =		auto fastBufOffsetMap =
(memAffineMap == b.getMultiDimIdentityMap(memAffineMap.getNumDims()))		AffineMap::get(lbOperands.size(), 0, {fastBufOffsets[d]});
? memIndicesStart[d]		auto offset = b.create<AffineApplyOp>(loc, fastBufOffsetMap, lbOperands);
: b.create<AffineApplyOp>(
loc,		// Construct the subscript for the fast memref being copied into/from:
AffineMap::get(memAffineMap.getNumDims(),		// x - offset_x.
memAffineMap.getNumSymbols(),		fastBufExprs.push_back(b.getAffineDimExpr(2 * d + 1) -
memAffineMap.getResult(d)),		b.getAffineDimExpr(2 * d));
memIndicesStart);		fastBufMapOperands.push_back(offset);
		fastBufMapOperands.push_back(forOp.getInductionVar());
// Construct the subscript for the slow memref being copied.
auto memIndex = b.create<AffineApplyOp>(		// Subscript for the slow memref being copied.
loc,		memIndices.push_back(forOp.getInductionVar());
AffineMap::get(2, 0, b.getAffineDimExpr(0) + b.getAffineDimExpr(1)),		}
ValueRange({memBase, forOp.getInductionVar()}));
memIndices.push_back(memIndex);		auto fastBufMap = AffineMap::get(2 * rank, /*symbolCount=*/0, fastBufExprs);
}		fullyComposeAffineMapAndOperands(&fastBufMap, &fastBufMapOperands);
		fastBufMap = simplifyAffineMap(fastBufMap);
		canonicalizeMapAndOperands(&fastBufMap, &fastBufMapOperands);

if (!isCopyOut) {		if (!isCopyOut) {
// Copy in.		// Copy in.
auto load = b.create<AffineLoadOp>(loc, memref, memIndices);		auto load = b.create<AffineLoadOp>(loc, memref, memIndices);
b.create<AffineStoreOp>(loc, load, fastMemRef, fastBufIndices);		b.create<AffineStoreOp>(loc, load, fastMemRef, fastBufMap,
		fastBufMapOperands);
return copyNestRoot;		return copyNestRoot;
}		}

// Copy out.		// Copy out.
auto load = b.create<AffineLoadOp>(loc, fastMemRef, fastBufIndices);		auto load =
		b.create<AffineLoadOp>(loc, fastMemRef, fastBufMap, fastBufMapOperands);
b.create<AffineStoreOp>(loc, load, memref, memIndices);		b.create<AffineStoreOp>(loc, load, memref, memIndices);
return copyNestRoot;		return copyNestRoot;
}		}

static InFlightDiagnostic LLVM_ATTRIBUTE_UNUSED		static InFlightDiagnostic LLVM_ATTRIBUTE_UNUSED
emitRemarkForBlock(Block &block) {		emitRemarkForBlock(Block &block) {
return block.getParentOp()->emitRemark();		return block.getParentOp()->emitRemark();
}		}
▲ Show 20 Lines • Show All 74 Lines • ▼ Show 20 Lines	static LogicalResult generateCopy(
}		}

if (numElements.getValue() == 0) {		if (numElements.getValue() == 0) {
LLVM_DEBUG(llvm::dbgs() << "Nothing to copy\n");		LLVM_DEBUG(llvm::dbgs() << "Nothing to copy\n");
*sizeInBytes = 0;		*sizeInBytes = 0;
return success();		return success();
}		}

		SmallVector<AffineMap, 4> lbMaps(rank), ubMaps(rank);
		for (unsigned i = 0; i < rank; ++i)
		region.getLowerAndUpperBound(i, lbMaps[i], ubMaps[i]);

const FlatAffineConstraints *cst = region.getConstraints();		const FlatAffineConstraints *cst = region.getConstraints();
// 'regionSymbols' hold values that this memory region is symbolic/parametric		// 'regionSymbols' hold values that this memory region is symbolic/parametric
// on; these typically include loop IVs surrounding the level at which the		// on; these typically include loop IVs surrounding the level at which the
// copy generation is being done or other valid symbols in MLIR.		// copy generation is being done or other valid symbols in MLIR.
SmallVector<Value, 8> regionSymbols;		SmallVector<Value, 8> regionSymbols;
cst->getIdValues(rank, cst->getNumIds(), &regionSymbols);		cst->getIdValues(rank, cst->getNumIds(), &regionSymbols);

// Construct the index expressions for the fast memory buffer. The index		// Construct the index expressions for the fast memory buffer. The index
// expression for a particular dimension of the fast buffer is obtained by		// expression for a particular dimension of the fast buffer is obtained by
// subtracting out the lower bound on the original memref's data region		// subtracting out the lower bound on the original memref's data region
// along the corresponding dimension.		// along the corresponding dimension.

// Index start offsets for faster memory buffer relative to the original.		// Index start offsets for faster memory buffer relative to the original.
SmallVector<AffineExpr, 4> offsets;		SmallVector<AffineExpr, 4> fastBufOffsets;
offsets.reserve(rank);		fastBufOffsets.reserve(rank);
for (unsigned d = 0; d < rank; d++) {		for (unsigned d = 0; d < rank; d++) {
assert(lbs[d].size() == cst->getNumCols() - rank && "incorrect bound size");		assert(lbs[d].size() == cst->getNumCols() - rank && "incorrect bound size");

AffineExpr offset = top.getAffineConstantExpr(0);		AffineExpr offset = top.getAffineConstantExpr(0);
for (unsigned j = 0, e = cst->getNumCols() - rank - 1; j < e; j++) {		for (unsigned j = 0, e = cst->getNumCols() - rank - 1; j < e; j++)
offset = offset + lbs[d][j] * top.getAffineDimExpr(j);		offset = offset + lbs[d][j] * top.getAffineDimExpr(j);
}
assert(lbDivisors[d] > 0);		assert(lbDivisors[d] > 0);
offset =		offset =
(offset + lbs[d][cst->getNumCols() - 1 - rank]).floorDiv(lbDivisors[d]);		(offset + lbs[d][cst->getNumCols() - 1 - rank]).floorDiv(lbDivisors[d]);

// Set copy start location for this dimension in the lower memory space		// Set copy start location for this dimension in the lower memory space
// memref.		// memref.
if (auto caf = offset.dyn_cast<AffineConstantExpr>()) {		if (auto caf = offset.dyn_cast<AffineConstantExpr>()) {
auto indexVal = caf.getValue();		auto indexVal = caf.getValue();
Show All 10 Lines	if (auto caf = offset.dyn_cast<AffineConstantExpr>()) {
cst->getNumDimIds() + cst->getNumSymbolIds() - rank, 0, offset);		cst->getNumDimIds() + cst->getNumSymbolIds() - rank, 0, offset);
memIndices.push_back(b.create<AffineApplyOp>(loc, map, regionSymbols));		memIndices.push_back(b.create<AffineApplyOp>(loc, map, regionSymbols));
}		}
// The fast buffer is copied into at location zero; addressing is relative.		// The fast buffer is copied into at location zero; addressing is relative.
bufIndices.push_back(zeroIndex);		bufIndices.push_back(zeroIndex);

// Record the offsets since they are needed to remap the memory accesses of		// Record the offsets since they are needed to remap the memory accesses of
// the original memref further below.		// the original memref further below.
offsets.push_back(offset);		fastBufOffsets.push_back(offset);
}		}

// The faster memory space buffer.		// The faster memory space buffer.
Value fastMemRef;		Value fastMemRef;

// Check if a buffer was already created.		// Check if a buffer was already created.
bool existingBuf = fastBufferMap.count(memref) > 0;		bool existingBuf = fastBufferMap.count(memref) > 0;
if (!existingBuf) {		if (!existingBuf) {
▲ Show 20 Lines • Show All 51 Lines • ▼ Show 20 Lines	static LogicalResult generateCopy(
// Create fully composed affine maps for each memref.		// Create fully composed affine maps for each memref.
auto memAffineMap = b.getMultiDimIdentityMap(memIndices.size());		auto memAffineMap = b.getMultiDimIdentityMap(memIndices.size());
fullyComposeAffineMapAndOperands(&memAffineMap, &memIndices);		fullyComposeAffineMapAndOperands(&memAffineMap, &memIndices);
auto bufAffineMap = b.getMultiDimIdentityMap(bufIndices.size());		auto bufAffineMap = b.getMultiDimIdentityMap(bufIndices.size());
fullyComposeAffineMapAndOperands(&bufAffineMap, &bufIndices);		fullyComposeAffineMapAndOperands(&bufAffineMap, &bufIndices);

if (!copyOptions.generateDma) {		if (!copyOptions.generateDma) {
// Point-wise copy generation.		// Point-wise copy generation.
auto copyNest = generatePointWiseCopy(loc, memref, fastMemRef, memAffineMap,		auto copyNest =
memIndices, fastBufferShape,		generatePointWiseCopy(loc, memref, fastMemRef, lbMaps,
		/*lbOperands=*/regionSymbols, ubMaps,
		/*ubOperands=*/regionSymbols, fastBufOffsets,
/*isCopyOut=*/region.isWrite(), b);		/*isCopyOut=*/region.isWrite(), b);

// Record this so that we can skip it from yet another copy.		// Record this so that we can skip it from yet another copy.
copyNests.insert(copyNest);		copyNests.insert(copyNest);

// Since new ops are being appended (for copy out's), adjust the end to		// Since new ops are being appended (for copy out's), adjust the end to
// mark end of block range being processed if necessary.		// mark end of block range being processed if necessary.
if (region.isWrite() && isCopyOutAtEndOfBlock)		if (region.isWrite() && isCopyOutAtEndOfBlock)
*nEnd = Block::iterator(copyNest.getOperation());		*nEnd = Block::iterator(copyNest.getOperation());
▲ Show 20 Lines • Show All 55 Lines • ▼ Show 20 Lines	static LogicalResult generateCopy(
// d2, d3 correspond to the original indices (%i, %j).		// d2, d3 correspond to the original indices (%i, %j).
SmallVector<AffineExpr, 4> remapExprs;		SmallVector<AffineExpr, 4> remapExprs;
remapExprs.reserve(rank);		remapExprs.reserve(rank);
for (unsigned i = 0; i < rank; i++) {		for (unsigned i = 0; i < rank; i++) {
// The starting operands of indexRemap will be regionSymbols (the symbols on		// The starting operands of indexRemap will be regionSymbols (the symbols on
// which the memref region is parametric); then those corresponding to		// which the memref region is parametric); then those corresponding to
// the memref's original indices follow.		// the memref's original indices follow.
auto dimExpr = b.getAffineDimExpr(regionSymbols.size() + i);		auto dimExpr = b.getAffineDimExpr(regionSymbols.size() + i);
remapExprs.push_back(dimExpr - offsets[i]);		remapExprs.push_back(dimExpr - fastBufOffsets[i]);
}		}
auto indexRemap = AffineMap::get(regionSymbols.size() + rank, 0, remapExprs);		auto indexRemap = AffineMap::get(regionSymbols.size() + rank, 0, remapExprs);

// Record the begin since it may be invalidated by memref replacement.		// Record the begin since it may be invalidated by memref replacement.
Block::iterator prevOfBegin;		Block::iterator prevOfBegin;
bool isBeginAtStartOfBlock = (begin == block->begin());		bool isBeginAtStartOfBlock = (begin == block->begin());
if (!isBeginAtStartOfBlock)		if (!isBeginAtStartOfBlock)
prevOfBegin = std::prev(begin);		prevOfBegin = std::prev(begin);
▲ Show 20 Lines • Show All 118 Lines • ▼ Show 20 Lines	if (auto loadOp = dyn_cast<AffineLoadOp>(opInst)) {
return;		return;
} else {		} else {
// Neither load nor a store op.		// Neither load nor a store op.
return;		return;
}		}

// Compute the MemRefRegion accessed.		// Compute the MemRefRegion accessed.
auto region = std::make_unique<MemRefRegion>(opInst->getLoc());		auto region = std::make_unique<MemRefRegion>(opInst->getLoc());
if (failed(region->compute(opInst, copyDepth))) {		if (failed(region->compute(opInst, copyDepth, /*sliceState=*/nullptr,
		/*addMemRefDimBounds=*/false))) {
LLVM_DEBUG(llvm::dbgs()		LLVM_DEBUG(llvm::dbgs()
<< "Error obtaining memory region: semi-affine maps?\n");		<< "Error obtaining memory region: semi-affine maps?\n");
LLVM_DEBUG(llvm::dbgs() << "over-approximating to the entire memref\n");		LLVM_DEBUG(llvm::dbgs() << "over-approximating to the entire memref\n");
if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {		if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
LLVM_DEBUG(		LLVM_DEBUG(
opInst->emitError("non-constant memref sizes not yet supported"));		opInst->emitError("non-constant memref sizes not yet supported"));
error = true;		error = true;
return;		return;
▲ Show 20 Lines • Show All 109 Lines • ▼ Show 20 Lines	if (llvm::DebugFlag && (forOp = dyn_cast<AffineForOp>(&*begin))) {
forOp.emitRemark()		forOp.emitRemark()
<< sizeInKib		<< sizeInKib
<< " KiB of copy buffers in fast memory space for this block\n";		<< " KiB of copy buffers in fast memory space for this block\n";
}		}

if (totalCopyBuffersSizeInBytes > copyOptions.fastMemCapacityBytes) {		if (totalCopyBuffersSizeInBytes > copyOptions.fastMemCapacityBytes) {
StringRef str = "Total size of all copy buffers' for this block "		StringRef str = "Total size of all copy buffers' for this block "
"exceeds fast memory capacity\n";		"exceeds fast memory capacity\n";
block->getParentOp()->emitError(str);		block->getParentOp()->emitWarning(str);
}		}

return totalCopyBuffersSizeInBytes;		return totalCopyBuffersSizeInBytes;
}		}

// A convenience version of affineDataCopyGenerate for all ops in the body of		// A convenience version of affineDataCopyGenerate for all ops in the body of
// an AffineForOp.		// an AffineForOp.
uint64_t mlir::affineDataCopyGenerate(AffineForOp forOp,		uint64_t mlir::affineDataCopyGenerate(AffineForOp forOp,
▲ Show 20 Lines • Show All 272 Lines • Show Last 20 Lines

mlir/test/Dialect/Affine/affine-data-copy.mlir

Show All 11 Lines
// -copy-skip-non-stride-loops forces the copies to be placed right inside the		// -copy-skip-non-stride-loops forces the copies to be placed right inside the
// tile space loops, avoiding the sensitivity of copy placement depth to memory		// tile space loops, avoiding the sensitivity of copy placement depth to memory
// footprint -- so that one could write a definite test case and not have to		// footprint -- so that one could write a definite test case and not have to
// update it each time something related to the cost functions change.		// update it each time something related to the cost functions change.

#id = affine_map<(d0) -> (d0)>		#id = affine_map<(d0) -> (d0)>
#ub = affine_map<(d0) -> (d0 + 128)>		#ub = affine_map<(d0) -> (d0 + 128)>

// Map used to index the original memref while copying.
// CHECK-DAG: [[MEM_IDX_MAP:map[0-9]+]] = affine_map<(d0, d1) -> (d0 + d1)>
// Map used to index the buffer while computing.		// Map used to index the buffer while computing.
// CHECK-DAG: [[MAP_IDENTITY:map[0-9]+]] = affine_map<(d0) -> (d0)>		// CHECK-DAG: [[MAP_IDENTITY:map[0-9]+]] = affine_map<(d0) -> (d0)>
// CHECK-DAG: [[MAP_PLUS_128:map[0-9]+]] = affine_map<(d0) -> (d0 + 128)>		// CHECK-DAG: [[MAP_PLUS_128:map[0-9]+]] = affine_map<(d0) -> (d0 + 128)>
// CHECK-DAG: [[BUF_IDX_MAP:map[0-9]+]] = affine_map<(d0, d1, d2, d3) -> (-d0 + d2, -d1 + d3)>

// CHECK-LABEL: func @matmul		// CHECK-LABEL: func @matmul
// FILTER-LABEL: func @matmul		// FILTER-LABEL: func @matmul
func @matmul(%A: memref<4096x4096xf32>, %B: memref<4096x4096xf32>, %C: memref<4096x4096xf32>) -> memref<4096x4096xf32> {		func @matmul(%A: memref<4096x4096xf32>, %B: memref<4096x4096xf32>, %C: memref<4096x4096xf32>) -> memref<4096x4096xf32> {
affine.for %i = 0 to 4096 step 128 {		affine.for %i = 0 to 4096 step 128 {
affine.for %j = 0 to 4096 step 128 {		affine.for %j = 0 to 4096 step 128 {
affine.for %k = 0 to 4096 step 128 {		affine.for %k = 0 to 4096 step 128 {
affine.for %ii = #id(%i) to #ub(%i) {		affine.for %ii = #id(%i) to #ub(%i) {
Show All 11 Lines	affine.for %j = 0 to 4096 step 128 {
}		}
}		}
}		}
return %C : memref<4096x4096xf32>		return %C : memref<4096x4096xf32>
}		}

// Buffers of size 128x128 get created here for all three matrices.		// Buffers of size 128x128 get created here for all three matrices.

// CHECK: affine.for %{{.*}} = 0 to 4096 step 128 {		// CHECK: affine.for %[[I:.*]] = 0 to 4096 step 128 {
// CHECK: affine.for %{{.*}} = 0 to 4096 step 128 {		// CHECK: affine.for %[[J:.*]] = 0 to 4096 step 128 {
// CHECK: [[BUFC:%[0-9]+]] = alloc() : memref<128x128xf32>		// CHECK: [[BUFC:%[0-9]+]] = alloc() : memref<128x128xf32>

// The result matrix's copy gets hoisted out.		// The result matrix's copy gets hoisted out.
// Result matrix copy-in.		// Result matrix copy-in.
// CHECK: affine.for %{{.*}} = 0 to 128 {		// CHECK: affine.for %[[II:.*]] = #[[MAP_IDENTITY]](%{{.*}}) to #[[MAP_PLUS_128]](%{{.*}}) {
// CHECK: affine.apply #[[MEM_IDX_MAP]](%{{.*}}, %{{.*}})		// CHECK: affine.for %[[JJ:.*]] = #[[MAP_IDENTITY]](%{{.*}}) to #[[MAP_PLUS_128]](%{{.*}}) {
// CHECK: affine.for %{{.*}} = 0 to 128 {
// CHECK: affine.apply #[[MEM_IDX_MAP]](%{{.*}}, %{{.*}})
// CHECK: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<4096x4096xf32>		// CHECK: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<4096x4096xf32>
// CHECK: affine.store %{{.*}}, [[BUFC]][%{{.*}}, %{{.*}}] : memref<128x128xf32>		// CHECK: affine.store %{{.*}}, [[BUFC]][-%[[I]] + %[[II]], -%[[J]] + %[[JJ]]] : memref<128x128xf32>
// CHECK: }		// CHECK: }
// CHECK: }		// CHECK: }

// LHS matrix copy-in.		// LHS matrix copy-in.
// CHECK: affine.for %{{.*}} = 0 to 4096 step 128 {		// CHECK: affine.for %[[K:.*]] = 0 to 4096 step 128 {
// CHECK: [[BUFA:%[0-9]+]] = alloc() : memref<128x128xf32>		// CHECK: [[BUFA:%[0-9]+]] = alloc() : memref<128x128xf32>
// CHECK: affine.for %{{.*}} = 0 to 128 {		// CHECK: affine.for %[[II:.*]] = #[[MAP_IDENTITY]](%{{.*}}) to #[[MAP_PLUS_128]](%{{.*}}) {
// CHECK: affine.apply #[[MEM_IDX_MAP]](%{{.*}}, %{{.*}})		// CHECK: affine.for %[[KK:.*]] = #[[MAP_IDENTITY]](%{{.*}}) to #[[MAP_PLUS_128]](%{{.*}}) {
// CHECK: affine.for %{{.*}} = 0 to 128 {
// CHECK: affine.apply #[[MEM_IDX_MAP]](%{{.*}}, %{{.*}})
// CHECK: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<4096x4096xf32>		// CHECK: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<4096x4096xf32>
// CHECK: affine.store %{{.*}}, [[BUFA]][%{{.*}}, %{{.*}}] : memref<128x128xf32>		// CHECK: affine.store %{{.*}}, [[BUFA]][-%[[I]] + %[[II]], -%[[K]] + %[[KK]]] : memref<128x128xf32>
// CHECK: }		// CHECK: }
// CHECK: }		// CHECK: }

// RHS matrix copy-in.		// RHS matrix copy-in.
// CHECK: [[BUFB:%[0-9]+]] = alloc() : memref<128x128xf32>		// CHECK: [[BUFB:%[0-9]+]] = alloc() : memref<128x128xf32>
// CHECK: affine.for %{{.*}} = 0 to 128 {		// CHECK: affine.for %[[KK:.*]] = #[[MAP_IDENTITY]](%{{.*}}) to #[[MAP_PLUS_128]](%{{.*}}) {
// CHECK: affine.apply #[[MEM_IDX_MAP]](%{{.*}}, %{{.*}})		// CHECK: affine.for %[[JJ:.*]] = #[[MAP_IDENTITY]](%{{.*}}) to #[[MAP_PLUS_128]](%{{.*}}) {
// CHECK: affine.for %{{.*}} = 0 to 128 {
// CHECK: affine.apply #[[MEM_IDX_MAP]](%{{.*}}, %{{.*}})
// CHECK: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<4096x4096xf32>		// CHECK: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<4096x4096xf32>
// CHECK: affine.store %{{.*}}, [[BUFB]][%{{.*}}, %{{.*}}] : memref<128x128xf32>		// CHECK: affine.store %{{.*}}, [[BUFB]][-%[[K]] + %[[KK]], -%[[J]] + %[[JJ]]] : memref<128x128xf32>
// CHECK: }		// CHECK: }
// CHECK: }		// CHECK: }

// Computation on the fast buffers.		// Computation on the fast buffers.
// CHECK: affine.for %{{.*}} = #[[MAP_IDENTITY]](%{{.*}}) to #[[MAP_PLUS_128]](%{{.*}}) {		// CHECK: affine.for %{{.*}} = #[[MAP_IDENTITY]](%{{.*}}) to #[[MAP_PLUS_128]](%{{.*}}) {
// CHECK: affine.for %{{.*}} = #[[MAP_IDENTITY]](%{{.*}}) to #[[MAP_PLUS_128]](%{{.*}}) {		// CHECK: affine.for %{{.*}} = #[[MAP_IDENTITY]](%{{.*}}) to #[[MAP_PLUS_128]](%{{.*}}) {
// CHECK: affine.for %{{.*}} = #[[MAP_IDENTITY]](%{{.*}}) to #[[MAP_PLUS_128]](%{{.*}}) {		// CHECK: affine.for %{{.*}} = #[[MAP_IDENTITY]](%{{.*}}) to #[[MAP_PLUS_128]](%{{.*}}) {
// CHECK: affine.load [[BUFA]][-%{{.*}} + %{{.*}}, -%{{.*}} + %{{.*}}] : memref<128x128xf32>		// CHECK: affine.load [[BUFA]][-%{{.*}} + %{{.*}}, -%{{.*}} + %{{.*}}] : memref<128x128xf32>
// CHECK: affine.load [[BUFB]][-%{{.*}} + %{{.*}}, -%{{.*}} + %{{.*}}] : memref<128x128xf32>		// CHECK: affine.load [[BUFB]][-%{{.*}} + %{{.*}}, -%{{.*}} + %{{.*}}] : memref<128x128xf32>
// CHECK: affine.load [[BUFC]][-%{{.*}} + %{{.*}}, -%{{.*}} + %{{.*}}] : memref<128x128xf32>		// CHECK: affine.load [[BUFC]][-%{{.*}} + %{{.*}}, -%{{.*}} + %{{.*}}] : memref<128x128xf32>
// CHECK: mulf %{{.*}}, %{{.*}} : f32		// CHECK: mulf %{{.*}}, %{{.*}} : f32
// CHECK: addf %{{.*}}, %{{.*}} : f32		// CHECK: addf %{{.*}}, %{{.*}} : f32
// CHECK: affine.store %{{.*}}, [[BUFC]][-%{{.*}} + %{{.*}}, -%{{.*}} + %{{.*}}] : memref<128x128xf32>		// CHECK: affine.store %{{.*}}, [[BUFC]][-%{{.*}} + %{{.*}}, -%{{.*}} + %{{.*}}] : memref<128x128xf32>
// CHECK: }		// CHECK: }
// CHECK: }		// CHECK: }
// CHECK: }		// CHECK: }
// CHECK: dealloc [[BUFB]] : memref<128x128xf32>		// CHECK: dealloc [[BUFB]] : memref<128x128xf32>
// CHECK: dealloc [[BUFA]] : memref<128x128xf32>		// CHECK: dealloc [[BUFA]] : memref<128x128xf32>
// CHECK: }		// CHECK: }
// CHECK: affine.apply #map{{.}}(%{{.}}, %{{.*}})
// CHECK: affine.apply #map{{.}}(%{{.}}, %{{.*}})

// Result matrix copy out.		// Result matrix copy out.
// CHECK: affine.for %{{.*}} = 0 to 128 {		// CHECK: affine.for %{{.}} = #[[MAP_IDENTITY]](%{{.}}) to #[[MAP_PLUS_128]](%{{.*}}) {
// CHECK: affine.apply #[[MEM_IDX_MAP]](%{{.}}, %{{.}})		// CHECK: affine.for %{{.}} = #[[MAP_IDENTITY]](%{{.}}) to #[[MAP_PLUS_128]](%{{.*}}) {
// CHECK: affine.for %{{.*}} = 0 to 128 {		// CHECK: affine.load [[BUFC]][-%{{.}} + %{{.}}, -%{{.}} + %{{.}}] : memref<128x128xf32>
// CHECK: affine.apply #[[MEM_IDX_MAP]](%{{.}}, %{{.}})		// CHECK: store %{{.}}, %{{.}}[%{{.}}, %{{.}}] : memref<4096x4096xf32>
// CHECK: [[BUFA]] = affine.load [[BUFC]][%{{.}}, %{{.}}] : memref<128x128xf32>
// CHECK: store [[BUFA]], %{{.}}[%{{.}}, %{{.*}}] : memref<4096x4096xf32>
// CHECK: }		// CHECK: }
// CHECK: }		// CHECK: }
// CHECK: dealloc [[BUFC]] : memref<128x128xf32>		// CHECK: dealloc [[BUFC]] : memref<128x128xf32>
// CHECK: }		// CHECK: }
// CHECK: }		// CHECK: }

// Check that only one memref is copied when memref filter is used.		// Check that only one memref is copied when memref filter is used.

// FILTER: affine.for %{{.*}} = 0 to 4096 step 128 {		// FILTER: affine.for %{{.*}} = 0 to 4096 step 128 {
// FILTER: alloc() : memref<128x4096xf32>		// FILTER: alloc() : memref<128x4096xf32>
// FILTER-NOT: alloc()		// FILTER-NOT: alloc()
// FILTER: affine.for %{{.*}} = 0 to 128 {		// FILTER: affine.for
		dcaballeUnsubmitted Done Reply Inline Actions nit: you can remove the `%{{.}}` dcaballe:* nit: you can remove the `%{{.*}}`
// FILTER: affine.for %{{.*}} = 0 to 4096 {		// FILTER: affine.for %{{.*}} = 0 to 4096 {
// FILTER: affine.for %{{.*}} = 0 to 4096 step 128 {		// FILTER: affine.for %{{.*}} = 0 to 4096 step 128 {
// FILTER-NEXT: affine.for %{{.*}} = 0 to 4096 step 128 {		// FILTER-NEXT: affine.for %{{.*}} = 0 to 4096 step 128 {
// FILTER-NEXT: affine.for %{{.}} = #map{{.}}(%{{.}}) to #map{{.}}(%{{.*}}) {		// FILTER-NEXT: affine.for %{{.}} = #map{{.}}(%{{.}}) to #map{{.}}(%{{.*}}) {
// FILTER-NEXT: affine.for %{{.}} = #map{{.}}(%{{.}}) to #map{{.}}(%{{.*}}) {		// FILTER-NEXT: affine.for %{{.}} = #map{{.}}(%{{.}}) to #map{{.}}(%{{.*}}) {
// FILTER-NEXT: affine.for %{{.}} = #map{{.}}(%{{.}}) to #map{{.}}(%{{.*}}) {		// FILTER-NEXT: affine.for %{{.}} = #map{{.}}(%{{.}}) to #map{{.}}(%{{.*}}) {
// FILTER: dealloc %1 : memref<128x4096xf32>		// FILTER: dealloc %{{.*}} : memref<128x4096xf32>
// FILTER-NOT: dealloc %1 : memref<128x4096xf32>		// FILTER-NOT: dealloc %{{.*}} : memref<128x4096xf32>

// -----		// -----

//		//
// This test case will lead to single element buffers. These are eventually		// This test case will lead to single element buffers. These are eventually
// expected to be turned into registers via alloca and mem2reg.		// expected to be turned into registers via alloca and mem2reg.
//		//
// CHECK-SMALL-LABEL: func @foo		// CHECK-SMALL-LABEL: func @single_elt_buffers
// FILTER-LABEL: func @foo		// FILTER-LABEL: func @single_elt_buffers
// MEMREF_REGION-LABEL: func @foo		// MEMREF_REGION-LABEL: func @single_elt_buffers
func @foo(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: memref<1024x1024xf32>) -> memref<1024x1024xf32> {		func @single_elt_buffers(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: memref<1024x1024xf32>) -> memref<1024x1024xf32> {
affine.for %i = 0 to 1024 {		affine.for %i = 0 to 1024 {
affine.for %j = 0 to 1024 {		affine.for %j = 0 to 1024 {
affine.for %k = 0 to 1024 {		affine.for %k = 0 to 1024 {
%6 = affine.load %arg1[%k, %j] : memref<1024x1024xf32>		%6 = affine.load %arg1[%k, %j] : memref<1024x1024xf32>
%7 = affine.load %arg2[%i, %j] : memref<1024x1024xf32>		%7 = affine.load %arg2[%i, %j] : memref<1024x1024xf32>
%9 = addf %6, %7 : f32		%9 = addf %6, %7 : f32
affine.store %9, %arg2[%i, %j] : memref<1024x1024xf32>		affine.store %9, %arg2[%i, %j] : memref<1024x1024xf32>
}		}
}		}
}		}
return %arg2 : memref<1024x1024xf32>		return %arg2 : memref<1024x1024xf32>
}		}
// CHECK-SMALL: affine.for %arg{{.*}} = 0 to 1024 {		// CHECK-SMALL: affine.for %arg{{.*}} = 0 to 1024 {
// CHECK-SMALL: affine.for %arg{{.*}} = 0 to 1024 {		// CHECK-SMALL: affine.for %arg{{.*}} = 0 to 1024 {
// CHECK-SMALL: affine.apply #map{{.}}(%arg{{.}}, %arg{{.*}})
// CHECK-SMALL: affine.apply #map{{.}}(%arg{{.}}, %arg{{.*}})
// CHECK-SMALL: alloc() : memref<1x1xf32>		// CHECK-SMALL: alloc() : memref<1x1xf32>
// CHECK-SMALL: affine.apply #map{{.}}(%arg{{.}}, %c0{{.*}})
// CHECK-SMALL: affine.apply #map{{.}}(%arg{{.}}, %c0{{.*}})
// CHECK-SMALL: affine.load %arg{{.}}[%{{.}}, %{{.*}}] : memref<1024x1024xf32>		// CHECK-SMALL: affine.load %arg{{.}}[%{{.}}, %{{.*}}] : memref<1024x1024xf32>
// CHECK-SMALL: affine.store %{{.}}, %{{.}}[%c0{{.}}, %c0{{.}}] : memref<1x1xf32>		// CHECK-SMALL: affine.store %{{.}}, %{{.}}[0, 0] : memref<1x1xf32>
// CHECK-SMALL: affine.for %arg{{.*}} = 0 to 1024 {		// CHECK-SMALL: affine.for %arg{{.*}} = 0 to 1024 {
// CHECK-SMALL: affine.apply #map{{.}}(%arg{{.}}, %arg{{.*}})
// CHECK-SMALL: affine.apply #map{{.}}(%arg{{.}}, %arg{{.*}})
// CHECK-SMALL: alloc() : memref<1x1xf32>		// CHECK-SMALL: alloc() : memref<1x1xf32>
// CHECK-SMALL: affine.apply #map{{.}}(%arg{{.}}, %c0{{.*}})
// CHECK-SMALL: affine.apply #map{{.}}(%arg{{.}}, %c0{{.*}})
// CHECK-SMALL: affine.load %arg{{.}}[%{{.}}, %{{.*}}] : memref<1024x1024xf32>		// CHECK-SMALL: affine.load %arg{{.}}[%{{.}}, %{{.*}}] : memref<1024x1024xf32>
// CHECK-SMALL: affine.store %{{.}}, %{{.}}[%c0{{.}}, %c0{{.}}] : memref<1x1xf32>		// CHECK-SMALL: affine.store %{{.}}, %{{.}}[0, 0] : memref<1x1xf32>
// CHECK-SMALL: affine.load %{{.*}}[0, 0] : memref<1x1xf32>		// CHECK-SMALL: affine.load %{{.*}}[0, 0] : memref<1x1xf32>
// CHECK-SMALL: affine.load %{{.*}}[0, 0] : memref<1x1xf32>		// CHECK-SMALL: affine.load %{{.*}}[0, 0] : memref<1x1xf32>
// CHECK-SMALL: addf %{{.}}, %{{.}} : f32		// CHECK-SMALL: addf %{{.}}, %{{.}} : f32
// CHECK-SMALL: affine.store %{{.}}, %{{.}}[0, 0] : memref<1x1xf32>		// CHECK-SMALL: affine.store %{{.}}, %{{.}}[0, 0] : memref<1x1xf32>
// CHECK-SMALL: dealloc %{{.*}} : memref<1x1xf32>		// CHECK-SMALL: dealloc %{{.*}} : memref<1x1xf32>
// CHECK-SMALL: }		// CHECK-SMALL: }
// CHECK-SMALL: affine.apply #map{{.}}(%arg{{.}}, %arg{{.*}})		// CHECK-SMALL: affine.load %{{.*}}[0, 0] : memref<1x1xf32>
// CHECK-SMALL: affine.apply #map{{.}}(%arg{{.}}, %arg{{.*}})
// CHECK-SMALL: affine.apply #map{{.}}(%arg{{.}}, %c0{{.*}})
// CHECK-SMALL: affine.apply #map{{.}}(%arg{{.}}, %c0{{.*}})
// CHECK-SMALL: affine.load %{{.}}[%c0{{.}}, %c0{{.*}}] : memref<1x1xf32>
// CHECK-SMALL: affine.store %{{.}}, %arg{{.}}[%{{.}}, %{{.}}] : memref<1024x1024xf32>		// CHECK-SMALL: affine.store %{{.}}, %arg{{.}}[%{{.}}, %{{.}}] : memref<1024x1024xf32>
// CHECK-SMALL: dealloc %{{.*}} : memref<1x1xf32>		// CHECK-SMALL: dealloc %{{.*}} : memref<1x1xf32>
// CHECK-SMALL: }		// CHECK-SMALL: }
// CHECK-SMALL: }		// CHECK-SMALL: }
// CHECK-SMALL: return		// CHECK-SMALL: return

// Check that only one memref is copied when memref filter is used.		// Check that only one memref is copied when memref filter is used.

Show All 9 Lines
// FILTER: return		// FILTER: return

// Check that only one memref is copied, because for-memref-region is enabled		// Check that only one memref is copied, because for-memref-region is enabled
// (and the first ever encountered load is analyzed).		// (and the first ever encountered load is analyzed).
// MEMREF_REGION: alloc() : memref<1024x1024xf32>		// MEMREF_REGION: alloc() : memref<1024x1024xf32>
// MEMREF_REGION-NOT: alloc()		// MEMREF_REGION-NOT: alloc()
// MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 {		// MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 {
// MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 {		// MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 {
// MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 {		// MEMREF_REGION: }
		// MEMREF_REGION: }
		// MEMREF_REGION-NEXT: affine.for %{{.*}} = 0 to 1024 {
// MEMREF_REGION-NEXT: affine.for %{{.*}} = 0 to 1024 {		// MEMREF_REGION-NEXT: affine.for %{{.*}} = 0 to 1024 {
// MEMREF_REGION-NEXT: affine.for %{{.*}} = 0 to 1024 {		// MEMREF_REGION-NEXT: affine.for %{{.*}} = 0 to 1024 {
// MEMREF_REGION: dealloc %{{.*}} : memref<1024x1024xf32>		// MEMREF_REGION: dealloc %{{.*}} : memref<1024x1024xf32>
// MEMREF_REGION-NOT: dealloc		// MEMREF_REGION-NOT: dealloc
		// MEMREF_REGION-NEXT: return

		// -----

		// This pattern typically appears with tiling with tile sizes that don't divide
		// the loop trip counts.

		#map_ub = affine_map<(d0) -> (4096, d0 + 100)>

		// CHECK-DAG: [[MAP_IDENTITY:map[0-9]+]] = affine_map<(d0) -> (d0)>
		// CHECK-DAG: [[MAP_MIN_UB1:map[0-9]+]] = affine_map<(d0) -> (d0 + 100, 4096)>
		// CHECK-DAG: [[MAP_MIN_UB2:map[0-9]+]] = affine_map<(d0) -> (4096, d0 + 100)>

		// CHECK-LABEL: func @min_upper_bound
		func @min_upper_bound(%A: memref<4096xf32>) -> memref<4096xf32> {
		affine.for %i = 0 to 4096 step 100 {
		affine.for %ii = affine_map<(d0) -> (d0)>(%i) to min #map_ub(%i) {
		%5 = affine.load %A[%ii] : memref<4096xf32>
		%6 = mulf %5, %5 : f32
		affine.store %6, %A[%ii] : memref<4096xf32>
		}
		}
		return %A : memref<4096xf32>
		}
		// CHECK: affine.for %[[IV1:.*]] = 0 to 4096 step 100
		// CHECK-NEXT: %[[BUF:.*]] = alloc() : memref<100xf32>
		// CHECK-NEXT: affine.for %[[IV2:.*]] = #[[MAP_IDENTITY]](%[[IV1]]) to min #[[MAP_MIN_UB1]](%[[IV1]]) {
		// CHECK-NEXT: affine.load %{{.*}}[%[[IV2]]] : memref<4096xf32>
		// CHECK-NEXT: affine.store %{{.*}}, %[[BUF]][-%[[IV1]] + %[[IV2]]] : memref<100xf32>
		// CHECK-NEXT: }
		// CHECK-NEXT: affine.for %[[IV2:.*]] = #[[MAP_IDENTITY]](%[[IV1]]) to min #[[MAP_MIN_UB2]](%[[IV1]]) {
		// CHECK-NEXT: affine.load %[[BUF]][-%[[IV1]] + %[[IV2]]] : memref<100xf32>
		// CHECK-NEXT: mulf
		// CHECK-NEXT: affine.store %{{.*}}, %[[BUF]][-%[[IV1]] + %[[IV2]]] : memref<100xf32>
		// CHECK-NEXT: }
		// CHECK-NEXT: affine.for %[[IV2:.*]] = #[[MAP_IDENTITY]](%[[IV1]]) to min #[[MAP_MIN_UB1]](%[[IV1]]) {
		// CHECK-NEXT: affine.load %[[BUF]][-%[[IV1]] + %[[IV2]]] : memref<100xf32>
		// CHECK-NEXT: affine.store %{{.}}, %{{.}}[%[[IV2]]] : memref<4096xf32>
		// CHECK-NEXT: }
		// CHECK-NEXT: dealloc %[[BUF]] : memref<100xf32>
		// CHECK-NEXT: }

		// -----

		// Lower bound is a max; upper bound is a min. This pattern typically appears
		// with multi-level tiling when the tile sizes used don't divide loop trip
		// counts.

		#lb = affine_map<(d0, d1) -> (d0 * 512, d1 * 6)>
		#ub = affine_map<(d0, d1) -> (d0 * 512 + 512, d1 * 6 + 6)>

		// CHECK-DAG: #[[LB:.*]] = affine_map<()[s0, s1] -> (s0 * 512, s1 * 6)>
		// CHECK-DAG: #[[UB:.*]] = affine_map<()[s0, s1] -> (s0 * 512 + 512, s1 * 6 + 6)>

		// CHECK-LABEL: max_lower_bound(%{{.*}}: memref<2048x516xf64>,
		// CHECK-SAME: [[i:arg[0-9]+]]
		// CHECK-SAME: [[j:arg[0-9]+]]
		func @max_lower_bound(%M: memref<2048x516xf64>, %i : index, %j : index) {
		affine.for %ii = 0 to 2048 {
		affine.for %jj = max #lb(%i, %j) to min #ub(%i, %j) {
		affine.load %M[%ii, %jj] : memref<2048x516xf64>
		}
		}
		return
		}

		// CHECK: %[[BUF:.*]] = alloc() : memref<2048x6xf64>
		// CHECK-NEXT: affine.for %[[ii:.*]] = 0 to 2048 {
		// CHECK-NEXT: affine.for %[[jj:.*]] = max #[[LB]]()[%[[i]], %[[j]]] to min #[[UB]]()[%[[i]], %[[j]]] {
		// CHECK-NEXT: affine.load %{{.*}}[%[[ii]], %[[jj]]] : memref<2048x516xf64>
		// CHECK-NEXT: affine.store %{{.*}}, %[[BUF]][%[[ii]], %[[jj]] - symbol(%[[j]]) * 6] : memref<2048x6xf64>
		// CHECK-NEXT: }
		// CHECK-NEXT: }
		// CHECK-NEXT: affine.for %[[ii_:.*]] = 0 to 2048 {
		// CHECK-NEXT: affine.for %[[jj_:.*]] = max #[[LB]]()[%{{.*}}, %{{.*}}] to min #[[UB]]()[%{{.*}}, %{{.*}}] {
		// CHECK-NEXT: affine.load %[[BUF]][%[[ii_]], %[[jj_]] - symbol(%[[j]]) * 6] : memref<2048x6xf64>
		// CHECK-NEXT: }
		// CHECK-NEXT: }
		// CHECK-NEXT: dealloc %[[BUF]] : memref<2048x6xf64>

mlir/test/Dialect/Affine/dma-generate.mlir

// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file -affine-data-copy-generate -affine-data-copy-generate-dma -affine-data-copy-generate-fast-mem-space=2 -affine-data-copy-generate-skip-non-unit-stride-loops -verify-diagnostics \| FileCheck %s		// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file -affine-data-copy-generate -affine-data-copy-generate-dma -affine-data-copy-generate-fast-mem-space=2 -affine-data-copy-generate-skip-non-unit-stride-loops -verify-diagnostics \| FileCheck %s
// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file -affine-data-copy-generate -affine-data-copy-generate-dma -affine-data-copy-generate-fast-mem-capacity=16 -affine-data-copy-generate-fast-mem-space=2 \| FileCheck %s --check-prefix FAST-MEM-16KB		// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file -affine-data-copy-generate -affine-data-copy-generate-dma -affine-data-copy-generate-fast-mem-capacity=16 -affine-data-copy-generate-fast-mem-space=2 \| FileCheck %s --check-prefix FAST-MEM-16KB

// We run most test cases with -copy-skip-non-unit-stride-loops to allow testing		// We run most test cases with -copy-skip-non-unit-stride-loops to allow testing
// DMA generation at inner levels easily - since the DMA generation would		// DMA generation at inner levels easily - since the DMA generation would
// otherwise always generate DMAs at the outermost level (default for fast mem		// otherwise always generate DMAs at the outermost level (default for fast mem
// capacity is infinite). Using a specific capacity makes it harder to write		// capacity is infinite). Using a specific capacity makes it harder to write
// a test case as one would have to calculate total footprints. With		// a test case as one would have to calculate total footprints. With
// -copy-skip-non-unit-stride-loops, non-unit strides will always be skipped and		// -copy-skip-non-unit-stride-loops, non-unit strides will always be skipped and
// its inner loops will be traversed till a unit stride loop is found (or the		// its inner loops will be traversed till a unit stride loop is found (or the
// innermost block is reached).		// innermost block is reached).

// -----		// -----

// Index of the buffer for the second DMA is remapped.		// Index of the buffer for the second DMA is remapped.
// CHECK-DAG: [[MAP_PLUS_256:#map[0-9]+]] = affine_map<(d0) -> (d0 + 256)>
// CHECK-DAG: [[MAP0:#map[0-9]+]] = affine_map<(d0) -> (d0)>		// CHECK-DAG: [[MAP0:#map[0-9]+]] = affine_map<(d0) -> (d0)>

// CHECK-LABEL: func @loop_nest_1d() {		// CHECK-LABEL: func @loop_nest_1d() {
func @loop_nest_1d() {		func @loop_nest_1d() {
%A = alloc() : memref<256 x f32>		%A = alloc() : memref<256 x f32>
%B = alloc() : memref<512 x f32>		%B = alloc() : memref<512 x f32>
%F = alloc() : memref<256 x f32, 2>		%F = alloc() : memref<256 x f32, 2>
// First DMA buffer.		// First DMA buffer.
// CHECK: alloc() : memref<256xf32>		// CHECK: alloc() : memref<256xf32>
// CHECK: alloc() : memref<256xf32, 2>		// CHECK: alloc() : memref<256xf32, 2>
// Tag for first DMA.		// Tag for first DMA.
// CHECK: alloc() : memref<1xi32>		// CHECK: alloc() : memref<1xi32>
// First DMA transfer.		// First DMA transfer.
// CHECK: affine.dma_start %{{.}}[%{{.}}], %{{.}}[%{{.}}], %{{.}}[%{{.}}], %{{.*}} : memref<256xf32>, memref<256xf32, 2>, memref<1xi32>		// CHECK: affine.dma_start %{{.}}[%{{.}}], %{{.}}[%{{.}}], %{{.}}[%{{.}}], %{{.*}} : memref<256xf32>, memref<256xf32, 2>, memref<1xi32>
// CHECK: affine.dma_wait %{{.}}[%{{.}}], %{{.*}} : memref<1xi32>		// CHECK: affine.dma_wait %{{.}}[%{{.}}], %{{.*}} : memref<1xi32>
// Second DMA buffer.		// Second DMA buffer.
// CHECK: alloc() : memref<256xf32, 2>		// CHECK: alloc() : memref<256xf32, 2>
// Tag for second DMA.		// Tag for second DMA.
// CHECK: alloc() : memref<1xi32>		// CHECK: alloc() : memref<1xi32>
// Second DMA transfer.		// Second DMA transfer.
// CHECK: affine.dma_start %{{.}}[%{{.}}], %{{.}}[%{{.}}], %{{.}}[%{{.}}], %{{.*}} : memref<512xf32>, memref<256xf32, 2>, memref<1xi32>		// CHECK: affine.dma_start %{{.}}[%{{.}}], %{{.}}[%{{.}}], %{{.}}[%{{.}}], %{{.*}} : memref<512xf32>, memref<256xf32, 2>, memref<1xi32>
// CHECK-NEXT: affine.dma_wait %{{.}}[%{{.}}], %{{.*}} : memref<1xi32>		// CHECK-NEXT: affine.dma_wait %{{.}}[%{{.}}], %{{.*}} : memref<1xi32>
// CHECK: affine.for %{{.*}} = 0 to 256 {		// CHECK: affine.for %[[IV:.*]] = 0 to 256 {
// CHECK-NEXT: affine.load %{{.}}[%{{.}}] : memref<256xf32, 2>		// CHECK-NEXT: affine.load %{{.}}[%{{.}}] : memref<256xf32, 2>
// CHECK: affine.apply [[MAP_PLUS_256]](%{{.*}})		// Buffer for '%{{.*}}' in faster memref space is of smaller size: 256xf32
// Buffer for '%{{.*}}' in faster memref space is smaller size: 256xf32		// Affine map for load on B is composed and becomes identity.
// Affine map for 'affine.load %{{.}}' is composed: %{{.}} + 256 - 256 = %{{.*}}.		// CHECK: affine.load %{{.*}}[%[[IV]]] : memref<256xf32, 2>
// CHECK-NEXT: %{{.}} = affine.load %{{.}}[%{{.*}}] : memref<256xf32, 2>
// Already in faster memory space.		// Already in faster memory space.
// CHECK: affine.load %{{.}}[%{{.}}] : memref<256xf32, 2>		// CHECK: affine.load %{{.*}}[%[[IV]]] : memref<256xf32, 2>
// CHECK-NEXT: }		// CHECK-NEXT: }
// CHECK-NEXT: dealloc %{{.*}} : memref<1xi32>		// CHECK-NEXT: dealloc %{{.*}} : memref<1xi32>
// CHECK-NEXT: dealloc %{{.*}} : memref<256xf32, 2>		// CHECK-NEXT: dealloc %{{.*}} : memref<256xf32, 2>
// CHECK-NEXT: dealloc %{{.*}} : memref<1xi32>		// CHECK-NEXT: dealloc %{{.*}} : memref<1xi32>
// CHECK-NEXT: dealloc %{{.*}} : memref<256xf32, 2>		// CHECK-NEXT: dealloc %{{.*}} : memref<256xf32, 2>
// CHECK-NEXT: return		// CHECK-NEXT: return
affine.for %i = 0 to 256 {		affine.for %i = 0 to 256 {
affine.load %A[%i] : memref<256 x f32>		affine.load %A[%i] : memref<256 x f32>
Show All 23 Lines
// CHECK-DAG: affine.dma_wait [[TAGA]][%{{.}}], %{{.}} : memref<1xi32>		// CHECK-DAG: affine.dma_wait [[TAGA]][%{{.}}], %{{.}} : memref<1xi32>
// INCOMING DMA for C.		// INCOMING DMA for C.
// CHECK-DAG: affine.dma_start %{{.}}[%{{.}}, %{{.}}], [[BUFC]][%{{.}}, %{{.}}], [[TAGC]][%{{.}}], %{{.*}} : memref<512x32xf32>, memref<512x32xf32, 2>, memref<1xi32>		// CHECK-DAG: affine.dma_start %{{.}}[%{{.}}, %{{.}}], [[BUFC]][%{{.}}, %{{.}}], [[TAGC]][%{{.}}], %{{.*}} : memref<512x32xf32>, memref<512x32xf32, 2>, memref<1xi32>
// CHECK-DAG: affine.dma_wait [[TAGC]][%{{.}}], %{{.}} : memref<1xi32>		// CHECK-DAG: affine.dma_wait [[TAGC]][%{{.}}], %{{.}} : memref<1xi32>
// CHECK-NEXT: affine.for %{{.*}} = 0 to 32 {		// CHECK-NEXT: affine.for %{{.*}} = 0 to 32 {
// CHECK-NEXT: affine.for %{{.*}} = 0 to 32 {		// CHECK-NEXT: affine.for %{{.*}} = 0 to 32 {
// CHECK-NEXT: affine.for %{{.*}} = 0 to 32 {		// CHECK-NEXT: affine.for %{{.*}} = 0 to 32 {
// CHECK-NEXT: affine.for %{{.*}} = 0 to 16 {		// CHECK-NEXT: affine.for %{{.*}} = 0 to 16 {
// CHECK-NEXT: affine.apply #map{{[0-9]+}}(%{{.}}, %{{.}})		// CHECK: affine.load [[BUFB]][%{{.}} 16 + %{{.}}, %{{.}}] : memref<512x32xf32, 2>
// CHECK-NEXT: %{{.}} = affine.load [[BUFB]][%{{.}} * 16 + %{{.}}, %{{.}}] : memref<512x32xf32, 2>
// CHECK-NEXT: "foo"(%{{.*}}) : (f32) -> ()		// CHECK-NEXT: "foo"(%{{.*}}) : (f32) -> ()
// CHECK-NEXT: }		// CHECK-NEXT: }
// CHECK-NEXT: affine.for %{{.*}} = 0 to 16 {		// CHECK-NEXT: affine.for %{{.*}} = 0 to 16 {
// CHECK-NEXT: affine.apply #map{{[0-9]+}}(%{{.}}, %{{.}})		// CHECK: affine.load [[BUFA]][%{{.}} 16 + %{{.}}, %{{.}}] : memref<512x32xf32, 2>
// CHECK-NEXT: affine.load [[BUFA]][%{{.}} 16 + %{{.}}, %{{.}}] : memref<512x32xf32, 2>
// CHECK-NEXT: "bar"(%{{.*}}) : (f32) -> ()		// CHECK-NEXT: "bar"(%{{.*}}) : (f32) -> ()
// CHECK-NEXT: }		// CHECK-NEXT: }
// CHECK-NEXT: affine.for %{{.*}} = 0 to 16 {		// CHECK-NEXT: affine.for %{{.*}} = 0 to 16 {
// CHECK-NEXT: "abc_compute"() : () -> f32		// CHECK-NEXT: "abc_compute"() : () -> f32
// CHECK-NEXT: affine.apply #map{{[0-9]+}}(%{{.}}, %{{.}})		// CHECK: affine.load [[BUFC]][%{{.}} 16 + %{{.}}, %{{.}}] : memref<512x32xf32, 2>
// CHECK-NEXT: affine.load [[BUFC]][%{{.}} 16 + %{{.}}, %{{.}}] : memref<512x32xf32, 2>
// CHECK-NEXT: "addf32"(%{{.}}, %{{.}}) : (f32, f32) -> f32		// CHECK-NEXT: "addf32"(%{{.}}, %{{.}}) : (f32, f32) -> f32
// CHECK-NEXT: affine.store %{{.}}, [[BUFC]][%{{.}} * 16 + %{{.}}, %{{.}}] : memref<512x32xf32, 2>		// CHECK-NEXT: affine.store %{{.}}, [[BUFC]][%{{.}} * 16 + %{{.}}, %{{.}}] : memref<512x32xf32, 2>
// CHECK-NEXT: }		// CHECK-NEXT: }
// CHECK-NEXT: "foobar"() : () -> ()		// CHECK-NEXT: "foobar"() : () -> ()
// CHECK-NEXT: }		// CHECK-NEXT: }
// CHECK-NEXT: }		// CHECK-NEXT: }
// CHECK-NEXT: }		// CHECK-NEXT: }
// OUTGOING DMA for C.		// OUTGOING DMA for C.
▲ Show 20 Lines • Show All 43 Lines • ▼ Show 20 Lines
// -----		// -----

// A loop nest with a modulo 2 access. A strided DMA is not needed here a 1x2		// A loop nest with a modulo 2 access. A strided DMA is not needed here a 1x2
// region within a 256 x 8 memref.		// region within a 256 x 8 memref.
//		//
// CHECK-LABEL: func @loop_nest_modulo() {		// CHECK-LABEL: func @loop_nest_modulo() {
// CHECK: alloc() : memref<256x8xf32>		// CHECK: alloc() : memref<256x8xf32>
// CHECK-NEXT: affine.for %{{.*}} = 0 to 32 step 4 {		// CHECK-NEXT: affine.for %{{.*}} = 0 to 32 step 4 {
// CHECK-NEXT: affine.apply #map{{[0-9]+}}(%{{.*}})		// CHECK: alloc() : memref<1x2xf32, 2>
// CHECK-NEXT: alloc() : memref<1x2xf32, 2>
// CHECK-NEXT: alloc() : memref<1xi32>		// CHECK-NEXT: alloc() : memref<1xi32>
// Composition of the affine map for '%{{.}}' causes '%{{.}}' to be added as a symbol.		// Composition of the affine map for '%{{.}}' causes '%{{.}}' to be added as a symbol.
// CHECK-NEXT: affine.dma_start %{{.}}[%{{.}}, 0], %{{.}}[%{{.}}, %{{.}}], %{{.}}[%{{.}}], %{{.}} : memref<256x8xf32>, memref<1x2xf32, 2>, memref<1xi32>		// CHECK-NEXT: affine.dma_start %{{.}}[%{{.}}, 0], %{{.}}[%{{.}}, %{{.}}], %{{.}}[%{{.}}], %{{.}} : memref<256x8xf32>, memref<1x2xf32, 2>, memref<1xi32>
// CHECK-NEXT: affine.dma_wait %{{.}}[%{{.}}], %{{.*}} : memref<1xi32>		// CHECK-NEXT: affine.dma_wait %{{.}}[%{{.}}], %{{.*}} : memref<1xi32>
// CHECK-NEXT: affine.for %{{.*}} = 0 to 8 {		// CHECK-NEXT: affine.for %{{.*}} = 0 to 8 {
// ...		// ...
// ...		// ...
// CHECK: }		// CHECK: }
▲ Show 20 Lines • Show All 58 Lines • ▼ Show 20 Lines	affine.for %j = 0 to affine_map<()[s0] -> (s0)> ()[%N] {
affine.load %A[%one, %j] : memref<100 x 100 x f32>		affine.load %A[%one, %j] : memref<100 x 100 x f32>
}		}
}		}
return		return
}		}

// -----		// -----

// CHECK-DAG: [[MAP_SYM_SHIFT:#map[0-9]+]] = affine_map<(d0, d1)[s0, s1] -> (d1 + s0 + s1)>

// CHECK-LABEL: func @dma_with_symbolic_accesses		// CHECK-LABEL: func @dma_with_symbolic_accesses
func @dma_with_symbolic_accesses(%A : memref<100x100xf32>, %M : index) {		func @dma_with_symbolic_accesses(%A : memref<100x100xf32>, %M : index) {
%N = constant 9 : index		%N = constant 9 : index
affine.for %i = 0 to 100 {		affine.for %i = 0 to 100 {
affine.for %j = 0 to 100 {		affine.for %j = 0 to 100 {
%idy = affine.apply affine_map<(d0, d1) [s0, s1] -> (d1 + s0 + s1)>(%i, %j)[%M, %N]		%idy = affine.apply affine_map<(d0, d1) [s0, s1] -> (d1 + s0 + s1)>(%i, %j)[%M, %N]
affine.load %A[%i, %idy] : memref<100 x 100 x f32>		affine.load %A[%i, %idy] : memref<100 x 100 x f32>
}		}
}		}
return		return
// CHECK: alloc() : memref<100x100xf32, 2>		// CHECK: alloc() : memref<100x100xf32, 2>
// CHECK-NEXT: alloc() : memref<1xi32>		// CHECK-NEXT: alloc() : memref<1xi32>
// CHECK-NEXT: affine.dma_start %{{.}}[0, symbol(%{{.}}) + 9], %{{.}}[%{{.}}, %{{.}}], %{{.}}[%{{.}}], %{{.}}		// CHECK-NEXT: affine.dma_start %{{.}}[0, symbol(%{{.}}) + 9], %{{.}}[%{{.}}, %{{.}}], %{{.}}[%{{.}}], %{{.}}
// CHECK-NEXT: affine.dma_wait %{{.}}[%{{.}}], %{{.*}}		// CHECK-NEXT: affine.dma_wait %{{.}}[%{{.}}], %{{.*}}
// CHECK-NEXT: affine.for %{{.*}} = 0 to 100 {		// CHECK-NEXT: affine.for %[[IV0:.*]] = 0 to 100 {
// CHECK-NEXT: affine.for %{{.*}} = 0 to 100 {		// CHECK-NEXT: affine.for %[[IV1:.*]] = 0 to 100 {
// CHECK-NEXT: affine.apply [[MAP_SYM_SHIFT]](%{{.}}, %{{.}})[%{{.}}, %{{.}}]		// CHECK: affine.load %{{.*}}[%[[IV0]], %[[IV1]]] : memref<100x100xf32, 2>
// CHECK-NEXT: affine.load %{{.}}[%{{.}}, %{{.*}}] : memref<100x100xf32, 2>
// CHECK-NEXT: }		// CHECK-NEXT: }
// CHECK-NEXT: }		// CHECK-NEXT: }
// CHECK: return		// CHECK: return
}		}

// -----		// -----

// CHECK-LABEL: func @dma_with_symbolic_loop_bounds		// CHECK-LABEL: func @dma_with_symbolic_loop_bounds
▲ Show 20 Lines • Show All 50 Lines • ▼ Show 20 Lines	affine.for %j = 0 to 1024 {
}		}
}		}
}		}
return		return
}		}

// -----		// -----

// CHECK-DAG: [[MAP_PLUS_64:#map[0-9]+]] = affine_map<(d0) -> (d0 + 64)>
// CHECK-DAG: [[MAP_PLUS_128:#map[0-9]+]] = affine_map<(d0) -> (d0 + 128)>
// CHECK-DAG: [[MAP_PLUS_2:#map[0-9]+]] = affine_map<(d0) -> (d0 + 2)>
// CHECK-DAG: [[MAP_PLUS_192:#map[0-9]+]] = affine_map<(d0) -> (d0 + 192)>

// The first load accesses ([2,258), [128,384))		// The first load accesses ([2,258), [128,384))
// The second load accesses ([64,320), [2,258))		// The second load accesses ([64,320), [2,258))
// The first store writes to ([2,258), [192,448))		// The first store writes to ([2,258), [192,448))
// The second store writes to ([128,384), [2,258))		// The second store writes to ([128,384), [2,258))
// The union of all these regions is of size 382 x 446 and has its origin at (2,		// The union of all these regions is of size 382 x 446 and has its origin at (2,
// 2), i.e., the window ([2,384), [2,448)) in the original space.		// 2), i.e., the window ([2,384), [2,448)) in the original space.

// CHECK-LABEL: func @multi_load_store_union() {		// CHECK-LABEL: func @multi_load_store_union() {
Show All 21 Lines
// CHECK: alloc() : memref<512x512xf32>		// CHECK: alloc() : memref<512x512xf32>
// CHECK-NEXT: alloc() : memref<382x446xf32, 2>		// CHECK-NEXT: alloc() : memref<382x446xf32, 2>
// CHECK-NEXT: alloc() : memref<1xi32>		// CHECK-NEXT: alloc() : memref<1xi32>
// CHECK-NEXT: affine.dma_start %{{.}}[%{{.}}, %{{.}}], %{{.}}[%{{.}}, %{{.}}], %{{.}}[%{{.}}], %{{.}}, %{{.}}, %{{.*}} : memref<512x512xf32>, memref<382x446xf32, 2>, memref<1xi32>		// CHECK-NEXT: affine.dma_start %{{.}}[%{{.}}, %{{.}}], %{{.}}[%{{.}}, %{{.}}], %{{.}}[%{{.}}], %{{.}}, %{{.}}, %{{.*}} : memref<512x512xf32>, memref<382x446xf32, 2>, memref<1xi32>
// CHECK-NEXT: affine.dma_wait %{{.}}[%{{.}}], %{{.*}} : memref<1xi32>		// CHECK-NEXT: affine.dma_wait %{{.}}[%{{.}}], %{{.*}} : memref<1xi32>
// CHECK-NEXT: alloc() : memref<1xi32>		// CHECK-NEXT: alloc() : memref<1xi32>
// CHECK-NEXT: affine.for %{{.*}} = 0 to 256 {		// CHECK-NEXT: affine.for %{{.*}} = 0 to 256 {
// CHECK-NEXT: affine.for %{{.*}} = 0 to 256 {		// CHECK-NEXT: affine.for %{{.*}} = 0 to 256 {
// CHECK-NEXT: affine.apply [[MAP_PLUS_64]](%{{.*}})		// CHECK: affine.load %{{.}}[%{{.}}, %{{.*}} + 126] : memref<382x446xf32, 2>
// CHECK-NEXT: affine.apply [[MAP_PLUS_128]](%{{.*}})
// CHECK-NEXT: affine.apply [[MAP_PLUS_2]](%{{.*}})
// CHECK-NEXT: affine.apply [[MAP_PLUS_2]](%{{.*}})
// CHECK-NEXT: affine.load %{{.}}[%{{.}}, %{{.*}} + 126] : memref<382x446xf32, 2>
// CHECK-NEXT: affine.load %{{.}}[%{{.}} + 62, %{{.*}}] : memref<382x446xf32, 2>		// CHECK-NEXT: affine.load %{{.}}[%{{.}} + 62, %{{.*}}] : memref<382x446xf32, 2>
// CHECK-NEXT: affine.apply [[MAP_PLUS_128]](%{{.*}})		// CHECK: affine.store %{{.}}, %{{.}}[%{{.}}, %{{.}} + 190] : memref<382x446xf32, 2>
// CHECK-NEXT: affine.apply [[MAP_PLUS_192]](%{{.*}})
// CHECK-NEXT: affine.store %{{.}}, %{{.}}[%{{.}}, %{{.}} + 190] : memref<382x446xf32, 2>
// CHECK-NEXT: affine.store %{{.}}, %{{.}}[%{{.}} + 126, %{{.}}] : memref<382x446xf32, 2>		// CHECK-NEXT: affine.store %{{.}}, %{{.}}[%{{.}} + 126, %{{.}}] : memref<382x446xf32, 2>
// CHECK-NEXT: }		// CHECK-NEXT: }
// CHECK-NEXT: }		// CHECK-NEXT: }
// CHECK-NEXT: affine.dma_start %{{.}}[%{{.}}, %{{.}}], %{{.}}[%{{.}}, %{{.}}], %{{.}}[%{{.}}], %{{.}}, %{{.}}, %{{.*}} : memref<382x446xf32, 2>, memref<512x512xf32>, memref<1xi32>		// CHECK-NEXT: affine.dma_start %{{.}}[%{{.}}, %{{.}}], %{{.}}[%{{.}}, %{{.}}], %{{.}}[%{{.}}], %{{.}}, %{{.}}, %{{.*}} : memref<382x446xf32, 2>, memref<512x512xf32>, memref<1xi32>
// CHECK-NEXT: affine.dma_wait %{{.}}[%{{.}}], %{{.*}} : memref<1xi32>		// CHECK-NEXT: affine.dma_wait %{{.}}[%{{.}}], %{{.*}} : memref<1xi32>
// CHECK-NEXT: dealloc %{{.*}} : memref<1xi32>		// CHECK-NEXT: dealloc %{{.*}} : memref<1xi32>
// CHECK-NEXT: dealloc %{{.*}} : memref<1xi32>		// CHECK-NEXT: dealloc %{{.*}} : memref<1xi32>
// CHECK-NEXT: dealloc %{{.*}} : memref<382x446xf32, 2>		// CHECK-NEXT: dealloc %{{.*}} : memref<382x446xf32, 2>
▲ Show 20 Lines • Show All 84 Lines • ▼ Show 20 Lines	affine.for %i2 = affine_map<(d0) -> (d0)>(%i0) to affine_map<(d0) -> (d0 + 4)>(%i0) {
affine.store %0, %arg0[%i2] : memref<1027xf32>		affine.store %0, %arg0[%i2] : memref<1027xf32>
}		}
}		}
return		return
}		}
// CHECK: [[BUF:%[0-9]+]] = alloc() : memref<1027xf32, 2>		// CHECK: [[BUF:%[0-9]+]] = alloc() : memref<1027xf32, 2>
// CHECK-NEXT: [[MEM:%[0-9]+]] = alloc() : memref<1xi32>		// CHECK-NEXT: [[MEM:%[0-9]+]] = alloc() : memref<1xi32>
// CHECK-NEXT: affine.for %{{.*}} = 0 to 1024 {		// CHECK-NEXT: affine.for %{{.*}} = 0 to 1024 {
// CHECK-NEXT: affine.for %{{.}} = {{#map[0-9]+}}(%{{.}}) to {{#map[0-9]+}}(%{{.*}}) {		// CHECK-NEXT: affine.for %[[I2:.]] = {{#map[0-9]+}}(%{{.}}) to {{#map[0-9]+}}(%{{.*}}) {
// CHECK-NEXT: constant 0.000000e+00 : f32		// CHECK: affine.store %{{.*}}, [[BUF]][%[[I2]]] : memref<1027xf32, 2>
// CHECK-NEXT: affine.store %{{.}}, [[BUF]][%{{.}}] : memref<1027xf32, 2>
// CHECK-NEXT: }		// CHECK-NEXT: }
// CHECK-NEXT: }		// CHECK-NEXT: }
// CHECK-NEXT: affine.dma_start [[BUF]][%{{.}}], %{{.}}[%{{.}}], [[MEM]][%{{.}}], %{{.*}} : memref<1027xf32, 2>, memref<1027xf32>, memref<1xi32>		// CHECK-NEXT: affine.dma_start [[BUF]][%{{.}}], %{{.}}[%{{.}}], [[MEM]][%{{.}}], %{{.*}} : memref<1027xf32, 2>, memref<1027xf32>, memref<1xi32>
// CHECK-NEXT: affine.dma_wait [[MEM]][%{{.}}], %{{.}} : memref<1xi32>		// CHECK-NEXT: affine.dma_wait [[MEM]][%{{.}}], %{{.}} : memref<1xi32>

// -----		// -----

// CHECK-DAG: [[MAP_READ_OFFSET:#map[0-9]+]] = affine_map<(d0) -> (d0 + 100)>
// CHECK-DAG: [[MAP_WRITE_OFFSET:#map[0-9]+]] = affine_map<(d0) -> (d0 + 25)>

func @test_read_write_region_union() {		func @test_read_write_region_union() {
%0 = alloc() : memref<256xf32>		%0 = alloc() : memref<256xf32>
affine.for %i0 = 0 to 10 {		affine.for %i0 = 0 to 10 {
// memref dims: [0, 256)		// memref dims: [0, 256)
// read region: [100, 110)		// read region: [100, 110)
// write region: [25, 35)		// write region: [25, 35)
// union region: [25, 110)		// union region: [25, 110)
%a0 = affine.apply affine_map<(d0) -> (d0 + 100)>(%i0)		%a0 = affine.apply affine_map<(d0) -> (d0 + 100)>(%i0)
%a1 = affine.apply affine_map<(d0) -> (d0 + 25)>(%i0)		%a1 = affine.apply affine_map<(d0) -> (d0 + 25)>(%i0)
%1 = affine.load %0[%a0] : memref<256xf32>		%1 = affine.load %0[%a0] : memref<256xf32>
affine.store %1, %0[%a1] : memref<256xf32>		affine.store %1, %0[%a1] : memref<256xf32>
}		}
return		return
}		}

// CHECK: alloc() : memref<256xf32>		// CHECK: alloc() : memref<256xf32>
// CHECK-NEXT: alloc() : memref<85xf32, 2>		// CHECK-NEXT: alloc() : memref<85xf32, 2>
// CHECK-NEXT: alloc() : memref<1xi32>		// CHECK-NEXT: alloc() : memref<1xi32>
// CHECK-NEXT: affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}} : memref<256xf32>, memref<85xf32, 2>, memref<1xi32>		// CHECK-NEXT: affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}} : memref<256xf32>, memref<85xf32, 2>, memref<1xi32>
// CHECK-NEXT: affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>		// CHECK-NEXT: affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>
// CHECK-NEXT: alloc() : memref<1xi32>		// CHECK-NEXT: alloc() : memref<1xi32>
// CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {		// CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
// CHECK-NEXT: affine.apply [[MAP_READ_OFFSET]](%{{.*}})		// CHECK: affine.load %{{.*}}[%{{.*}} + 75] : memref<85xf32, 2>
// CHECK-NEXT: affine.apply [[MAP_WRITE_OFFSET]](%{{.*}})
// CHECK-NEXT: affine.load %{{.*}}[%{{.*}} + 75] : memref<85xf32, 2>
// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<85xf32, 2>		// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<85xf32, 2>
// CHECK-NEXT: }		// CHECK-NEXT: }
// CHECK-NEXT: affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}} : memref<85xf32, 2>, memref<256xf32>, memref<1xi32>		// CHECK-NEXT: affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}} : memref<85xf32, 2>, memref<256xf32>, memref<1xi32>
// CHECK-NEXT: affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>		// CHECK-NEXT: affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>

// -----		// -----

// This should create a buffer of size 2 for %arg2.		// This should create a buffer of size 2 for %arg2.
▲ Show 20 Lines • Show All 139 Lines • Show Last 20 Lines

mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp

//===- TestAffineDataCopy.cpp - Test affine data copy utility -------------===//		//===- TestAffineDataCopy.cpp - Test affine data copy utility -------------===//
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
//		//
// This file implements a pass to test affine data copy utility functions and		// This file implements a pass to test affine data copy utility functions and
// options.		// options.
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "mlir/Analysis/Utils.h"		#include "mlir/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"		#include "mlir/Dialect/Affine/IR/AffineOps.h"
		#include "mlir/IR/PatternMatch.h"
#include "mlir/Pass/Pass.h"		#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopUtils.h"		#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Passes.h"		#include "mlir/Transforms/Passes.h"

#define PASS_NAME "test-affine-data-copy"		#define PASS_NAME "test-affine-data-copy"

using namespace mlir;		using namespace mlir;

▲ Show 20 Lines • Show All 47 Lines • ▼ Show 20 Lines	if (clMemRefFilter || clTestGenerateCopyForMemRegion) {
}		}
}		}

AffineCopyOptions copyOptions = {/*generateDma=*/false,		AffineCopyOptions copyOptions = {/*generateDma=*/false,
/*slowMemorySpace=*/0,		/*slowMemorySpace=*/0,
/*fastMemorySpace=*/0,		/*fastMemorySpace=*/0,
/*tagMemorySpace=*/0,		/*tagMemorySpace=*/0,
/*fastMemCapacityBytes=*/32 * 1024 * 1024UL};		/*fastMemCapacityBytes=*/32 * 1024 * 1024UL};
if (clMemRefFilter) {
DenseSet<Operation *> copyNests;		DenseSet<Operation *> copyNests;
		if (clMemRefFilter) {
affineDataCopyGenerate(loopNest, copyOptions, load.getMemRef(), copyNests);		affineDataCopyGenerate(loopNest, copyOptions, load.getMemRef(), copyNests);
} else if (clTestGenerateCopyForMemRegion) {		} else if (clTestGenerateCopyForMemRegion) {
CopyGenerateResult result;		CopyGenerateResult result;
MemRefRegion region(loopNest.getLoc());		MemRefRegion region(loopNest.getLoc());
region.compute(load, /*loopDepth=*/0);		region.compute(load, /*loopDepth=*/0);
generateCopyForMemRegion(region, loopNest, copyOptions, result);		generateCopyForMemRegion(region, loopNest, copyOptions, result);
}		}

		// Promote any single iteration loops in the copy nests.
		for (auto nest : copyNests)
		nest->walk([](AffineForOp forOp) { promoteIfSingleIteration(forOp); });

		// Promoting single iteration loops could lead to simplification
		// of load's/store's. We will run the canonicalization patterns again.
		OwningRewritePatternList patterns;
		AffineLoadOp::getCanonicalizationPatterns(patterns, &getContext());
		AffineStoreOp::getCanonicalizationPatterns(patterns, &getContext());
		applyPatternsGreedily(getFunction(), std::move(patterns));
}		}

namespace mlir {		namespace mlir {
void registerTestAffineDataCopyPass() {		void registerTestAffineDataCopyPass() {
PassRegistration<TestAffineDataCopy>(		PassRegistration<TestAffineDataCopy>(
PASS_NAME, "Tests affine data copy utility functions.");		PASS_NAME, "Tests affine data copy utility functions.");
}		}
} // namespace mlir		} // namespace mlir

This is an archive of the discontinued LLVM Phabricator instance.

[MLIR] fix/update affine data copy utility for max/min bounds
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 255614

mlir/include/mlir/Analysis/AffineStructures.h

mlir/include/mlir/Analysis/Utils.h

mlir/lib/Analysis/AffineStructures.cpp

mlir/lib/Analysis/Utils.cpp

mlir/lib/Dialect/Affine/Transforms/AffineDataCopyGeneration.cpp

mlir/lib/Transforms/Utils/LoopUtils.cpp

mlir/test/Dialect/Affine/affine-data-copy.mlir

mlir/test/Dialect/Affine/dma-generate.mlir

mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp

This is an archive of the discontinued LLVM Phabricator instance.

[MLIR] fix/update affine data copy utility for max/min bounds
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 255614

mlir/include/mlir/Analysis/AffineStructures.h

mlir/include/mlir/Analysis/Utils.h

mlir/lib/Analysis/AffineStructures.cpp

mlir/lib/Analysis/Utils.cpp

mlir/lib/Dialect/Affine/Transforms/AffineDataCopyGeneration.cpp

mlir/lib/Transforms/Utils/LoopUtils.cpp

mlir/test/Dialect/Affine/affine-data-copy.mlir

mlir/test/Dialect/Affine/dma-generate.mlir

mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp

[MLIR] fix/update affine data copy utility for max/min bounds
ClosedPublic