Diff 272086

mlir/lib/Dialect/SCF/Transforms/ParallelLoopTiling.cpp

Show All 24 Lines
/// Tile a parallel loop of the form		/// Tile a parallel loop of the form
/// scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)		/// scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
/// step (%arg4, %arg5)		/// step (%arg4, %arg5)
///		///
/// into		/// into
/// scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)		/// scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
/// step (%arg4*tileSize[0],		/// step (%arg4*tileSize[0],
/// %arg5*tileSize[1])		/// %arg5*tileSize[1])
/// scf.parallel (%j0, %j1) = (0, 0) to (min(tileSize[0], %arg2-%i0)		/// scf.parallel (%j0, %j1) = (0, 0) to (min(%arg4*tileSize[0], %arg2-%i0)
/// min(tileSize[1], %arg3-%i1))		/// min(%arg5*tileSize[1], %arg3-%i1))
/// step (%arg4, %arg5)		/// step (%arg4, %arg5)
///		///
/// where the uses of %i0 and %i1 in the loop body are replaced by		/// where the uses of %i0 and %i1 in the loop body are replaced by
/// %i0 + j0 and %i1 + %j1.		/// %i0 + j0 and %i1 + %j1.
//		//
/// The old loop is replaced with the new one.		/// The old loop is replaced with the new one.
void mlir::scf::tileParallelLoop(ParallelOp op, ArrayRef<int64_t> tileSizes) {		void mlir::scf::tileParallelLoop(ParallelOp op, ArrayRef<int64_t> tileSizes) {
OpBuilder b(op);		OpBuilder b(op);
auto zero = b.create<ConstantIndexOp>(op.getLoc(), 0);		auto zero = b.create<ConstantIndexOp>(op.getLoc(), 0);
SmallVector<Value, 2> tileSizeConstants;		SmallVector<Value, 2> tileSizeConstants;
tileSizeConstants.reserve(op.upperBound().size());		tileSizeConstants.reserve(op.upperBound().size());
for (size_t i = 0, end = op.upperBound().size(); i != end; ++i) {		for (size_t i = 0, end = op.upperBound().size(); i != end; ++i) {
if (i < tileSizes.size())		if (i < tileSizes.size())
tileSizeConstants.push_back(		tileSizeConstants.push_back(
b.create<ConstantIndexOp>(op.getLoc(), tileSizes[i]));		b.create<ConstantIndexOp>(op.getLoc(), tileSizes[i]));
else		else
// Just pick 1 for the remaining dimensions.		// Just pick 1 for the remaining dimensions.
tileSizeConstants.push_back(b.create<ConstantIndexOp>(op.getLoc(), 1));		tileSizeConstants.push_back(b.create<ConstantIndexOp>(op.getLoc(), 1));
}		}

// Create the outer loop with adjusted steps.		// Create the outer loop with adjusted steps.
SmallVector<Value, 2> newSteps;		SmallVector<Value, 2> newSteps;
newSteps.reserve(op.step().size());		newSteps.reserve(op.step().size());
for (auto step : llvm::zip(op.step(), tileSizeConstants)) {		for (auto step : llvm::zip(op.step(), tileSizeConstants)) {
		herhut [Unsubmitted, Done]: Why do you dislike the `llvm::zip`?
newSteps.push_back(		newSteps.push_back(
b.create<MulIOp>(op.getLoc(), std::get<0>(step), std::get<1>(step)));		b.create<MulIOp>(op.getLoc(), std::get<0>(step), std::get<1>(step)));
}		}
auto outerLoop = b.create<ParallelOp>(op.getLoc(), op.lowerBound(),		auto outerLoop = b.create<ParallelOp>(op.getLoc(), op.lowerBound(),
op.upperBound(), newSteps);		op.upperBound(), newSteps);
b.setInsertionPointToStart(outerLoop.getBody());		b.setInsertionPointToStart(outerLoop.getBody());

// Compute min(size, dim - offset) to avoid out-of-bounds accesses.		// Compute min(size, dim - offset) to avoid out-of-bounds accesses.
// FIXME: Instead of using min, we want to replicate the tail. This would give		// FIXME: Instead of using min, we want to replicate the tail. This would give
// the inner loop constant bounds for easy vectorization.		// the inner loop constant bounds for easy vectorization.
auto minMap = AffineMap::get(		auto minMap = AffineMap::get(
/*dimCount=*/3, /*symbolCount=*/0,		/*dimCount=*/3, /*symbolCount=*/0,
{getAffineDimExpr(/*position=*/0, b.getContext()),		{getAffineDimExpr(/*position=*/0, b.getContext()),
getAffineDimExpr(/*position=*/1, b.getContext()) -		getAffineDimExpr(/*position=*/1, b.getContext()) -
getAffineDimExpr(/*position=*/2, b.getContext())},		getAffineDimExpr(/*position=*/2, b.getContext())},
b.getContext());		b.getContext());

// Create the inner loop with adjusted bounds.		// Create the inner loop with adjusted bounds.
SmallVector<Value, 2> newBounds;		SmallVector<Value, 2> newBounds;
newBounds.reserve(op.upperBound().size());		newBounds.reserve(op.upperBound().size());
for (auto bounds : llvm::zip(tileSizeConstants, outerLoop.upperBound(),		for (auto dim : llvm::zip(outerLoop.lowerBound(), outerLoop.upperBound(),
outerLoop.getInductionVars())) {		outerLoop.step(), outerLoop.getInductionVars(),
		herhut [Unsubmitted, Done]: This optimization makes sense here, as it is hard to recover this out of the generated affine min expression. But the comment could explain this better. Something like: If we statically know the size of the outer loops iteration and it is divisible by the tiling factor, we can use a static bound for the inner loop. Otherwise, we have to dynamically compute the bound for each iteration of the outer loop.
newBounds.push_back(b.create<AffineMinOp>(		op.step(), tileSizeConstants)) {
op.getLoc(), b.getIndexType(), minMap,		Value lowerBound, upperBound, newStep, iv, step, tileSizeConstant;
ValueRange{std::get<0>(bounds), std::get<1>(bounds),		std::tie(lowerBound, upperBound, newStep, iv, step, tileSizeConstant) = dim;
		ftynse [Unsubmitted, Done]: I could not parse this comment.
std::get<2>(bounds)}));		// Collect the statically known loop bounds
		auto lowerBoundConstant =
		dyn_cast_or_null<ConstantIndexOp>(lowerBound.getDefiningOp());
		auto upperBoundConstant =
		dyn_cast_or_null<ConstantIndexOp>(upperBound.getDefiningOp());
		herhut [Unsubmitted, Done]: Why not zip over `op.step` and `tileSizeConstants` to get these two? That avoids the pattern matching. You can then assign all of them at the beginning of the loop to give them local names. Something like: Value lowerBound, upperBound, originalStep, newStep, index; std::tie...
		auto stepConstant = dyn_cast_or_null<ConstantIndexOp>(step.getDefiningOp());
		auto tileSize =
		cast<ConstantIndexOp>(tileSizeConstant.getDefiningOp()).getValue();
		// If we statically know the size of the outer loops iteration and it is
		bondhugula [Unsubmitted, Done]: Please rephrase this: "size of the outer loops iteration" isn't meaningful. Also, you are only considering the case of lb, ub, step being constant, which isn't the same as trip counts being statically known/constant (e.g., %i = %N to %N + 16).
		// divisible by the tiling factor, we can use a static bound for the inner
		// loop. Otherwise, we have to dynamically compute the bound for each
		// iteration of the outer loop.
		if (lowerBoundConstant && upperBoundConstant && stepConstant &&
		(upperBoundConstant.getValue() - lowerBoundConstant.getValue()) %
		(stepConstant.getValue() * tileSize) ==
		rriddle [Unsubmitted, Done]: This is an extremely complex condition, please add braces and/or preferably split the conditions.
		0)
		newBounds.push_back(newStep);
		else
		newBounds.push_back(
		rriddle [Unsubmitted, Done]: Add braces here, this if is not trivial.
		b.create<AffineMinOp>(op.getLoc(), b.getIndexType(), minMap,
		ValueRange{newStep, upperBound, iv}));
		herhut [Unsubmitted, Done]: You have assigned names to these `std::get` previously by assigning to locals. Why not use them here?
}		}
auto innerLoop = b.create<ParallelOp>(		auto innerLoop = b.create<ParallelOp>(
op.getLoc(), SmallVector<Value, 2>(newBounds.size(), zero), newBounds,		op.getLoc(), SmallVector<Value, 2>(newBounds.size(), zero), newBounds,
op.step());		op.step());

// Steal the body of the old parallel loop and erase it.		// Steal the body of the old parallel loop and erase it.
innerLoop.region().takeBody(op.region());		innerLoop.region().takeBody(op.region());

// Insert computation for new index vectors and replace uses.		// Insert computation for new index vectors and replace uses.
b.setInsertionPointToStart(innerLoop.getBody());		b.setInsertionPointToStart(innerLoop.getBody());
for (auto ivs :		for (auto ivs :
llvm::zip(innerLoop.getInductionVars(), outerLoop.getInductionVars())) {		llvm::zip(innerLoop.getInductionVars(), outerLoop.getInductionVars())) {
Value inner_index = std::get<0>(ivs);		Value inner_index = std::get<0>(ivs);
AddIOp newIndex =		AddIOp newIndex =
b.create<AddIOp>(op.getLoc(), std::get<0>(ivs), std::get<1>(ivs));		b.create<AddIOp>(op.getLoc(), std::get<0>(ivs), std::get<1>(ivs));
inner_index.replaceAllUsesExcept(		inner_index.replaceAllUsesExcept(
newIndex, SmallPtrSet<Operation *, 1>{newIndex.getOperation()});		newIndex, SmallPtrSet<Operation *, 1>{newIndex.getOperation()});
}		}
		ftynse [Unsubmitted, Done]: Maybe we should consider a separate "canonicalization" pass (or an actual canonicalization) that removes single-iteration loops completely. There is one on affine loops.

op.erase();		op.erase();
		herhut [Unsubmitted, Done]: I would rather have this as a separate canonicalization pattern that removes parallel loops with trip-count 1.
}		}

/// Get a list of most nested parallel loops. Assumes that ParallelOps are only		/// Get a list of most nested parallel loops. Assumes that ParallelOps are only
/// directly nested.		/// directly nested.
static bool getInnermostNestedLoops(Block *block,		static bool getInnermostNestedLoops(Block *block,
SmallVectorImpl<ParallelOp> &loops) {		SmallVectorImpl<ParallelOp> &loops) {
bool hasInnerLoop = false;		bool hasInnerLoop = false;
for (auto parallelOp : block->getOps<ParallelOp>()) {		for (auto parallelOp : block->getOps<ParallelOp>()) {
Show All 13 Lines	struct ParallelLoopTiling
}		}

void runOnFunction() override {		void runOnFunction() override {
SmallVector<ParallelOp, 2> mostNestedParallelOps;		SmallVector<ParallelOp, 2> mostNestedParallelOps;
for (Block &block : getFunction()) {		for (Block &block : getFunction()) {
getInnermostNestedLoops(&block, mostNestedParallelOps);		getInnermostNestedLoops(&block, mostNestedParallelOps);
}		}
for (ParallelOp pLoop : mostNestedParallelOps) {		for (ParallelOp pLoop : mostNestedParallelOps) {
		// FIXME: Add reduction support.
		if (pLoop.getNumReductions() == 0)
tileParallelLoop(pLoop, tileSizes);		tileParallelLoop(pLoop, tileSizes);
}		}
}		}
};		};
} // namespace		} // namespace

std::unique_ptr<Pass>		std::unique_ptr<Pass>
mlir::createParallelLoopTilingPass(ArrayRef<int64_t> tileSizes) {		mlir::createParallelLoopTilingPass(ArrayRef<int64_t> tileSizes) {
return std::make_unique<ParallelLoopTiling>(tileSizes);		return std::make_unique<ParallelLoopTiling>(tileSizes);
}		}

mlir/test/Dialect/SCF/parallel-loop-tiling.mlir

Show All 9 Lines	scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3) step (%arg4, %arg5) {
%sum_elem = addf %B_elem, %C_elem : f32		%sum_elem = addf %B_elem, %C_elem : f32
store %sum_elem, %result[%i0, %i1] : memref<?x?xf32>		store %sum_elem, %result[%i0, %i1] : memref<?x?xf32>
}		}
return		return
}		}

// CHECK: #map0 = affine_map<(d0, d1, d2) -> (d0, d1 - d2)>		// CHECK: #map0 = affine_map<(d0, d1, d2) -> (d0, d1 - d2)>
// CHECK-LABEL: func @parallel_loop(		// CHECK-LABEL: func @parallel_loop(
// CHECK-SAME: [[VAL_0:%.]]: index, [[VAL_1:%.]]: index, [[VAL_2:%.]]: index, [[VAL_3:%.]]: index, [[VAL_4:%.]]: index, [[VAL_5:%.]]: index, [[VAL_6:%.]]: memref<?x?xf32>, [[VAL_7:%.]]: memref<?x?xf32>, [[VAL_8:%.]]: memref<?x?xf32>, [[VAL_9:%.]]: memref<?x?xf32>) {		// CHECK-SAME: [[ARG1:%.]]: index, [[ARG2:%.]]: index, [[ARG3:%.]]: index, [[ARG4:%.]]: index, [[ARG5:%.]]: index, [[ARG6:%.]]: index, [[ARG7:%.]]: memref<?x?xf32>, [[ARG8:%.]]: memref<?x?xf32>, [[ARG9:%.]]: memref<?x?xf32>, [[ARG10:%.]]: memref<?x?xf32>) {
// CHECK: [[VAL_10:%.*]] = constant 0 : index		// CHECK: [[C0:%.*]] = constant 0 : index
// CHECK: [[VAL_11:%.*]] = constant 1 : index		// CHECK: [[C1:%.*]] = constant 1 : index
// CHECK: [[VAL_12:%.*]] = constant 4 : index		// CHECK: [[C4:%.*]] = constant 4 : index
// CHECK: [[VAL_13:%.*]] = muli [[VAL_4]], [[VAL_11]] : index		// CHECK: [[V1:%.*]] = muli [[ARG5]], [[C1]] : index
// CHECK: [[VAL_14:%.*]] = muli [[VAL_5]], [[VAL_12]] : index		// CHECK: [[V2:%.*]] = muli [[ARG6]], [[C4]] : index
// CHECK: scf.parallel ([[VAL_15:%.]], [[VAL_16:%.]]) = ([[VAL_0]], [[VAL_1]]) to ([[VAL_2]], [[VAL_3]]) step ([[VAL_13]], [[VAL_14]]) {		// CHECK: scf.parallel ([[V3:%.]], [[V4:%.]]) = ([[ARG1]], [[ARG2]]) to ([[ARG3]], [[ARG4]]) step ([[V1]], [[V2]]) {
// CHECK: [[VAL_17:%.*]] = affine.min #map0([[VAL_11]], [[VAL_2]], [[VAL_15]])		// CHECK: [[V5:%.*]] = affine.min #map0([[V1]], [[ARG3]], [[V3]])
// CHECK: [[VAL_18:%.*]] = affine.min #map0([[VAL_12]], [[VAL_3]], [[VAL_16]])		// CHECK: [[V6:%.*]] = affine.min #map0([[V2]], [[ARG4]], [[V4]])
// CHECK: scf.parallel ([[VAL_19:%.]], [[VAL_20:%.]]) = ([[VAL_10]], [[VAL_10]]) to ([[VAL_17]], [[VAL_18]]) step ([[VAL_4]], [[VAL_5]]) {		// CHECK: scf.parallel ([[V7:%.]], [[V8:%.]]) = ([[C0]], [[C0]]) to ([[V5]], [[V6]]) step ([[ARG5]], [[ARG6]]) {
// CHECK: [[VAL_21:%.*]] = addi [[VAL_19]], [[VAL_15]] : index		// CHECK: [[V9:%.*]] = addi [[V7]], [[V3]] : index
// CHECK: [[VAL_22:%.*]] = addi [[VAL_20]], [[VAL_16]] : index		// CHECK: [[V10:%.*]] = addi [[V8]], [[V4]] : index
// CHECK: [[VAL_23:%.*]] = load [[VAL_7]]{{\[}}[[VAL_21]], [[VAL_22]]] : memref<?x?xf32>		// CHECK: [[V11:%.*]] = load [[ARG8]]{{\[}}[[V9]], [[V10]]] : memref<?x?xf32>
// CHECK: [[VAL_24:%.*]] = load [[VAL_8]]{{\[}}[[VAL_21]], [[VAL_22]]] : memref<?x?xf32>		// CHECK: [[V12:%.*]] = load [[ARG9]]{{\[}}[[V9]], [[V10]]] : memref<?x?xf32>
// CHECK: [[VAL_25:%.*]] = addf [[VAL_23]], [[VAL_24]] : f32		// CHECK: [[V13:%.*]] = addf [[V11]], [[V12]] : f32
// CHECK: store [[VAL_25]], [[VAL_9]]{{\[}}[[VAL_21]], [[VAL_22]]] : memref<?x?xf32>		// CHECK: store [[V13]], [[ARG10]]{{\[}}[[V9]], [[V10]]] : memref<?x?xf32>
		// CHECK: }
		// CHECK: }
		// CHECK: return

		// -----

		func @static_loop_with_step() {
		%c0 = constant 0 : index
		%c3 = constant 3 : index
		%c24 = constant 24 : index
		scf.parallel (%i0, %i1) = (%c0, %c0) to (%c24, %c24) step (%c3, %c3) {
		}
		return
		}

		// CHECK-LABEL: func @static_loop_with_step() {
		// CHECK: [[C0:%.*]] = constant 0 : index
		// CHECK: [[C3:%.*]] = constant 3 : index
		// CHECK: [[C24:%.*]] = constant 24 : index
		// CHECK: [[C0_1:%.*]] = constant 0 : index
		bondhugula [Unsubmitted, Done]: Please avoid using VAL_<running_number> - instead C0, C3, C0_1, etc. for readability. A substitution at this point.
		// CHECK: [[C1:%.*]] = constant 1 : index
		// CHECK: [[C4:%.*]] = constant 4 : index
		// CHECK: [[V1:%.*]] = muli [[C3]], [[C1]] : index
		// CHECK: [[V2:%.*]] = muli [[C3]], [[C4]] : index
		// CHECK: scf.parallel ([[V3:%.]], [[V4:%.]]) = ([[C0]], [[C0]]) to ([[C24]], [[C24]]) step ([[V1]], [[V2]]) {
		// CHECK: scf.parallel ([[V5:%.]], [[V6:%.]]) = ([[C0_1]], [[C0_1]]) to ([[V1]], [[V2]]) step ([[C3]], [[C3]]) {
		// CHECK: = addi [[V5]], [[V3]] : index
		// CHECK: = addi [[V6]], [[V4]] : index
// CHECK: }		// CHECK: }
// CHECK: }		// CHECK: }
// CHECK: return		// CHECK: return

// -----		// -----

func @tile_nested_innermost() {		func @tile_nested_innermost() {
%c2 = constant 2 : index		%c2 = constant 2 : index
%c0 = constant 0 : index		%c0 = constant 0 : index
%c1 = constant 1 : index		%c1 = constant 1 : index
scf.parallel (%i, %j) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) {		scf.parallel (%i, %j) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) {
scf.parallel (%k, %l) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) {		scf.parallel (%k, %l) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) {
}		}
}		}
scf.parallel (%i, %j) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) {		scf.parallel (%i, %j) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) {
}		}
return		return
}		}

// CHECK-LABEL: func @tile_nested_innermost() {		// CHECK-LABEL: func @tile_nested_innermost() {
// CHECK: [[VAL_24:%.*]] = constant 2 : index		// CHECK: [[C2:%.*]] = constant 2 : index
// CHECK: [[VAL_25:%.*]] = constant 0 : index		// CHECK: [[C0:%.*]] = constant 0 : index
// CHECK: [[VAL_26:%.*]] = constant 1 : index		// CHECK: [[C1:%.*]] = constant 1 : index
// CHECK: scf.parallel ([[VAL_27:%.]], [[VAL_28:%.]]) = ([[VAL_25]], [[VAL_25]]) to ([[VAL_24]], [[VAL_24]]) step ([[VAL_26]], [[VAL_26]]) {		// CHECK: scf.parallel ([[V1:%.]], [[V2:%.]]) = ([[C0]], [[C0]]) to ([[C2]], [[C2]]) step ([[C1]], [[C1]]) {
// CHECK: [[VAL_29:%.*]] = constant 0 : index		// CHECK: [[C0_1:%.*]] = constant 0 : index
// CHECK: [[VAL_30:%.*]] = constant 1 : index		// CHECK: [[C1_1:%.*]] = constant 1 : index
// CHECK: [[VAL_31:%.*]] = constant 4 : index		// CHECK: [[C4:%.*]] = constant 4 : index
// CHECK: [[VAL_32:%.*]] = muli [[VAL_26]], [[VAL_30]] : index		// CHECK: [[V3:%.*]] = muli [[C1]], [[C1_1]] : index
// CHECK: [[VAL_33:%.*]] = muli [[VAL_26]], [[VAL_31]] : index		// CHECK: [[V4:%.*]] = muli [[C1]], [[C4]] : index
// CHECK: scf.parallel ([[VAL_34:%.]], [[VAL_35:%.]]) = ([[VAL_25]], [[VAL_25]]) to ([[VAL_24]], [[VAL_24]]) step ([[VAL_32]], [[VAL_33]]) {		// CHECK: scf.parallel ([[V5:%.]], [[V6:%.]]) = ([[C0]], [[C0]]) to ([[C2]], [[C2]]) step ([[V3]], [[V4]]) {
// CHECK: [[VAL_36:%.*]] = affine.min #map0([[VAL_30]], [[VAL_24]], [[VAL_34]])		// CHECK: [[V7:%.*]] = affine.min #map0([[V4]], [[C2]], [[V6]])
// CHECK: [[VAL_37:%.*]] = affine.min #map0([[VAL_31]], [[VAL_24]], [[VAL_35]])		// CHECK: scf.parallel ([[V8:%.]], [[V9:%.]]) = ([[C0_1]], [[C0_1]]) to ([[V3]], [[V7]]) step ([[C1]], [[C1]]) {
// CHECK: scf.parallel ([[VAL_38:%.]], [[VAL_39:%.]]) = ([[VAL_29]], [[VAL_29]]) to ([[VAL_36]], [[VAL_37]]) step ([[VAL_26]], [[VAL_26]]) {		// CHECK: = addi [[V8]], [[V5]] : index
		// CHECK: = addi [[V9]], [[V6]] : index
// CHECK: }		// CHECK: }
// CHECK: }		// CHECK: }
// CHECK: }		// CHECK: }
// CHECK: [[VAL_40:%.*]] = constant 0 : index		// CHECK: [[C0_2:%.*]] = constant 0 : index
// CHECK: [[VAL_41:%.*]] = constant 1 : index		// CHECK: [[C1_2:%.*]] = constant 1 : index
// CHECK: [[VAL_42:%.*]] = constant 4 : index		// CHECK: [[C4_1:%.*]] = constant 4 : index
// CHECK: [[VAL_43:%.*]] = muli [[VAL_26]], [[VAL_41]] : index		// CHECK: [[V10:%.*]] = muli [[C1]], [[C1_2]] : index
// CHECK: [[VAL_44:%.*]] = muli [[VAL_26]], [[VAL_42]] : index		// CHECK: [[V11:%.*]] = muli [[C1]], [[C4_1]] : index
// CHECK: scf.parallel ([[VAL_45:%.]], [[VAL_46:%.]]) = ([[VAL_25]], [[VAL_25]]) to ([[VAL_24]], [[VAL_24]]) step ([[VAL_43]], [[VAL_44]]) {		// CHECK: scf.parallel ([[V12:%.]], [[V13:%.]]) = ([[C0]], [[C0]]) to ([[C2]], [[C2]]) step ([[V10]], [[V11]]) {
// CHECK: [[VAL_47:%.*]] = affine.min #map0([[VAL_41]], [[VAL_24]], [[VAL_45]])		// CHECK: [[V14:%.*]] = affine.min #map0([[V11]], [[C2]], [[V13]])
// CHECK: [[VAL_48:%.*]] = affine.min #map0([[VAL_42]], [[VAL_24]], [[VAL_46]])		// CHECK: scf.parallel ([[V15:%.]], [[V16:%.]]) = ([[C0_2]], [[C0_2]]) to ([[V10]], [[V14]]) step ([[C1]], [[C1]]) {
// CHECK: scf.parallel ([[VAL_49:%.]], [[VAL_50:%.]]) = ([[VAL_40]], [[VAL_40]]) to ([[VAL_47]], [[VAL_48]]) step ([[VAL_26]], [[VAL_26]]) {		// CHECK: = addi [[V15]], [[V12]] : index
		// CHECK: = addi [[V16]], [[V13]] : index
// CHECK: }		// CHECK: }
// CHECK: }		// CHECK: }
// CHECK: return		// CHECK: return
// CHECK: }		// CHECK: }

This is an archive of the discontinued LLVM Phabricator instance.

[mlir] parallel loop tiling optimization for loops with static bounds
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 272086

mlir/lib/Dialect/SCF/Transforms/ParallelLoopTiling.cpp

mlir/test/Dialect/SCF/parallel-loop-tiling.mlir

This is an archive of the discontinued LLVM Phabricator instance.

[mlir] parallel loop tiling optimization for loops with static bounds
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 272086

mlir/lib/Dialect/SCF/Transforms/ParallelLoopTiling.cpp

mlir/test/Dialect/SCF/parallel-loop-tiling.mlir

[mlir] parallel loop tiling optimization for loops with static bounds
ClosedPublic