Diff 272002

mlir/include/mlir/Dialect/SCF/SCFOps.td

Show First 20 Lines • Show All 340 Lines • ▼ Show 20 Lines	def ParallelOp : SCF_Op<"parallel",

let extraClassDeclaration = [{		let extraClassDeclaration = [{
ValueRange getInductionVars() {		ValueRange getInductionVars() {
return getBody()->getArguments();		return getBody()->getArguments();
}		}
unsigned getNumLoops() { return step().size(); }		unsigned getNumLoops() { return step().size(); }
unsigned getNumReductions() { return initVals().size(); }		unsigned getNumReductions() { return initVals().size(); }
}];		}];

		let hasCanonicalizer = 1;
}		}

def ReduceOp : SCF_Op<"reduce", [HasParent<"ParallelOp">]> {		def ReduceOp : SCF_Op<"reduce", [HasParent<"ParallelOp">]> {
let summary = "reduce operation for parallel for";		let summary = "reduce operation for parallel for";
let description = [{		let description = [{
"scf.reduce" is an operation occurring inside "scf.parallel" operations.		"scf.reduce" is an operation occurring inside "scf.parallel" operations.
It consists of one block with two arguments which have the same type as the		It consists of one block with two arguments which have the same type as the
operand of "scf.reduce".		operand of "scf.reduce".
▲ Show 20 Lines • Show All 82 Lines • Show Last 20 Lines

mlir/lib/Dialect/SCF/SCF.cpp

//===- SCF.cpp - Structured Control Flow Operations -----------------------===//		//===- SCF.cpp - Structured Control Flow Operations -----------------------===//
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "mlir/Dialect/SCF/SCF.h"		#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"		#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/AffineExpr.h"		#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"		#include "mlir/IR/AffineMap.h"
		#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/Builders.h"		#include "mlir/IR/Builders.h"
#include "mlir/IR/Function.h"		#include "mlir/IR/Function.h"
#include "mlir/IR/Matchers.h"		#include "mlir/IR/Matchers.h"
#include "mlir/IR/Module.h"		#include "mlir/IR/Module.h"
#include "mlir/IR/OpImplementation.h"		#include "mlir/IR/OpImplementation.h"
#include "mlir/IR/PatternMatch.h"		#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/StandardTypes.h"		#include "mlir/IR/StandardTypes.h"
#include "mlir/IR/Value.h"		#include "mlir/IR/Value.h"
		#include "mlir/Support/LLVM.h"
		#include "mlir/Support/LogicalResult.h"
#include "mlir/Support/MathExtras.h"		#include "mlir/Support/MathExtras.h"

using namespace mlir;		using namespace mlir;
using namespace mlir::scf;		using namespace mlir::scf;

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// SCFDialect		// SCFDialect
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
▲ Show 20 Lines • Show All 672 Lines • ▼ Show 20 Lines	ParallelOp mlir::scf::getParallelForInductionVarOwner(Value val) {
auto ivArg = val.dyn_cast<BlockArgument>();		auto ivArg = val.dyn_cast<BlockArgument>();
if (!ivArg)		if (!ivArg)
return ParallelOp();		return ParallelOp();
assert(ivArg.getOwner() && "unlinked block argument");		assert(ivArg.getOwner() && "unlinked block argument");
auto *containingOp = ivArg.getOwner()->getParentOp();		auto *containingOp = ivArg.getOwner()->getParentOp();
return dyn_cast<ParallelOp>(containingOp);		return dyn_cast<ParallelOp>(containingOp);
}		}

		namespace {
		/// Canonicalization pattern for scf.parallel that removes every loop
		/// dimension whose lower bound, upper bound and step are all defined by
		/// ConstantIndexOp and which executes exactly one iteration. Uses of the
		/// induction variable of such a dimension are replaced by its lower bound.
		///
		/// The pattern fails to match when no dimension can be removed, and also
		/// when every dimension would be removed.
		struct CollapseSingleIterationLoops : public OpRewritePattern<ParallelOp> {
		  using OpRewritePattern<ParallelOp>::OpRewritePattern;

		  LogicalResult matchAndRewrite(ParallelOp op,
		                                PatternRewriter &rewriter) const override {
		    BlockAndValueMapping mapping;
		    // Compute new loop bounds that omit all single-iteration loop dimensions.
		    SmallVector<Value, 2> newLowerBounds;
		    SmallVector<Value, 2> newUpperBounds;
		    SmallVector<Value, 2> newSteps;
		    newLowerBounds.reserve(op.lowerBound().size());
		    newUpperBounds.reserve(op.upperBound().size());
		    newSteps.reserve(op.step().size());
		    for (auto dim : llvm::zip(op.lowerBound(), op.upperBound(), op.step(),
		                              op.getBody()->getArguments())) {
		      Value lowerBoundVal = std::get<0>(dim);
		      Value upperBoundVal = std::get<1>(dim);
		      Value stepVal = std::get<2>(dim);
		      auto lowerBound =
		          dyn_cast_or_null<ConstantIndexOp>(lowerBoundVal.getDefiningOp());
		      auto upperBound =
		          dyn_cast_or_null<ConstantIndexOp>(upperBoundVal.getDefiningOp());
		      auto step = dyn_cast_or_null<ConstantIndexOp>(stepVal.getDefiningOp());

		      // A dimension runs exactly once only if its range is non-empty and
		      // does not exceed the step. The `range > 0` check is required: with
		      // an empty (or negative) range the loop has no iterations, and
		      // collapsing it would make the rewritten loop execute its body once.
		      bool isSingleIteration = false;
		      if (lowerBound && upperBound && step) {
		        int64_t range = upperBound.getValue() - lowerBound.getValue();
		        isSingleIteration = range > 0 && range <= step.getValue();
		      }

		      // Replace the loop induction variable by the lower bound if the loop
		      // performs a single iteration. Otherwise, copy the loop bounds.
		      if (isSingleIteration) {
		        mapping.map(std::get<3>(dim), lowerBoundVal);
		      } else {
		        newLowerBounds.push_back(lowerBoundVal);
		        newUpperBounds.push_back(upperBoundVal);
		        newSteps.push_back(stepVal);
		      }
		    }
		    // Exit if all or none of the loop dimensions performs a single iteration.
		    if (newLowerBounds.empty() ||
		        newLowerBounds.size() == op.lowerBound().size())
		      return failure();
		    // Replace the parallel loop by lower-dimensional parallel loop.
		    auto newOp =
		        rewriter.create<ParallelOp>(op.getLoc(), newLowerBounds, newUpperBounds,
		                                    newSteps, op.initVals(), nullptr);
		    // Clone the old body into the new op; the mapping substitutes the
		    // lower bounds for the induction variables of the removed dimensions.
		    rewriter.cloneRegionBefore(op.region(), newOp.region(),
		                               newOp.region().begin(), mapping);
		    rewriter.replaceOp(op, newOp.getResults());
		    return success();
		  }
		};
		} // namespace

		// Adds the canonicalization patterns of scf.parallel to `results`. The only
		// pattern inserted here is CollapseSingleIterationLoops, constructed with
		// the given `context`.
		void ParallelOp::getCanonicalizationPatterns(OwningRewritePatternList &results,
		MLIRContext *context) {
		results.insert<CollapseSingleIterationLoops>(context);
		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// ReduceOp		// ReduceOp
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

void ReduceOp::build(		void ReduceOp::build(
OpBuilder &builder, OperationState &result, Value operand,		OpBuilder &builder, OperationState &result, Value operand,
function_ref<void(OpBuilder &, Location, Value, Value)> bodyBuilderFn) {		function_ref<void(OpBuilder &, Location, Value, Value)> bodyBuilderFn) {
auto type = operand.getType();		auto type = operand.getType();
▲ Show 20 Lines • Show All 127 Lines • Show Last 20 Lines

mlir/lib/Dialect/SCF/Transforms/ParallelLoopTiling.cpp

Show All 24 Lines
/// Tile a parallel loop of the form		/// Tile a parallel loop of the form
/// scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)		/// scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
/// step (%arg4, %arg5)		/// step (%arg4, %arg5)
///		///
/// into		/// into
/// scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)		/// scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
/// step (%arg4*tileSize[0],		/// step (%arg4*tileSize[0],
/// %arg5*tileSize[1])		/// %arg5*tileSize[1])
/// scf.parallel (%j0, %j1) = (0, 0) to (min(tileSize[0], %arg2-%i0)		/// scf.parallel (%j0, %j1) = (0, 0) to (min(%arg4*tileSize[0], %arg2-%i0)
/// min(tileSize[1], %arg3-%i1))		/// min(%arg5*tileSize[1], %arg3-%i1))
/// step (%arg4, %arg5)		/// step (%arg4, %arg5)
///		///
/// where the uses of %i0 and %i1 in the loop body are replaced by		/// where the uses of %i0 and %i1 in the loop body are replaced by
/// %i0 + j0 and %i1 + %j1.		/// %i0 + j0 and %i1 + %j1.
//		//
/// The old loop is replaced with the new one.		/// The old loop is replaced with the new one.
void mlir::scf::tileParallelLoop(ParallelOp op, ArrayRef<int64_t> tileSizes) {		void mlir::scf::tileParallelLoop(ParallelOp op, ArrayRef<int64_t> tileSizes) {
OpBuilder b(op);		OpBuilder b(op);
auto zero = b.create<ConstantIndexOp>(op.getLoc(), 0);		auto zero = b.create<ConstantIndexOp>(op.getLoc(), 0);
SmallVector<Value, 2> tileSizeConstants;		SmallVector<Value, 2> tileSizeConstants;
tileSizeConstants.reserve(op.upperBound().size());		tileSizeConstants.reserve(op.upperBound().size());
for (size_t i = 0, end = op.upperBound().size(); i != end; ++i) {		for (size_t i = 0, end = op.upperBound().size(); i != end; ++i) {
if (i < tileSizes.size())		if (i < tileSizes.size())
tileSizeConstants.push_back(		tileSizeConstants.push_back(
b.create<ConstantIndexOp>(op.getLoc(), tileSizes[i]));		b.create<ConstantIndexOp>(op.getLoc(), tileSizes[i]));
else		else
// Just pick 1 for the remaining dimensions.		// Just pick 1 for the remaining dimensions.
tileSizeConstants.push_back(b.create<ConstantIndexOp>(op.getLoc(), 1));		tileSizeConstants.push_back(b.create<ConstantIndexOp>(op.getLoc(), 1));
}		}

// Create the outer loop with adjusted steps.		// Create the outer loop with adjusted steps.
SmallVector<Value, 2> newSteps;		SmallVector<Value, 2> newSteps;
newSteps.reserve(op.step().size());		newSteps.reserve(op.step().size());
for (auto step : llvm::zip(op.step(), tileSizeConstants)) {		for (auto step : llvm::zip(op.step(), tileSizeConstants)) {
		herhutUnsubmitted Done Reply Inline Actions Why do you dislike the `llvm::zip`? herhut: Why do you dislike the `llvm::zip`?
newSteps.push_back(		newSteps.push_back(
b.create<MulIOp>(op.getLoc(), std::get<0>(step), std::get<1>(step)));		b.create<MulIOp>(op.getLoc(), std::get<0>(step), std::get<1>(step)));
}		}
auto outerLoop = b.create<ParallelOp>(op.getLoc(), op.lowerBound(),		auto outerLoop = b.create<ParallelOp>(op.getLoc(), op.lowerBound(),
op.upperBound(), newSteps);		op.upperBound(), newSteps);
b.setInsertionPointToStart(outerLoop.getBody());		b.setInsertionPointToStart(outerLoop.getBody());

// Compute min(size, dim - offset) to avoid out-of-bounds accesses.		// Compute min(size, dim - offset) to avoid out-of-bounds accesses.
// FIXME: Instead of using min, we want to replicate the tail. This would give		// FIXME: Instead of using min, we want to replicate the tail. This would give
// the inner loop constant bounds for easy vectorization.		// the inner loop constant bounds for easy vectorization.
auto minMap = AffineMap::get(		auto minMap = AffineMap::get(
/*dimCount=*/3, /*symbolCount=*/0,		/*dimCount=*/3, /*symbolCount=*/0,
{getAffineDimExpr(/*position=*/0, b.getContext()),		{getAffineDimExpr(/*position=*/0, b.getContext()),
getAffineDimExpr(/*position=*/1, b.getContext()) -		getAffineDimExpr(/*position=*/1, b.getContext()) -
getAffineDimExpr(/*position=*/2, b.getContext())},		getAffineDimExpr(/*position=*/2, b.getContext())},
b.getContext());		b.getContext());

// Create the inner loop with adjusted bounds.		// Create the inner loop with adjusted bounds.
SmallVector<Value, 2> newBounds;		SmallVector<Value, 2> newBounds;
newBounds.reserve(op.upperBound().size());		newBounds.reserve(op.upperBound().size());
for (auto bounds : llvm::zip(tileSizeConstants, outerLoop.upperBound(),		for (auto bounds :
outerLoop.getInductionVars())) {		llvm::zip(outerLoop.lowerBound(), outerLoop.upperBound(),
		herhutUnsubmitted Done Reply Inline Actions This optimization makes sense here, as it is hard to recover this out of the generated affine min expression. But the comment could explain this better. Something like: If we statically know the size of the outer loops iteration and it is divisible by the tiling factor, we can use a static bound for the inner loop. Otherwise, we have to dynamically compute the bound for each iteration of the outer loop. herhut: This optimization makes sense here, as it is hard to recover this out of the generated affine…
		outerLoop.step(), outerLoop.getInductionVars())) {
		auto lowerBound =
		dyn_cast_or_null<ConstantIndexOp>(std::get<0>(bounds).getDefiningOp());
		ftynseUnsubmitted Done Reply Inline Actions I could not parse this comment ftynse: I could not parse this comment
		auto upperBound =
		dyn_cast_or_null<ConstantIndexOp>(std::get<1>(bounds).getDefiningOp());
		// Access the step and the tile size via the multiplication operation
		// computing the newStep.
		auto step = dyn_cast_or_null<ConstantIndexOp>(
		herhutUnsubmitted Done Reply Inline Actions Why not zip over `op.step` and `tileSizeConstants` to get these two? That avoids the pattern matching. You can then assign all of them at the beginning of the loop to give them local names. Something like Value lowerBound, upperBound, originalStep, newStep, index; std::tie... herhut: Why not zip over `op.step` and `tileSizeConstants` to get these two? That avoids the pattern…
		std::get<2>(bounds).getDefiningOp()->getOperand(0).getDefiningOp());
		auto tileSize = cast<ConstantIndexOp>(
		std::get<2>(bounds).getDefiningOp()->getOperand(1).getDefiningOp());
		// If we statically know the size of the outer loops iteration and it is
		bondhugulaUnsubmitted Done Reply Inline Actions Please rephrase this: "size of the outer loops iteration" isn't meaningful. Also, you are only considering the case of lb, ub, step being constant, which isn't the same of trip counts being statically known/constant (for eg., %i = %N to %N + 16). bondhugula: Please rephrase this: "size of the outer loops iteration" isn't meaningful. Also, you are only…
		// divisible by the tiling factor, we can use a static bound for the inner
		// loop. Otherwise, we have to dynamically compute the bound for each
		// iteration of the outer loop.
		if (lowerBound && upperBound && step &&
		(upperBound.getValue() - lowerBound.getValue()) %
		(step.getValue() * tileSize.getValue()) ==
		rriddleUnsubmitted Done Reply Inline Actions This is an extremely complex condition, please add braces and/or preferably split the conditions. rriddle: This is an extremely complex condition, please add braces and/or preferably split the…
		0)
		newBounds.push_back(std::get<2>(bounds));
		else
newBounds.push_back(b.create<AffineMinOp>(		newBounds.push_back(b.create<AffineMinOp>(
		rriddleUnsubmitted Done Reply Inline Actions Add braces here, this if is not trivial. rriddle: Add braces here, this if is not trivial.
op.getLoc(), b.getIndexType(), minMap,		op.getLoc(), b.getIndexType(), minMap,
ValueRange{std::get<0>(bounds), std::get<1>(bounds),		ValueRange{std::get<2>(bounds), std::get<1>(bounds),
		herhutUnsubmitted Done Reply Inline Actions You have assigned names to these `std::get` previously by assigning to locals. Why not use them here? herhut: You have assigned names to these `std::get` previously by assigning to locals. Why not use them…
std::get<2>(bounds)}));		std::get<3>(bounds)}));
}		}
auto innerLoop = b.create<ParallelOp>(		auto innerLoop = b.create<ParallelOp>(
op.getLoc(), SmallVector<Value, 2>(newBounds.size(), zero), newBounds,		op.getLoc(), SmallVector<Value, 2>(newBounds.size(), zero), newBounds,
op.step());		op.step());

// Steal the body of the old parallel loop and erase it.		// Steal the body of the old parallel loop and erase it.
innerLoop.region().takeBody(op.region());		innerLoop.region().takeBody(op.region());

// Insert computation for new index vectors and replace uses.		// Insert computation for new index vectors and replace uses.
b.setInsertionPointToStart(innerLoop.getBody());		b.setInsertionPointToStart(innerLoop.getBody());
for (auto ivs :		for (auto ivs :
llvm::zip(innerLoop.getInductionVars(), outerLoop.getInductionVars())) {		llvm::zip(innerLoop.getInductionVars(), outerLoop.getInductionVars())) {
Value inner_index = std::get<0>(ivs);		Value inner_index = std::get<0>(ivs);
AddIOp newIndex =		AddIOp newIndex =
b.create<AddIOp>(op.getLoc(), std::get<0>(ivs), std::get<1>(ivs));		b.create<AddIOp>(op.getLoc(), std::get<0>(ivs), std::get<1>(ivs));
inner_index.replaceAllUsesExcept(		inner_index.replaceAllUsesExcept(
newIndex, SmallPtrSet<Operation *, 1>{newIndex.getOperation()});		newIndex, SmallPtrSet<Operation *, 1>{newIndex.getOperation()});
}		}
		ftynseUnsubmitted Done Reply Inline Actions Maybe we should consider a separate "canonicalization" pass (or an actual canonicalization) that removes single-iteration loops completely. There is one on affine loops. ftynse: Maybe we should consider a separate "canonicalization" pass (or an actual canonicalization)…

op.erase();		op.erase();
		herhutUnsubmitted Done Reply Inline Actions I would rather have this as a separate canonicalization pattern that removes parallel loops with trip-count 1. herhut: I would rather have this as a separate canonicalization pattern that removes parallel loops…
}		}

/// Get a list of most nested parallel loops. Assumes that ParallelOps are only		/// Get a list of most nested parallel loops. Assumes that ParallelOps are only
/// directly nested.		/// directly nested.
static bool getInnermostNestedLoops(Block *block,		static bool getInnermostNestedLoops(Block *block,
SmallVectorImpl<ParallelOp> &loops) {		SmallVectorImpl<ParallelOp> &loops) {
bool hasInnerLoop = false;		bool hasInnerLoop = false;
for (auto parallelOp : block->getOps<ParallelOp>()) {		for (auto parallelOp : block->getOps<ParallelOp>()) {
Show All 13 Lines	struct ParallelLoopTiling
}		}

void runOnFunction() override {		void runOnFunction() override {
SmallVector<ParallelOp, 2> mostNestedParallelOps;		SmallVector<ParallelOp, 2> mostNestedParallelOps;
for (Block &block : getFunction()) {		for (Block &block : getFunction()) {
getInnermostNestedLoops(&block, mostNestedParallelOps);		getInnermostNestedLoops(&block, mostNestedParallelOps);
}		}
for (ParallelOp pLoop : mostNestedParallelOps) {		for (ParallelOp pLoop : mostNestedParallelOps) {
		// FIXME: Add reduction support.
		if (pLoop.getNumReductions() == 0)
tileParallelLoop(pLoop, tileSizes);		tileParallelLoop(pLoop, tileSizes);
}		}
}		}
};		};
} // namespace		} // namespace

std::unique_ptr<Pass>		std::unique_ptr<Pass>
mlir::createParallelLoopTilingPass(ArrayRef<int64_t> tileSizes) {		mlir::createParallelLoopTilingPass(ArrayRef<int64_t> tileSizes) {
return std::make_unique<ParallelLoopTiling>(tileSizes);		return std::make_unique<ParallelLoopTiling>(tileSizes);
}		}

mlir/test/Dialect/SCF/canonicalize.mlir

This file was added.

				// RUN: mlir-opt %s -pass-pipeline='func(canonicalize)' | FileCheck %s

				func @single_iteration(%A: memref<?x?x?xi32>) {
				%c0 = constant 0 : index
				%c1 = constant 1 : index
				%c2 = constant 2 : index
				%c3 = constant 3 : index
				%c6 = constant 6 : index
				%c7 = constant 7 : index
				%c10 = constant 10 : index
				scf.parallel (%i0, %i1, %i2) = (%c0, %c3, %c7) to (%c1, %c6, %c10) step (%c1, %c2, %c3) {
				%c42 = constant 42 : i32
				store %c42, %A[%i0, %i1, %i2] : memref<?x?x?xi32>
				scf.yield
				}
				return
				}

				// CHECK-LABEL: func @single_iteration(
				// CHECK-SAME: [[ARG0:%.*]]: memref<?x?x?xi32>) {
				// CHECK: [[C0:%.*]] = constant 0 : index
				// CHECK: [[C2:%.*]] = constant 2 : index
				// CHECK: [[C3:%.*]] = constant 3 : index
				// CHECK: [[C6:%.*]] = constant 6 : index
				// CHECK: [[C7:%.*]] = constant 7 : index
				// CHECK: [[C42:%.*]] = constant 42 : i32
				// CHECK: scf.parallel ([[V0:%.*]]) = ([[C3]]) to ([[C6]]) step ([[C2]]) {
				// CHECK: store [[C42]], [[ARG0]]{{\[}}[[C0]], [[V0]], [[C7]]] : memref<?x?x?xi32>
				// CHECK: scf.yield
				// CHECK: }
				// CHECK: return

mlir/test/Dialect/SCF/parallel-loop-tiling.mlir

Show All 9 Lines	scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3) step (%arg4, %arg5) {
%sum_elem = addf %B_elem, %C_elem : f32		%sum_elem = addf %B_elem, %C_elem : f32
store %sum_elem, %result[%i0, %i1] : memref<?x?xf32>		store %sum_elem, %result[%i0, %i1] : memref<?x?xf32>
}		}
return		return
}		}

// CHECK: #map0 = affine_map<(d0, d1, d2) -> (d0, d1 - d2)>		// CHECK: #map0 = affine_map<(d0, d1, d2) -> (d0, d1 - d2)>
// CHECK-LABEL: func @parallel_loop(		// CHECK-LABEL: func @parallel_loop(
// CHECK-SAME: [[VAL_0:%.]]: index, [[VAL_1:%.]]: index, [[VAL_2:%.]]: index, [[VAL_3:%.]]: index, [[VAL_4:%.]]: index, [[VAL_5:%.]]: index, [[VAL_6:%.]]: memref<?x?xf32>, [[VAL_7:%.]]: memref<?x?xf32>, [[VAL_8:%.]]: memref<?x?xf32>, [[VAL_9:%.]]: memref<?x?xf32>) {		// CHECK-SAME: [[ARG1:%.]]: index, [[ARG2:%.]]: index, [[ARG3:%.]]: index, [[ARG4:%.]]: index, [[ARG5:%.]]: index, [[ARG6:%.]]: index, [[ARG7:%.]]: memref<?x?xf32>, [[ARG8:%.]]: memref<?x?xf32>, [[ARG9:%.]]: memref<?x?xf32>, [[ARG10:%.]]: memref<?x?xf32>) {
// CHECK: [[VAL_10:%.*]] = constant 0 : index		// CHECK: [[C0:%.*]] = constant 0 : index
// CHECK: [[VAL_11:%.*]] = constant 1 : index		// CHECK: [[C1:%.*]] = constant 1 : index
// CHECK: [[VAL_12:%.*]] = constant 4 : index		// CHECK: [[C4:%.*]] = constant 4 : index
// CHECK: [[VAL_13:%.*]] = muli [[VAL_4]], [[VAL_11]] : index		// CHECK: [[V1:%.*]] = muli [[ARG5]], [[C1]] : index
// CHECK: [[VAL_14:%.*]] = muli [[VAL_5]], [[VAL_12]] : index		// CHECK: [[V2:%.*]] = muli [[ARG6]], [[C4]] : index
// CHECK: scf.parallel ([[VAL_15:%.]], [[VAL_16:%.]]) = ([[VAL_0]], [[VAL_1]]) to ([[VAL_2]], [[VAL_3]]) step ([[VAL_13]], [[VAL_14]]) {		// CHECK: scf.parallel ([[V3:%.]], [[V4:%.]]) = ([[ARG1]], [[ARG2]]) to ([[ARG3]], [[ARG4]]) step ([[V1]], [[V2]]) {
// CHECK: [[VAL_17:%.*]] = affine.min #map0([[VAL_11]], [[VAL_2]], [[VAL_15]])		// CHECK: [[V5:%.*]] = affine.min #map0([[V1]], [[ARG3]], [[V3]])
// CHECK: [[VAL_18:%.*]] = affine.min #map0([[VAL_12]], [[VAL_3]], [[VAL_16]])		// CHECK: [[V6:%.*]] = affine.min #map0([[V2]], [[ARG4]], [[V4]])
// CHECK: scf.parallel ([[VAL_19:%.]], [[VAL_20:%.]]) = ([[VAL_10]], [[VAL_10]]) to ([[VAL_17]], [[VAL_18]]) step ([[VAL_4]], [[VAL_5]]) {		// CHECK: scf.parallel ([[V7:%.]], [[V8:%.]]) = ([[C0]], [[C0]]) to ([[V5]], [[V6]]) step ([[ARG5]], [[ARG6]]) {
// CHECK: [[VAL_21:%.*]] = addi [[VAL_19]], [[VAL_15]] : index		// CHECK: [[V9:%.*]] = addi [[V7]], [[V3]] : index
// CHECK: [[VAL_22:%.*]] = addi [[VAL_20]], [[VAL_16]] : index		// CHECK: [[V10:%.*]] = addi [[V8]], [[V4]] : index
// CHECK: [[VAL_23:%.*]] = load [[VAL_7]]{{\[}}[[VAL_21]], [[VAL_22]]] : memref<?x?xf32>		// CHECK: [[V11:%.*]] = load [[ARG8]]{{\[}}[[V9]], [[V10]]] : memref<?x?xf32>
// CHECK: [[VAL_24:%.*]] = load [[VAL_8]]{{\[}}[[VAL_21]], [[VAL_22]]] : memref<?x?xf32>		// CHECK: [[V12:%.*]] = load [[ARG9]]{{\[}}[[V9]], [[V10]]] : memref<?x?xf32>
// CHECK: [[VAL_25:%.*]] = addf [[VAL_23]], [[VAL_24]] : f32		// CHECK: [[V13:%.*]] = addf [[V11]], [[V12]] : f32
// CHECK: store [[VAL_25]], [[VAL_9]]{{\[}}[[VAL_21]], [[VAL_22]]] : memref<?x?xf32>		// CHECK: store [[V13]], [[ARG10]]{{\[}}[[V9]], [[V10]]] : memref<?x?xf32>
		// CHECK: }
		// CHECK: }
		// CHECK: return

		// -----

		func @static_loop_with_step() {
		%c0 = constant 0 : index
		%c3 = constant 3 : index
		%c24 = constant 24 : index
		scf.parallel (%i0, %i1) = (%c0, %c0) to (%c24, %c24) step (%c3, %c3) {
		}
		return
		}

		// CHECK-LABEL: func @static_loop_with_step() {
		// CHECK: [[C0:%.*]] = constant 0 : index
		// CHECK: [[C3:%.*]] = constant 3 : index
		// CHECK: [[C24:%.*]] = constant 24 : index
		// CHECK: [[C0_1:%.*]] = constant 0 : index
		bondhugulaUnsubmitted Done Reply Inline Actions Please avoid using VAL_<running_number> - instead C0, C3, C0_1, etc. for readability. A substitution at this point. bondhugula: Please avoid using VAL_<running_number> - instead C0, C3, C0_1, etc. for readability. A…
		// CHECK: [[C1:%.*]] = constant 1 : index
		// CHECK: [[C4:%.*]] = constant 4 : index
		// CHECK: [[V1:%.*]] = muli [[C3]], [[C1]] : index
		// CHECK: [[V2:%.*]] = muli [[C3]], [[C4]] : index
		// CHECK: scf.parallel ([[V3:%.*]], [[V4:%.*]]) = ([[C0]], [[C0]]) to ([[C24]], [[C24]]) step ([[V1]], [[V2]]) {
		// CHECK: scf.parallel ([[V5:%.*]], [[V6:%.*]]) = ([[C0_1]], [[C0_1]]) to ([[V1]], [[V2]]) step ([[C3]], [[C3]]) {
		// CHECK: = addi [[V5]], [[V3]] : index
		// CHECK: = addi [[V6]], [[V4]] : index
// CHECK: }		// CHECK: }
// CHECK: }		// CHECK: }
// CHECK: return		// CHECK: return

// -----		// -----

func @tile_nested_innermost() {		func @tile_nested_innermost() {
%c2 = constant 2 : index		%c2 = constant 2 : index
%c0 = constant 0 : index		%c0 = constant 0 : index
%c1 = constant 1 : index		%c1 = constant 1 : index
scf.parallel (%i, %j) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) {		scf.parallel (%i, %j) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) {
scf.parallel (%k, %l) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) {		scf.parallel (%k, %l) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) {
}		}
}		}
scf.parallel (%i, %j) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) {		scf.parallel (%i, %j) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) {
}		}
return		return
}		}

// CHECK-LABEL: func @tile_nested_innermost() {		// CHECK-LABEL: func @tile_nested_innermost() {
// CHECK: [[VAL_24:%.*]] = constant 2 : index		// CHECK: [[C2:%.*]] = constant 2 : index
// CHECK: [[VAL_25:%.*]] = constant 0 : index		// CHECK: [[C0:%.*]] = constant 0 : index
// CHECK: [[VAL_26:%.*]] = constant 1 : index		// CHECK: [[C1:%.*]] = constant 1 : index
// CHECK: scf.parallel ([[VAL_27:%.]], [[VAL_28:%.]]) = ([[VAL_25]], [[VAL_25]]) to ([[VAL_24]], [[VAL_24]]) step ([[VAL_26]], [[VAL_26]]) {		// CHECK: scf.parallel ([[V1:%.]], [[V2:%.]]) = ([[C0]], [[C0]]) to ([[C2]], [[C2]]) step ([[C1]], [[C1]]) {
// CHECK: [[VAL_29:%.*]] = constant 0 : index		// CHECK: [[C0_1:%.*]] = constant 0 : index
// CHECK: [[VAL_30:%.*]] = constant 1 : index		// CHECK: [[C1_1:%.*]] = constant 1 : index
// CHECK: [[VAL_31:%.*]] = constant 4 : index		// CHECK: [[C4:%.*]] = constant 4 : index
// CHECK: [[VAL_32:%.*]] = muli [[VAL_26]], [[VAL_30]] : index		// CHECK: [[V3:%.*]] = muli [[C1]], [[C1_1]] : index
// CHECK: [[VAL_33:%.*]] = muli [[VAL_26]], [[VAL_31]] : index		// CHECK: [[V4:%.*]] = muli [[C1]], [[C4]] : index
// CHECK: scf.parallel ([[VAL_34:%.]], [[VAL_35:%.]]) = ([[VAL_25]], [[VAL_25]]) to ([[VAL_24]], [[VAL_24]]) step ([[VAL_32]], [[VAL_33]]) {		// CHECK: scf.parallel ([[V5:%.]], [[V6:%.]]) = ([[C0]], [[C0]]) to ([[C2]], [[C2]]) step ([[V3]], [[V4]]) {
// CHECK: [[VAL_36:%.*]] = affine.min #map0([[VAL_30]], [[VAL_24]], [[VAL_34]])		// CHECK: [[V7:%.*]] = affine.min #map0([[V4]], [[C2]], [[V6]])
// CHECK: [[VAL_37:%.*]] = affine.min #map0([[VAL_31]], [[VAL_24]], [[VAL_35]])		// CHECK: scf.parallel ([[V8:%.]], [[V9:%.]]) = ([[C0_1]], [[C0_1]]) to ([[V3]], [[V7]]) step ([[C1]], [[C1]]) {
// CHECK: scf.parallel ([[VAL_38:%.]], [[VAL_39:%.]]) = ([[VAL_29]], [[VAL_29]]) to ([[VAL_36]], [[VAL_37]]) step ([[VAL_26]], [[VAL_26]]) {		// CHECK: = addi [[V8]], [[V5]] : index
		// CHECK: = addi [[V9]], [[V6]] : index
// CHECK: }		// CHECK: }
// CHECK: }		// CHECK: }
// CHECK: }		// CHECK: }
// CHECK: [[VAL_40:%.*]] = constant 0 : index		// CHECK: [[C0_2:%.*]] = constant 0 : index
// CHECK: [[VAL_41:%.*]] = constant 1 : index		// CHECK: [[C1_2:%.*]] = constant 1 : index
// CHECK: [[VAL_42:%.*]] = constant 4 : index		// CHECK: [[C4_1:%.*]] = constant 4 : index
// CHECK: [[VAL_43:%.*]] = muli [[VAL_26]], [[VAL_41]] : index		// CHECK: [[V10:%.*]] = muli [[C1]], [[C1_2]] : index
// CHECK: [[VAL_44:%.*]] = muli [[VAL_26]], [[VAL_42]] : index		// CHECK: [[V11:%.*]] = muli [[C1]], [[C4_1]] : index
// CHECK: scf.parallel ([[VAL_45:%.]], [[VAL_46:%.]]) = ([[VAL_25]], [[VAL_25]]) to ([[VAL_24]], [[VAL_24]]) step ([[VAL_43]], [[VAL_44]]) {		// CHECK: scf.parallel ([[V12:%.]], [[V13:%.]]) = ([[C0]], [[C0]]) to ([[C2]], [[C2]]) step ([[V10]], [[V11]]) {
// CHECK: [[VAL_47:%.*]] = affine.min #map0([[VAL_41]], [[VAL_24]], [[VAL_45]])		// CHECK: [[V14:%.*]] = affine.min #map0([[V11]], [[C2]], [[V13]])
// CHECK: [[VAL_48:%.*]] = affine.min #map0([[VAL_42]], [[VAL_24]], [[VAL_46]])		// CHECK: scf.parallel ([[V15:%.]], [[V16:%.]]) = ([[C0_2]], [[C0_2]]) to ([[V10]], [[V14]]) step ([[C1]], [[C1]]) {
// CHECK: scf.parallel ([[VAL_49:%.]], [[VAL_50:%.]]) = ([[VAL_40]], [[VAL_40]]) to ([[VAL_47]], [[VAL_48]]) step ([[VAL_26]], [[VAL_26]]) {		// CHECK: = addi [[V15]], [[V12]] : index
		// CHECK: = addi [[V16]], [[V13]] : index
// CHECK: }		// CHECK: }
// CHECK: }		// CHECK: }
// CHECK: return		// CHECK: return
// CHECK: }		// CHECK: }

mlir/test/Transforms/parallel-loop-collapsing.mlir

Show All 10 Lines	func @parallel_many_dims() {
%c7 = constant 7 : index		%c7 = constant 7 : index
%c8 = constant 8 : index		%c8 = constant 8 : index
%c9 = constant 9 : index		%c9 = constant 9 : index
%c10 = constant 10 : index		%c10 = constant 10 : index
%c11 = constant 11 : index		%c11 = constant 11 : index
%c12 = constant 12 : index		%c12 = constant 12 : index
%c13 = constant 13 : index		%c13 = constant 13 : index
%c14 = constant 14 : index		%c14 = constant 14 : index
		%c15 = constant 15 : index
		%c26 = constant 26 : index

scf.parallel (%i0, %i1, %i2, %i3, %i4) = (%c0, %c3, %c6, %c9, %c12) to (%c2, %c5, %c8, %c11, %c14)		scf.parallel (%i0, %i1, %i2, %i3, %i4) = (%c0, %c3, %c6, %c9, %c12) to (%c2, %c5, %c8, %c11, %c14)
step (%c1, %c4, %c7, %c10, %c13) {		step (%c1, %c4, %c7, %c10, %c13) {
%result = "magic.op"(%i0, %i1, %i2, %i3, %i4): (index, index, index, index, index) -> index		%result = "magic.op"(%i0, %i1, %i2, %i3, %i4): (index, index, index, index, index) -> index
}		}
return		return
}		}

// CHECK-LABEL: func @parallel_many_dims() {		// CHECK-LABEL: func @parallel_many_dims() {
// CHECK: [[C6:%.*]] = constant 6 : index		// CHECK: [[C6:%.*]] = constant 6 : index
// CHECK: [[C7:%.*]] = constant 7 : index
// CHECK: [[C9:%.*]] = constant 9 : index		// CHECK: [[C9:%.*]] = constant 9 : index
// CHECK: [[C10:%.*]] = constant 10 : index		// CHECK: [[C10:%.*]] = constant 10 : index
// CHECK: [[C12:%.*]] = constant 12 : index		// CHECK: [[C12:%.*]] = constant 12 : index
// CHECK: [[C13:%.*]] = constant 13 : index
// CHECK: [[C3:%.*]] = constant 3 : index
// CHECK: [[C0:%.*]] = constant 0 : index		// CHECK: [[C0:%.*]] = constant 0 : index
// CHECK: [[C1:%.*]] = constant 1 : index		// CHECK: [[C1:%.*]] = constant 1 : index
// CHECK: [[C2:%.*]] = constant 2 : index		// CHECK: [[C2:%.*]] = constant 2 : index
// CHECK: scf.parallel ([[NEW_I0:%.]], [[NEW_I1:%.]], [[NEW_I2:%.*]]) = ([[C0]], [[C0]], [[C0]]) to ([[C2]], [[C1]], [[C1]]) step ([[C1]], [[C1]], [[C1]]) {		// CHECK: [[C3:%.*]] = constant 3 : index
		// CHECK: scf.parallel ([[NEW_I0:%.*]]) = ([[C0]]) to ([[C2]]) step ([[C1]]) {
// CHECK: [[I0:%.*]] = remi_signed [[NEW_I0]], [[C2]] : index		// CHECK: [[I0:%.*]] = remi_signed [[NEW_I0]], [[C2]] : index
// CHECK: [[VAL_16:%.*]] = muli [[NEW_I1]], [[C13]] : index		// CHECK: [[V18:%.*]] = muli [[NEW_I0]], [[C10]] : index
// CHECK: [[I4:%.*]] = addi [[VAL_16]], [[C12]] : index		// CHECK: [[I3:%.*]] = addi [[V18]], [[C9]] : index
// CHECK: [[VAL_18:%.*]] = muli [[NEW_I0]], [[C10]] : index		// CHECK: "magic.op"([[I0]], [[C3]], [[C6]], [[I3]], [[C12]]) : (index, index, index, index, index) -> index
// CHECK: [[I3:%.*]] = addi [[VAL_18]], [[C9]] : index
// CHECK: [[VAL_20:%.*]] = muli [[NEW_I2]], [[C7]] : index
// CHECK: [[I2:%.*]] = addi [[VAL_20]], [[C6]] : index
// CHECK: "magic.op"([[I0]], [[C3]], [[I2]], [[I3]], [[I4]]) : (index, index, index, index, index) -> index
// CHECK: scf.yield		// CHECK: scf.yield
// CHECK-NEXT: }		// CHECK-NEXT: }
// CHECK-NEXT: return		// CHECK-NEXT: return

This is an archive of the discontinued LLVM Phabricator instance.

[mlir] parallel loop tiling optimization for loops with static bounds
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 272002

mlir/include/mlir/Dialect/SCF/SCFOps.td

mlir/lib/Dialect/SCF/SCF.cpp

mlir/lib/Dialect/SCF/Transforms/ParallelLoopTiling.cpp

mlir/test/Dialect/SCF/canonicalize.mlir

mlir/test/Dialect/SCF/parallel-loop-tiling.mlir

mlir/test/Transforms/parallel-loop-collapsing.mlir

This is an archive of the discontinued LLVM Phabricator instance.

[mlir] parallel loop tiling optimization for loops with static boundsClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 272002

mlir/include/mlir/Dialect/SCF/SCFOps.td

mlir/lib/Dialect/SCF/SCF.cpp

mlir/lib/Dialect/SCF/Transforms/ParallelLoopTiling.cpp

mlir/test/Dialect/SCF/canonicalize.mlir

mlir/test/Dialect/SCF/parallel-loop-tiling.mlir

mlir/test/Transforms/parallel-loop-collapsing.mlir

[mlir] parallel loop tiling optimization for loops with static bounds
ClosedPublic