Diff 300209

mlir/include/mlir/Transforms/Passes.h

	Show All 34 Lines
	/// Creates a pass that moves allocations upwards to reduce the number of			/// Creates a pass that moves allocations upwards to reduce the number of
	/// required copies that are inserted during the BufferDeallocation pass.			/// required copies that are inserted during the BufferDeallocation pass.
	std::unique_ptr<Pass> createBufferHoistingPass();			std::unique_ptr<Pass> createBufferHoistingPass();

	/// Creates a pass that moves allocations upwards out of loops. This avoids			/// Creates a pass that moves allocations upwards out of loops. This avoids
	/// reallocations inside of loops.			/// reallocations inside of loops.
	std::unique_ptr<Pass> createBufferLoopHoistingPass();			std::unique_ptr<Pass> createBufferLoopHoistingPass();

				/// Creates a pass that promotes heap-based allocations to stack-based ones.
				std::unique_ptr<Pass>
				createPromoteBuffersToStackPass(unsigned maxAllocSizeInBytes = 1024);

	/// Creates an instance of the Canonicalizer pass.			/// Creates an instance of the Canonicalizer pass.
	std::unique_ptr<Pass> createCanonicalizerPass();			std::unique_ptr<Pass> createCanonicalizerPass();

	/// Create a pass that removes unnecessary Copy operations.			/// Create a pass that removes unnecessary Copy operations.
	std::unique_ptr<Pass> createCopyRemovalPass();			std::unique_ptr<Pass> createCopyRemovalPass();

	/// Creates a pass to perform common sub expression elimination.			/// Creates a pass to perform common sub expression elimination.
	std::unique_ptr<Pass> createCSEPass();			std::unique_ptr<Pass> createCSEPass();
	▲ Show 20 Lines • Show All 68 Lines • Show Last 20 Lines

mlir/include/mlir/Transforms/Passes.td

Show First 20 Lines • Show All 195 Lines • ▼ Show 20 Lines	let summary = "Optimizes placement of allocation operations by moving them "
"out of loop nests";		"out of loop nests";
let description = [{		let description = [{
This pass implements an approach to aggressively move allocations upwards		This pass implements an approach to aggressively move allocations upwards
out of loop nests. It does not move allocations into common dominators.		out of loop nests. It does not move allocations into common dominators.
}];		}];
let constructor = "mlir::createBufferLoopHoistingPass()";		let constructor = "mlir::createBufferLoopHoistingPass()";
}		}

		def PromoteBuffersToStack : FunctionPass<"promote-buffers-to-stack"> {
		let summary = "Promotes heap-based allocations to automatically managed "
		"stack-based allocations";
		let description = [{
		This pass implements a simple algorithm to convert heap-based memory
		allocations to stack-based ones. It uses a built-in heuristic to decide
		whether it makes sense to convert an allocation.
		}];
		let constructor = "mlir::createPromoteBuffersToStackPass()";
		let options = [
		Option<"maxAllocSizeInBytes", "max-alloc-size-in-bytes", "unsigned",
		/*default=*/"1024",
		"Define the maximum size in bytes to promote allocations to stack.">,
		];
		}

def Canonicalizer : Pass<"canonicalize"> {		def Canonicalizer : Pass<"canonicalize"> {
let summary = "Canonicalize operations";		let summary = "Canonicalize operations";
let description = [{		let description = [{
This pass performs various types of canonicalizations over a set of		This pass performs various types of canonicalizations over a set of
operations. See [Operation Canonicalization](Canonicalization.md) for more		operations. See [Operation Canonicalization](Canonicalization.md) for more
details.		details.
}];		}];
let constructor = "mlir::createCanonicalizerPass()";		let constructor = "mlir::createCanonicalizerPass()";
▲ Show 20 Lines • Show All 349 Lines • Show Last 20 Lines

mlir/lib/Transforms/BufferOptimizations.cpp

//===- BufferOptimizations.cpp - pre-pass optimizations for bufferization -===//		//===- BufferOptimizations.cpp - pre-pass optimizations for bufferization -===//
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
//		//
// This file implements logic for two optimization passes. These passes try to		// This file implements logic for three optimization passes. The first two
// hoist alloc nodes to reduce the number of allocations and copies during		// passes try to move alloc nodes out of blocks to reduce the number of
// buffer deallocation.		// allocations and copies during buffer deallocation. The third pass tries to
		// convert heap-based allocations to stack-based allocations, if possible.

#include "PassDetail.h"		#include "PassDetail.h"
#include "mlir/IR/Operation.h"		#include "mlir/IR/Operation.h"
#include "mlir/Interfaces/LoopLikeInterface.h"		#include "mlir/Interfaces/LoopLikeInterface.h"
#include "mlir/Pass/Pass.h"		#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/Bufferize.h"		#include "mlir/Transforms/Bufferize.h"
#include "mlir/Transforms/Passes.h"		#include "mlir/Transforms/Passes.h"

using namespace mlir;		using namespace mlir;

		/// Tells whether `op` implements one of the known high-level, region-based
		/// control-flow interfaces (loop-like or region-branch).
		static bool isKnownControlFlowInterface(Operation *op) {
		  const bool isLoopLike = isa<LoopLikeOpInterface>(op);
		  const bool isRegionBranch = isa<RegionBranchOpInterface>(op);
		  return isLoopLike || isRegionBranch;
		}

		/// Returns true if `alloc` has a statically shaped type whose total size is
		/// strictly smaller than `maximumSizeInBytes`. The transformation is only
		/// applied to small buffers since large buffers could exceed the stack
		/// space. Values whose type is not a `ShapedType`, or whose shape is not
		/// static, are never considered small.
		static bool isSmallAlloc(Value alloc, unsigned maximumSizeInBytes) {
		  auto type = alloc.getType().dyn_cast<ShapedType>();
		  if (!type || !type.hasStaticShape())
		    return false;
		  // Widen to 64 bits before converting bytes to bits: evaluated in
		  // `unsigned` arithmetic the product wraps around for large limits (from
		  // 512 MiB on with a 32-bit `unsigned`) and would silently turn a large
		  // limit into a small one.
		  const int64_t maximumSizeInBits =
		      static_cast<int64_t>(maximumSizeInBytes) * 8;
		  return type.getSizeInBits() < maximumSizeInBits;
		}

		/// Checks whether the given aliases leave the allocation scope.
		static bool
		leavesAllocationScope(Region *parentRegion,
		const BufferPlacementAliasAnalysis::ValueSetT &aliases) {
		for (Value alias : aliases) {
		for (auto *use : alias.getUsers()) {
		// If there is at least one alias that leaves the parent region, we know
		herhutUnsubmitted Done Reply Inline Actions Thinking about this more, I believe the idea here is that if the value escapes, then there must be at least one alias at the level of the allocation scope, so this will return true. Even if it ignores aliases that are already outside of the allocation scope. Is that correct? Maybe add a comment? herhut: Thinking about this more, I believe the idea here is that if the value escapes, then there must…
		// that this alias escapes the whole region and hence the associated
		// allocation leaves allocation scope.
		if (use->hasTrait<OpTrait::ReturnLike>() &&
		use->getParentRegion() == parentRegion)
		return true;
		}
		}
		return false;
		herhutUnsubmitted Done Reply Inline Actions Please update the comment to reflect that it only returns an allocation scope if a suitable one can be found, i.e., one that is not outside potentially looping control flow. herhut: Please update the comment to reflect that it only returns an allocation scope if a suitable one…
		}

		/// Checks, if an automated allocation scope for a given alloc value exists.
		static bool
		hasAllocationScope(Value alloc,
		const BufferPlacementAliasAnalysis &aliasAnalysis) {
		Region *region = alloc.getParentRegion();
		do {
		if (Operation *parentOp = region->getParentOp()) {
		herhutUnsubmitted Done Reply Inline Actions This is not sufficient. Rather, if we do not know the ops control flow (does not implement `RegionBranchOpInterface`) or if it can have loop-like behavior (the `RegionBranchOpInterface` has a cycle in the region dependencies), we cannot do this. If it has non-looping behavior (like `scf.if`) it is ok to transform into an `alloca` but the `alloca` should not be hoisted out of the conditional. herhut: This is not sufficient. Rather, if we do not know the ops control flow (does not implement…
		// Check if the operation is an automatic allocation scope and whether an
		// alias leaves the scope. This means, an allocation yields out of
		// this scope and can not be transformed in a stack-based allocation.
		herhutUnsubmitted Done Reply Inline Actions This was an oversight in the previous review. I think you generally want a `mightBeLoop` that also returns true if the operation does _not implement RegionBranchOpInterface_. We have to be conservative when doing these optimizations (this one and the other two hoisting optimizations, as well). Adding a test for this would be great. herhut: This was an oversight in the previous review. I think you generally want a `mightBeLoop` that…
		if (parentOp->hasTrait<OpTrait::AutomaticAllocationScope>() &&
		!leavesAllocationScope(region, aliasAnalysis.resolve(alloc)))
		return true;
		// Check if the operation is a known control flow interface and break the
		// loop to avoid transformation in loops. Furthermore skip transformation
		// if the operation does not implement a RegionBranchOpInterface.
		if (BufferPlacementTransformationBase::isLoop(parentOp) ||
		herhutUnsubmitted Done Reply Inline Actions You could use the `isKnownControlFlowInterface` here, too. Then it is consistent with the workings of the general pass mechanics. Up to you. herhut: You could use the `isKnownControlFlowInterface` here, too. Then it is consistent with the…
		!isKnownControlFlowInterface(parentOp))
		break;
		}
		} while ((region = region->getParentRegion()));
		return false;
		}

namespace {		namespace {

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// BufferAllocationHoisting		// BufferAllocationHoisting
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

/// A base implementation compatible with the `BufferAllocationHoisting` class.		/// A base implementation compatible with the `BufferAllocationHoisting` class.
struct BufferAllocationHoistingStateBase {		struct BufferAllocationHoistingStateBase {
/// A pointer to the current dominance info.		/// A pointer to the current dominance info.
DominanceInfo *dominators;		DominanceInfo *dominators;

/// The current allocation value.		/// The current allocation value.
Value allocValue;		Value allocValue;
		herhutUnsubmitted Done Reply Inline Actions Could this just be a bunch of static helper functions? What is the benefit to put this into a class? herhut: Could this just be a bunch of static helper functions? What is the benefit to put this into a…

/// The current placement block (if any).		/// The current placement block (if any).
Block *placementBlock;		Block *placementBlock;
		herhutUnsubmitted Done Reply Inline Actions Avoid `auto` if it does not save much typing. herhut: Avoid `auto` if it does not save much typing.

/// Initializes the state base.		/// Initializes the state base.
BufferAllocationHoistingStateBase(DominanceInfo *dominators, Value allocValue,		BufferAllocationHoistingStateBase(DominanceInfo *dominators, Value allocValue,
Block *placementBlock)		Block *placementBlock)
: dominators(dominators), allocValue(allocValue),		: dominators(dominators), allocValue(allocValue),
placementBlock(placementBlock) {}		placementBlock(placementBlock) {}
};		};

/// Implements the actual hoisting logic for allocation nodes.		/// Implements the actual hoisting logic for allocation nodes.
template <typename StateT>		template <typename StateT>
class BufferAllocationHoisting : public BufferPlacementTransformationBase {		class BufferAllocationHoisting : public BufferPlacementTransformationBase {
private:
/// Returns true if the given operation implements a known high-level region-
/// based control-flow interface.
static bool isKnownControlFlowInterface(Operation *op) {
return isa<LoopLikeOpInterface, RegionBranchOpInterface>(op);
}

public:		public:
BufferAllocationHoisting(Operation *op)		BufferAllocationHoisting(Operation *op)
: BufferPlacementTransformationBase(op), dominators(op),		: BufferPlacementTransformationBase(op), dominators(op),
postDominators(op) {}		postDominators(op) {}

/// Moves allocations upwards.		/// Moves allocations upwards.
void hoist() {		void hoist() {
for (BufferPlacementAllocs::AllocEntry &entry : allocs) {		for (BufferPlacementAllocs::AllocEntry &entry : allocs) {
Value allocValue = std::get<0>(entry);		Value allocValue = std::get<0>(entry);
Operation *definingOp = allocValue.getDefiningOp();		Operation *definingOp = allocValue.getDefiningOp();
assert(definingOp && "No defining op");		assert(definingOp && "No defining op");
auto operands = definingOp->getOperands();		auto operands = definingOp->getOperands();
		herhutUnsubmitted Done Reply Inline Actions This should be a pass option. herhut: This should be a pass option.
auto resultAliases = aliases.resolve(allocValue);		auto resultAliases = aliases.resolve(allocValue);
// Determine the common dominator block of all aliases.		// Determine the common dominator block of all aliases.
Block *dominatorBlock =		Block *dominatorBlock =
findCommonDominator(allocValue, resultAliases, dominators);		findCommonDominator(allocValue, resultAliases, dominators);
// Init the initial hoisting state.		// Init the initial hoisting state.
StateT state(&dominators, allocValue, allocValue.getParentBlock());		StateT state(&dominators, allocValue, allocValue.getParentBlock());
// Check for additional allocation dependencies to compute an upper bound		// Check for additional allocation dependencies to compute an upper bound
// for hoisting.		// for hoisting.
Block *dependencyBlock = nullptr;		Block *dependencyBlock = nullptr;
		herhutUnsubmitted Done Reply Inline Actions This is sufficient if the allocation scope is the top-level scope. Otherwise, the use could also be in some further up parent of the scope. You need to query the ancestor relationship here. herhut: This is sufficient if the allocation scope is the top-level scope. Otherwise, the use could…
if (!operands.empty()) {		if (!operands.empty()) {
// If this node has dependencies, check all dependent nodes with respect		// If this node has dependencies, check all dependent nodes with respect
// to a common post dominator. This ensures that all dependency values		// to a common post dominator. This ensures that all dependency values
// have been computed before allocating the buffer.		// have been computed before allocating the buffer.
ValueSetT dependencies(std::next(operands.begin()), operands.end());		ValueSetT dependencies(std::next(operands.begin()), operands.end());
dependencyBlock = findCommonDominator(*operands.begin(), dependencies,		dependencyBlock = findCommonDominator(*operands.begin(), dependencies,
postDominators);		postDominators);
}		}
▲ Show 20 Lines • Show All 131 Lines • ▼ Show 20 Lines	struct BufferAllocationLoopHoistingState : BufferAllocationHoistingStateBase {
/// operations out of loops only.		/// operations out of loops only.
void recordMoveToDominator(Block *block) {}		void recordMoveToDominator(Block *block) {}

/// Sets the current placement block to the given block.		/// Sets the current placement block to the given block.
void recordMoveToParent(Block *block) { placementBlock = block; }		void recordMoveToParent(Block *block) { placementBlock = block; }
};		};

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
		// BufferPlacementPromotion
		//===----------------------------------------------------------------------===//

		/// Promotes heap-based allocations to stack-based allocations (if possible).
		class BufferPlacementPromotion : BufferPlacementTransformationBase {
		public:
		BufferPlacementPromotion(Operation *op)
		: BufferPlacementTransformationBase(op) {}

		/// Promote buffers to stack-based allocations.
		void promote(unsigned maximumSize) {
		for (BufferPlacementAllocs::AllocEntry &entry : allocs) {
		Value alloc = std::get<0>(entry);
		// Checking several requirements to transform an AllocOp into an AllocaOp.
		// The transformation is done if the allocation is limited to a given
		herhutUnsubmitted Done Reply Inline Actions It is safer to query the result type for whether it is static. One could have an alloc operation that returns dynamic memrefs even without operands. herhut: It is safer to query the result type for whether it is static. One could have an alloc…
		// size. Furthermore, a deallocation must not be defined for this
		// allocation entry and a parent allocation scope must exist.
		if (!isSmallAlloc(alloc, maximumSize) || std::get<1>(entry) ||
		!hasAllocationScope(alloc, aliases))
		continue;

		Operation *startOperation = BufferPlacementAllocs::getStartOperation(
		alloc, alloc.getParentBlock(), liveness);
		// Build a new alloca that is associated with its parent
		// `AutomaticAllocationScope` determined during the initialization phase.
		OpBuilder builder(startOperation);
		auto alloca = builder.create<AllocaOp>(
		alloc.getLoc(), alloc.getType().cast<MemRefType>());

		// Replace the original alloc by a newly created alloca.
		Operation *allocOp = alloc.getDefiningOp();
		allocOp->replaceAllUsesWith(alloca.getOperation());
		allocOp->erase();
		}
		}
		};

		//===----------------------------------------------------------------------===//
// BufferOptimizationPasses		// BufferOptimizationPasses
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

/// The buffer hoisting pass that hoists allocation nodes into dominating		/// The buffer hoisting pass that hoists allocation nodes into dominating
/// blocks.		/// blocks.
struct BufferHoistingPass : BufferHoistingBase<BufferHoistingPass> {		struct BufferHoistingPass : BufferHoistingBase<BufferHoistingPass> {

void runOnFunction() override {		void runOnFunction() override {
Show All 10 Lines	struct BufferLoopHoistingPass : BufferLoopHoistingBase<BufferLoopHoistingPass> {
void runOnFunction() override {		void runOnFunction() override {
// Hoist all allocations out of loops.		// Hoist all allocations out of loops.
BufferAllocationHoisting<BufferAllocationLoopHoistingState> optimizer(		BufferAllocationHoisting<BufferAllocationLoopHoistingState> optimizer(
getFunction());		getFunction());
optimizer.hoist();		optimizer.hoist();
}		}
};		};

		/// The promote buffer to stack pass that tries to convert alloc nodes into
		/// alloca nodes.
		///
		/// The size limit is kept in the `maxAllocSizeInBytes` pass option declared
		/// for this pass in Passes.td rather than in a separate member. That way a
		/// value given on the command line (`max-alloc-size-in-bytes=...`) is
		/// honored instead of being shadowed by the constructor argument.
		struct PromoteBuffersToStackPass
		    : PromoteBuffersToStackBase<PromoteBuffersToStackPass> {

		  /// Creates the pass and seeds the `maxAllocSizeInBytes` option with the
		  /// given limit; parsed pass options may overwrite it later.
		  PromoteBuffersToStackPass(unsigned maxAllocSizeInBytes) {
		    // `this->` selects the option member of the generated base class, which
		    // the constructor parameter of the same name would otherwise hide.
		    this->maxAllocSizeInBytes = maxAllocSizeInBytes;
		  }

		  void runOnFunction() override {
		    // Move all allocation nodes and convert candidates into allocas.
		    BufferPlacementPromotion optimizer(getFunction());
		    optimizer.promote(this->maxAllocSizeInBytes);
		  }
		};

} // end anonymous namespace		} // end anonymous namespace

std::unique_ptr<Pass> mlir::createBufferHoistingPass() {		std::unique_ptr<Pass> mlir::createBufferHoistingPass() {
return std::make_unique<BufferHoistingPass>();		return std::make_unique<BufferHoistingPass>();
}		}

std::unique_ptr<Pass> mlir::createBufferLoopHoistingPass() {		std::unique_ptr<Pass> mlir::createBufferLoopHoistingPass() {
return std::make_unique<BufferLoopHoistingPass>();		return std::make_unique<BufferLoopHoistingPass>();
}		}

		// Factory for the promote-buffers-to-stack pass; `maxAllocSizeInBytes` is
		// handed unchanged to the pass constructor.
		std::unique_ptr<Pass>
		mlir::createPromoteBuffersToStackPass(unsigned maxAllocSizeInBytes) {
		  std::unique_ptr<Pass> promotionPass =
		      std::make_unique<PromoteBuffersToStackPass>(maxAllocSizeInBytes);
		  return promotionPass;
		}

mlir/test/Transforms/promote-buffers-to-stack.mlir

This file was added.

				// RUN: mlir-opt -promote-buffers-to-stack -split-input-file %s | FileCheck %s

				// This file checks the behavior of PromoteBuffersToStack pass for converting
				// AllocOps into AllocaOps, if possible.

				// Test Case:
				// bb0
				// / \
				// bb1 bb2 <- Initial position of AllocOp
				// \ /
				// bb3
				// PromoteBuffersToStack expected behavior: It should convert %0 into an
				// AllocaOp.
				herhutUnsubmitted Done Reply Inline Actions Nit: please update comment. herhut: Nit: please update comment.

				#map0 = affine_map<(d0) -> (d0)>

				// CHECK-LABEL: func @condBranch
				func @condBranch(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
				cond_br %arg0, ^bb1, ^bb2
				^bb1:
				br ^bb3(%arg1 : memref<2xf32>)
				^bb2:
				%0 = alloc() : memref<2xf32>
				linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
				ins(%arg1: memref<2xf32>)
				outs(%0: memref<2xf32>) {
				^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
				%tmp1 = exp %gen1_arg0 : f32
				linalg.yield %tmp1 : f32
				}
				br ^bb3(%0 : memref<2xf32>)
				^bb3(%1: memref<2xf32>):
				"linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> ()
				return
				}

				// CHECK-NEXT: cond_br {{.*}}
				// CHECK: ^bb2
				// CHECK-NEXT: %[[ALLOCA:.*]] = alloca()
				// CHECK: linalg.copy
				// CHECK-NEXT: return

				// -----

				// Test Case:
				// bb0
				// / \
				// bb1 bb2 <- Initial position of AllocOp
				// \ /
				// bb3
				// PromoteBuffersToStack expected behavior:
				// Since the alloc has dynamic type, it is not converted into an alloca.

				#map0 = affine_map<(d0) -> (d0)>

				// CHECK-LABEL: func @condBranchDynamicType
				func @condBranchDynamicType(
				%arg0: i1,
				%arg1: memref<?xf32>,
				%arg2: memref<?xf32>,
				%arg3: index) {
				cond_br %arg0, ^bb1, ^bb2(%arg3: index)
				^bb1:
				br ^bb3(%arg1 : memref<?xf32>)
				^bb2(%0: index):
				%1 = alloc(%0) : memref<?xf32>
				linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
				ins(%arg1: memref<?xf32>)
				outs(%1: memref<?xf32>) {
				^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
				%tmp1 = exp %gen1_arg0 : f32
				linalg.yield %tmp1 : f32
				}
				br ^bb3(%1 : memref<?xf32>)
				^bb3(%2: memref<?xf32>):
				"linalg.copy"(%2, %arg2) : (memref<?xf32>, memref<?xf32>) -> ()
				return
				}

				// CHECK-NEXT: cond_br
				// CHECK: ^bb2
				// CHECK: ^bb2(%[[IDX:.*]]:{{.*}})
				// CHECK-NEXT: %[[ALLOC0:.*]] = alloc(%[[IDX]])
				// CHECK-NEXT: linalg.generic
				// CHECK: br ^bb3
				// CHECK-NEXT: ^bb3(%[[ALLOC0:.*]]:{{.*}})
				// CHECK: linalg.copy(%[[ALLOC0]],
				// CHECK-NEXT: return

				// -----

				// Test Case: Existing AllocOp with no users.
				// PromoteBuffersToStack expected behavior: It should convert it to an
				// AllocaOp.

				// CHECK-LABEL: func @emptyUsesValue
				func @emptyUsesValue(%arg0: memref<4xf32>) {
				%0 = alloc() : memref<4xf32>
				return
				}
				// CHECK-NEXT: %[[ALLOCA:.*]] = alloca()
				// CHECK-NEXT: return

				// -----

				// Test Case:
				// bb0
				// / \
				// | bb1 <- Initial position of AllocOp
				// \ /
				// bb2
				// PromoteBuffersToStack expected behavior: It should convert it into an
				// AllocaOp.

				#map0 = affine_map<(d0) -> (d0)>
				herhutUnsubmitted Done Reply Inline Actions Nit: here, too. And also for the changes below. herhut: Nit: here, too. And also for the changes below.

				// CHECK-LABEL: func @criticalEdge
				func @criticalEdge(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
				cond_br %arg0, ^bb1, ^bb2(%arg1 : memref<2xf32>)
				^bb1:
				%0 = alloc() : memref<2xf32>
				linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
				ins(%arg1: memref<2xf32>)
				outs(%0: memref<2xf32>) {
				^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
				%tmp1 = exp %gen1_arg0 : f32
				linalg.yield %tmp1 : f32
				}
				br ^bb2(%0 : memref<2xf32>)
				^bb2(%1: memref<2xf32>):
				"linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> ()
				return
				}

				// CHECK-NEXT: cond_br {{.*}}
				// CHECK: ^bb1
				// CHECK-NEXT: %[[ALLOCA:.*]] = alloca()
				// CHECK: linalg.copy
				// CHECK-NEXT: return

				// -----

				// Test Case:
				// bb0 <- Initial position of AllocOp
				// / \
				// | bb1
				// \ /
				// bb2
				// PromoteBuffersToStack expected behavior: It converts the alloc into an alloca.

				herhutUnsubmitted Done Reply Inline Actions nit: allco -> alloc herhut: nit: allco -> alloc
				#map0 = affine_map<(d0) -> (d0)>

				// CHECK-LABEL: func @invCriticalEdge
				func @invCriticalEdge(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
				%0 = alloc() : memref<2xf32>
				linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
				ins(%arg1: memref<2xf32>)
				outs(%0: memref<2xf32>) {
				^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
				%tmp1 = exp %gen1_arg0 : f32
				linalg.yield %tmp1 : f32
				}
				cond_br %arg0, ^bb1, ^bb2(%arg1 : memref<2xf32>)
				^bb1:
				br ^bb2(%0 : memref<2xf32>)
				^bb2(%1: memref<2xf32>):
				"linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> ()
				return
				}

				// CHECK-NEXT: %[[ALLOCA:.*]] = alloca()
				// CHECK: cond_br
				// CHECK: linalg.copy
				// CHECK-NEXT: return

				// -----

				// Test Case:
				// bb0 <- Initial position of the first AllocOp
				// / \
				// bb1 bb2
				// \ /
				// bb3 <- Initial position of the second AllocOp
				// PromoteBuffersToStack expected behavior: It converts the allocs into allocas.

				#map0 = affine_map<(d0) -> (d0)>

				// CHECK-LABEL: func @ifElse
				func @ifElse(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
				%0 = alloc() : memref<2xf32>
				linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
				ins(%arg1: memref<2xf32>)
				outs(%0: memref<2xf32>) {
				^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
				%tmp1 = exp %gen1_arg0 : f32
				linalg.yield %tmp1 : f32
				}
				cond_br %arg0,
				^bb1(%arg1, %0 : memref<2xf32>, memref<2xf32>),
				^bb2(%0, %arg1 : memref<2xf32>, memref<2xf32>)
				^bb1(%1: memref<2xf32>, %2: memref<2xf32>):
				br ^bb3(%1, %2 : memref<2xf32>, memref<2xf32>)
				^bb2(%3: memref<2xf32>, %4: memref<2xf32>):
				br ^bb3(%3, %4 : memref<2xf32>, memref<2xf32>)
				^bb3(%5: memref<2xf32>, %6: memref<2xf32>):
				%7 = alloc() : memref<2xf32>
				linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
				ins(%5: memref<2xf32>)
				outs(%7: memref<2xf32>) {
				^bb0(%gen2_arg0: f32, %gen2_arg1: f32):
				%tmp2 = exp %gen2_arg0 : f32
				linalg.yield %tmp2 : f32
				}
				"linalg.copy"(%7, %arg2) : (memref<2xf32>, memref<2xf32>) -> ()
				return
				}

				// CHECK-NEXT: %[[ALLOCA0:.*]] = alloca()
				// CHECK-NEXT: linalg.generic
				// CHECK: %[[ALLOCA1:.*]] = alloca()
				// CHECK: linalg.generic
				// CHECK: linalg.copy(%[[ALLOCA1]]
				// CHECK-NEXT: return

				// -----

				// Test Case: No users for buffer in if-else CFG
				// bb0 <- Initial position of AllocOp
				// / \
				// bb1 bb2
				// \ /
				// bb3
				// PromoteBuffersToStack expected behavior: It converts the alloc into alloca.

				#map0 = affine_map<(d0) -> (d0)>

				// CHECK-LABEL: func @ifElseNoUsers
				func @ifElseNoUsers(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
				%0 = alloc() : memref<2xf32>
				linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
				ins(%arg1: memref<2xf32>)
				outs(%0: memref<2xf32>) {
				^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
				%tmp1 = exp %gen1_arg0 : f32
				linalg.yield %tmp1 : f32
				}
				cond_br %arg0,
				^bb1(%arg1, %0 : memref<2xf32>, memref<2xf32>),
				^bb2(%0, %arg1 : memref<2xf32>, memref<2xf32>)
				^bb1(%1: memref<2xf32>, %2: memref<2xf32>):
				br ^bb3(%1, %2 : memref<2xf32>, memref<2xf32>)
				^bb2(%3: memref<2xf32>, %4: memref<2xf32>):
				br ^bb3(%3, %4 : memref<2xf32>, memref<2xf32>)
				^bb3(%5: memref<2xf32>, %6: memref<2xf32>):
				"linalg.copy"(%arg1, %arg2) : (memref<2xf32>, memref<2xf32>) -> ()
				return
				}

				// CHECK-NEXT: %[[ALLOCA:.*]] = alloca()
				// CHECK: return

				// -----

				// Test Case:
				// bb0 <- Initial position of the first AllocOp
				// / \
				// bb1 bb2
				// | / \
				// | bb3 bb4
				// \ \ /
				// \ /
				// bb5 <- Initial position of the second AllocOp
				// PromoteBuffersToStack expected behavior: The two allocs should be converted
				// into allocas.

				#map0 = affine_map<(d0) -> (d0)>

				// CHECK-LABEL: func @ifElseNested
				func @ifElseNested(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
				%0 = alloc() : memref<2xf32>
				linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
				ins(%arg1: memref<2xf32>)
				outs(%0: memref<2xf32>) {
				^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
				%tmp1 = exp %gen1_arg0 : f32
				linalg.yield %tmp1 : f32
				}
				cond_br %arg0,
				^bb1(%arg1, %0 : memref<2xf32>, memref<2xf32>),
				^bb2(%0, %arg1 : memref<2xf32>, memref<2xf32>)
				^bb1(%1: memref<2xf32>, %2: memref<2xf32>):
				br ^bb5(%1, %2 : memref<2xf32>, memref<2xf32>)
				^bb2(%3: memref<2xf32>, %4: memref<2xf32>):
				cond_br %arg0, ^bb3(%3 : memref<2xf32>), ^bb4(%4 : memref<2xf32>)
				^bb3(%5: memref<2xf32>):
				br ^bb5(%5, %3 : memref<2xf32>, memref<2xf32>)
				^bb4(%6: memref<2xf32>):
				br ^bb5(%3, %6 : memref<2xf32>, memref<2xf32>)
				^bb5(%7: memref<2xf32>, %8: memref<2xf32>):
				%9 = alloc() : memref<2xf32>
				linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
				ins(%7: memref<2xf32>)
				outs(%9: memref<2xf32>) {
				^bb0(%gen2_arg0: f32, %gen2_arg1: f32):
				%tmp2 = exp %gen2_arg0 : f32
				linalg.yield %tmp2 : f32
				}
				"linalg.copy"(%9, %arg2) : (memref<2xf32>, memref<2xf32>) -> ()
				return
				}

				// CHECK-NEXT: %[[ALLOCA0:.*]] = alloca()
				// CHECK-NEXT: linalg.generic
				// CHECK: %[[ALLOCA1:.*]] = alloca()
				// CHECK: linalg.generic
				// CHECK: linalg.copy(%[[ALLOCA1]]
				// CHECK-NEXT: return

				// -----

				// Test Case: Dead operations in a single block.
				// PromoteBuffersToStack expected behavior: It converts the two AllocOps into
				// allocas.

				#map0 = affine_map<(d0) -> (d0)>

				// CHECK-LABEL: func @redundantOperations
				func @redundantOperations(%arg0: memref<2xf32>) {
				%0 = alloc() : memref<2xf32>
				linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
				ins(%arg0: memref<2xf32>)
				outs(%0: memref<2xf32>) {
				^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
				%tmp1 = exp %gen1_arg0 : f32
				linalg.yield %tmp1 : f32
				}
				%1 = alloc() : memref<2xf32>
				linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
				ins(%0: memref<2xf32>)
				outs(%1: memref<2xf32>) {
				^bb0(%gen2_arg0: f32, %gen2_arg1: f32):
				%tmp2 = exp %gen2_arg0 : f32
				linalg.yield %tmp2 : f32
				}
				return
				}

				// CHECK: (%[[ARG0:.*]]: {{.*}})
				// CHECK-NEXT: %[[ALLOCA0:.*]] = alloca()
				// CHECK-NEXT: linalg.generic {{{.*}}} ins(%[[ARG0]]{{.*}} outs(%[[ALLOCA0]]
				// CHECK: %[[ALLOCA1:.*]] = alloca()
				// CHECK-NEXT: linalg.generic {{{.*}}} ins(%[[ALLOCA0]]{{.*}} outs(%[[ALLOCA1]]
				// CHECK: return

				// -----

				// Test Case:
				// bb0
				// / \
				// Initial pos of the 1st AllocOp -> bb1 bb2 <- Initial pos of the 2nd AllocOp
				// \ /
				// bb3
				// PromoteBuffersToStack expected behavior: Both AllocOps are converted into
				// allocas.

				// Identity map used as the indexing map for both the input and the output
				// of the two linalg.generic ops in this function.
				#map0 = affine_map<(d0) -> (d0)>

				// CHECK-LABEL: func @moving_alloc_and_inserting_missing_dealloc
				func @moving_alloc_and_inserting_missing_dealloc(
				%cond: i1,
				%arg0: memref<2xf32>,
				%arg1: memref<2xf32>) {
				// Diamond-shaped control flow: each successor allocates its own buffer.
				cond_br %cond, ^bb1, ^bb2
				^bb1:
				// Buffer allocated only on the ^bb1 path; it is the output of the generic
				// op below and is forwarded to ^exit as a block argument.
				%0 = alloc() : memref<2xf32>
				linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
				ins(%arg0: memref<2xf32>)
				outs(%0: memref<2xf32>) {
				^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
				%tmp1 = exp %gen1_arg0 : f32
				linalg.yield %tmp1 : f32
				}
				br ^exit(%0 : memref<2xf32>)
				^bb2:
				// Buffer allocated only on the ^bb2 path; handled the same way as %0.
				%1 = alloc() : memref<2xf32>
				linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
				ins(%arg0: memref<2xf32>)
				outs(%1: memref<2xf32>) {
				^bb0(%gen2_arg0: f32, %gen2_arg1: f32):
				%tmp2 = exp %gen2_arg0 : f32
				linalg.yield %tmp2 : f32
				}
				br ^exit(%1 : memref<2xf32>)
				// %arg2 is either %0 or %1, depending on the branch taken; it is copied
				// into the function argument %arg1 and not returned.
				^exit(%arg2: memref<2xf32>):
				"linalg.copy"(%arg2, %arg1) : (memref<2xf32>, memref<2xf32>) -> ()
				return
				}

				// CHECK-NEXT: cond_br {{.*}}
				// CHECK: ^bb1
				// CHECK-NEXT: %{{.*}} = alloca()
				// CHECK: ^bb2
				// CHECK-NEXT: %{{.*}} = alloca()
				// CHECK: linalg.copy
				// CHECK-NEXT: return

				// -----

				// Test Case: Nested regions - This test defines a GenericOp inside the region
				// of another GenericOp.
				// PromoteBuffersToStack expected behavior: The outer AllocOp is converted
				// into an alloca, whereas the AllocOp nested inside the region of the outer
				// GenericOp remains an alloc.

				// Identity map used as the indexing map for both the input and the output
				// of the outer and the inner linalg.generic op in this function.
				#map0 = affine_map<(d0) -> (d0)>

				// CHECK-LABEL: func @nested_regions_and_cond_branch
				func @nested_regions_and_cond_branch(
				%arg0: i1,
				%arg1: memref<2xf32>,
				%arg2: memref<2xf32>) {
				cond_br %arg0, ^bb1, ^bb2
				^bb1:
				// No allocation on this path: the function argument is forwarded as is.
				br ^bb3(%arg1 : memref<2xf32>)
				^bb2:
				// Outer buffer: output of the outer generic op, forwarded to ^bb3 below.
				%0 = alloc() : memref<2xf32>
				linalg.generic {
				indexing_maps = [#map0, #map0],
				iterator_types = ["parallel"]}
				ins(%arg1: memref<2xf32>)
				outs(%0: memref<2xf32>) {
				^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
				// Inner buffer: allocated inside the body region of the outer generic op
				// and used only as the output of the inner generic op.
				%1 = alloc() : memref<2xf32>
				linalg.generic {
				indexing_maps = [#map0, #map0],
				iterator_types = ["parallel"]}
				ins(%arg1: memref<2xf32>)
				outs(%1: memref<2xf32>) {
				^bb0(%gen2_arg0: f32, %gen2_arg1: f32):
				%tmp2 = exp %gen2_arg0 : f32
				linalg.yield %tmp2 : f32
				}
				%tmp1 = exp %gen1_arg0 : f32
				linalg.yield %tmp1 : f32
				}
				br ^bb3(%0 : memref<2xf32>)
				// Note: the block argument below reuses the name %1; it is not the buffer
				// that is allocated inside the generic region above.
				^bb3(%1: memref<2xf32>):
				"linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> ()
				return
				}

				// CHECK-NEXT: cond_br {{.*}}
				// CHECK: ^bb2
				// CHECK-NEXT: %[[ALLOCA0:.*]] = alloca()
				// CHECK: ^bb0
				// CHECK-NEXT: %[[ALLOCA1:.*]] = alloc()

				// -----

				// Test Case: buffer deallocation escaping
				// PromoteBuffersToStack expected behavior: The first alloc is returned, so
				// there is no conversion allowed. The second alloc is converted, since it
				// only remains in the scope of the function.

				// Identity map used as the indexing map for both the input and the output
				// of the linalg.generic op in this function.
				#map0 = affine_map<(d0) -> (d0)>

				// CHECK-LABEL: func @memref_in_function_results
				func @memref_in_function_results(
				%arg0: memref<5xf32>,
				%arg1: memref<10xf32>,
				%arg2: memref<5xf32>) -> (memref<10xf32>, memref<15xf32>) {
				// %x is returned from the function as its second result.
				%x = alloc() : memref<15xf32>
				// %y is only used locally: it is the output of the generic op and the source
				// of the copy into %arg2.
				%y = alloc() : memref<5xf32>
				linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
				ins(%arg0: memref<5xf32>)
				outs(%y: memref<5xf32>) {
				^bb0(%arg3: f32, %arg4: f32):
				%2 = exp %arg3 : f32
				linalg.yield %2 : f32
				}
				linalg.copy(%y, %arg2) : memref<5xf32>, memref<5xf32>
				// The first result is the unmodified function argument %arg1.
				return %arg1, %x : memref<10xf32>, memref<15xf32>
				}
				// CHECK: (%[[ARG0:.*]]: memref<5xf32>, %[[ARG1:.*]]: memref<10xf32>,
				// CHECK-SAME: %[[RESULT:.*]]: memref<5xf32>)
				// CHECK: %[[ALLOC:.*]] = alloc()
				// CHECK: %[[ALLOCA:.*]] = alloca()
				// CHECK: linalg.copy
				// CHECK: return %[[ARG1]], %[[ALLOC]]

				// -----

				// Test Case: nested region control flow
				// The allocation in the nested if branch cannot be converted to an alloca
				// due to its dynamic memory allocation behavior.

				// CHECK-LABEL: func @nested_region_control_flow
				func @nested_region_control_flow(
				%arg0 : index,
				%arg1 : index) -> memref<?x?xf32> {
				%0 = cmpi "eq", %arg0, %arg1 : index
				// Dynamically shaped buffer (both sizes come from %arg0). It is yielded by
				// both branches of the scf.if below, and the scf.if result is returned.
				%1 = alloc(%arg0, %arg0) : memref<?x?xf32>
				%2 = scf.if %0 -> (memref<?x?xf32>) {
				scf.yield %1 : memref<?x?xf32>
				} else {
				// Dynamically shaped buffer allocated inside the else branch. It is not
				// used: the branch yields %1, not %3.
				%3 = alloc(%arg0, %arg1) : memref<?x?xf32>
				scf.yield %1 : memref<?x?xf32>
				}
				return %2 : memref<?x?xf32>
				}

				// CHECK: %[[ALLOC0:.*]] = alloc(%arg0, %arg0)
				// CHECK-NEXT: %[[ALLOC1:.*]] = scf.if
				// CHECK: scf.yield %[[ALLOC0]]
				// CHECK: %[[ALLOC2:.*]] = alloc(%arg0, %arg1)
				// CHECK-NEXT: scf.yield %[[ALLOC0]]
				// CHECK: return %[[ALLOC1]]

				// -----

				// Test Case: nested region control flow within a region interface.
				// The alloc %0 does not need to be converted in this case since the
				// allocation finally escapes the method.

				// CHECK-LABEL: func @inner_region_control_flow
				func @inner_region_control_flow(%arg0 : index) -> memref<2x2xf32> {
				// Statically shaped buffer that is passed as the operand of test.region_if.
				%0 = alloc() : memref<2x2xf32>
				// Each of the three regions (then / else / join) takes one memref block
				// argument and yields exactly that argument.
				%1 = test.region_if %0 : memref<2x2xf32> -> (memref<2x2xf32>) then {
				^bb0(%arg1 : memref<2x2xf32>):
				test.region_if_yield %arg1 : memref<2x2xf32>
				} else {
				^bb0(%arg1 : memref<2x2xf32>):
				test.region_if_yield %arg1 : memref<2x2xf32>
				} join {
				^bb0(%arg1 : memref<2x2xf32>):
				test.region_if_yield %arg1 : memref<2x2xf32>
				}
				// The result of test.region_if is the function result.
				return %1 : memref<2x2xf32>
				}

				// CHECK: %[[ALLOC0:.*]] = alloc()
				// CHECK-NEXT: %[[ALLOC1:.*]] = test.region_if
				// CHECK-NEXT: ^bb0(%[[ALLOC2:.*]]:{{.*}}):
				// CHECK-NEXT: test.region_if_yield %[[ALLOC2]]
				// CHECK: ^bb0(%[[ALLOC3:.*]]:{{.*}}):
				// CHECK-NEXT: test.region_if_yield %[[ALLOC3]]
				// CHECK: ^bb0(%[[ALLOC4:.*]]:{{.*}}):
				// CHECK-NEXT: test.region_if_yield %[[ALLOC4]]
				// CHECK: return %[[ALLOC1]]

				// -----

				// Test Case: structured control-flow loop using a nested alloc.
				// [Review note, herhut] Please also fix the explanatory comment.
				// Alloc %0 will be converted to an alloca. %3 is not transformed.

				// CHECK-LABEL: func @loop_alloc
				func @loop_alloc(
				%lb: index,
				%ub: index,
				%step: index,
				%buf: memref<2xf32>,
				%res: memref<2xf32>) {
				// Buffer allocated outside of the loop; it is not referenced again in this
				// function.
				%0 = alloc() : memref<2xf32>
				// The loop carries a memref that is initialized with the %buf argument.
				%1 = scf.for %i = %lb to %ub step %step
				iter_args(%iterBuf = %buf) -> memref<2xf32> {
				// The comparison result %2 is unused.
				%2 = cmpi "eq", %i, %ub : index
				// Buffer allocated inside the loop body and yielded as the loop-carried
				// value.
				%3 = alloc() : memref<2xf32>
				scf.yield %3 : memref<2xf32>
				}
				// The loop result is copied into the %res argument; nothing is returned.
				"linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> ()
				return
				}

				// CHECK-NEXT: %[[ALLOCA:.*]] = alloca()
				// CHECK-NEXT: scf.for
				// CHECK: %[[ALLOC:.*]] = alloc()

				// -----

				// Test Case: structured control-flow loop with a nested if operation.
				// The loop yields buffers that have been defined outside of the loop and the
				// backedges only use the iteration arguments (or one of its aliases).
				// Therefore, we do not have to (and are not allowed to) free any buffers
				// that are passed via the backedges. The alloc is converted to an AllocaOp.

				// CHECK-LABEL: func @loop_nested_if_no_alloc
				func @loop_nested_if_no_alloc(
				%lb: index,
				%ub: index,
				%step: index,
				%buf: memref<2xf32>,
				%res: memref<2xf32>) {
				// The only allocation in this function; it is defined outside of the loop.
				%0 = alloc() : memref<2xf32>
				// The loop carries a memref that is initialized with the %buf argument.
				%1 = scf.for %i = %lb to %ub step %step
				iter_args(%iterBuf = %buf) -> memref<2xf32> {
				%2 = cmpi "eq", %i, %ub : index
				// No allocation inside the loop: the nested scf.if yields either the outer
				// buffer %0 or the current iteration argument.
				%3 = scf.if %2 -> (memref<2xf32>) {
				scf.yield %0 : memref<2xf32>
				} else {
				scf.yield %iterBuf : memref<2xf32>
				}
				scf.yield %3 : memref<2xf32>
				}
				// The loop result is copied into the %res argument; nothing is returned.
				"linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> ()
				return
				}

				// CHECK: %[[ALLOCA0:.*]] = alloca()
				// CHECK-NEXT: %[[ALLOCA1:.*]] = scf.for {{.*}} iter_args(%[[IALLOCA:.*]] =
				// CHECK: %[[ALLOCA2:.*]] = scf.if
				// CHECK: scf.yield %[[ALLOCA0]]
				// [Review note, herhut] I would not want this transformation to happen, as it
				// might quickly exhaust the stack space. Using alloca inside of a loop is
				// dangerous. Rather, could the optimization be limited to cases where the
				// allocation is not inside a loop? In essence, if the alloc cannot be moved
				// to the level of the allocation scope (conceptually), it should not be
				// transformed into an `alloca`. This is just conceptually, as `alloca`
				// should not be moved out of conditionals either.
				// [Review reply, dfki-jugr] In the transformation process, the parent
				// allocation scopes are checked. During this traversal, the visited
				// operations are checked to be a known control flow interface. If this is
				// the case, we skip the transformation for the given alloc value.
				// CHECK: scf.yield %[[IALLOCA]]
				// CHECK: scf.yield %[[ALLOCA2]]
				// CHECK: linalg.copy(%[[ALLOCA1]], %arg4)

				// -----

				// Test Case: structured control-flow loop with a nested if operation using
				// a deeply nested buffer allocation.
				// The allocs are not converted in this case.

				// CHECK-LABEL: func @loop_nested_if_alloc
				func @loop_nested_if_alloc(
				%lb: index,
				%ub: index,
				%step: index,
				%buf: memref<2xf32>) -> memref<2xf32> {
				// Buffer allocated outside of the loop; yielded by the else branch of the
				// nested scf.if.
				%0 = alloc() : memref<2xf32>
				// The loop carries a memref that is initialized with the %buf argument.
				%1 = scf.for %i = %lb to %ub step %step
				iter_args(%iterBuf = %buf) -> memref<2xf32> {
				%2 = cmpi "eq", %i, %ub : index
				%3 = scf.if %2 -> (memref<2xf32>) {
				// Buffer allocated inside the then branch, which is itself nested in the
				// loop body; it is yielded out of the scf.if.
				%4 = alloc() : memref<2xf32>
				scf.yield %4 : memref<2xf32>
				} else {
				scf.yield %0 : memref<2xf32>
				}
				scf.yield %3 : memref<2xf32>
				}
				// The loop result is the function result.
				return %1 : memref<2xf32>
				}

				// CHECK: %[[ALLOC0:.*]] = alloc()
				// CHECK-NEXT: %[[ALLOC1:.*]] = scf.for {{.*}}
				// CHECK: %[[ALLOC2:.*]] = scf.if
				// CHECK: %[[ALLOC3:.*]] = alloc()
				// CHECK-NEXT: scf.yield %[[ALLOC3]]
				// CHECK: scf.yield %[[ALLOC0]]
				// CHECK: scf.yield %[[ALLOC2]]
				// CHECK: return %[[ALLOC1]]

				// -----

				// Test Case: The allocated buffer is too large and, hence, it is not
				// converted. In the actual implementation the largest size is 1KB.

				// CHECK-LABEL: func @large_buffer_allocation
				func @large_buffer_allocation(%arg0: memref<2048xf32>) {
				// Statically shaped buffer of 2048 f32 elements (8192 bytes at 32 bits per
				// element).
				%0 = alloc() : memref<2048xf32>
				// The buffer is only copied into the function argument; nothing is returned.
				"linalg.copy"(%0, %arg0) : (memref<2048xf32>, memref<2048xf32>) -> ()
				return
				}

				// CHECK-NEXT: %[[ALLOC:.*]] = alloc()
				// CHECK-NEXT: linalg.copy

This is an archive of the discontinued LLVM Phabricator instance.

[MLIR] Added PromoteBuffersToStackPass to convert heap-based allocations to stack-based allocations.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 300209

mlir/include/mlir/Transforms/Passes.h

mlir/include/mlir/Transforms/Passes.td

mlir/lib/Transforms/BufferOptimizations.cpp

mlir/test/Transforms/promote-buffers-to-stack.mlir

This is an archive of the discontinued LLVM Phabricator instance.

[MLIR] Added PromoteBuffersToStackPass to convert heap-based allocations to stack-based allocations.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 300209

mlir/include/mlir/Transforms/Passes.h

mlir/include/mlir/Transforms/Passes.td

mlir/lib/Transforms/BufferOptimizations.cpp

mlir/test/Transforms/promote-buffers-to-stack.mlir

[MLIR] Added PromoteBuffersToStackPass to convert heap-based allocations to stack-based allocations.
ClosedPublic