diff --git a/mlir/include/mlir/Transforms/Bufferize.h b/mlir/include/mlir/Transforms/Bufferize.h
--- a/mlir/include/mlir/Transforms/Bufferize.h
+++ b/mlir/include/mlir/Transforms/Bufferize.h
@@ -28,8 +28,10 @@
 #ifndef MLIR_TRANSFORMS_BUFFERIZE_H
 #define MLIR_TRANSFORMS_BUFFERIZE_H
 
+#include "mlir/Analysis/Liveness.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/IR/Builders.h"
+#include "mlir/IR/Dominance.h"
 #include "mlir/IR/Function.h"
 #include "mlir/IR/Operation.h"
 #include "mlir/Transforms/DialectConversion.h"
@@ -276,6 +278,142 @@
   >(context, converter);
   // clang-format on
 }
+
+/// A straight-forward alias analysis which ensures that all aliases of all
+/// values will be determined. This is a requirement for the BufferPlacement
+/// class since you need to determine safe positions to place alloc and
+/// deallocs.
+class BufferPlacementAliasAnalysis {
+public:
+  using ValueSetT = SmallPtrSet;
+  using ValueMapT = llvm::DenseMap;
+
+public:
+  /// Constructs a new alias analysis using the op provided.
+  BufferPlacementAliasAnalysis(Operation *op);
+
+  /// Find all immediate aliases this value could potentially have.
+  ValueMapT::const_iterator find(Value value) const {
+    return aliases.find(value);
+  }
+
+  /// Returns the begin iterator to iterate over all aliases.
+  ValueMapT::const_iterator begin() const { return aliases.begin(); }
+
+  /// Returns the end iterator that can be used in combination with find.
+  ValueMapT::const_iterator end() const { return aliases.end(); }
+
+  /// Find all immediate and indirect aliases this value could potentially
+  /// have. Note that the resulting set will also contain the value provided as
+  /// it is an alias of itself.
+  ValueSetT resolve(Value value) const;
+
+  /// Removes the given values from all alias sets.
+  void remove(const SmallPtrSetImpl &aliasValues);
+
+private:
+  /// This function constructs a mapping from values to its immediate aliases.
+  void build(Operation *op);
+
+  /// Maps values to all immediate aliases this value can have.
+  ValueMapT aliases;
+};
+
+/// A simple analysis that detects allocation operations.
+class BufferPlacementAllocs {
+public:
+  /// Represents a tuple of allocValue and deallocOperation.
+  using AllocEntry = std::tuple;
+
+  /// Represents a list containing all alloc entries.
+  using AllocEntryList = SmallVector;
+
+  /// Get the start operation to place the given alloc value within the
+  /// specified placement block.
+  static Operation *getStartOperation(Value allocValue, Block *placementBlock,
+                                      const Liveness &liveness);
+
+  /// Find an associated dealloc operation that is linked to the given
+  /// allocation node (if any).
+  static Operation *findDealloc(Value allocValue);
+
+public:
+  /// Initializes the internal list by discovering all supported allocation
+  /// nodes.
+  BufferPlacementAllocs(Operation *op);
+
+  /// Returns the begin iterator to iterate over all allocations.
+  AllocEntryList::const_iterator begin() const { return allocs.begin(); }
+
+  /// Returns the end iterator that can be used in combination with begin.
+  AllocEntryList::const_iterator end() const { return allocs.end(); }
+
+  /// Returns the begin iterator to iterate over all allocations.
+  AllocEntryList::iterator begin() { return allocs.begin(); }
+
+  /// Returns the end iterator that can be used in combination with begin.
+  AllocEntryList::iterator end() { return allocs.end(); }
+
+  /// Registers a new allocation entry.
+ void registerAlloc(const AllocEntry &entry) { allocs.push_back(entry); } + +private: + /// Searches for and registers all supported allocation entries. + void build(Operation *op); + +private: + /// Maps allocation nodes to their associated blocks. + AllocEntryList allocs; +}; + +/// The base class for all BufferPlacement transformations. +class BufferPlacementTransformationBase { +public: + using ValueSetT = BufferPlacementAliasAnalysis::ValueSetT; + + /// Finds a common dominator for the given value while taking the positions + /// of the values in the value set into account. It supports dominator and + /// post-dominator analyses via template arguments. + template + static Block *findCommonDominator(Value value, const ValueSetT &values, + const DominatorT &doms) { + // Start with the current block the value is defined in. + Block *dom = value.getParentBlock(); + // Iterate over all aliases and their uses to find a safe placement block + // according to the given dominator information. + for (Value childValue : values) { + for (Operation *user : childValue.getUsers()) { + // Move upwards in the dominator tree to find an appropriate + // dominator block that takes the current use into account. + dom = doms.findNearestCommonDominator(dom, user->getBlock()); + } + // Take values without any users into account. + dom = doms.findNearestCommonDominator(dom, childValue.getParentBlock()); + } + return dom; + } + + /// Returns true if the given operation represents a loop by testing whether + /// it implements the `LoopLikeOpInterface` or the `RegionBranchOpInterface`. + /// In the case of a `RegionBranchOpInterface`, it checks all region-based + /// control-flow edges for cycles. + static bool isLoop(Operation *op); + + /// Constructs a new operation base using the given root operation. + BufferPlacementTransformationBase(Operation *op); + +protected: + /// Alias information that can be updated during the insertion of copies. + BufferPlacementAliasAnalysis aliases; + + /// Stores all internally managed allocations. + BufferPlacementAllocs allocs; + + /// The underlying liveness analysis to compute fine grained information + /// about alloc and dealloc positions. + Liveness liveness; +}; + } // end namespace mlir #endif // MLIR_TRANSFORMS_BUFFERIZE_H diff --git a/mlir/include/mlir/Transforms/Passes.h b/mlir/include/mlir/Transforms/Passes.h --- a/mlir/include/mlir/Transforms/Passes.h +++ b/mlir/include/mlir/Transforms/Passes.h @@ -28,8 +28,17 @@ // Passes //===----------------------------------------------------------------------===// -/// Creates an instance of the BufferPlacement pass. -std::unique_ptr createBufferPlacementPass(); +/// Creates an instance of the BufferDeallocation pass to free all allocated +/// buffers. +std::unique_ptr createBufferDeallocationPass(); + +/// Creates a pass that moves allocations upwards to reduce the number of +/// required copies that are inserted during the BufferDeallocation pass. +std::unique_ptr createBufferHoistingPass(); + +/// Creates a pass that moves allocations upwards out of loops. This avoids +/// reallocations inside of loops. +std::unique_ptr createBufferLoopHoistingPass(); /// Creates an instance of the Canonicalizer pass. 
std::unique_ptr createCanonicalizerPass(); diff --git a/mlir/include/mlir/Transforms/Passes.td b/mlir/include/mlir/Transforms/Passes.td --- a/mlir/include/mlir/Transforms/Passes.td +++ b/mlir/include/mlir/Transforms/Passes.td @@ -102,12 +102,13 @@ let constructor = "mlir::createPipelineDataTransferPass()"; } -def BufferPlacement : FunctionPass<"buffer-placement"> { - let summary = "Optimizes placement of alloc and dealloc operations"; +def BufferDeallocation : FunctionPass<"buffer-deallocation"> { + let summary = "Adds all required dealloc operations for all allocations in the " + "input program"; let description = [{ - This pass implements an algorithm to optimize the placement of alloc and - dealloc operations. This pass also inserts missing dealloc operations - automatically to reclaim memory. + This pass implements an algorithm to automatically introduce all required + deallocation operations for all buffers in the input program. This ensures that + the resulting program does not have any memory leaks. Input @@ -121,7 +122,11 @@ br ^bb3(%arg1 : memref<2xf32>) ^bb2: %0 = alloc() : memref<2xf32> - linalg.generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} %arg1, %0 { + linalg.generic { + args_in = 1 : i64, + args_out = 1 : i64, + indexing_maps = [#map0, #map0], + iterator_types = ["parallel"]} %arg1, %0 { ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): %tmp1 = exp %gen1_arg0 : f32 linalg.yield %tmp1 : f32 @@ -141,31 +146,61 @@ #map0 = affine_map<(d0) -> (d0)> module { func @condBranch(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { - %0 = alloc() : memref<2xf32> cond_br %arg0, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - br ^bb3(%arg1 : memref<2xf32>) - ^bb2: // pred: ^bb0 - linalg.generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} %arg1, %0 { - ^bb0(%arg3: f32, %arg4: f32): // no predecessors - %2 = exp %arg3 : f32 - linalg.yield %2 : f32 - }: memref<2xf32>, memref<2xf32> + ^bb1: // pred: ^bb0 + %0 = alloc() : memref<2xf32> + linalg.copy(%arg1, %0) : memref<2xf32>, memref<2xf32> br ^bb3(%0 : memref<2xf32>) - ^bb3(%1: memref<2xf32>): // 2 preds: ^bb1, ^bb2 - linalg.copy(%1, %arg2) : memref<2xf32>, memref<2xf32> - dealloc %0 : memref<2xf32> + ^bb2: // pred: ^bb0 + %1 = alloc() : memref<2xf32> + linalg.generic { + args_in = 1 : i64, + args_out = 1 : i64, + indexing_maps = [#map0, #map0], + iterator_types = ["parallel"]} %arg1, %1 { + ^bb0(%arg3: f32, %arg4: f32): // no predecessors + %4 = exp %arg3 : f32 + linalg.yield %4 : f32 + }: memref<2xf32>, memref<2xf32> + %2 = alloc() : memref<2xf32> + linalg.copy(%1, %2) : memref<2xf32>, memref<2xf32> + dealloc %1 : memref<2xf32> + br ^bb3(%2 : memref<2xf32>) + ^bb3(%3: memref<2xf32>): // 2 preds: ^bb1, ^bb2 + linalg.copy(%3, %arg2) : memref<2xf32>, memref<2xf32> + dealloc %3 : memref<2xf32> return } + } ``` }]; - let constructor = "mlir::createBufferPlacementPass()"; + let constructor = "mlir::createBufferDeallocationPass()"; // TODO: this pass likely shouldn't depend on Linalg? let dependentDialects = ["linalg::LinalgDialect"]; } +def BufferHoisting : FunctionPass<"buffer-hoisting"> { + let summary = "Optimizes placement of allocation operations by moving them " + "into common dominators and out of nested regions"; + let description = [{ + This pass implements an approach to aggressively move allocations upwards + into common dominators and out of nested regions. 
+ }]; + let constructor = "mlir::createBufferHoistingPass()"; +} + +def BufferLoopHoisting : FunctionPass<"buffer-loop-hoisting"> { + let summary = "Optimizes placement of allocation operations by moving them " + "out of loop nests"; + let description = [{ + This pass implements an approach to aggressively move allocations upwards + out of loop nests. It does not move allocations into common dominators. + }]; + let constructor = "mlir::createBufferLoopHoistingPass()"; +} + def Canonicalizer : Pass<"canonicalize"> { let summary = "Canonicalize operations"; let description = [{ diff --git a/mlir/lib/Transforms/BufferPlacement.cpp b/mlir/lib/Transforms/BufferDeallocation.cpp rename from mlir/lib/Transforms/BufferPlacement.cpp rename to mlir/lib/Transforms/BufferDeallocation.cpp --- a/mlir/lib/Transforms/BufferPlacement.cpp +++ b/mlir/lib/Transforms/BufferDeallocation.cpp @@ -1,4 +1,4 @@ -//===- BufferPlacement.cpp - the impl for buffer placement ---------------===// +//===- BufferDeallocation.cpp - the impl for buffer deallocation ----------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -8,14 +8,15 @@ // // This file implements logic for computing correct alloc and dealloc positions. // Furthermore, buffer placement also adds required new alloc and copy -// operations to ensure that all buffers are deallocated.The main class is the -// BufferPlacementPass class that implements the underlying algorithm. In order -// to put allocations and deallocations at safe positions, it is significantly -// important to put them into the correct blocks. However, the liveness analysis -// does not pay attention to aliases, which can occur due to branches (and their -// associated block arguments) in general. For this purpose, BufferPlacement -// firstly finds all possible aliases for a single value (using the -// BufferPlacementAliasAnalysis class). Consider the following example: +// operations to ensure that all buffers are deallocated. The main class is the +// BufferDeallocationPass class that implements the underlying algorithm. In +// order to put allocations and deallocations at safe positions, it is +// significantly important to put them into the correct blocks. However, the +// liveness analysis does not pay attention to aliases, which can occur due to +// branches (and their associated block arguments) in general. For this purpose, +// BufferDeallocation firstly finds all possible aliases for a single value +// (using the BufferPlacementAliasAnalysis class). Consider the following +// example: // // ^bb0(%arg0): // cond_br %cond, ^bb1, ^bb2 @@ -27,13 +28,9 @@ // ^exit(%arg1): // return %arg1; // -// Using liveness information on its own would cause us to place the allocs and -// deallocs in the wrong block. This is due to the fact that %new_value will not -// be liveOut of its block. Instead, we can place the alloc for %new_value -// in bb0 and its associated dealloc in exit. Alternatively, the alloc can stay -// (or even has to stay due to additional dependencies) at this location and we -// have to free the buffer in the same block, because it cannot be freed in the -// post dominator. However, this requires a new copy buffer for %arg1 that will +// We should place the dealloc for %new_value in exit. However, we have to free +// the buffer in the same block, because it cannot be freed in the post +// dominator. 
However, this requires a new copy buffer for %arg1 that will // contain the actual contents. Using the class BufferPlacementAliasAnalysis, we // will find out that %new_value has a potential alias %arg1. In order to find // the dealloc position we have to find all potential aliases, iterate over @@ -42,10 +39,9 @@ // of the deallocs). In all cases, the computed block can be safely used to free // the %new_value buffer (may be exit or bb2) as it will die and we can use // liveness information to determine the exact operation after which we have to -// insert the dealloc. Finding the alloc position is similar and non-obvious. -// However, the algorithm supports moving allocs to other places and introducing -// copy buffers and placing deallocs in safe places to ensure that all buffers -// will be freed in the end. +// insert the dealloc. However, the algorithm supports introducing copy buffers +// and placing deallocs in safe locations to ensure that all buffers will be +// freed in the end. // // TODO: // The current implementation does not support explicit-control-flow loops and @@ -56,13 +52,13 @@ //===----------------------------------------------------------------------===// #include "PassDetail.h" -#include "mlir/Analysis/Liveness.h" #include "mlir/Dialect/Linalg/IR/LinalgOps.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" -#include "mlir/IR/Dominance.h" #include "mlir/IR/Operation.h" #include "mlir/Interfaces/ControlFlowInterfaces.h" +#include "mlir/Interfaces/LoopLikeInterface.h" #include "mlir/Pass/Pass.h" +#include "mlir/Transforms/Bufferize.h" #include "mlir/Transforms/Passes.h" #include "llvm/ADT/SetOperations.h" @@ -95,134 +91,249 @@ regionInterface.getSuccessorRegions(index, operandAttributes, successors); } -namespace { //===----------------------------------------------------------------------===// // BufferPlacementAliasAnalysis //===----------------------------------------------------------------------===// -/// A straight-forward alias analysis which ensures that all aliases of all -/// values will be determined. This is a requirement for the BufferPlacement -/// class since you need to determine safe positions to place alloc and -/// deallocs. -class BufferPlacementAliasAnalysis { -public: - using ValueSetT = SmallPtrSet; - using ValueMapT = llvm::DenseMap; - -public: - /// Constructs a new alias analysis using the op provided. - BufferPlacementAliasAnalysis(Operation *op) { build(op); } - - /// Find all immediate aliases this value could potentially have. - ValueMapT::const_iterator find(Value value) const { - return aliases.find(value); - } - - /// Returns the end iterator that can be used in combination with find. - ValueMapT::const_iterator end() const { return aliases.end(); } - - /// Find all immediate and indirect aliases this value could potentially - /// have. Note that the resulting set will also contain the value provided as - /// it is an alias of itself. - ValueSetT resolve(Value value) const { - ValueSetT result; - resolveRecursive(value, result); - return result; - } +/// Constructs a new alias analysis using the op provided. +BufferPlacementAliasAnalysis::BufferPlacementAliasAnalysis(Operation *op) { + build(op); +} - /// Removes the given values from all alias sets. - void remove(const SmallPtrSetImpl &aliasValues) { - for (auto &entry : aliases) - llvm::set_subtract(entry.second, aliasValues); - } +/// Find all immediate and indirect aliases this value could potentially +/// have. 
Note that the resulting set will also contain the value provided as +/// it is an alias of itself. +BufferPlacementAliasAnalysis::ValueSetT +BufferPlacementAliasAnalysis::resolve(Value value) const { + ValueSetT result; -private: /// Recursively determines alias information for the given value. It stores /// all newly found potential aliases in the given result set. - void resolveRecursive(Value value, ValueSetT &result) const { - if (!result.insert(value).second) + std::function resolveRecursive = [&](Value current) { + if (!result.insert(current).second) return; - auto it = aliases.find(value); + auto it = aliases.find(current); if (it == aliases.end()) return; for (Value alias : it->second) - resolveRecursive(alias, result); - } + resolveRecursive(alias); + }; - /// This function constructs a mapping from values to its immediate aliases. - /// It iterates over all blocks, gets their predecessors, determines the - /// values that will be passed to the corresponding block arguments and - /// inserts them into the underlying map. Furthermore, it wires successor - /// regions and branch-like return operations from nested regions. - void build(Operation *op) { - // Registers all aliases of the given values. - auto registerAliases = [&](auto values, auto aliases) { - for (auto entry : llvm::zip(values, aliases)) - this->aliases[std::get<0>(entry)].insert(std::get<1>(entry)); - }; + resolveRecursive(value); + return result; +} - // Add additional aliases created by view changes to the alias list. - op->walk([&](ViewLikeOpInterface viewInterface) { - aliases[viewInterface.getViewSource()].insert( - viewInterface.getOperation()->getResult(0)); - }); +/// Removes the given values from all alias sets. +void BufferPlacementAliasAnalysis::remove( + const SmallPtrSetImpl &aliasValues) { + for (auto &entry : aliases) + llvm::set_subtract(entry.second, aliasValues); +} - // Query all branch interfaces to link block argument aliases. - op->walk([&](BranchOpInterface branchInterface) { - Block *parentBlock = branchInterface.getOperation()->getBlock(); - for (auto it = parentBlock->succ_begin(), e = parentBlock->succ_end(); - it != e; ++it) { - // Query the branch op interface to get the successor operands. - auto successorOperands = - branchInterface.getSuccessorOperands(it.getIndex()); - if (!successorOperands.hasValue()) - continue; - // Build the actual mapping of values to their immediate aliases. - registerAliases(successorOperands.getValue(), (*it)->getArguments()); - } - }); +/// This function constructs a mapping from values to its immediate aliases. +/// It iterates over all blocks, gets their predecessors, determines the +/// values that will be passed to the corresponding block arguments and +/// inserts them into the underlying map. Furthermore, it wires successor +/// regions and branch-like return operations from nested regions. +void BufferPlacementAliasAnalysis::build(Operation *op) { + // Registers all aliases of the given values. + auto registerAliases = [&](auto values, auto aliases) { + for (auto entry : llvm::zip(values, aliases)) + this->aliases[std::get<0>(entry)].insert(std::get<1>(entry)); + }; - // Query the RegionBranchOpInterface to find potential successor regions. - op->walk([&](RegionBranchOpInterface regionInterface) { - // Extract all entry regions and wire all initial entry successor inputs. 
- SmallVector entrySuccessors; - getSuccessorRegions(regionInterface, /*index=*/llvm::None, - entrySuccessors); - for (RegionSuccessor &entrySuccessor : entrySuccessors) { - // Wire the entry region's successor arguments with the initial - // successor inputs. - assert(entrySuccessor.getSuccessor() && - "Invalid entry region without an attached successor region"); - registerAliases(regionInterface.getSuccessorEntryOperands( - entrySuccessor.getSuccessor()->getRegionNumber()), - entrySuccessor.getSuccessorInputs()); - } + // Add additional aliases created by view changes to the alias list. + op->walk([&](ViewLikeOpInterface viewInterface) { + aliases[viewInterface.getViewSource()].insert( + viewInterface.getOperation()->getResult(0)); + }); + + // Query all branch interfaces to link block argument aliases. + op->walk([&](BranchOpInterface branchInterface) { + Block *parentBlock = branchInterface.getOperation()->getBlock(); + for (auto it = parentBlock->succ_begin(), e = parentBlock->succ_end(); + it != e; ++it) { + // Query the branch op interface to get the successor operands. + auto successorOperands = + branchInterface.getSuccessorOperands(it.getIndex()); + if (!successorOperands.hasValue()) + continue; + // Build the actual mapping of values to their immediate aliases. + registerAliases(successorOperands.getValue(), (*it)->getArguments()); + } + }); + + // Query the RegionBranchOpInterface to find potential successor regions. + op->walk([&](RegionBranchOpInterface regionInterface) { + // Extract all entry regions and wire all initial entry successor inputs. + SmallVector entrySuccessors; + getSuccessorRegions(regionInterface, /*index=*/llvm::None, entrySuccessors); + for (RegionSuccessor &entrySuccessor : entrySuccessors) { + // Wire the entry region's successor arguments with the initial + // successor inputs. + assert(entrySuccessor.getSuccessor() && + "Invalid entry region without an attached successor region"); + registerAliases(regionInterface.getSuccessorEntryOperands( + entrySuccessor.getSuccessor()->getRegionNumber()), + entrySuccessor.getSuccessorInputs()); + } - // Wire flow between regions and from region exits. - for (Region ®ion : regionInterface.getOperation()->getRegions()) { - // Iterate over all successor region entries that are reachable from the - // current region. - SmallVector successorRegions; - getSuccessorRegions(regionInterface, region.getRegionNumber(), - successorRegions); - for (RegionSuccessor &successorRegion : successorRegions) { - // Iterate over all immediate terminator operations and wire the - // successor inputs with the operands of each terminator. - walkReturnOperations(®ion, [&](Operation *terminator) { - registerAliases(terminator->getOperands(), - successorRegion.getSuccessorInputs()); - }); - } + // Wire flow between regions and from region exits. + for (Region ®ion : regionInterface.getOperation()->getRegions()) { + // Iterate over all successor region entries that are reachable from the + // current region. + SmallVector successorRegions; + getSuccessorRegions(regionInterface, region.getRegionNumber(), + successorRegions); + for (RegionSuccessor &successorRegion : successorRegions) { + // Iterate over all immediate terminator operations and wire the + // successor inputs with the operands of each terminator. 
+ walkReturnOperations(®ion, [&](Operation *terminator) { + registerAliases(terminator->getOperands(), + successorRegion.getSuccessorInputs()); + }); } + } + }); +} + +//===----------------------------------------------------------------------===// +// BufferPlacementAllocs +//===----------------------------------------------------------------------===// + +/// Get the start operation to place the given alloc value withing the +// specified placement block. +Operation *BufferPlacementAllocs::getStartOperation(Value allocValue, + Block *placementBlock, + const Liveness &liveness) { + // We have to ensure that we place the alloc before its first use in this + // block. + const LivenessBlockInfo &livenessInfo = *liveness.getLiveness(placementBlock); + Operation *startOperation = livenessInfo.getStartOperation(allocValue); + // Check whether the start operation lies in the desired placement block. + // If not, we will use the terminator as this is the last operation in + // this block. + if (startOperation->getBlock() != placementBlock) { + Operation *opInPlacementBlock = + placementBlock->findAncestorOpInBlock(*startOperation); + startOperation = opInPlacementBlock ? opInPlacementBlock + : placementBlock->getTerminator(); + } + + return startOperation; +} + +/// Finds associated deallocs that can be linked to our allocation nodes (if +/// any). +Operation *BufferPlacementAllocs::findDealloc(Value allocValue) { + auto userIt = llvm::find_if(allocValue.getUsers(), [&](Operation *user) { + auto effectInterface = dyn_cast(user); + if (!effectInterface) + return false; + // Try to find a free effect that is applied to one of our values + // that will be automatically freed by our pass. + SmallVector effects; + effectInterface.getEffectsOnValue(allocValue, effects); + return llvm::any_of(effects, [&](MemoryEffects::EffectInstance &it) { + return isa(it.getEffect()); }); + }); + // Assign the associated dealloc operation (if any). + return userIt != allocValue.user_end() ? *userIt : nullptr; +} + +/// Initializes the internal list by discovering all supported allocation +/// nodes. +BufferPlacementAllocs::BufferPlacementAllocs(Operation *op) { build(op); } + +/// Searches for and registers all supported allocation entries. +void BufferPlacementAllocs::build(Operation *op) { + op->walk([&](MemoryEffectOpInterface opInterface) { + // Try to find a single allocation result. + SmallVector effects; + opInterface.getEffects(effects); + + SmallVector allocateResultEffects; + llvm::copy_if( + effects, std::back_inserter(allocateResultEffects), + [=](MemoryEffects::EffectInstance &it) { + Value value = it.getValue(); + return isa(it.getEffect()) && value && + value.isa() && + it.getResource() != + SideEffects::AutomaticAllocationScopeResource::get(); + }); + // If there is one result only, we will be able to move the allocation and + // (possibly existing) deallocation ops. + if (allocateResultEffects.size() != 1) + return; + // Get allocation result. + Value allocValue = allocateResultEffects[0].getValue(); + // Find the associated dealloc value and register the allocation entry. + allocs.push_back(std::make_tuple(allocValue, findDealloc(allocValue))); + }); +} + +//===----------------------------------------------------------------------===// +// BufferPlacementTransformationBase +//===----------------------------------------------------------------------===// + +/// Constructs a new transformation base using the given root operation. 
+BufferPlacementTransformationBase::BufferPlacementTransformationBase( + Operation *op) + : aliases(op), allocs(op), liveness(op) {} + +/// Returns true if the given operation represents a loop by testing whether it +/// implements the `LoopLikeOpInterface` or the `RegionBranchOpInterface`. In +/// the case of a `RegionBranchOpInterface`, it checks all region-based control- +/// flow edges for cycles. +bool BufferPlacementTransformationBase::isLoop(Operation *op) { + // If the operation implements the `LoopLikeOpInterface` it can be considered + // a loop. + if (isa(op)) + return true; + + // If the operation does not implement the `RegionBranchOpInterface`, it is + // (currently) not possible to detect a loop. + RegionBranchOpInterface regionInterface; + if (!(regionInterface = dyn_cast(op))) + return false; + + // Recurses into a region using the current region interface to find potential + // cycles. + SmallPtrSet visitedRegions; + std::function recurse = [&](Region *current) { + if (!current) + return false; + // If we have found a back edge, the parent operation induces a loop. + if (!visitedRegions.insert(current).second) + return true; + // Recurses into all region successors. + SmallVector successors; + getSuccessorRegions(regionInterface, current->getRegionNumber(), + successors); + for (RegionSuccessor ®ionEntry : successors) + if (recurse(regionEntry.getSuccessor())) + return true; + return false; + }; + + // Start with all entry regions and test whether they induce a loop. + SmallVector successorRegions; + getSuccessorRegions(regionInterface, /*index=*/llvm::None, successorRegions); + for (RegionSuccessor ®ionEntry : successorRegions) { + if (recurse(regionEntry.getSuccessor())) + return true; + visitedRegions.clear(); } - /// Maps values to all immediate aliases this value can have. - ValueMapT aliases; -}; + return false; +} + +namespace { //===----------------------------------------------------------------------===// -// Backedges +// Backedges analysis //===----------------------------------------------------------------------===// /// A straight-forward program analysis which detects loop backedges induced by @@ -284,7 +395,7 @@ return; // Recurse into all operations and successor blocks. - for (auto &op : block.getOperations()) + for (Operation &op : block.getOperations()) recurse(&op, predecessor); // Leave the current block. @@ -299,133 +410,28 @@ }; //===----------------------------------------------------------------------===// -// BufferPlacement +// BufferDeallocation //===----------------------------------------------------------------------===// -// The main buffer placement analysis used to place allocs, copies and deallocs. -class BufferPlacement { -public: - using ValueSetT = BufferPlacementAliasAnalysis::ValueSetT; - - /// An intermediate representation of a single allocation node. - struct AllocEntry { - /// A reference to the associated allocation node. - Value allocValue; - - /// The associated placement block in which the allocation should be - /// performed. - Block *placementBlock; - - /// The associated dealloc operation (if any). - Operation *deallocOperation; - }; - - using AllocEntryList = SmallVector; - +/// The buffer deallocation transformation which ensures that all allocs in the +/// program have a corresponding de-allocation. As a side-effect, it might also +/// introduce copies that in turn leads to additional allocs and de-allocations. 
+class BufferDeallocation : BufferPlacementTransformationBase { public: - BufferPlacement(Operation *op) - : operation(op), aliases(op), liveness(op), dominators(op), - postDominators(op) { - // Gather all allocation nodes - initBlockMapping(); - } + BufferDeallocation(Operation *op) + : BufferPlacementTransformationBase(op), dominators(op), + postDominators(op) {} - /// Performs the actual placement/creation of all alloc, copy and dealloc - /// nodes. - void place() { - // Place all allocations. - placeAllocs(); + /// Performs the actual placement/creation of all temporary alloc, copy and + /// dealloc nodes. + void deallocate() { // Add additional allocations and copies that are required. introduceCopies(); - // Find all associated dealloc nodes. - findDeallocs(); // Place deallocations for all allocation entries. placeDeallocs(); } private: - /// Initializes the internal block mapping by discovering allocation nodes. It - /// maps all allocation nodes to their initial block in which they can be - /// safely allocated. - void initBlockMapping() { - operation->walk([&](MemoryEffectOpInterface opInterface) { - // Try to find a single allocation result. - SmallVector effects; - opInterface.getEffects(effects); - - SmallVector allocateResultEffects; - llvm::copy_if( - effects, std::back_inserter(allocateResultEffects), - [=](MemoryEffects::EffectInstance &it) { - Value value = it.getValue(); - return isa(it.getEffect()) && value && - value.isa() && - it.getResource() != - SideEffects::AutomaticAllocationScopeResource::get(); - }); - // If there is one result only, we will be able to move the allocation and - // (possibly existing) deallocation ops. - if (allocateResultEffects.size() != 1) - return; - // Get allocation result. - auto allocResult = allocateResultEffects[0].getValue().cast(); - // Find the initial allocation block and register this result. - allocs.push_back( - {allocResult, getInitialAllocBlock(allocResult), nullptr}); - }); - } - - /// Computes a valid allocation position in a dominator (if possible) for the - /// given allocation result. - Block *getInitialAllocBlock(OpResult result) { - // Get all allocation operands as these operands are important for the - // allocation operation. - Operation *owner = result.getOwner(); - auto operands = owner->getOperands(); - Block *dominator; - if (operands.size() < 1) - dominator = - findCommonDominator(result, aliases.resolve(result), dominators); - else { - // If this node has dependencies, check all dependent nodes with respect - // to a common post dominator in which all values are available. - ValueSetT dependencies(++operands.begin(), operands.end()); - dominator = - findCommonDominator(*operands.begin(), dependencies, postDominators); - } - - // Do not move allocs out of their parent regions to keep them local. - if (dominator->getParent() != owner->getParentRegion()) - return &owner->getParentRegion()->front(); - return dominator; - } - - /// Finds correct alloc positions according to the algorithm described at - /// the top of the file for all alloc nodes that can be handled by this - /// analysis. - void placeAllocs() const { - for (const AllocEntry &entry : allocs) { - Value alloc = entry.allocValue; - // Get the actual block to place the alloc and get liveness information - // for the placement block. - Block *placementBlock = entry.placementBlock; - // We have to ensure that we place the alloc before its first use in this - // block. 
- const LivenessBlockInfo *livenessInfo = - liveness.getLiveness(placementBlock); - Operation *startOperation = livenessInfo->getStartOperation(alloc); - // Check whether the start operation lies in the desired placement block. - // If not, we will use the terminator as this is the last operation in - // this block. - if (startOperation->getBlock() != placementBlock) - startOperation = placementBlock->getTerminator(); - - // Move the alloc in front of the start operation. - Operation *allocOperation = alloc.getDefiningOp(); - allocOperation->moveBefore(startOperation); - } - } - /// Introduces required allocs and copy operations to avoid memory leaks. void introduceCopies() { // Initialize the set of values that require a dedicated memory free @@ -462,9 +468,10 @@ }; // Detect possibly unsafe aliases starting from all allocations. - for (auto &entry : allocs) - findUnsafeValues(entry.allocValue, entry.placementBlock); - + for (BufferPlacementAllocs::AllocEntry &entry : allocs) { + Value allocValue = std::get<0>(entry); + findUnsafeValues(allocValue, allocValue.getDefiningOp()->getBlock()); + } // Try to find block arguments that require an explicit free operation // until we reach a fix point. while (!toProcess.empty()) { @@ -486,7 +493,7 @@ // Register the value to require a final dealloc. Note that we do not have // to assign a block here since we do not want to move the allocation node // to another location. - allocs.push_back({value, nullptr, nullptr}); + allocs.registerAlloc(std::make_tuple(value, nullptr)); } } @@ -542,7 +549,8 @@ // parent operation. In this case, we have to introduce an additional copy // for buffer that is passed to the argument. SmallVector successorRegions; - getSuccessorRegions(regionInterface, llvm::None, successorRegions); + getSuccessorRegions(regionInterface, /*index=*/llvm::None, + successorRegions); auto *it = llvm::find_if(successorRegions, [&](RegionSuccessor &successorRegion) { return successorRegion.getSuccessor() == argRegion; @@ -664,30 +672,6 @@ return alloc; } - /// Finds associated deallocs that can be linked to our allocation nodes (if - /// any). - void findDeallocs() { - for (auto &entry : allocs) { - auto userIt = - llvm::find_if(entry.allocValue.getUsers(), [&](Operation *user) { - auto effectInterface = dyn_cast(user); - if (!effectInterface) - return false; - // Try to find a free effect that is applied to one of our values - // that will be automatically freed by our pass. - SmallVector effects; - effectInterface.getEffectsOnValue(entry.allocValue, effects); - return llvm::any_of( - effects, [&](MemoryEffects::EffectInstance &it) { - return isa(it.getEffect()); - }); - }); - // Assign the associated dealloc operation (if any). - if (userIt != entry.allocValue.user_end()) - entry.deallocOperation = *userIt; - } - } - /// Finds correct dealloc positions according to the algorithm described at /// the top of the file for all alloc nodes and block arguments that can be /// handled by this analysis. @@ -696,8 +680,8 @@ // These deallocations will be linked to their associated allocation nodes // since they don't have any aliases that can (potentially) increase their // liveness. 
- for (const AllocEntry &entry : allocs) { - Value alloc = entry.allocValue; + for (const BufferPlacementAllocs::AllocEntry &entry : allocs) { + Value alloc = std::get<0>(entry); auto aliasesSet = aliases.resolve(alloc); assert(aliasesSet.size() > 0 && "must contain at least one alias"); @@ -713,11 +697,21 @@ // the placementBlock and that we can safely place the dealloc at the // beginning. Operation *endOperation = &placementBlock->front(); + // Iterate over all aliases and ensure that the endOperation will point // to the last operation of all potential aliases in the placementBlock. for (Value alias : aliasesSet) { + // Ensure that the start operation is at least the defining operation of + // the current alias to avoid invalid placement of deallocs for aliases + // without any uses. + Operation *beforeOp = endOperation; + if (alias.getDefiningOp() && + !(beforeOp = placementBlock->findAncestorOpInBlock( + *alias.getDefiningOp()))) + continue; + Operation *aliasEndOperation = - livenessInfo->getEndOperation(alias, endOperation); + livenessInfo->getEndOperation(alias, beforeOp); // Check whether the aliasEndOperation lies in the desired block and // whether it is behind the current endOperation. If yes, this will be // the new endOperation. @@ -729,8 +723,9 @@ // the dealloc taking all potential aliases into account. // If there is an existing dealloc, move it to the right place. - if (entry.deallocOperation) { - entry.deallocOperation->moveAfter(endOperation); + Operation *deallocOperation = std::get<1>(entry); + if (deallocOperation) { + deallocOperation->moveAfter(endOperation); } else { // If the Dealloc position is at the terminator operation of the // block, then the value should escape from a deallocation. @@ -744,56 +739,26 @@ } } - /// Finds a common dominator for the given value while taking the positions - /// of the values in the value set into account. It supports dominator and - /// post-dominator analyses via template arguments. - template - Block *findCommonDominator(Value value, const ValueSetT &values, - const DominatorT &doms) const { - // Start with the current block the value is defined in. - Block *dom = value.getParentBlock(); - // Iterate over all aliases and their uses to find a safe placement block - // according to the given dominator information. - for (Value childValue : values) - for (Operation *user : childValue.getUsers()) { - // Move upwards in the dominator tree to find an appropriate - // dominator block that takes the current use into account. - dom = doms.findNearestCommonDominator(dom, user->getBlock()); - } - return dom; - } - - /// The operation this transformation was constructed from. - Operation *operation; - - /// Alias information that can be updated during the insertion of copies. - BufferPlacementAliasAnalysis aliases; - - /// Maps allocation nodes to their associated blocks. - AllocEntryList allocs; - - // Stores already copied allocations to avoid additional copies of copies. - ValueSetT copiedValues; - - /// The underlying liveness analysis to compute fine grained information - /// about alloc and dealloc positions. - Liveness liveness; - - /// The dominator analysis to place deallocs in the appropriate blocks. + /// The dominator info to find the appropriate start operation to move the + /// allocs. DominanceInfo dominators; - /// The post dominator analysis to place deallocs in the appropriate blocks. + /// The post dominator info to move the dependent allocs in the right + /// position. 
PostDominanceInfo postDominators; + + /// Stores already copied allocations to avoid additional copies of copies. + ValueSetT copiedValues; }; //===----------------------------------------------------------------------===// -// BufferPlacementPass +// BufferDeallocationPass //===----------------------------------------------------------------------===// -/// The actual buffer placement pass that moves alloc and dealloc nodes into -/// the right positions. It uses the algorithm described at the top of the -/// file. -struct BufferPlacementPass : BufferPlacementBase { +/// The actual buffer deallocation pass that inserts and moves dealloc nodes +/// into the right positions. Furthermore, it inserts additional allocs and +/// copies if necessary. It uses the algorithm described at the top of the file. +struct BufferDeallocationPass : BufferDeallocationBase { void runOnFunction() override { // Ensure that there are supported loops only. @@ -804,18 +769,18 @@ return; } - // Place all required alloc, copy and dealloc nodes. - BufferPlacement placement(getFunction()); - placement.place(); + // Place all required temporary alloc, copy and dealloc nodes. + BufferDeallocation deallocation(getFunction()); + deallocation.deallocate(); } }; } // end anonymous namespace //===----------------------------------------------------------------------===// -// BufferPlacementPass construction +// BufferDeallocationPass construction //===----------------------------------------------------------------------===// -std::unique_ptr mlir::createBufferPlacementPass() { - return std::make_unique(); +std::unique_ptr mlir::createBufferDeallocationPass() { + return std::make_unique(); } diff --git a/mlir/lib/Transforms/BufferOptimizations.cpp b/mlir/lib/Transforms/BufferOptimizations.cpp new file mode 100644 --- /dev/null +++ b/mlir/lib/Transforms/BufferOptimizations.cpp @@ -0,0 +1,258 @@ +//===- BufferOptimizations.cpp - pre-pass optimizations for bufferization -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements logic for two optimization passes. These passes try to +// hoist alloc nodes to reduce the number of allocations and copies during +// buffer deallocation. + +#include "PassDetail.h" +#include "mlir/IR/Operation.h" +#include "mlir/Interfaces/LoopLikeInterface.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/Bufferize.h" +#include "mlir/Transforms/Passes.h" + +using namespace mlir; + +namespace { + +//===----------------------------------------------------------------------===// +// BufferAllocationHoisting +//===----------------------------------------------------------------------===// + +/// A base implementation compatible with the `BufferAllocationHoisting` class. +struct BufferAllocationHoistingStateBase { + /// A pointer to the current dominance info. + DominanceInfo *dominators; + + /// The current allocation value. + Value allocValue; + + /// The current placement block (if any). + Block *placementBlock; + + /// Initializes the state base. + BufferAllocationHoistingStateBase(DominanceInfo *dominators, Value allocValue, + Block *placementBlock) + : dominators(dominators), allocValue(allocValue), + placementBlock(placementBlock) {} +}; + +/// Implements the actual hoisting logic for allocation nodes. 
+template +class BufferAllocationHoisting : public BufferPlacementTransformationBase { +private: + /// Returns true if the given operation implements a known high-level region- + /// based control-flow interface. + static bool isKnownControlFlowInterface(Operation *op) { + return isa(op); + } + +public: + BufferAllocationHoisting(Operation *op) + : BufferPlacementTransformationBase(op), dominators(op), + postDominators(op) {} + + /// Moves allocations upwards. + void hoist() { + for (BufferPlacementAllocs::AllocEntry &entry : allocs) { + Value allocValue = std::get<0>(entry); + Operation *definingOp = allocValue.getDefiningOp(); + assert(definingOp && "No defining op"); + auto operands = definingOp->getOperands(); + auto resultAliases = aliases.resolve(allocValue); + // Determine the common dominator block of all aliases. + Block *dominatorBlock = + findCommonDominator(allocValue, resultAliases, dominators); + // Init the initial hoisting state. + StateT state(&dominators, allocValue, allocValue.getParentBlock()); + // Check for additional allocation dependencies to compute an upper bound + // for hoisting. + Block *dependencyBlock = nullptr; + if (!operands.empty()) { + // If this node has dependencies, check all dependent nodes with respect + // to a common post dominator. This ensures that all dependency values + // have been computed before allocating the buffer. + ValueSetT dependencies(std::next(operands.begin()), operands.end()); + dependencyBlock = findCommonDominator(*operands.begin(), dependencies, + postDominators); + } + + // Find the actual placement block and determine the start operation using + // an upper placement-block boundary. The idea is that placement block + // cannot be moved any further upwards than the given upper bound. + Block *placementBlock = findPlacementBlock( + state, state.computeUpperBound(dominatorBlock, dependencyBlock)); + Operation *startOperation = BufferPlacementAllocs::getStartOperation( + allocValue, placementBlock, liveness); + + // Move the alloc in front of the start operation. + Operation *allocOperation = allocValue.getDefiningOp(); + allocOperation->moveBefore(startOperation); + } + } + +private: + /// Finds a valid placement block by walking upwards in the CFG until we + /// either cannot continue our walk due to constraints (given by the StateT + /// implementation) or we have reached the upper-most dominator block. + Block *findPlacementBlock(StateT &state, Block *upperBound) { + Block *currentBlock = state.placementBlock; + // Walk from the innermost regions/loops to the outermost regions/loops and + // find an appropriate placement block that satisfies the constraint of the + // current StateT implementation. Walk until we reach the upperBound block + // (if any). + + // If we are not able to find a valid parent operation or an associated + // parent block, break the walk loop. + Operation *parentOp; + Block *parentBlock; + while ((parentOp = currentBlock->getParentOp()) && + (parentBlock = parentOp->getBlock()) && + (!upperBound || + dominators.properlyDominates(upperBound, currentBlock))) { + // Try to find an immediate dominator and check whether the parent block + // is above the immediate dominator (if any). + DominanceInfoNode *idom = dominators.getNode(currentBlock)->getIDom(); + if (idom && dominators.properlyDominates(parentBlock, idom->getBlock())) { + // If the current immediate dominator is below the placement block, move + // to the immediate dominator block. 
+ currentBlock = idom->getBlock(); + state.recordMoveToDominator(currentBlock); + } else { + // We have to move to our parent block since an immediate dominator does + // either not exist or is above our parent block. If we cannot move to + // our parent operation due to constraints given by the StateT + // implementation, break the walk loop. Furthermore, we should not move + // allocations out of unknown region-based control-flow operations. + if (!isKnownControlFlowInterface(parentOp) || + !state.isLegalPlacement(parentOp)) + break; + // Move to our parent block by notifying the current StateT + // implementation. + currentBlock = parentBlock; + state.recordMoveToParent(currentBlock); + } + } + // Return the finally determined placement block. + return state.placementBlock; + } + + /// The dominator info to find the appropriate start operation to move the + /// allocs. + DominanceInfo dominators; + + /// The post dominator info to move the dependent allocs in the right + /// position. + PostDominanceInfo postDominators; + + /// The map storing the final placement blocks of a given alloc value. + llvm::DenseMap placementBlocks; +}; + +/// A state implementation compatible with the `BufferAllocationHoisting` class +/// that hoists allocations into dominator blocks while keeping them inside of +/// loops. +struct BufferAllocationHoistingState : BufferAllocationHoistingStateBase { + using BufferAllocationHoistingStateBase::BufferAllocationHoistingStateBase; + + /// Computes the upper bound for the placement block search. + Block *computeUpperBound(Block *dominatorBlock, Block *dependencyBlock) { + // If we do not have a dependency block, the upper bound is given by the + // dominator block. + if (!dependencyBlock) + return dominatorBlock; + + // Find the "lower" block of the dominator and the dependency block to + // ensure that we do not move allocations above this block. + return dominators->properlyDominates(dominatorBlock, dependencyBlock) + ? dependencyBlock + : dominatorBlock; + } + + /// Returns true if the given operation does not represent a loop. + bool isLegalPlacement(Operation *op) { + return !BufferPlacementTransformationBase::isLoop(op); + } + + /// Sets the current placement block to the given block. + void recordMoveToDominator(Block *block) { placementBlock = block; } + + /// Sets the current placement block to the given block. + void recordMoveToParent(Block *block) { recordMoveToDominator(block); } +}; + +/// A state implementation compatible with the `BufferAllocationHoisting` class +/// that hoists allocations out of loops. +struct BufferAllocationLoopHoistingState : BufferAllocationHoistingStateBase { + using BufferAllocationHoistingStateBase::BufferAllocationHoistingStateBase; + + /// Remembers the dominator block of all aliases. + Block *aliasDominatorBlock; + + /// Computes the upper bound for the placement block search. + Block *computeUpperBound(Block *dominatorBlock, Block *dependencyBlock) { + aliasDominatorBlock = dominatorBlock; + // If there is a dependency block, we have to use this block as an upper + // bound to satisfy all allocation value dependencies. + return dependencyBlock ? dependencyBlock : nullptr; + } + + /// Returns true if the given operation represents a loop and one of the + /// aliases caused the `aliasDominatorBlock` to be "above" the block of the + /// given loop operation. If this is the case, it indicates that the + /// allocation is passed via a back edge. 
+ bool isLegalPlacement(Operation *op) { + return BufferPlacementTransformationBase::isLoop(op) && + !dominators->dominates(aliasDominatorBlock, op->getBlock()); + } + + /// Does not change the internal placement block, as we want to move + /// operations out of loops only. + void recordMoveToDominator(Block *block) {} + + /// Sets the current placement block to the given block. + void recordMoveToParent(Block *block) { placementBlock = block; } +}; + +//===----------------------------------------------------------------------===// +// BufferOptimizationPasses +//===----------------------------------------------------------------------===// + +/// The buffer hoisting pass that hoists allocation nodes into dominating +/// blocks. +struct BufferHoistingPass : BufferHoistingBase { + + void runOnFunction() override { + // Hoist all allocations into dominator blocks. + BufferAllocationHoisting optimizer( + getFunction()); + optimizer.hoist(); + } +}; + +/// The buffer loop hoisting pass that hoists allocation nodes out of loops. +struct BufferLoopHoistingPass : BufferLoopHoistingBase { + + void runOnFunction() override { + // Hoist all allocations out of loops. + BufferAllocationHoisting optimizer( + getFunction()); + optimizer.hoist(); + } +}; + +} // end anonymous namespace + +std::unique_ptr mlir::createBufferHoistingPass() { + return std::make_unique(); +} + +std::unique_ptr mlir::createBufferLoopHoistingPass() { + return std::make_unique(); +} diff --git a/mlir/lib/Transforms/CMakeLists.txt b/mlir/lib/Transforms/CMakeLists.txt --- a/mlir/lib/Transforms/CMakeLists.txt +++ b/mlir/lib/Transforms/CMakeLists.txt @@ -1,7 +1,8 @@ add_subdirectory(Utils) add_mlir_library(MLIRTransforms - BufferPlacement.cpp + BufferDeallocation.cpp + BufferOptimizations.cpp Bufferize.cpp Canonicalizer.cpp CopyRemoval.cpp diff --git a/mlir/test/Dialect/Linalg/bufferize.mlir b/mlir/test/Dialect/Linalg/bufferize.mlir --- a/mlir/test/Dialect/Linalg/bufferize.mlir +++ b/mlir/test/Dialect/Linalg/bufferize.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -linalg-bufferize -buffer-placement -split-input-file %s | FileCheck %s +// RUN: mlir-opt -linalg-bufferize -buffer-hoisting -buffer-deallocation -split-input-file %s | FileCheck %s #map0 = affine_map<(d0) -> (d0)> diff --git a/mlir/test/Transforms/buffer-placement.mlir b/mlir/test/Transforms/buffer-deallocation.mlir rename from mlir/test/Transforms/buffer-placement.mlir rename to mlir/test/Transforms/buffer-deallocation.mlir --- a/mlir/test/Transforms/buffer-placement.mlir +++ b/mlir/test/Transforms/buffer-deallocation.mlir @@ -1,8 +1,8 @@ -// RUN: mlir-opt -buffer-placement -split-input-file %s | FileCheck %s +// RUN: mlir-opt -buffer-deallocation -split-input-file %s | FileCheck %s -// This file checks the behaviour of BufferPlacement pass for moving Alloc and -// Dealloc operations and inserting the missing the DeallocOps in their correct -// positions. +// This file checks the behaviour of BufferDeallocation pass for moving and +// inserting missing DeallocOps in their correct positions. Furthermore, +// copies and their corresponding AllocOps are inserted. // Test Case: // bb0 @@ -10,9 +10,10 @@ // bb1 bb2 <- Initial position of AllocOp // \ / // bb3 -// BufferPlacement Expected Behaviour: It should move the existing AllocOp to -// the entry block, and insert a DeallocOp at the exit block after CopyOp since -// %1 is an alias for %0 and %arg1. +// BufferDeallocation expected behavior: bb2 contains an AllocOp which is +// passed to bb3. 
In the latter block, there should be an deallocation. +// Since bb1 does not contain an adequate alloc and the alloc in bb2 is not +// moved to bb0, we need to insert allocs and copies. #map0 = affine_map<(d0) -> (d0)> @@ -38,10 +39,18 @@ return } -// CHECK-NEXT: %[[ALLOC:.*]] = alloc() // CHECK-NEXT: cond_br +// CHECK: %[[ALLOC0:.*]] = alloc() +// CHECK-NEXT: linalg.copy +// CHECK-NEXT: br ^bb3(%[[ALLOC0]] +// CHECK: %[[ALLOC1:.*]] = alloc() +// CHECK-NEXT: linalg.generic +// CHECK: %[[ALLOC2:.*]] = alloc() +// CHECK-NEXT: linalg.copy +// CHECK-NEXT: dealloc %[[ALLOC1]] +// CHECK-NEXT: br ^bb3(%[[ALLOC2]] // CHECK: linalg.copy -// CHECK-NEXT: dealloc %[[ALLOC]] +// CHECK-NEXT: dealloc // CHECK-NEXT: return // ----- @@ -52,12 +61,12 @@ // bb1 bb2 <- Initial position of AllocOp // \ / // bb3 -// BufferPlacement Expected Behaviour: It should not move the existing AllocOp -// to any other block since the alloc has a dynamic dependency to block argument -// %0 in bb2. Since the dynamic type is passed to bb3 via the block argument %2, -// it is currently required to allocate a temporary buffer for %2 that gets -// copies of %arg0 and %1 with their appropriate shape dimensions. The copy -// buffer deallocation will be applied to %2 in block bb3. +// BufferDeallocation expected behavior: The existing AllocOp has a dynamic +// dependency to block argument %0 in bb2. Since the dynamic type is passed +// to bb3 via the block argument %2, it is currently required to allocate a +// temporary buffer for %2 that gets copies of %arg0 and %1 with their +// appropriate shape dimensions. The copy buffer deallocation will be applied +// to %2 in block bb3. #map0 = affine_map<(d0) -> (d0)> @@ -91,6 +100,7 @@ // CHECK: %[[DIM0:.*]] = dim // CHECK-NEXT: %[[ALLOC0:.*]] = alloc(%[[DIM0]]) // CHECK-NEXT: linalg.copy(%{{.*}}, %[[ALLOC0]]) +// CHECK-NEXT: br ^bb3(%[[ALLOC0]] // CHECK: ^bb2(%[[IDX:.*]]:{{.*}}) // CHECK-NEXT: %[[ALLOC1:.*]] = alloc(%[[IDX]]) // CHECK-NEXT: linalg.generic @@ -118,18 +128,17 @@ // bb6 // | // bb7 -// BufferPlacement Expected Behaviour: It should not move the existing AllocOp -// to any other block since the alloc has a dynamic dependency to block argument -// %0 in bb2. Since the dynamic type is passed to bb5 via the block argument %2 -// and to bb6 via block argument %3, it is currently required to allocate -// temporary buffers for %2 and %3 that gets copies of %1 and %arg0 1 with their -// appropriate shape dimensions. The copy buffer deallocations will be applied -// to %2 in block bb5 and to %3 in block bb6. Furthermore, there should be no -// copy inserted for %4. +// BufferDeallocation expected behavior: The existing AllocOp has a dynamic +// dependency to block argument %0 in bb2. Since the dynamic type is passed to +// bb5 via the block argument %2 and to bb6 via block argument %3, it is +// currently required to allocate temporary buffers for %2 and %3 that gets +// copies of %1 and %arg0 1 with their appropriate shape dimensions. The copy +// buffer deallocations will be applied to %2 in block bb5 and to %3 in block +// bb6. Furthermore, there should be no copy inserted for %4. 
#map0 = affine_map<(d0) -> (d0)> -// CHECK-LABEL: func @condBranchDynamicType +// CHECK-LABEL: func @condBranchDynamicTypeNested func @condBranchDynamicTypeNested( %arg0: i1, %arg1: memref, @@ -168,6 +177,7 @@ // CHECK: %[[DIM0:.*]] = dim // CHECK-NEXT: %[[ALLOC0:.*]] = alloc(%[[DIM0]]) // CHECK-NEXT: linalg.copy(%{{.*}}, %[[ALLOC0]]) +// CHECK-NEXT: br ^bb6 // CHECK: ^bb2(%[[IDX:.*]]:{{.*}}) // CHECK-NEXT: %[[ALLOC1:.*]] = alloc(%[[IDX]]) // CHECK-NEXT: linalg.generic @@ -192,8 +202,8 @@ // ----- // Test Case: Existing AllocOp with no users. -// BufferPlacement Expected Behaviour: It should insert a DeallocOp right before -// ReturnOp. +// BufferDeallocation expected behavior: It should insert a DeallocOp right +// before ReturnOp. // CHECK-LABEL: func @emptyUsesValue func @emptyUsesValue(%arg0: memref<4xf32>) { @@ -212,9 +222,9 @@ // | bb1 <- Initial position of AllocOp // \ / // bb2 -// BufferPlacement Expected Behaviour: It should move the existing AllocOp to -// the entry block and insert a DeallocOp at the exit block after CopyOp since -// %1 is an alias for %0 and %arg1. +// BufferDeallocation expected behavior: It should insert a DeallocOp at the +// exit block after CopyOp since %1 is an alias for %0 and %arg1. Furthermore, +// we have to insert a copy and an alloc in the beginning of the function. #map0 = affine_map<(d0) -> (d0)> @@ -238,10 +248,16 @@ return } -// CHECK-NEXT: %[[ALLOC:.*]] = alloc() +// CHECK-NEXT: %[[ALLOC0:.*]] = alloc() +// CHECK-NEXT: linalg.copy // CHECK-NEXT: cond_br +// CHECK: %[[ALLOC1:.*]] = alloc() +// CHECK-NEXT: linalg.generic +// CHECK: %[[ALLOC2:.*]] = alloc() +// CHECK-NEXT: linalg.copy +// CHECK-NEXT: dealloc %[[ALLOC1]] // CHECK: linalg.copy -// CHECK-NEXT: dealloc %[[ALLOC]] +// CHECK-NEXT: dealloc // CHECK-NEXT: return // ----- @@ -252,9 +268,8 @@ // | bb1 // \ / // bb2 -// BufferPlacement Expected Behaviour: It shouldn't move the alloc position. It -// only inserts a DeallocOp at the exit block after CopyOp since %1 is an alias -// for %0 and %arg1. +// BufferDeallocation expected behavior: It only inserts a DeallocOp at the +// exit block after CopyOp since %1 is an alias for %0 and %arg1. #map0 = affine_map<(d0) -> (d0)> @@ -289,10 +304,10 @@ // bb1 bb2 // \ / // bb3 <- Initial position of the second AllocOp -// BufferPlacement Expected Behaviour: It shouldn't move the AllocOps. It only -// inserts two missing DeallocOps in the exit block. %5 is an alias for %0. -// Therefore, the DeallocOp for %0 should occur after the last GenericOp. The -// Dealloc for %7 should happen after the CopyOp. +// BufferDeallocation expected behavior: It only inserts two missing +// DeallocOps in the exit block. %5 is an alias for %0. Therefore, the +// DeallocOp for %0 should occur after the last GenericOp. The Dealloc for %7 +// should happen after the CopyOp. #map0 = affine_map<(d0) -> (d0)> @@ -347,9 +362,8 @@ // bb1 bb2 // \ / // bb3 -// BufferPlacement Expected Behaviour: It shouldn't move the AllocOp. It only -// inserts a missing DeallocOp in the exit block since %5 or %6 are the latest -// aliases of %0. +// BufferDeallocation expected behavior: It only inserts a missing DeallocOp +// in the exit block since %5 or %6 are the latest aliases of %0. 
#map0 = affine_map<(d0) -> (d0)> @@ -378,7 +392,8 @@ } // CHECK-NEXT: %[[FIRST_ALLOC:.*]] = alloc() -// CHECK: dealloc %[[FIRST_ALLOC]] +// CHECK: linalg.copy +// CHECK-NEXT: dealloc %[[FIRST_ALLOC]] // CHECK-NEXT: return // ----- @@ -392,8 +407,8 @@ // \ \ / // \ / // bb5 <- Initial position of the second AllocOp -// BufferPlacement Expected Behaviour: AllocOps shouldn't be moved. -// Two missing DeallocOps should be inserted in the exit block. +// BufferDeallocation expected behavior: Two missing DeallocOps should be +// inserted in the exit block. #map0 = affine_map<(d0) -> (d0)> @@ -447,8 +462,8 @@ // ----- // Test Case: Dead operations in a single block. -// BufferPlacement Expected Behaviour: It shouldn't move the AllocOps. It only -// inserts the two missing DeallocOps after the last GenericOp. +// BufferDeallocation expected behavior: It only inserts the two missing +// DeallocOps after the last GenericOp. #map0 = affine_map<(d0) -> (d0)> @@ -481,7 +496,8 @@ // CHECK-NEXT: %[[FIRST_ALLOC:.*]] = alloc() // CHECK-NEXT: linalg.generic {{.*}} ins(%[[ARG0]]{{.*}}outs(%[[FIRST_ALLOC]] // CHECK: %[[SECOND_ALLOC:.*]] = alloc() -// CHECK-NEXT: linalg.generic {{.*}} ins(%[[FIRST_ALLOC]]{{.*}}outs(%[[SECOND_ALLOC]] +// CHECK-NEXT: linalg.generic {{.*}} ins +// CHECK-SAME: (%[[FIRST_ALLOC]]{{.*}}outs(%[[SECOND_ALLOC]] // CHECK: dealloc // CHECK-NEXT: dealloc // CHECK-NEXT: return @@ -494,9 +510,10 @@ // Initial pos of the 1st AllocOp -> bb1 bb2 <- Initial pos of the 2nd AllocOp // \ / // bb3 -// BufferPlacement Expected Behaviour: Both AllocOps should be moved to the -// entry block. Both missing DeallocOps should be moved to the exit block after -// CopyOp since %arg2 is an alias for %0 and %1. +// BufferDeallocation expected behavior: We need to introduce a copy for each +// buffer since the buffers are passed to bb3. The both missing DeallocOps are +// inserted in the respective block of the allocs. The copy is freed in the exit +// block. #map0 = affine_map<(d0) -> (d0)> @@ -535,11 +552,25 @@ return } -// CHECK-NEXT: %{{.*}} = alloc() -// CHECK-NEXT: %{{.*}} = alloc() +// CHECK-NEXT: cond_br +// CHECK: ^bb1 +// CHECK: ^bb1 +// CHECK: %[[ALLOC0:.*]] = alloc() +// CHECK-NEXT: linalg.generic +// CHECK: %[[ALLOC1:.*]] = alloc() +// CHECK-NEXT: linalg.copy +// CHECK-NEXT: dealloc %[[ALLOC0]] +// CHECK-NEXT: br ^bb3(%[[ALLOC1]] +// CHECK-NEXT: ^bb2 +// CHECK-NEXT: %[[ALLOC2:.*]] = alloc() +// CHECK-NEXT: linalg.generic +// CHECK: %[[ALLOC3:.*]] = alloc() +// CHECK-NEXT: linalg.copy +// CHECK-NEXT: dealloc %[[ALLOC2]] +// CHECK-NEXT: br ^bb3(%[[ALLOC3]] +// CHECK-NEXT: ^bb3(%[[ALLOC4:.*]]:{{.*}}) // CHECK: linalg.copy -// CHECK-NEXT: dealloc -// CHECK-NEXT: dealloc +// CHECK-NEXT: dealloc %[[ALLOC4]] // CHECK-NEXT: return // ----- @@ -551,8 +582,8 @@ // bb1 bb2 <- Initial position of AllocOp // \ / // bb3 -// BufferPlacement Expected Behaviour: It should move the AllocOp to the entry -// block. The existing DeallocOp should be moved to exit block. +// BufferDeallocation expected behavior: The existing DeallocOp should be +// moved to exit block. 
#map0 = affine_map<(d0) -> (d0)> @@ -561,11 +592,11 @@ %cond: i1, %arg0: memref<2xf32>, %arg1: memref<2xf32>) { + %1 = alloc() : memref<2xf32> cond_br %cond, ^bb1, ^bb2 ^bb1: br ^exit(%arg0 : memref<2xf32>) ^bb2: - %1 = alloc() : memref<2xf32> linalg.generic { indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} @@ -582,9 +613,10 @@ return } -// CHECK-NEXT: %{{.*}} = alloc() +// CHECK-NEXT: %[[ALLOC0:.*]] = alloc() +// CHECK-NEXT: cond_br // CHECK: linalg.copy -// CHECK-NEXT: dealloc +// CHECK-NEXT: dealloc %[[ALLOC0]] // CHECK-NEXT: return // ----- @@ -611,8 +643,9 @@ return } +// CHECK-NEXT: %[[ALLOC0:.*]] = alloc() // CHECK: linalg.copy -// CHECK-NEXT: dealloc +// CHECK-NEXT: dealloc %[[ALLOC0]] // ----- @@ -638,33 +671,41 @@ return } +// CHECK-NEXT: %[[ALLOC0:.*]] = alloc() // CHECK: linalg.copy -// CHECK-NEXT: dealloc +// CHECK-NEXT: dealloc %[[ALLOC0]] // ----- -// Test Case: Nested regions - This test defines a GenericOp inside the region of -// another GenericOp. -// BufferPlacement Expected Behaviour: The AllocOp of inner GenericOp should remain -// inside the region of outer GenericOp and it should insert the missing DeallocOp -// in the same region. The AllocOp of the outer GenericOp should be moved to the -// entry block and its missing DeallocOp should be inserted after Linalg.Copy. +// Test Case: Nested regions - This test defines a GenericOp inside the region +// of another GenericOp. +// BufferDeallocation expected behavior: The AllocOp of inner GenericOp should +// remain inside the region of outer GenericOp and it should insert the missing +// DeallocOp in the same region. The missing DeallocOp should be inserted after +// Linalg.Copy. #map0 = affine_map<(d0) -> (d0)> // CHECK-LABEL: func @nested_regions_and_cond_branch -func @nested_regions_and_cond_branch(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { +func @nested_regions_and_cond_branch( + %arg0: i1, + %arg1: memref<2xf32>, + %arg2: memref<2xf32>) { cond_br %arg0, ^bb1, ^bb2 ^bb1: br ^bb3(%arg1 : memref<2xf32>) ^bb2: %0 = alloc() : memref<2xf32> - linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + linalg.generic { + indexing_maps = [#map0, #map0], + iterator_types = ["parallel"]} ins(%arg1: memref<2xf32>) outs(%0: memref<2xf32>) { ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): %1 = alloc() : memref<2xf32> - linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + linalg.generic { + indexing_maps = [#map0, #map0], + iterator_types = ["parallel"]} ins(%arg1: memref<2xf32>) outs(%1: memref<2xf32>) { ^bb0(%gen2_arg0: f32, %gen2_arg1: f32): @@ -680,29 +721,37 @@ return } // CHECK: (%[[cond:.*]]: {{.*}}, %[[ARG1:.*]]: {{.*}}, %{{.*}}: {{.*}}) -// CHECK-NEXT: %[[GENERIC1_ALLOC:.*]] = alloc() // CHECK-NEXT: cond_br %[[cond]], ^[[BB1:.*]], ^[[BB2:.*]] +// CHECK: %[[ALLOC0:.*]] = alloc() +// CHECK-NEXT: linalg.copy(%[[ARG1]], %[[ALLOC0]]) // CHECK: ^[[BB2]]: -// CHECK-NEXT: linalg.generic {{{.*}}} ins(%[[ARG1]]{{.*}}outs(%[[GENERIC1_ALLOC]] -// CHECK: %[[GENERIC2_ALLOC:.*]] = alloc() -// CHECK-NEXT: linalg.generic {{{.*}}} ins(%[[ARG1]]{{.*}}outs(%[[GENERIC2_ALLOC]] -// CHECK: dealloc %[[GENERIC2_ALLOC]] +// CHECK: %[[ALLOC1:.*]] = alloc() +// CHECK-NEXT: linalg.generic {{{.*}}} ins(%[[ARG1]]{{.*}}outs(%[[ALLOC1]] +// CHECK: %[[ALLOC2:.*]] = alloc() +// CHECK-NEXT: linalg.generic {{{.*}}} ins(%[[ARG1]]{{.*}}outs(%[[ALLOC2]] +// CHECK: dealloc %[[ALLOC2]] // CHECK-NEXT: %{{.*}} = exp +// CHECK: %[[ALLOC3:.*]] = alloc() +// CHECK-NEXT: linalg.copy(%[[ALLOC1]], 
%[[ALLOC3]]) +// CHECK-NEXT: dealloc %[[ALLOC1]] // CHECK: ^[[BB3:.*]]({{.*}}): // CHECK: linalg.copy -// CHECK-NEXT: dealloc %[[GENERIC1_ALLOC]] +// CHECK-NEXT: dealloc // ----- // Test Case: buffer deallocation escaping -// BufferPlacement Expected Behaviour: It must not dealloc %arg1 and %x +// BufferDeallocation expected behavior: It must not dealloc %arg1 and %x // since they are operands of return operation and should escape from // deallocating. It should dealloc %y after linalg.copy. #map0 = affine_map<(d0) -> (d0)> // CHECK-LABEL: func @memref_in_function_results -func @memref_in_function_results(%arg0: memref<5xf32>, %arg1: memref<10xf32>, %arg2: memref<5xf32>) -> (memref<10xf32>, memref<15xf32>) { +func @memref_in_function_results( + %arg0: memref<5xf32>, + %arg1: memref<10xf32>, + %arg2: memref<5xf32>) -> (memref<10xf32>, memref<15xf32>) { %x = alloc() : memref<15xf32> %y = alloc() : memref<5xf32> linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} @@ -715,19 +764,20 @@ linalg.copy(%y, %arg2) : memref<5xf32>, memref<5xf32> return %arg1, %x : memref<10xf32>, memref<15xf32> } -// CHECK: (%[[ARG0:.*]]: memref<5xf32>, %[[ARG1:.*]]: memref<10xf32>, %[[RESULT:.*]]: memref<5xf32>) -// CHECK: %[[X:.*]] = alloc() -// CHECK: %[[Y:.*]] = alloc() -// CHECK: linalg.copy -// CHECK: dealloc %[[Y]] -// CHECK: return %[[ARG1]], %[[X]] +// CHECK: (%[[ARG0:.*]]: memref<5xf32>, %[[ARG1:.*]]: memref<10xf32>, +// CHECK-SAME: %[[RESULT:.*]]: memref<5xf32>) +// CHECK: %[[X:.*]] = alloc() +// CHECK: %[[Y:.*]] = alloc() +// CHECK: linalg.copy +// CHECK: dealloc %[[Y]] +// CHECK: return %[[ARG1]], %[[X]] // ----- // Test Case: nested region control flow -// The alloc position of %1 does not need to be changed and flows through -// both if branches until it is finally returned. Hence, it does not -// require a specific dealloc operation. However, %3 requires a dealloc. +// The alloc %1 flows through both if branches until it is finally returned. +// Hence, it does not require a specific dealloc operation. However, %3 +// requires a dealloc. // CHECK-LABEL: func @nested_region_control_flow func @nested_region_control_flow( @@ -756,10 +806,8 @@ // Test Case: nested region control flow with a nested buffer allocation in a // divergent branch. -// The alloc positions of %1, %3 does not need to be changed since -// BufferPlacement does not move allocs out of nested regions at the moment. -// However, since %3 is allocated and "returned" in a divergent branch, we have -// to allocate a temporary buffer (like in condBranchDynamicTypeNested). +// Buffer deallocation places a copy for both %1 and %3, since they are +// returned in the end. // CHECK-LABEL: func @nested_region_control_flow_div func @nested_region_control_flow_div( @@ -791,61 +839,9 @@ // ----- -// Test Case: deeply nested region control flow with a nested buffer allocation -// in a divergent branch. -// The alloc positions of %1, %4 and %5 does not need to be changed since -// BufferPlacement does not move allocs out of nested regions at the moment. -// However, since %4 is allocated and "returned" in a divergent branch, we have -// to allocate several temporary buffers (like in condBranchDynamicTypeNested). 
- -// CHECK-LABEL: func @nested_region_control_flow_div_nested -func @nested_region_control_flow_div_nested( - %arg0 : index, - %arg1 : index) -> memref { - %0 = cmpi "eq", %arg0, %arg1 : index - %1 = alloc(%arg0, %arg0) : memref - %2 = scf.if %0 -> (memref) { - %3 = scf.if %0 -> (memref) { - scf.yield %1 : memref - } else { - %4 = alloc(%arg0, %arg1) : memref - scf.yield %4 : memref - } - scf.yield %3 : memref - } else { - %5 = alloc(%arg1, %arg1) : memref - scf.yield %5 : memref - } - return %2 : memref -} -// CHECK: %[[ALLOC0:.*]] = alloc(%arg0, %arg0) -// CHECK-NEXT: %[[ALLOC1:.*]] = scf.if -// CHECK-NEXT: %[[ALLOC2:.*]] = scf.if -// CHECK: %[[ALLOC3:.*]] = alloc -// CHECK-NEXT: linalg.copy(%[[ALLOC0]], %[[ALLOC3]]) -// CHECK: scf.yield %[[ALLOC3]] -// CHECK: %[[ALLOC4:.*]] = alloc(%arg0, %arg1) -// CHECK: %[[ALLOC5:.*]] = alloc -// CHECK-NEXT: linalg.copy(%[[ALLOC4]], %[[ALLOC5]]) -// CHECK: dealloc %[[ALLOC4]] -// CHECK: scf.yield %[[ALLOC5]] -// CHECK: %[[ALLOC6:.*]] = alloc -// CHECK-NEXT: linalg.copy(%[[ALLOC2]], %[[ALLOC6]]) -// CHECK: dealloc %[[ALLOC2]] -// CHECK: scf.yield %[[ALLOC6]] -// CHECK: %[[ALLOC7:.*]] = alloc(%arg1, %arg1) -// CHECK: %[[ALLOC8:.*]] = alloc -// CHECK-NEXT: linalg.copy(%[[ALLOC7]], %[[ALLOC8]]) -// CHECK: dealloc %[[ALLOC7]] -// CHECK: scf.yield %[[ALLOC8]] -// CHECK: dealloc %[[ALLOC0]] -// CHECK-NEXT: return %[[ALLOC1]] - -// ----- - // Test Case: nested region control flow within a region interface. -// The alloc positions of %0 does not need to be changed and no copies are -// required in this case since the allocation finally escapes the method. +// No copies are required in this case since the allocation finally escapes +// the method. // CHECK-LABEL: func @inner_region_control_flow func @inner_region_control_flow(%arg0 : index) -> memref { @@ -875,54 +871,6 @@ // ----- -// Test Case: nested region control flow within a region interface including an -// allocation in a divergent branch. -// The alloc positions of %1 and %2 does not need to be changed since -// BufferPlacement does not move allocs out of nested regions at the moment. -// However, since %2 is allocated and yielded in a divergent branch, we have -// to allocate several temporary buffers (like in condBranchDynamicTypeNested). 
-
-// CHECK-LABEL: func @inner_region_control_flow_div
-func @inner_region_control_flow_div(
-  %arg0 : index,
-  %arg1 : index) -> memref {
-  %0 = alloc(%arg0, %arg0) : memref
-  %1 = test.region_if %0 : memref -> (memref) then {
-    ^bb0(%arg2 : memref):
-      test.region_if_yield %arg2 : memref
-  } else {
-    ^bb0(%arg2 : memref):
-      %2 = alloc(%arg0, %arg1) : memref
-      test.region_if_yield %2 : memref
-  } join {
-    ^bb0(%arg2 : memref):
-      test.region_if_yield %arg2 : memref
-  }
-  return %1 : memref
-}
-
-// CHECK: %[[ALLOC0:.*]] = alloc(%arg0, %arg0)
-// CHECK-NEXT: %[[ALLOC1:.*]] = test.region_if
-// CHECK-NEXT: ^bb0(%[[ALLOC2:.*]]:{{.*}}):
-// CHECK: %[[ALLOC3:.*]] = alloc
-// CHECK-NEXT: linalg.copy(%[[ALLOC2]], %[[ALLOC3]])
-// CHECK-NEXT: test.region_if_yield %[[ALLOC3]]
-// CHECK: ^bb0(%[[ALLOC4:.*]]:{{.*}}):
-// CHECK: %[[ALLOC5:.*]] = alloc
-// CHECK: %[[ALLOC6:.*]] = alloc
-// CHECK-NEXT: linalg.copy(%[[ALLOC5]], %[[ALLOC6]])
-// CHECK-NEXT: dealloc %[[ALLOC5]]
-// CHECK-NEXT: test.region_if_yield %[[ALLOC6]]
-// CHECK: ^bb0(%[[ALLOC7:.*]]:{{.*}}):
-// CHECK: %[[ALLOC8:.*]] = alloc
-// CHECK-NEXT: linalg.copy(%[[ALLOC7]], %[[ALLOC8]])
-// CHECK-NEXT: dealloc %[[ALLOC7]]
-// CHECK-NEXT: test.region_if_yield %[[ALLOC8]]
-// CHECK: dealloc %[[ALLOC0]]
-// CHECK-NEXT: return %[[ALLOC1]]
-
-// -----
-
 // CHECK-LABEL: func @subview
 func @subview(%arg0 : index, %arg1 : index, %arg2 : memref) {
   %0 = alloc() : memref<64x4xf32, offset: 0, strides: [4, 1]>
@@ -944,6 +892,9 @@
 
 #map0 = affine_map<(d0) -> (d0)>
 
+// Test Case: In the presence of AllocaOps only the AllocOps have to be freed.
+// Therefore, all allocas are not handled.
+
 // CHECK-LABEL: func @condBranchAlloca
 func @condBranchAlloca(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
   cond_br %arg0, ^bb1, ^bb2
@@ -977,6 +928,10 @@
 
 #map0 = affine_map<(d0) -> (d0)>
 
+// Test Case: In the presence of AllocaOps only the AllocOps have to be freed.
+// Therefore, all allocas are not handled. In this case, only alloc %0 has a
+// dealloc.
+ // CHECK-LABEL: func @ifElseAlloca func @ifElseAlloca(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { %0 = alloc() : memref<2xf32> @@ -1024,7 +979,10 @@ #map0 = affine_map<(d0) -> (d0)> // CHECK-LABEL: func @ifElseNestedAlloca -func @ifElseNestedAlloca(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { +func @ifElseNestedAlloca( + %arg0: i1, + %arg1: memref<2xf32>, + %arg2: memref<2xf32>) { %0 = alloca() : memref<2xf32> linalg.generic { indexing_maps = [#map0, #map0], @@ -1074,18 +1032,25 @@ #map0 = affine_map<(d0) -> (d0)> // CHECK-LABEL: func @nestedRegionsAndCondBranchAlloca -func @nestedRegionsAndCondBranchAlloca(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { +func @nestedRegionsAndCondBranchAlloca( + %arg0: i1, + %arg1: memref<2xf32>, + %arg2: memref<2xf32>) { cond_br %arg0, ^bb1, ^bb2 ^bb1: br ^bb3(%arg1 : memref<2xf32>) ^bb2: %0 = alloc() : memref<2xf32> - linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + linalg.generic { + indexing_maps = [#map0, #map0], + iterator_types = ["parallel"]} ins(%arg1: memref<2xf32>) outs(%0: memref<2xf32>) { ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): %1 = alloca() : memref<2xf32> - linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + linalg.generic { + indexing_maps = [#map0, #map0], + iterator_types = ["parallel"]} ins(%arg1: memref<2xf32>) outs(%1: memref<2xf32>) { ^bb0(%gen2_arg0: f32, %gen2_arg1: f32): @@ -1101,16 +1066,22 @@ return } // CHECK: (%[[cond:.*]]: {{.*}}, %[[ARG1:.*]]: {{.*}}, %{{.*}}: {{.*}}) -// CHECK-NEXT: %[[ALLOC:.*]] = alloc() // CHECK-NEXT: cond_br %[[cond]], ^[[BB1:.*]], ^[[BB2:.*]] +// CHECK: ^[[BB1]]: +// CHECK: %[[ALLOC0:.*]] = alloc() +// CHECK-NEXT: linalg.copy // CHECK: ^[[BB2]]: -// CHECK-NEXT: linalg.generic {{{.*}}} ins(%[[ARG1]]{{.*}}outs(%[[ALLOC]] +// CHECK: %[[ALLOC1:.*]] = alloc() +// CHECK-NEXT: linalg.generic {{{.*}}} ins(%[[ARG1]]{{.*}}outs(%[[ALLOC1]] // CHECK: %[[ALLOCA:.*]] = alloca() // CHECK-NEXT: linalg.generic {{{.*}}} ins(%[[ARG1]]{{.*}}outs(%[[ALLOCA]] // CHECK: %{{.*}} = exp +// CHECK: %[[ALLOC2:.*]] = alloc() +// CHECK-NEXT: linalg.copy +// CHECK-NEXT: dealloc %[[ALLOC1]] // CHECK: ^[[BB3:.*]]({{.*}}): // CHECK: linalg.copy -// CHECK-NEXT: dealloc %[[ALLOC]] +// CHECK-NEXT: dealloc // ----- @@ -1139,10 +1110,8 @@ // ----- // Test Case: structured control-flow loop using a nested alloc. -// The alloc positions of %3 will not be changed, but the iteration argument -// %iterBuf has to be freed before yielding %3 to avoid memory leaks. - -// ----- +// The iteration argument %iterBuf has to be freed before yielding %3 to avoid +// memory leaks. 
// CHECK-LABEL: func @loop_alloc func @loop_alloc( @@ -1166,7 +1135,8 @@ // CHECK-NEXT: dealloc %[[ALLOC0]] // CHECK-NEXT: %[[ALLOC1:.*]] = alloc() // CHECK: linalg.copy(%arg3, %[[ALLOC1]]) -// CHECK: %[[ALLOC2:.*]] = scf.for {{.*}} iter_args(%[[IALLOC:.*]] = %[[ALLOC1]] +// CHECK: %[[ALLOC2:.*]] = scf.for {{.*}} iter_args +// CHECK-SAME: (%[[IALLOC:.*]] = %[[ALLOC1]] // CHECK: cmpi // CHECK: dealloc %[[IALLOC]] // CHECK: %[[ALLOC3:.*]] = alloc() @@ -1251,7 +1221,8 @@ // CHECK: %[[ALLOC0:.*]] = alloc() // CHECK: %[[ALLOC1:.*]] = alloc() // CHECK-NEXT: linalg.copy(%arg3, %[[ALLOC1]]) -// CHECK-NEXT: %[[ALLOC2:.*]] = scf.for {{.*}} iter_args(%[[IALLOC:.*]] = %[[ALLOC1]] +// CHECK-NEXT: %[[ALLOC2:.*]] = scf.for {{.*}} iter_args +// CHECK-SAME: (%[[IALLOC:.*]] = %[[ALLOC1]] // CHECK: dealloc %[[IALLOC]] // CHECK: %[[ALLOC3:.*]] = scf.if @@ -1362,7 +1333,7 @@ // ----- // Test Case: explicit control-flow loop with a dynamically allocated buffer. -// The BufferPlacement transformation should fail on this explicit +// The BufferDeallocation transformation should fail on this explicit // control-flow loop since they are not supported. // CHECK-LABEL: func @loop_dynalloc @@ -1397,7 +1368,7 @@ // ----- // Test Case: explicit control-flow loop with a dynamically allocated buffer. -// The BufferPlacement transformation should fail on this explicit +// The BufferDeallocation transformation should fail on this explicit // control-flow loop since they are not supported. // CHECK-LABEL: func @do_loop_alloc diff --git a/mlir/test/Transforms/buffer-hoisting.mlir b/mlir/test/Transforms/buffer-hoisting.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Transforms/buffer-hoisting.mlir @@ -0,0 +1,905 @@ +// RUN: mlir-opt -buffer-hoisting -split-input-file %s | FileCheck %s + +// This file checks the behaviour of BufferHoisting pass for moving Alloc +// operations to their correct positions. + +// Test Case: +// bb0 +// / \ +// bb1 bb2 <- Initial position of AllocOp +// \ / +// bb3 +// BufferHoisting expected behavior: It should move the existing AllocOp to +// the entry block. + +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @condBranch +func @condBranch(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { + cond_br %arg0, ^bb1, ^bb2 +^bb1: + br ^bb3(%arg1 : memref<2xf32>) +^bb2: + %0 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + br ^bb3(%0 : memref<2xf32>) +^bb3(%1: memref<2xf32>): + "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK-NEXT: %[[ALLOC:.*]] = alloc() +// CHECK-NEXT: cond_br + +// ----- + +// Test Case: +// bb0 +// / \ +// bb1 bb2 <- Initial position of AllocOp +// \ / +// bb3 +// BufferHoisting expected behavior: It should not move the existing AllocOp +// to any other block since the alloc has a dynamic dependency to block argument +// %0 in bb2. 
+ +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @condBranchDynamicType +func @condBranchDynamicType( + %arg0: i1, + %arg1: memref, + %arg2: memref, + %arg3: index) { + cond_br %arg0, ^bb1, ^bb2(%arg3: index) +^bb1: + br ^bb3(%arg1 : memref) +^bb2(%0: index): + %1 = alloc(%0) : memref + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg1: memref) + outs(%1: memref) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + br ^bb3(%1 : memref) +^bb3(%2: memref): + "linalg.copy"(%2, %arg2) : (memref, memref) -> () + return +} + +// CHECK-NEXT: cond_br +// CHECK: ^bb2 +// CHECK: ^bb2(%[[IDX:.*]]:{{.*}}) +// CHECK-NEXT: %[[ALLOC0:.*]] = alloc(%[[IDX]]) +// CHECK-NEXT: linalg.generic + +// ----- + +// Test Case: +// bb0 +// / \ +// bb1 bb2 <- Initial position of AllocOp +// | / \ +// | bb3 bb4 +// | \ / +// \ bb5 +// \ / +// bb6 +// | +// bb7 +// BufferHoisting expected behavior: It should not move the existing AllocOp +// to any other block since the alloc has a dynamic dependency to block argument +// %0 in bb2. + +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @condBranchDynamicTypeNested +func @condBranchDynamicTypeNested( + %arg0: i1, + %arg1: memref, + %arg2: memref, + %arg3: index) { + cond_br %arg0, ^bb1, ^bb2(%arg3: index) +^bb1: + br ^bb6(%arg1 : memref) +^bb2(%0: index): + %1 = alloc(%0) : memref + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg1: memref) + outs(%1: memref) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + cond_br %arg0, ^bb3, ^bb4 +^bb3: + br ^bb5(%1 : memref) +^bb4: + br ^bb5(%1 : memref) +^bb5(%2: memref): + br ^bb6(%2 : memref) +^bb6(%3: memref): + br ^bb7(%3 : memref) +^bb7(%4: memref): + "linalg.copy"(%4, %arg2) : (memref, memref) -> () + return +} + +// CHECK-NEXT: cond_br +// CHECK: ^bb2 +// CHECK: ^bb2(%[[IDX:.*]]:{{.*}}) +// CHECK-NEXT: %[[ALLOC0:.*]] = alloc(%[[IDX]]) +// CHECK-NEXT: linalg.generic + +// ----- + +// Test Case: +// bb0 +// / \ +// | bb1 <- Initial position of AllocOp +// \ / +// bb2 +// BufferHoisting expected behavior: It should move the existing AllocOp to +// the entry block. + +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @criticalEdge +func @criticalEdge(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { + cond_br %arg0, ^bb1, ^bb2(%arg1 : memref<2xf32>) +^bb1: + %0 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + br ^bb2(%0 : memref<2xf32>) +^bb2(%1: memref<2xf32>): + "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK-NEXT: %[[ALLOC:.*]] = alloc() +// CHECK-NEXT: cond_br + +// ----- + +// Test Case: +// bb0 <- Initial position of the first AllocOp +// / \ +// bb1 bb2 +// \ / +// bb3 <- Initial position of the second AllocOp +// BufferHoisting expected behavior: It shouldn't move the AllocOps. 
+ +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @ifElse +func @ifElse(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + cond_br %arg0, + ^bb1(%arg1, %0 : memref<2xf32>, memref<2xf32>), + ^bb2(%0, %arg1 : memref<2xf32>, memref<2xf32>) +^bb1(%1: memref<2xf32>, %2: memref<2xf32>): + br ^bb3(%1, %2 : memref<2xf32>, memref<2xf32>) +^bb2(%3: memref<2xf32>, %4: memref<2xf32>): + br ^bb3(%3, %4 : memref<2xf32>, memref<2xf32>) +^bb3(%5: memref<2xf32>, %6: memref<2xf32>): + %7 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%7: memref<2xf32>) + outs(%7: memref<2xf32>) { + ^bb0(%gen2_arg0: f32, %gen2_arg1: f32): + %tmp2 = exp %gen2_arg0 : f32 + linalg.yield %tmp2 : f32 + } + "linalg.copy"(%7, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK-NEXT: %[[ALLOC0:.*]] = alloc() +// CHECK-NEXT: linalg.generic +// CHECK: br ^bb3 +// CHECK: br ^bb3 +// CHECK-NEXT: ^bb3 +// CHECK: %[[ALLOC1:.*]] = alloc() +// CHECK-NEXT: linalg.generic +// CHECK: linalg.copy(%[[ALLOC1]] +// CHECK-NEXT: return + +// ----- + +// Test Case: No users for buffer in if-else CFG +// bb0 <- Initial position of AllocOp +// / \ +// bb1 bb2 +// \ / +// bb3 +// BufferHoisting expected behavior: It shouldn't move the AllocOp. + +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @ifElseNoUsers +func @ifElseNoUsers(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + cond_br %arg0, + ^bb1(%arg1, %0 : memref<2xf32>, memref<2xf32>), + ^bb2(%0, %arg1 : memref<2xf32>, memref<2xf32>) +^bb1(%1: memref<2xf32>, %2: memref<2xf32>): + br ^bb3(%1, %2 : memref<2xf32>, memref<2xf32>) +^bb2(%3: memref<2xf32>, %4: memref<2xf32>): + br ^bb3(%3, %4 : memref<2xf32>, memref<2xf32>) +^bb3(%5: memref<2xf32>, %6: memref<2xf32>): + "linalg.copy"(%arg1, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK-NEXT: %[[ALLOC0:.*]] = alloc() +// CHECK-NEXT: linalg.generic + +// ----- + +// Test Case: +// bb0 <- Initial position of the first AllocOp +// / \ +// bb1 bb2 +// | / \ +// | bb3 bb4 +// \ \ / +// \ / +// bb5 <- Initial position of the second AllocOp +// BufferHoisting expected behavior: AllocOps shouldn't be moved. 
+ +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @ifElseNested +func @ifElseNested(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + cond_br %arg0, + ^bb1(%arg1, %0 : memref<2xf32>, memref<2xf32>), + ^bb2(%0, %arg1 : memref<2xf32>, memref<2xf32>) +^bb1(%1: memref<2xf32>, %2: memref<2xf32>): + br ^bb5(%1, %2 : memref<2xf32>, memref<2xf32>) +^bb2(%3: memref<2xf32>, %4: memref<2xf32>): + cond_br %arg0, ^bb3(%3 : memref<2xf32>), ^bb4(%4 : memref<2xf32>) +^bb3(%5: memref<2xf32>): + br ^bb5(%5, %3 : memref<2xf32>, memref<2xf32>) +^bb4(%6: memref<2xf32>): + br ^bb5(%3, %6 : memref<2xf32>, memref<2xf32>) +^bb5(%7: memref<2xf32>, %8: memref<2xf32>): + %9 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%7: memref<2xf32>) + outs(%9: memref<2xf32>) { + ^bb0(%gen2_arg0: f32, %gen2_arg1: f32): + %tmp2 = exp %gen2_arg0 : f32 + linalg.yield %tmp2 : f32 + } + "linalg.copy"(%9, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK-NEXT: %[[ALLOC0:.*]] = alloc() +// CHECK-NEXT: linalg.generic +// CHECK: br ^bb5 +// CHECK: br ^bb5 +// CHECK: br ^bb5 +// CHECK-NEXT: ^bb5 +// CHECK: %[[ALLOC1:.*]] = alloc() +// CHECK-NEXT: linalg.generic + +// ----- + +// Test Case: Dead operations in a single block. +// BufferHoisting expected behavior: It shouldn't move the AllocOps. + +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @redundantOperations +func @redundantOperations(%arg0: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg0: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + %1 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%0: memref<2xf32>) + outs(%1: memref<2xf32>) { + ^bb0(%gen2_arg0: f32, %gen2_arg1: f32): + %tmp2 = exp %gen2_arg0 : f32 + linalg.yield %tmp2 : f32 + } + return +} + +// CHECK-NEXT: %[[ALLOC0:.*]] = alloc() +// CHECK-NEXT: linalg.generic +// CHECK: %[[ALLOC1:.*]] = alloc() +// CHECK-NEXT: linalg.generic + +// ----- + +// Test Case: +// bb0 +// / \ +// Initial pos of the 1st AllocOp -> bb1 bb2 <- Initial pos of the 2nd AllocOp +// \ / +// bb3 +// BufferHoisting expected behavior: Both AllocOps should be moved to the +// entry block. 
+ +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @moving_alloc_and_inserting_missing_dealloc +func @moving_alloc_and_inserting_missing_dealloc( + %cond: i1, + %arg0: memref<2xf32>, + %arg1: memref<2xf32>) { + cond_br %cond, ^bb1, ^bb2 +^bb1: + %0 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg0: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + br ^exit(%0 : memref<2xf32>) +^bb2: + %1 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg0: memref<2xf32>) + outs(%1: memref<2xf32>) { + ^bb0(%gen2_arg0: f32, %gen2_arg1: f32): + %tmp2 = exp %gen2_arg0 : f32 + linalg.yield %tmp2 : f32 + } + br ^exit(%1 : memref<2xf32>) +^exit(%arg2: memref<2xf32>): + "linalg.copy"(%arg2, %arg1) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK-NEXT: %{{.*}} = alloc() +// CHECK-NEXT: %{{.*}} = alloc() +// CHECK-NEXT: cond_br + +// ----- + +// Test Case: Invalid position of the DeallocOp. There is a user after +// deallocation. +// bb0 +// / \ +// bb1 bb2 <- Initial position of AllocOp +// \ / +// bb3 +// BufferHoisting expected behavior: It should move the AllocOp to the entry +// block. + +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @moving_invalid_dealloc_op_complex +func @moving_invalid_dealloc_op_complex( + %cond: i1, + %arg0: memref<2xf32>, + %arg1: memref<2xf32>) { + cond_br %cond, ^bb1, ^bb2 +^bb1: + br ^exit(%arg0 : memref<2xf32>) +^bb2: + %1 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg0: memref<2xf32>) + outs(%1: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + dealloc %1 : memref<2xf32> + br ^exit(%1 : memref<2xf32>) +^exit(%arg2: memref<2xf32>): + "linalg.copy"(%arg2, %arg1) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK-NEXT: %{{.*}} = alloc() +// CHECK-NEXT: cond_br + +// ----- + +// Test Case: Nested regions - This test defines a GenericOp inside the region +// of another GenericOp. +// BufferHoisting expected behavior: The AllocOp of inner GenericOp should +// remain inside the region of outer GenericOp. The AllocOp of the outer +// GenericOp should be moved to the entry block. 
+ +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @nested_regions_and_cond_branch +func @nested_regions_and_cond_branch( + %arg0: i1, + %arg1: memref<2xf32>, + %arg2: memref<2xf32>) { + cond_br %arg0, ^bb1, ^bb2 +^bb1: + br ^bb3(%arg1 : memref<2xf32>) +^bb2: + %0 = alloc() : memref<2xf32> + linalg.generic { + indexing_maps = [#map0, #map0], + iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %1 = alloc() : memref<2xf32> + linalg.generic { + indexing_maps = [#map0, #map0], + iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%1: memref<2xf32>) { + ^bb0(%gen2_arg0: f32, %gen2_arg1: f32): + %tmp2 = exp %gen2_arg0 : f32 + linalg.yield %tmp2 : f32 + } + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + br ^bb3(%0 : memref<2xf32>) +^bb3(%1: memref<2xf32>): + "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + return +} +// CHECK-NEXT: %[[ALLOC0:.*]] = alloc() +// CHECK-NEXT: cond_br +// CHECK: linalg.generic +// CHECK: %[[ALLOC1:.*]] = alloc() +// CHECK-NEXT: linalg.generic + +// ----- + +// Test Case: nested region control flow +// The alloc position of %1 does not need to be changed and flows through +// both if branches until it is finally returned. + +// CHECK-LABEL: func @nested_region_control_flow +func @nested_region_control_flow( + %arg0 : index, + %arg1 : index) -> memref { + %0 = cmpi "eq", %arg0, %arg1 : index + %1 = alloc(%arg0, %arg0) : memref + %2 = scf.if %0 -> (memref) { + scf.yield %1 : memref + } else { + %3 = alloc(%arg0, %arg1) : memref + scf.yield %1 : memref + } + return %2 : memref +} + +// CHECK: %[[ALLOC0:.*]] = alloc(%arg0, %arg0) +// CHECK-NEXT: %{{.*}} = scf.if +// CHECK: else +// CHECK-NEXT: %[[ALLOC1:.*]] = alloc(%arg0, %arg1) + +// ----- + +// Test Case: nested region control flow with a nested buffer allocation in a +// divergent branch. +// The alloc positions of %1 does not need to be changed. %3 is moved upwards. + +// CHECK-LABEL: func @nested_region_control_flow_div +func @nested_region_control_flow_div( + %arg0 : index, + %arg1 : index) -> memref { + %0 = cmpi "eq", %arg0, %arg1 : index + %1 = alloc(%arg0, %arg0) : memref + %2 = scf.if %0 -> (memref) { + scf.yield %1 : memref + } else { + %3 = alloc(%arg0, %arg1) : memref + scf.yield %3 : memref + } + return %2 : memref +} + +// CHECK: %[[ALLOC0:.*]] = alloc(%arg0, %arg0) +// CHECK-NEXT: %[[ALLOC1:.*]] = alloc(%arg0, %arg1) +// CHECK-NEXT: %{{.*}} = scf.if + +// ----- + +// Test Case: deeply nested region control flow with a nested buffer allocation +// in a divergent branch. +// The alloc position of %1 does not need to be changed. Allocs %4 and %5 are +// moved upwards. + +// CHECK-LABEL: func @nested_region_control_flow_div_nested +func @nested_region_control_flow_div_nested( + %arg0 : index, + %arg1 : index) -> memref { + %0 = cmpi "eq", %arg0, %arg1 : index + %1 = alloc(%arg0, %arg0) : memref + %2 = scf.if %0 -> (memref) { + %3 = scf.if %0 -> (memref) { + scf.yield %1 : memref + } else { + %4 = alloc(%arg0, %arg1) : memref + scf.yield %4 : memref + } + scf.yield %3 : memref + } else { + %5 = alloc(%arg1, %arg1) : memref + scf.yield %5 : memref + } + return %2 : memref +} +// CHECK: %[[ALLOC0:.*]] = alloc(%arg0, %arg0) +// CHECK-NEXT: %[[ALLOC1:.*]] = alloc(%arg0, %arg1) +// CHECK-NEXT: %[[ALLOC2:.*]] = alloc(%arg1, %arg1) +// CHECK-NEXT: %{{.*}} = scf.if + +// ----- + +// Test Case: nested region control flow within a region interface. 
+// The alloc positions of %0 does not need to be changed. + +// CHECK-LABEL: func @inner_region_control_flow +func @inner_region_control_flow(%arg0 : index) -> memref { + %0 = alloc(%arg0, %arg0) : memref + %1 = test.region_if %0 : memref -> (memref) then { + ^bb0(%arg1 : memref): + test.region_if_yield %arg1 : memref + } else { + ^bb0(%arg1 : memref): + test.region_if_yield %arg1 : memref + } join { + ^bb0(%arg1 : memref): + test.region_if_yield %arg1 : memref + } + return %1 : memref +} + +// CHECK: %[[ALLOC0:.*]] = alloc(%arg0, %arg0) +// CHECK-NEXT: {{.*}} test.region_if + +// ----- + +// Test Case: nested region control flow within a region interface including an +// allocation in a divergent branch. +// The alloc positions of %0 does not need to be changed. %2 is moved upwards. + +// CHECK-LABEL: func @inner_region_control_flow_div +func @inner_region_control_flow_div( + %arg0 : index, + %arg1 : index) -> memref { + %0 = alloc(%arg0, %arg0) : memref + %1 = test.region_if %0 : memref -> (memref) then { + ^bb0(%arg2 : memref): + test.region_if_yield %arg2 : memref + } else { + ^bb0(%arg2 : memref): + %2 = alloc(%arg0, %arg1) : memref + test.region_if_yield %2 : memref + } join { + ^bb0(%arg2 : memref): + test.region_if_yield %arg2 : memref + } + return %1 : memref +} + +// CHECK: %[[ALLOC0:.*]] = alloc(%arg0, %arg0) +// CHECK-NEXT: %[[ALLOC1:.*]] = alloc(%arg0, %arg1) +// CHECK-NEXT: {{.*}} test.region_if + +// ----- + +#map0 = affine_map<(d0) -> (d0)> + +// Test Case: Alloca operations shouldn't be moved. + +// CHECK-LABEL: func @condBranchAlloca +func @condBranchAlloca(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { + cond_br %arg0, ^bb1, ^bb2 +^bb1: + br ^bb3(%arg1 : memref<2xf32>) +^bb2: + %0 = alloca() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + br ^bb3(%0 : memref<2xf32>) +^bb3(%1: memref<2xf32>): + "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK-NEXT: cond_br +// CHECK: ^bb2 +// CHECK: ^bb2 +// CHECK-NEXT: %[[ALLOCA:.*]] = alloca() +// CHECK-NEXT: linalg.generic + +// ----- + +#map0 = affine_map<(d0) -> (d0)> + +// Test Case: Alloca operations shouldn't be moved. The alloc operation also +// shouldn't be moved analogously to the ifElseNested test. 
+ +// CHECK-LABEL: func @ifElseNestedAlloca +func @ifElseNestedAlloca( + %arg0: i1, + %arg1: memref<2xf32>, + %arg2: memref<2xf32>) { + %0 = alloca() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + cond_br %arg0, + ^bb1(%arg1, %0 : memref<2xf32>, memref<2xf32>), + ^bb2(%0, %arg1 : memref<2xf32>, memref<2xf32>) +^bb1(%1: memref<2xf32>, %2: memref<2xf32>): + br ^bb5(%1, %2 : memref<2xf32>, memref<2xf32>) +^bb2(%3: memref<2xf32>, %4: memref<2xf32>): + cond_br %arg0, ^bb3(%3 : memref<2xf32>), ^bb4(%4 : memref<2xf32>) +^bb3(%5: memref<2xf32>): + br ^bb5(%5, %3 : memref<2xf32>, memref<2xf32>) +^bb4(%6: memref<2xf32>): + br ^bb5(%3, %6 : memref<2xf32>, memref<2xf32>) +^bb5(%7: memref<2xf32>, %8: memref<2xf32>): + %9 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%7: memref<2xf32>) + outs(%9: memref<2xf32>) { + ^bb0(%gen2_arg0: f32, %gen2_arg1: f32): + %tmp2 = exp %gen2_arg0 : f32 + linalg.yield %tmp2 : f32 + } + "linalg.copy"(%9, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK-NEXT: %[[ALLOCA:.*]] = alloca() +// CHECK-NEXT: linalg.generic +// CHECK: ^bb5 +// CHECK: ^bb5 +// CHECK: ^bb5 +// CHECK-NEXT: ^bb5 +// CHECK-NEXT: %[[ALLOC:.*]] = alloc() +// CHECK-NEXT: linalg.generic + +// ----- + +#map0 = affine_map<(d0) -> (d0)> + +// Test Case: Alloca operations shouldn't be moved. The alloc operation should +// be moved in the beginning analogous to the nestedRegionsAndCondBranch test. + +// CHECK-LABEL: func @nestedRegionsAndCondBranchAlloca +func @nestedRegionsAndCondBranchAlloca( + %arg0: i1, + %arg1: memref<2xf32>, + %arg2: memref<2xf32>) { + cond_br %arg0, ^bb1, ^bb2 +^bb1: + br ^bb3(%arg1 : memref<2xf32>) +^bb2: + %0 = alloc() : memref<2xf32> + linalg.generic { + indexing_maps = [#map0, #map0], + iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %1 = alloca() : memref<2xf32> + linalg.generic { + indexing_maps = [#map0, #map0], + iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%1: memref<2xf32>) { + ^bb0(%gen2_arg0: f32, %gen2_arg1: f32): + %tmp2 = exp %gen2_arg0 : f32 + linalg.yield %tmp2 : f32 + } + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + br ^bb3(%0 : memref<2xf32>) +^bb3(%1: memref<2xf32>): + "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + return +} +// CHECK-NEXT: %[[ALLOC:.*]] = alloc() +// CHECK-NEXT: cond_br +// CHECK: linalg.generic +// CHECK: %[[ALLOCA:.*]] = alloca() +// CHECK-NEXT: linalg.generic + +// ----- + +// Test Case: structured control-flow loop using a nested alloc. +// The alloc positions of %3 will be moved upwards. 
+ +// CHECK-LABEL: func @loop_alloc +func @loop_alloc( + %lb: index, + %ub: index, + %step: index, + %buf: memref<2xf32>, + %res: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + %1 = scf.for %i = %lb to %ub step %step + iter_args(%iterBuf = %buf) -> memref<2xf32> { + %2 = cmpi "eq", %i, %ub : index + %3 = alloc() : memref<2xf32> + scf.yield %3 : memref<2xf32> + } + "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK: %[[ALLOC0:.*]] = alloc() +// CHECK-NEXT: {{.*}} scf.for +// CHECK: %[[ALLOC1:.*]] = alloc() + +// ----- + +// Test Case: structured control-flow loop with a nested if operation using +// a deeply nested buffer allocation. +// The allocation %4 is not moved upwards. + +// CHECK-LABEL: func @loop_nested_if_alloc +func @loop_nested_if_alloc( + %lb: index, + %ub: index, + %step: index, + %buf: memref<2xf32>) -> memref<2xf32> { + %0 = alloc() : memref<2xf32> + %1 = scf.for %i = %lb to %ub step %step + iter_args(%iterBuf = %buf) -> memref<2xf32> { + %2 = cmpi "eq", %i, %ub : index + %3 = scf.if %2 -> (memref<2xf32>) { + %4 = alloc() : memref<2xf32> + scf.yield %4 : memref<2xf32> + } else { + scf.yield %0 : memref<2xf32> + } + scf.yield %3 : memref<2xf32> + } + return %1 : memref<2xf32> +} + +// CHECK: %[[ALLOC0:.*]] = alloc() +// CHECK-NEXT: {{.*}} scf.for +// CHECK: %[[ALLOC1:.*]] = alloc() + +// ----- + +// Test Case: several nested structured control-flow loops with a deeply nested +// buffer allocation inside an if operation. +// Same behavior is an loop_nested_if_alloc: The allocs are not moved upwards. + +// CHECK-LABEL: func @loop_nested_alloc +func @loop_nested_alloc( + %lb: index, + %ub: index, + %step: index, + %buf: memref<2xf32>, + %res: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + %1 = scf.for %i = %lb to %ub step %step + iter_args(%iterBuf = %buf) -> memref<2xf32> { + %2 = scf.for %i2 = %lb to %ub step %step + iter_args(%iterBuf2 = %iterBuf) -> memref<2xf32> { + %3 = scf.for %i3 = %lb to %ub step %step + iter_args(%iterBuf3 = %iterBuf2) -> memref<2xf32> { + %4 = alloc() : memref<2xf32> + %5 = cmpi "eq", %i, %ub : index + %6 = scf.if %5 -> (memref<2xf32>) { + %7 = alloc() : memref<2xf32> + scf.yield %7 : memref<2xf32> + } else { + scf.yield %iterBuf3 : memref<2xf32> + } + scf.yield %6 : memref<2xf32> + } + scf.yield %3 : memref<2xf32> + } + scf.yield %2 : memref<2xf32> + } + "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK: %[[ALLOC0:.*]] = alloc() +// CHECK-NEXT: {{.*}} = scf.for +// CHECK-NEXT: {{.*}} = scf.for +// CHECK-NEXT: {{.*}} = scf.for +// CHECK-NEXT: %[[ALLOC1:.*]] = alloc() +// CHECK: %[[ALLOC2:.*]] = alloc() + +// ----- + +// CHECK-LABEL: func @loop_nested_alloc_dyn_dependency +func @loop_nested_alloc_dyn_dependency( + %lb: index, + %ub: index, + %step: index, + %arg0: index, + %buf: memref, + %res: memref) { + %0 = alloc(%arg0) : memref + %1 = scf.for %i = %lb to %ub step %step + iter_args(%iterBuf = %buf) -> memref { + %2 = scf.for %i2 = %lb to %ub step %step + iter_args(%iterBuf2 = %iterBuf) -> memref { + %3 = scf.for %i3 = %lb to %ub step %step + iter_args(%iterBuf3 = %iterBuf2) -> memref { + %5 = cmpi "eq", %i, %ub : index + %6 = scf.if %5 -> (memref) { + %7 = alloc(%i3) : memref + scf.yield %7 : memref + } else { + scf.yield %iterBuf3 : memref + } + scf.yield %6 : memref + } + scf.yield %3 : memref + } + scf.yield %0 : memref + } + "linalg.copy"(%1, %res) : (memref, memref) -> () + return +} + + +// CHECK: %[[ALLOC0:.*]] = alloc({{.*}}) +// CHECK-NEXT: {{.*}} = scf.for +// 
CHECK-NEXT: {{.*}} = scf.for +// CHECK-NEXT: {{.*}} = scf.for +// CHECK: %[[ALLOC1:.*]] = alloc({{.*}}) diff --git a/mlir/test/Transforms/buffer-loop-hoisting.mlir b/mlir/test/Transforms/buffer-loop-hoisting.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Transforms/buffer-loop-hoisting.mlir @@ -0,0 +1,490 @@ +// RUN: mlir-opt -buffer-loop-hoisting -split-input-file %s | FileCheck %s + +// This file checks the behavior of BufferLoopHoisting pass for moving Alloc +// operations in their correct positions. + +// Test Case: +// bb0 +// / \ +// bb1 bb2 <- Initial position of AllocOp +// \ / +// bb3 +// BufferLoopHoisting expected behavior: It should not move the AllocOp. + +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @condBranch +func @condBranch(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { + cond_br %arg0, ^bb1, ^bb2 +^bb1: + br ^bb3(%arg1 : memref<2xf32>) +^bb2: + %0 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + br ^bb3(%0 : memref<2xf32>) +^bb3(%1: memref<2xf32>): + "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK-NEXT: cond_br +// CHECK: %[[ALLOC:.*]] = alloc() + +// ----- + +// Test Case: +// bb0 +// / \ +// bb1 bb2 <- Initial position of AllocOp +// \ / +// bb3 +// BufferLoopHoisting expected behavior: It should not move the existing AllocOp +// to any other block since the alloc has a dynamic dependency to block argument +// %0 in bb2. + +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @condBranchDynamicType +func @condBranchDynamicType( + %arg0: i1, + %arg1: memref, + %arg2: memref, + %arg3: index) { + cond_br %arg0, ^bb1, ^bb2(%arg3: index) +^bb1: + br ^bb3(%arg1 : memref) +^bb2(%0: index): + %1 = alloc(%0) : memref + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg1: memref) + outs(%1: memref) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + br ^bb3(%1 : memref) +^bb3(%2: memref): + "linalg.copy"(%2, %arg2) : (memref, memref) -> () + return +} + +// CHECK-NEXT: cond_br +// CHECK: ^bb2 +// CHECK: ^bb2(%[[IDX:.*]]:{{.*}}) +// CHECK-NEXT: %[[ALLOC0:.*]] = alloc(%[[IDX]]) +// CHECK-NEXT: linalg.generic + +// ----- + +// Test Case: Nested regions - This test defines a GenericOp inside the region +// of another GenericOp. +// BufferLoopHoisting expected behavior: The AllocOp of inner GenericOp should +// remain inside the region of outer GenericOp. The AllocOp of the outer +// GenericOp should not be moved during this pass. 
+ +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @nested_regions_and_cond_branch +func @nested_regions_and_cond_branch( + %arg0: i1, + %arg1: memref<2xf32>, + %arg2: memref<2xf32>) { + cond_br %arg0, ^bb1, ^bb2 +^bb1: + br ^bb3(%arg1 : memref<2xf32>) +^bb2: + %0 = alloc() : memref<2xf32> + linalg.generic { + indexing_maps = [#map0, #map0], + iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %1 = alloc() : memref<2xf32> + linalg.generic { + indexing_maps = [#map0, #map0], + iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%1: memref<2xf32>) { + ^bb0(%gen2_arg0: f32, %gen2_arg1: f32): + %tmp2 = exp %gen2_arg0 : f32 + linalg.yield %tmp2 : f32 + } + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + br ^bb3(%0 : memref<2xf32>) +^bb3(%1: memref<2xf32>): + "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + return +} +// CHECK-NEXT: cond_br +// CHECK: %[[ALLOC0:.*]] = alloc() +// CHECK: linalg.generic +// CHECK: %[[ALLOC1:.*]] = alloc() +// CHECK-NEXT: linalg.generic + +// ----- + +// Test Case: nested region control flow +// The alloc position of %1 does not need to be changed and flows through +// both if branches until it is finally returned. + +// CHECK-LABEL: func @nested_region_control_flow +func @nested_region_control_flow( + %arg0 : index, + %arg1 : index) -> memref { + %0 = cmpi "eq", %arg0, %arg1 : index + %1 = alloc(%arg0, %arg0) : memref + %2 = scf.if %0 -> (memref) { + scf.yield %1 : memref + } else { + %3 = alloc(%arg0, %arg1) : memref + scf.yield %1 : memref + } + return %2 : memref +} + +// CHECK: %[[ALLOC0:.*]] = alloc(%arg0, %arg0) +// CHECK-NEXT: %{{.*}} = scf.if +// CHECK: else +// CHECK-NEXT: %[[ALLOC1:.*]] = alloc(%arg0, %arg1) + +// ----- + +// Test Case: structured control-flow loop using a nested alloc. +// The alloc positions of %3 should not be changed. + +// CHECK-LABEL: func @loop_alloc +func @loop_alloc( + %lb: index, + %ub: index, + %step: index, + %buf: memref<2xf32>, + %res: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + %1 = scf.for %i = %lb to %ub step %step + iter_args(%iterBuf = %buf) -> memref<2xf32> { + %2 = cmpi "eq", %i, %ub : index + %3 = alloc() : memref<2xf32> + scf.yield %3 : memref<2xf32> + } + "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK: %[[ALLOC0:.*]] = alloc() +// CHECK-NEXT: {{.*}} scf.for +// CHECK: %[[ALLOC1:.*]] = alloc() + +// ----- + +// Test Case: structured control-flow loop with a nested if operation using +// a deeply nested buffer allocation. +// The allocation %4 should not be moved upwards due to a back-edge dependency. + +// CHECK-LABEL: func @loop_nested_if_alloc +func @loop_nested_if_alloc( + %lb: index, + %ub: index, + %step: index, + %buf: memref<2xf32>) -> memref<2xf32> { + %0 = alloc() : memref<2xf32> + %1 = scf.for %i = %lb to %ub step %step + iter_args(%iterBuf = %buf) -> memref<2xf32> { + %2 = cmpi "eq", %i, %ub : index + %3 = scf.if %2 -> (memref<2xf32>) { + %4 = alloc() : memref<2xf32> + scf.yield %4 : memref<2xf32> + } else { + scf.yield %0 : memref<2xf32> + } + scf.yield %3 : memref<2xf32> + } + return %1 : memref<2xf32> +} + +// CHECK: %[[ALLOC0:.*]] = alloc() +// CHECK-NEXT: {{.*}} scf.for +// CHECK: %[[ALLOC1:.*]] = alloc() + +// ----- + +// Test Case: several nested structured control-flow loops with deeply nested +// buffer allocations inside an if operation. 
+// Behavior: The allocs %0, %4 and %9 are moved upwards, while %7 and %8 stay +// in their positions. + +// CHECK-LABEL: func @loop_nested_alloc +func @loop_nested_alloc( + %lb: index, + %ub: index, + %step: index, + %buf: memref<2xf32>, + %res: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + %1 = scf.for %i = %lb to %ub step %step + iter_args(%iterBuf = %buf) -> memref<2xf32> { + %2 = scf.for %i2 = %lb to %ub step %step + iter_args(%iterBuf2 = %iterBuf) -> memref<2xf32> { + %3 = scf.for %i3 = %lb to %ub step %step + iter_args(%iterBuf3 = %iterBuf2) -> memref<2xf32> { + %4 = alloc() : memref<2xf32> + %5 = cmpi "eq", %i, %ub : index + %6 = scf.if %5 -> (memref<2xf32>) { + %7 = alloc() : memref<2xf32> + %8 = alloc() : memref<2xf32> + scf.yield %8 : memref<2xf32> + } else { + scf.yield %iterBuf3 : memref<2xf32> + } + %9 = alloc() : memref<2xf32> + scf.yield %6 : memref<2xf32> + } + scf.yield %3 : memref<2xf32> + } + scf.yield %2 : memref<2xf32> + } + "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK: %[[ALLOC0:.*]] = alloc() +// CHECK-NEXT: %[[ALLOC1:.*]] = alloc() +// CHECK-NEXT: %[[ALLOC2:.*]] = alloc() +// CHECK-NEXT: {{.*}} = scf.for +// CHECK-NEXT: {{.*}} = scf.for +// CHECK-NEXT: {{.*}} = scf.for +// CHECK: {{.*}} = scf.if +// CHECK: %[[ALLOC3:.*]] = alloc() +// CHECK: %[[ALLOC4:.*]] = alloc() + +// ----- + +// CHECK-LABEL: func @loop_nested_alloc_dyn_dependency +func @loop_nested_alloc_dyn_dependency( + %lb: index, + %ub: index, + %step: index, + %arg0: index, + %buf: memref, + %res: memref) { + %0 = alloc(%arg0) : memref + %1 = scf.for %i = %lb to %ub step %step + iter_args(%iterBuf = %buf) -> memref { + %2 = scf.for %i2 = %lb to %ub step %step + iter_args(%iterBuf2 = %iterBuf) -> memref { + %3 = scf.for %i3 = %lb to %ub step %step + iter_args(%iterBuf3 = %iterBuf2) -> memref { + %4 = alloc(%i3) : memref + %5 = cmpi "eq", %i, %ub : index + %6 = scf.if %5 -> (memref) { + %7 = alloc(%i3) : memref + scf.yield %7 : memref + } else { + scf.yield %iterBuf3 : memref + } + %8 = alloc(%i3) : memref + scf.yield %6 : memref + } + scf.yield %3 : memref + } + scf.yield %0 : memref + } + "linalg.copy"(%1, %res) : (memref, memref) -> () + return +} + +// CHECK: %[[ALLOC0:.*]] = alloc({{.*}}) +// CHECK-NEXT: {{.*}} = scf.for +// CHECK-NEXT: {{.*}} = scf.for +// CHECK-NEXT: {{.*}} = scf.for +// CHECK: %[[ALLOC1:.*]] = alloc({{.*}}) +// CHECK: %[[ALLOC2:.*]] = alloc({{.*}}) + +// ----- + +// CHECK-LABEL: func @hoist_one_loop +func @hoist_one_loop( + %lb: index, + %ub: index, + %step: index, + %buf: memref<2xf32>, + %res: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + %1 = scf.for %i = %lb to %ub step %step + iter_args(%iterBuf = %buf) -> memref<2xf32> { + %2 = alloc() : memref<2xf32> + scf.yield %0 : memref<2xf32> + } + "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK: %[[ALLOC0:.*]] = alloc({{.*}}) +// CHECK-NEXT: %[[ALLOC1:.*]] = alloc({{.*}}) +// CHECK-NEXT: {{.*}} = scf.for + +// ----- + +// CHECK-LABEL: func @no_hoist_one_loop +func @no_hoist_one_loop( + %lb: index, + %ub: index, + %step: index, + %buf: memref<2xf32>, + %res: memref<2xf32>) { + %0 = scf.for %i = %lb to %ub step %step + iter_args(%iterBuf = %buf) -> memref<2xf32> { + %1 = alloc() : memref<2xf32> + scf.yield %1 : memref<2xf32> + } + "linalg.copy"(%0, %res) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK: {{.*}} = scf.for +// CHECK-NEXT: %[[ALLOC0:.*]] = alloc({{.*}}) + +// ----- + +// CHECK-LABEL: func @hoist_multiple_loop +func 
@hoist_multiple_loop( + %lb: index, + %ub: index, + %step: index, + %buf: memref<2xf32>, + %res: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + %1 = scf.for %i = %lb to %ub step %step + iter_args(%iterBuf = %buf) -> memref<2xf32> { + %2 = scf.for %i2 = %lb to %ub step %step + iter_args(%iterBuf2 = %iterBuf) -> memref<2xf32> { + %3 = alloc() : memref<2xf32> + scf.yield %0 : memref<2xf32> + } + scf.yield %0 : memref<2xf32> + } + "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK: %[[ALLOC0:.*]] = alloc({{.*}}) +// CHECK-NEXT: %[[ALLOC1:.*]] = alloc({{.*}}) +// CHECK-NEXT: {{.*}} = scf.for + +// ----- + +// CHECK-LABEL: func @no_hoist_one_loop_conditional +func @no_hoist_one_loop_conditional( + %lb: index, + %ub: index, + %step: index, + %buf: memref<2xf32>, + %res: memref<2xf32>) { + %0 = scf.for %i = %lb to %ub step %step + iter_args(%iterBuf = %buf) -> memref<2xf32> { + %1 = cmpi "eq", %i, %ub : index + %2 = scf.if %1 -> (memref<2xf32>) { + %3 = alloc() : memref<2xf32> + scf.yield %3 : memref<2xf32> + } else { + scf.yield %iterBuf : memref<2xf32> + } + scf.yield %2 : memref<2xf32> + } + "linalg.copy"(%0, %res) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK: {{.*}} = scf.for +// CHECK: {{.*}} = scf.if +// CHECK-NEXT: %[[ALLOC0:.*]] = alloc({{.*}}) + +// ----- + +// CHECK-LABEL: func @hoist_one_loop_conditional +func @hoist_one_loop_conditional( + %lb: index, + %ub: index, + %step: index, + %buf: memref<2xf32>, + %res: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + %1 = cmpi "eq", %lb, %ub : index + %2 = scf.if %1 -> (memref<2xf32>) { + %3 = scf.for %i = %lb to %ub step %step + iter_args(%iterBuf = %buf) -> memref<2xf32> { + %4 = alloc() : memref<2xf32> + scf.yield %0 : memref<2xf32> + } + scf.yield %0 : memref<2xf32> + } + else + { + scf.yield %0 : memref<2xf32> + } + "linalg.copy"(%2, %res) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK: {{.*}} = scf.if +// CHECK-NEXT: %[[ALLOC0:.*]] = alloc({{.*}}) +// CHECK: {{.*}} = scf.for + +// ----- + +// CHECK-LABEL: func @no_hoist_one_loop_dependency +func @no_hoist_one_loop_dependency( + %lb: index, + %ub: index, + %step: index, + %buf: memref<2xf32>, + %res: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + %1 = scf.for %i = %lb to %ub step %step + iter_args(%iterBuf = %buf) -> memref<2xf32> { + %2 = alloc(%i) : memref + scf.yield %0 : memref<2xf32> + } + "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK: %[[ALLOC0:.*]] = alloc({{.*}}) +// CHECK-NEXT: {{.*}} = scf.for +// CHECK-NEXT: %[[ALLOC1:.*]] = alloc({{.*}}) + +// ----- + +// CHECK-LABEL: func @partial_hoist_multiple_loop_dependency +func @partial_hoist_multiple_loop_dependency( + %lb: index, + %ub: index, + %step: index, + %buf: memref<2xf32>, + %res: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + %1 = scf.for %i = %lb to %ub step %step + iter_args(%iterBuf = %buf) -> memref<2xf32> { + %2 = scf.for %i2 = %lb to %ub step %step + iter_args(%iterBuf2 = %iterBuf) -> memref<2xf32> { + %3 = alloc(%i) : memref + scf.yield %0 : memref<2xf32> + } + scf.yield %0 : memref<2xf32> + } + "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK: %[[ALLOC0:.*]] = alloc({{.*}}) +// CHECK-NEXT: {{.*}} = scf.for +// CHECK-NEXT: %[[ALLOC1:.*]] = alloc({{.*}}) +// CHECK-NEXT: {{.*}} = scf.for