diff --git a/mlir/include/mlir/Transforms/Passes.h b/mlir/include/mlir/Transforms/Passes.h --- a/mlir/include/mlir/Transforms/Passes.h +++ b/mlir/include/mlir/Transforms/Passes.h @@ -40,6 +40,10 @@ /// reallocations inside of loops. std::unique_ptr createBufferLoopHoistingPass(); +/// Creates a pass that promotes heap-based allocations to stack-based ones. +std::unique_ptr +createPromoteBuffersToStackPass(unsigned maxAllocSizeInBytes = 1024); + /// Creates an instance of the Canonicalizer pass. std::unique_ptr createCanonicalizerPass(); diff --git a/mlir/include/mlir/Transforms/Passes.td b/mlir/include/mlir/Transforms/Passes.td --- a/mlir/include/mlir/Transforms/Passes.td +++ b/mlir/include/mlir/Transforms/Passes.td @@ -201,6 +201,22 @@ let constructor = "mlir::createBufferLoopHoistingPass()"; } +def PromoteBuffersToStack : FunctionPass<"promote-buffers-to-stack"> { + let summary = "Promotes heap-based allocations to automatically managed " + "stack-based allocations"; + let description = [{ + This pass implements a simple algorithm to convert heap-based memory + allocations to stack-based ones. It uses a built-in heuristic to decide + whether it makes sense to convert an allocation. + }]; + let constructor = "mlir::createPromoteBuffersToStackPass()"; + let options = [ + Option<"maxAllocSizeInBytes", "max-alloc-size-in-bytes", "unsigned", + /*default=*/"1024", + "Define the maximum size in bytes to promote allocations to stack.">, + ]; +} + def Canonicalizer : Pass<"canonicalize"> { let summary = "Canonicalize operations"; let description = [{ diff --git a/mlir/lib/Transforms/BufferOptimizations.cpp b/mlir/lib/Transforms/BufferOptimizations.cpp --- a/mlir/lib/Transforms/BufferOptimizations.cpp +++ b/mlir/lib/Transforms/BufferOptimizations.cpp @@ -6,9 +6,10 @@ // //===----------------------------------------------------------------------===// // -// This file implements logic for two optimization passes. These passes try to -// hoist alloc nodes to reduce the number of allocations and copies during -// buffer deallocation. +// This file implements logic for three optimization passes. The first two +// passes try to move alloc nodes out of blocks to reduce the number of +// allocations and copies during buffer deallocation. The third pass tries to +// convert heap-based allocations to stack-based allocations, if possible. #include "PassDetail.h" #include "mlir/IR/Operation.h" @@ -19,6 +20,58 @@ using namespace mlir; +/// Returns true if the given operation implements a known high-level region- +/// based control-flow interface. +static bool isKnownControlFlowInterface(Operation *op) { + return isa(op); +} + +/// Check if the size of the allocation is less than the given size. The +/// transformation is only applied to small buffers since large buffers could +/// exceed the stack space. +static bool isSmallAlloc(Value alloc, unsigned maximumSizeInBytes) { + auto type = alloc.getType().dyn_cast(); + if (!type || !type.hasStaticShape()) + return false; + return type.getSizeInBits() < maximumSizeInBytes * 8; +} + +/// Checks whether the given aliases leave the allocation scope. +static bool +leavesAllocationScope(Region *parentRegion, + const BufferPlacementAliasAnalysis::ValueSetT &aliases) { + for (Value alias : aliases) { + for (auto *use : alias.getUsers()) { + if (use->hasTrait() && + use->getParentRegion() == parentRegion) + return true; + } + } + return false; +} + +/// Returns the allocation scope of a given alloc value. +static Block * +findAllocationScope(Value alloc, + const BufferPlacementAliasAnalysis &aliasAnalysis) { + Region *region = alloc.getParentRegion(); + do { + if (Operation *parentOp = region->getParentOp()) { + // Check if the operation is a known control flow interface and break the + // loop to avoid transformation in loops and branches. + if (isKnownControlFlowInterface(parentOp)) + break; + // Check if the operation is an automatic allocation scope and whether an + // alias leaves the scope. This means, an allocation yields out of + // this scope and can not be transformed in a stack-based allocation. + if (parentOp->hasTrait() && + !leavesAllocationScope(region, aliasAnalysis.resolve(alloc))) + return ®ion->front(); + } + } while ((region = region->getParentRegion())); + return nullptr; +} + namespace { //===----------------------------------------------------------------------===// @@ -46,13 +99,6 @@ /// Implements the actual hoisting logic for allocation nodes. template class BufferAllocationHoisting : public BufferPlacementTransformationBase { -private: - /// Returns true if the given operation implements a known high-level region- - /// based control-flow interface. - static bool isKnownControlFlowInterface(Operation *op) { - return isa(op); - } - public: BufferAllocationHoisting(Operation *op) : BufferPlacementTransformationBase(op), dominators(op), @@ -220,6 +266,45 @@ void recordMoveToParent(Block *block) { placementBlock = block; } }; +//===----------------------------------------------------------------------===// +// BufferPlacementPromotion +//===----------------------------------------------------------------------===// + +/// Promotes heap-based allocations to stack-based allocations (if possible). +class BufferPlacementPromotion : BufferPlacementTransformationBase { +public: + BufferPlacementPromotion(Operation *op) + : BufferPlacementTransformationBase(op) {} + + /// Promote buffers to stack-based allocations. + void promote(unsigned maximumSize) { + for (BufferPlacementAllocs::AllocEntry &entry : allocs) { + Value alloc = std::get<0>(entry); + Block *parent; + // Checking several requirements to transform an AllocOp into an AllocaOp. + // The transformation is done if the allocation is limited to a given + // size. Furthermore, a deallocation is not defined for this allocation + // entry and a parent allocation scope must exist. + if (!isSmallAlloc(alloc, maximumSize) || std::get<1>(entry) || + !(parent = findAllocationScope(alloc, aliases))) + continue; + + Operation *startOperation = + BufferPlacementAllocs::getStartOperation(alloc, parent, liveness); + // Build a new alloca that is associated with its parent + // `AutomaticAllocationScope` determined during the initialization phase. + OpBuilder builder(startOperation); + auto alloca = builder.create( + alloc.getLoc(), alloc.getType().cast()); + + // Replace the original alloc by a newly created alloca. + Operation *allocOp = alloc.getDefiningOp(); + allocOp->replaceAllUsesWith(alloca.getOperation()); + allocOp->erase(); + } + } +}; + //===----------------------------------------------------------------------===// // BufferOptimizationPasses //===----------------------------------------------------------------------===// @@ -247,6 +332,24 @@ } }; +/// The promote buffer to stack pass that tries to convert alloc nodes into +/// alloca nodes. +struct PromoteBuffersToStackPass + : PromoteBuffersToStackBase { + + PromoteBuffersToStackPass(unsigned maxAllocSizeInBytes) + : maximumSize(maxAllocSizeInBytes) {} + + void runOnFunction() override { + // Move all allocation nodes and convert candidates into allocas. + BufferPlacementPromotion optimizer(getFunction()); + optimizer.promote(maximumSize); + } + +private: + const unsigned maximumSize; +}; + } // end anonymous namespace std::unique_ptr mlir::createBufferHoistingPass() { @@ -256,3 +359,8 @@ std::unique_ptr mlir::createBufferLoopHoistingPass() { return std::make_unique(); } + +std::unique_ptr +mlir::createPromoteBuffersToStackPass(unsigned maxAllocSizeInBytes) { + return std::make_unique(maxAllocSizeInBytes); +} diff --git a/mlir/test/Transforms/promote-buffers-to-stack.mlir b/mlir/test/Transforms/promote-buffers-to-stack.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Transforms/promote-buffers-to-stack.mlir @@ -0,0 +1,663 @@ +// RUN: mlir-opt -promote-buffers-to-stack -split-input-file %s | FileCheck %s + +// This file checks the behavior of PromoteBuffersToStack pass for converting +// AllocOps into AllocaOps, if possible. + +// Test Case: +// bb0 +// / \ +// bb1 bb2 <- Initial position of AllocOp +// \ / +// bb3 +// PromoteBuffersToStack expected behavior: It should move the existing AllocOp +// to the entry block and convert it in to an AllocaOp. + +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @condBranch +func @condBranch(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { + cond_br %arg0, ^bb1, ^bb2 +^bb1: + br ^bb3(%arg1 : memref<2xf32>) +^bb2: + %0 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + br ^bb3(%0 : memref<2xf32>) +^bb3(%1: memref<2xf32>): + "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK-NEXT: %[[ALLOCA:.*]] = alloca() +// CHECK-NEXT: cond_br +// CHECK: linalg.copy +// CHECK-NEXT: return + +// ----- + +// Test Case: +// bb0 +// / \ +// bb1 bb2 <- Initial position of AllocOp +// \ / +// bb3 +// PromoteBuffersToStack expected behavior: It should not move the existing +// AllocOp to any other block since the alloc has a dynamic dependency to block +// argument %0 in bb2. Since it is dynamic type, it is not converted to an +// alloca. + +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @condBranchDynamicType +func @condBranchDynamicType( + %arg0: i1, + %arg1: memref, + %arg2: memref, + %arg3: index) { + cond_br %arg0, ^bb1, ^bb2(%arg3: index) +^bb1: + br ^bb3(%arg1 : memref) +^bb2(%0: index): + %1 = alloc(%0) : memref + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg1: memref) + outs(%1: memref) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + br ^bb3(%1 : memref) +^bb3(%2: memref): + "linalg.copy"(%2, %arg2) : (memref, memref) -> () + return +} + +// CHECK-NEXT: cond_br +// CHECK: ^bb2 +// CHECK: ^bb2(%[[IDX:.*]]:{{.*}}) +// CHECK-NEXT: %[[ALLOC0:.*]] = alloc(%[[IDX]]) +// CHECK-NEXT: linalg.generic +// CHECK: br ^bb3 +// CHECK-NEXT: ^bb3(%[[ALLOC0:.*]]:{{.*}}) +// CHECK: linalg.copy(%[[ALLOC0]], +// CHECK-NEXT: return + +// ----- + +// Test Case: Existing AllocOp with no users. +// PromoteBuffersToStack expected behavior: It should convert it to an +// AllocaOp. + +// CHECK-LABEL: func @emptyUsesValue +func @emptyUsesValue(%arg0: memref<4xf32>) { + %0 = alloc() : memref<4xf32> + return +} +// CHECK-NEXT: %[[ALLOCA:.*]] = alloca() +// CHECK-NEXT: return + +// ----- + +// Test Case: +// bb0 +// / \ +// | bb1 <- Initial position of AllocOp +// \ / +// bb2 +// PromoteBuffersToStack expected behavior: It should move the existing AllocOp +// to the entry block and convert it to an AllocaOp. + +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @criticalEdge +func @criticalEdge(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { + cond_br %arg0, ^bb1, ^bb2(%arg1 : memref<2xf32>) +^bb1: + %0 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + br ^bb2(%0 : memref<2xf32>) +^bb2(%1: memref<2xf32>): + "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK-NEXT: %[[ALLOCA:.*]] = alloca() +// CHECK-NEXT: cond_br +// CHECK: linalg.copy +// CHECK-NEXT: return + +// ----- + +// Test Case: +// bb0 <- Initial position of AllocOp +// / \ +// | bb1 +// \ / +// bb2 +// PromoteBuffersToStack expected behavior: It shouldn't move the alloc +// position. It only converts it in an alloca. + +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @invCriticalEdge +func @invCriticalEdge(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + cond_br %arg0, ^bb1, ^bb2(%arg1 : memref<2xf32>) +^bb1: + br ^bb2(%0 : memref<2xf32>) +^bb2(%1: memref<2xf32>): + "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK-NEXT: %[[ALLOCA:.*]] = alloca() +// CHECK: cond_br +// CHECK: linalg.copy +// CHECK-NEXT: return + +// ----- + +// Test Case: +// bb0 <- Initial position of the first AllocOp +// / \ +// bb1 bb2 +// \ / +// bb3 <- Initial position of the second AllocOp +// PromoteBuffersToStack expected behavior: It shouldn't move the AllocOps. It +// only converts them into allocas. + +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @ifElse +func @ifElse(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + cond_br %arg0, + ^bb1(%arg1, %0 : memref<2xf32>, memref<2xf32>), + ^bb2(%0, %arg1 : memref<2xf32>, memref<2xf32>) +^bb1(%1: memref<2xf32>, %2: memref<2xf32>): + br ^bb3(%1, %2 : memref<2xf32>, memref<2xf32>) +^bb2(%3: memref<2xf32>, %4: memref<2xf32>): + br ^bb3(%3, %4 : memref<2xf32>, memref<2xf32>) +^bb3(%5: memref<2xf32>, %6: memref<2xf32>): + %7 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%5: memref<2xf32>) + outs(%7: memref<2xf32>) { + ^bb0(%gen2_arg0: f32, %gen2_arg1: f32): + %tmp2 = exp %gen2_arg0 : f32 + linalg.yield %tmp2 : f32 + } + "linalg.copy"(%7, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK-NEXT: %[[ALLOCA0:.*]] = alloca() +// CHECK-NEXT: linalg.generic +// CHECK: %[[ALLOCA1:.*]] = alloca() +// CHECK: linalg.generic +// CHECK: linalg.copy(%[[ALLOCA1]] +// CHECK-NEXT: return + +// ----- + +// Test Case: No users for buffer in if-else CFG +// bb0 <- Initial position of AllocOp +// / \ +// bb1 bb2 +// \ / +// bb3 +// PromoteBuffersToStack expected behavior: It shouldn't move the AllocOp. It +// only converts to alloca. + +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @ifElseNoUsers +func @ifElseNoUsers(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + cond_br %arg0, + ^bb1(%arg1, %0 : memref<2xf32>, memref<2xf32>), + ^bb2(%0, %arg1 : memref<2xf32>, memref<2xf32>) +^bb1(%1: memref<2xf32>, %2: memref<2xf32>): + br ^bb3(%1, %2 : memref<2xf32>, memref<2xf32>) +^bb2(%3: memref<2xf32>, %4: memref<2xf32>): + br ^bb3(%3, %4 : memref<2xf32>, memref<2xf32>) +^bb3(%5: memref<2xf32>, %6: memref<2xf32>): + "linalg.copy"(%arg1, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK-NEXT: %[[ALLOCA:.*]] = alloca() +// CHECK: return + +// ----- + +// Test Case: +// bb0 <- Initial position of the first AllocOp +// / \ +// bb1 bb2 +// | / \ +// | bb3 bb4 +// \ \ / +// \ / +// bb5 <- Initial position of the second AllocOp +// PromoteBuffersToStack expected behavior: AllocOps shouldn't be moved. +// The two allocs should be converted to allocas. + +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @ifElseNested +func @ifElseNested(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + cond_br %arg0, + ^bb1(%arg1, %0 : memref<2xf32>, memref<2xf32>), + ^bb2(%0, %arg1 : memref<2xf32>, memref<2xf32>) +^bb1(%1: memref<2xf32>, %2: memref<2xf32>): + br ^bb5(%1, %2 : memref<2xf32>, memref<2xf32>) +^bb2(%3: memref<2xf32>, %4: memref<2xf32>): + cond_br %arg0, ^bb3(%3 : memref<2xf32>), ^bb4(%4 : memref<2xf32>) +^bb3(%5: memref<2xf32>): + br ^bb5(%5, %3 : memref<2xf32>, memref<2xf32>) +^bb4(%6: memref<2xf32>): + br ^bb5(%3, %6 : memref<2xf32>, memref<2xf32>) +^bb5(%7: memref<2xf32>, %8: memref<2xf32>): + %9 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%7: memref<2xf32>) + outs(%9: memref<2xf32>) { + ^bb0(%gen2_arg0: f32, %gen2_arg1: f32): + %tmp2 = exp %gen2_arg0 : f32 + linalg.yield %tmp2 : f32 + } + "linalg.copy"(%9, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK-NEXT: %[[ALLOCA0:.*]] = alloca() +// CHECK-NEXT: linalg.generic +// CHECK: %[[ALLOCA1:.*]] = alloca() +// CHECK: linalg.generic +// CHECK: linalg.copy(%[[ALLOCA1]] +// CHECK-NEXT: return + +// ----- + +// Test Case: Dead operations in a single block. +// PromoteBuffersToStack expected behavior: It shouldn't move the AllocOps. It +// only converts the two AllocOps to allocas. + +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @redundantOperations +func @redundantOperations(%arg0: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg0: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + %1 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%0: memref<2xf32>) + outs(%1: memref<2xf32>) { + ^bb0(%gen2_arg0: f32, %gen2_arg1: f32): + %tmp2 = exp %gen2_arg0 : f32 + linalg.yield %tmp2 : f32 + } + return +} + +// CHECK: (%[[ARG0:.*]]: {{.*}}) +// CHECK-NEXT: %[[ALLOCA0:.*]] = alloca() +// CHECK-NEXT: linalg.generic {{{.*}}} ins(%[[ARG0]]{{.*}} outs(%[[ALLOCA0]] +// CHECK: %[[ALLOCA1:.*]] = alloca() +// CHECK-NEXT: linalg.generic {{{.*}}} ins(%[[ALLOCA0]]{{.*}} outs(%[[ALLOCA1]] +// CHECK: return + +// ----- + +// Test Case: +// bb0 +// / \ +// Initial pos of the 1st AllocOp -> bb1 bb2 <- Initial pos of the 2nd AllocOp +// \ / +// bb3 +// PromoteBuffersToStack expected behavior: Both AllocOps should be moved to the +// entry block and converted to allocas, so there is no need for deallocations. + +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @moving_alloc_and_inserting_missing_dealloc +func @moving_alloc_and_inserting_missing_dealloc( + %cond: i1, + %arg0: memref<2xf32>, + %arg1: memref<2xf32>) { + cond_br %cond, ^bb1, ^bb2 +^bb1: + %0 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg0: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + br ^exit(%0 : memref<2xf32>) +^bb2: + %1 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg0: memref<2xf32>) + outs(%1: memref<2xf32>) { + ^bb0(%gen2_arg0: f32, %gen2_arg1: f32): + %tmp2 = exp %gen2_arg0 : f32 + linalg.yield %tmp2 : f32 + } + br ^exit(%1 : memref<2xf32>) +^exit(%arg2: memref<2xf32>): + "linalg.copy"(%arg2, %arg1) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK-NEXT: %{{.*}} = alloca() +// CHECK-NEXT: %{{.*}} = alloca() +// CHECK-NEXT: cond_br +// CHECK: linalg.copy +// CHECK-NEXT: return + +// ----- + +// Test Case: Nested regions - This test defines a GenericOp inside the region +// of another GenericOp. +// PromoteBuffersToStack expected behavior: The AllocOps are moved upwards and +// then converted to allocas. + +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @nested_regions_and_cond_branch +func @nested_regions_and_cond_branch( + %arg0: i1, + %arg1: memref<2xf32>, + %arg2: memref<2xf32>) { + cond_br %arg0, ^bb1, ^bb2 +^bb1: + br ^bb3(%arg1 : memref<2xf32>) +^bb2: + %0 = alloc() : memref<2xf32> + linalg.generic { + indexing_maps = [#map0, #map0], + iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %1 = alloc() : memref<2xf32> + linalg.generic { + indexing_maps = [#map0, #map0], + iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%1: memref<2xf32>) { + ^bb0(%gen2_arg0: f32, %gen2_arg1: f32): + %tmp2 = exp %gen2_arg0 : f32 + linalg.yield %tmp2 : f32 + } + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + br ^bb3(%0 : memref<2xf32>) +^bb3(%1: memref<2xf32>): + "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + return +} +// CHECK-NEXT: %[[ALLOCA0:.*]] = alloca() +// CHECK-NEXT: %[[ALLOCA1:.*]] = alloca() +// CHECK-NEXT: cond_br + +// ----- + +// Test Case: buffer deallocation escaping +// PromoteBuffersToStack expected behavior: The first alloc is returned, so +// there is no conversion allowed. The second alloc is converted, since it +// only remains in the scope of the function. + +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @memref_in_function_results +func @memref_in_function_results( + %arg0: memref<5xf32>, + %arg1: memref<10xf32>, + %arg2: memref<5xf32>) -> (memref<10xf32>, memref<15xf32>) { + %x = alloc() : memref<15xf32> + %y = alloc() : memref<5xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg0: memref<5xf32>) + outs(%y: memref<5xf32>) { + ^bb0(%arg3: f32, %arg4: f32): + %2 = exp %arg3 : f32 + linalg.yield %2 : f32 + } + linalg.copy(%y, %arg2) : memref<5xf32>, memref<5xf32> + return %arg1, %x : memref<10xf32>, memref<15xf32> +} +// CHECK: (%[[ARG0:.*]]: memref<5xf32>, %[[ARG1:.*]]: memref<10xf32>, +// CHECK-SAME: %[[RESULT:.*]]: memref<5xf32>) +// CHECK: %[[ALLOC:.*]] = alloc() +// CHECK: %[[ALLOCA:.*]] = alloca() +// CHECK: linalg.copy +// CHECK: return %[[ARG1]], %[[ALLOC]] + +// ----- + +// Test Case: nested region control flow +// The allocation in the nested if branch cannot be converted to an alloca +// due to its dynamic memory allocation behavior. + +// CHECK-LABEL: func @nested_region_control_flow +func @nested_region_control_flow( + %arg0 : index, + %arg1 : index) -> memref { + %0 = cmpi "eq", %arg0, %arg1 : index + %1 = alloc(%arg0, %arg0) : memref + %2 = scf.if %0 -> (memref) { + scf.yield %1 : memref + } else { + %3 = alloc(%arg0, %arg1) : memref + scf.yield %1 : memref + } + return %2 : memref +} + +// CHECK: %[[ALLOC0:.*]] = alloc(%arg0, %arg0) +// CHECK-NEXT: %[[ALLOC1:.*]] = scf.if +// CHECK: scf.yield %[[ALLOC0]] +// CHECK: %[[ALLOC2:.*]] = alloc(%arg0, %arg1) +// CHECK-NEXT: scf.yield %[[ALLOC0]] +// CHECK: return %[[ALLOC1]] + +// ----- + +// Test Case: nested region control flow within a region interface. +// The alloc %0 does not need to be converted in this case since the +// allocation finally escapes the method. + +// CHECK-LABEL: func @inner_region_control_flow +func @inner_region_control_flow(%arg0 : index) -> memref<2x2xf32> { + %0 = alloc() : memref<2x2xf32> + %1 = test.region_if %0 : memref<2x2xf32> -> (memref<2x2xf32>) then { + ^bb0(%arg1 : memref<2x2xf32>): + test.region_if_yield %arg1 : memref<2x2xf32> + } else { + ^bb0(%arg1 : memref<2x2xf32>): + test.region_if_yield %arg1 : memref<2x2xf32> + } join { + ^bb0(%arg1 : memref<2x2xf32>): + test.region_if_yield %arg1 : memref<2x2xf32> + } + return %1 : memref<2x2xf32> +} + +// CHECK: %[[ALLOC0:.*]] = alloc() +// CHECK-NEXT: %[[ALLOC1:.*]] = test.region_if +// CHECK-NEXT: ^bb0(%[[ALLOC2:.*]]:{{.*}}): +// CHECK-NEXT: test.region_if_yield %[[ALLOC2]] +// CHECK: ^bb0(%[[ALLOC3:.*]]:{{.*}}): +// CHECK-NEXT: test.region_if_yield %[[ALLOC3]] +// CHECK: ^bb0(%[[ALLOC4:.*]]:{{.*}}): +// CHECK-NEXT: test.region_if_yield %[[ALLOC4]] +// CHECK: return %[[ALLOC1]] + +// ----- + +// Test Case: structured control-flow loop using a nested alloc. +// The alloc %3 will be moved upwards and both allocs are converted to +// allocas. + +// CHECK-LABEL: func @loop_alloc +func @loop_alloc( + %lb: index, + %ub: index, + %step: index, + %buf: memref<2xf32>, + %res: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + %1 = scf.for %i = %lb to %ub step %step + iter_args(%iterBuf = %buf) -> memref<2xf32> { + %2 = cmpi "eq", %i, %ub : index + %3 = alloc() : memref<2xf32> + scf.yield %3 : memref<2xf32> + } + "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK-NEXT: %[[ALLOCA:.*]] = alloca() +// CHECK-NEXT: scf.for +// CHECK: %[[ALLOC:.*]] = alloc() + +// ----- + +// Test Case: structured control-flow loop with a nested if operation. +// The loop yields buffers that have been defined outside of the loop and the +// backeges only use the iteration arguments (or one of its aliases). +// Therefore, we do not have to (and are not allowed to) free any buffers +// that are passed via the backedges. The alloc is converted to an AllocaOp. + +// CHECK-LABEL: func @loop_nested_if_no_alloc +func @loop_nested_if_no_alloc( + %lb: index, + %ub: index, + %step: index, + %buf: memref<2xf32>, + %res: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + %1 = scf.for %i = %lb to %ub step %step + iter_args(%iterBuf = %buf) -> memref<2xf32> { + %2 = cmpi "eq", %i, %ub : index + %3 = scf.if %2 -> (memref<2xf32>) { + scf.yield %0 : memref<2xf32> + } else { + scf.yield %iterBuf : memref<2xf32> + } + scf.yield %3 : memref<2xf32> + } + "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK: %[[ALLOCA0:.*]] = alloca() +// CHECK-NEXT: %[[ALLOCA1:.*]] = scf.for {{.*}} iter_args(%[[IALLOCA:.*]] = +// CHECK: %[[ALLOCA2:.*]] = scf.if +// CHECK: scf.yield %[[ALLOCA0]] +// CHECK: scf.yield %[[IALLOCA]] +// CHECK: scf.yield %[[ALLOCA2]] +// CHECK: linalg.copy(%[[ALLOCA1]], %arg4) + +// ----- + +// Test Case: structured control-flow loop with a nested if operation using +// a deeply nested buffer allocation. +// The allocs are not converted in this case. + +// CHECK-LABEL: func @loop_nested_if_alloc +func @loop_nested_if_alloc( + %lb: index, + %ub: index, + %step: index, + %buf: memref<2xf32>) -> memref<2xf32> { + %0 = alloc() : memref<2xf32> + %1 = scf.for %i = %lb to %ub step %step + iter_args(%iterBuf = %buf) -> memref<2xf32> { + %2 = cmpi "eq", %i, %ub : index + %3 = scf.if %2 -> (memref<2xf32>) { + %4 = alloc() : memref<2xf32> + scf.yield %4 : memref<2xf32> + } else { + scf.yield %0 : memref<2xf32> + } + scf.yield %3 : memref<2xf32> + } + return %1 : memref<2xf32> +} + +// CHECK: %[[ALLOC0:.*]] = alloc() +// CHECK-NEXT: %[[ALLOC1:.*]] = scf.for {{.*}} +// CHECK: %[[ALLOC2:.*]] = scf.if +// CHECK: %[[ALLOC3:.*]] = alloc() +// CHECK-NEXT: scf.yield %[[ALLOC3]] +// CHECK: scf.yield %[[ALLOC0]] +// CHECK: scf.yield %[[ALLOC2]] +// CHECK: return %[[ALLOC1]] + +// ----- + +// Test Case: The allocated buffer is too large and, hence, it is not +// converted. In the actual implementation the largest size is 1KB. + +// CHECK-LABEL: func @large_buffer_allocation +func @large_buffer_allocation(%arg0: memref<2048xf32>) { + %0 = alloc() : memref<2048xf32> + "linalg.copy"(%0, %arg0) : (memref<2048xf32>, memref<2048xf32>) -> () + return +} + +// CHECK-NEXT: %[[ALLOC:.*]] = alloc() +// CHECK-NEXT: linalg.copy