diff --git a/mlir/include/mlir/Transforms/Passes.h b/mlir/include/mlir/Transforms/Passes.h
--- a/mlir/include/mlir/Transforms/Passes.h
+++ b/mlir/include/mlir/Transforms/Passes.h
@@ -40,6 +40,9 @@
 /// reallocations inside of loops.
 std::unique_ptr<Pass> createBufferLoopHoistingPass();
 
+/// Creates a pass that promotes heap-based allocations to stack-based ones.
+std::unique_ptr<Pass> createPromoteBuffersToStackPass();
+
 /// Creates an instance of the Canonicalizer pass.
 std::unique_ptr<Pass> createCanonicalizerPass();
 
diff --git a/mlir/include/mlir/Transforms/Passes.td b/mlir/include/mlir/Transforms/Passes.td
--- a/mlir/include/mlir/Transforms/Passes.td
+++ b/mlir/include/mlir/Transforms/Passes.td
@@ -201,6 +201,17 @@
   let constructor = "mlir::createBufferLoopHoistingPass()";
 }
 
+def PromoteBuffersToStack : FunctionPass<"promote-buffers-to-stack"> {
+  let summary = "Promotes heap-based allocations to automatically managed "
+                "stack-based allocations";
+  let description = [{
+    This pass implements a simple algorithm to convert heap-based memory
+    allocations to stack-based ones. It uses a built-in heuristic to decide
+    whether it makes sense to convert an allocation.
+  }];
+  let constructor = "mlir::createPromoteBuffersToStackPass()";
+}
+
 def Canonicalizer : Pass<"canonicalize"> {
   let summary = "Canonicalize operations";
   let description = [{
diff --git a/mlir/lib/Transforms/BufferOptimizations.cpp b/mlir/lib/Transforms/BufferOptimizations.cpp
--- a/mlir/lib/Transforms/BufferOptimizations.cpp
+++ b/mlir/lib/Transforms/BufferOptimizations.cpp
@@ -6,9 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements logic for two optimization passes. These passes try to
-// hoist alloc nodes to reduce the number of allocations and copies during
-// buffer deallocation.
+// This file implements logic for three optimization passes. The first two
+// passes try to move alloc nodes out of blocks to reduce the number of
+// allocations and copies during buffer deallocation. The third pass tries to
+// convert heap-based allocations to stack-based allocations, if possible.
 
 #include "PassDetail.h"
 #include "mlir/IR/Operation.h"
@@ -21,6 +22,70 @@
 
 namespace {
 
+//===----------------------------------------------------------------------===//
+// BufferPlacementAllocationScope
+//===----------------------------------------------------------------------===//
+
+/// Models the scope of a given allocation node and checks whether it is
+/// possible to convert this alloc node into an alloca. If the scope satisfies
+/// several constraints, the transformation is performed.
+class BufferPlacementAllocationScope {
+public:
+  /// Constructs a new buffer placement allocation scope.
+  BufferPlacementAllocationScope(const BufferPlacementAllocs::AllocEntry &entry)
+      : alloc(std::get<0>(entry)), deallocOp(std::get<1>(entry)),
+        parent(nullptr) {
+    auto region = alloc.getParentRegion();
+    do {
+      if (auto parentOp = region->getParentOp())
+        if (parentOp->hasTrait<OpTrait::AutomaticAllocationScope>()) {
+          parent = &region->front();
+          break;
+        }
+    } while ((region = region->getParentRegion()));
+  }
+
+  /// Returns the parent block.
+  Block *getParent() const { return parent; }
+
+  /// Returns the parent region.
+  Region *getParentRegion() const { return parent->getParent(); }
+
+  /// Checks whether the size of the allocation is less than 1 KB. The
+  /// transformation is only applied to small buffers since large buffers could
+  /// exceed the stack space.
+  bool isSmallAlloc() const {
+    auto type = alloc.getType().dyn_cast<ShapedType>();
+    if (!type || !type.hasStaticShape())
+      return false;
+    return type.getSizeInBits() < 1024 * 8;
+  }
+
+  /// Checks whether the given aliases leave the allocation scope.
+  bool leavesAllocationScope(
+      const BufferPlacementAliasAnalysis::ValueSetT &aliases) const {
+    for (Value alias : aliases) {
+      for (auto *use : alias.getUsers()) {
+        if (use->hasTrait<OpTrait::ReturnLike>() &&
+            use->getParentRegion() == getParentRegion())
+          return true;
+      }
+    }
+    return false;
+  }
+
+  /// Returns true if the given alloc node can be converted into an alloca.
+  bool canConvertToAlloca(
+      const BufferPlacementAliasAnalysis::ValueSetT &aliases) const {
+    return isSmallAlloc() && !deallocOp && !leavesAllocationScope(aliases);
+  }
+
+private:
+  Value alloc;
+  Operation *deallocOp;
+  Block *parent;
+};
+
 //===----------------------------------------------------------------------===//
 // BufferAllocationHoisting
 //===----------------------------------------------------------------------===//
@@ -220,6 +285,46 @@
   void recordMoveToParent(Block *block) { placementBlock = block; }
 };
 
+//===----------------------------------------------------------------------===//
+// BufferPlacementPromotion
+//===----------------------------------------------------------------------===//
+
+/// Promotes heap-based allocations to stack-based allocations (if possible).
+class BufferPlacementPromotion : BufferPlacementTransformationBase {
+public:
+  BufferPlacementPromotion(Operation *op)
+      : BufferPlacementTransformationBase(op) {}
+
+  /// Promote buffers to stack-based allocations.
+  void promote() {
+    for (BufferPlacementAllocs::AllocEntry &entry : allocs) {
+      // Avoid dynamically allocated buffers.
+      Value alloc = std::get<0>(entry);
+      if (alloc.getDefiningOp()->getOperands().size() > 0)
+        continue;
+      // Use the AllocationScope analysis to test whether we can convert the
+      // current entry into a stack-based allocation node.
+      BufferPlacementAllocationScope allocationScope(entry);
+      auto resultAliases = aliases.resolve(alloc);
+      if (!allocationScope.canConvertToAlloca(resultAliases))
+        continue;
+      Operation *startOperation = BufferPlacementAllocs::getStartOperation(
+          alloc, allocationScope.getParent(), liveness);
+
+      // Build a new alloca that is associated with its parent
+      // `AutomaticAllocationScope` determined during the initialization phase.
+      OpBuilder builder(startOperation);
+      auto alloca = builder.create<AllocaOp>(
+          alloc.getLoc(), alloc.getType().cast<MemRefType>());
+
+      // Replace the original alloc by a newly created alloca.
+      Operation *allocOp = alloc.getDefiningOp();
+      allocOp->replaceAllUsesWith(alloca.getOperation());
+      allocOp->erase();
+    }
+  }
+};
+
 //===----------------------------------------------------------------------===//
 // BufferOptimizationPasses
 //===----------------------------------------------------------------------===//
@@ -247,6 +352,18 @@
   }
 };
 
+/// The promote buffer to stack pass that tries to convert alloc nodes into
+/// alloca nodes.
+struct PromoteBuffersToStackPass
+    : PromoteBuffersToStackBase<PromoteBuffersToStackPass> {
+
+  void runOnFunction() override {
+    // Move all allocation nodes and convert candidates into allocas.
+    BufferPlacementPromotion optimizer(getFunction());
+    optimizer.promote();
+  }
+};
+
 } // end anonymous namespace
 
 std::unique_ptr<Pass> mlir::createBufferHoistingPass() {
@@ -256,3 +373,7 @@
 std::unique_ptr<Pass> mlir::createBufferLoopHoistingPass() {
   return std::make_unique<BufferLoopHoistingPass>();
 }
+
+std::unique_ptr<Pass> mlir::createPromoteBuffersToStackPass() {
+  return std::make_unique<PromoteBuffersToStackPass>();
+}
diff --git a/mlir/test/Transforms/promote-buffers-to-stack.mlir b/mlir/test/Transforms/promote-buffers-to-stack.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Transforms/promote-buffers-to-stack.mlir
@@ -0,0 +1,714 @@
+// RUN: mlir-opt -promote-buffers-to-stack -split-input-file %s | FileCheck %s
+
+// This file checks the behavior of the PromoteBuffersToStack pass for
+// converting AllocOps into AllocaOps, if possible.
+
+// Test Case:
+//    bb0
+//   /   \
+//  bb1  bb2 <- Initial position of AllocOp
+//   \   /
+//    bb3
+// PromoteBuffersToStack expected behavior: It should move the existing AllocOp
+// to the entry block and convert it into an AllocaOp.
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @condBranch
+func @condBranch(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
+  cond_br %arg0, ^bb1, ^bb2
+^bb1:
+  br ^bb3(%arg1 : memref<2xf32>)
+^bb2:
+  %0 = alloc() : memref<2xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+    ins(%arg1: memref<2xf32>)
+   outs(%0: memref<2xf32>) {
+  ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
+    %tmp1 = exp %gen1_arg0 : f32
+    linalg.yield %tmp1 : f32
+  }
+  br ^bb3(%0 : memref<2xf32>)
+^bb3(%1: memref<2xf32>):
+  "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+// CHECK-NEXT: %[[ALLOCA:.*]] = alloca()
+// CHECK-NEXT: cond_br
+// CHECK: linalg.copy
+// CHECK-NEXT: return
+
+// -----
+
+// Test Case:
+//    bb0
+//   /   \
+//  bb1  bb2 <- Initial position of AllocOp
+//   \   /
+//    bb3
+// PromoteBuffersToStack expected behavior: It should not move the existing
+// AllocOp to any other block since the alloc has a dynamic dependency on block
+// argument %0 in bb2. Since the type is dynamic, it is not converted into an
+// alloca.
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @condBranchDynamicType
+func @condBranchDynamicType(
+  %arg0: i1,
+  %arg1: memref<?xf32>,
+  %arg2: memref<?xf32>,
+  %arg3: index) {
+  cond_br %arg0, ^bb1, ^bb2(%arg3: index)
+^bb1:
+  br ^bb3(%arg1 : memref<?xf32>)
+^bb2(%0: index):
+  %1 = alloc(%0) : memref<?xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+    ins(%arg1: memref<?xf32>)
+   outs(%1: memref<?xf32>) {
+  ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
+    %tmp1 = exp %gen1_arg0 : f32
+    linalg.yield %tmp1 : f32
+  }
+  br ^bb3(%1 : memref<?xf32>)
+^bb3(%2: memref<?xf32>):
+  "linalg.copy"(%2, %arg2) : (memref<?xf32>, memref<?xf32>) -> ()
+  return
+}
+
+// CHECK-NEXT: cond_br
+// CHECK: ^bb2
+// CHECK: ^bb2(%[[IDX:.*]]:{{.*}})
+// CHECK-NEXT: %[[ALLOC0:.*]] = alloc(%[[IDX]])
+// CHECK-NEXT: linalg.generic
+// CHECK: br ^bb3
+// CHECK-NEXT: ^bb3(%[[ALLOC0:.*]]:{{.*}})
+// CHECK: linalg.copy(%[[ALLOC0]],
+// CHECK-NEXT: return
+
+// -----
+
+// Test Case: Existing AllocOp with no users.
+// PromoteBuffersToStack expected behavior: It should convert it to an
+// AllocaOp.
+
+// CHECK-LABEL: func @emptyUsesValue
+func @emptyUsesValue(%arg0: memref<4xf32>) {
+  %0 = alloc() : memref<4xf32>
+  return
+}
+// CHECK-NEXT: %[[ALLOCA:.*]] = alloca()
+// CHECK-NEXT: return
+
+// -----
+
+// Test Case:
+//    bb0
+//   /   \
+//  |    bb1 <- Initial position of AllocOp
+//   \   /
+//    bb2
+// PromoteBuffersToStack expected behavior: It should move the existing AllocOp
+// to the entry block and convert it to an AllocaOp.
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @criticalEdge
+func @criticalEdge(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
+  cond_br %arg0, ^bb1, ^bb2(%arg1 : memref<2xf32>)
+^bb1:
+  %0 = alloc() : memref<2xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+    ins(%arg1: memref<2xf32>)
+   outs(%0: memref<2xf32>) {
+  ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
+    %tmp1 = exp %gen1_arg0 : f32
+    linalg.yield %tmp1 : f32
+  }
+  br ^bb2(%0 : memref<2xf32>)
+^bb2(%1: memref<2xf32>):
+  "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+// CHECK-NEXT: %[[ALLOCA:.*]] = alloca()
+// CHECK-NEXT: cond_br
+// CHECK: linalg.copy
+// CHECK-NEXT: return
+
+// -----
+
+// Test Case:
+//    bb0 <- Initial position of AllocOp
+//   /   \
+//  |    bb1
+//   \   /
+//    bb2
+// PromoteBuffersToStack expected behavior: It shouldn't move the alloc
+// position. It only converts it into an alloca.
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @invCriticalEdge
+func @invCriticalEdge(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
+  %0 = alloc() : memref<2xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+    ins(%arg1: memref<2xf32>)
+   outs(%0: memref<2xf32>) {
+  ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
+    %tmp1 = exp %gen1_arg0 : f32
+    linalg.yield %tmp1 : f32
+  }
+  cond_br %arg0, ^bb1, ^bb2(%arg1 : memref<2xf32>)
+^bb1:
+  br ^bb2(%0 : memref<2xf32>)
+^bb2(%1: memref<2xf32>):
+  "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+// CHECK-NEXT: %[[ALLOCA:.*]] = alloca()
+// CHECK: cond_br
+// CHECK: linalg.copy
+// CHECK-NEXT: return
+
+// -----
+
+// Test Case:
+//    bb0 <- Initial position of the first AllocOp
+//   /   \
+//  bb1  bb2
+//   \   /
+//    bb3 <- Initial position of the second AllocOp
+// PromoteBuffersToStack expected behavior: It shouldn't move the AllocOps. It
+// only converts them into allocas.
+ +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @ifElse +func @ifElse(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + cond_br %arg0, + ^bb1(%arg1, %0 : memref<2xf32>, memref<2xf32>), + ^bb2(%0, %arg1 : memref<2xf32>, memref<2xf32>) +^bb1(%1: memref<2xf32>, %2: memref<2xf32>): + br ^bb3(%1, %2 : memref<2xf32>, memref<2xf32>) +^bb2(%3: memref<2xf32>, %4: memref<2xf32>): + br ^bb3(%3, %4 : memref<2xf32>, memref<2xf32>) +^bb3(%5: memref<2xf32>, %6: memref<2xf32>): + %7 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%5: memref<2xf32>) + outs(%7: memref<2xf32>) { + ^bb0(%gen2_arg0: f32, %gen2_arg1: f32): + %tmp2 = exp %gen2_arg0 : f32 + linalg.yield %tmp2 : f32 + } + "linalg.copy"(%7, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK-NEXT: %[[ALLOCA0:.*]] = alloca() +// CHECK-NEXT: linalg.generic +// CHECK: %[[ALLOCA1:.*]] = alloca() +// CHECK: linalg.generic +// CHECK: linalg.copy(%[[ALLOCA1]] +// CHECK-NEXT: return + +// ----- + +// Test Case: No users for buffer in if-else CFG +// bb0 <- Initial position of AllocOp +// / \ +// bb1 bb2 +// \ / +// bb3 +// PromoteBuffersToStack expected behavior: It shouldn't move the AllocOp. It +// only converts to alloca. + +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @ifElseNoUsers +func @ifElseNoUsers(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + cond_br %arg0, + ^bb1(%arg1, %0 : memref<2xf32>, memref<2xf32>), + ^bb2(%0, %arg1 : memref<2xf32>, memref<2xf32>) +^bb1(%1: memref<2xf32>, %2: memref<2xf32>): + br ^bb3(%1, %2 : memref<2xf32>, memref<2xf32>) +^bb2(%3: memref<2xf32>, %4: memref<2xf32>): + br ^bb3(%3, %4 : memref<2xf32>, memref<2xf32>) +^bb3(%5: memref<2xf32>, %6: memref<2xf32>): + "linalg.copy"(%arg1, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK-NEXT: %[[ALLOCA:.*]] = alloca() +// CHECK: return + +// ----- + +// Test Case: +// bb0 <- Initial position of the first AllocOp +// / \ +// bb1 bb2 +// | / \ +// | bb3 bb4 +// \ \ / +// \ / +// bb5 <- Initial position of the second AllocOp +// PromoteBuffersToStack expected behavior: AllocOps shouldn't be moved. +// The two allocs should be converted to allocas. 
+ +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @ifElseNested +func @ifElseNested(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + cond_br %arg0, + ^bb1(%arg1, %0 : memref<2xf32>, memref<2xf32>), + ^bb2(%0, %arg1 : memref<2xf32>, memref<2xf32>) +^bb1(%1: memref<2xf32>, %2: memref<2xf32>): + br ^bb5(%1, %2 : memref<2xf32>, memref<2xf32>) +^bb2(%3: memref<2xf32>, %4: memref<2xf32>): + cond_br %arg0, ^bb3(%3 : memref<2xf32>), ^bb4(%4 : memref<2xf32>) +^bb3(%5: memref<2xf32>): + br ^bb5(%5, %3 : memref<2xf32>, memref<2xf32>) +^bb4(%6: memref<2xf32>): + br ^bb5(%3, %6 : memref<2xf32>, memref<2xf32>) +^bb5(%7: memref<2xf32>, %8: memref<2xf32>): + %9 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%7: memref<2xf32>) + outs(%9: memref<2xf32>) { + ^bb0(%gen2_arg0: f32, %gen2_arg1: f32): + %tmp2 = exp %gen2_arg0 : f32 + linalg.yield %tmp2 : f32 + } + "linalg.copy"(%9, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK-NEXT: %[[ALLOCA0:.*]] = alloca() +// CHECK-NEXT: linalg.generic +// CHECK: %[[ALLOCA1:.*]] = alloca() +// CHECK: linalg.generic +// CHECK: linalg.copy(%[[ALLOCA1]] +// CHECK-NEXT: return + +// ----- + +// Test Case: Dead operations in a single block. +// PromoteBuffersToStack expected behavior: It shouldn't move the AllocOps. It +// only converts the two AllocOps to allocas. + +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @redundantOperations +func @redundantOperations(%arg0: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg0: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + %1 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%0: memref<2xf32>) + outs(%1: memref<2xf32>) { + ^bb0(%gen2_arg0: f32, %gen2_arg1: f32): + %tmp2 = exp %gen2_arg0 : f32 + linalg.yield %tmp2 : f32 + } + return +} + +// CHECK: (%[[ARG0:.*]]: {{.*}}) +// CHECK-NEXT: %[[ALLOCA0:.*]] = alloca() +// CHECK-NEXT: linalg.generic {{{.*}}} ins(%[[ARG0]]{{.*}} outs(%[[ALLOCA0]] +// CHECK: %[[ALLOCA1:.*]] = alloca() +// CHECK-NEXT: linalg.generic {{{.*}}} ins(%[[ALLOCA0]]{{.*}} outs(%[[ALLOCA1]] +// CHECK: return + +// ----- + +// Test Case: +// bb0 +// / \ +// Initial pos of the 1st AllocOp -> bb1 bb2 <- Initial pos of the 2nd AllocOp +// \ / +// bb3 +// PromoteBuffersToStack expected behavior: Both AllocOps should be moved to the +// entry block and converted to allocas, so there is no need for deallocations. 
+ +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @moving_alloc_and_inserting_missing_dealloc +func @moving_alloc_and_inserting_missing_dealloc( + %cond: i1, + %arg0: memref<2xf32>, + %arg1: memref<2xf32>) { + cond_br %cond, ^bb1, ^bb2 +^bb1: + %0 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg0: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + br ^exit(%0 : memref<2xf32>) +^bb2: + %1 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg0: memref<2xf32>) + outs(%1: memref<2xf32>) { + ^bb0(%gen2_arg0: f32, %gen2_arg1: f32): + %tmp2 = exp %gen2_arg0 : f32 + linalg.yield %tmp2 : f32 + } + br ^exit(%1 : memref<2xf32>) +^exit(%arg2: memref<2xf32>): + "linalg.copy"(%arg2, %arg1) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK-NEXT: %{{.*}} = alloca() +// CHECK-NEXT: %{{.*}} = alloca() +// CHECK-NEXT: cond_br +// CHECK: linalg.copy +// CHECK-NEXT: return + +// ----- + +// Test Case: Nested regions - This test defines a GenericOp inside the region +// of another GenericOp. +// PromoteBuffersToStack expected behavior: The AllocOps are moved upwards and +// then converted to allocas. + +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @nested_regions_and_cond_branch +func @nested_regions_and_cond_branch( + %arg0: i1, + %arg1: memref<2xf32>, + %arg2: memref<2xf32>) { + cond_br %arg0, ^bb1, ^bb2 +^bb1: + br ^bb3(%arg1 : memref<2xf32>) +^bb2: + %0 = alloc() : memref<2xf32> + linalg.generic { + indexing_maps = [#map0, #map0], + iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %1 = alloc() : memref<2xf32> + linalg.generic { + indexing_maps = [#map0, #map0], + iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%1: memref<2xf32>) { + ^bb0(%gen2_arg0: f32, %gen2_arg1: f32): + %tmp2 = exp %gen2_arg0 : f32 + linalg.yield %tmp2 : f32 + } + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + br ^bb3(%0 : memref<2xf32>) +^bb3(%1: memref<2xf32>): + "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + return +} +// CHECK-NEXT: %[[ALLOCA0:.*]] = alloca() +// CHECK-NEXT: %[[ALLOCA1:.*]] = alloca() +// CHECK-NEXT: cond_br + +// ----- + +// Test Case: buffer deallocation escaping +// PromoteBuffersToStack expected behavior: The first alloc is returned, so +// there is no conversion allowed. The second alloc is converted, since it +// only remains in the scope of the function. 
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @memref_in_function_results
+func @memref_in_function_results(
+  %arg0: memref<5xf32>,
+  %arg1: memref<10xf32>,
+  %arg2: memref<5xf32>) -> (memref<10xf32>, memref<15xf32>) {
+  %x = alloc() : memref<15xf32>
+  %y = alloc() : memref<5xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+    ins(%arg0: memref<5xf32>)
+   outs(%y: memref<5xf32>) {
+  ^bb0(%arg3: f32, %arg4: f32):
+    %2 = exp %arg3 : f32
+    linalg.yield %2 : f32
+  }
+  linalg.copy(%y, %arg2) : memref<5xf32>, memref<5xf32>
+  return %arg1, %x : memref<10xf32>, memref<15xf32>
+}
+// CHECK: (%[[ARG0:.*]]: memref<5xf32>, %[[ARG1:.*]]: memref<10xf32>,
+// CHECK-SAME: %[[RESULT:.*]]: memref<5xf32>)
+// CHECK: %[[ALLOC:.*]] = alloc()
+// CHECK: %[[ALLOCA:.*]] = alloca()
+// CHECK: linalg.copy
+// CHECK: return %[[ARG1]], %[[ALLOC]]
+
+// -----
+
+// Test Case: nested region control flow
+// The allocation in the nested if branch cannot be converted to an alloca
+// due to its dynamic memory allocation behavior.
+
+// CHECK-LABEL: func @nested_region_control_flow
+func @nested_region_control_flow(
+  %arg0 : index,
+  %arg1 : index) -> memref<?x?xf32> {
+  %0 = cmpi "eq", %arg0, %arg1 : index
+  %1 = alloc(%arg0, %arg0) : memref<?x?xf32>
+  %2 = scf.if %0 -> (memref<?x?xf32>) {
+    scf.yield %1 : memref<?x?xf32>
+  } else {
+    %3 = alloc(%arg0, %arg1) : memref<?x?xf32>
+    scf.yield %1 : memref<?x?xf32>
+  }
+  return %2 : memref<?x?xf32>
+}
+
+// CHECK: %[[ALLOC0:.*]] = alloc(%arg0, %arg0)
+// CHECK-NEXT: %[[ALLOC1:.*]] = scf.if
+// CHECK: scf.yield %[[ALLOC0]]
+// CHECK: %[[ALLOC2:.*]] = alloc(%arg0, %arg1)
+// CHECK-NEXT: scf.yield %[[ALLOC0]]
+// CHECK: return %[[ALLOC1]]
+
+// -----
+
+// Test Case: nested region control flow within a region interface.
+// The alloc %0 is not converted in this case since the allocation finally
+// escapes the function.
+
+// CHECK-LABEL: func @inner_region_control_flow
+func @inner_region_control_flow(%arg0 : index) -> memref<2x2xf32> {
+  %0 = alloc() : memref<2x2xf32>
+  %1 = test.region_if %0 : memref<2x2xf32> -> (memref<2x2xf32>) then {
+  ^bb0(%arg1 : memref<2x2xf32>):
+    test.region_if_yield %arg1 : memref<2x2xf32>
+  } else {
+  ^bb0(%arg1 : memref<2x2xf32>):
+    test.region_if_yield %arg1 : memref<2x2xf32>
+  } join {
+  ^bb0(%arg1 : memref<2x2xf32>):
+    test.region_if_yield %arg1 : memref<2x2xf32>
+  }
+  return %1 : memref<2x2xf32>
+}
+
+// CHECK: %[[ALLOC0:.*]] = alloc()
+// CHECK-NEXT: %[[ALLOC1:.*]] = test.region_if
+// CHECK-NEXT: ^bb0(%[[ALLOC2:.*]]:{{.*}}):
+// CHECK-NEXT: test.region_if_yield %[[ALLOC2]]
+// CHECK: ^bb0(%[[ALLOC3:.*]]:{{.*}}):
+// CHECK-NEXT: test.region_if_yield %[[ALLOC3]]
+// CHECK: ^bb0(%[[ALLOC4:.*]]:{{.*}}):
+// CHECK-NEXT: test.region_if_yield %[[ALLOC4]]
+// CHECK: return %[[ALLOC1]]
+
+// -----
+
+// Test Case: structured control-flow loop using a nested alloc.
+// The alloc %3 will be moved upwards and both allocs are converted to
+// allocas.
+
+// CHECK-LABEL: func @loop_alloc
+func @loop_alloc(
+  %lb: index,
+  %ub: index,
+  %step: index,
+  %buf: memref<2xf32>,
+  %res: memref<2xf32>) {
+  %0 = alloc() : memref<2xf32>
+  %1 = scf.for %i = %lb to %ub step %step
+    iter_args(%iterBuf = %buf) -> memref<2xf32> {
+    %2 = cmpi "eq", %i, %ub : index
+    %3 = alloc() : memref<2xf32>
+    scf.yield %3 : memref<2xf32>
+  }
+  "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+// CHECK-NEXT: %[[ALLOCA0:.*]] = alloca()
+// CHECK-NEXT: %[[ALLOCA1:.*]] = alloca()
+
+// -----
+
+// Test Case: structured control-flow loop with a nested if operation.
+// The loop yields buffers that have been defined outside of the loop and the
+// backedges only use the iteration arguments (or one of their aliases).
+// Therefore, we do not have to (and are not allowed to) free any buffers
+// that are passed via the backedges. The alloc is converted to an AllocaOp.
+
+// CHECK-LABEL: func @loop_nested_if_no_alloc
+func @loop_nested_if_no_alloc(
+  %lb: index,
+  %ub: index,
+  %step: index,
+  %buf: memref<2xf32>,
+  %res: memref<2xf32>) {
+  %0 = alloc() : memref<2xf32>
+  %1 = scf.for %i = %lb to %ub step %step
+    iter_args(%iterBuf = %buf) -> memref<2xf32> {
+    %2 = cmpi "eq", %i, %ub : index
+    %3 = scf.if %2 -> (memref<2xf32>) {
+      scf.yield %0 : memref<2xf32>
+    } else {
+      scf.yield %iterBuf : memref<2xf32>
+    }
+    scf.yield %3 : memref<2xf32>
+  }
+  "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+// CHECK: %[[ALLOCA0:.*]] = alloca()
+// CHECK-NEXT: %[[ALLOCA1:.*]] = scf.for {{.*}} iter_args(%[[IALLOCA:.*]] =
+// CHECK: %[[ALLOCA2:.*]] = scf.if
+// CHECK: scf.yield %[[ALLOCA0]]
+// CHECK: scf.yield %[[IALLOCA]]
+// CHECK: scf.yield %[[ALLOCA2]]
+// CHECK: linalg.copy(%[[ALLOCA1]], %arg4)
+
+// -----
+
+// Test Case: structured control-flow loop with a nested if operation using
+// a deeply nested buffer allocation.
+// The allocs are not converted in this case.
+
+// CHECK-LABEL: func @loop_nested_if_alloc
+func @loop_nested_if_alloc(
+  %lb: index,
+  %ub: index,
+  %step: index,
+  %buf: memref<2xf32>) -> memref<2xf32> {
+  %0 = alloc() : memref<2xf32>
+  %1 = scf.for %i = %lb to %ub step %step
+    iter_args(%iterBuf = %buf) -> memref<2xf32> {
+    %2 = cmpi "eq", %i, %ub : index
+    %3 = scf.if %2 -> (memref<2xf32>) {
+      %4 = alloc() : memref<2xf32>
+      scf.yield %4 : memref<2xf32>
+    } else {
+      scf.yield %0 : memref<2xf32>
+    }
+    scf.yield %3 : memref<2xf32>
+  }
+  return %1 : memref<2xf32>
+}
+
+// CHECK: %[[ALLOC0:.*]] = alloc()
+// CHECK-NEXT: %[[ALLOC1:.*]] = scf.for {{.*}}
+// CHECK: %[[ALLOC2:.*]] = scf.if
+// CHECK: %[[ALLOC3:.*]] = alloc()
+// CHECK-NEXT: scf.yield %[[ALLOC3]]
+// CHECK: scf.yield %[[ALLOC0]]
+// CHECK: scf.yield %[[ALLOC2]]
+// CHECK: return %[[ALLOC1]]
+
+// -----
+
+// Test Case: several nested structured control-flow loops with a deeply nested
+// buffer allocation inside an if operation.
+// The allocs are moved upwards and all converted to allocas.
+ +// CHECK-LABEL: func @loop_nested_alloc +func @loop_nested_alloc( + %lb: index, + %ub: index, + %step: index, + %buf: memref<2xf32>, + %res: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + %1 = scf.for %i = %lb to %ub step %step + iter_args(%iterBuf = %buf) -> memref<2xf32> { + %2 = scf.for %i2 = %lb to %ub step %step + iter_args(%iterBuf2 = %iterBuf) -> memref<2xf32> { + %3 = scf.for %i3 = %lb to %ub step %step + iter_args(%iterBuf3 = %iterBuf2) -> memref<2xf32> { + %4 = alloc() : memref<2xf32> + %5 = cmpi "eq", %i, %ub : index + %6 = scf.if %5 -> (memref<2xf32>) { + %7 = alloc() : memref<2xf32> + scf.yield %7 : memref<2xf32> + } else { + scf.yield %iterBuf3 : memref<2xf32> + } + scf.yield %6 : memref<2xf32> + } + scf.yield %3 : memref<2xf32> + } + scf.yield %2 : memref<2xf32> + } + "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK: %[[ALLOCA0:.*]] = alloca() +// CHECK-NEXT: %[[ALLOCA1:.*]] = alloca() +// CHECK-NEXT: %[[ALLOCA2:.*]] = alloca() +// CHECK-NEXT: %[[ALLOCA3:.*]] = scf.for {{.*}} +// CHECK-NEXT: %[[ALLOCA4:.*]] = scf.for {{.*}} +// CHECK-NEXT: %[[ALLOCA5:.*]] = scf.for {{.*}} +// CHECK: %[[ALLOCA6:.*]] = scf.if +// CHECK-NEXT: scf.yield %[[ALLOCA2]] +// CHECK: scf.yield +// CHECK: scf.yield %[[ALLOCA6]] +// CHECK: scf.yield %[[ALLOCA5]] +// CHECK: scf.yield %[[ALLOCA4]] +// CHECK: linalg.copy(%[[ALLOCA3]], %arg4) + +// ----- + +// Test Case: The allocated buffer is too large and, hence, it is not +// converted. In the actual implementation the largest size is 1KB. + +// CHECK-LABEL: func @large_buffer_allocation +func @large_buffer_allocation(%arg0: memref<2048xf32>) { + %0 = alloc() : memref<2048xf32> + "linalg.copy"(%0, %arg0) : (memref<2048xf32>, memref<2048xf32>) -> () + return +} + +// CHECK-NEXT: %[[ALLOC:.*]] = alloc() +// CHECK-NEXT: linalg.copy
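For completeness, here is a minimal sketch of driving the new pass from C++ instead of the mlir-opt RUN line exercised by the test above. It is illustrative only: it assumes the usual PassManager API of this era, the header locations and the FuncOp/ModuleOp spellings vary across MLIR revisions, and promoteBuffersInModule is a hypothetical helper name, not part of this patch.

// Hypothetical driver sketch: run buffer promotion on every function in a
// module. Include paths follow the MLIR layout of this era and may differ in
// other revisions.
#include "mlir/IR/Function.h"
#include "mlir/IR/Module.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Support/LogicalResult.h"
#include "mlir/Transforms/Passes.h"

static mlir::LogicalResult promoteBuffersInModule(mlir::ModuleOp module) {
  mlir::PassManager pm(module.getContext());
  // PromoteBuffersToStack is a FunctionPass, so nest it on the function op.
  pm.addNestedPass<mlir::FuncOp>(mlir::createPromoteBuffersToStackPass());
  return pm.run(module);
}

This mirrors what the RUN line in the test file does from the command line via -promote-buffers-to-stack.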