diff --git a/mlir/include/mlir/Transforms/Passes.h b/mlir/include/mlir/Transforms/Passes.h
--- a/mlir/include/mlir/Transforms/Passes.h
+++ b/mlir/include/mlir/Transforms/Passes.h
@@ -40,6 +40,9 @@
 /// reallocations inside of loops.
 std::unique_ptr<Pass> createBufferLoopHoistingPass();
 
+/// Creates a pass that promotes heap-based allocations to stack-based ones.
+std::unique_ptr<Pass> createPromoteBuffersToStackPass();
+
 /// Creates an instance of the Canonicalizer pass.
 std::unique_ptr<Pass> createCanonicalizerPass();
 
diff --git a/mlir/include/mlir/Transforms/Passes.td b/mlir/include/mlir/Transforms/Passes.td
--- a/mlir/include/mlir/Transforms/Passes.td
+++ b/mlir/include/mlir/Transforms/Passes.td
@@ -201,6 +201,17 @@
   let constructor = "mlir::createBufferLoopHoistingPass()";
 }
 
+def PromoteBuffersToStack : FunctionPass<"promote-buffers-to-stack"> {
+  let summary = "Promotes heap-based allocations to automatically managed "
+                "stack-based allocations";
+  let description = [{
+    This pass implements a simple algorithm to convert heap-based memory
+    allocations to stack-based ones. It uses a built-in heuristic to decide
+    whether it makes sense to convert an allocation.
+  }];
+  let constructor = "mlir::createPromoteBuffersToStackPass()";
+}
+
 def Canonicalizer : Pass<"canonicalize"> {
   let summary = "Canonicalize operations";
   let description = [{
diff --git a/mlir/lib/Transforms/BufferOptimizations.cpp b/mlir/lib/Transforms/BufferOptimizations.cpp
--- a/mlir/lib/Transforms/BufferOptimizations.cpp
+++ b/mlir/lib/Transforms/BufferOptimizations.cpp
@@ -6,9 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements logic for two optimization passes. These passes try to
-// hoist alloc nodes to reduce the number of allocations and copies during
-// buffer deallocation.
+// This file implements logic for three optimization passes. The first two
+// passes try to move alloc nodes out of blocks to reduce the number of
+// allocations and copies during buffer deallocation. The third pass tries to
+// convert heap-based allocations to stack-based allocations, if possible.
 
 #include "PassDetail.h"
 #include "mlir/IR/Operation.h"
@@ -21,6 +22,70 @@
 
 namespace {
 
+//===----------------------------------------------------------------------===//
+// BufferPlacementAllocationScope
+//===----------------------------------------------------------------------===//
+
+/// Models the scope of a given allocation node and checks whether it is
+/// possible to convert this alloc node into an alloca. If the scope satisfies
+/// several constraints, the transformation is performed.
+class BufferPlacementAllocationScope {
+public:
+  /// Constructs a new buffer placement allocation scope.
+  BufferPlacementAllocationScope(const BufferPlacementAllocs::AllocEntry &entry)
+      : alloc(std::get<0>(entry)), deallocOp(std::get<1>(entry)),
+        parent(nullptr) {
+    auto region = alloc.getParentRegion();
+    do {
+      if (auto parentOp = region->getParentOp())
+        if (parentOp->hasTrait<OpTrait::AutomaticAllocationScope>()) {
+          parent = &region->front();
+          break;
+        }
+    } while ((region = region->getParentRegion()));
+  }
+
+  /// Returns the parent block.
+  Block *getParent() const { return parent; }
+
+  /// Returns the parent region.
+  Region *getParentRegion() const { return parent->getParent(); }
+
+  /// Checks whether the size of the allocation is less than 1 KB. The
+  /// transformation is only applied to small buffers since large buffers could
+  /// exceed the stack space.
+  bool isSmallAlloc() const {
+    auto type = alloc.getType().dyn_cast<ShapedType>();
+    if (!type || !type.hasStaticShape())
+      return false;
+    return type.getSizeInBits() < 1024 * 8;
+  }
+
+  /// Checks whether the given aliases leave the allocation scope.
+  bool leavesAllocationScope(
+      const BufferPlacementAliasAnalysis::ValueSetT &aliases) const {
+    for (Value alias : aliases) {
+      for (auto *use : alias.getUsers()) {
+        if (use->hasTrait<OpTrait::ReturnLike>() &&
+            use->getParentRegion() == getParentRegion())
+          return true;
+      }
+    }
+    return false;
+  }
+
+  /// Returns true if the given alloc node can be converted into an alloca.
+  bool canConvertToAlloca(
+      const BufferPlacementAliasAnalysis::ValueSetT &aliases) const {
+    return isSmallAlloc() && !deallocOp && !leavesAllocationScope(aliases);
+  }
+
+private:
+  Value alloc;
+  Operation *deallocOp;
+  Block *parent;
+};
+
 //===----------------------------------------------------------------------===//
 // BufferAllocationHoisting
 //===----------------------------------------------------------------------===//
@@ -220,6 +285,46 @@
   void recordMoveToParent(Block *block) { placementBlock = block; }
 };
 
+//===----------------------------------------------------------------------===//
+// BufferPlacementPromotion
+//===----------------------------------------------------------------------===//
+
+/// Promotes heap-based allocations to stack-based allocations (if possible).
+class BufferPlacementPromotion : BufferPlacementTransformationBase {
+public:
+  BufferPlacementPromotion(Operation *op)
+      : BufferPlacementTransformationBase(op) {}
+
+  /// Promote buffers to stack-based allocations.
+  void promote() {
+    for (BufferPlacementAllocs::AllocEntry &entry : allocs) {
+      // Avoid dynamically allocated buffers.
+      Value alloc = std::get<0>(entry);
+      if (alloc.getDefiningOp()->getOperands().size() > 0)
+        continue;
+      // Use the AllocationScope analysis to test whether we can convert the
+      // current entry into a stack-based allocation node.
+      BufferPlacementAllocationScope allocationScope(entry);
+      auto resultAliases = aliases.resolve(alloc);
+      if (!allocationScope.canConvertToAlloca(resultAliases))
+        continue;
+      Operation *startOperation = BufferPlacementAllocs::getStartOperation(
+          alloc, allocationScope.getParent(), liveness);
+
+      // Build a new alloca that is associated with its parent
+      // `AutomaticAllocationScope` determined during the initialization phase.
+      OpBuilder builder(startOperation);
+      auto alloca = builder.create<AllocaOp>(
+          alloc.getLoc(), alloc.getType().cast<MemRefType>());
+
+      // Replace the original alloc by a newly created alloca.
+      Operation *allocOp = alloc.getDefiningOp();
+      allocOp->replaceAllUsesWith(alloca.getOperation());
+      allocOp->erase();
+    }
+  }
+};
+
 //===----------------------------------------------------------------------===//
 // BufferOptimizationPasses
 //===----------------------------------------------------------------------===//
@@ -247,6 +352,18 @@
   }
 };
 
+/// The promote buffer to stack pass that tries to convert alloc nodes into
+/// alloca nodes.
+struct PromoteBuffersToStackPass
+    : PromoteBuffersToStackBase<PromoteBuffersToStackPass> {
+
+  void runOnFunction() override {
+    // Move all allocation nodes and convert candidates into allocas.
+    BufferPlacementPromotion optimizer(getFunction());
+    optimizer.promote();
+  }
+};
+
 } // end anonymous namespace
 
 std::unique_ptr<Pass> mlir::createBufferHoistingPass() {
@@ -256,3 +373,7 @@
 std::unique_ptr<Pass> mlir::createBufferLoopHoistingPass() {
   return std::make_unique<BufferLoopHoistingPass>();
 }
+
+std::unique_ptr<Pass> mlir::createPromoteBuffersToStackPass() {
+  return std::make_unique<PromoteBuffersToStackPass>();
+}
diff --git a/mlir/test/Transforms/promote-buffers-to-stack.mlir b/mlir/test/Transforms/promote-buffers-to-stack.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Transforms/promote-buffers-to-stack.mlir
@@ -0,0 +1,714 @@
+// RUN: mlir-opt -promote-buffers-to-stack -split-input-file %s | FileCheck %s
+
+// This file checks the behavior of the PromoteBuffersToStack pass for
+// converting AllocOps into AllocaOps, if possible.
+
+// Test Case:
+//    bb0
+//   /   \
+//  bb1  bb2 <- Initial position of AllocOp
+//   \   /
+//    bb3
+// PromoteBuffersToStack expected behavior: It should move the existing AllocOp
+// to the entry block and convert it into an AllocaOp.
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @condBranch
+func @condBranch(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
+  cond_br %arg0, ^bb1, ^bb2
+^bb1:
+  br ^bb3(%arg1 : memref<2xf32>)
+^bb2:
+  %0 = alloc() : memref<2xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+    ins(%arg1: memref<2xf32>)
+   outs(%0: memref<2xf32>) {
+  ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
+    %tmp1 = exp %gen1_arg0 : f32
+    linalg.yield %tmp1 : f32
+  }
+  br ^bb3(%0 : memref<2xf32>)
+^bb3(%1: memref<2xf32>):
+  "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+// CHECK-NEXT: %[[ALLOCA:.*]] = alloca()
+// CHECK-NEXT: cond_br
+// CHECK: linalg.copy
+// CHECK-NEXT: return
+
+// -----
+
+// Test Case:
+//    bb0
+//   /   \
+//  bb1  bb2 <- Initial position of AllocOp
+//   \   /
+//    bb3
+// PromoteBuffersToStack expected behavior: It should not move the existing
+// AllocOp to any other block since the alloc has a dynamic dependency on block
+// argument %0 in bb2. Since the type is dynamic, it is not converted into an
+// alloca.
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @condBranchDynamicType
+func @condBranchDynamicType(
+  %arg0: i1,
+  %arg1: memref<?xf32>,
+  %arg2: memref<?xf32>,
+  %arg3: index) {
+  cond_br %arg0, ^bb1, ^bb2(%arg3: index)
+^bb1:
+  br ^bb3(%arg1 : memref<?xf32>)
+^bb2(%0: index):
+  %1 = alloc(%0) : memref<?xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+    ins(%arg1: memref<?xf32>)
+   outs(%1: memref<?xf32>) {
+  ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
+    %tmp1 = exp %gen1_arg0 : f32
+    linalg.yield %tmp1 : f32
+  }
+  br ^bb3(%1 : memref<?xf32>)
+^bb3(%2: memref<?xf32>):
+  "linalg.copy"(%2, %arg2) : (memref<?xf32>, memref<?xf32>) -> ()
+  return
+}
+
+// CHECK-NEXT: cond_br
+// CHECK: ^bb2
+// CHECK: ^bb2(%[[IDX:.*]]:{{.*}})
+// CHECK-NEXT: %[[ALLOC0:.*]] = alloc(%[[IDX]])
+// CHECK-NEXT: linalg.generic
+// CHECK: br ^bb3
+// CHECK-NEXT: ^bb3(%[[ALLOC0:.*]]:{{.*}})
+// CHECK: linalg.copy(%[[ALLOC0]],
+// CHECK-NEXT: return
+
+// -----
+
+// Test Case: Existing AllocOp with no users.
+// PromoteBuffersToStack expected behavior: It should convert it to an
+// AllocaOp.
+
+// CHECK-LABEL: func @emptyUsesValue
+func @emptyUsesValue(%arg0: memref<4xf32>) {
+  %0 = alloc() : memref<4xf32>
+  return
+}
+// CHECK-NEXT: %[[ALLOCA:.*]] = alloca()
+// CHECK-NEXT: return
+
+// -----
+
+// Test Case:
+//    bb0
+//   /   \
+//  |    bb1 <- Initial position of AllocOp
+//   \   /
+//    bb2
+// PromoteBuffersToStack expected behavior: It should move the existing AllocOp
+// to the entry block and convert it to an AllocaOp.
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @criticalEdge
+func @criticalEdge(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
+  cond_br %arg0, ^bb1, ^bb2(%arg1 : memref<2xf32>)
+^bb1:
+  %0 = alloc() : memref<2xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+    ins(%arg1: memref<2xf32>)
+   outs(%0: memref<2xf32>) {
+  ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
+    %tmp1 = exp %gen1_arg0 : f32
+    linalg.yield %tmp1 : f32
+  }
+  br ^bb2(%0 : memref<2xf32>)
+^bb2(%1: memref<2xf32>):
+  "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+// CHECK-NEXT: %[[ALLOCA:.*]] = alloca()
+// CHECK-NEXT: cond_br
+// CHECK: linalg.copy
+// CHECK-NEXT: return
+
+// -----
+
+// Test Case:
+//    bb0 <- Initial position of AllocOp
+//   /   \
+//  |    bb1
+//   \   /
+//    bb2
+// PromoteBuffersToStack expected behavior: It shouldn't move the alloc
+// position. It only converts it into an alloca.
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @invCriticalEdge
+func @invCriticalEdge(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
+  %0 = alloc() : memref<2xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+    ins(%arg1: memref<2xf32>)
+   outs(%0: memref<2xf32>) {
+  ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
+    %tmp1 = exp %gen1_arg0 : f32
+    linalg.yield %tmp1 : f32
+  }
+  cond_br %arg0, ^bb1, ^bb2(%arg1 : memref<2xf32>)
+^bb1:
+  br ^bb2(%0 : memref<2xf32>)
+^bb2(%1: memref<2xf32>):
+  "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+// CHECK-NEXT: %[[ALLOCA:.*]] = alloca()
+// CHECK: cond_br
+// CHECK: linalg.copy
+// CHECK-NEXT: return
+
+// -----
+
+// Test Case:
+//    bb0 <- Initial position of the first AllocOp
+//   /   \
+//  bb1  bb2
+//   \   /
+//    bb3 <- Initial position of the second AllocOp
+// PromoteBuffersToStack expected behavior: It shouldn't move the AllocOps. It
+// only converts them into allocas.
+ +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @ifElse +func @ifElse(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + cond_br %arg0, + ^bb1(%arg1, %0 : memref<2xf32>, memref<2xf32>), + ^bb2(%0, %arg1 : memref<2xf32>, memref<2xf32>) +^bb1(%1: memref<2xf32>, %2: memref<2xf32>): + br ^bb3(%1, %2 : memref<2xf32>, memref<2xf32>) +^bb2(%3: memref<2xf32>, %4: memref<2xf32>): + br ^bb3(%3, %4 : memref<2xf32>, memref<2xf32>) +^bb3(%5: memref<2xf32>, %6: memref<2xf32>): + %7 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%5: memref<2xf32>) + outs(%7: memref<2xf32>) { + ^bb0(%gen2_arg0: f32, %gen2_arg1: f32): + %tmp2 = exp %gen2_arg0 : f32 + linalg.yield %tmp2 : f32 + } + "linalg.copy"(%7, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK-NEXT: %[[ALLOCA0:.*]] = alloca() +// CHECK-NEXT: linalg.generic +// CHECK: %[[ALLOCA1:.*]] = alloca() +// CHECK: linalg.generic +// CHECK: linalg.copy(%[[ALLOCA1]] +// CHECK-NEXT: return + +// ----- + +// Test Case: No users for buffer in if-else CFG +// bb0 <- Initial position of AllocOp +// / \ +// bb1 bb2 +// \ / +// bb3 +// PromoteBuffersToStack expected behavior: It shouldn't move the AllocOp. It +// only converts to alloca. + +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @ifElseNoUsers +func @ifElseNoUsers(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + cond_br %arg0, + ^bb1(%arg1, %0 : memref<2xf32>, memref<2xf32>), + ^bb2(%0, %arg1 : memref<2xf32>, memref<2xf32>) +^bb1(%1: memref<2xf32>, %2: memref<2xf32>): + br ^bb3(%1, %2 : memref<2xf32>, memref<2xf32>) +^bb2(%3: memref<2xf32>, %4: memref<2xf32>): + br ^bb3(%3, %4 : memref<2xf32>, memref<2xf32>) +^bb3(%5: memref<2xf32>, %6: memref<2xf32>): + "linalg.copy"(%arg1, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK-NEXT: %[[ALLOCA:.*]] = alloca() +// CHECK: return + +// ----- + +// Test Case: +// bb0 <- Initial position of the first AllocOp +// / \ +// bb1 bb2 +// | / \ +// | bb3 bb4 +// \ \ / +// \ / +// bb5 <- Initial position of the second AllocOp +// PromoteBuffersToStack expected behavior: AllocOps shouldn't be moved. +// The two allocs should be converted to allocas. 
+ +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @ifElseNested +func @ifElseNested(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + cond_br %arg0, + ^bb1(%arg1, %0 : memref<2xf32>, memref<2xf32>), + ^bb2(%0, %arg1 : memref<2xf32>, memref<2xf32>) +^bb1(%1: memref<2xf32>, %2: memref<2xf32>): + br ^bb5(%1, %2 : memref<2xf32>, memref<2xf32>) +^bb2(%3: memref<2xf32>, %4: memref<2xf32>): + cond_br %arg0, ^bb3(%3 : memref<2xf32>), ^bb4(%4 : memref<2xf32>) +^bb3(%5: memref<2xf32>): + br ^bb5(%5, %3 : memref<2xf32>, memref<2xf32>) +^bb4(%6: memref<2xf32>): + br ^bb5(%3, %6 : memref<2xf32>, memref<2xf32>) +^bb5(%7: memref<2xf32>, %8: memref<2xf32>): + %9 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%7: memref<2xf32>) + outs(%9: memref<2xf32>) { + ^bb0(%gen2_arg0: f32, %gen2_arg1: f32): + %tmp2 = exp %gen2_arg0 : f32 + linalg.yield %tmp2 : f32 + } + "linalg.copy"(%9, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK-NEXT: %[[ALLOCA0:.*]] = alloca() +// CHECK-NEXT: linalg.generic +// CHECK: %[[ALLOCA1:.*]] = alloca() +// CHECK: linalg.generic +// CHECK: linalg.copy(%[[ALLOCA1]] +// CHECK-NEXT: return + +// ----- + +// Test Case: Dead operations in a single block. +// PromoteBuffersToStack expected behavior: It shouldn't move the AllocOps. It +// only converts the two AllocOps to allocas. + +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @redundantOperations +func @redundantOperations(%arg0: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg0: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + %1 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%0: memref<2xf32>) + outs(%1: memref<2xf32>) { + ^bb0(%gen2_arg0: f32, %gen2_arg1: f32): + %tmp2 = exp %gen2_arg0 : f32 + linalg.yield %tmp2 : f32 + } + return +} + +// CHECK: (%[[ARG0:.*]]: {{.*}}) +// CHECK-NEXT: %[[ALLOCA0:.*]] = alloca() +// CHECK-NEXT: linalg.generic {{{.*}}} ins(%[[ARG0]]{{.*}} outs(%[[ALLOCA0]] +// CHECK: %[[ALLOCA1:.*]] = alloca() +// CHECK-NEXT: linalg.generic {{{.*}}} ins(%[[ALLOCA0]]{{.*}} outs(%[[ALLOCA1]] +// CHECK: return + +// ----- + +// Test Case: +// bb0 +// / \ +// Initial pos of the 1st AllocOp -> bb1 bb2 <- Initial pos of the 2nd AllocOp +// \ / +// bb3 +// PromoteBuffersToStack expected behavior: Both AllocOps should be moved to the +// entry block and converted to allocas, so there is no need for deallocations. 
+ +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @moving_alloc_and_inserting_missing_dealloc +func @moving_alloc_and_inserting_missing_dealloc( + %cond: i1, + %arg0: memref<2xf32>, + %arg1: memref<2xf32>) { + cond_br %cond, ^bb1, ^bb2 +^bb1: + %0 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg0: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + br ^exit(%0 : memref<2xf32>) +^bb2: + %1 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg0: memref<2xf32>) + outs(%1: memref<2xf32>) { + ^bb0(%gen2_arg0: f32, %gen2_arg1: f32): + %tmp2 = exp %gen2_arg0 : f32 + linalg.yield %tmp2 : f32 + } + br ^exit(%1 : memref<2xf32>) +^exit(%arg2: memref<2xf32>): + "linalg.copy"(%arg2, %arg1) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK-NEXT: %{{.*}} = alloca() +// CHECK-NEXT: %{{.*}} = alloca() +// CHECK-NEXT: cond_br +// CHECK: linalg.copy +// CHECK-NEXT: return + +// ----- + +// Test Case: Nested regions - This test defines a GenericOp inside the region +// of another GenericOp. +// PromoteBuffersToStack expected behavior: The AllocOps are moved upwards and +// then converted to allocas. + +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @nested_regions_and_cond_branch +func @nested_regions_and_cond_branch( + %arg0: i1, + %arg1: memref<2xf32>, + %arg2: memref<2xf32>) { + cond_br %arg0, ^bb1, ^bb2 +^bb1: + br ^bb3(%arg1 : memref<2xf32>) +^bb2: + %0 = alloc() : memref<2xf32> + linalg.generic { + indexing_maps = [#map0, #map0], + iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %1 = alloc() : memref<2xf32> + linalg.generic { + indexing_maps = [#map0, #map0], + iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%1: memref<2xf32>) { + ^bb0(%gen2_arg0: f32, %gen2_arg1: f32): + %tmp2 = exp %gen2_arg0 : f32 + linalg.yield %tmp2 : f32 + } + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + br ^bb3(%0 : memref<2xf32>) +^bb3(%1: memref<2xf32>): + "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + return +} +// CHECK-NEXT: %[[ALLOCA0:.*]] = alloca() +// CHECK-NEXT: %[[ALLOCA1:.*]] = alloca() +// CHECK-NEXT: cond_br + +// ----- + +// Test Case: buffer deallocation escaping +// PromoteBuffersToStack expected behavior: The first alloc is returned, so +// there is no conversion allowed. The second alloc is converted, since it +// only remains in the scope of the function. 
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @memref_in_function_results
+func @memref_in_function_results(
+  %arg0: memref<5xf32>,
+  %arg1: memref<10xf32>,
+  %arg2: memref<5xf32>) -> (memref<10xf32>, memref<15xf32>) {
+  %x = alloc() : memref<15xf32>
+  %y = alloc() : memref<5xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+    ins(%arg0: memref<5xf32>)
+   outs(%y: memref<5xf32>) {
+  ^bb0(%arg3: f32, %arg4: f32):
+    %2 = exp %arg3 : f32
+    linalg.yield %2 : f32
+  }
+  linalg.copy(%y, %arg2) : memref<5xf32>, memref<5xf32>
+  return %arg1, %x : memref<10xf32>, memref<15xf32>
+}
+// CHECK: (%[[ARG0:.*]]: memref<5xf32>, %[[ARG1:.*]]: memref<10xf32>,
+// CHECK-SAME: %[[RESULT:.*]]: memref<5xf32>)
+// CHECK: %[[ALLOC:.*]] = alloc()
+// CHECK: %[[ALLOCA:.*]] = alloca()
+// CHECK: linalg.copy
+// CHECK: return %[[ARG1]], %[[ALLOC]]
+
+// -----
+
+// Test Case: nested region control flow
+// The allocation in the nested if branch cannot be converted to an alloca
+// due to its dynamic memory allocation behavior.
+
+// CHECK-LABEL: func @nested_region_control_flow
+func @nested_region_control_flow(
+  %arg0 : index,
+  %arg1 : index) -> memref<?x?xf32> {
+  %0 = cmpi "eq", %arg0, %arg1 : index
+  %1 = alloc(%arg0, %arg0) : memref<?x?xf32>
+  %2 = scf.if %0 -> (memref<?x?xf32>) {
+    scf.yield %1 : memref<?x?xf32>
+  } else {
+    %3 = alloc(%arg0, %arg1) : memref<?x?xf32>
+    scf.yield %1 : memref<?x?xf32>
+  }
+  return %2 : memref<?x?xf32>
+}
+
+// CHECK: %[[ALLOC0:.*]] = alloc(%arg0, %arg0)
+// CHECK-NEXT: %[[ALLOC1:.*]] = scf.if
+// CHECK: scf.yield %[[ALLOC0]]
+// CHECK: %[[ALLOC2:.*]] = alloc(%arg0, %arg1)
+// CHECK-NEXT: scf.yield %[[ALLOC0]]
+// CHECK: return %[[ALLOC1]]
+
+// -----
+
+// Test Case: nested region control flow within a region interface.
+// The alloc %0 is not converted in this case since the allocation finally
+// escapes the function.
+
+// CHECK-LABEL: func @inner_region_control_flow
+func @inner_region_control_flow(%arg0 : index) -> memref<2x2xf32> {
+  %0 = alloc() : memref<2x2xf32>
+  %1 = test.region_if %0 : memref<2x2xf32> -> (memref<2x2xf32>) then {
+  ^bb0(%arg1 : memref<2x2xf32>):
+    test.region_if_yield %arg1 : memref<2x2xf32>
+  } else {
+  ^bb0(%arg1 : memref<2x2xf32>):
+    test.region_if_yield %arg1 : memref<2x2xf32>
+  } join {
+  ^bb0(%arg1 : memref<2x2xf32>):
+    test.region_if_yield %arg1 : memref<2x2xf32>
+  }
+  return %1 : memref<2x2xf32>
+}
+
+// CHECK: %[[ALLOC0:.*]] = alloc()
+// CHECK-NEXT: %[[ALLOC1:.*]] = test.region_if
+// CHECK-NEXT: ^bb0(%[[ALLOC2:.*]]:{{.*}}):
+// CHECK-NEXT: test.region_if_yield %[[ALLOC2]]
+// CHECK: ^bb0(%[[ALLOC3:.*]]:{{.*}}):
+// CHECK-NEXT: test.region_if_yield %[[ALLOC3]]
+// CHECK: ^bb0(%[[ALLOC4:.*]]:{{.*}}):
+// CHECK-NEXT: test.region_if_yield %[[ALLOC4]]
+// CHECK: return %[[ALLOC1]]
+
+// -----
+
+// Test Case: structured control-flow loop using a nested alloc.
+// The alloc %3 will be moved upwards and both allocs are converted to
+// allocas.
+
+// CHECK-LABEL: func @loop_alloc
+func @loop_alloc(
+  %lb: index,
+  %ub: index,
+  %step: index,
+  %buf: memref<2xf32>,
+  %res: memref<2xf32>) {
+  %0 = alloc() : memref<2xf32>
+  %1 = scf.for %i = %lb to %ub step %step
+    iter_args(%iterBuf = %buf) -> memref<2xf32> {
+    %2 = cmpi "eq", %i, %ub : index
+    %3 = alloc() : memref<2xf32>
+    scf.yield %3 : memref<2xf32>
+  }
+  "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+// CHECK-NEXT: %[[ALLOCA0:.*]] = alloca()
+// CHECK-NEXT: %[[ALLOCA1:.*]] = alloca()
+
+// -----
+
+// Test Case: structured control-flow loop with a nested if operation.
+// The loop yields buffers that have been defined outside of the loop and the
+// backedges only use the iteration arguments (or one of their aliases).
+// Therefore, we do not have to (and are not allowed to) free any buffers
+// that are passed via the backedges. The alloc is converted to an AllocaOp.
+
+// CHECK-LABEL: func @loop_nested_if_no_alloc
+func @loop_nested_if_no_alloc(
+  %lb: index,
+  %ub: index,
+  %step: index,
+  %buf: memref<2xf32>,
+  %res: memref<2xf32>) {
+  %0 = alloc() : memref<2xf32>
+  %1 = scf.for %i = %lb to %ub step %step
+    iter_args(%iterBuf = %buf) -> memref<2xf32> {
+    %2 = cmpi "eq", %i, %ub : index
+    %3 = scf.if %2 -> (memref<2xf32>) {
+      scf.yield %0 : memref<2xf32>
+    } else {
+      scf.yield %iterBuf : memref<2xf32>
+    }
+    scf.yield %3 : memref<2xf32>
+  }
+  "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+// CHECK: %[[ALLOCA0:.*]] = alloca()
+// CHECK-NEXT: %[[ALLOCA1:.*]] = scf.for {{.*}} iter_args(%[[IALLOCA:.*]] =
+// CHECK: %[[ALLOCA2:.*]] = scf.if
+// CHECK: scf.yield %[[ALLOCA0]]
+// CHECK: scf.yield %[[IALLOCA]]
+// CHECK: scf.yield %[[ALLOCA2]]
+// CHECK: linalg.copy(%[[ALLOCA1]], %arg4)
+
+// -----
+
+// Test Case: structured control-flow loop with a nested if operation using
+// a deeply nested buffer allocation.
+// The allocs are not converted in this case.
+
+// CHECK-LABEL: func @loop_nested_if_alloc
+func @loop_nested_if_alloc(
+  %lb: index,
+  %ub: index,
+  %step: index,
+  %buf: memref<2xf32>) -> memref<2xf32> {
+  %0 = alloc() : memref<2xf32>
+  %1 = scf.for %i = %lb to %ub step %step
+    iter_args(%iterBuf = %buf) -> memref<2xf32> {
+    %2 = cmpi "eq", %i, %ub : index
+    %3 = scf.if %2 -> (memref<2xf32>) {
+      %4 = alloc() : memref<2xf32>
+      scf.yield %4 : memref<2xf32>
+    } else {
+      scf.yield %0 : memref<2xf32>
+    }
+    scf.yield %3 : memref<2xf32>
+  }
+  return %1 : memref<2xf32>
+}
+
+// CHECK: %[[ALLOC0:.*]] = alloc()
+// CHECK-NEXT: %[[ALLOC1:.*]] = scf.for {{.*}}
+// CHECK: %[[ALLOC2:.*]] = scf.if
+// CHECK: %[[ALLOC3:.*]] = alloc()
+// CHECK-NEXT: scf.yield %[[ALLOC3]]
+// CHECK: scf.yield %[[ALLOC0]]
+// CHECK: scf.yield %[[ALLOC2]]
+// CHECK: return %[[ALLOC1]]
+
+// -----
+
+// Test Case: several nested structured control-flow loops with a deeply nested
+// buffer allocation inside an if operation.
+// The allocs are moved upwards and all converted to allocas.
+ +// CHECK-LABEL: func @loop_nested_alloc +func @loop_nested_alloc( + %lb: index, + %ub: index, + %step: index, + %buf: memref<2xf32>, + %res: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + %1 = scf.for %i = %lb to %ub step %step + iter_args(%iterBuf = %buf) -> memref<2xf32> { + %2 = scf.for %i2 = %lb to %ub step %step + iter_args(%iterBuf2 = %iterBuf) -> memref<2xf32> { + %3 = scf.for %i3 = %lb to %ub step %step + iter_args(%iterBuf3 = %iterBuf2) -> memref<2xf32> { + %4 = alloc() : memref<2xf32> + %5 = cmpi "eq", %i, %ub : index + %6 = scf.if %5 -> (memref<2xf32>) { + %7 = alloc() : memref<2xf32> + scf.yield %7 : memref<2xf32> + } else { + scf.yield %iterBuf3 : memref<2xf32> + } + scf.yield %6 : memref<2xf32> + } + scf.yield %3 : memref<2xf32> + } + scf.yield %2 : memref<2xf32> + } + "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK: %[[ALLOCA0:.*]] = alloca() +// CHECK-NEXT: %[[ALLOCA1:.*]] = alloca() +// CHECK-NEXT: %[[ALLOCA2:.*]] = alloca() +// CHECK-NEXT: %[[ALLOCA3:.*]] = scf.for {{.*}} +// CHECK-NEXT: %[[ALLOCA4:.*]] = scf.for {{.*}} +// CHECK-NEXT: %[[ALLOCA5:.*]] = scf.for {{.*}} +// CHECK: %[[ALLOCA6:.*]] = scf.if +// CHECK-NEXT: scf.yield %[[ALLOCA2]] +// CHECK: scf.yield +// CHECK: scf.yield %[[ALLOCA6]] +// CHECK: scf.yield %[[ALLOCA5]] +// CHECK: scf.yield %[[ALLOCA4]] +// CHECK: linalg.copy(%[[ALLOCA3]], %arg4) + +// ----- + +// Test Case: The allocated buffer is too large and, hence, it is not +// converted. In the actual implementation the largest size is 1KB. + +// CHECK-LABEL: func @large_buffer_allocation +func @large_buffer_allocation(%arg0: memref<2048xf32>) { + %0 = alloc() : memref<2048xf32> + "linalg.copy"(%0, %arg0) : (memref<2048xf32>, memref<2048xf32>) -> () + return +} + +// CHECK-NEXT: %[[ALLOC:.*]] = alloc() +// CHECK-NEXT: linalg.copy
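For completeness, here is a minimal sketch of driving the new pass from C++ instead of the mlir-opt RUN line exercised by the test above. It is illustrative only: it assumes the usual PassManager API of this era, the header locations and the FuncOp/ModuleOp spellings vary across MLIR revisions, and promoteBuffersInModule is a hypothetical helper name, not part of this patch.

// Hypothetical driver sketch: run buffer promotion on every function in a
// module. Include paths follow the MLIR layout of this era and may differ in
// other revisions.
#include "mlir/IR/Function.h"
#include "mlir/IR/Module.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Support/LogicalResult.h"
#include "mlir/Transforms/Passes.h"

static mlir::LogicalResult promoteBuffersInModule(mlir::ModuleOp module) {
  mlir::PassManager pm(module.getContext());
  // PromoteBuffersToStack is a FunctionPass, so nest it on the function op.
  pm.addNestedPass<mlir::FuncOp>(mlir::createPromoteBuffersToStackPass());
  return pm.run(module);
}

This mirrors what the RUN line in the test file does from the command line via -promote-buffers-to-stack.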