diff --git a/mlir/include/mlir/Transforms/Bufferize.h b/mlir/include/mlir/Transforms/Bufferize.h
--- a/mlir/include/mlir/Transforms/Bufferize.h
+++ b/mlir/include/mlir/Transforms/Bufferize.h
@@ -28,8 +28,10 @@
 #ifndef MLIR_TRANSFORMS_BUFFERIZE_H
 #define MLIR_TRANSFORMS_BUFFERIZE_H
 
+#include "mlir/Analysis/Liveness.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/IR/Builders.h"
+#include "mlir/IR/Dominance.h"
 #include "mlir/IR/Function.h"
 #include "mlir/IR/Operation.h"
 #include "mlir/Transforms/DialectConversion.h"
@@ -276,6 +278,142 @@
   >(context, converter);
   // clang-format on
 }
+
+/// A straight-forward alias analysis which ensures that all aliases of all
+/// values will be determined. This is a requirement for the BufferPlacement
+/// class since you need to determine safe positions to place alloc and
+/// deallocs.
+class BufferPlacementAliasAnalysis {
+public:
+  using ValueSetT = SmallPtrSet;
+  using ValueMapT = llvm::DenseMap;
+
+public:
+  /// Constructs a new alias analysis using the op provided.
+  BufferPlacementAliasAnalysis(Operation *op);
+
+  /// Find all immediate aliases this value could potentially have.
+  ValueMapT::const_iterator find(Value value) const {
+    return aliases.find(value);
+  }
+
+  /// Returns the begin iterator to iterate over all aliases.
+  ValueMapT::const_iterator begin() const { return aliases.begin(); }
+
+  /// Returns the end iterator that can be used in combination with find.
+  ValueMapT::const_iterator end() const { return aliases.end(); }
+
+  /// Find all immediate and indirect aliases this value could potentially
+  /// have. Note that the resulting set will also contain the value provided as
+  /// it is an alias of itself.
+  ValueSetT resolve(Value value) const;
+
+  /// Removes the given values from all alias sets.
+  void remove(const SmallPtrSetImpl &aliasValues);
+
+private:
+  /// This function constructs a mapping from values to its immediate aliases.
+  void build(Operation *op);
+
+  /// Maps values to all immediate aliases this value can have.
+  ValueMapT aliases;
+};
+
+/// A simple analysis that detects allocation operations.
+class BufferPlacementAllocs {
+public:
+  /// Represents a tuple of allocValue and deallocOperation.
+  using AllocEntry = std::tuple;
+
+  /// Represents a list containing all alloc entries.
+  using AllocEntryList = SmallVector;
+
+  /// Get the start operation to place the given alloc value within the
+  /// specified placement block.
+  static Operation *getStartOperation(Value allocValue, Block *placementBlock,
+                                      const Liveness &liveness);
+
+  /// Find an associated dealloc operation that is linked to the given
+  /// allocation node (if any).
+  static Operation *findDealloc(Value allocValue);
+
+public:
+  /// Initializes the internal list by discovering all supported allocation
+  /// nodes.
+  BufferPlacementAllocs(Operation *op);
+
+  /// Returns the begin iterator to iterate over all allocations.
+  AllocEntryList::const_iterator begin() const { return allocs.begin(); }
+
+  /// Returns the end iterator that can be used in combination with begin.
+  AllocEntryList::const_iterator end() const { return allocs.end(); }
+
+  /// Returns the begin iterator to iterate over all allocations.
+  AllocEntryList::iterator begin() { return allocs.begin(); }
+
+  /// Returns the end iterator that can be used in combination with begin.
+  AllocEntryList::iterator end() { return allocs.end(); }
+
+  /// Registers a new allocation entry.
+ void registerAlloc(const AllocEntry &entry) { allocs.push_back(entry); } + +private: + /// Searches for and registers all supported allocation entries. + void build(Operation *op); + +private: + /// Maps allocation nodes to their associated blocks. + AllocEntryList allocs; +}; + +/// The base class for all BufferPlacement transformations. +class BufferPlacementTransformationBase { +public: + using ValueSetT = BufferPlacementAliasAnalysis::ValueSetT; + + /// Finds a common dominator for the given value while taking the positions + /// of the values in the value set into account. It supports dominator and + /// post-dominator analyses via template arguments. + template + static Block *findCommonDominator(Value value, const ValueSetT &values, + const DominatorT &doms) { + // Start with the current block the value is defined in. + Block *dom = value.getParentBlock(); + // Iterate over all aliases and their uses to find a safe placement block + // according to the given dominator information. + for (Value childValue : values) { + for (Operation *user : childValue.getUsers()) { + // Move upwards in the dominator tree to find an appropriate + // dominator block that takes the current use into account. + dom = doms.findNearestCommonDominator(dom, user->getBlock()); + } + // Take values without any users into account. + dom = doms.findNearestCommonDominator(dom, childValue.getParentBlock()); + } + return dom; + } + + /// Returns true if the given operation represents a loop by testing whether + /// it implements the `LoopLikeOpInterface` or the `RegionBranchOpInterface`. + /// In the case of a `RegionBranchOpInterface`, it checks all region-based + /// control-flow edges for cycles. + static bool isLoop(Operation *op); + + /// Constructs a new operation base using the given root operation. + BufferPlacementTransformationBase(Operation *op); + +protected: + /// Alias information that can be updated during the insertion of copies. + BufferPlacementAliasAnalysis aliases; + + /// Stores all internally managed allocations. + BufferPlacementAllocs allocs; + + /// The underlying liveness analysis to compute fine grained information + /// about alloc and dealloc positions. + Liveness liveness; +}; + } // end namespace mlir #endif // MLIR_TRANSFORMS_BUFFERIZE_H diff --git a/mlir/include/mlir/Transforms/Passes.h b/mlir/include/mlir/Transforms/Passes.h --- a/mlir/include/mlir/Transforms/Passes.h +++ b/mlir/include/mlir/Transforms/Passes.h @@ -28,8 +28,17 @@ // Passes //===----------------------------------------------------------------------===// -/// Creates an instance of the BufferPlacement pass. -std::unique_ptr createBufferPlacementPass(); +/// Creates an instance of the BufferDeallocation pass to free all allocated +/// buffers. +std::unique_ptr createBufferDeallocationPass(); + +/// Creates a pass that moves allocations upwards to reduce the number of +/// required copies that are inserted during the BufferDeallocation pass. +std::unique_ptr createBufferHoistingPass(); + +/// Creates a pass that moves allocations upwards out of loops. This avoids +/// reallocations inside of loops. +std::unique_ptr createBufferLoopHoistingPass(); /// Creates an instance of the Canonicalizer pass. 
std::unique_ptr createCanonicalizerPass(); diff --git a/mlir/include/mlir/Transforms/Passes.td b/mlir/include/mlir/Transforms/Passes.td --- a/mlir/include/mlir/Transforms/Passes.td +++ b/mlir/include/mlir/Transforms/Passes.td @@ -102,12 +102,13 @@ let constructor = "mlir::createPipelineDataTransferPass()"; } -def BufferPlacement : FunctionPass<"buffer-placement"> { - let summary = "Optimizes placement of alloc and dealloc operations"; +def BufferDeallocation : FunctionPass<"buffer-deallocation"> { + let summary = "Adds all required dealloc operations for all allocations in the " + "input program"; let description = [{ - This pass implements an algorithm to optimize the placement of alloc and - dealloc operations. This pass also inserts missing dealloc operations - automatically to reclaim memory. + This pass implements an algorithm to automatically introduce all required + deallocation operations for all buffers in the input program. This ensures that + the resulting program does not have any memory leaks. Input @@ -121,7 +122,11 @@ br ^bb3(%arg1 : memref<2xf32>) ^bb2: %0 = alloc() : memref<2xf32> - linalg.generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} %arg1, %0 { + linalg.generic { + args_in = 1 : i64, + args_out = 1 : i64, + indexing_maps = [#map0, #map0], + iterator_types = ["parallel"]} %arg1, %0 { ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): %tmp1 = exp %gen1_arg0 : f32 linalg.yield %tmp1 : f32 @@ -141,31 +146,61 @@ #map0 = affine_map<(d0) -> (d0)> module { func @condBranch(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { - %0 = alloc() : memref<2xf32> cond_br %arg0, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - br ^bb3(%arg1 : memref<2xf32>) - ^bb2: // pred: ^bb0 - linalg.generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} %arg1, %0 { - ^bb0(%arg3: f32, %arg4: f32): // no predecessors - %2 = exp %arg3 : f32 - linalg.yield %2 : f32 - }: memref<2xf32>, memref<2xf32> + ^bb1: // pred: ^bb0 + %0 = alloc() : memref<2xf32> + linalg.copy(%arg1, %0) : memref<2xf32>, memref<2xf32> br ^bb3(%0 : memref<2xf32>) - ^bb3(%1: memref<2xf32>): // 2 preds: ^bb1, ^bb2 - linalg.copy(%1, %arg2) : memref<2xf32>, memref<2xf32> - dealloc %0 : memref<2xf32> + ^bb2: // pred: ^bb0 + %1 = alloc() : memref<2xf32> + linalg.generic { + args_in = 1 : i64, + args_out = 1 : i64, + indexing_maps = [#map0, #map0], + iterator_types = ["parallel"]} %arg1, %1 { + ^bb0(%arg3: f32, %arg4: f32): // no predecessors + %4 = exp %arg3 : f32 + linalg.yield %4 : f32 + }: memref<2xf32>, memref<2xf32> + %2 = alloc() : memref<2xf32> + linalg.copy(%1, %2) : memref<2xf32>, memref<2xf32> + dealloc %1 : memref<2xf32> + br ^bb3(%2 : memref<2xf32>) + ^bb3(%3: memref<2xf32>): // 2 preds: ^bb1, ^bb2 + linalg.copy(%3, %arg2) : memref<2xf32>, memref<2xf32> + dealloc %3 : memref<2xf32> return } + } ``` }]; - let constructor = "mlir::createBufferPlacementPass()"; + let constructor = "mlir::createBufferDeallocationPass()"; // TODO: this pass likely shouldn't depend on Linalg? let dependentDialects = ["linalg::LinalgDialect"]; } +def BufferHoisting : FunctionPass<"buffer-hoisting"> { + let summary = "Optimizes placement of allocation operations by moving them " + "into common dominators and out of nested regions"; + let description = [{ + This pass implements an approach to aggressively move allocations upwards + into common dominators and out of nested regions. 
+ }]; + let constructor = "mlir::createBufferHoistingPass()"; +} + +def BufferLoopHoisting : FunctionPass<"buffer-loop-hoisting"> { + let summary = "Optimizes placement of allocation operations by moving them " + "out of loop nests"; + let description = [{ + This pass implements an approach to aggressively move allocations upwards + out of loop nests. It does not move allocations into common dominators. + }]; + let constructor = "mlir::createBufferLoopHoistingPass()"; +} + def Canonicalizer : Pass<"canonicalize"> { let summary = "Canonicalize operations"; let description = [{ diff --git a/mlir/lib/Transforms/BufferPlacement.cpp b/mlir/lib/Transforms/BufferDeallocation.cpp rename from mlir/lib/Transforms/BufferPlacement.cpp rename to mlir/lib/Transforms/BufferDeallocation.cpp --- a/mlir/lib/Transforms/BufferPlacement.cpp +++ b/mlir/lib/Transforms/BufferDeallocation.cpp @@ -1,4 +1,4 @@ -//===- BufferPlacement.cpp - the impl for buffer placement ---------------===// +//===- BufferDeallocation.cpp - the impl for buffer deallocation ----------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -8,14 +8,15 @@ // // This file implements logic for computing correct alloc and dealloc positions. // Furthermore, buffer placement also adds required new alloc and copy -// operations to ensure that all buffers are deallocated.The main class is the -// BufferPlacementPass class that implements the underlying algorithm. In order -// to put allocations and deallocations at safe positions, it is significantly -// important to put them into the correct blocks. However, the liveness analysis -// does not pay attention to aliases, which can occur due to branches (and their -// associated block arguments) in general. For this purpose, BufferPlacement -// firstly finds all possible aliases for a single value (using the -// BufferPlacementAliasAnalysis class). Consider the following example: +// operations to ensure that all buffers are deallocated. The main class is the +// BufferDeallocationPass class that implements the underlying algorithm. In +// order to put allocations and deallocations at safe positions, it is +// significantly important to put them into the correct blocks. However, the +// liveness analysis does not pay attention to aliases, which can occur due to +// branches (and their associated block arguments) in general. For this purpose, +// BufferDeallocation firstly finds all possible aliases for a single value +// (using the BufferPlacementAliasAnalysis class). Consider the following +// example: // // ^bb0(%arg0): // cond_br %cond, ^bb1, ^bb2 @@ -27,13 +28,9 @@ // ^exit(%arg1): // return %arg1; // -// Using liveness information on its own would cause us to place the allocs and -// deallocs in the wrong block. This is due to the fact that %new_value will not -// be liveOut of its block. Instead, we can place the alloc for %new_value -// in bb0 and its associated dealloc in exit. Alternatively, the alloc can stay -// (or even has to stay due to additional dependencies) at this location and we -// have to free the buffer in the same block, because it cannot be freed in the -// post dominator. However, this requires a new copy buffer for %arg1 that will +// We should place the dealloc for %new_value in exit. However, we have to free +// the buffer in the same block, because it cannot be freed in the post +// dominator. 
However, this requires a new copy buffer for %arg1 that will // contain the actual contents. Using the class BufferPlacementAliasAnalysis, we // will find out that %new_value has a potential alias %arg1. In order to find // the dealloc position we have to find all potential aliases, iterate over @@ -42,10 +39,9 @@ // of the deallocs). In all cases, the computed block can be safely used to free // the %new_value buffer (may be exit or bb2) as it will die and we can use // liveness information to determine the exact operation after which we have to -// insert the dealloc. Finding the alloc position is similar and non-obvious. -// However, the algorithm supports moving allocs to other places and introducing -// copy buffers and placing deallocs in safe places to ensure that all buffers -// will be freed in the end. +// insert the dealloc. However, the algorithm supports introducing copy buffers +// and placing deallocs in safe locations to ensure that all buffers will be +// freed in the end. // // TODO: // The current implementation does not support explicit-control-flow loops and @@ -56,13 +52,13 @@ //===----------------------------------------------------------------------===// #include "PassDetail.h" -#include "mlir/Analysis/Liveness.h" #include "mlir/Dialect/Linalg/IR/LinalgOps.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" -#include "mlir/IR/Dominance.h" #include "mlir/IR/Operation.h" #include "mlir/Interfaces/ControlFlowInterfaces.h" +#include "mlir/Interfaces/LoopLikeInterface.h" #include "mlir/Pass/Pass.h" +#include "mlir/Transforms/Bufferize.h" #include "mlir/Transforms/Passes.h" #include "llvm/ADT/SetOperations.h" @@ -95,134 +91,249 @@ regionInterface.getSuccessorRegions(index, operandAttributes, successors); } -namespace { //===----------------------------------------------------------------------===// // BufferPlacementAliasAnalysis //===----------------------------------------------------------------------===// -/// A straight-forward alias analysis which ensures that all aliases of all -/// values will be determined. This is a requirement for the BufferPlacement -/// class since you need to determine safe positions to place alloc and -/// deallocs. -class BufferPlacementAliasAnalysis { -public: - using ValueSetT = SmallPtrSet; - using ValueMapT = llvm::DenseMap; - -public: - /// Constructs a new alias analysis using the op provided. - BufferPlacementAliasAnalysis(Operation *op) { build(op); } - - /// Find all immediate aliases this value could potentially have. - ValueMapT::const_iterator find(Value value) const { - return aliases.find(value); - } - - /// Returns the end iterator that can be used in combination with find. - ValueMapT::const_iterator end() const { return aliases.end(); } - - /// Find all immediate and indirect aliases this value could potentially - /// have. Note that the resulting set will also contain the value provided as - /// it is an alias of itself. - ValueSetT resolve(Value value) const { - ValueSetT result; - resolveRecursive(value, result); - return result; - } +/// Constructs a new alias analysis using the op provided. +BufferPlacementAliasAnalysis::BufferPlacementAliasAnalysis(Operation *op) { + build(op); +} - /// Removes the given values from all alias sets. - void remove(const SmallPtrSetImpl &aliasValues) { - for (auto &entry : aliases) - llvm::set_subtract(entry.second, aliasValues); - } +/// Find all immediate and indirect aliases this value could potentially +/// have. 
Note that the resulting set will also contain the value provided as +/// it is an alias of itself. +BufferPlacementAliasAnalysis::ValueSetT +BufferPlacementAliasAnalysis::resolve(Value value) const { + ValueSetT result; -private: /// Recursively determines alias information for the given value. It stores /// all newly found potential aliases in the given result set. - void resolveRecursive(Value value, ValueSetT &result) const { - if (!result.insert(value).second) + std::function resolveRecursive = [&](Value current) { + if (!result.insert(current).second) return; - auto it = aliases.find(value); + auto it = aliases.find(current); if (it == aliases.end()) return; for (Value alias : it->second) - resolveRecursive(alias, result); - } + resolveRecursive(alias); + }; - /// This function constructs a mapping from values to its immediate aliases. - /// It iterates over all blocks, gets their predecessors, determines the - /// values that will be passed to the corresponding block arguments and - /// inserts them into the underlying map. Furthermore, it wires successor - /// regions and branch-like return operations from nested regions. - void build(Operation *op) { - // Registers all aliases of the given values. - auto registerAliases = [&](auto values, auto aliases) { - for (auto entry : llvm::zip(values, aliases)) - this->aliases[std::get<0>(entry)].insert(std::get<1>(entry)); - }; + resolveRecursive(value); + return result; +} - // Add additional aliases created by view changes to the alias list. - op->walk([&](ViewLikeOpInterface viewInterface) { - aliases[viewInterface.getViewSource()].insert( - viewInterface.getOperation()->getResult(0)); - }); +/// Removes the given values from all alias sets. +void BufferPlacementAliasAnalysis::remove( + const SmallPtrSetImpl &aliasValues) { + for (auto &entry : aliases) + llvm::set_subtract(entry.second, aliasValues); +} - // Query all branch interfaces to link block argument aliases. - op->walk([&](BranchOpInterface branchInterface) { - Block *parentBlock = branchInterface.getOperation()->getBlock(); - for (auto it = parentBlock->succ_begin(), e = parentBlock->succ_end(); - it != e; ++it) { - // Query the branch op interface to get the successor operands. - auto successorOperands = - branchInterface.getSuccessorOperands(it.getIndex()); - if (!successorOperands.hasValue()) - continue; - // Build the actual mapping of values to their immediate aliases. - registerAliases(successorOperands.getValue(), (*it)->getArguments()); - } - }); +/// This function constructs a mapping from values to its immediate aliases. +/// It iterates over all blocks, gets their predecessors, determines the +/// values that will be passed to the corresponding block arguments and +/// inserts them into the underlying map. Furthermore, it wires successor +/// regions and branch-like return operations from nested regions. +void BufferPlacementAliasAnalysis::build(Operation *op) { + // Registers all aliases of the given values. + auto registerAliases = [&](auto values, auto aliases) { + for (auto entry : llvm::zip(values, aliases)) + this->aliases[std::get<0>(entry)].insert(std::get<1>(entry)); + }; - // Query the RegionBranchOpInterface to find potential successor regions. - op->walk([&](RegionBranchOpInterface regionInterface) { - // Extract all entry regions and wire all initial entry successor inputs. 
- SmallVector entrySuccessors; - getSuccessorRegions(regionInterface, /*index=*/llvm::None, - entrySuccessors); - for (RegionSuccessor &entrySuccessor : entrySuccessors) { - // Wire the entry region's successor arguments with the initial - // successor inputs. - assert(entrySuccessor.getSuccessor() && - "Invalid entry region without an attached successor region"); - registerAliases(regionInterface.getSuccessorEntryOperands( - entrySuccessor.getSuccessor()->getRegionNumber()), - entrySuccessor.getSuccessorInputs()); - } + // Add additional aliases created by view changes to the alias list. + op->walk([&](ViewLikeOpInterface viewInterface) { + aliases[viewInterface.getViewSource()].insert( + viewInterface.getOperation()->getResult(0)); + }); + + // Query all branch interfaces to link block argument aliases. + op->walk([&](BranchOpInterface branchInterface) { + Block *parentBlock = branchInterface.getOperation()->getBlock(); + for (auto it = parentBlock->succ_begin(), e = parentBlock->succ_end(); + it != e; ++it) { + // Query the branch op interface to get the successor operands. + auto successorOperands = + branchInterface.getSuccessorOperands(it.getIndex()); + if (!successorOperands.hasValue()) + continue; + // Build the actual mapping of values to their immediate aliases. + registerAliases(successorOperands.getValue(), (*it)->getArguments()); + } + }); + + // Query the RegionBranchOpInterface to find potential successor regions. + op->walk([&](RegionBranchOpInterface regionInterface) { + // Extract all entry regions and wire all initial entry successor inputs. + SmallVector entrySuccessors; + getSuccessorRegions(regionInterface, /*index=*/llvm::None, entrySuccessors); + for (RegionSuccessor &entrySuccessor : entrySuccessors) { + // Wire the entry region's successor arguments with the initial + // successor inputs. + assert(entrySuccessor.getSuccessor() && + "Invalid entry region without an attached successor region"); + registerAliases(regionInterface.getSuccessorEntryOperands( + entrySuccessor.getSuccessor()->getRegionNumber()), + entrySuccessor.getSuccessorInputs()); + } - // Wire flow between regions and from region exits. - for (Region ®ion : regionInterface.getOperation()->getRegions()) { - // Iterate over all successor region entries that are reachable from the - // current region. - SmallVector successorRegions; - getSuccessorRegions(regionInterface, region.getRegionNumber(), - successorRegions); - for (RegionSuccessor &successorRegion : successorRegions) { - // Iterate over all immediate terminator operations and wire the - // successor inputs with the operands of each terminator. - walkReturnOperations(®ion, [&](Operation *terminator) { - registerAliases(terminator->getOperands(), - successorRegion.getSuccessorInputs()); - }); - } + // Wire flow between regions and from region exits. + for (Region ®ion : regionInterface.getOperation()->getRegions()) { + // Iterate over all successor region entries that are reachable from the + // current region. + SmallVector successorRegions; + getSuccessorRegions(regionInterface, region.getRegionNumber(), + successorRegions); + for (RegionSuccessor &successorRegion : successorRegions) { + // Iterate over all immediate terminator operations and wire the + // successor inputs with the operands of each terminator. 
+ walkReturnOperations(®ion, [&](Operation *terminator) { + registerAliases(terminator->getOperands(), + successorRegion.getSuccessorInputs()); + }); } + } + }); +} + +//===----------------------------------------------------------------------===// +// BufferPlacementAllocs +//===----------------------------------------------------------------------===// + +/// Get the start operation to place the given alloc value withing the +// specified placement block. +Operation *BufferPlacementAllocs::getStartOperation(Value allocValue, + Block *placementBlock, + const Liveness &liveness) { + // We have to ensure that we place the alloc before its first use in this + // block. + const LivenessBlockInfo &livenessInfo = *liveness.getLiveness(placementBlock); + Operation *startOperation = livenessInfo.getStartOperation(allocValue); + // Check whether the start operation lies in the desired placement block. + // If not, we will use the terminator as this is the last operation in + // this block. + if (startOperation->getBlock() != placementBlock) { + Operation *opInPlacementBlock = + placementBlock->findAncestorOpInBlock(*startOperation); + startOperation = opInPlacementBlock ? opInPlacementBlock + : placementBlock->getTerminator(); + } + + return startOperation; +} + +/// Finds associated deallocs that can be linked to our allocation nodes (if +/// any). +Operation *BufferPlacementAllocs::findDealloc(Value allocValue) { + auto userIt = llvm::find_if(allocValue.getUsers(), [&](Operation *user) { + auto effectInterface = dyn_cast(user); + if (!effectInterface) + return false; + // Try to find a free effect that is applied to one of our values + // that will be automatically freed by our pass. + SmallVector effects; + effectInterface.getEffectsOnValue(allocValue, effects); + return llvm::any_of(effects, [&](MemoryEffects::EffectInstance &it) { + return isa(it.getEffect()); }); + }); + // Assign the associated dealloc operation (if any). + return userIt != allocValue.user_end() ? *userIt : nullptr; +} + +/// Initializes the internal list by discovering all supported allocation +/// nodes. +BufferPlacementAllocs::BufferPlacementAllocs(Operation *op) { build(op); } + +/// Searches for and registers all supported allocation entries. +void BufferPlacementAllocs::build(Operation *op) { + op->walk([&](MemoryEffectOpInterface opInterface) { + // Try to find a single allocation result. + SmallVector effects; + opInterface.getEffects(effects); + + SmallVector allocateResultEffects; + llvm::copy_if( + effects, std::back_inserter(allocateResultEffects), + [=](MemoryEffects::EffectInstance &it) { + Value value = it.getValue(); + return isa(it.getEffect()) && value && + value.isa() && + it.getResource() != + SideEffects::AutomaticAllocationScopeResource::get(); + }); + // If there is one result only, we will be able to move the allocation and + // (possibly existing) deallocation ops. + if (allocateResultEffects.size() != 1) + return; + // Get allocation result. + Value allocValue = allocateResultEffects[0].getValue(); + // Find the associated dealloc value and register the allocation entry. + allocs.push_back(std::make_tuple(allocValue, findDealloc(allocValue))); + }); +} + +//===----------------------------------------------------------------------===// +// BufferPlacementTransformationBase +//===----------------------------------------------------------------------===// + +/// Constructs a new transformation base using the given root operation. 
+BufferPlacementTransformationBase::BufferPlacementTransformationBase( + Operation *op) + : aliases(op), allocs(op), liveness(op) {} + +/// Returns true if the given operation represents a loop by testing whether it +/// implements the `LoopLikeOpInterface` or the `RegionBranchOpInterface`. In +/// the case of a `RegionBranchOpInterface`, it checks all region-based control- +/// flow edges for cycles. +bool BufferPlacementTransformationBase::isLoop(Operation *op) { + // If the operation implements the `LoopLikeOpInterface` it can be considered + // a loop. + if (isa(op)) + return true; + + // If the operation does not implement the `RegionBranchOpInterface`, it is + // (currently) not possible to detect a loop. + RegionBranchOpInterface regionInterface; + if (!(regionInterface = dyn_cast(op))) + return false; + + // Recurses into a region using the current region interface to find potential + // cycles. + SmallPtrSet visitedRegions; + std::function recurse = [&](Region *current) { + if (!current) + return false; + // If we have found a back edge, the parent operation induces a loop. + if (!visitedRegions.insert(current).second) + return true; + // Recurses into all region successors. + SmallVector successors; + getSuccessorRegions(regionInterface, current->getRegionNumber(), + successors); + for (RegionSuccessor ®ionEntry : successors) + if (recurse(regionEntry.getSuccessor())) + return true; + return false; + }; + + // Start with all entry regions and test whether they induce a loop. + SmallVector successorRegions; + getSuccessorRegions(regionInterface, /*index=*/llvm::None, successorRegions); + for (RegionSuccessor ®ionEntry : successorRegions) { + if (recurse(regionEntry.getSuccessor())) + return true; + visitedRegions.clear(); } - /// Maps values to all immediate aliases this value can have. - ValueMapT aliases; -}; + return false; +} + +namespace { //===----------------------------------------------------------------------===// -// Backedges +// Backedges analysis //===----------------------------------------------------------------------===// /// A straight-forward program analysis which detects loop backedges induced by @@ -284,7 +395,7 @@ return; // Recurse into all operations and successor blocks. - for (auto &op : block.getOperations()) + for (Operation &op : block.getOperations()) recurse(&op, predecessor); // Leave the current block. @@ -299,133 +410,28 @@ }; //===----------------------------------------------------------------------===// -// BufferPlacement +// BufferDeallocation //===----------------------------------------------------------------------===// -// The main buffer placement analysis used to place allocs, copies and deallocs. -class BufferPlacement { -public: - using ValueSetT = BufferPlacementAliasAnalysis::ValueSetT; - - /// An intermediate representation of a single allocation node. - struct AllocEntry { - /// A reference to the associated allocation node. - Value allocValue; - - /// The associated placement block in which the allocation should be - /// performed. - Block *placementBlock; - - /// The associated dealloc operation (if any). - Operation *deallocOperation; - }; - - using AllocEntryList = SmallVector; - +/// The buffer deallocation transformation which ensures that all allocs in the +/// program have a corresponding de-allocation. As a side-effect, it might also +/// introduce copies that in turn leads to additional allocs and de-allocations. 
+class BufferDeallocation : BufferPlacementTransformationBase { public: - BufferPlacement(Operation *op) - : operation(op), aliases(op), liveness(op), dominators(op), - postDominators(op) { - // Gather all allocation nodes - initBlockMapping(); - } + BufferDeallocation(Operation *op) + : BufferPlacementTransformationBase(op), dominators(op), + postDominators(op) {} - /// Performs the actual placement/creation of all alloc, copy and dealloc - /// nodes. - void place() { - // Place all allocations. - placeAllocs(); + /// Performs the actual placement/creation of all temporary alloc, copy and + /// dealloc nodes. + void deallocate() { // Add additional allocations and copies that are required. introduceCopies(); - // Find all associated dealloc nodes. - findDeallocs(); // Place deallocations for all allocation entries. placeDeallocs(); } private: - /// Initializes the internal block mapping by discovering allocation nodes. It - /// maps all allocation nodes to their initial block in which they can be - /// safely allocated. - void initBlockMapping() { - operation->walk([&](MemoryEffectOpInterface opInterface) { - // Try to find a single allocation result. - SmallVector effects; - opInterface.getEffects(effects); - - SmallVector allocateResultEffects; - llvm::copy_if( - effects, std::back_inserter(allocateResultEffects), - [=](MemoryEffects::EffectInstance &it) { - Value value = it.getValue(); - return isa(it.getEffect()) && value && - value.isa() && - it.getResource() != - SideEffects::AutomaticAllocationScopeResource::get(); - }); - // If there is one result only, we will be able to move the allocation and - // (possibly existing) deallocation ops. - if (allocateResultEffects.size() != 1) - return; - // Get allocation result. - auto allocResult = allocateResultEffects[0].getValue().cast(); - // Find the initial allocation block and register this result. - allocs.push_back( - {allocResult, getInitialAllocBlock(allocResult), nullptr}); - }); - } - - /// Computes a valid allocation position in a dominator (if possible) for the - /// given allocation result. - Block *getInitialAllocBlock(OpResult result) { - // Get all allocation operands as these operands are important for the - // allocation operation. - Operation *owner = result.getOwner(); - auto operands = owner->getOperands(); - Block *dominator; - if (operands.size() < 1) - dominator = - findCommonDominator(result, aliases.resolve(result), dominators); - else { - // If this node has dependencies, check all dependent nodes with respect - // to a common post dominator in which all values are available. - ValueSetT dependencies(++operands.begin(), operands.end()); - dominator = - findCommonDominator(*operands.begin(), dependencies, postDominators); - } - - // Do not move allocs out of their parent regions to keep them local. - if (dominator->getParent() != owner->getParentRegion()) - return &owner->getParentRegion()->front(); - return dominator; - } - - /// Finds correct alloc positions according to the algorithm described at - /// the top of the file for all alloc nodes that can be handled by this - /// analysis. - void placeAllocs() const { - for (const AllocEntry &entry : allocs) { - Value alloc = entry.allocValue; - // Get the actual block to place the alloc and get liveness information - // for the placement block. - Block *placementBlock = entry.placementBlock; - // We have to ensure that we place the alloc before its first use in this - // block. 
- const LivenessBlockInfo *livenessInfo = - liveness.getLiveness(placementBlock); - Operation *startOperation = livenessInfo->getStartOperation(alloc); - // Check whether the start operation lies in the desired placement block. - // If not, we will use the terminator as this is the last operation in - // this block. - if (startOperation->getBlock() != placementBlock) - startOperation = placementBlock->getTerminator(); - - // Move the alloc in front of the start operation. - Operation *allocOperation = alloc.getDefiningOp(); - allocOperation->moveBefore(startOperation); - } - } - /// Introduces required allocs and copy operations to avoid memory leaks. void introduceCopies() { // Initialize the set of values that require a dedicated memory free @@ -462,9 +468,10 @@ }; // Detect possibly unsafe aliases starting from all allocations. - for (auto &entry : allocs) - findUnsafeValues(entry.allocValue, entry.placementBlock); - + for (BufferPlacementAllocs::AllocEntry &entry : allocs) { + Value allocValue = std::get<0>(entry); + findUnsafeValues(allocValue, allocValue.getDefiningOp()->getBlock()); + } // Try to find block arguments that require an explicit free operation // until we reach a fix point. while (!toProcess.empty()) { @@ -486,7 +493,7 @@ // Register the value to require a final dealloc. Note that we do not have // to assign a block here since we do not want to move the allocation node // to another location. - allocs.push_back({value, nullptr, nullptr}); + allocs.registerAlloc(std::make_tuple(value, nullptr)); } } @@ -542,7 +549,8 @@ // parent operation. In this case, we have to introduce an additional copy // for buffer that is passed to the argument. SmallVector successorRegions; - getSuccessorRegions(regionInterface, llvm::None, successorRegions); + getSuccessorRegions(regionInterface, /*index=*/llvm::None, + successorRegions); auto *it = llvm::find_if(successorRegions, [&](RegionSuccessor &successorRegion) { return successorRegion.getSuccessor() == argRegion; @@ -664,30 +672,6 @@ return alloc; } - /// Finds associated deallocs that can be linked to our allocation nodes (if - /// any). - void findDeallocs() { - for (auto &entry : allocs) { - auto userIt = - llvm::find_if(entry.allocValue.getUsers(), [&](Operation *user) { - auto effectInterface = dyn_cast(user); - if (!effectInterface) - return false; - // Try to find a free effect that is applied to one of our values - // that will be automatically freed by our pass. - SmallVector effects; - effectInterface.getEffectsOnValue(entry.allocValue, effects); - return llvm::any_of( - effects, [&](MemoryEffects::EffectInstance &it) { - return isa(it.getEffect()); - }); - }); - // Assign the associated dealloc operation (if any). - if (userIt != entry.allocValue.user_end()) - entry.deallocOperation = *userIt; - } - } - /// Finds correct dealloc positions according to the algorithm described at /// the top of the file for all alloc nodes and block arguments that can be /// handled by this analysis. @@ -696,8 +680,8 @@ // These deallocations will be linked to their associated allocation nodes // since they don't have any aliases that can (potentially) increase their // liveness. 
- for (const AllocEntry &entry : allocs) { - Value alloc = entry.allocValue; + for (const BufferPlacementAllocs::AllocEntry &entry : allocs) { + Value alloc = std::get<0>(entry); auto aliasesSet = aliases.resolve(alloc); assert(aliasesSet.size() > 0 && "must contain at least one alias"); @@ -713,11 +697,21 @@ // the placementBlock and that we can safely place the dealloc at the // beginning. Operation *endOperation = &placementBlock->front(); + // Iterate over all aliases and ensure that the endOperation will point // to the last operation of all potential aliases in the placementBlock. for (Value alias : aliasesSet) { + // Ensure that the start operation is at least the defining operation of + // the current alias to avoid invalid placement of deallocs for aliases + // without any uses. + Operation *beforeOp = endOperation; + if (alias.getDefiningOp() && + !(beforeOp = placementBlock->findAncestorOpInBlock( + *alias.getDefiningOp()))) + continue; + Operation *aliasEndOperation = - livenessInfo->getEndOperation(alias, endOperation); + livenessInfo->getEndOperation(alias, beforeOp); // Check whether the aliasEndOperation lies in the desired block and // whether it is behind the current endOperation. If yes, this will be // the new endOperation. @@ -729,8 +723,9 @@ // the dealloc taking all potential aliases into account. // If there is an existing dealloc, move it to the right place. - if (entry.deallocOperation) { - entry.deallocOperation->moveAfter(endOperation); + Operation *deallocOperation = std::get<1>(entry); + if (deallocOperation) { + deallocOperation->moveAfter(endOperation); } else { // If the Dealloc position is at the terminator operation of the // block, then the value should escape from a deallocation. @@ -744,56 +739,26 @@ } } - /// Finds a common dominator for the given value while taking the positions - /// of the values in the value set into account. It supports dominator and - /// post-dominator analyses via template arguments. - template - Block *findCommonDominator(Value value, const ValueSetT &values, - const DominatorT &doms) const { - // Start with the current block the value is defined in. - Block *dom = value.getParentBlock(); - // Iterate over all aliases and their uses to find a safe placement block - // according to the given dominator information. - for (Value childValue : values) - for (Operation *user : childValue.getUsers()) { - // Move upwards in the dominator tree to find an appropriate - // dominator block that takes the current use into account. - dom = doms.findNearestCommonDominator(dom, user->getBlock()); - } - return dom; - } - - /// The operation this transformation was constructed from. - Operation *operation; - - /// Alias information that can be updated during the insertion of copies. - BufferPlacementAliasAnalysis aliases; - - /// Maps allocation nodes to their associated blocks. - AllocEntryList allocs; - - // Stores already copied allocations to avoid additional copies of copies. - ValueSetT copiedValues; - - /// The underlying liveness analysis to compute fine grained information - /// about alloc and dealloc positions. - Liveness liveness; - - /// The dominator analysis to place deallocs in the appropriate blocks. + /// The dominator info to find the appropriate start operation to move the + /// allocs. DominanceInfo dominators; - /// The post dominator analysis to place deallocs in the appropriate blocks. + /// The post dominator info to move the dependent allocs in the right + /// position. 
PostDominanceInfo postDominators; + + /// Stores already copied allocations to avoid additional copies of copies. + ValueSetT copiedValues; }; //===----------------------------------------------------------------------===// -// BufferPlacementPass +// BufferDeallocationPass //===----------------------------------------------------------------------===// -/// The actual buffer placement pass that moves alloc and dealloc nodes into -/// the right positions. It uses the algorithm described at the top of the -/// file. -struct BufferPlacementPass : BufferPlacementBase { +/// The actual buffer deallocation pass that inserts and moves dealloc nodes +/// into the right positions. Furthermore, it inserts additional allocs and +/// copies if necessary. It uses the algorithm described at the top of the file. +struct BufferDeallocationPass : BufferDeallocationBase { void runOnFunction() override { // Ensure that there are supported loops only. @@ -804,18 +769,18 @@ return; } - // Place all required alloc, copy and dealloc nodes. - BufferPlacement placement(getFunction()); - placement.place(); + // Place all required temporary alloc, copy and dealloc nodes. + BufferDeallocation deallocation(getFunction()); + deallocation.deallocate(); } }; } // end anonymous namespace //===----------------------------------------------------------------------===// -// BufferPlacementPass construction +// BufferDeallocationPass construction //===----------------------------------------------------------------------===// -std::unique_ptr mlir::createBufferPlacementPass() { - return std::make_unique(); +std::unique_ptr mlir::createBufferDeallocationPass() { + return std::make_unique(); } diff --git a/mlir/lib/Transforms/BufferOptimizations.cpp b/mlir/lib/Transforms/BufferOptimizations.cpp new file mode 100644 --- /dev/null +++ b/mlir/lib/Transforms/BufferOptimizations.cpp @@ -0,0 +1,258 @@ +//===- BufferOptimizations.cpp - pre-pass optimizations for bufferization -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements logic for two optimization passes. These passes try to +// hoist alloc nodes to reduce the number of allocations and copies during +// buffer deallocation. + +#include "PassDetail.h" +#include "mlir/IR/Operation.h" +#include "mlir/Interfaces/LoopLikeInterface.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/Bufferize.h" +#include "mlir/Transforms/Passes.h" + +using namespace mlir; + +namespace { + +//===----------------------------------------------------------------------===// +// BufferAllocationHoisting +//===----------------------------------------------------------------------===// + +/// A base implementation compatible with the `BufferAllocationHoisting` class. +struct BufferAllocationHoistingStateBase { + /// A pointer to the current dominance info. + DominanceInfo *dominators; + + /// The current allocation value. + Value allocValue; + + /// The current placement block (if any). + Block *placementBlock; + + /// Initializes the state base. + BufferAllocationHoistingStateBase(DominanceInfo *dominators, Value allocValue, + Block *placementBlock) + : dominators(dominators), allocValue(allocValue), + placementBlock(placementBlock) {} +}; + +/// Implements the actual hoisting logic for allocation nodes. 
+template +class BufferAllocationHoisting : public BufferPlacementTransformationBase { +private: + /// Returns true if the given operation implements a known high-level region- + /// based control-flow interface. + static bool isKnownControlFlowInterface(Operation *op) { + return isa(op); + } + +public: + BufferAllocationHoisting(Operation *op) + : BufferPlacementTransformationBase(op), dominators(op), + postDominators(op) {} + + /// Moves allocations upwards. + void hoist() { + for (BufferPlacementAllocs::AllocEntry &entry : allocs) { + Value allocValue = std::get<0>(entry); + Operation *definingOp = allocValue.getDefiningOp(); + assert(definingOp && "No defining op"); + auto operands = definingOp->getOperands(); + auto resultAliases = aliases.resolve(allocValue); + // Determine the common dominator block of all aliases. + Block *dominatorBlock = + findCommonDominator(allocValue, resultAliases, dominators); + // Init the initial hoisting state. + StateT state(&dominators, allocValue, allocValue.getParentBlock()); + // Check for additional allocation dependencies to compute an upper bound + // for hoisting. + Block *dependencyBlock = nullptr; + if (!operands.empty()) { + // If this node has dependencies, check all dependent nodes with respect + // to a common post dominator. This ensures that all dependency values + // have been computed before allocating the buffer. + ValueSetT dependencies(std::next(operands.begin()), operands.end()); + dependencyBlock = findCommonDominator(*operands.begin(), dependencies, + postDominators); + } + + // Find the actual placement block and determine the start operation using + // an upper placement-block boundary. The idea is that placement block + // cannot be moved any further upwards than the given upper bound. + Block *placementBlock = findPlacementBlock( + state, state.computeUpperBound(dominatorBlock, dependencyBlock)); + Operation *startOperation = BufferPlacementAllocs::getStartOperation( + allocValue, placementBlock, liveness); + + // Move the alloc in front of the start operation. + Operation *allocOperation = allocValue.getDefiningOp(); + allocOperation->moveBefore(startOperation); + } + } + +private: + /// Finds a valid placement block by walking upwards in the CFG until we + /// either cannot continue our walk due to constraints (given by the StateT + /// implementation) or we have reached the upper-most dominator block. + Block *findPlacementBlock(StateT &state, Block *upperBound) { + Block *currentBlock = state.placementBlock; + // Walk from the innermost regions/loops to the outermost regions/loops and + // find an appropriate placement block that satisfies the constraint of the + // current StateT implementation. Walk until we reach the upperBound block + // (if any). + + // If we are not able to find a valid parent operation or an associated + // parent block, break the walk loop. + Operation *parentOp; + Block *parentBlock; + while ((parentOp = currentBlock->getParentOp()) && + (parentBlock = parentOp->getBlock()) && + (!upperBound || + dominators.properlyDominates(upperBound, currentBlock))) { + // Try to find an immediate dominator and check whether the parent block + // is above the immediate dominator (if any). + DominanceInfoNode *idom = dominators.getNode(currentBlock)->getIDom(); + if (idom && dominators.properlyDominates(parentBlock, idom->getBlock())) { + // If the current immediate dominator is below the placement block, move + // to the immediate dominator block. 
+ currentBlock = idom->getBlock(); + state.recordMoveToDominator(currentBlock); + } else { + // We have to move to our parent block since an immediate dominator does + // either not exist or is above our parent block. If we cannot move to + // our parent operation due to constraints given by the StateT + // implementation, break the walk loop. Furthermore, we should not move + // allocations out of unknown region-based control-flow operations. + if (!isKnownControlFlowInterface(parentOp) || + !state.isLegalPlacement(parentOp)) + break; + // Move to our parent block by notifying the current StateT + // implementation. + currentBlock = parentBlock; + state.recordMoveToParent(currentBlock); + } + } + // Return the finally determined placement block. + return state.placementBlock; + } + + /// The dominator info to find the appropriate start operation to move the + /// allocs. + DominanceInfo dominators; + + /// The post dominator info to move the dependent allocs in the right + /// position. + PostDominanceInfo postDominators; + + /// The map storing the final placement blocks of a given alloc value. + llvm::DenseMap placementBlocks; +}; + +/// A state implementation compatible with the `BufferAllocationHoisting` class +/// that hoists allocations into dominator blocks while keeping them inside of +/// loops. +struct BufferAllocationHoistingState : BufferAllocationHoistingStateBase { + using BufferAllocationHoistingStateBase::BufferAllocationHoistingStateBase; + + /// Computes the upper bound for the placement block search. + Block *computeUpperBound(Block *dominatorBlock, Block *dependencyBlock) { + // If we do not have a dependency block, the upper bound is given by the + // dominator block. + if (!dependencyBlock) + return dominatorBlock; + + // Find the "lower" block of the dominator and the dependency block to + // ensure that we do not move allocations above this block. + return dominators->properlyDominates(dominatorBlock, dependencyBlock) + ? dependencyBlock + : dominatorBlock; + } + + /// Returns true if the given operation does not represent a loop. + bool isLegalPlacement(Operation *op) { + return !BufferPlacementTransformationBase::isLoop(op); + } + + /// Sets the current placement block to the given block. + void recordMoveToDominator(Block *block) { placementBlock = block; } + + /// Sets the current placement block to the given block. + void recordMoveToParent(Block *block) { recordMoveToDominator(block); } +}; + +/// A state implementation compatible with the `BufferAllocationHoisting` class +/// that hoists allocations out of loops. +struct BufferAllocationLoopHoistingState : BufferAllocationHoistingStateBase { + using BufferAllocationHoistingStateBase::BufferAllocationHoistingStateBase; + + /// Remembers the dominator block of all aliases. + Block *aliasDominatorBlock; + + /// Computes the upper bound for the placement block search. + Block *computeUpperBound(Block *dominatorBlock, Block *dependencyBlock) { + aliasDominatorBlock = dominatorBlock; + // If there is a dependency block, we have to use this block as an upper + // bound to satisfy all allocation value dependencies. + return dependencyBlock ? dependencyBlock : nullptr; + } + + /// Returns true if the given operation represents a loop and one of the + /// aliases caused the `aliasDominatorBlock` to be "above" the block of the + /// given loop operation. If this is the case, it indicates that the + /// allocation is passed via a back edge. 
+ bool isLegalPlacement(Operation *op) { + return BufferPlacementTransformationBase::isLoop(op) && + !dominators->dominates(aliasDominatorBlock, op->getBlock()); + } + + /// Does not change the internal placement block, as we want to move + /// operations out of loops only. + void recordMoveToDominator(Block *block) {} + + /// Sets the current placement block to the given block. + void recordMoveToParent(Block *block) { placementBlock = block; } +}; + +//===----------------------------------------------------------------------===// +// BufferOptimizationPasses +//===----------------------------------------------------------------------===// + +/// The buffer hoisting pass that hoists allocation nodes into dominating +/// blocks. +struct BufferHoistingPass : BufferHoistingBase { + + void runOnFunction() override { + // Hoist all allocations into dominator blocks. + BufferAllocationHoisting optimizer( + getFunction()); + optimizer.hoist(); + } +}; + +/// The buffer loop hoisting pass that hoists allocation nodes out of loops. +struct BufferLoopHoistingPass : BufferLoopHoistingBase { + + void runOnFunction() override { + // Hoist all allocations out of loops. + BufferAllocationHoisting optimizer( + getFunction()); + optimizer.hoist(); + } +}; + +} // end anonymous namespace + +std::unique_ptr mlir::createBufferHoistingPass() { + return std::make_unique(); +} + +std::unique_ptr mlir::createBufferLoopHoistingPass() { + return std::make_unique(); +} diff --git a/mlir/lib/Transforms/CMakeLists.txt b/mlir/lib/Transforms/CMakeLists.txt --- a/mlir/lib/Transforms/CMakeLists.txt +++ b/mlir/lib/Transforms/CMakeLists.txt @@ -1,7 +1,8 @@ add_subdirectory(Utils) add_mlir_library(MLIRTransforms - BufferPlacement.cpp + BufferDeallocation.cpp + BufferOptimizations.cpp Bufferize.cpp Canonicalizer.cpp CopyRemoval.cpp diff --git a/mlir/test/Dialect/Linalg/bufferize.mlir b/mlir/test/Dialect/Linalg/bufferize.mlir --- a/mlir/test/Dialect/Linalg/bufferize.mlir +++ b/mlir/test/Dialect/Linalg/bufferize.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -linalg-bufferize -buffer-placement -split-input-file %s | FileCheck %s +// RUN: mlir-opt -linalg-bufferize -buffer-hoisting -buffer-deallocation -split-input-file %s | FileCheck %s #map0 = affine_map<(d0) -> (d0)> diff --git a/mlir/test/Transforms/buffer-placement.mlir b/mlir/test/Transforms/buffer-deallocation.mlir rename from mlir/test/Transforms/buffer-placement.mlir rename to mlir/test/Transforms/buffer-deallocation.mlir --- a/mlir/test/Transforms/buffer-placement.mlir +++ b/mlir/test/Transforms/buffer-deallocation.mlir @@ -1,8 +1,8 @@ -// RUN: mlir-opt -buffer-placement -split-input-file %s | FileCheck %s +// RUN: mlir-opt -buffer-deallocation -split-input-file %s | FileCheck %s -// This file checks the behaviour of BufferPlacement pass for moving Alloc and -// Dealloc operations and inserting the missing the DeallocOps in their correct -// positions. +// This file checks the behaviour of BufferDeallocation pass for moving and +// inserting missing DeallocOps in their correct positions. Furthermore, +// copies and their corresponding AllocOps are inserted. // Test Case: // bb0 @@ -10,9 +10,10 @@ // bb1 bb2 <- Initial position of AllocOp // \ / // bb3 -// BufferPlacement Expected Behaviour: It should move the existing AllocOp to -// the entry block, and insert a DeallocOp at the exit block after CopyOp since -// %1 is an alias for %0 and %arg1. +// BufferDeallocation expected behavior: bb2 contains an AllocOp which is +// passed to bb3. 
In the latter block, there should be an deallocation. +// Since bb1 does not contain an adequate alloc and the alloc in bb2 is not +// moved to bb0, we need to insert allocs and copies. #map0 = affine_map<(d0) -> (d0)> @@ -38,10 +39,18 @@ return } -// CHECK-NEXT: %[[ALLOC:.*]] = alloc() // CHECK-NEXT: cond_br +// CHECK: %[[ALLOC0:.*]] = alloc() +// CHECK-NEXT: linalg.copy +// CHECK-NEXT: br ^bb3(%[[ALLOC0]] +// CHECK: %[[ALLOC1:.*]] = alloc() +// CHECK-NEXT: linalg.generic +// CHECK: %[[ALLOC2:.*]] = alloc() +// CHECK-NEXT: linalg.copy +// CHECK-NEXT: dealloc %[[ALLOC1]] +// CHECK-NEXT: br ^bb3(%[[ALLOC2]] // CHECK: linalg.copy -// CHECK-NEXT: dealloc %[[ALLOC]] +// CHECK-NEXT: dealloc // CHECK-NEXT: return // ----- @@ -52,12 +61,12 @@ // bb1 bb2 <- Initial position of AllocOp // \ / // bb3 -// BufferPlacement Expected Behaviour: It should not move the existing AllocOp -// to any other block since the alloc has a dynamic dependency to block argument -// %0 in bb2. Since the dynamic type is passed to bb3 via the block argument %2, -// it is currently required to allocate a temporary buffer for %2 that gets -// copies of %arg0 and %1 with their appropriate shape dimensions. The copy -// buffer deallocation will be applied to %2 in block bb3. +// BufferDeallocation expected behavior: The existing AllocOp has a dynamic +// dependency to block argument %0 in bb2. Since the dynamic type is passed +// to bb3 via the block argument %2, it is currently required to allocate a +// temporary buffer for %2 that gets copies of %arg0 and %1 with their +// appropriate shape dimensions. The copy buffer deallocation will be applied +// to %2 in block bb3. #map0 = affine_map<(d0) -> (d0)> @@ -91,6 +100,7 @@ // CHECK: %[[DIM0:.*]] = dim // CHECK-NEXT: %[[ALLOC0:.*]] = alloc(%[[DIM0]]) // CHECK-NEXT: linalg.copy(%{{.*}}, %[[ALLOC0]]) +// CHECK-NEXT: br ^bb3(%[[ALLOC0]] // CHECK: ^bb2(%[[IDX:.*]]:{{.*}}) // CHECK-NEXT: %[[ALLOC1:.*]] = alloc(%[[IDX]]) // CHECK-NEXT: linalg.generic @@ -118,18 +128,17 @@ // bb6 // | // bb7 -// BufferPlacement Expected Behaviour: It should not move the existing AllocOp -// to any other block since the alloc has a dynamic dependency to block argument -// %0 in bb2. Since the dynamic type is passed to bb5 via the block argument %2 -// and to bb6 via block argument %3, it is currently required to allocate -// temporary buffers for %2 and %3 that gets copies of %1 and %arg0 1 with their -// appropriate shape dimensions. The copy buffer deallocations will be applied -// to %2 in block bb5 and to %3 in block bb6. Furthermore, there should be no -// copy inserted for %4. +// BufferDeallocation expected behavior: The existing AllocOp has a dynamic +// dependency to block argument %0 in bb2. Since the dynamic type is passed to +// bb5 via the block argument %2 and to bb6 via block argument %3, it is +// currently required to allocate temporary buffers for %2 and %3 that gets +// copies of %1 and %arg0 1 with their appropriate shape dimensions. The copy +// buffer deallocations will be applied to %2 in block bb5 and to %3 in block +// bb6. Furthermore, there should be no copy inserted for %4. 
#map0 = affine_map<(d0) -> (d0)> -// CHECK-LABEL: func @condBranchDynamicType +// CHECK-LABEL: func @condBranchDynamicTypeNested func @condBranchDynamicTypeNested( %arg0: i1, %arg1: memref, @@ -168,6 +177,7 @@ // CHECK: %[[DIM0:.*]] = dim // CHECK-NEXT: %[[ALLOC0:.*]] = alloc(%[[DIM0]]) // CHECK-NEXT: linalg.copy(%{{.*}}, %[[ALLOC0]]) +// CHECK-NEXT: br ^bb6 // CHECK: ^bb2(%[[IDX:.*]]:{{.*}}) // CHECK-NEXT: %[[ALLOC1:.*]] = alloc(%[[IDX]]) // CHECK-NEXT: linalg.generic @@ -192,8 +202,8 @@ // ----- // Test Case: Existing AllocOp with no users. -// BufferPlacement Expected Behaviour: It should insert a DeallocOp right before -// ReturnOp. +// BufferDeallocation expected behavior: It should insert a DeallocOp right +// before ReturnOp. // CHECK-LABEL: func @emptyUsesValue func @emptyUsesValue(%arg0: memref<4xf32>) { @@ -212,9 +222,9 @@ // | bb1 <- Initial position of AllocOp // \ / // bb2 -// BufferPlacement Expected Behaviour: It should move the existing AllocOp to -// the entry block and insert a DeallocOp at the exit block after CopyOp since -// %1 is an alias for %0 and %arg1. +// BufferDeallocation expected behavior: It should insert a DeallocOp at the +// exit block after CopyOp since %1 is an alias for %0 and %arg1. Furthermore, +// we have to insert a copy and an alloc in the beginning of the function. #map0 = affine_map<(d0) -> (d0)> @@ -238,10 +248,16 @@ return } -// CHECK-NEXT: %[[ALLOC:.*]] = alloc() +// CHECK-NEXT: %[[ALLOC0:.*]] = alloc() +// CHECK-NEXT: linalg.copy // CHECK-NEXT: cond_br +// CHECK: %[[ALLOC1:.*]] = alloc() +// CHECK-NEXT: linalg.generic +// CHECK: %[[ALLOC2:.*]] = alloc() +// CHECK-NEXT: linalg.copy +// CHECK-NEXT: dealloc %[[ALLOC1]] // CHECK: linalg.copy -// CHECK-NEXT: dealloc %[[ALLOC]] +// CHECK-NEXT: dealloc // CHECK-NEXT: return // ----- @@ -252,9 +268,8 @@ // | bb1 // \ / // bb2 -// BufferPlacement Expected Behaviour: It shouldn't move the alloc position. It -// only inserts a DeallocOp at the exit block after CopyOp since %1 is an alias -// for %0 and %arg1. +// BufferDeallocation expected behavior: It only inserts a DeallocOp at the +// exit block after CopyOp since %1 is an alias for %0 and %arg1. #map0 = affine_map<(d0) -> (d0)> @@ -289,10 +304,10 @@ // bb1 bb2 // \ / // bb3 <- Initial position of the second AllocOp -// BufferPlacement Expected Behaviour: It shouldn't move the AllocOps. It only -// inserts two missing DeallocOps in the exit block. %5 is an alias for %0. -// Therefore, the DeallocOp for %0 should occur after the last GenericOp. The -// Dealloc for %7 should happen after the CopyOp. +// BufferDeallocation expected behavior: It only inserts two missing +// DeallocOps in the exit block. %5 is an alias for %0. Therefore, the +// DeallocOp for %0 should occur after the last GenericOp. The Dealloc for %7 +// should happen after the CopyOp. #map0 = affine_map<(d0) -> (d0)> @@ -347,9 +362,8 @@ // bb1 bb2 // \ / // bb3 -// BufferPlacement Expected Behaviour: It shouldn't move the AllocOp. It only -// inserts a missing DeallocOp in the exit block since %5 or %6 are the latest -// aliases of %0. +// BufferDeallocation expected behavior: It only inserts a missing DeallocOp +// in the exit block since %5 or %6 are the latest aliases of %0. 
#map0 = affine_map<(d0) -> (d0)> @@ -378,7 +392,8 @@ } // CHECK-NEXT: %[[FIRST_ALLOC:.*]] = alloc() -// CHECK: dealloc %[[FIRST_ALLOC]] +// CHECK: linalg.copy +// CHECK-NEXT: dealloc %[[FIRST_ALLOC]] // CHECK-NEXT: return // ----- @@ -392,8 +407,8 @@ // \ \ / // \ / // bb5 <- Initial position of the second AllocOp -// BufferPlacement Expected Behaviour: AllocOps shouldn't be moved. -// Two missing DeallocOps should be inserted in the exit block. +// BufferDeallocation expected behavior: Two missing DeallocOps should be +// inserted in the exit block. #map0 = affine_map<(d0) -> (d0)> @@ -447,8 +462,8 @@ // ----- // Test Case: Dead operations in a single block. -// BufferPlacement Expected Behaviour: It shouldn't move the AllocOps. It only -// inserts the two missing DeallocOps after the last GenericOp. +// BufferDeallocation expected behavior: It only inserts the two missing +// DeallocOps after the last GenericOp. #map0 = affine_map<(d0) -> (d0)> @@ -481,7 +496,8 @@ // CHECK-NEXT: %[[FIRST_ALLOC:.*]] = alloc() // CHECK-NEXT: linalg.generic {{.*}} ins(%[[ARG0]]{{.*}}outs(%[[FIRST_ALLOC]] // CHECK: %[[SECOND_ALLOC:.*]] = alloc() -// CHECK-NEXT: linalg.generic {{.*}} ins(%[[FIRST_ALLOC]]{{.*}}outs(%[[SECOND_ALLOC]] +// CHECK-NEXT: linalg.generic {{.*}} ins +// CHECK-SAME: (%[[FIRST_ALLOC]]{{.*}}outs(%[[SECOND_ALLOC]] // CHECK: dealloc // CHECK-NEXT: dealloc // CHECK-NEXT: return @@ -494,9 +510,10 @@ // Initial pos of the 1st AllocOp -> bb1 bb2 <- Initial pos of the 2nd AllocOp // \ / // bb3 -// BufferPlacement Expected Behaviour: Both AllocOps should be moved to the -// entry block. Both missing DeallocOps should be moved to the exit block after -// CopyOp since %arg2 is an alias for %0 and %1. +// BufferDeallocation expected behavior: We need to introduce a copy for each +// buffer since the buffers are passed to bb3. The both missing DeallocOps are +// inserted in the respective block of the allocs. The copy is freed in the exit +// block. #map0 = affine_map<(d0) -> (d0)> @@ -535,11 +552,25 @@ return } -// CHECK-NEXT: %{{.*}} = alloc() -// CHECK-NEXT: %{{.*}} = alloc() +// CHECK-NEXT: cond_br +// CHECK: ^bb1 +// CHECK: ^bb1 +// CHECK: %[[ALLOC0:.*]] = alloc() +// CHECK-NEXT: linalg.generic +// CHECK: %[[ALLOC1:.*]] = alloc() +// CHECK-NEXT: linalg.copy +// CHECK-NEXT: dealloc %[[ALLOC0]] +// CHECK-NEXT: br ^bb3(%[[ALLOC1]] +// CHECK-NEXT: ^bb2 +// CHECK-NEXT: %[[ALLOC2:.*]] = alloc() +// CHECK-NEXT: linalg.generic +// CHECK: %[[ALLOC3:.*]] = alloc() +// CHECK-NEXT: linalg.copy +// CHECK-NEXT: dealloc %[[ALLOC2]] +// CHECK-NEXT: br ^bb3(%[[ALLOC3]] +// CHECK-NEXT: ^bb3(%[[ALLOC4:.*]]:{{.*}}) // CHECK: linalg.copy -// CHECK-NEXT: dealloc -// CHECK-NEXT: dealloc +// CHECK-NEXT: dealloc %[[ALLOC4]] // CHECK-NEXT: return // ----- @@ -551,8 +582,8 @@ // bb1 bb2 <- Initial position of AllocOp // \ / // bb3 -// BufferPlacement Expected Behaviour: It should move the AllocOp to the entry -// block. The existing DeallocOp should be moved to exit block. +// BufferDeallocation expected behavior: The existing DeallocOp should be +// moved to exit block. 
#map0 = affine_map<(d0) -> (d0)> @@ -561,11 +592,11 @@ %cond: i1, %arg0: memref<2xf32>, %arg1: memref<2xf32>) { + %1 = alloc() : memref<2xf32> cond_br %cond, ^bb1, ^bb2 ^bb1: br ^exit(%arg0 : memref<2xf32>) ^bb2: - %1 = alloc() : memref<2xf32> linalg.generic { indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} @@ -582,9 +613,10 @@ return } -// CHECK-NEXT: %{{.*}} = alloc() +// CHECK-NEXT: %[[ALLOC0:.*]] = alloc() +// CHECK-NEXT: cond_br // CHECK: linalg.copy -// CHECK-NEXT: dealloc +// CHECK-NEXT: dealloc %[[ALLOC0]] // CHECK-NEXT: return // ----- @@ -611,8 +643,9 @@ return } +// CHECK-NEXT: %[[ALLOC0:.*]] = alloc() // CHECK: linalg.copy -// CHECK-NEXT: dealloc +// CHECK-NEXT: dealloc %[[ALLOC0]] // ----- @@ -638,33 +671,41 @@ return } +// CHECK-NEXT: %[[ALLOC0:.*]] = alloc() // CHECK: linalg.copy -// CHECK-NEXT: dealloc +// CHECK-NEXT: dealloc %[[ALLOC0]] // ----- -// Test Case: Nested regions - This test defines a GenericOp inside the region of -// another GenericOp. -// BufferPlacement Expected Behaviour: The AllocOp of inner GenericOp should remain -// inside the region of outer GenericOp and it should insert the missing DeallocOp -// in the same region. The AllocOp of the outer GenericOp should be moved to the -// entry block and its missing DeallocOp should be inserted after Linalg.Copy. +// Test Case: Nested regions - This test defines a GenericOp inside the region +// of another GenericOp. +// BufferDeallocation expected behavior: The AllocOp of inner GenericOp should +// remain inside the region of outer GenericOp and it should insert the missing +// DeallocOp in the same region. The missing DeallocOp should be inserted after +// Linalg.Copy. #map0 = affine_map<(d0) -> (d0)> // CHECK-LABEL: func @nested_regions_and_cond_branch -func @nested_regions_and_cond_branch(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { +func @nested_regions_and_cond_branch( + %arg0: i1, + %arg1: memref<2xf32>, + %arg2: memref<2xf32>) { cond_br %arg0, ^bb1, ^bb2 ^bb1: br ^bb3(%arg1 : memref<2xf32>) ^bb2: %0 = alloc() : memref<2xf32> - linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + linalg.generic { + indexing_maps = [#map0, #map0], + iterator_types = ["parallel"]} ins(%arg1: memref<2xf32>) outs(%0: memref<2xf32>) { ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): %1 = alloc() : memref<2xf32> - linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + linalg.generic { + indexing_maps = [#map0, #map0], + iterator_types = ["parallel"]} ins(%arg1: memref<2xf32>) outs(%1: memref<2xf32>) { ^bb0(%gen2_arg0: f32, %gen2_arg1: f32): @@ -680,29 +721,37 @@ return } // CHECK: (%[[cond:.*]]: {{.*}}, %[[ARG1:.*]]: {{.*}}, %{{.*}}: {{.*}}) -// CHECK-NEXT: %[[GENERIC1_ALLOC:.*]] = alloc() // CHECK-NEXT: cond_br %[[cond]], ^[[BB1:.*]], ^[[BB2:.*]] +// CHECK: %[[ALLOC0:.*]] = alloc() +// CHECK-NEXT: linalg.copy(%[[ARG1]], %[[ALLOC0]]) // CHECK: ^[[BB2]]: -// CHECK-NEXT: linalg.generic {{{.*}}} ins(%[[ARG1]]{{.*}}outs(%[[GENERIC1_ALLOC]] -// CHECK: %[[GENERIC2_ALLOC:.*]] = alloc() -// CHECK-NEXT: linalg.generic {{{.*}}} ins(%[[ARG1]]{{.*}}outs(%[[GENERIC2_ALLOC]] -// CHECK: dealloc %[[GENERIC2_ALLOC]] +// CHECK: %[[ALLOC1:.*]] = alloc() +// CHECK-NEXT: linalg.generic {{{.*}}} ins(%[[ARG1]]{{.*}}outs(%[[ALLOC1]] +// CHECK: %[[ALLOC2:.*]] = alloc() +// CHECK-NEXT: linalg.generic {{{.*}}} ins(%[[ARG1]]{{.*}}outs(%[[ALLOC2]] +// CHECK: dealloc %[[ALLOC2]] // CHECK-NEXT: %{{.*}} = exp +// CHECK: %[[ALLOC3:.*]] = alloc() +// CHECK-NEXT: linalg.copy(%[[ALLOC1]], 
%[[ALLOC3]]) +// CHECK-NEXT: dealloc %[[ALLOC1]] // CHECK: ^[[BB3:.*]]({{.*}}): // CHECK: linalg.copy -// CHECK-NEXT: dealloc %[[GENERIC1_ALLOC]] +// CHECK-NEXT: dealloc // ----- // Test Case: buffer deallocation escaping -// BufferPlacement Expected Behaviour: It must not dealloc %arg1 and %x +// BufferDeallocation expected behavior: It must not dealloc %arg1 and %x // since they are operands of return operation and should escape from // deallocating. It should dealloc %y after linalg.copy. #map0 = affine_map<(d0) -> (d0)> // CHECK-LABEL: func @memref_in_function_results -func @memref_in_function_results(%arg0: memref<5xf32>, %arg1: memref<10xf32>, %arg2: memref<5xf32>) -> (memref<10xf32>, memref<15xf32>) { +func @memref_in_function_results( + %arg0: memref<5xf32>, + %arg1: memref<10xf32>, + %arg2: memref<5xf32>) -> (memref<10xf32>, memref<15xf32>) { %x = alloc() : memref<15xf32> %y = alloc() : memref<5xf32> linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} @@ -715,19 +764,20 @@ linalg.copy(%y, %arg2) : memref<5xf32>, memref<5xf32> return %arg1, %x : memref<10xf32>, memref<15xf32> } -// CHECK: (%[[ARG0:.*]]: memref<5xf32>, %[[ARG1:.*]]: memref<10xf32>, %[[RESULT:.*]]: memref<5xf32>) -// CHECK: %[[X:.*]] = alloc() -// CHECK: %[[Y:.*]] = alloc() -// CHECK: linalg.copy -// CHECK: dealloc %[[Y]] -// CHECK: return %[[ARG1]], %[[X]] +// CHECK: (%[[ARG0:.*]]: memref<5xf32>, %[[ARG1:.*]]: memref<10xf32>, +// CHECK-SAME: %[[RESULT:.*]]: memref<5xf32>) +// CHECK: %[[X:.*]] = alloc() +// CHECK: %[[Y:.*]] = alloc() +// CHECK: linalg.copy +// CHECK: dealloc %[[Y]] +// CHECK: return %[[ARG1]], %[[X]] // ----- // Test Case: nested region control flow -// The alloc position of %1 does not need to be changed and flows through -// both if branches until it is finally returned. Hence, it does not -// require a specific dealloc operation. However, %3 requires a dealloc. +// The alloc %1 flows through both if branches until it is finally returned. +// Hence, it does not require a specific dealloc operation. However, %3 +// requires a dealloc. // CHECK-LABEL: func @nested_region_control_flow func @nested_region_control_flow( @@ -756,10 +806,8 @@ // Test Case: nested region control flow with a nested buffer allocation in a // divergent branch. -// The alloc positions of %1, %3 does not need to be changed since -// BufferPlacement does not move allocs out of nested regions at the moment. -// However, since %3 is allocated and "returned" in a divergent branch, we have -// to allocate a temporary buffer (like in condBranchDynamicTypeNested). +// Buffer deallocation places a copy for both %1 and %3, since they are +// returned in the end. // CHECK-LABEL: func @nested_region_control_flow_div func @nested_region_control_flow_div( @@ -791,61 +839,9 @@ // ----- -// Test Case: deeply nested region control flow with a nested buffer allocation -// in a divergent branch. -// The alloc positions of %1, %4 and %5 does not need to be changed since -// BufferPlacement does not move allocs out of nested regions at the moment. -// However, since %4 is allocated and "returned" in a divergent branch, we have -// to allocate several temporary buffers (like in condBranchDynamicTypeNested). 
- -// CHECK-LABEL: func @nested_region_control_flow_div_nested -func @nested_region_control_flow_div_nested( - %arg0 : index, - %arg1 : index) -> memref { - %0 = cmpi "eq", %arg0, %arg1 : index - %1 = alloc(%arg0, %arg0) : memref - %2 = scf.if %0 -> (memref) { - %3 = scf.if %0 -> (memref) { - scf.yield %1 : memref - } else { - %4 = alloc(%arg0, %arg1) : memref - scf.yield %4 : memref - } - scf.yield %3 : memref - } else { - %5 = alloc(%arg1, %arg1) : memref - scf.yield %5 : memref - } - return %2 : memref -} -// CHECK: %[[ALLOC0:.*]] = alloc(%arg0, %arg0) -// CHECK-NEXT: %[[ALLOC1:.*]] = scf.if -// CHECK-NEXT: %[[ALLOC2:.*]] = scf.if -// CHECK: %[[ALLOC3:.*]] = alloc -// CHECK-NEXT: linalg.copy(%[[ALLOC0]], %[[ALLOC3]]) -// CHECK: scf.yield %[[ALLOC3]] -// CHECK: %[[ALLOC4:.*]] = alloc(%arg0, %arg1) -// CHECK: %[[ALLOC5:.*]] = alloc -// CHECK-NEXT: linalg.copy(%[[ALLOC4]], %[[ALLOC5]]) -// CHECK: dealloc %[[ALLOC4]] -// CHECK: scf.yield %[[ALLOC5]] -// CHECK: %[[ALLOC6:.*]] = alloc -// CHECK-NEXT: linalg.copy(%[[ALLOC2]], %[[ALLOC6]]) -// CHECK: dealloc %[[ALLOC2]] -// CHECK: scf.yield %[[ALLOC6]] -// CHECK: %[[ALLOC7:.*]] = alloc(%arg1, %arg1) -// CHECK: %[[ALLOC8:.*]] = alloc -// CHECK-NEXT: linalg.copy(%[[ALLOC7]], %[[ALLOC8]]) -// CHECK: dealloc %[[ALLOC7]] -// CHECK: scf.yield %[[ALLOC8]] -// CHECK: dealloc %[[ALLOC0]] -// CHECK-NEXT: return %[[ALLOC1]] - -// ----- - // Test Case: nested region control flow within a region interface. -// The alloc positions of %0 does not need to be changed and no copies are -// required in this case since the allocation finally escapes the method. +// No copies are required in this case since the allocation finally escapes +// the method. // CHECK-LABEL: func @inner_region_control_flow func @inner_region_control_flow(%arg0 : index) -> memref { @@ -875,54 +871,6 @@ // ----- -// Test Case: nested region control flow within a region interface including an -// allocation in a divergent branch. -// The alloc positions of %1 and %2 does not need to be changed since -// BufferPlacement does not move allocs out of nested regions at the moment. -// However, since %2 is allocated and yielded in a divergent branch, we have -// to allocate several temporary buffers (like in condBranchDynamicTypeNested). 
-
-// CHECK-LABEL: func @inner_region_control_flow_div
-func @inner_region_control_flow_div(
-  %arg0 : index,
-  %arg1 : index) -> memref {
-  %0 = alloc(%arg0, %arg0) : memref
-  %1 = test.region_if %0 : memref -> (memref) then {
-    ^bb0(%arg2 : memref):
-      test.region_if_yield %arg2 : memref
-  } else {
-    ^bb0(%arg2 : memref):
-      %2 = alloc(%arg0, %arg1) : memref
-      test.region_if_yield %2 : memref
-  } join {
-    ^bb0(%arg2 : memref):
-      test.region_if_yield %arg2 : memref
-  }
-  return %1 : memref
-}
-
-// CHECK: %[[ALLOC0:.*]] = alloc(%arg0, %arg0)
-// CHECK-NEXT: %[[ALLOC1:.*]] = test.region_if
-// CHECK-NEXT: ^bb0(%[[ALLOC2:.*]]:{{.*}}):
-// CHECK: %[[ALLOC3:.*]] = alloc
-// CHECK-NEXT: linalg.copy(%[[ALLOC2]], %[[ALLOC3]])
-// CHECK-NEXT: test.region_if_yield %[[ALLOC3]]
-// CHECK: ^bb0(%[[ALLOC4:.*]]:{{.*}}):
-// CHECK: %[[ALLOC5:.*]] = alloc
-// CHECK: %[[ALLOC6:.*]] = alloc
-// CHECK-NEXT: linalg.copy(%[[ALLOC5]], %[[ALLOC6]])
-// CHECK-NEXT: dealloc %[[ALLOC5]]
-// CHECK-NEXT: test.region_if_yield %[[ALLOC6]]
-// CHECK: ^bb0(%[[ALLOC7:.*]]:{{.*}}):
-// CHECK: %[[ALLOC8:.*]] = alloc
-// CHECK-NEXT: linalg.copy(%[[ALLOC7]], %[[ALLOC8]])
-// CHECK-NEXT: dealloc %[[ALLOC7]]
-// CHECK-NEXT: test.region_if_yield %[[ALLOC8]]
-// CHECK: dealloc %[[ALLOC0]]
-// CHECK-NEXT: return %[[ALLOC1]]
-
-// -----
-
 // CHECK-LABEL: func @subview
 func @subview(%arg0 : index, %arg1 : index, %arg2 : memref) {
   %0 = alloc() : memref<64x4xf32, offset: 0, strides: [4, 1]>
@@ -944,6 +892,9 @@
 
 #map0 = affine_map<(d0) -> (d0)>
 
+// Test Case: In the presence of AllocaOps only the AllocOps have to be freed.
+// Therefore, all allocas are not handled.
+
 // CHECK-LABEL: func @condBranchAlloca
 func @condBranchAlloca(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
   cond_br %arg0, ^bb1, ^bb2
@@ -977,6 +928,10 @@
 
 #map0 = affine_map<(d0) -> (d0)>
 
+// Test Case: In the presence of AllocaOps only the AllocOps have to be freed.
+// Therefore, all allocas are not handled. In this case, only alloc %0 has a
+// dealloc.
+ // CHECK-LABEL: func @ifElseAlloca func @ifElseAlloca(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { %0 = alloc() : memref<2xf32> @@ -1024,7 +979,10 @@ #map0 = affine_map<(d0) -> (d0)> // CHECK-LABEL: func @ifElseNestedAlloca -func @ifElseNestedAlloca(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { +func @ifElseNestedAlloca( + %arg0: i1, + %arg1: memref<2xf32>, + %arg2: memref<2xf32>) { %0 = alloca() : memref<2xf32> linalg.generic { indexing_maps = [#map0, #map0], @@ -1074,18 +1032,25 @@ #map0 = affine_map<(d0) -> (d0)> // CHECK-LABEL: func @nestedRegionsAndCondBranchAlloca -func @nestedRegionsAndCondBranchAlloca(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { +func @nestedRegionsAndCondBranchAlloca( + %arg0: i1, + %arg1: memref<2xf32>, + %arg2: memref<2xf32>) { cond_br %arg0, ^bb1, ^bb2 ^bb1: br ^bb3(%arg1 : memref<2xf32>) ^bb2: %0 = alloc() : memref<2xf32> - linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + linalg.generic { + indexing_maps = [#map0, #map0], + iterator_types = ["parallel"]} ins(%arg1: memref<2xf32>) outs(%0: memref<2xf32>) { ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): %1 = alloca() : memref<2xf32> - linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + linalg.generic { + indexing_maps = [#map0, #map0], + iterator_types = ["parallel"]} ins(%arg1: memref<2xf32>) outs(%1: memref<2xf32>) { ^bb0(%gen2_arg0: f32, %gen2_arg1: f32): @@ -1101,16 +1066,22 @@ return } // CHECK: (%[[cond:.*]]: {{.*}}, %[[ARG1:.*]]: {{.*}}, %{{.*}}: {{.*}}) -// CHECK-NEXT: %[[ALLOC:.*]] = alloc() // CHECK-NEXT: cond_br %[[cond]], ^[[BB1:.*]], ^[[BB2:.*]] +// CHECK: ^[[BB1]]: +// CHECK: %[[ALLOC0:.*]] = alloc() +// CHECK-NEXT: linalg.copy // CHECK: ^[[BB2]]: -// CHECK-NEXT: linalg.generic {{{.*}}} ins(%[[ARG1]]{{.*}}outs(%[[ALLOC]] +// CHECK: %[[ALLOC1:.*]] = alloc() +// CHECK-NEXT: linalg.generic {{{.*}}} ins(%[[ARG1]]{{.*}}outs(%[[ALLOC1]] // CHECK: %[[ALLOCA:.*]] = alloca() // CHECK-NEXT: linalg.generic {{{.*}}} ins(%[[ARG1]]{{.*}}outs(%[[ALLOCA]] // CHECK: %{{.*}} = exp +// CHECK: %[[ALLOC2:.*]] = alloc() +// CHECK-NEXT: linalg.copy +// CHECK-NEXT: dealloc %[[ALLOC1]] // CHECK: ^[[BB3:.*]]({{.*}}): // CHECK: linalg.copy -// CHECK-NEXT: dealloc %[[ALLOC]] +// CHECK-NEXT: dealloc // ----- @@ -1139,10 +1110,8 @@ // ----- // Test Case: structured control-flow loop using a nested alloc. -// The alloc positions of %3 will not be changed, but the iteration argument -// %iterBuf has to be freed before yielding %3 to avoid memory leaks. - -// ----- +// The iteration argument %iterBuf has to be freed before yielding %3 to avoid +// memory leaks. 
// CHECK-LABEL: func @loop_alloc func @loop_alloc( @@ -1166,7 +1135,8 @@ // CHECK-NEXT: dealloc %[[ALLOC0]] // CHECK-NEXT: %[[ALLOC1:.*]] = alloc() // CHECK: linalg.copy(%arg3, %[[ALLOC1]]) -// CHECK: %[[ALLOC2:.*]] = scf.for {{.*}} iter_args(%[[IALLOC:.*]] = %[[ALLOC1]] +// CHECK: %[[ALLOC2:.*]] = scf.for {{.*}} iter_args +// CHECK-SAME: (%[[IALLOC:.*]] = %[[ALLOC1]] // CHECK: cmpi // CHECK: dealloc %[[IALLOC]] // CHECK: %[[ALLOC3:.*]] = alloc() @@ -1251,7 +1221,8 @@ // CHECK: %[[ALLOC0:.*]] = alloc() // CHECK: %[[ALLOC1:.*]] = alloc() // CHECK-NEXT: linalg.copy(%arg3, %[[ALLOC1]]) -// CHECK-NEXT: %[[ALLOC2:.*]] = scf.for {{.*}} iter_args(%[[IALLOC:.*]] = %[[ALLOC1]] +// CHECK-NEXT: %[[ALLOC2:.*]] = scf.for {{.*}} iter_args +// CHECK-SAME: (%[[IALLOC:.*]] = %[[ALLOC1]] // CHECK: dealloc %[[IALLOC]] // CHECK: %[[ALLOC3:.*]] = scf.if @@ -1362,7 +1333,7 @@ // ----- // Test Case: explicit control-flow loop with a dynamically allocated buffer. -// The BufferPlacement transformation should fail on this explicit +// The BufferDeallocation transformation should fail on this explicit // control-flow loop since they are not supported. // CHECK-LABEL: func @loop_dynalloc @@ -1397,7 +1368,7 @@ // ----- // Test Case: explicit control-flow loop with a dynamically allocated buffer. -// The BufferPlacement transformation should fail on this explicit +// The BufferDeallocation transformation should fail on this explicit // control-flow loop since they are not supported. // CHECK-LABEL: func @do_loop_alloc diff --git a/mlir/test/Transforms/buffer-hoisting.mlir b/mlir/test/Transforms/buffer-hoisting.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Transforms/buffer-hoisting.mlir @@ -0,0 +1,905 @@ +// RUN: mlir-opt -buffer-hoisting -split-input-file %s | FileCheck %s + +// This file checks the behaviour of BufferHoisting pass for moving Alloc +// operations to their correct positions. + +// Test Case: +// bb0 +// / \ +// bb1 bb2 <- Initial position of AllocOp +// \ / +// bb3 +// BufferHoisting expected behavior: It should move the existing AllocOp to +// the entry block. + +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @condBranch +func @condBranch(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { + cond_br %arg0, ^bb1, ^bb2 +^bb1: + br ^bb3(%arg1 : memref<2xf32>) +^bb2: + %0 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + br ^bb3(%0 : memref<2xf32>) +^bb3(%1: memref<2xf32>): + "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK-NEXT: %[[ALLOC:.*]] = alloc() +// CHECK-NEXT: cond_br + +// ----- + +// Test Case: +// bb0 +// / \ +// bb1 bb2 <- Initial position of AllocOp +// \ / +// bb3 +// BufferHoisting expected behavior: It should not move the existing AllocOp +// to any other block since the alloc has a dynamic dependency to block argument +// %0 in bb2. 
+ +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @condBranchDynamicType +func @condBranchDynamicType( + %arg0: i1, + %arg1: memref, + %arg2: memref, + %arg3: index) { + cond_br %arg0, ^bb1, ^bb2(%arg3: index) +^bb1: + br ^bb3(%arg1 : memref) +^bb2(%0: index): + %1 = alloc(%0) : memref + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg1: memref) + outs(%1: memref) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + br ^bb3(%1 : memref) +^bb3(%2: memref): + "linalg.copy"(%2, %arg2) : (memref, memref) -> () + return +} + +// CHECK-NEXT: cond_br +// CHECK: ^bb2 +// CHECK: ^bb2(%[[IDX:.*]]:{{.*}}) +// CHECK-NEXT: %[[ALLOC0:.*]] = alloc(%[[IDX]]) +// CHECK-NEXT: linalg.generic + +// ----- + +// Test Case: +// bb0 +// / \ +// bb1 bb2 <- Initial position of AllocOp +// | / \ +// | bb3 bb4 +// | \ / +// \ bb5 +// \ / +// bb6 +// | +// bb7 +// BufferHoisting expected behavior: It should not move the existing AllocOp +// to any other block since the alloc has a dynamic dependency to block argument +// %0 in bb2. + +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @condBranchDynamicTypeNested +func @condBranchDynamicTypeNested( + %arg0: i1, + %arg1: memref, + %arg2: memref, + %arg3: index) { + cond_br %arg0, ^bb1, ^bb2(%arg3: index) +^bb1: + br ^bb6(%arg1 : memref) +^bb2(%0: index): + %1 = alloc(%0) : memref + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg1: memref) + outs(%1: memref) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + cond_br %arg0, ^bb3, ^bb4 +^bb3: + br ^bb5(%1 : memref) +^bb4: + br ^bb5(%1 : memref) +^bb5(%2: memref): + br ^bb6(%2 : memref) +^bb6(%3: memref): + br ^bb7(%3 : memref) +^bb7(%4: memref): + "linalg.copy"(%4, %arg2) : (memref, memref) -> () + return +} + +// CHECK-NEXT: cond_br +// CHECK: ^bb2 +// CHECK: ^bb2(%[[IDX:.*]]:{{.*}}) +// CHECK-NEXT: %[[ALLOC0:.*]] = alloc(%[[IDX]]) +// CHECK-NEXT: linalg.generic + +// ----- + +// Test Case: +// bb0 +// / \ +// | bb1 <- Initial position of AllocOp +// \ / +// bb2 +// BufferHoisting expected behavior: It should move the existing AllocOp to +// the entry block. + +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @criticalEdge +func @criticalEdge(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { + cond_br %arg0, ^bb1, ^bb2(%arg1 : memref<2xf32>) +^bb1: + %0 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + br ^bb2(%0 : memref<2xf32>) +^bb2(%1: memref<2xf32>): + "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK-NEXT: %[[ALLOC:.*]] = alloc() +// CHECK-NEXT: cond_br + +// ----- + +// Test Case: +// bb0 <- Initial position of the first AllocOp +// / \ +// bb1 bb2 +// \ / +// bb3 <- Initial position of the second AllocOp +// BufferHoisting expected behavior: It shouldn't move the AllocOps. 
+ +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @ifElse +func @ifElse(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + cond_br %arg0, + ^bb1(%arg1, %0 : memref<2xf32>, memref<2xf32>), + ^bb2(%0, %arg1 : memref<2xf32>, memref<2xf32>) +^bb1(%1: memref<2xf32>, %2: memref<2xf32>): + br ^bb3(%1, %2 : memref<2xf32>, memref<2xf32>) +^bb2(%3: memref<2xf32>, %4: memref<2xf32>): + br ^bb3(%3, %4 : memref<2xf32>, memref<2xf32>) +^bb3(%5: memref<2xf32>, %6: memref<2xf32>): + %7 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%7: memref<2xf32>) + outs(%7: memref<2xf32>) { + ^bb0(%gen2_arg0: f32, %gen2_arg1: f32): + %tmp2 = exp %gen2_arg0 : f32 + linalg.yield %tmp2 : f32 + } + "linalg.copy"(%7, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK-NEXT: %[[ALLOC0:.*]] = alloc() +// CHECK-NEXT: linalg.generic +// CHECK: br ^bb3 +// CHECK: br ^bb3 +// CHECK-NEXT: ^bb3 +// CHECK: %[[ALLOC1:.*]] = alloc() +// CHECK-NEXT: linalg.generic +// CHECK: linalg.copy(%[[ALLOC1]] +// CHECK-NEXT: return + +// ----- + +// Test Case: No users for buffer in if-else CFG +// bb0 <- Initial position of AllocOp +// / \ +// bb1 bb2 +// \ / +// bb3 +// BufferHoisting expected behavior: It shouldn't move the AllocOp. + +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @ifElseNoUsers +func @ifElseNoUsers(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + cond_br %arg0, + ^bb1(%arg1, %0 : memref<2xf32>, memref<2xf32>), + ^bb2(%0, %arg1 : memref<2xf32>, memref<2xf32>) +^bb1(%1: memref<2xf32>, %2: memref<2xf32>): + br ^bb3(%1, %2 : memref<2xf32>, memref<2xf32>) +^bb2(%3: memref<2xf32>, %4: memref<2xf32>): + br ^bb3(%3, %4 : memref<2xf32>, memref<2xf32>) +^bb3(%5: memref<2xf32>, %6: memref<2xf32>): + "linalg.copy"(%arg1, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK-NEXT: %[[ALLOC0:.*]] = alloc() +// CHECK-NEXT: linalg.generic + +// ----- + +// Test Case: +// bb0 <- Initial position of the first AllocOp +// / \ +// bb1 bb2 +// | / \ +// | bb3 bb4 +// \ \ / +// \ / +// bb5 <- Initial position of the second AllocOp +// BufferHoisting expected behavior: AllocOps shouldn't be moved. 
+ +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @ifElseNested +func @ifElseNested(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + cond_br %arg0, + ^bb1(%arg1, %0 : memref<2xf32>, memref<2xf32>), + ^bb2(%0, %arg1 : memref<2xf32>, memref<2xf32>) +^bb1(%1: memref<2xf32>, %2: memref<2xf32>): + br ^bb5(%1, %2 : memref<2xf32>, memref<2xf32>) +^bb2(%3: memref<2xf32>, %4: memref<2xf32>): + cond_br %arg0, ^bb3(%3 : memref<2xf32>), ^bb4(%4 : memref<2xf32>) +^bb3(%5: memref<2xf32>): + br ^bb5(%5, %3 : memref<2xf32>, memref<2xf32>) +^bb4(%6: memref<2xf32>): + br ^bb5(%3, %6 : memref<2xf32>, memref<2xf32>) +^bb5(%7: memref<2xf32>, %8: memref<2xf32>): + %9 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%7: memref<2xf32>) + outs(%9: memref<2xf32>) { + ^bb0(%gen2_arg0: f32, %gen2_arg1: f32): + %tmp2 = exp %gen2_arg0 : f32 + linalg.yield %tmp2 : f32 + } + "linalg.copy"(%9, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK-NEXT: %[[ALLOC0:.*]] = alloc() +// CHECK-NEXT: linalg.generic +// CHECK: br ^bb5 +// CHECK: br ^bb5 +// CHECK: br ^bb5 +// CHECK-NEXT: ^bb5 +// CHECK: %[[ALLOC1:.*]] = alloc() +// CHECK-NEXT: linalg.generic + +// ----- + +// Test Case: Dead operations in a single block. +// BufferHoisting expected behavior: It shouldn't move the AllocOps. + +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @redundantOperations +func @redundantOperations(%arg0: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg0: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + %1 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%0: memref<2xf32>) + outs(%1: memref<2xf32>) { + ^bb0(%gen2_arg0: f32, %gen2_arg1: f32): + %tmp2 = exp %gen2_arg0 : f32 + linalg.yield %tmp2 : f32 + } + return +} + +// CHECK-NEXT: %[[ALLOC0:.*]] = alloc() +// CHECK-NEXT: linalg.generic +// CHECK: %[[ALLOC1:.*]] = alloc() +// CHECK-NEXT: linalg.generic + +// ----- + +// Test Case: +// bb0 +// / \ +// Initial pos of the 1st AllocOp -> bb1 bb2 <- Initial pos of the 2nd AllocOp +// \ / +// bb3 +// BufferHoisting expected behavior: Both AllocOps should be moved to the +// entry block. 
+ +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @moving_alloc_and_inserting_missing_dealloc +func @moving_alloc_and_inserting_missing_dealloc( + %cond: i1, + %arg0: memref<2xf32>, + %arg1: memref<2xf32>) { + cond_br %cond, ^bb1, ^bb2 +^bb1: + %0 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg0: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + br ^exit(%0 : memref<2xf32>) +^bb2: + %1 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg0: memref<2xf32>) + outs(%1: memref<2xf32>) { + ^bb0(%gen2_arg0: f32, %gen2_arg1: f32): + %tmp2 = exp %gen2_arg0 : f32 + linalg.yield %tmp2 : f32 + } + br ^exit(%1 : memref<2xf32>) +^exit(%arg2: memref<2xf32>): + "linalg.copy"(%arg2, %arg1) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK-NEXT: %{{.*}} = alloc() +// CHECK-NEXT: %{{.*}} = alloc() +// CHECK-NEXT: cond_br + +// ----- + +// Test Case: Invalid position of the DeallocOp. There is a user after +// deallocation. +// bb0 +// / \ +// bb1 bb2 <- Initial position of AllocOp +// \ / +// bb3 +// BufferHoisting expected behavior: It should move the AllocOp to the entry +// block. + +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @moving_invalid_dealloc_op_complex +func @moving_invalid_dealloc_op_complex( + %cond: i1, + %arg0: memref<2xf32>, + %arg1: memref<2xf32>) { + cond_br %cond, ^bb1, ^bb2 +^bb1: + br ^exit(%arg0 : memref<2xf32>) +^bb2: + %1 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg0: memref<2xf32>) + outs(%1: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + dealloc %1 : memref<2xf32> + br ^exit(%1 : memref<2xf32>) +^exit(%arg2: memref<2xf32>): + "linalg.copy"(%arg2, %arg1) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK-NEXT: %{{.*}} = alloc() +// CHECK-NEXT: cond_br + +// ----- + +// Test Case: Nested regions - This test defines a GenericOp inside the region +// of another GenericOp. +// BufferHoisting expected behavior: The AllocOp of inner GenericOp should +// remain inside the region of outer GenericOp. The AllocOp of the outer +// GenericOp should be moved to the entry block. 
+ +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @nested_regions_and_cond_branch +func @nested_regions_and_cond_branch( + %arg0: i1, + %arg1: memref<2xf32>, + %arg2: memref<2xf32>) { + cond_br %arg0, ^bb1, ^bb2 +^bb1: + br ^bb3(%arg1 : memref<2xf32>) +^bb2: + %0 = alloc() : memref<2xf32> + linalg.generic { + indexing_maps = [#map0, #map0], + iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %1 = alloc() : memref<2xf32> + linalg.generic { + indexing_maps = [#map0, #map0], + iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%1: memref<2xf32>) { + ^bb0(%gen2_arg0: f32, %gen2_arg1: f32): + %tmp2 = exp %gen2_arg0 : f32 + linalg.yield %tmp2 : f32 + } + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + br ^bb3(%0 : memref<2xf32>) +^bb3(%1: memref<2xf32>): + "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + return +} +// CHECK-NEXT: %[[ALLOC0:.*]] = alloc() +// CHECK-NEXT: cond_br +// CHECK: linalg.generic +// CHECK: %[[ALLOC1:.*]] = alloc() +// CHECK-NEXT: linalg.generic + +// ----- + +// Test Case: nested region control flow +// The alloc position of %1 does not need to be changed and flows through +// both if branches until it is finally returned. + +// CHECK-LABEL: func @nested_region_control_flow +func @nested_region_control_flow( + %arg0 : index, + %arg1 : index) -> memref { + %0 = cmpi "eq", %arg0, %arg1 : index + %1 = alloc(%arg0, %arg0) : memref + %2 = scf.if %0 -> (memref) { + scf.yield %1 : memref + } else { + %3 = alloc(%arg0, %arg1) : memref + scf.yield %1 : memref + } + return %2 : memref +} + +// CHECK: %[[ALLOC0:.*]] = alloc(%arg0, %arg0) +// CHECK-NEXT: %{{.*}} = scf.if +// CHECK: else +// CHECK-NEXT: %[[ALLOC1:.*]] = alloc(%arg0, %arg1) + +// ----- + +// Test Case: nested region control flow with a nested buffer allocation in a +// divergent branch. +// The alloc positions of %1 does not need to be changed. %3 is moved upwards. + +// CHECK-LABEL: func @nested_region_control_flow_div +func @nested_region_control_flow_div( + %arg0 : index, + %arg1 : index) -> memref { + %0 = cmpi "eq", %arg0, %arg1 : index + %1 = alloc(%arg0, %arg0) : memref + %2 = scf.if %0 -> (memref) { + scf.yield %1 : memref + } else { + %3 = alloc(%arg0, %arg1) : memref + scf.yield %3 : memref + } + return %2 : memref +} + +// CHECK: %[[ALLOC0:.*]] = alloc(%arg0, %arg0) +// CHECK-NEXT: %[[ALLOC1:.*]] = alloc(%arg0, %arg1) +// CHECK-NEXT: %{{.*}} = scf.if + +// ----- + +// Test Case: deeply nested region control flow with a nested buffer allocation +// in a divergent branch. +// The alloc position of %1 does not need to be changed. Allocs %4 and %5 are +// moved upwards. + +// CHECK-LABEL: func @nested_region_control_flow_div_nested +func @nested_region_control_flow_div_nested( + %arg0 : index, + %arg1 : index) -> memref { + %0 = cmpi "eq", %arg0, %arg1 : index + %1 = alloc(%arg0, %arg0) : memref + %2 = scf.if %0 -> (memref) { + %3 = scf.if %0 -> (memref) { + scf.yield %1 : memref + } else { + %4 = alloc(%arg0, %arg1) : memref + scf.yield %4 : memref + } + scf.yield %3 : memref + } else { + %5 = alloc(%arg1, %arg1) : memref + scf.yield %5 : memref + } + return %2 : memref +} +// CHECK: %[[ALLOC0:.*]] = alloc(%arg0, %arg0) +// CHECK-NEXT: %[[ALLOC1:.*]] = alloc(%arg0, %arg1) +// CHECK-NEXT: %[[ALLOC2:.*]] = alloc(%arg1, %arg1) +// CHECK-NEXT: %{{.*}} = scf.if + +// ----- + +// Test Case: nested region control flow within a region interface. 
+// The alloc positions of %0 does not need to be changed. + +// CHECK-LABEL: func @inner_region_control_flow +func @inner_region_control_flow(%arg0 : index) -> memref { + %0 = alloc(%arg0, %arg0) : memref + %1 = test.region_if %0 : memref -> (memref) then { + ^bb0(%arg1 : memref): + test.region_if_yield %arg1 : memref + } else { + ^bb0(%arg1 : memref): + test.region_if_yield %arg1 : memref + } join { + ^bb0(%arg1 : memref): + test.region_if_yield %arg1 : memref + } + return %1 : memref +} + +// CHECK: %[[ALLOC0:.*]] = alloc(%arg0, %arg0) +// CHECK-NEXT: {{.*}} test.region_if + +// ----- + +// Test Case: nested region control flow within a region interface including an +// allocation in a divergent branch. +// The alloc positions of %0 does not need to be changed. %2 is moved upwards. + +// CHECK-LABEL: func @inner_region_control_flow_div +func @inner_region_control_flow_div( + %arg0 : index, + %arg1 : index) -> memref { + %0 = alloc(%arg0, %arg0) : memref + %1 = test.region_if %0 : memref -> (memref) then { + ^bb0(%arg2 : memref): + test.region_if_yield %arg2 : memref + } else { + ^bb0(%arg2 : memref): + %2 = alloc(%arg0, %arg1) : memref + test.region_if_yield %2 : memref + } join { + ^bb0(%arg2 : memref): + test.region_if_yield %arg2 : memref + } + return %1 : memref +} + +// CHECK: %[[ALLOC0:.*]] = alloc(%arg0, %arg0) +// CHECK-NEXT: %[[ALLOC1:.*]] = alloc(%arg0, %arg1) +// CHECK-NEXT: {{.*}} test.region_if + +// ----- + +#map0 = affine_map<(d0) -> (d0)> + +// Test Case: Alloca operations shouldn't be moved. + +// CHECK-LABEL: func @condBranchAlloca +func @condBranchAlloca(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { + cond_br %arg0, ^bb1, ^bb2 +^bb1: + br ^bb3(%arg1 : memref<2xf32>) +^bb2: + %0 = alloca() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + br ^bb3(%0 : memref<2xf32>) +^bb3(%1: memref<2xf32>): + "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK-NEXT: cond_br +// CHECK: ^bb2 +// CHECK: ^bb2 +// CHECK-NEXT: %[[ALLOCA:.*]] = alloca() +// CHECK-NEXT: linalg.generic + +// ----- + +#map0 = affine_map<(d0) -> (d0)> + +// Test Case: Alloca operations shouldn't be moved. The alloc operation also +// shouldn't be moved analogously to the ifElseNested test. 
+ +// CHECK-LABEL: func @ifElseNestedAlloca +func @ifElseNestedAlloca( + %arg0: i1, + %arg1: memref<2xf32>, + %arg2: memref<2xf32>) { + %0 = alloca() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + cond_br %arg0, + ^bb1(%arg1, %0 : memref<2xf32>, memref<2xf32>), + ^bb2(%0, %arg1 : memref<2xf32>, memref<2xf32>) +^bb1(%1: memref<2xf32>, %2: memref<2xf32>): + br ^bb5(%1, %2 : memref<2xf32>, memref<2xf32>) +^bb2(%3: memref<2xf32>, %4: memref<2xf32>): + cond_br %arg0, ^bb3(%3 : memref<2xf32>), ^bb4(%4 : memref<2xf32>) +^bb3(%5: memref<2xf32>): + br ^bb5(%5, %3 : memref<2xf32>, memref<2xf32>) +^bb4(%6: memref<2xf32>): + br ^bb5(%3, %6 : memref<2xf32>, memref<2xf32>) +^bb5(%7: memref<2xf32>, %8: memref<2xf32>): + %9 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%7: memref<2xf32>) + outs(%9: memref<2xf32>) { + ^bb0(%gen2_arg0: f32, %gen2_arg1: f32): + %tmp2 = exp %gen2_arg0 : f32 + linalg.yield %tmp2 : f32 + } + "linalg.copy"(%9, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK-NEXT: %[[ALLOCA:.*]] = alloca() +// CHECK-NEXT: linalg.generic +// CHECK: ^bb5 +// CHECK: ^bb5 +// CHECK: ^bb5 +// CHECK-NEXT: ^bb5 +// CHECK-NEXT: %[[ALLOC:.*]] = alloc() +// CHECK-NEXT: linalg.generic + +// ----- + +#map0 = affine_map<(d0) -> (d0)> + +// Test Case: Alloca operations shouldn't be moved. The alloc operation should +// be moved in the beginning analogous to the nestedRegionsAndCondBranch test. + +// CHECK-LABEL: func @nestedRegionsAndCondBranchAlloca +func @nestedRegionsAndCondBranchAlloca( + %arg0: i1, + %arg1: memref<2xf32>, + %arg2: memref<2xf32>) { + cond_br %arg0, ^bb1, ^bb2 +^bb1: + br ^bb3(%arg1 : memref<2xf32>) +^bb2: + %0 = alloc() : memref<2xf32> + linalg.generic { + indexing_maps = [#map0, #map0], + iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %1 = alloca() : memref<2xf32> + linalg.generic { + indexing_maps = [#map0, #map0], + iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%1: memref<2xf32>) { + ^bb0(%gen2_arg0: f32, %gen2_arg1: f32): + %tmp2 = exp %gen2_arg0 : f32 + linalg.yield %tmp2 : f32 + } + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + br ^bb3(%0 : memref<2xf32>) +^bb3(%1: memref<2xf32>): + "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + return +} +// CHECK-NEXT: %[[ALLOC:.*]] = alloc() +// CHECK-NEXT: cond_br +// CHECK: linalg.generic +// CHECK: %[[ALLOCA:.*]] = alloca() +// CHECK-NEXT: linalg.generic + +// ----- + +// Test Case: structured control-flow loop using a nested alloc. +// The alloc positions of %3 will be moved upwards. 
+ +// CHECK-LABEL: func @loop_alloc +func @loop_alloc( + %lb: index, + %ub: index, + %step: index, + %buf: memref<2xf32>, + %res: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + %1 = scf.for %i = %lb to %ub step %step + iter_args(%iterBuf = %buf) -> memref<2xf32> { + %2 = cmpi "eq", %i, %ub : index + %3 = alloc() : memref<2xf32> + scf.yield %3 : memref<2xf32> + } + "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK: %[[ALLOC0:.*]] = alloc() +// CHECK-NEXT: {{.*}} scf.for +// CHECK: %[[ALLOC1:.*]] = alloc() + +// ----- + +// Test Case: structured control-flow loop with a nested if operation using +// a deeply nested buffer allocation. +// The allocation %4 is not moved upwards. + +// CHECK-LABEL: func @loop_nested_if_alloc +func @loop_nested_if_alloc( + %lb: index, + %ub: index, + %step: index, + %buf: memref<2xf32>) -> memref<2xf32> { + %0 = alloc() : memref<2xf32> + %1 = scf.for %i = %lb to %ub step %step + iter_args(%iterBuf = %buf) -> memref<2xf32> { + %2 = cmpi "eq", %i, %ub : index + %3 = scf.if %2 -> (memref<2xf32>) { + %4 = alloc() : memref<2xf32> + scf.yield %4 : memref<2xf32> + } else { + scf.yield %0 : memref<2xf32> + } + scf.yield %3 : memref<2xf32> + } + return %1 : memref<2xf32> +} + +// CHECK: %[[ALLOC0:.*]] = alloc() +// CHECK-NEXT: {{.*}} scf.for +// CHECK: %[[ALLOC1:.*]] = alloc() + +// ----- + +// Test Case: several nested structured control-flow loops with a deeply nested +// buffer allocation inside an if operation. +// Same behavior is an loop_nested_if_alloc: The allocs are not moved upwards. + +// CHECK-LABEL: func @loop_nested_alloc +func @loop_nested_alloc( + %lb: index, + %ub: index, + %step: index, + %buf: memref<2xf32>, + %res: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + %1 = scf.for %i = %lb to %ub step %step + iter_args(%iterBuf = %buf) -> memref<2xf32> { + %2 = scf.for %i2 = %lb to %ub step %step + iter_args(%iterBuf2 = %iterBuf) -> memref<2xf32> { + %3 = scf.for %i3 = %lb to %ub step %step + iter_args(%iterBuf3 = %iterBuf2) -> memref<2xf32> { + %4 = alloc() : memref<2xf32> + %5 = cmpi "eq", %i, %ub : index + %6 = scf.if %5 -> (memref<2xf32>) { + %7 = alloc() : memref<2xf32> + scf.yield %7 : memref<2xf32> + } else { + scf.yield %iterBuf3 : memref<2xf32> + } + scf.yield %6 : memref<2xf32> + } + scf.yield %3 : memref<2xf32> + } + scf.yield %2 : memref<2xf32> + } + "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK: %[[ALLOC0:.*]] = alloc() +// CHECK-NEXT: {{.*}} = scf.for +// CHECK-NEXT: {{.*}} = scf.for +// CHECK-NEXT: {{.*}} = scf.for +// CHECK-NEXT: %[[ALLOC1:.*]] = alloc() +// CHECK: %[[ALLOC2:.*]] = alloc() + +// ----- + +// CHECK-LABEL: func @loop_nested_alloc_dyn_dependency +func @loop_nested_alloc_dyn_dependency( + %lb: index, + %ub: index, + %step: index, + %arg0: index, + %buf: memref, + %res: memref) { + %0 = alloc(%arg0) : memref + %1 = scf.for %i = %lb to %ub step %step + iter_args(%iterBuf = %buf) -> memref { + %2 = scf.for %i2 = %lb to %ub step %step + iter_args(%iterBuf2 = %iterBuf) -> memref { + %3 = scf.for %i3 = %lb to %ub step %step + iter_args(%iterBuf3 = %iterBuf2) -> memref { + %5 = cmpi "eq", %i, %ub : index + %6 = scf.if %5 -> (memref) { + %7 = alloc(%i3) : memref + scf.yield %7 : memref + } else { + scf.yield %iterBuf3 : memref + } + scf.yield %6 : memref + } + scf.yield %3 : memref + } + scf.yield %0 : memref + } + "linalg.copy"(%1, %res) : (memref, memref) -> () + return +} + + +// CHECK: %[[ALLOC0:.*]] = alloc({{.*}}) +// CHECK-NEXT: {{.*}} = scf.for +// 
CHECK-NEXT: {{.*}} = scf.for +// CHECK-NEXT: {{.*}} = scf.for +// CHECK: %[[ALLOC1:.*]] = alloc({{.*}}) diff --git a/mlir/test/Transforms/buffer-loop-hoisting.mlir b/mlir/test/Transforms/buffer-loop-hoisting.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Transforms/buffer-loop-hoisting.mlir @@ -0,0 +1,490 @@ +// RUN: mlir-opt -buffer-loop-hoisting -split-input-file %s | FileCheck %s + +// This file checks the behavior of BufferLoopHoisting pass for moving Alloc +// operations in their correct positions. + +// Test Case: +// bb0 +// / \ +// bb1 bb2 <- Initial position of AllocOp +// \ / +// bb3 +// BufferLoopHoisting expected behavior: It should not move the AllocOp. + +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @condBranch +func @condBranch(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { + cond_br %arg0, ^bb1, ^bb2 +^bb1: + br ^bb3(%arg1 : memref<2xf32>) +^bb2: + %0 = alloc() : memref<2xf32> + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + br ^bb3(%0 : memref<2xf32>) +^bb3(%1: memref<2xf32>): + "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK-NEXT: cond_br +// CHECK: %[[ALLOC:.*]] = alloc() + +// ----- + +// Test Case: +// bb0 +// / \ +// bb1 bb2 <- Initial position of AllocOp +// \ / +// bb3 +// BufferLoopHoisting expected behavior: It should not move the existing AllocOp +// to any other block since the alloc has a dynamic dependency to block argument +// %0 in bb2. + +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @condBranchDynamicType +func @condBranchDynamicType( + %arg0: i1, + %arg1: memref, + %arg2: memref, + %arg3: index) { + cond_br %arg0, ^bb1, ^bb2(%arg3: index) +^bb1: + br ^bb3(%arg1 : memref) +^bb2(%0: index): + %1 = alloc(%0) : memref + linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} + ins(%arg1: memref) + outs(%1: memref) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + br ^bb3(%1 : memref) +^bb3(%2: memref): + "linalg.copy"(%2, %arg2) : (memref, memref) -> () + return +} + +// CHECK-NEXT: cond_br +// CHECK: ^bb2 +// CHECK: ^bb2(%[[IDX:.*]]:{{.*}}) +// CHECK-NEXT: %[[ALLOC0:.*]] = alloc(%[[IDX]]) +// CHECK-NEXT: linalg.generic + +// ----- + +// Test Case: Nested regions - This test defines a GenericOp inside the region +// of another GenericOp. +// BufferLoopHoisting expected behavior: The AllocOp of inner GenericOp should +// remain inside the region of outer GenericOp. The AllocOp of the outer +// GenericOp should not be moved during this pass. 
+ +#map0 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @nested_regions_and_cond_branch +func @nested_regions_and_cond_branch( + %arg0: i1, + %arg1: memref<2xf32>, + %arg2: memref<2xf32>) { + cond_br %arg0, ^bb1, ^bb2 +^bb1: + br ^bb3(%arg1 : memref<2xf32>) +^bb2: + %0 = alloc() : memref<2xf32> + linalg.generic { + indexing_maps = [#map0, #map0], + iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%0: memref<2xf32>) { + ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): + %1 = alloc() : memref<2xf32> + linalg.generic { + indexing_maps = [#map0, #map0], + iterator_types = ["parallel"]} + ins(%arg1: memref<2xf32>) + outs(%1: memref<2xf32>) { + ^bb0(%gen2_arg0: f32, %gen2_arg1: f32): + %tmp2 = exp %gen2_arg0 : f32 + linalg.yield %tmp2 : f32 + } + %tmp1 = exp %gen1_arg0 : f32 + linalg.yield %tmp1 : f32 + } + br ^bb3(%0 : memref<2xf32>) +^bb3(%1: memref<2xf32>): + "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + return +} +// CHECK-NEXT: cond_br +// CHECK: %[[ALLOC0:.*]] = alloc() +// CHECK: linalg.generic +// CHECK: %[[ALLOC1:.*]] = alloc() +// CHECK-NEXT: linalg.generic + +// ----- + +// Test Case: nested region control flow +// The alloc position of %1 does not need to be changed and flows through +// both if branches until it is finally returned. + +// CHECK-LABEL: func @nested_region_control_flow +func @nested_region_control_flow( + %arg0 : index, + %arg1 : index) -> memref { + %0 = cmpi "eq", %arg0, %arg1 : index + %1 = alloc(%arg0, %arg0) : memref + %2 = scf.if %0 -> (memref) { + scf.yield %1 : memref + } else { + %3 = alloc(%arg0, %arg1) : memref + scf.yield %1 : memref + } + return %2 : memref +} + +// CHECK: %[[ALLOC0:.*]] = alloc(%arg0, %arg0) +// CHECK-NEXT: %{{.*}} = scf.if +// CHECK: else +// CHECK-NEXT: %[[ALLOC1:.*]] = alloc(%arg0, %arg1) + +// ----- + +// Test Case: structured control-flow loop using a nested alloc. +// The alloc positions of %3 should not be changed. + +// CHECK-LABEL: func @loop_alloc +func @loop_alloc( + %lb: index, + %ub: index, + %step: index, + %buf: memref<2xf32>, + %res: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + %1 = scf.for %i = %lb to %ub step %step + iter_args(%iterBuf = %buf) -> memref<2xf32> { + %2 = cmpi "eq", %i, %ub : index + %3 = alloc() : memref<2xf32> + scf.yield %3 : memref<2xf32> + } + "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK: %[[ALLOC0:.*]] = alloc() +// CHECK-NEXT: {{.*}} scf.for +// CHECK: %[[ALLOC1:.*]] = alloc() + +// ----- + +// Test Case: structured control-flow loop with a nested if operation using +// a deeply nested buffer allocation. +// The allocation %4 should not be moved upwards due to a back-edge dependency. + +// CHECK-LABEL: func @loop_nested_if_alloc +func @loop_nested_if_alloc( + %lb: index, + %ub: index, + %step: index, + %buf: memref<2xf32>) -> memref<2xf32> { + %0 = alloc() : memref<2xf32> + %1 = scf.for %i = %lb to %ub step %step + iter_args(%iterBuf = %buf) -> memref<2xf32> { + %2 = cmpi "eq", %i, %ub : index + %3 = scf.if %2 -> (memref<2xf32>) { + %4 = alloc() : memref<2xf32> + scf.yield %4 : memref<2xf32> + } else { + scf.yield %0 : memref<2xf32> + } + scf.yield %3 : memref<2xf32> + } + return %1 : memref<2xf32> +} + +// CHECK: %[[ALLOC0:.*]] = alloc() +// CHECK-NEXT: {{.*}} scf.for +// CHECK: %[[ALLOC1:.*]] = alloc() + +// ----- + +// Test Case: several nested structured control-flow loops with deeply nested +// buffer allocations inside an if operation. 
+// Behavior: The allocs %0, %4 and %9 are moved upwards, while %7 and %8 stay +// in their positions. + +// CHECK-LABEL: func @loop_nested_alloc +func @loop_nested_alloc( + %lb: index, + %ub: index, + %step: index, + %buf: memref<2xf32>, + %res: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + %1 = scf.for %i = %lb to %ub step %step + iter_args(%iterBuf = %buf) -> memref<2xf32> { + %2 = scf.for %i2 = %lb to %ub step %step + iter_args(%iterBuf2 = %iterBuf) -> memref<2xf32> { + %3 = scf.for %i3 = %lb to %ub step %step + iter_args(%iterBuf3 = %iterBuf2) -> memref<2xf32> { + %4 = alloc() : memref<2xf32> + %5 = cmpi "eq", %i, %ub : index + %6 = scf.if %5 -> (memref<2xf32>) { + %7 = alloc() : memref<2xf32> + %8 = alloc() : memref<2xf32> + scf.yield %8 : memref<2xf32> + } else { + scf.yield %iterBuf3 : memref<2xf32> + } + %9 = alloc() : memref<2xf32> + scf.yield %6 : memref<2xf32> + } + scf.yield %3 : memref<2xf32> + } + scf.yield %2 : memref<2xf32> + } + "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK: %[[ALLOC0:.*]] = alloc() +// CHECK-NEXT: %[[ALLOC1:.*]] = alloc() +// CHECK-NEXT: %[[ALLOC2:.*]] = alloc() +// CHECK-NEXT: {{.*}} = scf.for +// CHECK-NEXT: {{.*}} = scf.for +// CHECK-NEXT: {{.*}} = scf.for +// CHECK: {{.*}} = scf.if +// CHECK: %[[ALLOC3:.*]] = alloc() +// CHECK: %[[ALLOC4:.*]] = alloc() + +// ----- + +// CHECK-LABEL: func @loop_nested_alloc_dyn_dependency +func @loop_nested_alloc_dyn_dependency( + %lb: index, + %ub: index, + %step: index, + %arg0: index, + %buf: memref, + %res: memref) { + %0 = alloc(%arg0) : memref + %1 = scf.for %i = %lb to %ub step %step + iter_args(%iterBuf = %buf) -> memref { + %2 = scf.for %i2 = %lb to %ub step %step + iter_args(%iterBuf2 = %iterBuf) -> memref { + %3 = scf.for %i3 = %lb to %ub step %step + iter_args(%iterBuf3 = %iterBuf2) -> memref { + %4 = alloc(%i3) : memref + %5 = cmpi "eq", %i, %ub : index + %6 = scf.if %5 -> (memref) { + %7 = alloc(%i3) : memref + scf.yield %7 : memref + } else { + scf.yield %iterBuf3 : memref + } + %8 = alloc(%i3) : memref + scf.yield %6 : memref + } + scf.yield %3 : memref + } + scf.yield %0 : memref + } + "linalg.copy"(%1, %res) : (memref, memref) -> () + return +} + +// CHECK: %[[ALLOC0:.*]] = alloc({{.*}}) +// CHECK-NEXT: {{.*}} = scf.for +// CHECK-NEXT: {{.*}} = scf.for +// CHECK-NEXT: {{.*}} = scf.for +// CHECK: %[[ALLOC1:.*]] = alloc({{.*}}) +// CHECK: %[[ALLOC2:.*]] = alloc({{.*}}) + +// ----- + +// CHECK-LABEL: func @hoist_one_loop +func @hoist_one_loop( + %lb: index, + %ub: index, + %step: index, + %buf: memref<2xf32>, + %res: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + %1 = scf.for %i = %lb to %ub step %step + iter_args(%iterBuf = %buf) -> memref<2xf32> { + %2 = alloc() : memref<2xf32> + scf.yield %0 : memref<2xf32> + } + "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK: %[[ALLOC0:.*]] = alloc({{.*}}) +// CHECK-NEXT: %[[ALLOC1:.*]] = alloc({{.*}}) +// CHECK-NEXT: {{.*}} = scf.for + +// ----- + +// CHECK-LABEL: func @no_hoist_one_loop +func @no_hoist_one_loop( + %lb: index, + %ub: index, + %step: index, + %buf: memref<2xf32>, + %res: memref<2xf32>) { + %0 = scf.for %i = %lb to %ub step %step + iter_args(%iterBuf = %buf) -> memref<2xf32> { + %1 = alloc() : memref<2xf32> + scf.yield %1 : memref<2xf32> + } + "linalg.copy"(%0, %res) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK: {{.*}} = scf.for +// CHECK-NEXT: %[[ALLOC0:.*]] = alloc({{.*}}) + +// ----- + +// CHECK-LABEL: func @hoist_multiple_loop +func 
@hoist_multiple_loop( + %lb: index, + %ub: index, + %step: index, + %buf: memref<2xf32>, + %res: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + %1 = scf.for %i = %lb to %ub step %step + iter_args(%iterBuf = %buf) -> memref<2xf32> { + %2 = scf.for %i2 = %lb to %ub step %step + iter_args(%iterBuf2 = %iterBuf) -> memref<2xf32> { + %3 = alloc() : memref<2xf32> + scf.yield %0 : memref<2xf32> + } + scf.yield %0 : memref<2xf32> + } + "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK: %[[ALLOC0:.*]] = alloc({{.*}}) +// CHECK-NEXT: %[[ALLOC1:.*]] = alloc({{.*}}) +// CHECK-NEXT: {{.*}} = scf.for + +// ----- + +// CHECK-LABEL: func @no_hoist_one_loop_conditional +func @no_hoist_one_loop_conditional( + %lb: index, + %ub: index, + %step: index, + %buf: memref<2xf32>, + %res: memref<2xf32>) { + %0 = scf.for %i = %lb to %ub step %step + iter_args(%iterBuf = %buf) -> memref<2xf32> { + %1 = cmpi "eq", %i, %ub : index + %2 = scf.if %1 -> (memref<2xf32>) { + %3 = alloc() : memref<2xf32> + scf.yield %3 : memref<2xf32> + } else { + scf.yield %iterBuf : memref<2xf32> + } + scf.yield %2 : memref<2xf32> + } + "linalg.copy"(%0, %res) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK: {{.*}} = scf.for +// CHECK: {{.*}} = scf.if +// CHECK-NEXT: %[[ALLOC0:.*]] = alloc({{.*}}) + +// ----- + +// CHECK-LABEL: func @hoist_one_loop_conditional +func @hoist_one_loop_conditional( + %lb: index, + %ub: index, + %step: index, + %buf: memref<2xf32>, + %res: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + %1 = cmpi "eq", %lb, %ub : index + %2 = scf.if %1 -> (memref<2xf32>) { + %3 = scf.for %i = %lb to %ub step %step + iter_args(%iterBuf = %buf) -> memref<2xf32> { + %4 = alloc() : memref<2xf32> + scf.yield %0 : memref<2xf32> + } + scf.yield %0 : memref<2xf32> + } + else + { + scf.yield %0 : memref<2xf32> + } + "linalg.copy"(%2, %res) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK: {{.*}} = scf.if +// CHECK-NEXT: %[[ALLOC0:.*]] = alloc({{.*}}) +// CHECK: {{.*}} = scf.for + +// ----- + +// CHECK-LABEL: func @no_hoist_one_loop_dependency +func @no_hoist_one_loop_dependency( + %lb: index, + %ub: index, + %step: index, + %buf: memref<2xf32>, + %res: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + %1 = scf.for %i = %lb to %ub step %step + iter_args(%iterBuf = %buf) -> memref<2xf32> { + %2 = alloc(%i) : memref + scf.yield %0 : memref<2xf32> + } + "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK: %[[ALLOC0:.*]] = alloc({{.*}}) +// CHECK-NEXT: {{.*}} = scf.for +// CHECK-NEXT: %[[ALLOC1:.*]] = alloc({{.*}}) + +// ----- + +// CHECK-LABEL: func @partial_hoist_multiple_loop_dependency +func @partial_hoist_multiple_loop_dependency( + %lb: index, + %ub: index, + %step: index, + %buf: memref<2xf32>, + %res: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + %1 = scf.for %i = %lb to %ub step %step + iter_args(%iterBuf = %buf) -> memref<2xf32> { + %2 = scf.for %i2 = %lb to %ub step %step + iter_args(%iterBuf2 = %iterBuf) -> memref<2xf32> { + %3 = alloc(%i) : memref + scf.yield %0 : memref<2xf32> + } + scf.yield %0 : memref<2xf32> + } + "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK: %[[ALLOC0:.*]] = alloc({{.*}}) +// CHECK-NEXT: {{.*}} = scf.for +// CHECK-NEXT: %[[ALLOC1:.*]] = alloc({{.*}}) +// CHECK-NEXT: {{.*}} = scf.for