diff --git a/mlir/lib/Transforms/BufferPlacement.cpp b/mlir/lib/Transforms/BufferPlacement.cpp
--- a/mlir/lib/Transforms/BufferPlacement.cpp
+++ b/mlir/lib/Transforms/BufferPlacement.cpp
@@ -47,13 +47,6 @@
 // copy buffers and placing deallocs in safe places to ensure that all buffers
 // will be freed in the end.
 //
-// TODO:
-// The current implementation does not support loops and the resulting code will
-// be invalid with respect to program semantics. The only thing that is
-// currently missing is a high-level loop analysis that allows us to move allocs
-// and deallocs outside of the loop blocks. Furthermore, it doesn't also accept
-// functions which return buffers already.
-//
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Transforms/BufferPlacement.h"
@@ -76,6 +69,22 @@
   }
 }
 
+/// Wrapper for the actual `RegionBranchOpInterface.getSuccessorRegions`
+/// function that initializes the required `operandAttributes` array.
+void getSuccessorRegions(RegionBranchOpInterface regionInterface,
+                         llvm::Optional<unsigned> index,
+                         SmallVectorImpl<RegionSuccessor> &successors) {
+  // Create an empty attribute for each operand to comply with the
+  // `getSuccessorRegions` interface definition that requires a single
+  // attribute per operand.
+  SmallVector<Attribute, 2> operandAttributes(
+      regionInterface.getOperation()->getNumOperands());
+
+  // Get all successor regions using the temporarily allocated
+  // `operandAttributes`.
+  regionInterface.getSuccessorRegions(index, operandAttributes, successors);
+}
+
 namespace {
 //===----------------------------------------------------------------------===//
 // BufferPlacementAliasAnalysis
 //===----------------------------------------------------------------------===//
@@ -165,16 +174,10 @@
     // Query the RegionBranchOpInterface to find potential successor regions.
     op->walk([&](RegionBranchOpInterface regionInterface) {
-      // Create an empty attribute for each operand to comply with the
-      // `getSuccessorRegions` interface definition that requires a single
-      // attribute per operand.
-      SmallVector<Attribute, 2> operandAttributes(
-          regionInterface.getOperation()->getNumOperands());
-
       // Extract all entry regions and wire all initial entry successor inputs.
       SmallVector<RegionSuccessor, 2> entrySuccessors;
-      regionInterface.getSuccessorRegions(/*index=*/llvm::None,
-                                          operandAttributes, entrySuccessors);
+      getSuccessorRegions(regionInterface, /*index=*/llvm::None,
+                          entrySuccessors);
       for (RegionSuccessor &entrySuccessor : entrySuccessors) {
         // Wire the entry region's successor arguments with the initial
         // successor inputs.
@@ -190,8 +193,8 @@
       // Iterate over all successor region entries that are reachable from the
       // current region.
       SmallVector<RegionSuccessor, 2> successorRegions;
-      regionInterface.getSuccessorRegions(
-          region.getRegionNumber(), operandAttributes, successorRegions);
+      getSuccessorRegions(regionInterface, region.getRegionNumber(),
+                          successorRegions);
       for (RegionSuccessor &successorRegion : successorRegions) {
         // Iterate over all immediate terminator operations and wire the
         // successor inputs with the operands of each terminator.
@@ -208,6 +211,118 @@
   ValueMapT aliases;
 };
 
+/// A straight-forward program analysis which detects loop backedges induced by
+/// explicit control flow and `RegionBranchOpInterface` ops.
+class Backedges {
+public:
+  using BlockSetT = SmallPtrSet<Block *, 16>;
+  using BackedgeSetT = llvm::DenseSet<std::pair<Block *, Block *>>;
+
+public:
+  /// Constructs a new backedges analysis using the op provided.
+  Backedges(Operation *op) { recurse(op, op->getBlock()); }
+
+  /// Returns the start iterator to loop over all backedges.
+  BackedgeSetT::const_iterator begin() const { return edgeSet.begin(); }
+
+  /// Returns the end iterator to loop over all backedges.
+  BackedgeSetT::const_iterator end() const { return edgeSet.end(); }
+
+private:
+  /// Enters the current block and inserts a backedge into the `edgeSet` if we
+  /// have already visited the current block. The inserted edge links the given
+  /// `predecessor` with the `current` block.
+  bool enter(Block &current, Block *predecessor) {
+    bool inserted = visited.insert(&current).second;
+    if (!inserted)
+      edgeSet.insert(std::make_pair(predecessor, &current));
+    return inserted;
+  }
+
+  /// Leaves the current block.
+  void exit(Block &current) { visited.erase(&current); }
+
+  /// Recurses into the given operation while taking all attached regions into
+  /// account.
+  void recurse(Operation *op, Block *predecessor) {
+    Block *current = op->getBlock();
+    // If the current op implements the `BranchOpInterface`, there can be
+    // cycles in the scope of all successor blocks.
+    if (isa<BranchOpInterface>(op)) {
+      for (Block *succ : current->getSuccessors())
+        recurse(*succ, current, [=]() {});
+    }
+
+    // If the current op implements the `RegionBranchOpInterface`, there can be
+    // cycles in the scope of all attached regions.
+    if (auto regionInterface = dyn_cast<RegionBranchOpInterface>(op)) {
+      // Extract and recurse into all entry regions that this operation can
+      // directly branch to.
+      SmallVector<RegionSuccessor, 2> entrySuccessors;
+      getSuccessorRegions(regionInterface, /*index=*/llvm::None,
+                          entrySuccessors);
+      for (RegionSuccessor &entrySuccessor : entrySuccessors)
+        recurse(regionInterface, current, entrySuccessor);
+    } else {
+      // If not, recurse into all distinct regions. Note that there cannot be
+      // any control-flow edges between those regions.
+      for (Region &region : op->getRegions())
+        recurse(region.front(), current, [=]() {});
+    }
+  }
+
+  /// Recurses into control-flow structures that rely on the
+  /// `RegionBranchOpInterface` instead of explicit control flow.
+  void recurse(RegionBranchOpInterface regionInterface, Block *predecessor,
+               RegionSuccessor &regionSuccessor) {
+    // If this region successor branches to its parent operation, we can safely
+    // ignore it since it cannot introduce any cycles.
+    Region *successor = regionSuccessor.getSuccessor();
+    if (!successor)
+      return;
+
+    // Start with the entry block of the region and recurse into all nested
+    // region successors.
+    Block &entryBlock = successor->front();
+    recurse(entryBlock, predecessor, [&]() {
+      // Extract all successors for the current region using the
+      // `RegionBranchOpInterface` and recurse into each region successor.
+      SmallVector<RegionSuccessor, 2> successorRegions;
+      getSuccessorRegions(regionInterface, successor->getRegionNumber(),
+                          successorRegions);
+      for (RegionSuccessor &regionSuccessor : successorRegions)
+        recurse(regionInterface, &entryBlock, regionSuccessor);
+    });
+  }
+
+  /// Recurses into explicit control-flow structures that are given by
+  /// the successor relation defined on the block level.
+  template <typename FuncT>
+  void recurse(Block &block, Block *predecessor, const FuncT &recurseOp) {
+    // Try to enter the current block. If this is not possible, we are
+    // currently processing this block and can safely return here.
+    if (!enter(block, predecessor))
+      return;
+
+    // Recurse into all operations and successor blocks.
+    for (auto &op : block.getOperations())
+      recurse(&op, predecessor);
+
+    // Invoke the user-defined recursion operation while the current block is
+    // still active.
+    recurseOp();
+
+    // Leave the current block.
+    exit(block);
+  }
+
+  /// Stores all blocks that are currently visited and on the processing
+  /// stack.
+  BlockSetT visited;
+
+  /// Stores all backedges in the format (source, target).
+  BackedgeSetT edgeSet;
+};
+
 //===----------------------------------------------------------------------===//
 // BufferPlacement
 //===----------------------------------------------------------------------===//
@@ -251,6 +366,8 @@
     findDeallocs();
     // Place deallocations for all allocation entries.
     placeDeallocs();
+    // Place deallocations for all backedges.
+    placeBackedgeDeallocs();
   }
 
 private:
@@ -337,11 +454,10 @@
   }
 
   /// Introduces required allocs and copy operations to avoid memory leaks.
+  /// This function initializes the set of values that require a dedicated
+  /// memory free operation since their operands cannot be safely deallocated
+  /// in a post dominator.
   void introduceCopies() {
-    // Initialize the set of values that require a dedicated memory free
-    // operation since their operands cannot be safely deallocated in a post
-    // dominator.
-    SmallPtrSet<Value, 2> valuesToFree;
     llvm::SmallDenseSet<std::tuple<Value, Block *>> visitedValues;
     SmallVector<std::tuple<Value, Block *>, 8> toProcess;
@@ -479,17 +595,12 @@
                  MutableArrayRef<Region> regions,
                  const TPredicate &regionPredicate,
                  const TOperandProvider &operandProvider) {
-    // Create an empty attribute for each operand to comply with the
-    // `getSuccessorRegions` interface definition that requires a single
-    // attribute per operand.
-    SmallVector<Attribute, 2> operandAttributes(
-        regionInterface.getOperation()->getNumOperands());
     for (Region &region : regions) {
       // Query the regionInterface to get all successor regions of the current
       // one.
       SmallVector<RegionSuccessor, 2> successorRegions;
-      regionInterface.getSuccessorRegions(region.getRegionNumber(),
-                                          operandAttributes, successorRegions);
+      getSuccessorRegions(regionInterface, region.getRegionNumber(),
+                          successorRegions);
       // Try to find a matching region successor.
       RegionSuccessor *regionSuccessor =
           llvm::find_if(successorRegions, regionPredicate);
@@ -623,6 +734,102 @@
     }
   }
 
+  /// Inserts and places deallocation operations for buffers that have to be
+  /// freed before returning to other parts of the program via backedges.
+  void placeBackedgeDeallocs() {
+    // Determine and loop over all backedges starting with the operation this
+    // analysis operates on.
+    Backedges backedges(operation);
+    for (auto &edge : backedges) {
+      Block *source = edge.first;
+      Block *target = edge.second;
+
+      // Get the terminator of the source block and determine whether it uses
+      // a `BranchOpInterface` (works on explicit control flow) or a
+      // `RegionBranchOpInterface`.
+      Operation *terminator = source->getTerminator();
+      if (auto branchInterface = dyn_cast<BranchOpInterface>(terminator)) {
+        // Find the corresponding successor entry and get its operands.
+        auto it = llvm::find(source->getSuccessors(), target);
+        auto successorOperands =
+            branchInterface.getSuccessorOperands(it.getIndex());
+        // Ignore backedges that do not pass any values to their successor
+        // blocks.
+        if (!successorOperands.hasValue())
+          continue;
+        placeDeallocsForBackedgeBranch(terminator, target->getArguments(),
+                                       successorOperands.getValue());
+      } else {
+        // If this terminator does not operate on explicit control flow, it has
+        // to be a `RegionBranchOpInterface` in order to be recognized by the
+        // `Backedges` analysis.
+        assert(terminator->hasTrait<OpTrait::ReturnLike>() &&
+               terminator->getParentRegion() &&
+               "Not supported backedge terminator");
+        auto regionInterface = cast<RegionBranchOpInterface>(
+            terminator->getParentRegion()->getParentOp());
+        Region *sourceRegion = source->getParent();
+        // Get all successor regions of the current source region.
+        SmallVector<RegionSuccessor, 2> successorRegions;
+        getSuccessorRegions(regionInterface, sourceRegion->getRegionNumber(),
+                            successorRegions);
+        // Find the region successor that corresponds to the source region.
+        RegionSuccessor *regionSuccessor =
+            llvm::find_if(successorRegions, [&](RegionSuccessor &succ) {
+              return succ.getSuccessor() == sourceRegion;
+            });
+        assert(regionSuccessor != successorRegions.end() &&
+               "Invalid region successor backedge");
+        placeDeallocsForBackedgeBranch(terminator,
+                                       regionSuccessor->getSuccessorInputs(),
+                                       terminator->getOperands());
+      }
+    }
+  }
+
+  /// Places deallocation operations for all iteration arguments that have to
+  /// be freed. An iteration argument is a `BlockArgument` that is reachable
+  /// via a backedge. The list of operands represents the values that are
+  /// passed to the iteration arguments when following the backedge.
+  template <typename IterationArgumentsT, typename OperandsT>
+  void placeDeallocsForBackedgeBranch(Operation *operation,
+                                      const IterationArgumentsT &iterationArgs,
+                                      const OperandsT &operands) {
+    for (auto it : llvm::zip(iterationArgs, operands)) {
+      // Skip iteration arguments that are not buffers.
+      Value iterationArg = std::get<0>(it);
+      if (!iterationArg.getType().isa<MemRefType>())
+        continue;
+
+      // Extract the operand that is passed via the backedge and check whether
+      // the value has been defined in the scope of this loop. If not, we are
+      // not allowed to place a dealloc here since the buffer will be
+      // automatically freed in its post dominator anyway.
+      Value value = std::get<1>(it);
+      Block *domBlock = iterationArg.getParentBlock();
+      Block *postDomBlock = value.getParentBlock();
+      if (dominators.properlyDominates(postDomBlock, domBlock))
+        continue;
+
+      // Test all aliases of the iteration argument for whether they belong to
+      // the loop and have been freed. If the current operand value is not
+      // contained in this set, we have to place a free operation in all cases.
+      auto iterationValueAliases = aliases.resolve(iterationArg);
+      if (iterationValueAliases.count(value) &&
+          !llvm::any_of(iterationValueAliases, [&](Value alias) {
+            return dominators.dominates(domBlock, alias.getParentBlock()) &&
+                   postDominators.postDominates(postDomBlock,
+                                                alias.getParentBlock()) &&
+                   valuesToFree.count(alias);
+          }))
+        continue;
+
+      // Insert a dealloc for the corresponding iteration argument.
+      OpBuilder builder(operation);
+      builder.create<DeallocOp>(operation->getLoc(), iterationArg);
+    }
+  }
+
   /// Finds a common dominator for the given value while taking the positions
   /// of the values in the value set into account. It supports dominator and
   /// post-dominator analyses via template arguments.
@@ -651,6 +858,9 @@
   /// Maps allocation nodes to their associated blocks.
   AllocEntryList allocs;
 
+  /// The set of all values that require copy and specific free operations.
+  ValueSetT valuesToFree;
+
   /// The underlying liveness analysis to compute fine grained information
   /// about alloc and dealloc positions.
   Liveness liveness;
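
For readers skimming the review, a minimal sketch of how the new `Backedges` analysis is consumed; this is illustrative only and not part of the patch, and `funcOp` is a hypothetical operation handle:

  // Detect all loop backedges reachable from `funcOp`, whether they stem from
  // explicit `br`/`cond_br` control flow or from ops implementing the
  // `RegionBranchOpInterface` (such as `scf.for`).
  Backedges backedges(funcOp);
  for (const std::pair<Block *, Block *> &edge : backedges) {
    Block *source = edge.first;  // block whose terminator closes the cycle
    Block *target = edge.second; // header-like block that is re-entered
    // `placeBackedgeDeallocs` inspects `source`'s terminator and frees
    // buffer-typed iteration arguments of `target` right before the jump.
  }

Storing edges as (source, target) pairs lets `placeBackedgeDeallocs` handle both terminator kinds uniformly: the source block determines where deallocs are inserted, and the target block provides the iteration arguments.
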
diff --git a/mlir/test/Transforms/buffer-placement.mlir b/mlir/test/Transforms/buffer-placement.mlir
--- a/mlir/test/Transforms/buffer-placement.mlir
+++ b/mlir/test/Transforms/buffer-placement.mlir
@@ -1125,3 +1125,312 @@
 // CHECK: %[[ALLOCA:.*]] = alloca(%arg0, %arg1)
 // CHECK-NEXT: scf.yield %[[ALLOC0]]
 // CHECK: return %[[ALLOC1]]
+
+// -----
+
+// Test Case: structured control-flow loop using a nested alloc.
+// The alloc position of %3 will not be changed, but the iteration argument
+// %iterBuf has to be freed before yielding %3 to avoid memory leaks.
+
+// CHECK-LABEL: func @loop_alloc
+func @loop_alloc(
+  %lb: index,
+  %ub: index,
+  %step: index,
+  %buf: memref<2xf32>,
+  %res: memref<2xf32>) {
+  %0 = alloc() : memref<2xf32>
+  %1 = scf.for %i = %lb to %ub step %step
+    iter_args(%iterBuf = %buf) -> memref<2xf32> {
+    %2 = cmpi "eq", %i, %ub : index
+    %3 = alloc() : memref<2xf32>
+    scf.yield %3 : memref<2xf32>
+  }
+  "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+// CHECK: %[[ALLOC0:.*]] = alloc()
+// CHECK-NEXT: dealloc %[[ALLOC0]]
+// CHECK: %[[ALLOC1:.*]] = scf.for {{.*}} iter_args(%[[IALLOC:.*]] =
+// CHECK: %[[ALLOC2:.*]] = alloc()
+// CHECK: %[[ALLOC3:.*]] = alloc()
+// CHECK: linalg.copy(%[[ALLOC2]], %[[ALLOC3]])
+// CHECK: dealloc %[[ALLOC2]]
+// CHECK: dealloc %[[IALLOC]]
+// CHECK: scf.yield %[[ALLOC3]]
+// CHECK: linalg.copy(%[[ALLOC1]], %arg4)
+// CHECK-NEXT: dealloc %[[ALLOC1]]
+
+// -----
+
+// Test Case: structured control-flow loop with a nested if operation.
+// The loop yields buffers that have been defined outside of the loop and the
+// backedges only use the iteration arguments (or one of their aliases).
+// Therefore, we do not have to (and are not allowed to) free any buffers
+// that are passed via the backedges.
+
+// CHECK-LABEL: func @loop_nested_if_no_alloc
+func @loop_nested_if_no_alloc(
+  %lb: index,
+  %ub: index,
+  %step: index,
+  %buf: memref<2xf32>,
+  %res: memref<2xf32>) {
+  %0 = alloc() : memref<2xf32>
+  %1 = scf.for %i = %lb to %ub step %step
+    iter_args(%iterBuf = %buf) -> memref<2xf32> {
+    %2 = cmpi "eq", %i, %ub : index
+    %3 = scf.if %2 -> (memref<2xf32>) {
+      scf.yield %0 : memref<2xf32>
+    } else {
+      scf.yield %iterBuf : memref<2xf32>
+    }
+    scf.yield %3 : memref<2xf32>
+  }
+  "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+// CHECK: %[[ALLOC0:.*]] = alloc()
+// CHECK-NEXT: %[[ALLOC1:.*]] = scf.for {{.*}} iter_args(%[[IALLOC:.*]] =
+// CHECK: %[[ALLOC2:.*]] = scf.if
+// CHECK: scf.yield %[[ALLOC0]]
+// CHECK: scf.yield %[[IALLOC]]
+// CHECK: scf.yield %[[ALLOC2]]
+// CHECK: linalg.copy(%[[ALLOC1]], %arg4)
+// CHECK: dealloc %[[ALLOC0]]
+
+// -----
+
+// Test Case: structured control-flow loop with a nested if operation using
+// a deeply nested buffer allocation.
+// Since the innermost allocation happens in a divergent branch, we have to
+// introduce additional copies for the nested if operation. Since the loop's
+// yield operation "returns" %3, it will return a newly allocated buffer.
+// Therefore, we have to free the iteration argument %iterBuf before
+// "returning" %3.
+
+// CHECK-LABEL: func @loop_nested_if_alloc
+func @loop_nested_if_alloc(
+  %lb: index,
+  %ub: index,
+  %step: index,
+  %buf: memref<2xf32>) -> memref<2xf32> {
+  %0 = alloc() : memref<2xf32>
+  %1 = scf.for %i = %lb to %ub step %step
+    iter_args(%iterBuf = %buf) -> memref<2xf32> {
+    %2 = cmpi "eq", %i, %ub : index
+    %3 = scf.if %2 -> (memref<2xf32>) {
+      %4 = alloc() : memref<2xf32>
+      scf.yield %4 : memref<2xf32>
+    } else {
+      scf.yield %0 : memref<2xf32>
+    }
+    scf.yield %3 : memref<2xf32>
+  }
+  return %1 : memref<2xf32>
+}
+
+// CHECK: %[[ALLOC0:.*]] = alloc()
+// CHECK-NEXT: %[[ALLOC1:.*]] = scf.for {{.*}} iter_args(%[[IALLOC:.*]] =
+// CHECK: %[[ALLOC2:.*]] = scf.if
+
+// CHECK: %[[ALLOC3:.*]] = alloc()
+// CHECK-NEXT: %[[ALLOC4:.*]] = alloc()
+// CHECK-NEXT: linalg.copy(%[[ALLOC3]], %[[ALLOC4]])
+// CHECK-NEXT: dealloc %[[ALLOC3]]
+// CHECK-NEXT: scf.yield %[[ALLOC4]]
+
+// CHECK: %[[ALLOC5:.*]] = alloc()
+// CHECK-NEXT: linalg.copy(%[[ALLOC0]], %[[ALLOC5]])
+// CHECK-NEXT: scf.yield %[[ALLOC5]]
+
+// CHECK: %[[ALLOC6:.*]] = alloc()
+// CHECK-NEXT: linalg.copy(%[[ALLOC2]], %[[ALLOC6]])
+// CHECK-NEXT: dealloc %[[ALLOC2]]
+// CHECK-NEXT: dealloc %[[IALLOC]]
+// CHECK-NEXT: scf.yield %[[ALLOC6]]
+
+// CHECK: dealloc %[[ALLOC0]]
+// CHECK-NEXT: return %[[ALLOC1]]
+
+// -----
+
+// Test Case: several nested structured control-flow loops with a deeply
+// nested buffer allocation inside an if operation.
+// Same behavior as in loop_nested_if_alloc: we have to insert deallocations
+// before each yield in all loops recursively.
+
+// CHECK-LABEL: func @loop_nested_alloc
+func @loop_nested_alloc(
+  %lb: index,
+  %ub: index,
+  %step: index,
+  %buf: memref<2xf32>,
+  %res: memref<2xf32>) {
+  %0 = alloc() : memref<2xf32>
+  %1 = scf.for %i = %lb to %ub step %step
+    iter_args(%iterBuf = %buf) -> memref<2xf32> {
+    %2 = scf.for %i2 = %lb to %ub step %step
+      iter_args(%iterBuf2 = %iterBuf) -> memref<2xf32> {
+      %3 = scf.for %i3 = %lb to %ub step %step
+        iter_args(%iterBuf3 = %iterBuf2) -> memref<2xf32> {
+        %4 = cmpi "eq", %i, %ub : index
+        %5 = scf.if %4 -> (memref<2xf32>) {
+          %6 = alloc() : memref<2xf32>
+          scf.yield %6 : memref<2xf32>
+        } else {
+          scf.yield %iterBuf3 : memref<2xf32>
+        }
+        scf.yield %5 : memref<2xf32>
+      }
+      scf.yield %3 : memref<2xf32>
+    }
+    scf.yield %2 : memref<2xf32>
+  }
+  "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+// CHECK: %[[ALLOC0:.*]] = alloc()
+// CHECK-NEXT: dealloc %[[ALLOC0]]
+// CHECK-NEXT: %[[ALLOC1:.*]] = scf.for {{.*}} iter_args(%[[IALLOC0:.*]] =
+// CHECK: %[[ALLOC2:.*]] = scf.for {{.*}} iter_args(%[[IALLOC1:.*]] =
+// CHECK: %[[ALLOC3:.*]] = scf.for {{.*}} iter_args(%[[IALLOC2:.*]] =
+// CHECK: %[[ALLOC4:.*]] = scf.if
+// CHECK-NEXT: %[[ALLOC5:.*]] = alloc()
+// CHECK-NEXT: %[[ALLOC6:.*]] = alloc()
+// CHECK-NEXT: linalg.copy(%[[ALLOC5]], %[[ALLOC6]])
+// CHECK-NEXT: dealloc %[[ALLOC5]]
+// CHECK-NEXT: scf.yield %[[ALLOC6]]
+
+// CHECK: %[[ALLOC7:.*]] = alloc()
+// CHECK-NEXT: linalg.copy(%[[IALLOC2]], %[[ALLOC7]])
+// CHECK-NEXT: scf.yield %[[ALLOC7]]
+
+// CHECK: %[[ALLOC8:.*]] = alloc()
+// CHECK-NEXT: linalg.copy(%[[ALLOC4]], %[[ALLOC8]])
+// CHECK-NEXT: dealloc %[[ALLOC4]]
+// CHECK-NEXT: dealloc %[[IALLOC2]]
+// CHECK-NEXT: scf.yield %[[ALLOC8]]
+
+// CHECK: %[[ALLOC9:.*]] = alloc()
+// CHECK-NEXT: linalg.copy(%[[ALLOC3]], %[[ALLOC9]])
+// CHECK-NEXT: dealloc %[[ALLOC3]]
+// CHECK-NEXT: dealloc %[[IALLOC1]]
+// CHECK-NEXT: scf.yield %[[ALLOC9]]
+
+// CHECK: %[[ALLOC10:.*]] = alloc()
+// CHECK-NEXT: linalg.copy(%[[ALLOC2]], %[[ALLOC10]])
+// CHECK-NEXT: dealloc %[[ALLOC2]]
+// CHECK-NEXT: dealloc %[[IALLOC0]]
+// CHECK-NEXT: scf.yield %[[ALLOC10]]
+
+// CHECK: linalg.copy(%[[ALLOC1]], %arg4)
+// CHECK-NEXT: dealloc %[[ALLOC1]]
+
+// -----
+
+// Test Case: explicit control-flow loop with a dynamically allocated buffer.
+// Since %alloc1 flows back to the loop header, we have to free the iteration
+// argument %buff before returning to the header.
+
+// CHECK-LABEL: func @loop_dynalloc
+func @loop_dynalloc(
+  %arg0 : i32,
+  %arg1 : i32,
+  %arg2: memref<?xf32>,
+  %arg3: memref<?xf32>) {
+  %const0 = constant 0 : i32
+  br ^loopHeader(%const0, %arg2 : i32, memref<?xf32>)
+
+^loopHeader(%i : i32, %buff : memref<?xf32>):
+  %lessThan = cmpi "slt", %i, %arg1 : i32
+  cond_br %lessThan,
+    ^loopBody(%i, %buff : i32, memref<?xf32>),
+    ^exit(%buff : memref<?xf32>)
+
+^loopBody(%val : i32, %buff2: memref<?xf32>):
+  %const1 = constant 1 : i32
+  %inc = addi %val, %const1 : i32
+  %size = std.index_cast %inc : i32 to index
+  %alloc1 = alloc(%size) : memref<?xf32>
+  br ^loopHeader(%inc, %alloc1 : i32, memref<?xf32>)
+
+^exit(%buff3 : memref<?xf32>):
+  "linalg.copy"(%buff3, %arg3) : (memref<?xf32>, memref<?xf32>) -> ()
+  return
+}
+
+// CHECK: %[[DIM0:.*]] = dim %arg2
+// CHECK-NEXT: %[[ALLOC0:.*]] = alloc(%[[DIM0]])
+// CHECK-NEXT: linalg.copy(%arg2, %[[ALLOC0]])
+// CHECK-NEXT: br ^bb1({{.*}}, %[[ALLOC0]] : {{.*}})
+
+// CHECK: ^bb1({{.*}}, %[[IALLOC0:.*]]: {{.*}}):
+// CHECK: cond_br {{.*}}, ^bb2({{.*}}, %[[IALLOC0]] : {{.*}}),
+// CHECK-SAME: ^bb3(%[[IALLOC0]] : {{.*}})
+
+// CHECK: ^bb2
+// CHECK: %[[DIM1:.*]] = index_cast
+// CHECK-NEXT: %[[ALLOC1:.*]] = alloc(%[[DIM1]])
+// CHECK: %[[DIM2:.*]] = dim %[[ALLOC1]]
+// CHECK-NEXT: %[[ALLOC2:.*]] = alloc(%[[DIM2]])
+// CHECK-NEXT: linalg.copy(%[[ALLOC1]], %[[ALLOC2]])
+// CHECK-NEXT: dealloc %[[ALLOC1]]
+// CHECK-NEXT: dealloc %[[IALLOC0]]
+// CHECK-NEXT: br ^bb1({{.*}}, %[[ALLOC2]] : {{.*}})
+
+// CHECK: ^bb3(%[[ALLOC3:.*]]: {{.*}}):
+// CHECK-NEXT: linalg.copy(%[[ALLOC3]], %arg3)
+// CHECK-NEXT: dealloc %[[IALLOC0]]
+
+// -----
+
+// Test Case: explicit control-flow loop with a dynamically allocated buffer.
+// Since %alloc1 flows to the do-while loop header, we have to free the
+// iteration argument %buff2 before returning to the loop body.
+
+// CHECK-LABEL: func @do_loop_alloc
+func @do_loop_alloc(
+  %arg0 : i32,
+  %arg1 : i32,
+  %arg2: memref<2xf32>,
+  %arg3: memref<2xf32>) {
+  %const0 = constant 0 : i32
+  br ^loopBody(%const0, %arg2 : i32, memref<2xf32>)
+
+^loopBody(%val : i32, %buff2: memref<2xf32>):
+  %const1 = constant 1 : i32
+  %inc = addi %val, %const1 : i32
+  %alloc1 = alloc() : memref<2xf32>
+  br ^loopHeader(%inc, %alloc1 : i32, memref<2xf32>)
+
+^loopHeader(%i : i32, %buff : memref<2xf32>):
+  %lessThan = cmpi "slt", %i, %arg1 : i32
+  cond_br %lessThan,
+    ^loopBody(%i, %buff : i32, memref<2xf32>),
+    ^exit(%buff : memref<2xf32>)
+
+^exit(%buff3 : memref<2xf32>):
+  "linalg.copy"(%buff3, %arg3) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+// CHECK: br ^bb1({{.*}}, %arg2
+
+// CHECK: ^bb1({{.*}}, %[[IALLOC0:.*]]: {{.*}}):
+// CHECK: %[[ALLOC0:.*]] = alloc()
+// CHECK-NEXT: br ^bb2({{.*}}, %[[ALLOC0]] :
+
+// CHECK: ^bb2({{.*}}, %[[IALLOC1:.*]]: {{.*}}):
+// CHECK: dealloc %[[IALLOC0]]
+// CHECK-NEXT: cond_br {{.*}}, ^bb1({{.*}}, %[[IALLOC1]] : {{.*}}),
+// CHECK-SAME: ^bb3(%[[IALLOC1]] :
+
+// CHECK: ^bb3(%[[ALLOC2:.*]]: {{.*}}):
+// CHECK-NEXT: linalg.copy(%[[ALLOC2]], %arg3)
+// CHECK-NEXT: dealloc %[[ALLOC0]]
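
The heart of the `Backedges` analysis above is the enter/exit discipline: a block sits in `visited` exactly while it is on the DFS processing stack, so any edge into such a block closes a cycle. The same idea, reduced to a self-contained sketch over a plain integer digraph (a stand-in for MLIR blocks, not code from this patch):

  #include <cstdio>
  #include <set>
  #include <utility>
  #include <vector>

  using Edge = std::pair<int, int>; // (source, target), as in `edgeSet`

  // Mirrors `enter`/`exit`: a node is inserted into `onStack` when the DFS
  // reaches it and erased when the DFS unwinds; reaching a node that is
  // already on the stack records a backedge from its predecessor.
  static void dfs(int node, int pred,
                  const std::vector<std::vector<int>> &succs,
                  std::set<int> &onStack, std::set<Edge> &backedges) {
    if (!onStack.insert(node).second) {
      backedges.insert({pred, node}); // `enter` failed: cycle detected
      return;
    }
    for (int succ : succs[node])
      dfs(succ, node, succs, onStack, backedges);
    onStack.erase(node); // `exit`: other DFS paths may enter again
  }

  int main() {
    // 0 -> 1 -> 2 -> {1, 3}: node 1 acts as a loop header, and 2 -> 1 is
    // the backedge.
    std::vector<std::vector<int>> succs = {{1}, {2}, {1, 3}, {}};
    std::set<int> onStack;
    std::set<Edge> backedges;
    dfs(/*node=*/0, /*pred=*/-1, succs, onStack, backedges);
    for (const Edge &e : backedges)
      std::printf("backedge: %d -> %d\n", e.first, e.second);
    return 0;
  }

This prints `backedge: 2 -> 1`. The patch differs only in how edges are discovered: successors come either from `Block::getSuccessors()` or from the `RegionBranchOpInterface` region successors, which is why `Backedges::recurse` is overloaded for both.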