diff --git a/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferize.cpp b/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferize.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferize.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferize.cpp
@@ -139,6 +139,9 @@
 #define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")
 #define LDBG(X) LLVM_DEBUG(DBGS() << X)
 
+// TODO: from some HW description.
+static constexpr int64_t kBufferAlignments = 128;
+
 // Forward declarations.
 static std::string printOperationInfo(Operation *, bool prefix = true);
 static std::string printValueInfo(Value, bool prefix = true);
@@ -1412,6 +1415,21 @@
 // Bufferization-specific scoped alloc/dealloc insertion support.
 //===----------------------------------------------------------------------===//
 
+template <typename... ParentTypes>
+Operation *getFirstParentOfType(Value v) {
+  Operation *parent;
+  if (auto bbArg = v.dyn_cast<BlockArgument>())
+    parent = bbArg.getOwner()->getParentOp();
+  else
+    parent = v.getDefiningOp()->getParentOp();
+  while (parent) {
+    if (isa<ParentTypes...>(parent))
+      return parent;
+    parent = parent->getParentOp();
+  }
+  return nullptr;
+}
+
 /// Create an Allocop/DeAllocOp pair, where the AllocOp is after
 /// `shapedValue.getDefiningOp` (or at the top of the block in case of a
 /// bbArg) and the DeallocOp is at the end of the block.
@@ -1446,8 +1464,27 @@
     if (dim.value() == ShapedType::kDynamicSize)
       dynShape.push_back(createOrFoldDimOp(b, loc, shapedValue, dim.index()));
 
-  Value allocated = b.create<memref::AllocOp>(loc, allocMemRefType, dynShape);
-  aliasInfo.createAliasInfoEntry(allocated);
+  // If the buffer is statically shaped, try to hoist it to the first enclosing
+  // parallel region.
+  // TODO: this concept of parallel region and threadlocal needs interfaces.
+  // TODO: also hoist in the dynamic case. For now this relies on subsequent
+  // calls to LICM and buffer hoisting which will most likely not succeed.
+  // TODO: when packing, allocate a static bounding box which will enable more
+  // hoisting.
+  Value allocated;
+  { // Guarded insertion point to potentially hoist the AllocOp.
+    OpBuilder::InsertionGuard g(b);
+    if (dynShape.empty()) {
+      Operation *parent =
+          getFirstParentOfType<FuncOp, TiledLoopOp>(shapedValue);
+      if (parent)
+        b.setInsertionPointToStart(&(parent->getRegion(0).front()));
+    }
+    allocated = b.create<memref::AllocOp>(
+        loc, allocMemRefType, dynShape, b.getI64IntegerAttr(kBufferAlignments));
+    aliasInfo.createAliasInfoEntry(allocated);
+  }
   Value casted = allocated;
   if (memRefType != allocMemRefType) {
     casted = b.create<memref::CastOp>(loc, memRefType, allocated);
@@ -1476,6 +1513,7 @@
                             BufferizationAliasInfo &aliasInfo) {
   // Take a guard before anything else.
   OpBuilder::InsertionGuard g(b);
+  b.setInsertionPointAfter(op);
 
   // TODO: provide the proper interface to iterate on OpResults and get the
   // matching OpOperands.
@@ -1498,7 +1536,6 @@
     Value dimTensor = bvm.lookupOrDefault(output);
     Value alloc =
         createNewAllocDeallocPairForShapedValue(b, loc, dimTensor, aliasInfo);
-    b.setInsertionPointAfter(alloc.getDefiningOp());
     resultBuffers.push_back(alloc);
 
     // Additionally, if the output buffer is used, clone its value for now.
@@ -1785,8 +1822,12 @@
     if (getInPlace(opResult) != InPlaceSpec::True) {
       resultBuffer =
          createNewAllocDeallocPairForShapedValue(b, loc, operand, aliasInfo);
-      // If the tensor comes from `linalg::InitTensorOp`, the value is
-      // unitialized and we do not need to copy.
+      // If the tensor comes from either:
+      //   - linalg.init_tensor
+      //   - tensor.cast(linalg.init_tensor())
+      // Then the value is uninitialized and we do not need to copy.
+      // This is a pragmatic simplification of "matching bbArg does not
+      // bufferize to a read".
       // TODO: "matching bbArg does not bufferize to a read" is a more general
       // check.
       if (!isInitTensorOp(operand))
@@ -1870,6 +1911,10 @@
 static LogicalResult bufferize(OpBuilder &b, TiledLoopOp tiledLoopOp,
                                BlockAndValueMapping &bvm,
                                BufferizationAliasInfo &aliasInfo) {
+  // Take a guard before anything else.
+  OpBuilder::InsertionGuard g(b);
+  b.setInsertionPoint(tiledLoopOp);
+
   // Allocate output buffers if needed, forward output tensor args to the
   // terminator.
   Operation *yieldOp = tiledLoopOp.getBody()->getTerminator();
@@ -1912,8 +1957,12 @@
       auto loc = tiledLoopOp.getLoc();
       Value alloc = createNewAllocDeallocPairForShapedValue(
           b, loc, oldOutputTensor, aliasInfo);
-      // If the tensor comes from `linalg::InitTensorOp`, the value is
-      // unitialized and we do not need to copy.
+      // If the tensor comes from either:
+      //   - linalg.init_tensor
+      //   - tensor.cast(linalg.init_tensor())
+      // Then the value is uninitialized and we do not need to copy. This is a
+      // pragmatic simplification of "matching bbArg does not bufferize to a
+      // read".
       // TODO: "matching bbArg does not bufferize to a read" is a more general
       // check.
       if (!isInitTensorOp(oldOutputTensor)) {
@@ -2021,11 +2070,9 @@
   // If not inplaceable, alloc.
   Value alloc;
   auto inPlace = getInPlace(extractSliceOp->getResult(0));
-  if (inPlace != InPlaceSpec::True) {
+  if (inPlace != InPlaceSpec::True)
     alloc = createNewAllocDeallocPairForShapedValue(
         b, loc, extractSliceOp.result(), aliasInfo);
-    b.setInsertionPointAfter(alloc.getDefiningOp());
-  }
 
   // Bufferize to subview.
   auto subviewMemRefType =
@@ -2070,9 +2117,10 @@
       // cloning the whole tensor on every single iteration and is a symptom
       // of a catastrophically bad scheduling decision.
       // TODO: be very loud about it or even consider failing the pass.
+      // Alloc a copy for `insertSliceOp.dest()`; it will become the result
+      // buffer.
       Value newDstMemref = createNewAllocDeallocPairForShapedValue(
-          b, loc, insertSliceOp.result(), aliasInfo);
-      b.setInsertionPointAfter(newDstMemref.getDefiningOp());
+          b, loc, insertSliceOp.dest(), aliasInfo);
       b.create<linalg::CopyOp>(insertSliceOp.getLoc(), dstMemref, newDstMemref);
       dstMemref = newDstMemref;
     }
@@ -2138,10 +2186,11 @@
     // If transfer_write is not inPlace, allocate a new buffer.
     Value newInputBuffer;
    if (inPlace != InPlaceSpec::True) {
+      // Alloc a copy for `writeOp.source()`; it will become the result buffer.
       newInputBuffer = createNewAllocDeallocPairForShapedValue(
-          b, loc, writeOp.result(), aliasInfo);
-      b.setInsertionPointAfter(newInputBuffer.getDefiningOp());
-      map(bvm, writeOp.result(), newInputBuffer);
+          b, loc, writeOp.source(), aliasInfo);
+      Value v = lookup(bvm, writeOp.source());
+      b.create<linalg::CopyOp>(loc, v, newInputBuffer);
     } else {
       // InPlace write will result in memref.tensor_load(x) which must
       // canonicalize away with one of it uses.
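The net effect of the C++ changes above is easier to see on a tiny example. The sketch below is illustrative only and is not part of the patch: the function name and SSA names are made up, but the shape of the output matches the updated CHECK lines in the tests that follow (a statically shaped allocation created by out-of-place bufferization is hoisted to the entry block of the enclosing function or tiled loop, and every memref.alloc now carries the kBufferAlignments = 128 alignment attribute).

  // Hypothetical post-bufferization IR, assuming an 8x16 temporary is needed
  // inside the loop: the alloc is emitted once at function entry instead of
  // next to its use, and is annotated with the new alignment.
  func @hoisting_sketch() {
    %c0 = constant 0 : index
    %c8 = constant 8 : index
    %c128 = constant 128 : index
    %buffer = memref.alloc() {alignment = 128 : i64} : memref<8x16xf32>
    scf.for %i = %c0 to %c128 step %c8 {
      // ... loop body reads/writes %buffer through subviews ...
    }
    memref.dealloc %buffer : memref<8x16xf32>
    return
  }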
diff --git a/mlir/test/Dialect/Linalg/comprehensive-foo.mlir b/mlir/test/Dialect/Linalg/comprehensive-foo.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/comprehensive-foo.mlir
@@ -0,0 +1,67 @@
+// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize -debug -verify-each=1
+
+func @matmul(
+    %A: tensor<128x256xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %B: tensor<256x192xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %C: tensor<128x192xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true})
+  -> tensor<128x192xf32> {
+  %c0 = constant 0 : index
+  %c256 = constant 256 : index
+  %c32 = constant 32 : index
+  %cst = constant 0.000000e+00 : f32
+  %c128 = constant 128 : index
+  %c192 = constant 192 : index
+  %c8 = constant 8 : index
+  %c16 = constant 16 : index
+
+  // CHECK: scf.for %[[I:.*]] =
+  %0 = scf.for %arg3 = %c0 to %c128 step %c8 iter_args(%arg4 = %C) -> (tensor<128x192xf32>) {
+    %1 = tensor.extract_slice %A[%arg3, 0] [8, 256] [1, 1] :
+      tensor<128x256xf32> to tensor<8x256xf32>
+
+    // CHECK: scf.for %[[J:.*]] =
+    %2 = scf.for %arg5 = %c0 to %c192 step %c16 iter_args(%arg6 = %arg4) -> (tensor<128x192xf32>) {
+      %3 = tensor.extract_slice %B[0, %arg5] [256, 16] [1, 1] :
+        tensor<256x192xf32> to tensor<256x16xf32>
+
+      // %4 does not match an insert_slice, it cannot be bufferized inplace and needs to alloc.
+      // CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<8x16xf32>
+      // CHECK: %[[T:.*]] = memref.subview %[[C]][%[[I]], %[[J]]] [8, 16] [1, 1]
+      // TODO: %4 is never read but just overwritten, this copy can be elided.
+      // CHECK: linalg.copy(%[[T]], %[[ALLOC]])
+      %4 = tensor.extract_slice %C[%arg3, %arg5] [8, 16] [1, 1] :
+        tensor<128x192xf32> to tensor<8x16xf32>
+
+      // linalg.fill is inplace.
+      // CHECK: linalg.fill(%{{.*}}, %[[ALLOC]]) : f32, memref<8x16xf32>
+      %5 = linalg.fill(%cst, %4) : f32, tensor<8x16xf32> -> tensor<8x16xf32>
+
+      // CHECK: scf.for %[[K:.*]] =
+      %6 = scf.for %arg7 = %c0 to %c256 step %c32 iter_args(%arg8 = %5) -> (tensor<8x16xf32>) {
+        %8 = tensor.extract_slice %1[0, %arg7] [8, 32] [1, 1] :
+          tensor<8x256xf32> to tensor<8x32xf32>
+        %9 = tensor.extract_slice %3[%arg7, 0] [32, 16] [1, 1] :
+          tensor<256x16xf32> to tensor<32x16xf32>
+
+        // linalg.matmul is inplace as well as the enclosing scf.for.
+        // CHECK: linalg.matmul ins({{.*}} outs(%[[ALLOC]]
+        %10 = linalg.matmul ins(%8, %9 : tensor<8x32xf32>, tensor<32x16xf32>)
+                           outs(%arg8 : tensor<8x16xf32>)
+          -> tensor<8x16xf32>
+        scf.yield %10 : tensor<8x16xf32>
+      }
+
+      // insert_slice is inplace but its source comes from an equivalent buffer
+      // that is not in place. So we must insert a copy of the small buffer into
+      // the bigger buffer.
+ // CHECK: linalg.copy(%[[ALLOC]], %[[T]]) + %7 = tensor.insert_slice %6 into %arg6[%arg3, %arg5] [8, 16] [1, 1] : + tensor<8x16xf32> into tensor<128x192xf32> + + // CHECK: memref.dealloc %[[ALLOC]] + scf.yield %7 : tensor<128x192xf32> + } + scf.yield %2 : tensor<128x192xf32> + } + return %0 : tensor<128x192xf32> +} diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir --- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir +++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir @@ -1,704 +1,75 @@ -// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize=test-analysis-only -split-input-file | FileCheck %s +// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize -//===----------------------------------------------------------------------===// -// Simple cases -//===----------------------------------------------------------------------===// +#map0 = affine_map<(d0) -> (64, -d0 + 518)> +#map1 = affine_map<(d0) -> (d0 ceildiv 64)> +#map2 = affine_map<(d0, d1) -> (d1, d0)> +#map3 = affine_map<(d0, d1, d2) -> (d0, d2)> +#map4 = affine_map<(d0, d1, d2) -> (d1, d2)> +#map5 = affine_map<(d0, d1, d2) -> (d0, d1)> -// ----- - -// CHECK-LABEL: func @extract_slice_fun -func @extract_slice_fun(%A : tensor, %B : tensor {linalg.inplaceable = true}) - -> (tensor<4xf32>, tensor<8xf32>) -{ - // tensor.extract_slice is not used in a write, it is not compelled to - // bufferize out of place. Let callers decide whether they want to create - // aliasing subviews at all call sites or whether they allocate. - // This is true irrespective of whether the function argument is inplaceable. - // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %r0 = tensor.extract_slice %A[0][4][1] : tensor to tensor<4xf32> - - // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %r1 = tensor.extract_slice %B[0][8][1] : tensor to tensor<8xf32> - - return %r0, %r1: tensor<4xf32>, tensor<8xf32> -} - -// ----- - -// CHECK-LABEL: func @insert_slice_fun -func @insert_slice_fun( - %A : tensor, - %B : tensor {linalg.inplaceable = true}, - %C : tensor<4xf32>) - -> (tensor, tensor) -{ - // must bufferize out of place. - // CHECK: tensor.insert_slice - // CHECK-SAME: {__inplace_results_attr__ = ["false"]} - %r0 = tensor.insert_slice %C into %A[0][4][1] : tensor<4xf32> into tensor - - // bufferizes inplace. - // CHECK: tensor.insert_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %r1 = tensor.insert_slice %C into %B[0][4][1] : tensor<4xf32> into tensor - - return %r0, %r1: tensor, tensor -} - -// ----- - -// CHECK-LABEL: func @conflict_on_B -func @conflict_on_B( - %A : tensor<4x4xf32> {linalg.inplaceable = true}, - %B : tensor<4x4xf32> {linalg.inplaceable = true}) - -> (tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>) -{ - // matmul output operand interferes with input operand. - // CHECK: linalg.matmul - // CHECK-SAME: {__inplace_results_attr__ = ["false"]} - %C = linalg.matmul ins(%A, %B: tensor<4x4xf32>, tensor<4x4xf32>) - outs(%B: tensor<4x4xf32>) - -> tensor<4x4xf32> - - // matmul output operand interferes with input operand. - // CHECK: linalg.matmul - // CHECK-SAME: {__inplace_results_attr__ = ["false"]} - %D = linalg.matmul ins(%B, %A: tensor<4x4xf32>, tensor<4x4xf32>) - outs(%B: tensor<4x4xf32>) - -> tensor<4x4xf32> - - // matmul output operand does not interferes with input operand. 
- // CHECK: linalg.matmul - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %E = linalg.matmul ins(%A, %A: tensor<4x4xf32>, tensor<4x4xf32>) - outs(%B: tensor<4x4xf32>) - -> tensor<4x4xf32> - - return %C, %D, %E: tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32> -} - -//===----------------------------------------------------------------------===// -// Length-1 producer-consumer cases. -//===----------------------------------------------------------------------===// - -// ----- - -// CHECK-LABEL: func @extract_slice_extract_slice -func @extract_slice_extract_slice( - %A : tensor {linalg.inplaceable = true}, %B : tensor) - -> (tensor<2xf32>, tensor<2xf32>) -{ - // tensor.extract_slice is not used in a write, it is not compelled to - // bufferize out of place. Let callers decide whether they want to create - // aliasing subviews at all call sites or whether they allocate. - // This is true irrespective of whether the function argument is inplaceable. - // CHECK: {__inplace_results_attr__ = ["true"]} - %r0 = tensor.extract_slice %A[0][4][1] : tensor to tensor<4xf32> - - // CHECK: {__inplace_results_attr__ = ["true"]} - %r1 = tensor.extract_slice %r0[0][2][1] : tensor<4xf32> to tensor<2xf32> - - // CHECK: {__inplace_results_attr__ = ["true"]} - %r2 = tensor.extract_slice %B[0][4][1] : tensor to tensor<4xf32> - - // CHECK: {__inplace_results_attr__ = ["true"]} - %r3 = tensor.extract_slice %r2[0][2][1] : tensor<4xf32> to tensor<2xf32> - - return %r1, %r3: tensor<2xf32>, tensor<2xf32> -} - -// ----- - -// CHECK-LABEL: func @insert_slice_insert_slice -func @insert_slice_insert_slice( - %A : tensor {linalg.inplaceable = true}, - %A2 : tensor<4xf32> {linalg.inplaceable = true}, - %A3 : tensor<2xf32> {linalg.inplaceable = true}, - %B : tensor, %B2 : tensor<4xf32>, %B3 : tensor<2xf32>) - -> (tensor, tensor) -{ - // CHECK: {__inplace_results_attr__ = ["true"]} - %r0 = tensor.insert_slice %A3 into %A2[0][2][1] : tensor<2xf32> into tensor<4xf32> - - // CHECK: {__inplace_results_attr__ = ["true"]} - %r1 = tensor.insert_slice %r0 into %A[0][4][1] : tensor<4xf32> into tensor - - // CHECK: {__inplace_results_attr__ = ["false"]} - %r2 = tensor.insert_slice %B3 into %B2[0][2][1] : tensor<2xf32> into tensor<4xf32> - - // CHECK: {__inplace_results_attr__ = ["false"]} - %r3 = tensor.insert_slice %r2 into %B[0][4][1] : tensor<4xf32> into tensor - - return %r1, %r3: tensor, tensor -} - -// ----- - -// CHECK-LABEL: func @extract_slice_nonmatching_insert_slice -func @extract_slice_nonmatching_insert_slice( - %A : tensor {linalg.inplaceable = true}, - %B : tensor, %idx: index) - -> (tensor, tensor) -{ - // %r1 bufferizes inplace because %A is inplaceable. - // %r0 is an overlapping tensor.extract_slice that does not match, it must be - // out of place. - // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["false"]} - %r0 = tensor.extract_slice %A[0][4][1] : tensor to tensor<4xf32> - - // %r1 can bufferize inplace fine. - // CHECK: tensor.insert_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %r1 = tensor.insert_slice %r0 into %A[%idx][4][1] : tensor<4xf32> into tensor - - // %r3 does bufferizes inplace because %B is not inplaceable. - // %r0 is an overlapping tensor.extract_slice that does not match, but does - // not alias with the buffer coming from %r3 so it can actually bufferize - // inplace. 
- // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %r2 = tensor.extract_slice %B[0][4][1] : tensor to tensor<4xf32> - - // %r3 cannot bufferize inplace since %B is not inplaceable. - // CHECK: tensor.insert_slice - // CHECK-SAME: {__inplace_results_attr__ = ["false"]} - %r3 = tensor.insert_slice %r2 into %B[%idx][4][1] : tensor<4xf32> into tensor - - return %r1, %r3: tensor, tensor -} - -// ----- - -// CHECK-LABEL: func @extract_slice_matching_insert_slice -func @extract_slice_matching_insert_slice( - %A : tensor {linalg.inplaceable = true}, - %B : tensor) - -> (tensor, tensor) -{ - // %r1 bufferizes inplace because %A is inplaceable. - // %r0 is a tensor.extract_slice that matches, it can also be bufferized - // inplace. - // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %r0 = tensor.extract_slice %A[0][4][1] : tensor to tensor<4xf32> - - // CHECK: tensor.insert_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %r1 = tensor.insert_slice %r0 into %A[0][4][1] : tensor<4xf32> into tensor - - // %r2 is a tensor.extract_slice that matches %r3, it can be bufferized - // inplace. - // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %r2 = tensor.extract_slice %B[0][4][1] : tensor to tensor<4xf32> - - // tensor.insert_slice cannot bufferize inplace. - // This should have been captured by a canonicalization pattern and it would - // be unproductive to have special logic in bufferization to encode matching - // insert_slice(extract_slice(A), A). - // CHECK: tensor.insert_slice - // CHECK-SAME: {__inplace_results_attr__ = ["false"]} - %r3 = tensor.insert_slice %r2 into %B[0][4][1] : tensor<4xf32> into tensor - - return %r1, %r3: tensor, tensor -} - -// ----- - -// CHECK-LABEL: func @extract_slice_linalg_readonly_use -func @extract_slice_linalg_readonly_use( - %A : tensor, - %B : tensor<4x4xf32>, - %C : tensor<4x4xf32> {linalg.inplaceable = true}) - -> (tensor<4x4xf32>, tensor<4x4xf32>) -{ - // tensor.extract_slice is only used as a read, no interference irrespective - // of user's inplace status. - // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %sA = tensor.extract_slice %A[0, 0][4, 4][1, 1] : tensor to tensor<4x4xf32> - - // matmul output operand is not inplaceable at the function boundary. - // CHECK: linalg.matmul - // CHECK-SAME: {__inplace_results_attr__ = ["false"]} - %D = linalg.matmul ins(%sA, %B: tensor<4x4xf32>, tensor<4x4xf32>) - outs(%B: tensor<4x4xf32>) - -> tensor<4x4xf32> - - // matmul output operand is inplaceable at the function boundary. - // CHECK: linalg.matmul - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %E = linalg.matmul ins(%sA, %B: tensor<4x4xf32>, tensor<4x4xf32>) - outs(%C: tensor<4x4xf32>) - -> tensor<4x4xf32> - - return %D, %E: tensor<4x4xf32>, tensor<4x4xf32> -} - -// ----- - -// CHECK-LABEL: func @extract_slice_to_linalg_write_use -func @extract_slice_to_linalg_write_use( - %A : tensor<4x4xf32>, - %B : tensor, - %C : tensor {linalg.inplaceable = true}) - -> (tensor<4x4xf32>, tensor<4x4xf32>) -{ - // Step 4. %sB forward propagates to a write in %D but it is not inplace. - // So this is only ever read and can bufferize inplace. - // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %sB = tensor.extract_slice %B[0, 0][4, 4][1, 1] : tensor to tensor<4x4xf32> - - // Step 3. %sB has a read interference in %E, it does not bufferize inplace. 
- // CHECK: linalg.matmul - // CHECK-SAME: {__inplace_results_attr__ = ["false"]} - %D = linalg.matmul ins(%B, %C: tensor, tensor) - outs(%sB: tensor<4x4xf32>) - -> tensor<4x4xf32> - - // Step 2. %sC forward propagates to an inplace write in %E. - // %sC backward propagates to %C which is inplaceable. - // As a consequence this is bufferized inplace. - // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %sC = tensor.extract_slice %C[0, 0][4, 4][1, 1] : tensor to tensor<4x4xf32> - - // Step 1. %sC backprops to the tensor.extract_slice producer which is not - // considered an interference. This bufferizes inplace. - // CHECK: linalg.matmul - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %E = linalg.matmul ins(%A, %sB: tensor<4x4xf32>, tensor<4x4xf32>) - outs(%sC: tensor<4x4xf32>) - -> tensor<4x4xf32> - - return %D, %E: tensor<4x4xf32>, tensor<4x4xf32> -} - -//===----------------------------------------------------------------------===// -// Transitive cases -//===----------------------------------------------------------------------===// - -// ----- - -// CHECK-LABEL: func @extract_slice_to_linalg_write_use -func @extract_slice_to_linalg_write_use( - %A : tensor<4x4xf32>, - %B : tensor, - %C : tensor {linalg.inplaceable = true}) - -> (tensor<4x4xf32>, tensor<4x4xf32>) -{ - // Step 4. %sB forward propagates to an inplace write in %D. - // %sB backward propagates to %B which is not inplaceable. - // As a consequence this is bufferized out of place. - // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["false"]} - %sB = tensor.extract_slice %B[0, 0][4, 4][1, 1] : tensor to tensor<4x4xf32> - - // Step 3. %sB backprops to the tensor.extract_slice producer which is not - // considered an interference. This bufferizes inplace. - // CHECK: linalg.matmul - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %D = linalg.matmul ins(%B, %C: tensor, tensor) - outs(%sB: tensor<4x4xf32>) - -> tensor<4x4xf32> - - // Step 2. %sC forward propagates to an inplace write in %E. - // %sC backward propagates to %C which is inplaceable. - // As a consequence this is bufferized inplace. - // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %sC = tensor.extract_slice %C[0, 0][4, 4][1, 1] : tensor to tensor<4x4xf32> - - // Step 1. %sC backprops to the tensor.extract_slice producer which is not - // considered an interference. This bufferizes inplace. - // CHECK: linalg.matmul - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %E = linalg.matmul ins(%A, %A: tensor<4x4xf32>, tensor<4x4xf32>) - outs(%sC: tensor<4x4xf32>) - -> tensor<4x4xf32> - - return %D, %E: tensor<4x4xf32>, tensor<4x4xf32> -} - -// ----- - -// CHECK-LABEL: func @nested_extract_slice_and_insert -func @nested_extract_slice_and_insert( - %A : tensor, - %B : tensor {linalg.inplaceable = true}, - %C : tensor {linalg.inplaceable = true}, - %idx : index) - -> (tensor, tensor, tensor) -{ - %f0 = constant 0.0 : f32 - - // 2-level matching tensor.extract_slice / tensor.insert_slice into non - // inplaceable %A. - // - %rA is not inplaceable because %A is not inplaceable at function boundary. - // - once %rA is deemed not inplaceable, nothing prevent %rsA to be inplaceable - // - this propagates to %FA and %ssA being inplaceable. - // - %sA would then bufferize to an inplace write (i.e. %FA) but %A is not - // inplaceable and so %sA is not inplaceable. 
- // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["false"]} - // CHECK-NEXT: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - // CHECK-NEXT: fill - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - // CHECK-NEXT: tensor.insert_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - // CHECK-NEXT: tensor.insert_slice - // CHECK-SAME: {__inplace_results_attr__ = ["false"]} - %sA = tensor.extract_slice %A[0, 0][%idx, %idx][1, 1] : tensor to tensor - %ssA = tensor.extract_slice %sA[0, 0][4, 4][1, 1] : tensor to tensor<4x4xf32> - %FA = linalg.fill(%f0, %ssA) : f32, tensor<4x4xf32> -> tensor<4x4xf32> - %rsA = tensor.insert_slice %FA into %sA[0, 0][4, 4][1, 1] : tensor<4x4xf32> into tensor - %rA = tensor.insert_slice %rsA into %A[0, 0][%idx, %idx][1, 1] : tensor into tensor - - // 3-level matching tensor.extract_slice / tensor.insert_slice into - // inplaceable %B. - // CHECK-NEXT: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - // CHECK-NEXT: tensor.extract_slice - // Atm, this 2nd tensor.extract_slice fails to bufferize inplace because - // clobbering analysis conservatively test for equivalent buffers. - // TODO: This is currently too restrictive and misses clobberings. - // When available, use container-containee analysis. - // CHECK-SAME: {__inplace_results_attr__ = ["false"]} - // CHECK-NEXT: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - // CHECK-NEXT: fill - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - // CHECK-NEXT: tensor.insert_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - // CHECK-NEXT: tensor.insert_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - // CHECK-NEXT: tensor.insert_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %sB = tensor.extract_slice %B[0, 0][%idx, %idx][1, 1] : tensor to tensor - %ssB = tensor.extract_slice %sB[0, 0][4, %idx][1, 1] : tensor to tensor<4x?xf32> - %sssB = tensor.extract_slice %ssB[0, 0][4, 4][1, 1] : tensor<4x?xf32> to tensor<4x4xf32> - %FB = linalg.fill(%f0, %sssB) : f32, tensor<4x4xf32> -> tensor<4x4xf32> - %rssB = tensor.insert_slice %FB into %ssB[0, 0][4, 4][1, 1] : tensor<4x4xf32> into tensor<4x?xf32> - %rsB = tensor.insert_slice %rssB into %sB[0, 0][4, %idx][1, 1] : tensor<4x?xf32> into tensor - %rB = tensor.insert_slice %rsB into %B[0, 0][%idx, %idx][1, 1] : tensor into tensor - - // 2-level matching tensor.extract_slice / tensor.insert_slice into - // inplaceable %C with a twist. - // Throw a wrench in the system: %rsC production sizes do not match %ssC. - // CHECK-NEXT: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - // The tensor.insert_slice that would be candidate for matching does not actually - // match. That tensor.insert_slice can still be bufferized inplace nonetheless - // but this tensor.extract_slice, which bufferizes to an inplace write, cannot. 
- // CHECK-NEXT: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["false"]} - // CHECK-NEXT: fill - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - // CHECK-NEXT: tensor.insert_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - // CHECK-NEXT: tensor.insert_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %sC = tensor.extract_slice %C[0, 0][%idx, %idx][1, 1] : tensor to tensor - %ssC = tensor.extract_slice %sC[0, 0][4, 4][1, 1] : tensor to tensor<4x4xf32> - %FC = linalg.fill(%f0, %ssC) : f32, tensor<4x4xf32> -> tensor<4x4xf32> - %rsC = tensor.insert_slice %FC into %sC[0, 0][12345, 67890][1, 1] : tensor<4x4xf32> into tensor - %rC = tensor.insert_slice %rsC into %C[0, 0][%idx, %idx][1, 1] : tensor into tensor - - return %rA, %rB, %rC: tensor, tensor, tensor -} - -//===----------------------------------------------------------------------===// -// Simple loop cases -//===----------------------------------------------------------------------===// - -// ----- - -// CHECK-LABEL: func @scf_for_yield_only -func @scf_for_yield_only(%A : tensor, - %B : tensor {linalg.inplaceable = true}, - %lb : index, %ub : index, %step : index) - -> (tensor, tensor) -{ - // CHECK: scf.for - // CHECK-NEXT: scf.yield - // CHECK-NEXT: {__inplace_results_attr__ = ["false"]} - %r0 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor) { - scf.yield %t : tensor - } - - // CHECK: scf.for - // CHECK-NEXT: scf.yield - // CHECK-NEXT: {__inplace_results_attr__ = ["true"]} - %r1 = scf.for %i = %lb to %ub step %step iter_args(%t = %B) -> (tensor) { - scf.yield %t : tensor - } - - return %r0, %r1: tensor, tensor -} - -// ----- - -// CHECK-LABEL: func @scf_for_with_tensor.insert_slice -func @scf_for_with_tensor.insert_slice(%A : tensor, - %B : tensor {linalg.inplaceable = true}, - %C : tensor<4xf32>, - %lb : index, %ub : index, %step : index) - -> (tensor, tensor) -{ - // CHECK: scf.for - // scf.for bbArgs are always inplaceable seen from ops inside the body: - // 1. Either the matching tensor is not inplaceable and an alloc occurs - // which makes bbArg inplaceable. - // 2. Or it is already inplaceable and so is bbArg. - // CHECK-NEXT: tensor.insert_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - // CHECK-NEXT: tensor.insert_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - // CHECK-NEXT: scf.yield - // CHECK-NEXT: {__inplace_results_attr__ = ["false", "true"]} - %r0:2 = scf.for %i = %lb to %ub step %step iter_args(%tA = %A, %tB = %B) - -> (tensor, tensor) - { - %ttA = tensor.insert_slice %C into %tA[0][4][1] : tensor<4xf32> into tensor - %ttB = tensor.insert_slice %C into %tB[0][4][1] : tensor<4xf32> into tensor - scf.yield %ttA, %ttB : tensor, tensor - } - - return %r0#0, %r0#1: tensor, tensor -} - -// ----- - -func private @some_use(tensor) -> () - -// CHECK-LABEL: func @scf_for_deps -func @scf_for_deps(%A : tensor {linalg.inplaceable = true}, - %B : tensor {linalg.inplaceable = true}, - %lb : index, %ub : index, %step : index) - -> (tensor, tensor) -{ - // %r0 must be out of place because one use of %t in the subsequent production - // of %r1 is read. - // CHECK: scf.for - // CHECK-NEXT: call - // CHECK-NEXT: scf.yield - // CHECK-NEXT: {__inplace_results_attr__ = ["false"]} - %r0 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor) { - call @some_use(%t) : (tensor) -> () - scf.yield %t : tensor - } - - // %r1 bufferizes inplace fine. 
- // CHECK: scf.for - // CHECK-NEXT: call - // CHECK-NEXT: scf.yield - // CHECK-NEXT: {__inplace_results_attr__ = ["true"]} - %r1 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor) { - call @some_use(%t) : (tensor) -> () - scf.yield %t : tensor - } - - // %r2 must be out of place because one use of %t in the subsequent production - // of %r3 is read. - // CHECK: linalg.tiled_loop - // CHECK-NEXT: call - // CHECK-NEXT: linalg.yield - // CHECK-NEXT: {__inplace_results_attr__ = ["false"]} - %r2 = linalg.tiled_loop (%i) = (%lb) to (%ub) step (%step) - ins() - outs(%t = %B: tensor) { - call @some_use(%t) : (tensor) -> () - linalg.yield %t : tensor - } - - // %r3 bufferizes inplace fine. - // CHECK: linalg.tiled_loop - // CHECK-NEXT: call - // CHECK-NEXT: linalg.yield - // CHECK-NEXT: {__inplace_results_attr__ = ["true"]} - %r3 = linalg.tiled_loop (%i) = (%lb) to (%ub) step (%step) - ins() - outs(%t = %B: tensor) { - call @some_use(%t) : (tensor) -> () - linalg.yield %t : tensor - } - - return %r1, %r3: tensor, tensor -} - -// ----- - -//===----------------------------------------------------------------------===// -// Cross function boundary cases. -//===----------------------------------------------------------------------===// - -func private @foo(tensor<64xf32>) - -// CHECK-LABEL: dependence_through_call -func @dependence_through_call(%I : tensor<64xf32> {linalg.inplaceable = true}) { - %f1 = constant 1.000000e+00 : f32 - %f2 = constant 2.000000e+00 : f32 - - // 2. %B already bufferizes inplace, %A would alias and have a different - // value. The calls to `foo` are determined to read conservatively, so %A - // cannot bufferize inplace. - // CHECK: fill - // CHECK-SAME: {__inplace_results_attr__ = ["false"]} - %A = linalg.fill(%f1, %I) : f32, tensor<64xf32> -> tensor<64xf32> - - // 1. Bufferizes inplace: no alias to %A is yet possible. - // CHECK: fill - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %B = linalg.fill(%f2, %I) : f32, tensor<64xf32> -> tensor<64xf32> - - call @foo(%A) : (tensor<64xf32>) -> () - call @foo(%B) : (tensor<64xf32>) -> () - - return -} - -// ----- - -func private @foo(tensor<64xf32>) - -func private @bar(%A : tensor<64xf32>) { - call @foo(%A) : (tensor<64xf32>) -> () - return -} - -func @read_dependence_through_scf_and_call( - %I : tensor<64xf32> {linalg.inplaceable = true}, - %I2 : tensor<64xf32> {linalg.inplaceable = true}) { +func @matmul_on_tensors(%arg0: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, %arg1: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, %arg2: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true}) -> tensor<518x518xf32> attributes {passthrough = [["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]} { %c0 = constant 0 : index - %c1 = constant 1 : index - %c10 = constant 10 : index - %f1 = constant 1.000000e+00 : f32 - %f2 = constant 2.000000e+00 : f32 - - // 5. %B bufferizes inplace, %A would alias and have a different value. - // The calls to `foo` are determined to read conservatively, so %A cannot - // bufferize inplace. - // CHECK: fill - // CHECK-SAME: {__inplace_results_attr__ = ["false"]} - %A = linalg.fill(%f1, %I) : f32, tensor<64xf32> -> tensor<64xf32> - - // 4. Bufferizes inplace: no alias to %A is yet possible. 
-  // CHECK: fill
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %B = linalg.fill(%f2, %I) : f32, tensor<64xf32> -> tensor<64xf32>
-
-  // 3. Does not read or write, bufferizes inplace.
-  // CHECK: scf.for
-  // CHECK: {__inplace_results_attr__ = ["true", "true"]}
-  %r:2 = scf.for %i = %c0 to %c10 step %c1 iter_args(%0 = %A, %1 = %B)
-    -> (tensor<64xf32>, tensor<64xf32>)
-  {
-    scf.yield %0, %1 : tensor<64xf32>, tensor<64xf32>
+  %c518 = constant 518 : index
+  %c64 = constant 64 : index
+  %cst = constant 0.000000e+00 : f32
+  %c16 = constant 16 : index
+  %0 = linalg.fill(%cst, %arg2) : f32, tensor<518x518xf32> -> tensor<518x518xf32>
+  %1 = linalg.init_tensor [9, 64, 64] : tensor<9x64x64xf32>
+  %2 = tensor.cast %1 : tensor<9x64x64xf32> to tensor<?x64x64xf32>
+  %3 = scf.for %arg3 = %c0 to %c518 step %c64 iter_args(%arg4 = %0) -> (tensor<518x518xf32>) {
+    %4 = affine.min #map0(%arg3)
+    %5 = scf.for %arg5 = %c0 to %c518 step %c64 iter_args(%arg6 = %arg4) -> (tensor<518x518xf32>) {
+      %6 = affine.min #map0(%arg5)
+      %7 = scf.for %arg7 = %c0 to %c518 step %c64 iter_args(%arg8 = %2) -> (tensor<?x64x64xf32>) {
+        %10 = affine.apply #map1(%arg7)
+        %11 = affine.min #map0(%arg7)
+        %12 = tensor.extract_slice %arg1[%arg7, %arg5] [%11, %6] [1, 1] : tensor<518x518xf32> to tensor<?x?xf32>
+        %13 = vector.transfer_read %12[%c0, %c0], %cst : tensor<?x?xf32>, vector<64x64xf32>
+        %14 = vector.transfer_write %13, %arg8[%10, %c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<?x64x64xf32>
+        scf.yield %14 : tensor<?x64x64xf32>
+      }
+      %8 = scf.for %arg7 = %c0 to %c518 step %c64 iter_args(%arg8 = %2) -> (tensor<?x64x64xf32>) {
+        %10 = affine.apply #map1(%arg7)
+        %11 = affine.min #map0(%arg7)
+        %12 = tensor.extract_slice %arg0[%arg3, %arg7] [%4, %11] [1, 1] : tensor<518x518xf32> to tensor<?x?xf32>
+        %13 = vector.transfer_read %12[%c0, %c0], %cst : tensor<?x?xf32>, vector<64x64xf32>
+        %14 = vector.transfer_write %13, %arg8[%10, %c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<?x64x64xf32>
+        scf.yield %14 : tensor<?x64x64xf32>
+      }
+      %9 = scf.for %arg7 = %c0 to %c518 step %c64 iter_args(%arg8 = %arg6) -> (tensor<518x518xf32>) {
+        %10 = tensor.extract_slice %arg8[%arg3, %arg5] [%4, %6] [1, 1] : tensor<518x518xf32> to tensor<?x?xf32>
+        %11 = affine.apply #map1(%arg7)
+        %12 = tensor.extract_slice %8[%11, 0, 0] [1, 64, 64] [1, 1, 1] : tensor<?x64x64xf32> to tensor<64x64xf32>
+        %13 = tensor.extract_slice %7[%11, 0, 0] [1, 64, 64] [1, 1, 1] : tensor<?x64x64xf32> to tensor<64x64xf32>
+        %14 = linalg.init_tensor [64, 64] : tensor<64x64xf32>
+        %15 = linalg.fill(%cst, %14) : f32, tensor<64x64xf32> -> tensor<64x64xf32>
+        %16 = vector.transfer_read %10[%c0, %c0], %cst : tensor<?x?xf32>, vector<64x64xf32>
+        %17 = vector.transfer_write %16, %15[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32>
+        %18 = scf.for %arg9 = %c0 to %c64 step %c16 iter_args(%arg10 = %17) -> (tensor<64x64xf32>) {
+          %21 = scf.for %arg11 = %c0 to %c64 step %c16 iter_args(%arg12 = %arg10) -> (tensor<64x64xf32>) {
+            %22 = tensor.extract_slice %arg12[%arg9, %arg11] [16, 16] [1, 1] : tensor<64x64xf32> to tensor<16x16xf32>
+            %23 = vector.transfer_read %22[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<16x16xf32>, vector<16x16xf32>
+            %24 = scf.for %arg13 = %c0 to %c64 step %c16 iter_args(%arg14 = %23) -> (vector<16x16xf32>) {
+              %27 = tensor.extract_slice %12[%arg9, %arg13] [16, 16] [1, 1] : tensor<64x64xf32> to tensor<16x16xf32>
+              %28 = tensor.extract_slice %13[%arg13, %arg11] [16, 16] [1, 1] : tensor<64x64xf32> to tensor<16x16xf32>
+              %29 = vector.transfer_read %27[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<16x16xf32>, vector<16x16xf32>
+              %30 = vector.transfer_read %28[%c0, %c0], %cst {in_bounds = [true, true], permutation_map = #map2} : tensor<16x16xf32>, vector<16x16xf32>
+              %31 = vector.contract {indexing_maps = [#map3, #map4, #map5], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %29, %30, %arg14 : vector<16x16xf32>, vector<16x16xf32> into vector<16x16xf32>
+              scf.yield %31 : vector<16x16xf32>
+            }
+            %25 = vector.transfer_write %24, %22[%c0, %c0] {in_bounds = [true, true]} : vector<16x16xf32>, tensor<16x16xf32>
+            %26 = tensor.insert_slice %25 into %arg12[%arg9, %arg11] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<64x64xf32>
+            scf.yield %26 : tensor<64x64xf32>
+          }
+          scf.yield %21 : tensor<64x64xf32>
+        }
+        %19 = tensor.extract_slice %18[0, 0] [%4, %6] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
+        %20 = tensor.insert_slice %19 into %arg8[%arg3, %arg5] [%4, %6] [1, 1] : tensor<?x?xf32> into tensor<518x518xf32>
+        scf.yield %20 : tensor<518x518xf32>
+      }
+      scf.yield %9 : tensor<518x518xf32>
+    }
+    scf.yield %5 : tensor<518x518xf32>
   }
-  call @foo(%r#0) : (tensor<64xf32>) -> ()
-  call @foo(%r#1) : (tensor<64xf32>) -> ()
-
-  // 2. %B2 already bufferizes inplace, %A2 would alias and have a different
-  // value. The calls to `foo` are determined to read conservatively, so %A2
-  // cannot bufferize inplace.
-  // CHECK: fill
-  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
-  %A2 = linalg.fill(%f1, %I2) : f32, tensor<64xf32> -> tensor<64xf32>
-
-  // 1. Bufferizes inplace: no alias to %A2 is yet possible.
-  // CHECK: fill
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %B2 = linalg.fill(%f2, %I2) : f32, tensor<64xf32> -> tensor<64xf32>
-
-  call @bar(%A2) : (tensor<64xf32>) -> ()
-  call @bar(%B2) : (tensor<64xf32>) -> ()
-  return
-}
-
-//===----------------------------------------------------------------------===//
-// Transitive cases through extract_slice.
-//===----------------------------------------------------------------------===// - -builtin.func @matmul_on_tensors( - %arg0: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, - %arg1: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, - %arg2: tensor<256x256xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true}) - -> tensor<256x256xf32> -{ - %c0 = constant 0 : index - %cst_0 = constant 0.000000e+00 : f32 - %cst_1 = constant 1.000000e+00 : f32 - - %7 = linalg.init_tensor [256, 256] : tensor<256x256xf32> - - // CHECK: linalg.fill - // CHECK-SAME: {__inplace_results_attr__ = ["false"]} - // CHECK: linalg.fill - // CHECK-SAME: {__inplace_results_attr__ = ["false"]} - %8 = linalg.fill(%cst_0, %7) : f32, tensor<256x256xf32> -> tensor<256x256xf32> - %11 = linalg.fill(%cst_1, %7) : f32, tensor<256x256xf32> -> tensor<256x256xf32> - - // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - // CHECK: linalg.matmul - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %sA = tensor.extract_slice %8[0, 0][256, 16][1, 1]: tensor<256x256xf32> to tensor<256x16xf32> - %sB = tensor.extract_slice %11[0, 0][16, 256][1, 1]: tensor<256x256xf32> to tensor<16x256xf32> - %r = linalg.matmul - ins(%sA, %sB : tensor<256x16xf32>, tensor<16x256xf32>) - outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32> - - return %r : tensor<256x256xf32> -} - -// ----- - -builtin.func @matmul_on_tensors( - %arg0: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, - %arg1: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, - %arg2: tensor<256x256xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true}) - -> tensor<256x256xf32> -{ - %c0 = constant 0 : index - %cst_0 = constant 0.000000e+00 : f32 - %cst_1 = constant 1.000000e+00 : f32 - - %7 = linalg.init_tensor [256, 256] : tensor<256x256xf32> - - // CHECK: linalg.fill - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - // CHECK: vector.transfer_write - // CHECK-SAME: {__inplace_results_attr__ = ["false"] - %8 = linalg.fill(%cst_0, %7) : f32, tensor<256x256xf32> -> tensor<256x256xf32> - %9 = vector.transfer_read %arg0[%c0, %c0], %cst_0 {in_bounds = [false, true]} : tensor<518x518xf32>, vector<256x256xf32> - %10 = vector.transfer_write %9, %8[%c0, %c0] {in_bounds = [true, true]} : vector<256x256xf32>, tensor<256x256xf32> - - // CHECK: linalg.fill - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - // CHECK: vector.transfer_write - // CHECK-SAME: {__inplace_results_attr__ = ["false"] - %11 = linalg.fill(%cst_1, %7) : f32, tensor<256x256xf32> -> tensor<256x256xf32> - %12 = vector.transfer_read %arg1[%c0, %c0], %cst_0 {in_bounds = [false, true]} : tensor<518x518xf32>, vector<256x256xf32> - %13 = vector.transfer_write %12, %11[%c0, %c0] {in_bounds = [true, true]} : vector<256x256xf32>, tensor<256x256xf32> - - // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - // CHECK: linalg.matmul - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %sA = tensor.extract_slice %10[0, 0][256, 16][1, 1]: tensor<256x256xf32> to tensor<256x16xf32> - %sB = 
tensor.extract_slice %13[0, 0][16, 256][1, 1]: tensor<256x256xf32> to tensor<16x256xf32> - %r = linalg.matmul - ins(%sA, %sB : tensor<256x16xf32>, tensor<16x256xf32>) - outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32> - - return %r : tensor<256x256xf32> + return %3 : tensor<518x518xf32> } diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir --- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir +++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir @@ -57,7 +57,7 @@ %f0 = constant 0.0 : f32 // CHECK: %[[D0:.*]] = memref.dim %[[A]], {{.*}} : memref - // CHECK: %[[ALLOC:.*]] = memref.alloc(%[[D0]]) : memref + // CHECK: %[[ALLOC:.*]] = memref.alloc(%[[D0]]) {alignment = 128 : i64} : memref // CHECK: linalg.fill(%[[F0]], %[[ALLOC]]) : f32, memref %r = linalg.fill(%f0, %A) : f32, tensor -> tensor @@ -133,6 +133,7 @@ /// Cross-op multiple uses of %A, the first vector.transfer which has interfering reads must alloc. // CHECK: %[[ALLOC:.*]] = memref.alloc + // CHECK: linalg.copy({{.*}}, %[[ALLOC]]) // CHECK-NEXT: vector.transfer_write {{.*}}, %[[ALLOC]] %r0 = vector.transfer_write %vec, %A[%c0] : vector<4xf32>, tensor @@ -161,22 +162,24 @@ %t1 : tensor<4xf32> {linalg.inplaceable = true}) -> (tensor, tensor, tensor, tensor) { - // Alloc and copy the whole result tensor. Copy the tensor.extract_slice. + // Hoisted allocs. + // CHECK: %[[REALLOC_A1:.*]] = memref.alloc + // CHECK: %[[REALLOC_A0_2:.*]] = memref.alloc // CHECK: %[[REALLOC_A0:.*]] = memref.alloc + + // Alloc and copy the whole result tensor. Copy the tensor.extract_slice. // CHECK: linalg.copy(%[[A0]], %[[REALLOC_A0]] // CHECK: %[[SV_A0:.*]] = memref.subview %[[REALLOC_A0]] // CHECK: linalg.copy(%[[t0]], %[[SV_A0]]) %r0 = tensor.insert_slice %t0 into %A0[0][4][1] : tensor<4xf32> into tensor // Alloc and copy the whole result tensor. Copy the tensor.extract_slice. - // CHECK: %[[REALLOC_A0_2:.*]] = memref.alloc // CHECK: linalg.copy(%[[A0]] // CHECK: %[[SV_A0_2:.*]] = memref.subview %[[REALLOC_A0_2]] // CHECK: linalg.copy(%[[t1]], %[[SV_A0_2]]) %r1 = tensor.insert_slice %t1 into %A0[0][4][1] : tensor<4xf32> into tensor // Still alloc the large tensor because %A1 is read after. Copy the tensor.extract_slice. - // CHECK: %[[REALLOC_A1:.*]] = memref.alloc // CHECK: linalg.copy(%[[A1]] // CHECK: %[[SV_A1:.*]] = memref.subview %[[REALLOC_A1]] // CHECK: linalg.copy(%[[t0]], %[[SV_A1]]) @@ -255,7 +258,7 @@ func @insert_slice_fun_not_inplace(%A : tensor, %t : tensor<4xf32>) -> tensor { - // CHECK: %[[ALLOC:.*]] = memref.alloc(%{{.*}}) : memref + // CHECK: %[[ALLOC:.*]] = memref.alloc(%{{.*}}) {alignment = 128 : i64} : memref // CHECK: linalg.copy(%[[A]], %[[ALLOC]]) : memref // CHECK: %[[SV:.*]] = memref.subview %[[ALLOC]][0] [4] [1] : memref to memref<4xf32> // CHECK: linalg.copy(%[[t]], %[[SV]]) : memref<4xf32, #map>, memref<4xf32> @@ -285,7 +288,7 @@ // fill would interfere with %r0 that is also being returned. // So we need to bufferize it out of place and make a new alloc. 
-  // CHECK-DAG: %[[ALLOC:.*]] = memref.alloc({{.*}}) : memref<?xf32>
+  // CHECK-DAG: %[[ALLOC:.*]] = memref.alloc({{.*}}) {alignment = 128 : i64} : memref<?xf32>
   // CHECK: linalg.fill(%{{.*}}, %[[ALLOC]]
   %r1 = linalg.fill(%f0, %A) : f32, tensor<?xf32> -> tensor<?xf32>
 
@@ -489,9 +492,9 @@
   %v1 = constant 1.0 : f32
   %v2 = constant 2.0 : f32
 
-  // CHECK-NEXT: %[[A:.*]] = memref.alloc() : memref<64xf32>
-  // CHECK-NEXT: %[[B:.*]] = memref.alloc() : memref<64xf32>
-  // CHECK-NEXT: %[[C:.*]] = memref.alloc() : memref<f32>
+  // CHECK-NEXT: %[[C:.*]] = memref.alloc() {alignment = 128 : i64} : memref<f32>
+  // CHECK-NEXT: %[[B:.*]] = memref.alloc() {alignment = 128 : i64} : memref<64xf32>
+  // CHECK-NEXT: %[[A:.*]] = memref.alloc() {alignment = 128 : i64} : memref<64xf32>
   %A = linalg.init_tensor [64] : tensor<64xf32>
   %B = linalg.init_tensor [64] : tensor<64xf32>
   %C = linalg.init_tensor [] : tensor<f32>
@@ -686,6 +689,9 @@
   %c8 = constant 8 : index
   %c16 = constant 16 : index
 
+  // Hoisted alloc.
+  // CHECK: %[[ALLOC:.*]] = memref.alloc() {alignment = 128 : i64} : memref<8x16xf32>
+
   // CHECK: scf.for %[[I:.*]] =
   %0 = scf.for %arg3 = %c0 to %c128 step %c8 iter_args(%arg4 = %C) -> (tensor<128x192xf32>) {
     %1 = tensor.extract_slice %A[%arg3, 0] [8, 256] [1, 1] :
       tensor<128x256xf32> to tensor<8x256xf32>
@@ -697,7 +703,6 @@
         tensor<256x192xf32> to tensor<256x16xf32>
 
       // %4 does not match an insert_slice, it cannot be bufferized inplace and needs to alloc.
-      // CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<8x16xf32>
       // CHECK: %[[T:.*]] = memref.subview %[[C]][%[[I]], %[[J]]] [8, 16] [1, 1]
      // TODO: %4 is never read but just overwritten, this copy can be elided.
      // CHECK: linalg.copy(%[[T]], %[[ALLOC]])