diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
--- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
@@ -36,9 +36,8 @@
      DeclareOpInterfaceMethods<TransformOpInterface>]> {
   let description = [{
     This transform materializes an allocation for the targeted tensor value. It
-    replaces all original uses of the target with the newly allocated buffer,
-    wrapped in a `bufferization.to_tensor` op. It returns a handle to the result
-    of the `to_tensor` op.
+    replaces all original uses of the target with the result of the newly
+    created `bufferization.alloc_tensor` op and returns a handle to its result.
 
     Example:
     ```
@@ -49,19 +48,13 @@
     Is rewritten to:
     ```
     %0 = "some_op"() : () -> (tensor<10xf32>)
-    %1 = memref.alloc() : memref<10xf32>
-    memref.tensor_store %0, %1 : memref<10xf32>
-    %2 = bufferization.to_tensor %1 restrict writable : memref<10xf32>
-    "some_use"(%2) : (tensor<10xf32>) -> ()
+    %1 = bufferization.alloc_tensor() copy(%0) : tensor<10xf32>
+    "some_use"(%1) : (tensor<10xf32>) -> ()
    ```
 
-    This transform has optimized lowerings for certain targets that are results
-    of non-DPS ops. For such targets, not only a buffer allocation is emitted
-    but also the defining op is bufferized. This is to avoid a second
-    allocation for the missing destination of the non-DPS op (when subsequently
-    running a bufferization pass/transform). Currently supported ops with
-    optimized lowerings:
-    - tensor.pad
+    If the target is an OpResult, the entire defining op is moved into the
+    region of the `bufferization.alloc_tensor` op. Only ops with a single
+    result are supported.
 
     An optional memory space attribute can be specified for the materialized
     buffer allocation.
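To make the OpResult path concrete: for a `tensor.pad` target, the rewrite takes the shape below. This is a hand-written sketch distilled from the updated test expectations at the end of this patch, not patch output; `%size0`/`%size1` stand for the reified result sizes, and `%t`, `%l2`, `%h1`, `%h2`, `%c50` are assumed to be defined elsewhere.

```
// Before: the target value %0 is the result of a tensor.pad.
%0 = tensor.pad %t low[5, %l2] high[%h1, %h2] {
^bb0(%i: index, %j: index):
  tensor.yield %c50 : index
} : tensor<?x10xindex> to tensor<?x?xindex>

// After: the pad is moved into the region of the new alloc_tensor and
// its result is yielded; all other uses of %0 are redirected to %1.
%1 = bufferization.alloc_tensor(%size0, %size1) init {
  %padded = tensor.pad %t low[5, %l2] high[%h1, %h2] {
  ^bb0(%i: index, %j: index):
    tensor.yield %c50 : index
  } : tensor<?x10xindex> to tensor<?x?xindex>
  bufferization.yield %padded
} : tensor<?x?xindex>
```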
diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -44,34 +44,15 @@
 //===----------------------------------------------------------------------===//
 
 using LinalgLoops = SmallVector<Operation *>;
 
-/// Materialize a buffer allocation for the given tensor.pad op and lower the
-/// op to linalg.fill/linalg.generic + memref.tensor_store. E.g.:
-///
-/// %0 = tensor.pad low[%l] high[%h] %t ...
-///
-/// is lowered to:
-///
-/// %alloc = memref.alloc
-/// linalg.fill ... outs(%alloc)
-/// %subview = memref.subview %alloc [%l] [...] [1]
-/// memref.tensor_store %t, %subview
-/// %0 = bufferization.to_tensor %alloc restrict writable
-///
-/// In addition to rewriting the IR as shown above, the result of the
-/// bufferization.to_tensor op is returned.
-Value bufferizeToAllocation(RewriterBase &rewriter, tensor::PadOp padOp,
-                            Attribute memorySpace = {});
-
 /// Materialize a buffer allocation for the given tensor value. E.g.:
 ///
-/// %alloc = memref.alloc
-/// memref.tensor_store %value, %alloc
-/// %0 = bufferization.to_tensor %alloc restrict writable
+/// %0 = bufferization.alloc_tensor() copy(%t) : tensor<5xf32>
 ///
-/// In case `value` is a tensor.pad result, the corresponding overload is used
-/// internally to produce a better bufferization.
-Value bufferizeToAllocation(RewriterBase &rewriter, Value value,
-                            Attribute memorySpace = {});
+/// If `value` is an OpResult, the defining op is moved into the region of
+/// the generated bufferization.alloc_tensor. If the tensor type has dynamic
+/// dimensions, the op must implement the ReifyRankedShapedTypeOpInterface.
+FailureOr<Value> bufferizeToAllocation(RewriterBase &rewriter, Value value,
+                                       Attribute memorySpace = {});
 
 void populatePadTensorTilingPatterns(RewritePatternSet &patterns,
                                      const LinalgTilingOptions &options);
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -219,10 +219,15 @@
   Attribute memorySpace =
       getMemorySpace().has_value() ? getMemorySpace().value() : Attribute();
   IRRewriter rewriter(getContext());
-  auto transformed = llvm::to_vector(
-      llvm::map_range(state.getPayloadValues(getTarget()), [&](Value v) {
-        return linalg::bufferizeToAllocation(rewriter, v, memorySpace);
-      }));
+  SmallVector<Value> transformed;
+  for (Value v : state.getPayloadValues(getTarget())) {
+    FailureOr<Value> alloc =
+        linalg::bufferizeToAllocation(rewriter, v, memorySpace);
+    if (failed(alloc))
+      return mlir::emitSilenceableFailure(v.getLoc())
+             << "unable to create allocation for value";
+    transformed.push_back(*alloc);
+  }
   results.setValues(getTransformed().cast<OpResult>(), transformed);
   return DiagnosedSilenceableFailure::success();
 }
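For reference, the op is driven from a transform script like the one below, assembled from the tests in this patch; the silenceable failure added above is what fires when `bufferizeToAllocation` returns failure (e.g. for a defining op with more than one result).

```
transform.sequence failures(propagate) {
^bb1(%arg1: !pdl.operation):
  // Match the payload op whose result should be materialized into a
  // new allocation.
  %0 = transform.structured.match ops{["tensor.pad"]} in %arg1
      : (!pdl.operation) -> !pdl.operation
  // Target the op's first result value.
  %1 = transform.get_result %0[0] : (!pdl.operation) -> !transform.any_value
  // Materialize the allocation, optionally in a specific memory space.
  %2 = transform.structured.bufferize_to_allocation %1 {memory_space = 4}
}
```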
diff --git a/mlir/lib/Dialect/Linalg/Transforms/ConvertToDestinationStyle.cpp b/mlir/lib/Dialect/Linalg/Transforms/ConvertToDestinationStyle.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/ConvertToDestinationStyle.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/ConvertToDestinationStyle.cpp
@@ -117,90 +117,6 @@
   return genericOp;
 }
 
-static SmallVector<Value> reifyOrComputeDynamicSizes(OpBuilder &b,
-                                                     Value value) {
-  auto tensorType = value.getType().cast<RankedTensorType>();
-  if (tensorType.hasStaticShape())
-    return {};
-
-  // Try to reify dynamic sizes.
-  if (auto reifiableOp =
-          value.getDefiningOp<ReifyRankedShapedTypeOpInterface>()) {
-    ReifiedRankedShapedTypeDims reifiedShape;
-    if (succeeded(reifiableOp.reifyResultShapes(b, reifiedShape))) {
-      SmallVector<Value> dynSizes;
-      for (int64_t i = 0; i < tensorType.getRank(); ++i) {
-        if (tensorType.isDynamicDim(i))
-          dynSizes.push_back(
-              reifiedShape[value.cast<OpResult>().getResultNumber()][i]);
-      }
-      return dynSizes;
-    }
-  }
-
-  // Create tensor.dim ops.
-  SmallVector<Value> dynSizes;
-  for (int64_t i = 0; i < tensorType.getRank(); ++i) {
-    if (tensorType.isDynamicDim(i))
-      dynSizes.push_back(
-          b.create<tensor::DimOp>(value.getLoc(), value,
-                                  b.create<arith::ConstantIndexOp>(value.getLoc(), i)));
-  }
-  return dynSizes;
-}
-
-static Value createAllocationForTensor(RewriterBase &rewriter, Location loc,
-                                       Value value,
-                                       Attribute memorySpace = {}) {
-  OpBuilder::InsertionGuard g(rewriter);
-  auto tensorType = value.getType().cast<RankedTensorType>();
-
-  // Create buffer allocation.
-  auto memrefType = bufferization::getMemRefTypeWithStaticIdentityLayout(
-                        tensorType, memorySpace)
-                        .cast<MemRefType>();
-  SmallVector<Value> dynamicSizes = reifyOrComputeDynamicSizes(rewriter, value);
-  Value alloc = rewriter.create<memref::AllocOp>(loc, memrefType, dynamicSizes);
-
-  // Place deallocation at the end of the block.
-  rewriter.setInsertionPoint(rewriter.getInsertionBlock()->getTerminator());
-  rewriter.create<memref::DeallocOp>(loc, alloc);
-
-  return alloc;
-}
-
-Value linalg::bufferizeToAllocation(RewriterBase &rewriter, PadOp padOp,
-                                    Attribute memorySpace) {
-  OpBuilder::InsertionGuard g(rewriter);
-  rewriter.setInsertionPoint(padOp);
-  Location loc = padOp.getLoc();
-
-  // Create buffer allocation.
-  Value alloc =
-      createAllocationForTensor(rewriter, loc, padOp.getResult(), memorySpace);
-  rewriter.setInsertionPointAfter(alloc.getDefiningOp());
-
-  // Create linalg.fill or linalg.generic.
-  Operation *fillOp = movePaddingToFillOrGenericOp(rewriter, loc, padOp, alloc);
-  rewriter.setInsertionPointAfter(fillOp);
-
-  // Create memref.tensor_store.
-  SmallVector<OpFoldResult> sizes =
-      getMixedSizes(rewriter, loc, padOp.getSource());
-  SmallVector<OpFoldResult> strides(padOp.getResultType().getRank(),
-                                    rewriter.getIndexAttr(1));
-  Value subview = rewriter.create<memref::SubViewOp>(
-      loc, alloc, /*offsets=*/padOp.getMixedLowPad(), sizes, strides);
-  rewriter.create<memref::TensorStoreOp>(loc, padOp.getSource(), subview);
-
-  // Create bufferization.to_tensor with "restrict" and "writable". The returned
-  // tensor is a new buffer allocation, so it does not alias with any buffer.
-  Value toTensorOp = rewriter.create<bufferization::ToTensorOp>(
-      loc, alloc, /*restrict=*/true, /*writable=*/true);
-  rewriter.replaceOp(padOp, toTensorOp);
-  return toTensorOp;
-}
-
 /// Lower tensor.from_elements to a sequence of chained tensor.insert.
 FailureOr<Operation *> mlir::linalg::rewriteInDestinationPassingStyle(
     RewriterBase &rewriter, tensor::FromElementsOp fromElementsOp) {
@@ -329,41 +245,69 @@
   return insertSliceOp.getOperation();
 }
 
-Value linalg::bufferizeToAllocation(RewriterBase &rewriter, Value value,
-                                    Attribute memorySpace) {
-  // Call specialized overload for certain ops.
-  if (auto padOp = value.getDefiningOp<PadOp>())
-    return bufferizeToAllocation(rewriter, padOp, memorySpace);
+FailureOr<Value> linalg::bufferizeToAllocation(RewriterBase &rewriter,
+                                               Value value,
+                                               Attribute memorySpace) {
+  OpBuilder::InsertionGuard g(rewriter);
+  auto tensorType = value.getType().cast<RankedTensorType>();
+  SmallVector<Value> dynSizes;
 
-  // Collect all uses.
-  SmallVector<OpOperand *> uses = llvm::to_vector(
-      llvm::map_range(value.getUses(), [](OpOperand &use) { return &use; }));
+  // Collect uses of the value.
+  SmallVector<OpOperand *> uses;
+  for (OpOperand &use : value.getUses())
+    uses.push_back(&use);
+  auto replaceAllUsesWith = [&](Value replacement) {
+    for (OpOperand *use : uses) {
+      rewriter.updateRootInPlace(use->getOwner(),
+                                 [&]() { use->set(replacement); });
+    }
+  };
 
-  OpBuilder::InsertionGuard g(rewriter);
   if (auto bbArg = value.dyn_cast<BlockArgument>()) {
+    // BlockArgument: Create a new allocation that copies the value.
     rewriter.setInsertionPointToStart(bbArg.getOwner());
-  } else {
-    rewriter.setInsertionPointAfter(value.getDefiningOp());
+
+    // Create tensor.dim ops for all dynamic dimensions.
+    for (int64_t i = 0; i < tensorType.getRank(); ++i) {
+      if (tensorType.isDynamicDim(i))
+        dynSizes.push_back(rewriter.create<tensor::DimOp>(
+            value.getLoc(), value,
+            rewriter.create<arith::ConstantIndexOp>(value.getLoc(), i)));
+    }
+
+    auto allocTensorOp = rewriter.create<bufferization::AllocTensorOp>(
+        value.getLoc(), tensorType, dynSizes, /*sizeHint=*/Value(), memorySpace,
+        /*copy=*/bbArg);
+    replaceAllUsesWith(allocTensorOp.getResult());
+    return allocTensorOp.getResult();
   }
-  Location loc = value.getLoc();
-
-  // Create buffer allocation.
-  Value alloc = createAllocationForTensor(rewriter, loc, value, memorySpace);
-
-  // Create memref.tensor_store.
-  rewriter.setInsertionPointAfter(alloc.getDefiningOp());
-  rewriter.create<memref::TensorStoreOp>(loc, value, alloc);
-
-  // Create bufferization.to_tensor with "restrict" and "writable". The returned
-  // tensor is a new buffer allocation, so it does not alias with any buffer.
-  Value toTensorOp = rewriter.create<bufferization::ToTensorOp>(
-      loc, alloc, /*restrict=*/true, /*writable=*/true);
-  for (OpOperand *use : uses) {
-    rewriter.updateRootInPlace(use->getOwner(),
-                               [&]() { use->set(toTensorOp); });
+
+  // OpResult: Move the defining op into the region of the alloc_tensor.
+  Operation *op = value.getDefiningOp();
+  // TODO: Ops with more than one result are not supported.
+  if (op->getNumResults() != 1)
+    return failure();
+  rewriter.setInsertionPointAfter(op);
+  if (!tensorType.hasStaticShape()) {
+    // Try to reify dynamic sizes.
+    if (auto reifiableOp = cast<ReifyRankedShapedTypeOpInterface>(op)) {
+      ReifiedRankedShapedTypeDims reifiedShape;
+      if (failed(reifiableOp.reifyResultShapes(rewriter, reifiedShape)))
+        return failure();
+      for (int64_t i = 0; i < tensorType.getRank(); ++i) {
+        if (tensorType.isDynamicDim(i))
+          dynSizes.push_back(
+              reifiedShape[value.cast<OpResult>().getResultNumber()][i]);
+      }
+    }
   }
-  return toTensorOp;
+  auto allocTensorOp = rewriter.create<bufferization::AllocTensorOp>(
+      value.getLoc(), tensorType, dynSizes, memorySpace);
+  op->moveBefore(allocTensorOp.getTerminator());
+  replaceAllUsesWith(allocTensorOp.getResult());
+  allocTensorOp.getTerminator().getTensorMutable().assign(value);
+  return allocTensorOp.getResult();
 }
 
 namespace {
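The BlockArgument branch above produces a copying `alloc_tensor` rather than an `init` region, with dynamic sizes obtained from `tensor.dim`. Sketched in IR, mirroring the `materialization_of_bbarg` test below (the `tensor<?x10xindex>` type and `%idx` follow that test):

```
// %t is a block argument; all of its other uses are redirected to
// %alloc, which copies %t into a fresh allocation in memory space 4.
%c0 = arith.constant 0 : index
%dim = tensor.dim %t, %c0 : tensor<?x10xindex>
%alloc = bufferization.alloc_tensor(%dim) copy(%t)
    {memory_space = 4 : i64} : tensor<?x10xindex>
%r = tensor.extract %alloc[%idx, %idx] : tensor<?x10xindex>
```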
diff --git a/mlir/test/Dialect/Linalg/transform-op-bufferize-to-allocation.mlir b/mlir/test/Dialect/Linalg/transform-op-bufferize-to-allocation.mlir
--- a/mlir/test/Dialect/Linalg/transform-op-bufferize-to-allocation.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-bufferize-to-allocation.mlir
@@ -1,6 +1,6 @@
 // RUN: mlir-opt -split-input-file \
 // RUN:     -test-transform-dialect-interpreter -canonicalize \
-// RUN:     -allow-unregistered-dialect -split-input-file %s | FileCheck %s
+// RUN:     -split-input-file %s | FileCheck %s
 
 // CHECK: #[[$map:.+]] = affine_map<()[s0, s1] -> (s0 + s1 + 5)>
 // CHECK: #[[$map1:.+]] = affine_map<()[s0, s1] -> (s0 + s1 + 10)>
@@ -11,40 +11,11 @@
 // CHECK-DAG: %[[dim0:.*]] = tensor.dim %[[t]], %[[c0]]
 // CHECK-DAG: %[[size0:.*]] = affine.apply #[[$map]]()[%[[h1]], %[[dim0]]]
 // CHECK-DAG: %[[size1:.*]] = affine.apply #[[$map1]]()[%[[l2]], %[[h2]]]
-// CHECK: %[[alloc:.*]] = memref.alloc(%[[size0]], %[[size1]]) : memref<?x?xindex>
-// CHECK: linalg.fill ins(%[[c50]] : index) outs(%[[alloc]] : memref<?x?xindex>)
-// CHECK: %[[dim0:.*]] = tensor.dim %[[t]], %[[c0]]
-// CHECK: %[[subview:.*]] = memref.subview %[[alloc]][5, %[[l2]]] [%[[dim0]], 10] [1, 1]
-// CHECK: memref.tensor_store %[[t]], %[[subview]]
-// CHECK: %[[r:.*]] = bufferization.to_tensor %[[alloc]] restrict writable : memref<?x?xindex>
-// CHECK: memref.dealloc %[[alloc]]
-// CHECK: return %[[r]]
-func.func @tensor_pad_constant(%t: tensor<?x10xindex>, %l2: index, %h1: index,
-                               %h2: index) -> tensor<?x?xindex> {
-  %0 = tensor.pad %t low[5, %l2] high[%h1, %h2] {
-  ^bb0(%arg0: index, %arg1: index):
-    %c = arith.constant 50 : index
-    tensor.yield %c : index
-  } : tensor<?x10xindex> to tensor<?x?xindex>
-  return %0 : tensor<?x?xindex>
-}
-
-transform.sequence failures(propagate) {
-^bb1(%arg1: !pdl.operation):
-  %0 = transform.structured.match ops{["tensor.pad"]} in %arg1 : (!pdl.operation) -> !pdl.operation
-  %1 = transform.get_result %0[0] : (!pdl.operation) -> !transform.any_value
-  %2 = transform.structured.bufferize_to_allocation %1
-}
-
-// -----
-
-// CHECK-LABEL: func @tensor_pad_constant(
-// CHECK-SAME:     %[[t:.*]]: tensor<?x10xindex>
-// CHECK:   %[[src:.*]] = bufferization.to_memref %[[t]]
-// CHECK:   %[[alloc:.*]] = memref.alloc
-// CHECK:   %[[subview:.*]] = memref.subview %[[alloc]]
-// CHECK:   memref.copy %[[src]], %[[subview]]
-// CHECK:   bufferization.to_tensor %[[alloc]] restrict writable
+// CHECK: %[[alloc:.*]] = bufferization.alloc_tensor(%[[size0]], %[[size1]]) init {
+// CHECK:   %[[padded:.*]] = tensor.pad %[[t]]
+// CHECK:   bufferization.yield %[[padded]]
+// CHECK: } : tensor<?x?xindex>
+// CHECK: return %[[alloc]]
 func.func @tensor_pad_constant(%t: tensor<?x10xindex>, %l2: index, %h1: index,
                                %h2: index) -> tensor<?x?xindex> {
   %0 = tensor.pad %t low[5, %l2] high[%h1, %h2] {
@@ -60,8 +31,6 @@
   %0 = transform.structured.match ops{["tensor.pad"]} in %arg1 : (!pdl.operation) -> !pdl.operation
   %1 = transform.get_result %0[0] : (!pdl.operation) -> !transform.any_value
   %2 = transform.structured.bufferize_to_allocation %1
-  // Make sure that One-Shot Bufferize can bufferize the rest.
-  transform.bufferization.one_shot_bufferize %arg1
 }
 
 // -----
@@ -70,32 +39,8 @@
 // CHECK-LABEL: func @materialization_of_bbarg(
 // CHECK-SAME:     %[[t:.*]]: tensor<?x10xindex>
 // CHECK: %[[c0:.*]] = arith.constant 0 : index
 // CHECK: %[[dim:.*]] = tensor.dim %[[t]], %[[c0]]
-// CHECK: %[[alloc:.*]] = memref.alloc(%[[dim]]) : memref<?x10xindex>
-// CHECK: memref.tensor_store %[[t]], %[[alloc]]
-// CHECK: %[[alloc_t:.*]] = bufferization.to_tensor %[[alloc]] restrict writable
-// CHECK: %[[r:.*]] = tensor.extract %[[alloc_t]]
-// CHECK: memref.dealloc %[[alloc]]
-// CHECK: return %[[r]]
-func.func @materialization_of_bbarg(%t: tensor<?x10xindex>, %idx: index) -> index {
-  %r = tensor.extract %t[%idx, %idx] : tensor<?x10xindex>
-  return %r : index
-}
-
-transform.sequence failures(propagate) {
-^bb1(%arg1: !pdl.operation):
-  %0 = transform.structured.match ops{["tensor.extract"]} in %arg1 : (!pdl.operation) -> !pdl.operation
-  %1 = test_produce_value_handle_to_argument_of_parent_block %0, 0 : (!pdl.operation) -> !transform.any_value
-  %2 = transform.structured.bufferize_to_allocation %1 {memory_space = 4}
-}
-
-// -----
-
-// CHECK-LABEL: func @materialization_of_bbarg(
-// CHECK-SAME:     %[[t:.*]]: tensor<?x10xindex>
-// CHECK:   %[[m:.*]] = bufferization.to_memref %[[t]]
-// CHECK:   %[[alloc:.*]] = memref.alloc(%{{.*}}) : memref<?x10xindex, 4>
-// CHECK:   memref.copy %[[m]], %[[alloc]]
-// CHECK:   %[[r:.*]] = memref.load %[[alloc]]
+// CHECK: %[[alloc:.*]] = bufferization.alloc_tensor(%[[dim]]) copy(%[[t]]) {memory_space = 4 : i64} : tensor<?x10xindex>
+// CHECK: %[[r:.*]] = tensor.extract %[[alloc]]
 // CHECK: return %[[r]]
 func.func @materialization_of_bbarg(%t: tensor<?x10xindex>, %idx: index) -> index {
   %r = tensor.extract %t[%idx, %idx] : tensor<?x10xindex>
@@ -107,28 +52,4 @@
   %0 = transform.structured.match ops{["tensor.extract"]} in %arg1 : (!pdl.operation) -> !pdl.operation
   %1 = test_produce_value_handle_to_argument_of_parent_block %0, 0 : (!pdl.operation) -> !transform.any_value
   %2 = transform.structured.bufferize_to_allocation %1 {memory_space = 4}
-  // Make sure that One-Shot Bufferize can bufferize the rest.
-  transform.bufferization.one_shot_bufferize %arg1
 }
-
-// -----
-
-// CHECK-LABEL: func @materialization_of_opresult(
-// CHECK:   %[[t:.*]] = "dummy.some_op"
-// CHECK:   %[[alloc:.*]] = memref.alloc(%{{.*}}) : memref<?xindex, 4>
-// CHECK:   memref.tensor_store %[[t]], %[[alloc]]
-// CHECK:   %[[r:.*]] = bufferization.to_tensor %[[alloc]]
-// CHECK:   return %[[r]]
-func.func @materialization_of_opresult(%idx: index) -> tensor<?xindex> {
-  %t = "dummy.some_op"() : () -> (tensor<?xindex>)
-  return %t : tensor<?xindex>
-}
-
-transform.sequence failures(propagate) {
-^bb1(%arg1: !pdl.operation):
-  %0 = transform.structured.match ops{["dummy.some_op"]} in %arg1 : (!pdl.operation) -> !pdl.operation
-  %1 = transform.get_result %0[0] : (!pdl.operation) -> !transform.any_value
-  %2 = transform.structured.bufferize_to_allocation %1 {memory_space = 4}
-}
-
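As a closing note on the dynamic-size path: in the surviving `tensor_pad_constant` test, the sizes feeding the allocation come from reifying the pad's result shape (per dimension: source extent plus low and high padding), which matches the affine maps checked at the top of the test file. A sketch of the reified computation, with `%t`, `%l2`, `%h1`, `%h2` as in that test:

```
%c0 = arith.constant 0 : index
%dim0 = tensor.dim %t, %c0 : tensor<?x10xindex>
// dim 0: dynamic extent %dim0, low pad 5, high pad %h1.
%size0 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 5)>()[%h1, %dim0]
// dim 1: static extent 10, low pad %l2, high pad %h2.
%size1 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 10)>()[%l2, %h2]
// %size0 and %size1 then feed bufferization.alloc_tensor as shown above.
```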