diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h --- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h +++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h @@ -202,6 +202,20 @@ /// For debugging only. Should be used together with `testAnalysisOnly`. bool printConflicts = false; + /// If set to `true`, `getAliasingOpResult` will return the corresponding + /// "out"/"dest" OpOperand for every op that has the notion of an "out"/"dest" + /// operand. I.e., the aliasing OpOperand of the i-th tensor OpResult is + /// usually the i-th "out" tensor OpOperand. This is in line with + /// destination-passing style and the default behavior. Op interface + /// implementations must follow this contract to avoid surprising behavior. + /// + /// If set to `false`, BufferizableOpInterface implementations can try to be + /// smart and choose to alias with "in" operands or other operands. E.g., the + /// result of a `linalg.generic` op could bufferize in-place with an "in" + /// OpOperand if the corresponding "out" operand is not used within the + /// computation. Whether this pays off or not can be very input IR-specific. + bool alwaysAliasingWithDest = true; + /// Buffer alignment for new memory allocations. 
unsigned int bufferAlignment = 128; diff --git a/mlir/include/mlir/Dialect/Linalg/Passes.td b/mlir/include/mlir/Dialect/Linalg/Passes.td --- a/mlir/include/mlir/Dialect/Linalg/Passes.td +++ b/mlir/include/mlir/Dialect/Linalg/Passes.td @@ -49,6 +49,10 @@ Option<"allowUnknownOps", "allow-unknown-ops", "bool", /*default=*/"false", "Allows unknown (not bufferizable) ops in the input IR.">, + Option<"alwaysAliasingWithDest", "always-aliasing-with-dest", "bool", + /*default=*/"true", + "Tensor OpResult cannot bufferize inplace OpOperands other than " + "out or dest OpOperands (if the op has a notion of such operands)">, Option<"useAlloca", "use-alloca", "bool", /*default=*/"false", "Use stack allocations for memrefs (for testing purposes only)">, diff --git a/mlir/lib/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.cpp @@ -164,8 +164,7 @@ bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, const BufferizationState &state) const { - // Operand is written to if it has an aliasing OpResult. For more details, - // see `computeAliasingPairs`. + // Operand is written to if it has an aliasing OpResult. auto bufferizableOp = cast(op); return !bufferizableOp.getAliasingOpResult(opOperand, state).empty(); } @@ -175,6 +174,12 @@ const BufferizationState &state) const { auto genericOp = cast(op); + // By default, the i-th OpResult may alias with the i-th "out" tensor. + if (state.getOptions().alwaysAliasingWithDest) + return {genericOp.getOutputOperand(opResult.getResultNumber())}; + + // We can try to be smart and alias in-place with an "in" tensor if the + // corresponding "out" tensor is not used in the computation. // Aliasing OpOperand/OpResult pairs are computed by `computeAliasingPairs`. 
DenseMap pairs = computeAliasingPairs(genericOp); for (OpOperand *opOperand : genericOp.getInputAndOutputOperands()) @@ -188,6 +193,14 @@ const BufferizationState &state) const { auto genericOp = cast(op); + // By default, the i-th "out" tensor may alias with the i-th OpResult. + if (state.getOptions().alwaysAliasingWithDest) { + if (genericOp.isOutputTensor(&opOperand)) + return {genericOp.getTiedOpResult(&opOperand)}; + return {}; + } + + // We can try to be smart. See comment in `getAliasingOpOperand`. // Aliasing OpOperand/OpResult pairs are computed by `computeAliasingPairs`. DenseMap pairs = computeAliasingPairs(genericOp); if (!pairs.count(&opOperand)) diff --git a/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferizePass.cpp b/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferizePass.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferizePass.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferizePass.cpp @@ -97,6 +97,7 @@ opt.fullyDynamicLayoutMaps = fullyDynamicLayoutMaps; opt.printConflicts = printConflicts; opt.testAnalysisOnly = testAnalysisOnly; + opt.alwaysAliasingWithDest = alwaysAliasingWithDest; if (initTensorElimination) { opt.addPostAnalysisStep(insertSliceAnchoredInitTensorEliminationStep); } diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-aliasing-in.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-aliasing-in.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-aliasing-in.mlir @@ -0,0 +1,75 @@ +// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize="allow-return-memref always-aliasing-with-dest=0" -split-input-file | FileCheck %s + +// CHECK-LABEL: func @linalg_op_bufferizes_inplace_with_input +// CHECK-SAME: %[[t1:.*]]: memref, %[[t2:.*]]: memref, %[[t3:.*]]: memref +func @linalg_op_bufferizes_inplace_with_input( + %t1: tensor {linalg.inplaceable = true}, + %t2: tensor {linalg.inplaceable = false}, + %t3: tensor 
{linalg.inplaceable = false}, + %s1: index, %s2: index, %cst: f32) -> tensor { + // CHECK: linalg.generic {{.*}} ins(%[[t1]], %[[t2]] : {{.*}}) outs(%[[t1]] : {{.*}}) + %r = linalg.generic { + indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, + affine_map<(d0, d1) -> (d1)>, + affine_map<(d0, d1)-> (d0, d1)>], + iterator_types = ["parallel", "parallel"]} + ins(%t1, %t2 : tensor, tensor) + outs(%t3 : tensor) { + ^bb0(%arg0 : f32, %arg1 : f32, %arg2 : f32) : + %add = arith.addf %arg0, %arg1 : f32 + linalg.yield %add : f32 + } -> tensor + return %r : tensor +} + +// ----- + +// CHECK-LABEL: func @linalg_op_bufferizes_out_of_place_with_input +// CHECK-SAME: %[[t1:.*]]: memref, %[[t2:.*]]: memref, %[[t3:.*]]: memref +func @linalg_op_bufferizes_out_of_place_with_input( + %t1: tensor {linalg.inplaceable = false}, + %t2: tensor {linalg.inplaceable = false}, + %t3: tensor {linalg.inplaceable = false}, + %s1: index, %s2: index, %cst: f32) -> tensor { + // CHECK: %[[alloc:.*]] = memref.alloc + // CHECK: memref.copy %[[t1]], %[[alloc]] + // CHECK: linalg.generic {{.*}} ins(%[[t1]], %[[t2]] : {{.*}}) outs(%[[alloc]] : {{.*}}) + %r = linalg.generic { + indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, + affine_map<(d0, d1) -> (d1)>, + affine_map<(d0, d1)-> (d0, d1)>], + iterator_types = ["parallel", "parallel"]} + ins(%t1, %t2 : tensor, tensor) + outs(%t3 : tensor) { + ^bb0(%arg0 : f32, %arg1 : f32, %arg2 : f32) : + %add = arith.addf %arg0, %arg1 : f32 + linalg.yield %add : f32 + } -> tensor + // CHECK: return %[[alloc]] + return %r : tensor +} + +// ----- + +// CHECK-LABEL: func @linalg_op_output_cannot_alias_with_input +// CHECK-SAME: %[[t1:.*]]: memref, %[[t2:.*]]: memref, %[[t3:.*]]: memref +func @linalg_op_output_cannot_alias_with_input( + %t1: tensor {linalg.inplaceable = true}, + %t2: tensor {linalg.inplaceable = false}, + %t3: tensor {linalg.inplaceable = true}, + %s1: index, %s2: index, %cst: f32) -> tensor { + // CHECK: linalg.generic {{.*}} ins(%[[t1]], %[[t2]] : 
{{.*}}) outs(%[[t3]] : {{.*}}) + %r = linalg.generic { + indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, + affine_map<(d0, d1) -> (d1)>, + affine_map<(d0, d1)-> (d0, d1)>], + iterator_types = ["parallel", "parallel"]} + ins(%t1, %t2 : tensor, tensor) + outs(%t3 : tensor) { + ^bb0(%arg0 : f32, %arg1 : f32, %arg2 : f32) : + %add = arith.addf %arg0, %arg1 : f32 + linalg.yield %add : f32 + } -> tensor + return %r : tensor +} + diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis-aliasing-in.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis-aliasing-in.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis-aliasing-in.mlir @@ -0,0 +1,83 @@ +// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize="test-analysis-only allow-return-memref always-aliasing-with-dest=0" -split-input-file | FileCheck %s + +// This is a test case for alwaysAliasingWithDest = 0. In that case, an OpResult +// may bufferize in-place with an "in" OpOperand or any non-"out" OpOperand. + + +#accesses = [ + affine_map<(i) -> (i)>, + affine_map<(i) -> (i)>, + affine_map<(i) -> (i)> +] +#trait = { + indexing_maps = #accesses, + iterator_types = ["parallel"] +} + +// CHECK-LABEL: func @linalg_op_same_out_tensors( +func @linalg_op_same_out_tensors( + %t1: tensor {linalg.inplaceable = true}, +// CHECK-SAME: bufferization.access = "read-write" + %t2: tensor {linalg.inplaceable = true}) +// CHECK-SAME: bufferization.access = "write" + -> (tensor, tensor){ + + // %1 and %2 are not used in the computation, so the two OpResults do not + // necessarily have to bufferize in-place with the two "out" OpOperands. They + // bufferize in-place with the first and second OpOperand (one of which is an + // "in" OpOperand). 
+ // CHECK: linalg.generic + // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "true"] + %o:2 = linalg.generic #trait ins(%t1 : tensor) + outs (%t2, %t2 : tensor, tensor) { + ^bb(%0: f32, %1: f32, %2 : f32) : + linalg.yield %0, %0 : f32, f32 + } -> (tensor, tensor) + + // CHECK: return + // CHECK-SAME: __equivalent_func_args__ = [0, 1] + return %o#0, %o#1 : tensor, tensor +} + +// ----- + +#accesses = [ + affine_map<(i) -> (i)>, + affine_map<(i) -> (i)>, + affine_map<(i) -> (i)>, + affine_map<(i) -> (i)> +] +#trait = { + indexing_maps = #accesses, + iterator_types = ["parallel"] +} + +// CHECK-LABEL: func @linalg_op_same_out_tensors_2( +func @linalg_op_same_out_tensors_2( + %t1: tensor {linalg.inplaceable = true}, +// CHECK-SAME: bufferization.access = "read-write" + %t2: tensor {linalg.inplaceable = true}) +// CHECK-SAME: bufferization.access = "write" + -> (tensor, tensor, tensor){ + + // %1, %2 and %3 are not used in the computation, so the three OpResults do + // not necessarily have to bufferize in-place with the three "out" OpOperands. + // They bufferize in-place with the first, second and third OpOperand (one of + // which is an "in" OpOperand). + // In contrast to the previous test case, two of the chosen OpOperands are the + // same (aliasing) SSA value, which is why one of them must bufferize + // out-of-place. 
+ // CHECK: linalg.generic + // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "true", "false"] + %o:3 = linalg.generic #trait + ins(%t1 : tensor) + outs (%t2, %t2, %t2 : tensor, tensor, tensor) { + ^bb(%0: f32, %1: f32, %2 : f32, %3 : f32) : + linalg.yield %0, %0, %0 : f32, f32, f32 + } -> (tensor, tensor, tensor) + + // CHECK: return + // CHECK-SAME: __equivalent_func_args__ = [0, 1, -1] + return %o#0, %o#1, %o#2 : tensor, tensor, tensor +} + diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir --- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir +++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir @@ -990,13 +990,13 @@ // CHECK-LABEL: func @linalg_op_same_out_tensors( func @linalg_op_same_out_tensors( %t1: tensor {linalg.inplaceable = true}, -// CHECK-SAME: bufferization.access = "read-write" +// CHECK-SAME: bufferization.access = "read" %t2: tensor {linalg.inplaceable = true}) // CHECK-SAME: bufferization.access = "write" -> (tensor, tensor){ // CHECK: linalg.generic - // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "true"] + // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "false"] %o:2 = linalg.generic #trait ins(%t1 : tensor) outs (%t2, %t2 : tensor, tensor) { ^bb(%0: f32, %1: f32, %2 : f32) : @@ -1004,7 +1004,7 @@ } -> (tensor, tensor) // CHECK: return - // CHECK-SAME: __equivalent_func_args__ = [0, 1] + // CHECK-SAME: __equivalent_func_args__ = [1, -1] return %o#0, %o#1 : tensor, tensor } @@ -1024,13 +1024,13 @@ // CHECK-LABEL: func @linalg_op_same_out_tensors_2( func @linalg_op_same_out_tensors_2( %t1: tensor {linalg.inplaceable = true}, -// CHECK-SAME: bufferization.access = "read-write" +// CHECK-SAME: bufferization.access = "read" %t2: tensor {linalg.inplaceable = true}) // CHECK-SAME: bufferization.access = "write" -> (tensor, tensor, tensor){ // CHECK: linalg.generic - 
// CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "true", "false"] + // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "false", "false"] %o:3 = linalg.generic #trait ins(%t1 : tensor) outs (%t2, %t2, %t2 : tensor, tensor, tensor) { @@ -1039,7 +1039,7 @@ } -> (tensor, tensor, tensor) // CHECK: return - // CHECK-SAME: __equivalent_func_args__ = [0, 1, -1] + // CHECK-SAME: __equivalent_func_args__ = [1, -1, -1] return %o#0, %o#1, %o#2 : tensor, tensor, tensor } diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir --- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir +++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir @@ -1176,63 +1176,12 @@ // CHECK-SAME: %[[t1:.*]]: memref, %[[t2:.*]]: memref, %[[t3:.*]]: memref func @linalg_op_bufferizes_inplace_with_input( %t1: tensor {linalg.inplaceable = true}, - %t2: tensor {linalg.inplaceable = false}, - %t3: tensor {linalg.inplaceable = false}, - %s1: index, %s2: index, %cst: f32) -> tensor { - // CHECK: linalg.generic {{.*}} ins(%[[t1]], %[[t2]] : {{.*}}) outs(%[[t1]] : {{.*}}) - %r = linalg.generic { - indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, - affine_map<(d0, d1) -> (d1)>, - affine_map<(d0, d1)-> (d0, d1)>], - iterator_types = ["parallel", "parallel"]} - ins(%t1, %t2 : tensor, tensor) - outs(%t3 : tensor) { - ^bb0(%arg0 : f32, %arg1 : f32, %arg2 : f32) : - %add = arith.addf %arg0, %arg1 : f32 - linalg.yield %add : f32 - } -> tensor - return %r : tensor -} - -// ----- - -// CHECK-LABEL: func @linalg_op_bufferizes_out_of_place_with_input -// CHECK-SAME: %[[t1:.*]]: memref, %[[t2:.*]]: memref, %[[t3:.*]]: memref -func @linalg_op_bufferizes_out_of_place_with_input( - %t1: tensor {linalg.inplaceable = false}, - %t2: tensor {linalg.inplaceable = false}, - %t3: tensor {linalg.inplaceable = false}, - %s1: index, %s2: index, %cst: f32) -> tensor { - // CHECK: %[[alloc:.*]] = memref.alloc - 
// CHECK: memref.copy %[[t1]], %[[alloc]] - // CHECK: linalg.generic {{.*}} ins(%[[t1]], %[[t2]] : {{.*}}) outs(%[[alloc]] : {{.*}}) - %r = linalg.generic { - indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, - affine_map<(d0, d1) -> (d1)>, - affine_map<(d0, d1)-> (d0, d1)>], - iterator_types = ["parallel", "parallel"]} - ins(%t1, %t2 : tensor, tensor) - outs(%t3 : tensor) { - ^bb0(%arg0 : f32, %arg1 : f32, %arg2 : f32) : - %add = arith.addf %arg0, %arg1 : f32 - linalg.yield %add : f32 - } -> tensor - // CHECK: return %[[alloc]] - return %r : tensor -} - -// ----- - -// CHECK-LABEL: func @linalg_op_output_cannot_alias_with_input -// CHECK-SAME: %[[t1:.*]]: memref, %[[t2:.*]]: memref, %[[t3:.*]]: memref -func @linalg_op_output_cannot_alias_with_input( - %t1: tensor {linalg.inplaceable = true}, - %t2: tensor {linalg.inplaceable = false}, + %t2: tensor {linalg.inplaceable = true}, %t3: tensor {linalg.inplaceable = true}, %s1: index, %s2: index, %cst: f32) -> tensor { // CHECK: linalg.generic {{.*}} ins(%[[t1]], %[[t2]] : {{.*}}) outs(%[[t3]] : {{.*}}) %r = linalg.generic { - indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, + indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1)-> (d0, d1)>], iterator_types = ["parallel", "parallel"]}