diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
--- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
@@ -347,6 +347,10 @@
            "Bufferize function boundaries (experimental).">,
     Option<"createDeallocs", "create-deallocs", "bool", /*default=*/"true",
            "Specify if new allocations should be deallocated.">,
+    Option<"mustInferMemorySpace", "must-infer-memory-space", "bool",
+           /*default=*/"false",
+           "The memory space of an memref types must always be inferred. If "
+           "unset, a default memory space of 0 is used otherwise.">,
   ];
   let constructor = "mlir::bufferization::createTensorCopyInsertionPass()";
 }
diff --git a/mlir/lib/Dialect/Arithmetic/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Arithmetic/Transforms/BufferizableOpInterfaceImpl.cpp
--- a/mlir/lib/Dialect/Arithmetic/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Arithmetic/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -150,6 +150,10 @@
       return failure();
     Value trueBuffer = *maybeTrueBuffer;
     Value falseBuffer = *maybeFalseBuffer;
+    BaseMemRefType trueType = trueBuffer.getType().cast<BaseMemRefType>();
+    BaseMemRefType falseType = falseBuffer.getType().cast<BaseMemRefType>();
+    if (trueType.getMemorySpaceAsInt() != falseType.getMemorySpaceAsInt())
+      return op->emitError("inconsistent memory space on true/false operands");
 
     // The "true" and the "false" operands must have the same type. If the
     // buffers have different types, they differ only in their layout map. Cast
diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
--- a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
+++ b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
@@ -43,6 +43,13 @@
 constexpr const ::llvm::StringLiteral
     bufferization::BufferizableOpInterface::kInplaceableAttrName;
 
+/// Return the owner of the given value.
+static Operation *getOwnerOfValue(Value value) {
+  if (auto opResult = value.dyn_cast<OpResult>())
+    return opResult.getDefiningOp();
+  return value.cast<BlockArgument>().getOwner()->getParentOp();
+}
+
 /// Create an AllocTensorOp for the given shaped value. If `copy` is set, the
 /// shaped value is copied. Otherwise, a tensor with undefined contents is
 /// allocated.
@@ -84,10 +91,21 @@
       populateDynamicDimSizes(b, loc, tensor, dynamicSizes);
   }
 
+  // Create AllocTensorOp.
   auto allocTensorOp = b.create<AllocTensorOp>(loc, tensorType, dynamicSizes,
                                                copy ? tensor : Value());
   allocTensorOp->setAttr(BufferizationDialect::kEscapeAttrName,
                          b.getBoolArrayAttr({escape}));
+
+  // Add 'memory_space' attribute. Not needed if 'copy' operand is specified.
+  if (copy)
+    return allocTensorOp.getResult();
+  FailureOr<BaseMemRefType> copyBufferType = getBufferType(tensor, options);
+  if (failed(copyBufferType))
+    return failure();
+  allocTensorOp->setAttr(
+      BufferizationDialect::kMemorySpaceAttrName,
+      b.getI64ArrayAttr({copyBufferType->getMemorySpaceAsInt()}));
   return allocTensorOp.getResult();
 }
 
@@ -512,16 +530,43 @@
 bufferization::getBufferType(Value value, const BufferizationOptions &options) {
   auto tensorType = value.getType().dyn_cast<TensorType>();
   assert(tensorType && "unexpected non-tensor type");
+  Operation *op = getOwnerOfValue(value);
 
+  // ToTensorOp: Take buffer type directly from the op.
   if (auto toTensorOp = value.getDefiningOp<bufferization::ToTensorOp>())
     return toTensorOp.getMemref().getType().cast<BaseMemRefType>();
 
+  // If value is a bbArg of a bufferizable op: query op interface.
   if (auto bbArg = value.dyn_cast<BlockArgument>())
     if (auto bufferizableOp =
             options.dynCastBufferizableOp(bbArg.getOwner()->getParentOp()))
       return bufferizableOp.getBufferType(bbArg, options);
 
-  return getMemRefType(tensorType, options);
+  // Check value is a new buffer allocation with a memory space attribute. In
+  // that case we can at least infer the memory space.
+  Optional<unsigned> memorySpace = None;
+  if (auto opResult = value.dyn_cast<OpResult>())
+    if (auto bufferizableOp =
+            options.dynCastBufferizableOp(opResult.getDefiningOp()))
+      if (bufferizableOp.bufferizesToAllocation(opResult))
+        if (op->hasAttr(BufferizationDialect::kMemorySpaceAttrName))
+          memorySpace = op->getAttrOfType<ArrayAttr>(
+                              BufferizationDialect::kMemorySpaceAttrName)
+                            [opResult.getResultNumber()]
+                                .cast<IntegerAttr>()
+                                .getValue()
+                                .getZExtValue();
+
+  // If we still do not know the memory space, use the default memory space (if
+  // any).
+  if (!memorySpace.hasValue())
+    memorySpace = options.defaultMemorySpace;
+
+  // If we still do not know the memory space, report a failure.
+  if (!memorySpace.hasValue())
+    return op->emitError("could not infer memory space");
+
+  return getMemRefType(tensorType, options, /*layout=*/{}, *memorySpace);
 }
 
 void bufferization::replaceOpWithBufferizedValues(RewriterBase &rewriter,
diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp
--- a/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp
+++ b/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp
@@ -178,6 +178,9 @@
                       .cast<IntegerAttr>()
                       .getValue()
                       .getZExtValue();
+  } else if (getCopy()) {
+    memorySpace =
+        copyBuffer.getType().cast<BaseMemRefType>().getMemorySpaceAsInt();
   } else if (options.defaultMemorySpace.hasValue()) {
     memorySpace = *options.defaultMemorySpace;
   } else {
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/TensorCopyInsertion.cpp b/mlir/lib/Dialect/Bufferization/Transforms/TensorCopyInsertion.cpp
--- a/mlir/lib/Dialect/Bufferization/Transforms/TensorCopyInsertion.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/TensorCopyInsertion.cpp
@@ -105,6 +105,8 @@
       options.allowReturnAllocs = allowReturnAllocs;
       options.bufferizeFunctionBoundaries = bufferizeFunctionBoundaries;
       options.createDeallocs = createDeallocs;
+      if (mustInferMemorySpace)
+        options.defaultMemorySpace = None;
       if (failed(insertTensorCopies(getOperation(), options)))
         signalPassFailure();
     }
diff --git a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
--- a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -706,6 +706,8 @@
     return success();
   }
 
+  // TODO: Implement getBufferType interface method and infer buffer types.
+
   LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
                           const BufferizationOptions &options) const {
     auto whileOp = cast<scf::WhileOp>(op);
diff --git a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
--- a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -792,7 +792,9 @@
     if (failed(srcBuffer) || failed(shapeBuffer))
       return failure();
     auto resultTensorType = reshapeOp.getResult().getType().cast<TensorType>();
-    auto resultMemRefType = getMemRefType(resultTensorType, options);
+    auto resultMemRefType = getMemRefType(
+        resultTensorType, options, /*layout=*/{},
+        srcBuffer->getType().cast<BaseMemRefType>().getMemorySpaceAsInt());
     replaceOpWithNewBufferizedOp<memref::ReshapeOp>(
         rewriter, op, resultMemRefType, *srcBuffer, *shapeBuffer);
     return success();
diff --git a/mlir/test/Dialect/Arithmetic/one-shot-bufferize-memory-space-invalid.mlir b/mlir/test/Dialect/Arithmetic/one-shot-bufferize-memory-space-invalid.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Dialect/Arithmetic/one-shot-bufferize-memory-space-invalid.mlir
@@ -0,0 +1,12 @@
+// RUN: mlir-opt %s -one-shot-bufferize="must-infer-memory-space" -split-input-file -verify-diagnostics
+
+func.func @inconsistent_memory_space_arith_select(%c: i1) -> tensor<10xf32> {
+  // Selecting tensors with different memory spaces. Such IR cannot be
+  // bufferized.
+  %0 = bufferization.alloc_tensor() {bufferization.memory_space = [0]} : tensor<10xf32>
+  %1 = bufferization.alloc_tensor() {bufferization.memory_space = [1]} : tensor<10xf32>
+  // expected-error @+2 {{inconsistent memory space on true/false operands}}
+  // expected-error @+1 {{failed to bufferize op}}
+  %r = arith.select %c, %0, %1 : tensor<10xf32>
+  func.return %r : tensor<10xf32>
+}
diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-memory-space-invalid.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-memory-space-invalid.mlir
--- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-memory-space-invalid.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-memory-space-invalid.mlir
@@ -6,3 +6,14 @@
   %0 = bufferization.alloc_tensor() : tensor<10xf32>
   return %0 : tensor<10xf32>
 }
+
+// -----
+
+func.func @memory_space_of_unknown_op() -> f32 {
+  %c0 = arith.constant 0 : index
+  // expected-error @+1 {{could not infer memory space}}
+  %t = "test.dummy_op"() : () -> (tensor<10xf32>)
+  // expected-error @+1 {{failed to bufferize op}}
+  %s = tensor.extract %t[%c0] : tensor<10xf32>
+  return %s : f32
+}
diff --git a/mlir/test/Dialect/Bufferization/Transforms/tensor-copy-insertion-memory-space-invalid.mlir b/mlir/test/Dialect/Bufferization/Transforms/tensor-copy-insertion-memory-space-invalid.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Dialect/Bufferization/Transforms/tensor-copy-insertion-memory-space-invalid.mlir
@@ -0,0 +1,12 @@
+// RUN: mlir-opt %s -tensor-copy-insertion="must-infer-memory-space" -split-input-file -verify-diagnostics
+
+// An alloc is inserted but the copy is emitted. Therefore, the memory space
+// should be specified on the alloc_tensor op.
+func.func @memory_space_of_unknown_op() -> (tensor<10xf32>, tensor<10xf32>) {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.0 : f32
+  // expected-error @+1 {{could not infer memory space}}
+  %t = bufferization.alloc_tensor() : tensor<10xf32>
+  %s = tensor.insert %cst into %t[%c0] : tensor<10xf32>
+  return %s, %t : tensor<10xf32>, tensor<10xf32>
+}
diff --git a/mlir/test/Dialect/Bufferization/Transforms/tensor-copy-insertion-memory-space.mlir b/mlir/test/Dialect/Bufferization/Transforms/tensor-copy-insertion-memory-space.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Dialect/Bufferization/Transforms/tensor-copy-insertion-memory-space.mlir
@@ -0,0 +1,25 @@
+// RUN: mlir-opt %s -tensor-copy-insertion="must-infer-memory-space" -split-input-file | FileCheck %s
+
+// CHECK-LABEL: func @unknown_op_copy
+func.func @unknown_op_copy() -> (tensor<10xf32>, tensor<10xf32>) {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.0 : f32
+  // CHECK: %[[dummy:.*]] = "test.dummy_op"() : () -> tensor<10xf32>
+  %t = "test.dummy_op"() : () -> tensor<10xf32>
+  // CHECK: %[[copy:.*]] = bufferization.alloc_tensor() copy(%[[dummy]]) {bufferization.escape = [false]} : tensor<10xf32>
+  %s = tensor.insert %cst into %t[%c0] : tensor<10xf32>
+  return %s, %t : tensor<10xf32>, tensor<10xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @alloc_tensor_copy
+func.func @alloc_tensor_copy() -> (tensor<10xf32>, tensor<10xf32>) {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.0 : f32
+  // CHECK: bufferization.alloc_tensor() {bufferization.escape = [false], bufferization.memory_space = [1]} : tensor<10xf32>
+  %t = bufferization.alloc_tensor() {bufferization.memory_space = [1]} : tensor<10xf32>
+  // CHECK: bufferization.alloc_tensor() {bufferization.escape = [false], bufferization.memory_space = [1]} : tensor<10xf32>
+  %s = tensor.insert %cst into %t[%c0] : tensor<10xf32>
+  return %s, %t : tensor<10xf32>, tensor<10xf32>
+}
diff --git a/mlir/test/Dialect/Bufferization/Transforms/tensor-copy-insertion.mlir b/mlir/test/Dialect/Bufferization/Transforms/tensor-copy-insertion.mlir
--- a/mlir/test/Dialect/Bufferization/Transforms/tensor-copy-insertion.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/tensor-copy-insertion.mlir
@@ -40,10 +40,10 @@
 {
   // CHECK: bufferization.alloc_tensor() {bufferization.escape = [false]} : tensor<5xf32>
   // The second alloc_tensor should not have a copy operand.
-  // CHECK: bufferization.alloc_tensor() {bufferization.escape = [false]} : tensor<5xf32>
+  // CHECK: bufferization.alloc_tensor() {bufferization.escape = [false], bufferization.memory_space = [0]} : tensor<5xf32>
 
   // CHECK-NO-DEALLOC: bufferization.alloc_tensor() {bufferization.escape = [true]} : tensor<5xf32>
-  // CHECK-NO-DEALLOC: bufferization.alloc_tensor() {bufferization.escape = [true]} : tensor<5xf32>
+  // CHECK-NO-DEALLOC: bufferization.alloc_tensor() {bufferization.escape = [true], bufferization.memory_space = [0]} : tensor<5xf32>
   %0 = bufferization.alloc_tensor() : tensor<5xf32>
   %1 = tensor.insert %f into %0[%idx] : tensor<5xf32>
   return %0, %1 : tensor<5xf32>, tensor<5xf32>
@@ -55,7 +55,7 @@
 func.func @do_not_copy_when_overwritten(%t: tensor<5xf32>, %f: f32)
   -> (tensor<5xf32>, tensor<5xf32>)
 {
-  // CHECK: %[[alloc:.*]] = bufferization.alloc_tensor() {bufferization.escape = [false]} : tensor<5xf32>
+  // CHECK: %[[alloc:.*]] = bufferization.alloc_tensor() {bufferization.escape = [false], bufferization.memory_space = [0]} : tensor<5xf32>
   // CHECK: linalg.generic {{.*}} outs(%[[alloc]] : tensor<5xf32>)
   %r = linalg.generic {
     indexing_maps = [affine_map<(d0) -> (d0)>],
@@ -74,7 +74,7 @@
   -> (tensor<3xf32>)
 {
   %0 = tensor.extract_slice %t[0][3][1] : tensor<5xf32> to tensor<3xf32>
-  // CHECK: %[[alloc:.*]] = bufferization.alloc_tensor() {bufferization.escape = [false]} : tensor<3xf32>
+  // CHECK: %[[alloc:.*]] = bufferization.alloc_tensor() {bufferization.escape = [false], bufferization.memory_space = [0]} : tensor<3xf32>
   // CHECK: linalg.generic {{.*}} outs(%[[alloc]] : tensor<3xf32>)
   %r = linalg.generic {
     indexing_maps = [affine_map<(d0) -> (d0)>],
diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize-invalid.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize-invalid.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Dialect/SCF/one-shot-bufferize-invalid.mlir
@@ -0,0 +1,16 @@
+// RUN: mlir-opt %s -one-shot-bufferize -split-input-file -verify-diagnostics
+
+func.func @inconsistent_memory_space_scf_if(%c: i1) -> tensor<10xf32> {
+  // Yielding tensors with different memory spaces. Such IR cannot be
+  // bufferized.
+  %0 = bufferization.alloc_tensor() {bufferization.memory_space = [0]} : tensor<10xf32>
+  %1 = bufferization.alloc_tensor() {bufferization.memory_space = [1]} : tensor<10xf32>
+  // expected-error @+2 {{inconsistent memory space on then/else branches}}
+  // expected-error @+1 {{failed to bufferize op}}
+  %r = scf.if %c -> tensor<10xf32> {
+    scf.yield %0 : tensor<10xf32>
+  } else {
+    scf.yield %1 : tensor<10xf32>
+  }
+  func.return %r : tensor<10xf32>
+}
diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize.mlir
--- a/mlir/test/Dialect/SCF/one-shot-bufferize.mlir
+++ b/mlir/test/Dialect/SCF/one-shot-bufferize.mlir
@@ -632,3 +632,80 @@
   }
   return %0 : tensor<8x8xf32>
 }
+
+// -----
+
+// CHECK-LABEL: func @scf_if_memory_space
+func.func @scf_if_memory_space(%c: i1, %f: f32) -> (f32, f32)
+{
+  %c0 = arith.constant 0 : index
+  // CHECK: %[[alloc:.*]] = memref.alloc() {{.*}} : memref<5xf32, 1>
+  %0 = bufferization.alloc_tensor() {bufferization.memory_space = [1]} : tensor<5xf32>
+  // CHECK: scf.if %{{.*}} -> (memref<5xf32, 1>) {
+  %1 = scf.if %c -> tensor<5xf32> {
+    // CHECK: %[[cloned:.*]] = bufferization.clone %[[alloc]]
+    // CHECK: scf.yield %[[cloned]]
+    scf.yield %0 : tensor<5xf32>
+  } else {
+    // CHECK: %[[alloc2:.*]] = memref.alloc() {{.*}} : memref<5xf32, 1>
+    // CHECK: memref.store %{{.*}}, %[[alloc2]]
+    // CHECK: %[[cloned2:.*]] = bufferization.clone %[[alloc2]]
+    // CHECK: memref.dealloc %[[alloc2]]
+    // CHECK: scf.yield %[[cloned2]]
+    %2 = tensor.insert %f into %0[%c0] : tensor<5xf32>
+    scf.yield %2 : tensor<5xf32>
+  }
+  %r0 = tensor.extract %0[%c0] : tensor<5xf32>
+  %r1 = tensor.extract %1[%c0] : tensor<5xf32>
+  return %r0, %r1 : f32, f32
+}
+
+// -----
+
+// CHECK-LABEL: func @scf_execute_region_memory_space
+// CHECK: memref.alloc() {{.*}} : memref<5xf32, 1>
+// CHECK: memref.store
+// CHECK: memref.load
+// CHECK: memref.dealloc
+func.func @scf_execute_region_memory_space(%f: f32) -> f32 {
+  %c0 = arith.constant 0 : index
+  %0 = scf.execute_region -> tensor<5xf32> {
+    %1 = bufferization.alloc_tensor() {bufferization.memory_space = [1]} : tensor<5xf32>
+    %2 = tensor.insert %f into %1[%c0] : tensor<5xf32>
+    scf.yield %2 : tensor<5xf32>
+  }
+  %r = tensor.extract %0[%c0] : tensor<5xf32>
+  return %r : f32
+}
+
+// -----
+
+// Additional allocs are inserted in the loop body. We just check that all
+// allocs have the correct memory space.
+
+// CHECK-LABEL: func @scf_for_swapping_yields_memory_space
+func.func @scf_for_swapping_yields_memory_space(
+    %sz: index, %C : tensor<4xf32>, %lb : index, %ub : index, %step : index)
+  -> (f32, f32)
+{
+  // CHECK: memref.alloc(%{{.*}}) {{.*}} : memref<?xf32, 1>
+  // CHECK: memref.alloc(%{{.*}}) {{.*}} : memref<?xf32, 1>
+  %A = bufferization.alloc_tensor(%sz) {bufferization.memory_space = [1]} : tensor<?xf32>
+  %B = bufferization.alloc_tensor(%sz) {bufferization.memory_space = [1]} : tensor<?xf32>
+
+  // CHECK: scf.for {{.*}} {
+  %r0:2 = scf.for %i = %lb to %ub step %step iter_args(%tA = %A, %tB = %B)
+      -> (tensor<?xf32>, tensor<?xf32>)
+  {
+    // CHECK: memref.alloc(%{{.*}}) {{.*}} : memref<?xf32, 1>
+    // CHECK: memref.alloc(%{{.*}}) {{.*}} : memref<?xf32, 1>
+    %ttA = tensor.insert_slice %C into %tA[0][4][1] : tensor<4xf32> into tensor<?xf32>
+    %ttB = tensor.insert_slice %C into %tB[0][4][1] : tensor<4xf32> into tensor<?xf32>
+    // Yield tensors in different order.
+    scf.yield %ttB, %ttA : tensor<?xf32>, tensor<?xf32>
+  }
+  // CHECK: }
+  %f0 = tensor.extract %r0#0[%step] : tensor<?xf32>
+  %f1 = tensor.extract %r0#1[%step] : tensor<?xf32>
+  return %f0, %f1: f32, f32
+}