diff --git a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
@@ -380,11 +380,14 @@
     auto destinationStyleOp = dyn_cast<DestinationStyleOpInterface>(clonedOp);
     if (destinationStyleOp) {
       for (OpOperand *outOperand : destinationStyleOp.getDpsInitOperands()) {
-        auto *it = llvm::find(dest, outOperand->get());
-        if (it == dest.end())
-          return op->emitOpError("must have \"tensor semantic\" for tiling");
-        unsigned destNum = std::distance(dest.begin(), it);
-        outOperand->set(destBbArgs[destNum]);
+        // Swap tensor inits with the corresponding block argument of the
+        // scf.forall op. Memref inits remain as is.
+        if (outOperand->get().getType().isa<TensorType>()) {
+          auto *it = llvm::find(dest, outOperand->get());
+          assert(it != dest.end() && "could not find destination tensor");
+          unsigned destNum = std::distance(dest.begin(), it);
+          outOperand->set(destBbArgs[destNum]);
+        }
       }
     }
diff --git a/mlir/test/Dialect/GPU/transform-gpu-failing.mlir b/mlir/test/Dialect/GPU/transform-gpu-failing.mlir
--- a/mlir/test/Dialect/GPU/transform-gpu-failing.mlir
+++ b/mlir/test/Dialect/GPU/transform-gpu-failing.mlir
@@ -274,34 +274,3 @@
   // expected-error @below {{duplicated attribute, cannot map different loops to the same processor}}
   transform.gpu.map_nested_forall_to_threads %funcop block_dims = [32, 32, 1] : (!transform.any_op) -> !transform.any_op
 }
-
-// -----
-
-func.func @tiling_buffer_semantic_op(%x: memref<32x32xf32>, %y: memref<32x32xf32>, %stream : !gpu.async.token) {
-  %one = arith.constant 1 : index
-  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
-             threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
-  {
-    // expected-error @below {{'linalg.generic' op must have "tensor semantic" for tiling}}
-    // expected-note @below {{when applied to this op}}
-    linalg.generic
-      {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
-                        affine_map<(d0, d1) -> (d0, d1)>],
-       iterator_types = ["parallel", "parallel"]}
-      ins(%x : memref<32x32xf32>)
-      outs(%y : memref<32x32xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      linalg.yield %in : f32
-    }
-    gpu.terminator
-  }
-  return
-}
-
-transform.sequence failures(propagate) {
-^bb1(%arg0: !transform.any_op):
-  %matmul = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op
-  // expected-error @below {{transform.structured.tile_to_forall_op failed to apply}}
-  %forall, %tiled = transform.structured.tile_to_forall_op %matmul num_threads [10, 20, 30] (mapping = [ #gpu.thread<x>, #gpu.thread<y>, #gpu.thread<z> ] )
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-}
diff --git a/mlir/test/Dialect/GPU/transform-gpu.mlir b/mlir/test/Dialect/GPU/transform-gpu.mlir
--- a/mlir/test/Dialect/GPU/transform-gpu.mlir
+++ b/mlir/test/Dialect/GPU/transform-gpu.mlir
@@ -307,3 +307,39 @@
   transform.gpu.map_nested_forall_to_threads %funcop block_dims = [12, 11, 1] warp_dims = [3, 2, 1] : (!transform.any_op) -> !transform.any_op
 }
+
+// -----
+
+// CHECK-LABEL: func.func @tiling_buffer_semantic_op(
+// CHECK:         gpu.launch {{.*}} {
+// CHECK:           scf.forall {{.*}} {
+// CHECK:             memref.subview
+// CHECK:             memref.subview
+// CHECK:             linalg.generic
+// CHECK:           }
+// CHECK:         }
+func.func @tiling_buffer_semantic_op(%x: memref<32x32xf32>, %y: memref<32x32xf32>, %stream : !gpu.async.token) {
+  %one = arith.constant 1 : index
+  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
+             threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
+  {
+    linalg.generic
+      {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+                        affine_map<(d0, d1) -> (d0, d1)>],
+       iterator_types = ["parallel", "parallel"]}
+      ins(%x : memref<32x32xf32>)
+      outs(%y : memref<32x32xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    }
+    gpu.terminator
+  }
+  return
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg0: !transform.any_op):
+  %matmul = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+  %forall, %tiled = transform.structured.tile_to_forall_op %matmul num_threads [10, 20, 30] (mapping = [ #gpu.thread<x>, #gpu.thread<y>, #gpu.thread<z> ] )
+    : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+}
diff --git a/mlir/test/Dialect/Linalg/tile-to-foreach-thread.mlir b/mlir/test/Dialect/Linalg/tile-to-foreach-thread.mlir
--- a/mlir/test/Dialect/Linalg/tile-to-foreach-thread.mlir
+++ b/mlir/test/Dialect/Linalg/tile-to-foreach-thread.mlir
@@ -40,6 +40,53 @@

 // -----

+module {
+  // CHECK-LABEL: func @matmul_memref(
+  // CHECK:         scf.forall (%{{.*}}, %{{.*}}) in (10, 20) {
+  // CHECK:           memref.subview
+  // CHECK:           memref.subview
+  // CHECK:           memref.subview
+  // CHECK:           linalg.matmul
+  // CHECK:         } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
+  func.func @matmul_memref(%A: memref<?x?xf32>, %B: memref<?x?xf32>, %C: memref<?x?xf32>) {
+    linalg.matmul ins(%A, %B : memref<?x?xf32>, memref<?x?xf32>)
+                  outs(%C : memref<?x?xf32>)
+    return
+  }
+
+  transform.sequence failures(propagate) {
+  ^bb1(%arg1: !transform.any_op):
+    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1:2 = transform.structured.tile_to_forall_op %0 num_threads [10, 20] (mapping = [ #gpu.thread<y>, #gpu.thread<x> ] )
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+  }
+}
+
+// -----
+
+module {
+  // CHECK-LABEL: func @copy_memref(
+  // CHECK:         scf.forall (%{{.*}}, %{{.*}}) in (10, 20) {
+  // CHECK:           memref.subview
+  // CHECK:           memref.subview
+  // CHECK:           linalg.copy
+  // CHECK:         } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
+  func.func @copy_memref(%A: memref<?x?xf32>, %B: memref<?x?xf32>) {
+    linalg.copy ins(%A: memref<?x?xf32>)
+                outs(%B : memref<?x?xf32>)
+    return
+  }
+
+  transform.sequence failures(propagate) {
+  ^bb1(%arg1: !transform.any_op):
+    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1:2 = transform.structured.tile_to_forall_op %0 num_threads [10, 20] (mapping = [ #gpu.thread<y>, #gpu.thread<x> ] )
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+  }
+}
+
+// -----
+
 // In this test case, matmul dims and tile size are dynamic.

 // CHECK-DAG: #[[$map0:.+]] = affine_map<()[s0, s1] -> (s0 ceildiv s1)>
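
Note (not part of the patch): a rough, hand-written sketch of the kind of IR the new memref tests expect after transform.structured.tile_to_forall_op. With buffer semantics there is nothing to swap into the scf.forall block arguments, so the loop body simply takes memref.subviews of the original operands and the linalg op runs on those views. All shapes, tile sizes, and value names below are illustrative assumptions rather than values taken from the tests, and the implicit empty scf.forall.in_parallel terminator is left out.

    func.func @copy_tiled_sketch(%A: memref<64x64xf32>, %B: memref<64x64xf32>) {
      // num_threads [4, 8] over a 64x64 buffer: each thread copies a 16x8 tile.
      scf.forall (%i, %j) in (4, 8) {
        %row = affine.apply affine_map<(d0) -> (d0 * 16)>(%i)
        %col = affine.apply affine_map<(d0) -> (d0 * 8)>(%j)
        // The memref inits stay in place; only subviews are taken.
        %src = memref.subview %A[%row, %col] [16, 8] [1, 1]
            : memref<64x64xf32> to memref<16x8xf32, strided<[64, 1], offset: ?>>
        %dst = memref.subview %B[%row, %col] [16, 8] [1, 1]
            : memref<64x64xf32> to memref<16x8xf32, strided<[64, 1], offset: ?>>
        linalg.copy ins(%src : memref<16x8xf32, strided<[64, 1], offset: ?>>)
                    outs(%dst : memref<16x8xf32, strided<[64, 1], offset: ?>>)
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
      return
    }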