diff --git a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
@@ -235,10 +235,7 @@
   auto hasStrideOne = [](Range r) { return !isConstantIntValue(r.stride, 1); };
   if (llvm::any_of(loopRanges, hasStrideOne))
     return op->emitOpError("only stride-1 supported atm");
-  // TODO: support `getTiledImplementation` with >1 produced tiled ops.
-  auto dest = op.getDestinationOperands(b);
-  if (dest.size() != 1)
-    return op->emitOpError("only single dest operand supported atm");
+  auto destOperands = op.getDestinationOperands(b);
 
   SmallVector<OpFoldResult> nonZeroNumThreads =
       llvm::to_vector(llvm::make_filter_range(numThreads, [](OpFoldResult ofr) {
diff --git a/mlir/test/Dialect/Linalg/tile-to-foreach-thread.mlir b/mlir/test/Dialect/Linalg/tile-to-foreach-thread.mlir
--- a/mlir/test/Dialect/Linalg/tile-to-foreach-thread.mlir
+++ b/mlir/test/Dialect/Linalg/tile-to-foreach-thread.mlir
@@ -249,3 +249,61 @@
     %1:2 = transform.structured.tile_to_foreach_thread_op %0 tile_sizes [%sz, 20]
   }
 }
+
+// -----
+
+// CHECK-DAG: #[[$map0:.+]] = affine_map<(d0) -> (d0 * -15 + 100, 15)>
+// CHECK-DAG: #[[$map1:.+]] = affine_map<(d0) -> (0, d0)>
+// CHECK-DAG: #[[$map2:.+]] = affine_map<(d0) -> (d0 * 15)>
+// CHECK-DAG: #[[$map3:.+]] = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: tile_output_multi_1d_static(
+// CHECK-SAME: %[[IN1:[0-9a-z]+]]: tensor<100xf32>
+// CHECK-SAME: %[[IN2:[0-9a-z]+]]: tensor<100xf32>
+// CHECK-SAME: %[[OUT1:[0-9a-z]+]]: tensor<100xf32>
+// CHECK-SAME: %[[OUT2:[0-9a-z]+]]: tensor<100xf32>
+func.func @tile_output_multi_1d_static(%IN1: tensor<100xf32>, %IN2: tensor<100xf32>,
+                                       %OUT1: tensor<100xf32>, %OUT2: tensor<100xf32>)
+    -> (tensor<100xf32>, tensor<100xf32>) {
+// CHECK-DAG: %[[c0:.+]] = arith.constant 7 :
+// CHECK: scf.foreach_thread (%[[IV0:.+]]) in (%[[c0]])
+// CHECK: %[[TSMIN:.+]] = affine.min #[[$map0]](%[[IV0]])
+// CHECK: %[[TS:.+]] = affine.max #[[$map1]](%[[TSMIN]])
+// CHECK-NOT: affine.min
+// CHECK-NOT: affine.max
+// CHECK: %[[LB:.+]] = affine.apply #[[$map2]](%[[IV0]])
+// CHECK: %[[tIN1:.+]] = tensor.extract_slice %[[IN1]][%[[LB]]] [%[[TS]]] [1] :
+// CHECK: %[[tIN2:.+]] = tensor.extract_slice %[[IN2]][%[[LB]]] [%[[TS]]] [1] :
+// CHECK: %[[tOUT1:.+]] = tensor.extract_slice %[[OUT1]][%[[LB]]] [%[[TS]]] [1] :
+// CHECK: %[[tOUT2:.+]] = tensor.extract_slice %[[OUT2]][%[[LB]]] [%[[TS]]] [1] :
+// CHECK: %[[RES1:[0-9]+]]:[[RES2:[0-9]+]] = linalg.generic
+// CHECK: scf.foreach_thread.perform_concurrently
+// CHECK-NEXT: tensor.parallel_insert_slice %[[RES1]]#1 into %[[OUT2]][%[[LB]]] [%[[TS]]] [1] :
+// CHECK-NEXT: tensor.parallel_insert_slice %[[RES1]]#0 into %[[OUT1]][%[[LB]]] [%[[TS]]] [1] :
+  %res1, %res2 = linalg.generic
+  {
+    indexing_maps = [affine_map<(d0) -> (d0)>,
+                     affine_map<(d0) -> (d0)>,
+                     affine_map<(d0) -> (d0)>,
+                     affine_map<(d0) -> (d0)>],
+    iterator_types = ["parallel"]
+  } ins(%IN1, %IN2 : tensor<100xf32>, tensor<100xf32>)
+    outs(%OUT1, %OUT2 : tensor<100xf32>, tensor<100xf32>)
+  {
+  ^bb0(%a1: f32, %a2: f32, %a3: f32, %a4: f32):
+    %1 = arith.addf %a1, %a3 : f32
+    %2 = arith.addf %a2, %a4 : f32
+    linalg.yield %1, %2 : f32, f32
+  } -> (tensor<100xf32>, tensor<100xf32>)
+  return %res1, %res2 : tensor<100xf32>, tensor<100xf32>
+}
+
+transform.with_pdl_patterns {
+^bb0(%arg0: !pdl.operation):
+  transform.sequence %arg0 failures(propagate) {
+  ^bb1(%arg1: !pdl.operation):
+    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1
+    %foreach_thread, %tiled_generic = transform.structured.tile_to_foreach_thread_op %0 num_threads [7]
+  }
+}
+