diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp --- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp @@ -213,12 +213,11 @@ if (parseCommonStructuredOpParts(parser, result, inputTypes, outputTypes)) return failure(); - // TODO: consider merging results parsing into region parsing. - // Need to wait for declarative assembly resolution to decide. - SmallVector<Type, 1> outputTensorsTypes; - if (parseNamedStructuredOpResults(parser, outputTensorsTypes)) + if (outputTypes.empty()) return failure(); - result.addTypes(outputTensorsTypes); + + if (outputTypes.front().isa<TensorType>()) + result.addTypes(outputTypes); std::unique_ptr<Region> region = std::make_unique<Region>(); if (parseNamedStructuredOpRegion(parser, *region, numRegionArgs, inputTypes, @@ -250,9 +249,6 @@ // attributes. printCommonStructuredOpParts(p, inputs, outputs); - // Results printing. - printNamedStructuredOpResults(p, op->getResultTypes()); - // Region is elided. } diff --git a/mlir/test/Conversion/TensorToLinalg/tensor-ops-to-linalg.mlir b/mlir/test/Conversion/TensorToLinalg/tensor-ops-to-linalg.mlir --- a/mlir/test/Conversion/TensorToLinalg/tensor-ops-to-linalg.mlir +++ b/mlir/test/Conversion/TensorToLinalg/tensor-ops-to-linalg.mlir @@ -7,7 +7,7 @@ // CHECK-SAME: %[[IN:.*]]: tensor<1x28x28x1xf32>) -> tensor<1x32x32x1xf32> { // CHECK: %[[C0:.*]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[INIT:.*]] = tensor.empty() : tensor<1x32x32x1xf32> -// CHECK: %[[FILL:.*]] = linalg.fill ins(%[[C0]] : f32) outs(%[[INIT]] : tensor<1x32x32x1xf32>) -> tensor<1x32x32x1xf32> +// CHECK: %[[FILL:.*]] = linalg.fill ins(%[[C0]] : f32) outs(%[[INIT]] : tensor<1x32x32x1xf32>) // CHECK: %[[PADDED:.*]] = tensor.insert_slice %[[IN]] into %[[FILL]][0, 2, 2, 0] [1, 28, 28, 1] [1, 1, 1, 1] : tensor<1x28x28x1xf32> into tensor<1x32x32x1xf32> // CHECK: return %[[PADDED]] : tensor<1x32x32x1xf32> func.func @generalize_pad_tensor_static_shape(%arg0: tensor<1x28x28x1xf32>) -> tensor<1x32x32x1xf32> { @@ -32,7 +32,7 @@ // CHECK: %[[DIM3:.*]] = tensor.dim %[[IN]], %[[C3]] : tensor<4x?x2x?xf32> // CHECK: %[[OUT_DIM3:.*]] = arith.addi %[[DIM3]], %[[OFFSET]] : index // CHECK: %[[INIT:.*]] = tensor.empty(%[[DIM1]], %[[OUT_DIM2]], %[[OUT_DIM3]]) : tensor<4x?x?x?xf32> -// CHECK: %[[FILL:.*]] = linalg.fill ins(%[[CST]] : f32) outs(%[[INIT]] : tensor<4x?x?x?xf32>) -> tensor<4x?x?x?xf32> +// CHECK: %[[FILL:.*]] = linalg.fill ins(%[[CST]] : f32) outs(%[[INIT]] : tensor<4x?x?x?xf32>) // CHECK: %[[PADDED:.*]] = tensor.insert_slice %[[IN]] into %[[FILL]]{{\[}}%[[C0]], %[[C0]], %[[OFFSET]], %[[C0]]] [4, %[[DIM1]], 2, %[[DIM3]]] [1, 1, 1, 1] : tensor<4x?x2x?xf32> into tensor<4x?x?x?xf32> // CHECK: return %[[PADDED]] : tensor<4x?x?x?xf32> // CHECK: } diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir --- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir @@ -4,8 +4,8 @@ func.func @matmul(%arg0: tensor<1x5x3xf32>, %arg1: tensor<1x3x6xf32>) -> (tensor<1x5x6xf32>) { // CHECK: [[C0:%.+]] = arith.constant 0 // CHECK: [[INIT:%.+]] = tensor.empty() - // CHECK: [[FILLED:%.+]] = linalg.fill ins([[C0]] : f32) outs([[INIT]] : tensor<1x5x6xf32>) -> tensor<1x5x6xf32> - // CHECK: linalg.batch_matmul ins(%arg0, %arg1 : tensor<1x5x3xf32>, tensor<1x3x6xf32>) outs([[FILLED]] : tensor<1x5x6xf32>) -> tensor<1x5x6xf32> + // CHECK: [[FILLED:%.+]] = linalg.fill ins([[C0]] : f32) 
outs([[INIT]] : tensor<1x5x6xf32>) + // CHECK: linalg.batch_matmul ins(%arg0, %arg1 : tensor<1x5x3xf32>, tensor<1x3x6xf32>) outs([[FILLED]] : tensor<1x5x6xf32>) %0 = "tosa.matmul"(%arg0, %arg1) : (tensor<1x5x3xf32>, tensor<1x3x6xf32>) -> (tensor<1x5x6xf32>) return %0 : tensor<1x5x6xf32> } @@ -17,10 +17,10 @@ func.func @matmul_quantized(%arg0: tensor<1x5x3xi8>, %arg1: tensor<1x3x6xi8>) -> (tensor<1x5x6xi32>) { // CHECK: [[C0:%.+]] = arith.constant 0 // CHECK: [[INIT:%.+]] = tensor.empty() - // CHECK: [[FILLED:%.+]] = linalg.fill ins([[C0]] : i32) outs([[INIT]] : tensor<1x5x6xi32>) -> tensor<1x5x6xi32> + // CHECK: [[FILLED:%.+]] = linalg.fill ins([[C0]] : i32) outs([[INIT]] : tensor<1x5x6xi32>) // CHECK: [[ONE:%.+]] = arith.constant 1 // CHECK: [[TWO:%.+]] = arith.constant 2 - // CHECK: linalg.quantized_batch_matmul ins(%arg0, %arg1, [[ONE]], [[TWO]] : tensor<1x5x3xi8>, tensor<1x3x6xi8>, i32, i32) outs([[FILLED]] : tensor<1x5x6xi32>) -> tensor<1x5x6xi32> + // CHECK: linalg.quantized_batch_matmul ins(%arg0, %arg1, [[ONE]], [[TWO]] : tensor<1x5x3xi8>, tensor<1x3x6xi8>, i32, i32) outs([[FILLED]] : tensor<1x5x6xi32>) %0 = "tosa.matmul"(%arg0, %arg1) {quantization_info = #tosa.matmul_quant<a_zp = 1, b_zp = 2>} : (tensor<1x5x3xi8>, tensor<1x3x6xi8>) -> (tensor<1x5x6xi32>) return %0 : tensor<1x5x6xi32> } @@ -33,8 +33,8 @@ // CHECK: %[[DIM:.+]] = tensor.dim %arg0, %[[C0]] // CHECK: %[[C0_0:.+]] = arith.constant 0 // CHECK: %[[INIT:.+]] = tensor.empty(%[[DIM]]) - // CHECK: %[[FILLED:.+]] = linalg.fill ins(%[[C0_0]] : f32) outs(%[[INIT]] : tensor<?x5x6xf32>) -> tensor<?x5x6xf32> - // CHECK: linalg.batch_matmul ins(%arg0, %arg1 : tensor<?x5x3xf32>, tensor<?x3x6xf32>) outs(%[[FILLED]] : tensor<?x5x6xf32>) -> tensor<?x5x6xf32> + // CHECK: %[[FILLED:.+]] = linalg.fill ins(%[[C0_0]] : f32) outs(%[[INIT]] : tensor<?x5x6xf32>) + // CHECK: linalg.batch_matmul ins(%arg0, %arg1 : tensor<?x5x3xf32>, tensor<?x3x6xf32>) outs(%[[FILLED]] : tensor<?x5x6xf32>) %0 = "tosa.matmul"(%arg0, %arg1) : (tensor<?x5x3xf32>, tensor<?x3x6xf32>) -> (tensor<?x5x6xf32>) return %0 : tensor<?x5x6xf32> } @@ -47,8 +47,8 @@ // CHECK: %[[DIM:.+]] = tensor.dim %arg1, %[[C2]] // CHECK: %[[C0:.+]] = arith.constant 0 // CHECK: %[[INIT:.+]] = tensor.empty(%[[DIM]]) - // CHECK: %[[FILLED:.+]] = linalg.fill ins(%[[C0]] : f32) outs(%[[INIT]] : tensor<1x5x?xf32>) -> tensor<1x5x?xf32> - // CHECK: linalg.batch_matmul ins(%arg0, %arg1 : tensor<1x5x3xf32>, tensor<1x3x?xf32>) outs(%[[FILLED]] : tensor<1x5x?xf32>) -> tensor<1x5x?xf32> + // CHECK: %[[FILLED:.+]] = linalg.fill ins(%[[C0]] : f32) outs(%[[INIT]] : tensor<1x5x?xf32>) + // CHECK: linalg.batch_matmul ins(%arg0, %arg1 : tensor<1x5x3xf32>, tensor<1x3x?xf32>) outs(%[[FILLED]] : tensor<1x5x?xf32>) %0 = "tosa.matmul"(%arg0, %arg1) : (tensor<1x5x3xf32>, tensor<1x3x?xf32>) -> (tensor<1x5x?xf32>) return %0 : tensor<1x5x?xf32> } @@ -59,8 +59,8 @@ func.func @matmul_dyn_independent_dim(%arg0: tensor<1x5x?xf32>, %arg1: tensor<1x?x6xf32>) -> (tensor<1x5x6xf32>) { // CHECK: %[[C0:.+]] = arith.constant 0 // CHECK: %[[INIT:.+]] = tensor.empty() - // CHECK: %[[FILLED:.+]] = linalg.fill ins(%[[C0]] : f32) outs(%[[INIT]] : tensor<1x5x6xf32>) -> tensor<1x5x6xf32> - // CHECK: linalg.batch_matmul ins(%arg0, %arg1 : tensor<1x5x?xf32>, tensor<1x?x6xf32>) outs(%[[FILLED]] : tensor<1x5x6xf32>) -> tensor<1x5x6xf32> + // CHECK: %[[FILLED:.+]] = linalg.fill ins(%[[C0]] : f32) outs(%[[INIT]] : tensor<1x5x6xf32>) + // CHECK: linalg.batch_matmul ins(%arg0, %arg1 : tensor<1x5x?xf32>, tensor<1x?x6xf32>) outs(%[[FILLED]] : tensor<1x5x6xf32>) %0 = "tosa.matmul"(%arg0, %arg1) : (tensor<1x5x?xf32>, tensor<1x?x6xf32>) -> (tensor<1x5x6xf32>) return %0 : tensor<1x5x6xf32> } @@ -78,7 +78,7 @@ // CHECK: 
[[PERM:%.+]] = arith.constant dense<[1, 0]> // CHECK: [[TRANSPOSE:%.+]] = "tosa.transpose"(%arg1, [[PERM]]) // CHECK: [[INITB:%.+]] = tensor.empty() - // CHECK: [[MATMUL:%.+]] = linalg.matmul ins(%arg0, [[TRANSPOSE]] : tensor<5x3xf32>, tensor<3x6xf32>) outs([[FILL]] : tensor<5x6xf32>) -> tensor<5x6xf32> + // CHECK: [[MATMUL:%.+]] = linalg.matmul ins(%arg0, [[TRANSPOSE]] : tensor<5x3xf32>, tensor<3x6xf32>) outs([[FILL]] : tensor<5x6xf32>) // CHECK: [[ADDED:%.+]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%arg2, [[MATMUL]] : tensor<6xf32>, tensor<5x6xf32>) outs([[INITB]] : tensor<5x6xf32>) { // CHECK: ^bb0(%[[ARG3:[0-9a-zA-Z_]+]]: f32, %[[ARG4:[0-9a-zA-Z_]+]]: f32, %[[ARG5:[0-9a-zA-Z_]+]]: f32): // CHECK: [[ADD:%.+]] = arith.addf %[[ARG3]], %[[ARG4]] : f32 @@ -103,7 +103,7 @@ // CHECK: [[INITB:%.+]] = tensor.empty() // CHECK: [[ONE:%.+]] = arith.constant 1 // CHECK: [[TWO:%.+]] = arith.constant 2 - // CHECK: [[MATMUL:%.+]] = linalg.quantized_matmul ins(%arg0, [[TRANSPOSE]], [[ONE]], [[TWO]] : tensor<5x3xi8>, tensor<3x6xi8>, i32, i32) outs([[FILL]] : tensor<5x6xi32>) -> tensor<5x6xi32> + // CHECK: [[MATMUL:%.+]] = linalg.quantized_matmul ins(%arg0, [[TRANSPOSE]], [[ONE]], [[TWO]] : tensor<5x3xi8>, tensor<3x6xi8>, i32, i32) outs([[FILL]] : tensor<5x6xi32>) // CHECK: [[ADDED:%.+]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%arg2, [[MATMUL]] : tensor<6xi32>, tensor<5x6xi32>) outs([[INITB]] // CHECK: ^bb0([[IN1:%.+]]: i32, [[IN2:%.+]]: i32, [[UNUSED:%.+]]: i32): // CHECK: [[ADD:%.+]] = arith.addi @@ -127,7 +127,7 @@ // CHECK: %[[PERM:.+]] = arith.constant dense<[1, 0]> // CHECK: %[[TRANSPOSE:.+]] = "tosa.transpose"(%arg1, %[[PERM]]) // CHECK: %[[INITB:.+]] = tensor.empty(%[[DIM]]) - // CHECK: %[[MATMUL:.+]] = linalg.matmul ins(%arg0, %[[TRANSPOSE]] : tensor, tensor<3x6xf32>) outs(%[[FILL]] : tensor) -> tensor + // CHECK: %[[MATMUL:.+]] = linalg.matmul ins(%arg0, %[[TRANSPOSE]] : tensor, tensor<3x6xf32>) outs(%[[FILL]] : tensor) // CHECK: %[[ADDED:.+]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%arg2, %[[MATMUL]] : tensor<6xf32>, tensor) outs(%[[INITB]] : tensor) { // CHECK: ^bb0(%[[ARG3:[0-9a-zA-Z_]+]]: f32, %[[ARG4:[0-9a-zA-Z_]+]]: f32, %[[ARG5:[0-9a-zA-Z_]+]]: f32): // CHECK: %[[ADD:.+]] = arith.addf %[[ARG3]], %[[ARG4]] : f32 @@ -598,7 +598,7 @@ // CHECK: ^bb0(%[[ARG3:[0-9a-zA-Z_]+]]: index, %[[ARG4:[0-9a-zA-Z_]+]]: index, %[[ARG5:[0-9a-zA-Z_]+]]: index, %[[ARG6:[0-9a-zA-Z_]+]]: index): // CHECK: tensor.yield %cst : f32 // CHECK: } : tensor<2x?x?x3xf32> to tensor<2x?x?x3xf32> - // CHECK: %[[CONV:.+]] = linalg.depthwise_conv_2d_nhwc_hwcm {dilations = dense<[2, 1]> : tensor<2xi64>, strides = dense<[1, 2]> : tensor<2xi64>} ins(%[[PADDED]], %arg1 : tensor<2x?x?x3xf32>, tensor<3x6x3x5xf32>) outs(%{{.*}} : tensor<2x?x?x3x5xf32>) -> tensor<2x?x?x3x5xf32> + // CHECK: %[[CONV:.+]] = linalg.depthwise_conv_2d_nhwc_hwcm {dilations = dense<[2, 1]> : tensor<2xi64>, strides = dense<[1, 2]> : tensor<2xi64>} ins(%[[PADDED]], %arg1 : tensor<2x?x?x3xf32>, tensor<3x6x3x5xf32>) outs(%{{.*}} : tensor<2x?x?x3x5xf32>) // CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[CONV]] {{\[}}[0], [1], [2], [3, 4]] %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {pad = array, dilation = array, stride = array} : (tensor<2x?x?x3xf32>, tensor<3x6x3x5xf32>, tensor<15xf32>) -> tensor<2x?x?x15xf32> return 
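
For illustration, a minimal sketch of what the LinalgOps.cpp change above does to the assembly format (operand names and shapes here are hypothetical, chosen only for this example; they do not come from the patch). Before the change, a named structured op on tensors printed and parsed an explicit result type:

    %0 = linalg.matmul ins(%a, %b : tensor<4x8xf32>, tensor<8x4xf32>) outs(%c : tensor<4x4xf32>) -> tensor<4x4xf32>

After the change, the trailing result type is elided and re-derived from the outs operand at parse time:

    %0 = linalg.matmul ins(%a, %b : tensor<4x8xf32>, tensor<8x4xf32>) outs(%c : tensor<4x4xf32>)

The memref form is unaffected, since ops on buffers have no results; this is why the parser only calls result.addTypes when the first output type is a tensor. The test updates in the remainder of this patch are the mechanical consequence: every "-> tensor<...>" suffix on linalg named ops is dropped.
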
@@ -617,7 +617,7 @@ // CHECK-DAG: %[[CONV3D:.+]] = linalg.conv_3d_ndhwc_dhwcf // CHECK-SAME: {dilations = dense<1> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} // CHECK-SAME: ins(%arg0, %[[TRANSPOSE]] : tensor<1x49x48x47x27xf32>, tensor<3x4x5x27x28xf32>) - // CHECK-SAME: outs(%[[FILL]] : tensor<1x47x45x43x28xf32>) -> tensor<1x47x45x43x28xf32> + // CHECK-SAME: outs(%[[FILL]] : tensor<1x47x45x43x28xf32>) // CHECK: %[[GENERIC:.+]] = linalg.generic // CHECK-SAME: {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]} // CHECK-SAME: ins(%arg2, %[[CONV3D]] : tensor<28xf32>, tensor<1x47x45x43x28xf32>) @@ -644,7 +644,7 @@ // CHECK-DAG: %[[CONV3D:.+]] = linalg.conv_3d_ndhwc_dhwcf_q // CHECK-SAME: {dilations = dense<1> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} // CHECK-SAME: ins(%arg0, %[[TRANSPOSE]], %[[IZP]], %[[FZP]] : tensor<1x49x48x47x27xi8>, tensor<3x4x5x27x28xi8>, i32, i32) - // CHECK-SAME: outs(%[[FILL]] : tensor<1x47x45x43x28xi32>) -> tensor<1x47x45x43x28xi32> + // CHECK-SAME: outs(%[[FILL]] : tensor<1x47x45x43x28xi32>) // CHECK: %[[GENERIC:.+]] = linalg.generic // CHECK-SAME: {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]} // CHECK-SAME: ins(%arg2, %[[CONV3D]] : tensor<28xi32>, tensor<1x47x45x43x28xi32>) diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-analysis-empty-tensor-elimination.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-analysis-empty-tensor-elimination.mlir --- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-analysis-empty-tensor-elimination.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-analysis-empty-tensor-elimination.mlir @@ -10,7 +10,7 @@ // CHECK: linalg.fill // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true"] - %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor) -> tensor + %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor) // CHECK: tensor.insert_slice // CHECK-SAME: {__inplace_operands_attr__ = ["true", "false", "none"] @@ -37,7 +37,7 @@ // CHECK: linalg.fill // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true"] - %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor) -> tensor + %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor) // CHECK: tensor.insert_slice // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none"] diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir --- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir @@ -20,7 +20,7 @@ // insert_slice. The pass replaces the tensor.empty with an out-of-place // extract_slice. %a = tensor.empty(%sz) : tensor - %f = linalg.fill ins(%f0 : f32) outs(%a : tensor) -> tensor + %f = linalg.fill ins(%f0 : f32) outs(%a : tensor) // CHECK: memref.copy %[[FUNC_ARG]], %[[ALLOC]] : memref to memref // CHECK: %[[SV0_ALLOC:.*]] = memref.subview %[[ALLOC]][0] [%[[sz]]] [1] : memref to memref> @@ -53,7 +53,7 @@ %a = tensor.empty(%sz) : tensor // CHECK: linalg.fill ins({{.*}} : f32) outs(%[[T_SUBVIEW]] : memref) -> tensor + %f = linalg.fill ins(%f0 : f32) outs(%a : tensor) // Self-copy canonicalizes away later. 
%r1 = tensor.insert_slice %f into %t[42][%sz][1]: tensor into tensor @@ -80,7 +80,7 @@ %f = arith.sitofp %iv_i32 : i32 to f32 // CHECK: linalg.fill ins(%{{.*}}{{.*}}outs(%[[subview]] - %filled = linalg.fill ins(%f : f32) outs(%blank : tensor<5xf32>) -> tensor<5xf32> + %filled = linalg.fill ins(%f : f32) outs(%blank : tensor<5xf32>) // CHECK-NOT: memref.copy %inserted = tensor.insert_slice %filled into %bb[%iv][5][1] : tensor<5xf32> into tensor @@ -110,7 +110,7 @@ %f = arith.sitofp %iv_i32 : i32 to f32 // CHECK: linalg.fill ins(%{{.*}}{{.*}}outs(%[[subview]] - %filled = linalg.fill ins(%f : f32) outs(%blank : tensor<5xf32>) -> tensor<5xf32> + %filled = linalg.fill ins(%f : f32) outs(%blank : tensor<5xf32>) // CHECK-NOT: memref.copy %inserted = tensor.insert_slice %filled into %bb[%idx][5][1] : tensor<5xf32> into tensor @@ -130,7 +130,7 @@ func.func @shape_mismatch(%t: tensor<5x6x128xf32>) -> tensor<5x6x128xf32> { %cst = arith.constant 8.0 : f32 %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<128xf32>) -> tensor<128xf32> + %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<128xf32>) %2 = tensor.expand_shape %1 [[0, 1, 2]] : tensor<128xf32> into tensor<1x1x128xf32> %3 = tensor.insert_slice %2 into %t[2, 3, 0][1, 1, 128][1, 1, 1] @@ -159,7 +159,7 @@ %a = tensor.empty(%sz) : tensor // CHECK: linalg.fill ins({{.*}} : f32) outs(%[[T_SUBVIEW]] : memref) -> tensor + %f = linalg.fill ins(%f0 : f32) outs(%a : tensor) // Self-copy canonicalizes away later. scf.foreach_thread.perform_concurrently { diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-partial.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-partial.mlir --- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-partial.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-partial.mlir @@ -140,7 +140,7 @@ // CHECK: linalg.fill ins(%{{.*}}{{.*}}outs(%[[m1]] // CHECK: %[[filled_tensor:.*]] = bufferization.to_tensor %[[m1]] %t1 = bufferization.alloc_tensor() : tensor<10xf32> - %filled = linalg.fill ins(%cst : f32) outs(%t1 : tensor<10xf32>) -> tensor<10xf32> + %filled = linalg.fill ins(%cst : f32) outs(%t1 : tensor<10xf32>) // The transfer_write is out-of-place because "dummy_op" may read. 
// CHECK: %[[alloc:.*]] = memref.alloc() {{.*}} : memref<10xf32> diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-allow-return-allocs.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-allow-return-allocs.mlir --- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-allow-return-allocs.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-allow-return-allocs.mlir @@ -57,7 +57,7 @@ func.func @main(%t: tensor, %sz: index, %idx: index) -> (f32, f32) { %cst = arith.constant 1.0 : f32 %0 = call @return_slice(%t, %sz) : (tensor, index) -> (tensor) - %filled = linalg.fill ins(%cst : f32) outs(%t : tensor) -> tensor + %filled = linalg.fill ins(%cst : f32) outs(%t : tensor) %r1 = tensor.extract %0[%idx] : tensor %r2 = tensor.extract %filled[%idx] : tensor return %r1, %r2 : f32, f32 diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-analysis.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-analysis.mlir --- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-analysis.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-analysis.mlir @@ -76,21 +76,18 @@ // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "false"]} %C = linalg.matmul ins(%A, %B: tensor<4x4xf32>, tensor<4x4xf32>) outs(%B: tensor<4x4xf32>) - -> tensor<4x4xf32> // matmul output operand interferes with input operand. // CHECK: linalg.matmul // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "false"]} %D = linalg.matmul ins(%B, %A: tensor<4x4xf32>, tensor<4x4xf32>) outs(%B: tensor<4x4xf32>) - -> tensor<4x4xf32> // matmul output operand does not interferes with input operand. // CHECK: linalg.matmul // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "true"]} %E = linalg.matmul ins(%A, %A: tensor<4x4xf32>, tensor<4x4xf32>) outs(%B: tensor<4x4xf32>) - -> tensor<4x4xf32> // CHECK: return // CHECK-SAME: __equivalent_func_args__ = [-1, -1, 1] @@ -260,7 +257,7 @@ // CHECK: linalg.fill // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true"]} - %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor) -> tensor + %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor) // CHECK: tensor.insert_slice // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none", "none"]} @@ -292,7 +289,7 @@ // CHECK: linalg.fill // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true"]} - %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor) -> tensor + %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor) // CHECK: tensor.insert_slice // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none", "none"]} @@ -304,7 +301,7 @@ // CHECK: linalg.fill // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true"]} - %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor) -> tensor + %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor) %3 = vector.transfer_read %1[%idx2], %cst2 : tensor, vector<5xf32> @@ -337,14 +334,12 @@ // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "false"]} %D = linalg.matmul ins(%sA, %B: tensor<4x4xf32>, tensor<4x4xf32>) outs(%B: tensor<4x4xf32>) - -> tensor<4x4xf32> // matmul output operand is inplaceable at the function boundary. 
// CHECK: linalg.matmul // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "true"]} %E = linalg.matmul ins(%sA, %B: tensor<4x4xf32>, tensor<4x4xf32>) outs(%C: tensor<4x4xf32>) - -> tensor<4x4xf32> // CHECK: return // CHECK-SAME: __equivalent_func_args__ = [-1, 2] @@ -371,7 +366,6 @@ // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "false"]} %D = linalg.matmul ins(%B, %C: tensor, tensor) outs(%sB: tensor<4x4xf32>) - -> tensor<4x4xf32> // Step 2. %sC forward propagates to an inplace write in %E. // %sC backward propagates to %C which is inplaceable. @@ -386,7 +380,6 @@ // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "true"]} %E = linalg.matmul ins(%A, %sB: tensor<4x4xf32>, tensor<4x4xf32>) outs(%sC: tensor<4x4xf32>) - -> tensor<4x4xf32> return %D, %E: tensor<4x4xf32>, tensor<4x4xf32> } @@ -410,7 +403,7 @@ // CHECK: linalg.matmul // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "true"]} - %18 = linalg.matmul ins(%A, %B : tensor<8x6xf32>, tensor<6x6xf32>) outs(%15 : tensor) -> tensor + %18 = linalg.matmul ins(%A, %B : tensor<8x6xf32>, tensor<6x6xf32>) outs(%15 : tensor) // CHECK: tensor.extract_slice // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none", "none"]} @@ -451,7 +444,6 @@ // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "true"]} %D = linalg.matmul ins(%B, %C: tensor, tensor) outs(%sB: tensor<4x4xf32>) - -> tensor<4x4xf32> // Step 2. %sC forward propagates to an inplace write in %E. // %sC backward propagates to %C which is inplaceable. @@ -466,7 +458,6 @@ // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "true"]} %E = linalg.matmul ins(%A, %A: tensor<4x4xf32>, tensor<4x4xf32>) outs(%sC: tensor<4x4xf32>) - -> tensor<4x4xf32> return %D, %E: tensor<4x4xf32>, tensor<4x4xf32> } @@ -504,7 +495,7 @@ // CHECK-SAME: {__inplace_operands_attr__ = ["true", "false", "none", "none"]} %sA = tensor.extract_slice %A[0, 0][%idx, %idx][1, 1] : tensor to tensor %ssA = tensor.extract_slice %sA[0, 0][4, 4][1, 1] : tensor to tensor<4x4xf32> - %FA = linalg.fill ins(%f0 : f32) outs(%ssA : tensor<4x4xf32>) -> tensor<4x4xf32> + %FA = linalg.fill ins(%f0 : f32) outs(%ssA : tensor<4x4xf32>) %rsA = tensor.insert_slice %FA into %sA[0, 0][4, 4][1, 1] : tensor<4x4xf32> into tensor %rA = tensor.insert_slice %rsA into %A[0, 0][%idx, %idx][1, 1] : tensor into tensor @@ -527,7 +518,7 @@ %sB = tensor.extract_slice %B[0, 0][%idx, %idx][1, 1] : tensor to tensor %ssB = tensor.extract_slice %sB[0, 0][4, %idx][1, 1] : tensor to tensor<4x?xf32> %sssB = tensor.extract_slice %ssB[0, 0][4, 4][1, 1] : tensor<4x?xf32> to tensor<4x4xf32> - %FB = linalg.fill ins(%f0 : f32) outs(%sssB : tensor<4x4xf32>) -> tensor<4x4xf32> + %FB = linalg.fill ins(%f0 : f32) outs(%sssB : tensor<4x4xf32>) %rssB = tensor.insert_slice %FB into %ssB[0, 0][4, 4][1, 1] : tensor<4x4xf32> into tensor<4x?xf32> %rsB = tensor.insert_slice %rssB into %sB[0, 0][4, %idx][1, 1] : tensor<4x?xf32> into tensor %rB = tensor.insert_slice %rsB into %B[0, 0][%idx, %idx][1, 1] : tensor into tensor @@ -550,7 +541,7 @@ // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none", "none"]} %sC = tensor.extract_slice %C[0, 0][%idx, %idx][1, 1] : tensor to tensor %ssC = tensor.extract_slice %sC[0, 0][%sz1, 4][1, 1] : tensor to tensor - %FC = linalg.fill ins(%f0 : f32) outs(%ssC : tensor) -> tensor + %FC = linalg.fill ins(%f0 : f32) outs(%ssC : tensor) %rsC = tensor.insert_slice %FC into %sC[0, 0][%sz2, 4][1, 1] : tensor into tensor %rC = tensor.insert_slice %rsC into %C[0, 0][%idx, %idx][1, 1] : 
tensor into tensor @@ -577,12 +568,12 @@ // cannot bufferize inplace. // CHECK: fill // CHECK-SAME: {__inplace_operands_attr__ = ["none", "false"]} - %A = linalg.fill ins(%f1 : f32) outs(%I : tensor<64xf32>) -> tensor<64xf32> + %A = linalg.fill ins(%f1 : f32) outs(%I : tensor<64xf32>) // 1. Bufferizes inplace: no alias to %A is yet possible. // CHECK: fill // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true"]} - %B = linalg.fill ins(%f2 : f32) outs(%I : tensor<64xf32>) -> tensor<64xf32> + %B = linalg.fill ins(%f2 : f32) outs(%I : tensor<64xf32>) call @foo(%A) : (tensor<64xf32>) -> () call @foo(%B) : (tensor<64xf32>) -> () @@ -613,12 +604,12 @@ // bufferize inplace. // CHECK: fill // CHECK-SAME: {__inplace_operands_attr__ = ["none", "false"]} - %A = linalg.fill ins(%f1 : f32) outs(%I : tensor<64xf32>) -> tensor<64xf32> + %A = linalg.fill ins(%f1 : f32) outs(%I : tensor<64xf32>) // 4. Bufferizes inplace: no alias to %A is yet possible. // CHECK: fill // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true"]} - %B = linalg.fill ins(%f2 : f32) outs(%I : tensor<64xf32>) -> tensor<64xf32> + %B = linalg.fill ins(%f2 : f32) outs(%I : tensor<64xf32>) // 3. Does not read or write, bufferizes inplace. // CHECK: scf.for @@ -638,12 +629,12 @@ // cannot bufferize inplace. // CHECK: fill // CHECK-SAME: {__inplace_operands_attr__ = ["none", "false"]} - %A2 = linalg.fill ins(%f1 : f32) outs(%I2 : tensor<64xf32>) -> tensor<64xf32> + %A2 = linalg.fill ins(%f1 : f32) outs(%I2 : tensor<64xf32>) // 1. Bufferizes inplace: no alias to %A2 is yet possible. // CHECK: fill // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true"]} - %B2 = linalg.fill ins(%f2 : f32) outs(%I2 : tensor<64xf32>) -> tensor<64xf32> + %B2 = linalg.fill ins(%f2 : f32) outs(%I2 : tensor<64xf32>) call @bar(%A2) : (tensor<64xf32>) -> () call @bar(%B2) : (tensor<64xf32>) -> () @@ -688,8 +679,8 @@ // CHECK-SAME: {__inplace_operands_attr__ = ["none", "false"]} // CHECK: linalg.fill // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true"]} - %8 = linalg.fill ins(%cst_0 : f32) outs(%7 : tensor<256x256xf32>) -> tensor<256x256xf32> - %11 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<256x256xf32>) -> tensor<256x256xf32> + %8 = linalg.fill ins(%cst_0 : f32) outs(%7 : tensor<256x256xf32>) + %11 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<256x256xf32>) // CHECK: tensor.extract_slice // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} @@ -701,7 +692,7 @@ %sB = tensor.extract_slice %11[0, 0][16, 256][1, 1]: tensor<256x256xf32> to tensor<16x256xf32> %r = linalg.matmul ins(%sA, %sB : tensor<256x16xf32>, tensor<16x256xf32>) - outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32> + outs(%arg2 : tensor<256x256xf32>) // CHECK: return // CHECK-SAME: __equivalent_func_args__ = [2] @@ -726,7 +717,7 @@ // CHECK-SAME: {__inplace_operands_attr__ = ["none", "false"]} // CHECK: vector.transfer_write // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none", "none"] - %8 = linalg.fill ins(%cst_0 : f32) outs(%7 : tensor<256x256xf32>) -> tensor<256x256xf32> + %8 = linalg.fill ins(%cst_0 : f32) outs(%7 : tensor<256x256xf32>) %9 = vector.transfer_read %arg0[%c0, %c0], %cst_0 {in_bounds = [false, true]} : tensor<518x518xf32>, vector<256x256xf32> %10 = vector.transfer_write %9, %8[%c0, %c0] {in_bounds = [true, true]} : vector<256x256xf32>, tensor<256x256xf32> @@ -734,7 +725,7 @@ // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true"]} // CHECK: vector.transfer_write // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", 
"none", "none"] - %11 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<256x256xf32>) -> tensor<256x256xf32> + %11 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<256x256xf32>) %12 = vector.transfer_read %arg1[%c0, %c0], %cst_0 {in_bounds = [false, true]} : tensor<518x518xf32>, vector<256x256xf32> %13 = vector.transfer_write %12, %11[%c0, %c0] {in_bounds = [true, true]} : vector<256x256xf32>, tensor<256x256xf32> @@ -748,7 +739,7 @@ %sB = tensor.extract_slice %13[0, 0][16, 256][1, 1]: tensor<256x256xf32> to tensor<16x256xf32> %r = linalg.matmul ins(%sA, %sB : tensor<256x16xf32>, tensor<16x256xf32>) - outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32> + outs(%arg2 : tensor<256x256xf32>) // CHECK: return // CHECK-SAME: __equivalent_func_args__ = [2] @@ -779,7 +770,7 @@ // CHECK: linalg.fill // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true"] - %0 = linalg.fill ins(%cst : f32) outs(%arg2 : tensor<62x90xf32>) -> tensor<62x90xf32> + %0 = linalg.fill ins(%cst : f32) outs(%arg2 : tensor<62x90xf32>) // CHECK: tensor.extract_slice // CHECK-SAME: {__inplace_operands_attr__ = ["true"] diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-invalid.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-invalid.mlir --- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-invalid.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-invalid.mlir @@ -222,7 +222,7 @@ func.func @mini_test_case1() -> tensor<10x20xf32> { %f0 = arith.constant 0.0 : f32 %t = bufferization.alloc_tensor() : tensor<10x20xf32> - %r = linalg.fill ins(%f0 : f32) outs(%t : tensor<10x20xf32>) -> tensor<10x20xf32> + %r = linalg.fill ins(%f0 : f32) outs(%t : tensor<10x20xf32>) // expected-error @+1 {{operand #0 may return/yield a new buffer allocation}} return %r : tensor<10x20xf32> } diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir --- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir @@ -244,7 +244,7 @@ // CHECK-NOT: copy func.func @does_not_read(%t: tensor) -> tensor { %f0 = arith.constant 0.0 : f32 - %r = linalg.fill ins(%f0 : f32) outs(%t : tensor) -> tensor + %r = linalg.fill ins(%f0 : f32) outs(%t : tensor) return %r : tensor } @@ -417,11 +417,11 @@ %v0 = arith.constant 0.0 : f32 // CHECK-NEXT: linalg.fill ins(%[[C0]] : f32) outs(%[[C]] : memref>) - %d = linalg.fill ins(%v0 : f32) outs(%c : tensor) -> tensor + %d = linalg.fill ins(%v0 : f32) outs(%c : tensor) // CHECK-NEXT: linalg.dot ins(%[[A]], %[[B]] : memref<64xf32, strided<[?], offset: ?>>, memref<64xf32, strided<[?], offset: ?>>) outs(%[[C]] : memref>) %e = linalg.dot ins(%a, %b : tensor<64xf32>,tensor<64xf32>) - outs(%d: tensor) -> tensor + outs(%d: tensor) // CHECK-NEXT: return return %e : tensor @@ -449,9 +449,9 @@ // CHECK-DAG: linalg.fill ins(%[[C1]] : f32) outs(%[[A]] : memref<64xf32>) // CHECK-DAG: linalg.fill ins(%[[C2]] : f32) outs(%[[B]] : memref<64xf32>) // CHECK-DAG: linalg.fill ins(%[[C0]] : f32) outs(%[[C]] : memref) - %AA = linalg.fill ins(%v1 : f32) outs(%A : tensor<64xf32>) -> tensor<64xf32> - %BB = linalg.fill ins(%v2 : f32) outs(%B : tensor<64xf32>) -> tensor<64xf32> - %CC = linalg.fill ins(%v0 : f32) outs(%C : tensor) -> tensor + %AA = linalg.fill ins(%v1 : f32) outs(%A : tensor<64xf32>) + %BB = linalg.fill ins(%v2 : f32) outs(%B : 
tensor<64xf32>) + %CC = linalg.fill ins(%v0 : f32) outs(%C : tensor) // CHECK-NEXT: call @init_and_dot(%[[cA]], %[[cB]], %[[cC]]) %res = call @init_and_dot(%AA, %BB, %CC) : diff --git a/mlir/test/Dialect/Bufferization/Transforms/transform-ops.mlir b/mlir/test/Dialect/Bufferization/Transforms/transform-ops.mlir --- a/mlir/test/Dialect/Bufferization/Transforms/transform-ops.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/transform-ops.mlir @@ -114,7 +114,7 @@ // CHECK-SAME: %[[C:.*]]: memref<12x6xf32>) -> memref<12x6xf32> { func.func @matmul(%A: tensor<12x9xf32>, %B: tensor<9x6xf32>, %C: tensor<12x6xf32>) -> tensor<12x6xf32> { // CHECK: linalg.matmul ins(%[[A]], %[[B]] : memref<12x9xf32>, memref<9x6xf32>) outs(%[[C]] : memref<12x6xf32>) - %D = linalg.matmul ins(%A, %B: tensor<12x9xf32>, tensor<9x6xf32>) outs(%C: tensor<12x6xf32>) -> tensor<12x6xf32> + %D = linalg.matmul ins(%A, %B: tensor<12x9xf32>, tensor<9x6xf32>) outs(%C: tensor<12x6xf32>) // CHECK: return %[[C]] : memref<12x6xf32> return %D : tensor<12x6xf32> } diff --git a/mlir/test/Dialect/GPU/transform-gpu-failing.mlir b/mlir/test/Dialect/GPU/transform-gpu-failing.mlir --- a/mlir/test/Dialect/GPU/transform-gpu-failing.mlir +++ b/mlir/test/Dialect/GPU/transform-gpu-failing.mlir @@ -126,7 +126,7 @@ %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) { - %t = linalg.matmul ins(%x, %y: tensor<32x32xf32>, tensor<32x32xf32>) outs(%z : tensor<32x32xf32>) -> tensor<32x32xf32> + %t = linalg.matmul ins(%x, %y: tensor<32x32xf32>, tensor<32x32xf32>) outs(%z : tensor<32x32xf32>) gpu.terminator } return diff --git a/mlir/test/Dialect/LLVM/transform-e2e.mlir b/mlir/test/Dialect/LLVM/transform-e2e.mlir --- a/mlir/test/Dialect/LLVM/transform-e2e.mlir +++ b/mlir/test/Dialect/LLVM/transform-e2e.mlir @@ -8,7 +8,6 @@ // CHECK: llvm.intr.fmuladd{{.*}} %0 = linalg.matmul ins(%arg0, %arg1: tensor<2x4xf32>, tensor<4x6xf32>) outs(%arg2: tensor<2x6xf32>) - -> tensor<2x6xf32> return %0 : tensor<2x6xf32> } diff --git a/mlir/test/Dialect/Linalg/affine.mlir b/mlir/test/Dialect/Linalg/affine.mlir --- a/mlir/test/Dialect/Linalg/affine.mlir +++ b/mlir/test/Dialect/Linalg/affine.mlir @@ -10,7 +10,7 @@ %B = memref.view %arg0[%c0][%K, %N] : memref to memref %C = memref.view %arg0[%c0][%M, %N] : memref to memref linalg.matmul ins(%A, %B: memref, memref) - outs(%C: memref) + outs(%C: memref) return } @@ -19,7 +19,7 @@ //----------------------------------------------------------------------------// func.func @named_batch_matmul(%A: memref, %B: memref, %C: memref) { linalg.batch_matmul ins(%A, %B: memref, memref) - outs(%C : memref) + outs(%C : memref) return } // CHECK-LABEL: @named_batch_matmul diff --git a/mlir/test/Dialect/Linalg/bubble-up-extract-slice-op.mlir b/mlir/test/Dialect/Linalg/bubble-up-extract-slice-op.mlir --- a/mlir/test/Dialect/Linalg/bubble-up-extract-slice-op.mlir +++ b/mlir/test/Dialect/Linalg/bubble-up-extract-slice-op.mlir @@ -112,7 +112,7 @@ %lhs = arith.constant dense<1.0> : tensor<4x4xf32> %rhs = arith.constant dense<1.0> : tensor<4x4xf32> %dst = arith.constant dense<[[0.0, 1.0, 2.0, 3.0], [4.0, 5.0, 6.0, 7.0], [8.0, 9.0, 10.0, 11.0], [12.0, 13.0, 14.0, 15.0]]> : tensor<4x4xf32> - %0 = linalg.matmul ins(%lhs, %rhs : tensor<4x4xf32>, tensor<4x4xf32>) outs(%dst : tensor<4x4xf32>) -> tensor<4x4xf32> + %0 = linalg.matmul ins(%lhs, %rhs : tensor<4x4xf32>, tensor<4x4xf32>) outs(%dst : tensor<4x4xf32>) %1 = 
tensor.extract_slice %0[1,1][2,2][1,1] : tensor<4x4xf32> to tensor<2x2xf32> return %1 : tensor<2x2xf32> } @@ -121,7 +121,7 @@ // CHECK: %[[SLICE0:.+]] = arith.constant dense<1.000000e+00> : tensor<2x4xf32> // CHECK: %[[SLICE1:.+]] = arith.constant dense<1.000000e+00> : tensor<4x2xf32> // CHECK: %[[SLICE3:.+]] = tensor.extract_slice %[[CST:.+]][1, 1] [2, 2] [1, 1] : tensor<4x4xf32> to tensor<2x2xf32> -// CHECK: %[[MATMUL:.+]] = linalg.matmul ins(%[[SLICE0]], %[[SLICE1]] : tensor<2x4xf32>, tensor<4x2xf32>) outs(%[[SLICE3]] : tensor<2x2xf32>) -> tensor<2x2xf32> +// CHECK: %[[MATMUL:.+]] = linalg.matmul ins(%[[SLICE0]], %[[SLICE1]] : tensor<2x4xf32>, tensor<4x2xf32>) outs(%[[SLICE3]] : tensor<2x2xf32>) // CHECK: return %[[MATMUL]] : tensor<2x2xf32> //----- @@ -136,12 +136,12 @@ %cst = arith.constant 0.0 : f32 %init = tensor.empty() : tensor<1x112x112x32xf32> - %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32> + %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x112x112x32xf32>) %conv = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%input, %filter : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) - outs(%fill : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32> + outs(%fill : tensor<1x112x112x32xf32>) %slice = tensor.extract_slice %conv [0, 64, 64, 16] [1, 32, 32, 16] [1, 1, 1, 1] : tensor<1x112x112x32xf32> to tensor<1x32x32x16xf32> @@ -153,8 +153,8 @@ // CHECK: %[[SLICE0:.+]] = tensor.extract_slice %arg0[0, 128, 128, 0] [1, 65, 65, 3] [1, 1, 1, 1] : tensor<1x225x225x3xf32> to tensor<1x65x65x3xf32> // CHECK: %[[SLICE1:.+]] = tensor.extract_slice %arg1[0, 0, 0, 16] [3, 3, 3, 16] [1, 1, 1, 1] : tensor<3x3x3x32xf32> to tensor<3x3x3x16xf32> // CHECK: %[[SLICE2:.+]] = tensor.extract_slice %[[INIT]][0, 64, 64, 16] [1, 32, 32, 16] [1, 1, 1, 1] : tensor<1x112x112x32xf32> to tensor<1x32x32x16xf32> -// CHECK: %[[FILL:.+]] = linalg.fill ins(%[[CST:.+]] : f32) outs(%[[SLICE2]] : tensor<1x32x32x16xf32>) -> tensor<1x32x32x16xf32> -// CHECK: %[[CONV:.+]] = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%[[SLICE0]], %[[SLICE1]] : tensor<1x65x65x3xf32>, tensor<3x3x3x16xf32>) outs(%[[FILL]] : tensor<1x32x32x16xf32>) -> tensor<1x32x32x16xf32> +// CHECK: %[[FILL:.+]] = linalg.fill ins(%[[CST:.+]] : f32) outs(%[[SLICE2]] : tensor<1x32x32x16xf32>) +// CHECK: %[[CONV:.+]] = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%[[SLICE0]], %[[SLICE1]] : tensor<1x65x65x3xf32>, tensor<3x3x3x16xf32>) outs(%[[FILL]] : tensor<1x32x32x16xf32>) // CHECK: return %[[CONV]] : tensor<1x32x32x16xf32> //----- @@ -163,7 +163,7 @@ func.func @rank_reducing_slice(%width : index) -> tensor<1x1x1x?xf32> { %cst = arith.constant 1.000000e+00 : f32 %init = tensor.empty(%width) : tensor<1x?xf32> - %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x?xf32>) -> tensor<1x?xf32> + %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x?xf32>) %slice = tensor.extract_slice %fill[0, 0] [1, %width] [1, 1] : tensor<1x?xf32> to tensor %expand = tensor.expand_shape %slice [[0, 1, 2, 3]] : tensor into tensor<1x1x1x?xf32> return %expand : tensor<1x1x1x?xf32> diff --git a/mlir/test/Dialect/Linalg/bufferize.mlir b/mlir/test/Dialect/Linalg/bufferize.mlir --- a/mlir/test/Dialect/Linalg/bufferize.mlir +++ b/mlir/test/Dialect/Linalg/bufferize.mlir @@ -170,7 +170,7 @@ // CHECK: linalg.fill ins(%cst : f32) outs(%[[ALLOC]] : 
memref) // CHECK: %[[TENSOR:.*]] = bufferization.to_tensor %[[ALLOC]] : memref // CHECK: return %[[TENSOR]] - %0 = linalg.fill ins(%c0 : f32) outs(%arg0 : tensor) -> tensor + %0 = linalg.fill ins(%c0 : f32) outs(%arg0 : tensor) return %0 : tensor } @@ -179,7 +179,7 @@ // CHECK-LABEL: func @bufferize_dot func.func @bufferize_dot(%in: tensor<4xf32>, %out: tensor) -> tensor { %dot = linalg.dot ins(%in, %in : tensor<4xf32>, tensor<4xf32>) - outs(%out : tensor) -> tensor + outs(%out : tensor) return %dot : tensor // CHECK: %[[ALLOC:.*]] = memref.alloc // TODO: The copy is not necessary. diff --git a/mlir/test/Dialect/Linalg/canonicalize.mlir b/mlir/test/Dialect/Linalg/canonicalize.mlir --- a/mlir/test/Dialect/Linalg/canonicalize.mlir +++ b/mlir/test/Dialect/Linalg/canonicalize.mlir @@ -56,9 +56,9 @@ %tc = tensor.cast %c : tensor<3x?xf32> to tensor // CHECK: linalg.matmul ins({{.*}}tensor<3x4xf32>, tensor<4x?xf32>) - // CHECK-SAME: outs({{.*}}tensor<3x?xf32>) -> tensor<3x?xf32> + // CHECK-SAME: outs({{.*}}tensor<3x?xf32>) %0 = linalg.matmul ins(%ta, %tb: tensor, tensor) - outs(%tc: tensor) -> tensor + outs(%tc: tensor) %1 = tensor.cast %0 : tensor to tensor<3x?xf32> @@ -79,9 +79,9 @@ %tc = tensor.cast %c : tensor<*xf32> to tensor // CHECK: linalg.matmul ins({{.*}}tensor, tensor) - // CHECK-SAME: outs({{.*}}tensor) -> tensor + // CHECK-SAME: outs({{.*}}tensor) %0 = linalg.matmul ins(%ta, %tb: tensor, tensor) - outs(%tc: tensor) -> tensor + outs(%tc: tensor) // CHECK: tensor.cast %1 = tensor.cast %0 : tensor to tensor<*xf32> @@ -98,7 +98,7 @@ func.func @linalg_effects(%a : tensor, %b : memref, %c : tensor) { // CHECK-NOT: %{{.*}} = linalg.matmul %t = linalg.matmul ins(%a, %b : tensor, memref) - outs(%c : tensor) -> tensor + outs(%c : tensor) // CHECK: linalg.matmul linalg.matmul ins(%a, %c : tensor, tensor) @@ -247,9 +247,9 @@ %c0_i32 = arith.constant 0 : i32 %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f32 - %0 = linalg.fill ins(%c0_i32 : i32) outs(%arg0 : tensor<7x7xi32>) -> tensor<7x7xi32> + %0 = linalg.fill ins(%c0_i32 : i32) outs(%arg0 : tensor<7x7xi32>) %1 = linalg.matmul ins(%arg1, %arg1: tensor<7x7xf32>, tensor<7x7xf32>) - outs(%arg1: tensor<7x7xf32>) -> tensor<7x7xf32> + outs(%arg1: tensor<7x7xf32>) %2 = linalg.generic #trait outs(%arg0 : tensor<7x7xi32>) { ^bb(%3: i32) : linalg.yield %3 : i32 @@ -270,7 +270,7 @@ %c21 = arith.constant 21 : index %c42 = arith.constant 42 : index %0 = tensor.empty(%c21, %c42) : tensor - %1 = linalg.fill ins(%arg1 : f32) outs(%0 : tensor) -> tensor + %1 = linalg.fill ins(%arg1 : f32) outs(%0 : tensor) %2 = tensor.dim %arg0, %c0 : tensor %3 = tensor.dim %arg0, %c1 : tensor %4 = tensor.insert_slice %arg0 into %1[%arg2, %arg3] [%2, %3] [1, 1] : tensor into tensor @@ -303,7 +303,7 @@ // CHECK: %[[COLLAPSE:.+]] = tensor.collapse_shape // CHECK-NEXT: %[[FILL:.+]] = linalg.fill ins(%cst : f32) // CHECK-SAME: outs(%[[COLLAPSE]] : tensor<6x4xf32>) - %fill = linalg.fill ins(%zero : f32) outs(%empty : tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> + %fill = linalg.fill ins(%zero : f32) outs(%empty : tensor<1x2x3x4xf32>) %reshape = tensor.collapse_shape %fill [[0, 1, 2], [3]] : tensor<1x2x3x4xf32> into tensor<6x4xf32> // CHECK: return %[[FILL]] : tensor<6x4xf32> @@ -317,7 +317,7 @@ func.func @fold_fill_reshape_dynamic(%arg0 : tensor) -> tensor { %zero = arith.constant 0.0 : f32 // CHECK: %[[RESHAPE:.+]] = tensor.collapse_shape %[[ARG0]] - %0 = linalg.fill ins(%zero : f32) outs(%arg0 : tensor) -> tensor + %0 = linalg.fill ins(%zero : f32) outs(%arg0 : 
tensor) // CHECK: %[[RESULT:.+]] = linalg.fill ins(%{{.+}}{{.*}}outs(%[[RESHAPE]] %1 = tensor.collapse_shape %0 [[0, 1, 2], [3, 4]] : tensor into tensor @@ -351,7 +351,7 @@ func.func @fold_static_pad_fill() -> tensor<412x276xf32> { %f0 = arith.constant 0.0 : f32 %empty = tensor.empty() : tensor<400x273xf32> - %fill = linalg.fill ins(%f0 : f32) outs(%empty : tensor<400x273xf32>) -> tensor<400x273xf32> + %fill = linalg.fill ins(%f0 : f32) outs(%empty : tensor<400x273xf32>) %pad = tensor.pad %fill low[4, 1] high[8, 2] { ^bb0(%arg1: index, %arg2: index): tensor.yield %f0 : f32 @@ -382,7 +382,7 @@ // CHECK: return %[[FILL]] func.func @fold_dynamic_pad_fill(%empty: tensor<8x?x16x32xf32>, %low0: index, %low3: index, %high2: index, %high3: index) -> tensor { %f0 = arith.constant 0.0 : f32 - %fill = linalg.fill ins(%f0 : f32) outs(%empty : tensor<8x?x16x32xf32>) -> tensor<8x?x16x32xf32> + %fill = linalg.fill ins(%f0 : f32) outs(%empty : tensor<8x?x16x32xf32>) %pad = tensor.pad %fill low[%low0, 8, 7, %low3] high[1, 2, %high2, %high3] { ^bb0(%arg1: index, %arg2: index, %arg3: index, %arg4: index): tensor.yield %f0 : f32 @@ -397,7 +397,7 @@ %f0 = arith.constant 0.0 : f32 %f1 = arith.constant 1.0 : f32 %empty = tensor.empty() : tensor<400x273xf32> - %fill = linalg.fill ins(%f0 : f32) outs(%empty : tensor<400x273xf32>) -> tensor<400x273xf32> + %fill = linalg.fill ins(%f0 : f32) outs(%empty : tensor<400x273xf32>) // CHECK: tensor.pad %pad = tensor.pad %fill low[4, 1] high[8, 2] { ^bb0(%arg1: index, %arg2: index): @@ -584,7 +584,7 @@ tensor.yield %f0 : f32 } : tensor to tensor<8x128x128xf32> %empty = tensor.empty() : tensor<8x384x384xf32> - %fill = linalg.fill ins(%f0 : f32) outs(%empty : tensor<8x384x384xf32>) -> tensor<8x384x384xf32> + %fill = linalg.fill ins(%f0 : f32) outs(%empty : tensor<8x384x384xf32>) %0 = tensor.insert_slice %pad into %fill[0, 1, 2] [8, 128, 128] [1, 1, 1] : tensor<8x128x128xf32> into tensor<8x384x384xf32> return %0: tensor<8x384x384xf32> } @@ -605,7 +605,7 @@ tensor.yield %f0 : f32 } : tensor<7x123x124xf32> to tensor<8x128x128xf32> %empty = tensor.empty() : tensor<8x384x384xf32> - %fill = linalg.fill ins(%f0 : f32) outs(%empty : tensor<8x384x384xf32>) -> tensor<8x384x384xf32> + %fill = linalg.fill ins(%f0 : f32) outs(%empty : tensor<8x384x384xf32>) %0 = tensor.insert_slice %a into %fill[%offset, 0, 0] [8, 128, 128] [1, 1, 1] : tensor<8x128x128xf32> into tensor<8x384x384xf32> %1 = tensor.insert_slice %a into %0 [0, 128, %offset][8, 128, 128] [1, 1, 1] : tensor<8x128x128xf32> into tensor<8x384x384xf32> %2 = tensor.insert_slice %pad into %1 [0, 0, 256] [8, 128, 128] [1, 1, 1] : tensor<8x128x128xf32> into tensor<8x384x384xf32> @@ -624,7 +624,7 @@ tensor.yield %f0 : f32 } : tensor<7x123x124xf32> to tensor<8x128x128xf32> %empty = tensor.empty() : tensor<8x384x384xf32> - %fill = linalg.fill ins(%f0 : f32) outs(%empty : tensor<8x384x384xf32>) -> tensor<8x384x384xf32> + %fill = linalg.fill ins(%f0 : f32) outs(%empty : tensor<8x384x384xf32>) %0 = tensor.insert_slice %a into %fill[%offset, 0, 0] [8, 128, 128] [1, 1, 1] : tensor<8x128x128xf32> into tensor<8x384x384xf32> %1 = tensor.insert_slice %a into %0 [0, 0, 129] [8, 128, 128] [1, 1, 1] : tensor<8x128x128xf32> into tensor<8x384x384xf32> // Range overlap with %1 at dim#3 @@ -644,7 +644,7 @@ tensor.yield %f0 : f32 } : tensor<7x123x124xf32> to tensor<8x128x128xf32> %empty = tensor.empty() : tensor<8x384x384xf32> - %fill = linalg.fill ins(%f0 : f32) outs(%empty : tensor<8x384x384xf32>) -> tensor<8x384x384xf32> + %fill = linalg.fill ins(%f0 : 
f32) outs(%empty : tensor<8x384x384xf32>) %0 = tensor.insert_slice %a into %fill[0, 0, %offset] [8, 128, 128] [1, 1, 1] : tensor<8x128x128xf32> into tensor<8x384x384xf32> %1 = tensor.insert_slice %a into %0 [0, 128, 255] [8, 128, 128] [1, 1, 1] : tensor<8x128x128xf32> into tensor<8x384x384xf32> // Range overlap with %0 at dim#3 @@ -664,7 +664,7 @@ tensor.yield %f0 : f32 } : tensor<7x123x124xf32> to tensor<8x128x128xf32> %empty = tensor.empty() : tensor<8x384x384xf32> - %fill = linalg.fill ins(%f0 : f32) outs(%empty : tensor<8x384x384xf32>) -> tensor<8x384x384xf32> + %fill = linalg.fill ins(%f0 : f32) outs(%empty : tensor<8x384x384xf32>) // Overlap btween %0 and %1 is fine but not with %2 is fine. // CHECK-COUNT-3: tensor.insert_slice %0 = tensor.insert_slice %a into %fill[0, 0, %offset] [8, 128, 128] [1, 1, 1] : tensor<8x128x128xf32> into tensor<8x384x384xf32> @@ -687,7 +687,7 @@ } : tensor<7x123x124xf32> to tensor<8x128x128xf32> %empty = tensor.empty() : tensor<8x384x384xf32> // Different filling value than padding value. - %fill = linalg.fill ins(%f1 : f32) outs(%empty : tensor<8x384x384xf32>) -> tensor<8x384x384xf32> + %fill = linalg.fill ins(%f1 : f32) outs(%empty : tensor<8x384x384xf32>) %0 = tensor.insert_slice %a into %fill[%offset, 0, 0] [8, 128, 128] [1, 1, 1] : tensor<8x128x128xf32> into tensor<8x384x384xf32> %1 = tensor.insert_slice %a into %0 [0, 128, %offset][8, 128, 128] [1, 1, 1] : tensor<8x128x128xf32> into tensor<8x384x384xf32> %2 = tensor.insert_slice %pad into %1 [0, 0, 256] [8, 128, 128] [1, 1, 1] : tensor<8x128x128xf32> into tensor<8x384x384xf32> @@ -699,7 +699,7 @@ func.func @fold_linalgop_with_cast_consumer(%arg0 : tensor, %arg1 : tensor, %arg2 : tensor) -> (tensor<4x8xf32>, tensor) { %0 = linalg.matmul ins(%arg0, %arg1 : tensor, tensor) - outs(%arg2 : tensor) -> tensor + outs(%arg2 : tensor) %1 = tensor.cast %0 : tensor to tensor<4x8xf32> return %1, %0 : tensor<4x8xf32>, tensor } @@ -723,7 +723,7 @@ func.func @linalgop_with_cond_cast_consumer(%arg0 : tensor, %arg1 : tensor, %arg2 : tensor, %arg3 : i1) -> tensor { %0 = linalg.matmul ins(%arg0, %arg1 : tensor, tensor) - outs(%arg2 : tensor) -> tensor + outs(%arg2 : tensor) scf.if %arg3 { %1 = tensor.cast %0 : tensor to tensor<4x8xf32> func.call @some_use(%1) : (tensor<4x8xf32>) -> () @@ -735,7 +735,7 @@ // CHECK-LABEL: func @linalgop_with_cond_cast_consumer // CHECK-SAME: (%[[ARG0:.*]]: tensor, %[[ARG1:.*]]: tensor, %[[ARG2:.*]]: tensor, %[[ARG3:.*]]: i1) // CHECK: %[[RES:.*]] = linalg.matmul ins(%[[ARG0]], %[[ARG1]] : tensor, tensor) -// CHECK-SAME: outs(%[[ARG2]] : tensor) -> tensor +// CHECK-SAME: outs(%[[ARG2]] : tensor) // CHECK: scf.if %[[ARG3]] { // CHECK: %[[CAST:.*]] = tensor.cast %[[RES]] : tensor to tensor<4x8xf32> // CHECK: func.call @some_use(%[[CAST]]) : (tensor<4x8xf32>) -> () @@ -749,7 +749,7 @@ %arg1 : tensor, %arg2 : tensor) -> (tensor<4x8x12x16xf32>, tensor) { %0 = linalg.conv_2d_nchw_fchw ins(%arg0, %arg1 : tensor, tensor) - outs(%arg2 : tensor) -> tensor + outs(%arg2 : tensor) %1 = tensor.cast %0 : tensor to tensor<4x8x12x16xf32> return %1, %0 : tensor<4x8x12x16xf32>, tensor } diff --git a/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir b/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir --- a/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir +++ b/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir @@ -313,7 +313,7 @@ func.func @fold_unit_dim_for_empty_tensor(%input: tensor<1x1000xf32>) -> tensor<1xf32> { %cst = arith.constant 0.0 : f32 %init = tensor.empty() : tensor<1xf32> - %fill = 
linalg.fill ins(%cst : f32) outs(%init : tensor<1xf32>) -> tensor<1xf32> + %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1xf32>) %add = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} @@ -334,7 +334,7 @@ // CHECK: %[[INPUT_RESHAPE:.+]] = tensor.collapse_shape %{{.+}} {{\[}}[0, 1]] : tensor<1x1000xf32> into tensor<1000xf32> // CHECK: %[[INIT:.+]] = tensor.empty() : tensor -// CHECK: %[[FILL:.+]] = linalg.fill ins(%cst : f32) outs(%[[INIT]] : tensor) -> tensor +// CHECK: %[[FILL:.+]] = linalg.fill ins(%cst : f32) outs(%[[INIT]] : tensor) // CHECK: %[[GENERIC:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[MAP1]], #[[MAP2]]] // CHECK-SAME: iterator_types = ["reduction"] @@ -378,7 +378,7 @@ %c3 = arith.constant 3 : index %0 = tensor.dim %arg0, %c3 : tensor<1x?x1x?xf32> %1 = tensor.empty(%0) : tensor<1x?xf32> - %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<1x?xf32>) -> tensor<1x?xf32> + %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<1x?xf32>) %3 = linalg.generic { indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], @@ -412,7 +412,7 @@ %cst = arith.constant 1.000000e+00 : f32 %c3 = arith.constant 3 : index %1 = tensor.empty() : tensor<1x1xf32> - %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<1x1xf32>) -> tensor<1x1xf32> + %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<1x1xf32>) %3 = linalg.generic { indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], @@ -447,7 +447,7 @@ %c2 = arith.constant 2 : index %0 = tensor.dim %arg0, %c2 : tensor %1 = tensor.empty(%0) : tensor - %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor) -> tensor + %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor) %3 = linalg.generic { indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], @@ -882,7 +882,7 @@ %0 = tensor.empty() : tensor<4x2xf32> %res = scf.foreach_thread (%arg0, %arg1) in (%c4, %c2) shared_outs(%o = %0) -> (tensor<4x2xf32>) { %1 = tensor.empty() : tensor<1x1xf32> - %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<1x1xf32>) -> tensor<1x1xf32> + %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<1x1xf32>) scf.foreach_thread.perform_concurrently { // CHECK: tensor.parallel_insert_slice %{{[0-9a-z]*}} into %{{[0-9a-z]*}} // CHECK-SAME: [%{{.*}}, %{{.*}}] [1, 1] [1, 1] : tensor into tensor<4x2xf32> diff --git a/mlir/test/Dialect/Linalg/erase-unused-operands-and-results.mlir b/mlir/test/Dialect/Linalg/erase-unused-operands-and-results.mlir --- a/mlir/test/Dialect/Linalg/erase-unused-operands-and-results.mlir +++ b/mlir/test/Dialect/Linalg/erase-unused-operands-and-results.mlir @@ -214,7 +214,7 @@ func.func @loop_dim_operand(%arg0 : tensor) -> tensor { %cst = arith.constant 0 : i32 %init = tensor.empty() : tensor - %fill = linalg.fill ins(%cst : i32) outs(%init : tensor) -> tensor + %fill = linalg.fill ins(%cst : i32) outs(%init : tensor) %0 = linalg.generic { indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} @@ -239,7 +239,7 @@ %cst = arith.constant 0 : i32 %init1 = tensor.empty(%arg0) : tensor %init = tensor.empty() : tensor - %fill = linalg.fill ins(%cst : i32) outs(%init : tensor) -> tensor + %fill = linalg.fill ins(%cst : i32) outs(%init : tensor) %0:2 = linalg.generic { indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["parallel"]} diff 
--git a/mlir/test/Dialect/Linalg/fusion-elementwise-ops.mlir b/mlir/test/Dialect/Linalg/fusion-elementwise-ops.mlir --- a/mlir/test/Dialect/Linalg/fusion-elementwise-ops.mlir +++ b/mlir/test/Dialect/Linalg/fusion-elementwise-ops.mlir @@ -934,7 +934,7 @@ linalg.yield %arg2 : f32 } -> tensor %6 = tensor.empty(%arg1) : tensor - %7 = linalg.fill ins(%cst : f32) outs(%6 : tensor) -> tensor + %7 = linalg.fill ins(%cst : f32) outs(%6 : tensor) %8 = linalg.generic { indexing_maps = [#map2, #map3], iterator_types = ["parallel", "reduction"] @@ -1005,7 +1005,7 @@ %cst = arith.constant 7.0 : f32 %0 = tensor.dim %arg0, %c0 : tensor %1 = tensor.empty(%0) : tensor - %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor) -> tensor + %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor) %3 = tensor.empty(%0) : tensor %4 = linalg.generic {indexing_maps = [#map0, #map0, #map0], iterator_types=["parallel"]} ins(%arg0, %2 : tensor, tensor) outs (%3:tensor) { ^bb0(%arg1: f32, %arg2: f32, %arg3: f32): @@ -1029,7 +1029,7 @@ %cst = arith.constant 7.0 : f32 %0 = tensor.dim %arg0, %c0 : tensor %1 = tensor.empty(%0) : tensor - %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor) -> tensor + %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor) %3 = tensor.empty(%0) : tensor %4 = linalg.generic {indexing_maps = [#map0, #map0, #map0], iterator_types=["parallel"]} ins(%arg0, %2 : tensor, tensor) outs (%3:tensor) { ^bb0(%arg1: f16, %arg2: f16, %arg3: f16): @@ -1056,9 +1056,9 @@ %0 = tensor.dim %arg0, %c0 : tensor %1 = tensor.dim %arg0, %c1 : tensor %2 = tensor.empty(%0, %1) : tensor - %3 = linalg.fill ins(%cst1 : f32) outs(%2 : tensor) -> tensor + %3 = linalg.fill ins(%cst1 : f32) outs(%2 : tensor) %4 = tensor.empty(%1, %0) : tensor - %5 = linalg.fill ins(%cst2 : f32) outs(%4 : tensor) -> tensor + %5 = linalg.fill ins(%cst2 : f32) outs(%4 : tensor) %6 = tensor.empty(%0, %1) : tensor %7 = linalg.generic {indexing_maps = [#map0, #map1, #map0], iterator_types=["parallel","parallel"]} ins(%3, %5 : tensor, tensor) outs (%6:tensor) { ^bb0(%arg1: f32, %arg2: f32, %arg3: f32): diff --git a/mlir/test/Dialect/Linalg/generalize-named-ops.mlir b/mlir/test/Dialect/Linalg/generalize-named-ops.mlir --- a/mlir/test/Dialect/Linalg/generalize-named-ops.mlir +++ b/mlir/test/Dialect/Linalg/generalize-named-ops.mlir @@ -31,7 +31,7 @@ func.func @generalize_matmul_tensor(%A : tensor<16x8xf32>, %B: tensor<8x32xf32>, %C: tensor<16x32xf32>) -> tensor<16x32xf32> { %0 = linalg.matmul ins(%A, %B: tensor<16x8xf32>, tensor<8x32xf32>) - outs(%C: tensor<16x32xf32>) -> tensor<16x32xf32> + outs(%C: tensor<16x32xf32>) return %0: tensor<16x32xf32> } @@ -54,7 +54,7 @@ %C: tensor<16x32xcomplex>) -> tensor<16x32xcomplex> { %0 = linalg.matmul ins(%A, %B: tensor<16x8xcomplex>, tensor<8x32xcomplex>) - outs(%C: tensor<16x32xcomplex>) -> tensor<16x32xcomplex> + outs(%C: tensor<16x32xcomplex>) return %0: tensor<16x32xcomplex> } diff --git a/mlir/test/Dialect/Linalg/generalize-named-polymorphic-ops.mlir b/mlir/test/Dialect/Linalg/generalize-named-polymorphic-ops.mlir --- a/mlir/test/Dialect/Linalg/generalize-named-polymorphic-ops.mlir +++ b/mlir/test/Dialect/Linalg/generalize-named-polymorphic-ops.mlir @@ -3,7 +3,7 @@ // Verifies that different argument types is legal. 
 func.func @generalize_matmul_tensor_f16f64f32(%A : tensor<16x8xf16>, %B: tensor<8x32xf64>, %C: tensor<16x32xf32>) -> tensor<16x32xf32> {
   %0 = linalg.matmul ins(%A, %B: tensor<16x8xf16>, tensor<8x32xf64>)
-    outs(%C: tensor<16x32xf32>) -> tensor<16x32xf32>
+    outs(%C: tensor<16x32xf32>)
   return %0: tensor<16x32xf32>
 }
@@ -22,7 +22,7 @@
 // Verifies that different argument types is legal.
 func.func @generalize_matmul_tensor_i16i64i32(%A : tensor<16x8xi16>, %B: tensor<8x32xi64>, %C: tensor<16x32xi32>) -> tensor<16x32xi32> {
   %0 = linalg.matmul ins(%A, %B: tensor<16x8xi16>, tensor<8x32xi64>)
-    outs(%C: tensor<16x32xi32>) -> tensor<16x32xi32>
+    outs(%C: tensor<16x32xi32>)
   return %0: tensor<16x32xi32>
 }
@@ -43,7 +43,7 @@
 func.func @generalize_matmul_tensor_i16i64i32_unsigned(%A : tensor<16x8xi16>, %B: tensor<8x32xi64>, %C: tensor<16x32xi32>) -> tensor<16x32xi32> {
   %0 = linalg.matmul {cast = #linalg.type_fn<cast_unsigned>} ins(%A, %B: tensor<16x8xi16>, tensor<8x32xi64>)
-    outs(%C: tensor<16x32xi32>) -> tensor<16x32xi32>
+    outs(%C: tensor<16x32xi32>)
   return %0: tensor<16x32xi32>
 }
@@ -54,7 +54,7 @@
 func.func @generalize_matmul_tensor_i16i64f32(%A : tensor<16x8xi16>, %B: tensor<8x32xi64>, %C: tensor<16x32xf32>) -> tensor<16x32xf32> {
   %0 = linalg.matmul ins(%A, %B: tensor<16x8xi16>, tensor<8x32xi64>)
-    outs(%C: tensor<16x32xf32>) -> tensor<16x32xf32>
+    outs(%C: tensor<16x32xf32>)
   return %0: tensor<16x32xf32>
 }
@@ -67,7 +67,7 @@
 func.func @generalize_matmul_tensor_f16f64i32(%A : tensor<16x8xf16>, %B: tensor<8x32xf64>, %C: tensor<16x32xi32>) -> tensor<16x32xi32> {
   %0 = linalg.matmul ins(%A, %B: tensor<16x8xf16>, tensor<8x32xf64>)
-    outs(%C: tensor<16x32xi32>) -> tensor<16x32xi32>
+    outs(%C: tensor<16x32xi32>)
   return %0: tensor<16x32xi32>
 }
@@ -80,7 +80,7 @@
 func.func @generalize_matmul_unsigned_tensor_i16i64i32(%A : tensor<16x8xi16>, %B: tensor<8x32xi64>, %C: tensor<16x32xi32>) -> tensor<16x32xi32> {
   %0 = linalg.matmul_unsigned ins(%A, %B: tensor<16x8xi16>, tensor<8x32xi64>)
-    outs(%C: tensor<16x32xi32>) -> tensor<16x32xi32>
+    outs(%C: tensor<16x32xi32>)
   return %0: tensor<16x32xi32>
 }
@@ -93,7 +93,7 @@
 func.func @generalize_matmul_unsigned_tensor_i16i64f32(%A : tensor<16x8xi16>, %B: tensor<8x32xi64>, %C: tensor<16x32xf32>) -> tensor<16x32xf32> {
   %0 = linalg.matmul_unsigned ins(%A, %B: tensor<16x8xi16>, tensor<8x32xi64>)
-    outs(%C: tensor<16x32xf32>) -> tensor<16x32xf32>
+    outs(%C: tensor<16x32xf32>)
   return %0: tensor<16x32xf32>
 }
@@ -106,7 +106,7 @@
 func.func @generalize_matmul_unsigned_tensor_f16f64i32(%A : tensor<16x8xf16>, %B: tensor<8x32xf64>, %C: tensor<16x32xi32>) -> tensor<16x32xi32> {
   %0 = linalg.matmul_unsigned ins(%A, %B: tensor<16x8xf16>, tensor<8x32xf64>)
-    outs(%C: tensor<16x32xi32>) -> tensor<16x32xi32>
+    outs(%C: tensor<16x32xi32>)
   return %0: tensor<16x32xi32>
 }
@@ -119,7 +119,7 @@
 func.func @generalize_pooling_nhwc_max_f32(%input : tensor<1x4x16x1xf32>, %shape: tensor<2x2xf32>, %output: tensor<1x2x4x1xf32>) -> tensor<1x2x4x1xf32> {
   %0 = linalg.pooling_nhwc_max {dilations = dense<[1, 2]> : tensor<2xi64>, strides = dense<[2, 4]> : tensor<2xi64>}
-    ins(%input, %shape : tensor<1x4x16x1xf32>, tensor<2x2xf32>) outs(%output : tensor<1x2x4x1xf32>) -> tensor<1x2x4x1xf32>
+    ins(%input, %shape : tensor<1x4x16x1xf32>, tensor<2x2xf32>) outs(%output : tensor<1x2x4x1xf32>)
   return %0: tensor<1x2x4x1xf32>
 }
@@ -133,7 +133,7 @@
 func.func @generalize_pooling_nwc_max_f32(%input : tensor<1x16x1xf32>, %shape: tensor<2xf32>, %output: tensor<1x4x1xf32>) -> tensor<1x4x1xf32> {
   %0 = linalg.pooling_nwc_max {dilations = dense<[2]> : tensor<1xi64>, strides = dense<[4]> : tensor<1xi64>}
-    ins(%input, %shape : tensor<1x16x1xf32>, tensor<2xf32>) outs(%output : tensor<1x4x1xf32>) -> tensor<1x4x1xf32>
+    ins(%input, %shape : tensor<1x16x1xf32>, tensor<2xf32>) outs(%output : tensor<1x4x1xf32>)
   return %0: tensor<1x4x1xf32>
 }
@@ -147,7 +147,7 @@
 func.func @generalize_pooling_nhwc_max_i32(%input : tensor<1x4x16x1xi32>, %shape: tensor<2x2xi32>, %output: tensor<1x2x4x1xi32>) -> tensor<1x2x4x1xi32> {
   %0 = linalg.pooling_nhwc_max {dilations = dense<[1, 2]> : tensor<2xi64>, strides = dense<[2, 4]> : tensor<2xi64>}
-    ins(%input, %shape : tensor<1x4x16x1xi32>, tensor<2x2xi32>) outs(%output : tensor<1x2x4x1xi32>) -> tensor<1x2x4x1xi32>
+    ins(%input, %shape : tensor<1x4x16x1xi32>, tensor<2x2xi32>) outs(%output : tensor<1x2x4x1xi32>)
   return %0: tensor<1x2x4x1xi32>
 }
@@ -159,7 +159,7 @@
 func.func @generalize_pooling_nwc_max_i32(%input : tensor<1x16x1xi32>, %shape: tensor<2xi32>, %output: tensor<1x4x1xi32>) -> tensor<1x4x1xi32> {
   %0 = linalg.pooling_nwc_max {dilations = dense<[2]> : tensor<1xi64>, strides = dense<[4]> : tensor<1xi64>}
-    ins(%input, %shape : tensor<1x16x1xi32>, tensor<2xi32>) outs(%output : tensor<1x4x1xi32>) -> tensor<1x4x1xi32>
+    ins(%input, %shape : tensor<1x16x1xi32>, tensor<2xi32>) outs(%output : tensor<1x4x1xi32>)
   return %0: tensor<1x4x1xi32>
 }
@@ -171,7 +171,7 @@
 func.func @generalize_pooling_nhwc_max_unsigned_i32(%input : tensor<1x4x16x1xi32>, %shape: tensor<2x2xi32>, %output: tensor<1x2x4x1xi32>) -> tensor<1x2x4x1xi32> {
   %0 = linalg.pooling_nhwc_max_unsigned {dilations = dense<[1, 2]> : tensor<2xi64>, strides = dense<[2, 4]> : tensor<2xi64>}
-    ins(%input, %shape : tensor<1x4x16x1xi32>, tensor<2x2xi32>) outs(%output : tensor<1x2x4x1xi32>) -> tensor<1x2x4x1xi32>
+    ins(%input, %shape : tensor<1x4x16x1xi32>, tensor<2x2xi32>) outs(%output : tensor<1x2x4x1xi32>)
   return %0: tensor<1x2x4x1xi32>
 }
@@ -183,7 +183,7 @@
 func.func @generalize_pooling_nwc_max_unsigned_i32(%input : tensor<1x16x1xi32>, %shape: tensor<2xi32>, %output: tensor<1x4x1xi32>) -> tensor<1x4x1xi32> {
   %0 = linalg.pooling_nwc_max_unsigned {dilations = dense<[2]> : tensor<1xi64>, strides = dense<[4]> : tensor<1xi64>}
-    ins(%input, %shape : tensor<1x16x1xi32>, tensor<2xi32>) outs(%output : tensor<1x4x1xi32>) -> tensor<1x4x1xi32>
+    ins(%input, %shape : tensor<1x16x1xi32>, tensor<2xi32>) outs(%output : tensor<1x4x1xi32>)
   return %0: tensor<1x4x1xi32>
 }
@@ -195,7 +195,7 @@
 func.func @generalize_pooling_nhwc_min_f32(%input : tensor<1x4x16x1xf32>, %shape: tensor<2x2xf32>, %output: tensor<1x2x4x1xf32>) -> tensor<1x2x4x1xf32> {
   %0 = linalg.pooling_nhwc_min {dilations = dense<[1, 2]> : tensor<2xi64>, strides = dense<[2, 4]> : tensor<2xi64>}
-    ins(%input, %shape : tensor<1x4x16x1xf32>, tensor<2x2xf32>) outs(%output : tensor<1x2x4x1xf32>) -> tensor<1x2x4x1xf32>
+    ins(%input, %shape : tensor<1x4x16x1xf32>, tensor<2x2xf32>) outs(%output : tensor<1x2x4x1xf32>)
   return %0: tensor<1x2x4x1xf32>
 }
@@ -209,7 +209,7 @@
 func.func @generalize_pooling_nwc_min_f32(%input : tensor<1x16x1xf32>, %shape: tensor<2xf32>, %output: tensor<1x4x1xf32>) -> tensor<1x4x1xf32> {
   %0 = linalg.pooling_nwc_min {dilations = dense<[2]> : tensor<1xi64>, strides = dense<[4]> : tensor<1xi64>}
-    ins(%input, %shape : tensor<1x16x1xf32>, tensor<2xf32>) outs(%output : tensor<1x4x1xf32>) -> tensor<1x4x1xf32>
+    ins(%input, %shape : tensor<1x16x1xf32>, tensor<2xf32>) outs(%output : tensor<1x4x1xf32>)
   return %0: tensor<1x4x1xf32>
 }
@@ -223,7 +223,7 @@
 func.func @generalize_pooling_nhwc_min_i32(%input : tensor<1x4x16x1xi32>, %shape: tensor<2x2xi32>, %output: tensor<1x2x4x1xi32>) -> tensor<1x2x4x1xi32> {
   %0 = linalg.pooling_nhwc_min {dilations = dense<[1, 2]> : tensor<2xi64>, strides = dense<[2, 4]> : tensor<2xi64>}
-    ins(%input, %shape : tensor<1x4x16x1xi32>, tensor<2x2xi32>) outs(%output : tensor<1x2x4x1xi32>) -> tensor<1x2x4x1xi32>
+    ins(%input, %shape : tensor<1x4x16x1xi32>, tensor<2x2xi32>) outs(%output : tensor<1x2x4x1xi32>)
   return %0: tensor<1x2x4x1xi32>
 }
@@ -235,7 +235,7 @@
 func.func @generalize_pooling_nwc_min_i32(%input : tensor<1x16x1xi32>, %shape: tensor<2xi32>, %output: tensor<1x4x1xi32>) -> tensor<1x4x1xi32> {
   %0 = linalg.pooling_nwc_min {dilations = dense<[2]> : tensor<1xi64>, strides = dense<[4]> : tensor<1xi64>}
-    ins(%input, %shape : tensor<1x16x1xi32>, tensor<2xi32>) outs(%output : tensor<1x4x1xi32>) -> tensor<1x4x1xi32>
+    ins(%input, %shape : tensor<1x16x1xi32>, tensor<2xi32>) outs(%output : tensor<1x4x1xi32>)
   return %0: tensor<1x4x1xi32>
 }
@@ -247,7 +247,7 @@
 func.func @generalize_pooling_nhwc_min_unsigned_i32(%input : tensor<1x4x16x1xi32>, %shape: tensor<2x2xi32>, %output: tensor<1x2x4x1xi32>) -> tensor<1x2x4x1xi32> {
   %0 = linalg.pooling_nhwc_min_unsigned {dilations = dense<[1, 2]> : tensor<2xi64>, strides = dense<[2, 4]> : tensor<2xi64>}
-    ins(%input, %shape : tensor<1x4x16x1xi32>, tensor<2x2xi32>) outs(%output : tensor<1x2x4x1xi32>) -> tensor<1x2x4x1xi32>
+    ins(%input, %shape : tensor<1x4x16x1xi32>, tensor<2x2xi32>) outs(%output : tensor<1x2x4x1xi32>)
   return %0: tensor<1x2x4x1xi32>
 }
@@ -259,7 +259,7 @@
 func.func @generalize_pooling_nwc_min_unsigned_i32(%input : tensor<1x16x1xi32>, %shape: tensor<2xi32>, %output: tensor<1x4x1xi32>) -> tensor<1x4x1xi32> {
   %0 = linalg.pooling_nwc_min_unsigned {dilations = dense<[2]> : tensor<1xi64>, strides = dense<[4]> : tensor<1xi64>}
-    ins(%input, %shape : tensor<1x16x1xi32>, tensor<2xi32>) outs(%output : tensor<1x4x1xi32>) -> tensor<1x4x1xi32>
+    ins(%input, %shape : tensor<1x16x1xi32>, tensor<2xi32>) outs(%output : tensor<1x4x1xi32>)
   return %0: tensor<1x4x1xi32>
 }
@@ -271,7 +271,7 @@
 func.func @generalize_pooling_nhwc_sum_f32(%input : tensor<1x4x16x1xf32>, %shape: tensor<2x2xf32>, %output: tensor<1x2x4x1xf32>) -> tensor<1x2x4x1xf32> {
   %0 = linalg.pooling_nhwc_sum {dilations = dense<[1, 2]> : tensor<2xi64>, strides = dense<[2, 4]> : tensor<2xi64>}
-    ins(%input, %shape : tensor<1x4x16x1xf32>, tensor<2x2xf32>) outs(%output : tensor<1x2x4x1xf32>) -> tensor<1x2x4x1xf32>
+    ins(%input, %shape : tensor<1x4x16x1xf32>, tensor<2x2xf32>) outs(%output : tensor<1x2x4x1xf32>)
   return %0: tensor<1x2x4x1xf32>
 }
@@ -285,7 +285,7 @@
 func.func @generalize_pooling_nwc_sum_f32(%input : tensor<1x16x1xf32>, %shape: tensor<2xf32>, %output: tensor<1x4x1xf32>) -> tensor<1x4x1xf32> {
   %0 = linalg.pooling_nwc_sum {dilations = dense<[2]> : tensor<1xi64>, strides = dense<[4]> : tensor<1xi64>}
-    ins(%input, %shape : tensor<1x16x1xf32>, tensor<2xf32>) outs(%output : tensor<1x4x1xf32>) -> tensor<1x4x1xf32>
+    ins(%input, %shape : tensor<1x16x1xf32>, tensor<2xf32>) outs(%output : tensor<1x4x1xf32>)
   return %0: tensor<1x4x1xf32>
 }
@@ -299,7 +299,7 @@
 func.func @generalize_pooling_nhwc_sum_i32(%input : tensor<1x4x16x1xi32>, %shape: tensor<2x2xi32>, %output: tensor<1x2x4x1xi32>) -> tensor<1x2x4x1xi32> {
   %0 = linalg.pooling_nhwc_sum {dilations = dense<[1, 2]> : tensor<2xi64>, strides = dense<[2, 4]> : tensor<2xi64>}
-    ins(%input, %shape : tensor<1x4x16x1xi32>, tensor<2x2xi32>) outs(%output : tensor<1x2x4x1xi32>) -> tensor<1x2x4x1xi32>
+    ins(%input, %shape : tensor<1x4x16x1xi32>, tensor<2x2xi32>) outs(%output : tensor<1x2x4x1xi32>)
   return %0: tensor<1x2x4x1xi32>
 }
@@ -313,7 +313,7 @@
 func.func @generalize_pooling_nwc_sum_i32(%input : tensor<1x16x1xi32>, %shape: tensor<2xi32>, %output: tensor<1x4x1xi32>) -> tensor<1x4x1xi32> {
   %0 = linalg.pooling_nwc_sum {dilations = dense<[2]> : tensor<1xi64>, strides = dense<[4]> : tensor<1xi64>}
-    ins(%input, %shape : tensor<1x16x1xi32>, tensor<2xi32>) outs(%output : tensor<1x4x1xi32>) -> tensor<1x4x1xi32>
+    ins(%input, %shape : tensor<1x16x1xi32>, tensor<2xi32>) outs(%output : tensor<1x4x1xi32>)
   return %0: tensor<1x4x1xi32>
 }
@@ -326,7 +326,7 @@
 // -----

 func.func @generalize_fill_0d(%value: f64, %O: tensor<f64>) -> tensor<f64> {
-  %0 = linalg.fill ins(%value: f64) outs(%O : tensor<f64>) -> tensor<f64>
+  %0 = linalg.fill ins(%value: f64) outs(%O : tensor<f64>)
   return %0: tensor<f64>
 }
@@ -355,7 +355,7 @@
 // -----

 func.func @generalize_index(%min: f64, %max: f64, %seed: i32, %O: tensor<16x32xf32>) -> tensor<16x32xf32> {
-  %0 = linalg.fill_rng_2d ins(%min, %max, %seed: f64, f64, i32) outs(%O : tensor<16x32xf32>) -> tensor<16x32xf32>
+  %0 = linalg.fill_rng_2d ins(%min, %max, %seed: f64, f64, i32) outs(%O : tensor<16x32xf32>)
   return %0: tensor<16x32xf32>
 }
@@ -368,7 +368,7 @@
 // -----

 func.func @generalize_const(%min: f64, %max: f64, %seed: i32, %O: tensor<16x32xf32>) -> tensor<16x32xf32> {
-  %0 = linalg.fill_rng_2d ins(%min, %max, %seed: f64, f64, i32) outs(%O : tensor<16x32xf32>) -> tensor<16x32xf32>
+  %0 = linalg.fill_rng_2d ins(%min, %max, %seed: f64, f64, i32) outs(%O : tensor<16x32xf32>)
   return %0: tensor<16x32xf32>
 }
@@ -381,7 +381,7 @@
 // Verifies the default value of the fun attribute is an exp op.
 func.func @generalize_elemwise_exp(%lhs : tensor<4x8xf32>, %output : tensor<4x8xf32>) -> tensor<4x8xf32> {
-  %0 = linalg.elemwise_unary ins(%lhs: tensor<4x8xf32>) outs(%output: tensor<4x8xf32>) -> tensor<4x8xf32>
+  %0 = linalg.elemwise_unary ins(%lhs: tensor<4x8xf32>) outs(%output: tensor<4x8xf32>)
   return %0: tensor<4x8xf32>
 }
@@ -393,7 +393,7 @@
 // Verifies the fun attribute controls the unary function used.
 func.func @generalize_elemwise_log(%lhs : tensor<4x8xf32>, %output : tensor<4x8xf32>) -> tensor<4x8xf32> {
   %0 = linalg.elemwise_unary {fun = #linalg.unary_fn<log>}
-    ins(%lhs: tensor<4x8xf32>) outs(%output: tensor<4x8xf32>) -> tensor<4x8xf32>
+    ins(%lhs: tensor<4x8xf32>) outs(%output: tensor<4x8xf32>)
   return %0: tensor<4x8xf32>
 }
@@ -405,7 +405,7 @@
 // Verifies the fun attribute controls the unary function used.
 func.func @generalize_elemwise_abs(%lhs : tensor<4x8xf32>, %output : tensor<4x8xf32>) -> tensor<4x8xf32> {
   %0 = linalg.elemwise_unary {fun = #linalg.unary_fn<abs>}
-    ins(%lhs: tensor<4x8xf32>) outs(%output: tensor<4x8xf32>) -> tensor<4x8xf32>
+    ins(%lhs: tensor<4x8xf32>) outs(%output: tensor<4x8xf32>)
   return %0: tensor<4x8xf32>
 }
@@ -417,7 +417,7 @@
 // Verifies the fun attribute controls the unary function used.
 func.func @generalize_elemwise_ceil(%lhs : tensor<4x8xf32>, %output : tensor<4x8xf32>) -> tensor<4x8xf32> {
   %0 = linalg.elemwise_unary {fun = #linalg.unary_fn<ceil>}
-    ins(%lhs: tensor<4x8xf32>) outs(%output: tensor<4x8xf32>) -> tensor<4x8xf32>
+    ins(%lhs: tensor<4x8xf32>) outs(%output: tensor<4x8xf32>)
   return %0: tensor<4x8xf32>
 }
@@ -429,7 +429,7 @@
 // Verifies the fun attribute controls the unary function used.
 func.func @generalize_elemwise_floor(%lhs : tensor<4x8xf32>, %output : tensor<4x8xf32>) -> tensor<4x8xf32> {
   %0 = linalg.elemwise_unary {fun = #linalg.unary_fn<floor>}
-    ins(%lhs: tensor<4x8xf32>) outs(%output: tensor<4x8xf32>) -> tensor<4x8xf32>
+    ins(%lhs: tensor<4x8xf32>) outs(%output: tensor<4x8xf32>)
   return %0: tensor<4x8xf32>
 }
@@ -441,7 +441,7 @@
 // Verifies the fun attribute controls the unary function used.
 func.func @generalize_elemwise_negf(%lhs : tensor<4x8xf32>, %output : tensor<4x8xf32>) -> tensor<4x8xf32> {
   %0 = linalg.elemwise_unary {fun = #linalg.unary_fn<negf>}
-    ins(%lhs: tensor<4x8xf32>) outs(%output: tensor<4x8xf32>) -> tensor<4x8xf32>
+    ins(%lhs: tensor<4x8xf32>) outs(%output: tensor<4x8xf32>)
   return %0: tensor<4x8xf32>
 }
@@ -453,7 +453,7 @@
 // Verifies the default value of the fun attribute is an add op.
 func.func @generalize_elemwise_add(%lhs : tensor<4x8xf32>, %rhs : tensor<4x8xf32>, %output : tensor<4x8xf32>) -> tensor<4x8xf32> {
   %0 = linalg.elemwise_binary ins(%lhs, %rhs: tensor<4x8xf32>, tensor<4x8xf32>)
-    outs(%output: tensor<4x8xf32>) -> tensor<4x8xf32>
+    outs(%output: tensor<4x8xf32>)
   return %0: tensor<4x8xf32>
 }
@@ -466,7 +466,7 @@
 func.func @generalize_elemwise_mul(%lhs : tensor<4x8xf32>, %rhs : tensor<4x8xf32>, %output : tensor<4x8xf32>) -> tensor<4x8xf32> {
   %0 = linalg.elemwise_binary {fun = #linalg.binary_fn<mul>} ins(%lhs, %rhs: tensor<4x8xf32>, tensor<4x8xf32>)
-    outs(%output: tensor<4x8xf32>) -> tensor<4x8xf32>
+    outs(%output: tensor<4x8xf32>)
   return %0: tensor<4x8xf32>
 }
@@ -479,7 +479,7 @@
 func.func @generalize_elemwise_rank_zero(%lhs : tensor<f32>, %rhs : tensor<f32>, %output : tensor<4x8xf32>) -> tensor<4x8xf32> {
   %0 = linalg.elemwise_binary {fun = #linalg.binary_fn} ins(%lhs, %rhs: tensor<f32>, tensor<f32>)
-    outs(%output: tensor<4x8xf32>) -> tensor<4x8xf32>
+    outs(%output: tensor<4x8xf32>)
   return %0: tensor<4x8xf32>
 }
@@ -492,7 +492,7 @@
 // Verifies the fun attribute controls the binary function used.
 func.func @generalize_copy(%lhs : tensor<4x8xf32>, %output : tensor<4x8xf32>) -> tensor<4x8xf32> {
-  %0 = linalg.copy ins(%lhs: tensor<4x8xf32>) outs(%output: tensor<4x8xf32>) -> tensor<4x8xf32>
+  %0 = linalg.copy ins(%lhs: tensor<4x8xf32>) outs(%output: tensor<4x8xf32>)
   return %0: tensor<4x8xf32>
 }
diff --git a/mlir/test/Dialect/Linalg/generalize-pad-tensor.mlir b/mlir/test/Dialect/Linalg/generalize-pad-tensor.mlir
--- a/mlir/test/Dialect/Linalg/generalize-pad-tensor.mlir
+++ b/mlir/test/Dialect/Linalg/generalize-pad-tensor.mlir
@@ -4,7 +4,7 @@
 // CHECK-SAME: %[[IN:.*]]: tensor<1x28x28x1xf32>) -> tensor<1x32x32x1xf32> {
 // CHECK: %[[C0:.*]] = arith.constant 0.000000e+00 : f32
 // CHECK: %[[INIT:.*]] = tensor.empty() : tensor<1x32x32x1xf32>
-// CHECK: %[[FILL:.*]] = linalg.fill ins(%[[C0]] : f32) outs(%[[INIT]] : tensor<1x32x32x1xf32>) -> tensor<1x32x32x1xf32>
+// CHECK: %[[FILL:.*]] = linalg.fill ins(%[[C0]] : f32) outs(%[[INIT]] : tensor<1x32x32x1xf32>)
 // CHECK: %[[PADDED:.*]] = tensor.insert_slice %[[IN]] into %[[FILL]][0, 2, 2, 0] [1, 28, 28, 1] [1, 1, 1, 1] : tensor<1x28x28x1xf32> into tensor<1x32x32x1xf32>
 // CHECK: return %[[PADDED]] : tensor<1x32x32x1xf32>
 func.func @generalize_pad_tensor_static_shape(%arg0: tensor<1x28x28x1xf32>) -> tensor<1x32x32x1xf32> {
@@ -29,7 +29,7 @@
 // CHECK: %[[DIM3:.*]] = tensor.dim %[[IN]], %[[C3]] : tensor<4x?x2x?xf32>
 // CHECK: %[[OUT_DIM3:.*]] = arith.addi %[[DIM3]], %[[OFFSET]] : index
 // CHECK: %[[INIT:.*]] = tensor.empty(%[[DIM1]], %[[OUT_DIM2]], %[[OUT_DIM3]]) : tensor<4x?x?x?xf32>
-// CHECK: %[[FILL:.*]] = linalg.fill ins(%[[CST]] : f32) outs(%[[INIT]] : tensor<4x?x?x?xf32>) -> tensor<4x?x?x?xf32>
+// CHECK: %[[FILL:.*]] = linalg.fill ins(%[[CST]] : f32) outs(%[[INIT]] : tensor<4x?x?x?xf32>)
 // CHECK: %[[DIM1_1:.*]] = tensor.dim %[[IN]], %[[C1]] : tensor<4x?x2x?xf32>
 // CHECK: %[[DIM3_1:.*]] = tensor.dim %[[IN]], %[[C3]] : tensor<4x?x2x?xf32>
 // CHECK: %[[PADDED:.*]] = tensor.insert_slice %[[IN]] into %[[FILL]]{{\[}}%[[C0]], %[[C0]], %[[OFFSET]], %[[C0]]] [4, %[[DIM1_1]], 2, %[[DIM3_1]]] [1, 1, 1, 1] : tensor<4x?x2x?xf32> into tensor<4x?x?x?xf32>
diff --git a/mlir/test/Dialect/Linalg/invalid.mlir b/mlir/test/Dialect/Linalg/invalid.mlir
--- a/mlir/test/Dialect/Linalg/invalid.mlir
+++ b/mlir/test/Dialect/Linalg/invalid.mlir
@@ -304,54 +304,14 @@
 // -----

-func.func @incorrect_region_arg_count(%m: memref) {
-  // expected-error @+3 {{region expects 3 args, got 2}}
-  %res = linalg.matmul ins(%m, %m : memref, memref)
-                       -> (tensor, tensor)
+func.func @incorrect_region_arg_count(%m: tensor) {
+  // expected-error @+2 {{region expects 3 args, got 2}}
+  %res = linalg.matmul outs(%m, %m : tensor, tensor)
   return
 }

 // -----

-func.func @matching_inits(%m: memref, %t: tensor) {
-  // expected-error @+1 {{expected type of operand #2 ('tensor') to match type of corresponding result ('tensor')}}
-  %res = linalg.matmul ins(%m, %m : memref, memref)
-                       outs(%t : tensor)
-                       -> tensor
-  return
-}
-
-// -----
-
-func.func @illegal_fill_tensor_no_return(%arg0 : index, %arg1 : index, %arg2 : f32)
-{
-  %0 = tensor.empty(%arg0, %arg1) : tensor
-  // expected-error @+1 {{expected the number of results (0) to be equal to the number of output tensors (1)}}
-  linalg.fill ins(%arg2 : f32) outs(%0 : tensor)
-}
-
-// -----
-
-func.func @illegal_fill_memref_with_tensor_return
-  (%arg0 : memref, %arg1 : f32) -> tensor
-{
-  // expected-error @+1 {{expected the number of results (1) to be equal to the number of output tensors (0)}}
-  %0 = linalg.fill ins(%arg1 : f32) outs(%arg0 : memref) -> tensor
-  return %0 : tensor
-}
-
-// -----
-
-func.func @illegal_fill_tensor_with_memref_return
-  (%arg0 : tensor, %arg1 : f32) -> memref
-{
-  // expected-error @+1 {{result #0 must be ranked tensor of any type values, but got 'memref'}}
-  %0 = linalg.fill ins(%arg1 : f32) outs(%arg0 : tensor) -> memref
-  return %0 : memref
-}
-
-// -----
-
 func.func @invalid_static_matmul(%arg0: memref<2x4xf32>, %arg1: memref<3x4xf32>, %arg2: memref<2x4xf32>) {
   // expected-error @+1 {{inferred input/output operand #1 has shape's dimension #0 to be 4, but found 3}}
   linalg.matmul ins(%arg0, %arg1 : memref<2x4xf32>, memref<3x4xf32>)
diff --git a/mlir/test/Dialect/Linalg/named-ops.mlir b/mlir/test/Dialect/Linalg/named-ops.mlir
--- a/mlir/test/Dialect/Linalg/named-ops.mlir
+++ b/mlir/test/Dialect/Linalg/named-ops.mlir
@@ -4,11 +4,11 @@
 func.func @depthwise_conv_1d_nwc_wcm(%input: tensor<1x12x8xf32>, %filter: tensor<3x8x8xf32>) -> tensor<1x10x8x8xf32> {
   %zero = arith.constant 0.000000e+00 : f32
   %init = tensor.empty() : tensor<1x10x8x8xf32>
-  %fill = linalg.fill ins(%zero : f32) outs(%init : tensor<1x10x8x8xf32>) -> tensor<1x10x8x8xf32>
+  %fill = linalg.fill ins(%zero : f32) outs(%init : tensor<1x10x8x8xf32>)
   // CHECK: depthwise_conv_1d_nwc_wcm
   %0 = linalg.depthwise_conv_1d_nwc_wcm {dilations = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
     ins(%input, %filter : tensor<1x12x8xf32>, tensor<3x8x8xf32>)
-    outs(%fill : tensor<1x10x8x8xf32>) -> tensor<1x10x8x8xf32>
+    outs(%fill : tensor<1x10x8x8xf32>)
   return %0 : tensor<1x10x8x8xf32>
 }
@@ -18,11 +18,11 @@
 func.func @depthwise_conv_1d_nwc_wc(%input: tensor<1x12x8xf32>, %filter: tensor<3x8xf32>) -> tensor<1x10x8xf32> {
   %zero = arith.constant 0.000000e+00 : f32
   %init = tensor.empty() : tensor<1x10x8xf32>
-  %fill = linalg.fill ins(%zero : f32) outs(%init : tensor<1x10x8xf32>) -> tensor<1x10x8xf32>
+  %fill = linalg.fill ins(%zero : f32) outs(%init : tensor<1x10x8xf32>)
   // CHECK: depthwise_conv_1d_nwc_wc
   %0 = linalg.depthwise_conv_1d_nwc_wc {dilations = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
     ins(%input, %filter : tensor<1x12x8xf32>, tensor<3x8xf32>)
-    outs(%fill : tensor<1x10x8xf32>) -> tensor<1x10x8xf32>
+    outs(%fill : tensor<1x10x8xf32>)
   return %0 : tensor<1x10x8xf32>
 }
@@ -32,7 +32,7 @@
 func.func @depthwise_conv_2d_nhwc_hwcm_tensor(%input: tensor<2x4x5x2xf32>, %filter: tensor<2x2x2x3xf32>) -> tensor<2x3x4x2x3xf32> {
   %zero = arith.constant 0.000000e+00 : f32
   %init = tensor.empty() : tensor<2x3x4x2x3xf32>
-  %fill = linalg.fill ins(%zero : f32) outs(%init : tensor<2x3x4x2x3xf32>) -> tensor<2x3x4x2x3xf32>
+  %fill = linalg.fill ins(%zero : f32) outs(%init : tensor<2x3x4x2x3xf32>)
   // CHECK: %{{.+}} = linalg.depthwise_conv_2d_nhwc_hwcm
   // CHECK-SAME: {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
   // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<2x4x5x2xf32>, tensor<2x2x2x3xf32>)
@@ -40,7 +40,7 @@
   %0 = linalg.depthwise_conv_2d_nhwc_hwcm { dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64> }
     ins(%input, %filter : tensor<2x4x5x2xf32>, tensor<2x2x2x3xf32>)
-    outs(%fill : tensor<2x3x4x2x3xf32>) -> tensor<2x3x4x2x3xf32>
+    outs(%fill : tensor<2x3x4x2x3xf32>)
   return %0 : tensor<2x3x4x2x3xf32>
 }
@@ -63,10 +63,10 @@
   // CHECK: %{{.+}} = linalg.depthwise_conv_1d_nwc_wc
   // CHECK-SAME: {dilations = dense<1> : vector<1xi64>, strides = dense<2> : vector<1xi64>}
   // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<1x113x96xf32>, tensor<3x96xf32>)
-  // CHECK-SAME: outs(%{{.+}} : tensor<1x56x96xf32>) -> tensor<1x56x96xf32>
+  // CHECK-SAME: outs(%{{.+}} : tensor<1x56x96xf32>)
   %0 = linalg.depthwise_conv_1d_nwc_wc {dilations = dense<1> : vector<1xi64>, strides = dense<2> : vector<1xi64>}
     ins(%input, %filter: tensor<1x113x96xf32>, tensor<3x96xf32>)
-    outs(%init: tensor<1x56x96xf32>) -> tensor<1x56x96xf32>
+    outs(%init: tensor<1x56x96xf32>)
   return %0: tensor<1x56x96xf32>
 }
@@ -76,10 +76,10 @@
   // CHECK: %{{.+}} = linalg.depthwise_conv_2d_nhwc_hwc
   // CHECK-SAME: {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>}
   // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<1x113x113x96xf32>, tensor<3x3x96xf32>)
-  // CHECK-SAME: outs(%{{.+}} : tensor<1x56x56x96xf32>) -> tensor<1x56x56x96xf32>
+  // CHECK-SAME: outs(%{{.+}} : tensor<1x56x56x96xf32>)
   %0 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>}
     ins(%input, %filter: tensor<1x113x113x96xf32>, tensor<3x3x96xf32>)
-    outs(%init: tensor<1x56x56x96xf32>) -> tensor<1x56x56x96xf32>
+    outs(%init: tensor<1x56x56x96xf32>)
   return %0: tensor<1x56x56x96xf32>
 }
@@ -101,10 +101,10 @@
   // CHECK: %{{.+}} = linalg.depthwise_conv_2d_nchw_chw
   // CHECK-SAME: {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>}
   // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<1x96x113x113xf32>, tensor<96x3x3xf32>)
-  // CHECK-SAME: outs(%{{.+}} : tensor<1x96x56x56xf32>) -> tensor<1x96x56x56xf32>
+  // CHECK-SAME: outs(%{{.+}} : tensor<1x96x56x56xf32>)
   %0 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>}
     ins(%input, %filter: tensor<1x96x113x113xf32>, tensor<96x3x3xf32>)
-    outs(%init: tensor<1x96x56x56xf32>) -> tensor<1x96x56x56xf32>
+    outs(%init: tensor<1x96x56x56xf32>)
   return %0: tensor<1x96x56x56xf32>
 }
@@ -123,7 +123,7 @@
 func.func @depthwise_conv_2d_nhwc_hwcm_tensor_dilated(%input: tensor<2x8x9x2xf32>, %filter: tensor<2x2x2x3xf32>) -> tensor<2x6x7x2x3xf32> {
   %zero = arith.constant 0.000000e+00 : f32
   %init = tensor.empty() : tensor<2x6x7x2x3xf32>
-  %fill = linalg.fill ins(%zero : f32) outs(%init : tensor<2x6x7x2x3xf32>) -> tensor<2x6x7x2x3xf32>
+  %fill = linalg.fill ins(%zero : f32) outs(%init : tensor<2x6x7x2x3xf32>)
   // CHECK: %{{.+}} = linalg.depthwise_conv_2d_nhwc_hwcm
   // CHECK-SAME: {dilations = dense<2> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
   // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<2x8x9x2xf32>, tensor<2x2x2x3xf32>)
@@ -131,7 +131,7 @@
   %0 = linalg.depthwise_conv_2d_nhwc_hwcm { dilations = dense<2> : tensor<2xi64>, strides = dense<1> : tensor<2xi64> }
     ins(%input, %filter : tensor<2x8x9x2xf32>, tensor<2x2x2x3xf32>)
-    outs(%fill : tensor<2x6x7x2x3xf32>) -> tensor<2x6x7x2x3xf32>
+    outs(%fill : tensor<2x6x7x2x3xf32>)
   return %0 : tensor<2x6x7x2x3xf32>
 }
@@ -187,11 +187,11 @@
 func.func @depthwise_conv_3d_ndhwc_dhwcm(%input: tensor<2x6x13x12x6xf32>, %filter: tensor<2x1x3x6x6xf32>) -> tensor<2x3x13x4x6x6xf32> {
   %zero = arith.constant 0.000000e+00 : f32
   %init = tensor.empty() : tensor<2x3x13x4x6x6xf32>
-  %fill = linalg.fill ins(%zero : f32) outs(%init : tensor<2x3x13x4x6x6xf32>) -> tensor<2x3x13x4x6x6xf32>
+  %fill = linalg.fill ins(%zero : f32) outs(%init : tensor<2x3x13x4x6x6xf32>)
   // CHECK: depthwise_conv_3d_ndhwc_dhwcm
   %0 = linalg.depthwise_conv_3d_ndhwc_dhwcm {dilations = dense<1> : tensor<3xi64>, strides = dense<[2, 1, 3]> : tensor<3xi64>}
     ins(%input, %filter : tensor<2x6x13x12x6xf32>, tensor<2x1x3x6x6xf32>)
-    outs(%fill : tensor<2x3x13x4x6x6xf32>) -> tensor<2x3x13x4x6x6xf32>
+    outs(%fill : tensor<2x3x13x4x6x6xf32>)
   return %0 : tensor<2x3x13x4x6x6xf32>
 }
@@ -201,11 +201,11 @@
 func.func @depthwise_conv_3d_ndhwc_dhwc(%input: tensor<2x6x13x12x6xf32>, %filter: tensor<2x1x3x6xf32>) -> tensor<2x3x13x4x6xf32> {
   %zero = arith.constant 0.000000e+00 : f32
   %init = tensor.empty() : tensor<2x3x13x4x6xf32>
-  %fill = linalg.fill ins(%zero : f32) outs(%init : tensor<2x3x13x4x6xf32>) -> tensor<2x3x13x4x6xf32>
+  %fill = linalg.fill ins(%zero : f32) outs(%init : tensor<2x3x13x4x6xf32>)
   // CHECK: depthwise_conv_3d_ndhwc_dhwc
   %0 = linalg.depthwise_conv_3d_ndhwc_dhwc {dilations = dense<1> : tensor<3xi64>, strides = dense<[2, 1, 3]> : tensor<3xi64>}
     ins(%input, %filter : tensor<2x6x13x12x6xf32>, tensor<2x1x3x6xf32>)
-    outs(%fill : tensor<2x3x13x4x6xf32>) -> tensor<2x3x13x4x6xf32>
+    outs(%fill : tensor<2x3x13x4x6xf32>)
   return %0 : tensor<2x3x13x4x6xf32>
 }
@@ -217,11 +217,11 @@
   // CHECK-SAME: dilations = dense<1> : tensor<1xi64>
   // CHECK-SAME: strides = dense<1> : tensor<1xi64>
   // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor, tensor)
-  // CHECK-SAME: outs(%{{.+}} : tensor) -> tensor
+  // CHECK-SAME: outs(%{{.+}} : tensor)
   %0 = linalg.conv_1d_nwc_wcf {dilations = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
     ins (%input, %filter: tensor, tensor)
-    outs (%init: tensor) -> tensor
+    outs (%init: tensor)
   return %0 : tensor
 }
@@ -249,11 +249,11 @@
   // CHECK-SAME: dilations = dense<1> : tensor<1xi64>
   // CHECK-SAME: strides = dense<1> : tensor<1xi64>
   // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor, tensor)
-  // CHECK-SAME: outs(%{{.+}} : tensor) -> tensor
+  // CHECK-SAME: outs(%{{.+}} : tensor)
   %0 = linalg.conv_1d_ncw_fcw {dilations = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
     ins (%input, %filter: tensor, tensor)
-    outs (%init: tensor) -> tensor
+    outs (%init: tensor)
   return %0 : tensor
 }
@@ -281,11 +281,11 @@
   // CHECK-SAME: dilations = dense<1> : tensor<2xi64>
   // CHECK-SAME: strides = dense<1> : tensor<2xi64>
   // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor, tensor)
-  // CHECK-SAME: outs(%{{.+}} : tensor) -> tensor
+  // CHECK-SAME: outs(%{{.+}} : tensor)
   %0 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
     ins (%input, %filter: tensor, tensor)
-    outs (%init: tensor) -> tensor
+    outs (%init: tensor)
   return %0 : tensor
 }
@@ -297,11 +297,11 @@
   // CHECK-SAME: dilations = dense<1> : tensor<2xi64>
   // CHECK-SAME: strides = dense<1> : tensor<2xi64>
   // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor, tensor)
-  // CHECK-SAME: outs(%{{.+}} : tensor) -> tensor
+  // CHECK-SAME: outs(%{{.+}} : tensor)
   %0 = linalg.conv_2d_ngchw_fgchw {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
     ins (%input, %filter: tensor, tensor)
-    outs (%init: tensor) -> tensor
+    outs (%init: tensor)
   return %0 : tensor
 }
@@ -313,11 +313,11 @@
   // CHECK-SAME: dilations = dense<1> : tensor<2xi64>
   // CHECK-SAME: strides = dense<1> : tensor<2xi64>
   // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor, tensor)
-  // CHECK-SAME: outs(%{{.+}} : tensor) -> tensor
+  // CHECK-SAME: outs(%{{.+}} : tensor)
   %0 = linalg.conv_2d_nhwc_fhwc {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
     ins (%input, %filter: tensor, tensor)
-    outs (%init: tensor) -> tensor
+    outs (%init: tensor)
   return %0 : tensor
 }
@@ -329,11 +329,11 @@
   // CHECK-SAME: dilations = dense<1> : tensor<2xi64>
   // CHECK-SAME: strides = dense<1> : tensor<2xi64>
   // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor, tensor<64x3x3x32xf32>)
-  // CHECK-SAME: outs(%{{.+}} : tensor) -> tensor
+  // CHECK-SAME: outs(%{{.+}} : tensor)
   %0 = linalg.conv_2d_nhwc_fhwc {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
     ins (%input, %filter: tensor, tensor<64x3x3x32xf32>)
-    outs (%init: tensor) -> tensor
+    outs (%init: tensor)
   return %0 : tensor
 }
@@ -377,11 +377,11 @@
   // CHECK-SAME: dilations = dense<1> : tensor<3xi64>
   // CHECK-SAME: strides = dense<1> : tensor<3xi64>
   // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor, tensor)
-  // CHECK-SAME: outs(%{{.+}} : tensor) -> tensor
+  // CHECK-SAME: outs(%{{.+}} : tensor)
   %0 = linalg.conv_3d_ndhwc_dhwcf {dilations = dense<1> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>}
     ins (%input, %filter: tensor, tensor)
-    outs (%init: tensor) -> tensor
+    outs (%init: tensor)
   return %0 : tensor
 }
@@ -408,15 +408,15 @@
 // CHECK-SAME: dilations = dense<1> : tensor<2xi64>
 // CHECK-SAME: strides = dense<1> : tensor<2xi64>
 // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<1x4x4x1xf32>, tensor<3x3xf32>)
-// CHECK-SAME: outs(%{{.+}} : tensor<1x2x2x1xf32>) -> tensor<1x2x2x1xf32>
+// CHECK-SAME: outs(%{{.+}} : tensor<1x2x2x1xf32>)
 func.func @pooling_nhwc_sum_tensor(%input: tensor<1x4x4x1xf32>) -> tensor<1x2x2x1xf32> {
   %fake = tensor.empty() : tensor<3x3xf32>
   %init = tensor.empty() : tensor<1x2x2x1xf32>
   %cst = arith.constant 0.000000e+00 : f32
-  %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x2x2x1xf32>) -> tensor<1x2x2x1xf32>
+  %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x2x2x1xf32>)
   %res = linalg.pooling_nhwc_sum {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
     ins(%input, %fake: tensor<1x4x4x1xf32>, tensor<3x3xf32>)
-    outs(%fill: tensor<1x2x2x1xf32>) -> tensor<1x2x2x1xf32>
+    outs(%fill: tensor<1x2x2x1xf32>)
   return %res : tensor<1x2x2x1xf32>
 }
@@ -427,15 +427,15 @@
 // CHECK-SAME: dilations = dense<1> : tensor<1xi64>
 // CHECK-SAME: strides = dense<1> : tensor<1xi64>
 // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<1x4x1xf32>, tensor<3xf32>)
-// CHECK-SAME: outs(%{{.+}} : tensor<1x2x1xf32>) -> tensor<1x2x1xf32>
+// CHECK-SAME: outs(%{{.+}} : tensor<1x2x1xf32>)
 func.func @pooling_nwc_sum_tensor(%input: tensor<1x4x1xf32>) -> tensor<1x2x1xf32> {
   %fake = tensor.empty() : tensor<3xf32>
   %init = tensor.empty() : tensor<1x2x1xf32>
   %cst = arith.constant 0.000000e+00 : f32
-  %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x2x1xf32>) -> tensor<1x2x1xf32>
+  %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x2x1xf32>)
   %res = linalg.pooling_nwc_sum {dilations = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
     ins(%input, %fake: tensor<1x4x1xf32>, tensor<3xf32>)
-    outs(%fill: tensor<1x2x1xf32>) -> tensor<1x2x1xf32>
+    outs(%fill: tensor<1x2x1xf32>)
   return %res : tensor<1x2x1xf32>
 }
@@ -476,15 +476,15 @@
 // CHECK-SAME: dilations = dense<1> : tensor<2xi64>
 // CHECK-SAME: strides = dense<1> : tensor<2xi64>
 // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<1x1x4x4xf32>, tensor<3x3xf32>)
-// CHECK-SAME: outs(%{{.+}} : tensor<1x1x2x2xf32>) -> tensor<1x1x2x2xf32>
+// CHECK-SAME: outs(%{{.+}} : tensor<1x1x2x2xf32>)
 func.func @pooling_nchw_sum_tensor(%input: tensor<1x1x4x4xf32>) -> tensor<1x1x2x2xf32> {
   %fake = tensor.empty() : tensor<3x3xf32>
   %init = tensor.empty() : tensor<1x1x2x2xf32>
   %cst = arith.constant 0.000000e+00 : f32
-  %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x1x2x2xf32>) -> tensor<1x1x2x2xf32>
+  %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x1x2x2xf32>)
   %res = linalg.pooling_nchw_sum {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
     ins(%input, %fake: tensor<1x1x4x4xf32>, tensor<3x3xf32>)
-    outs(%fill: tensor<1x1x2x2xf32>) -> tensor<1x1x2x2xf32>
+    outs(%fill: tensor<1x1x2x2xf32>)
   return %res : tensor<1x1x2x2xf32>
 }
@@ -495,15 +495,15 @@
 // CHECK-SAME: dilations = dense<1> : tensor<1xi64>
 // CHECK-SAME: strides = dense<1> : tensor<1xi64>
 // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<1x1x4xf32>, tensor<3xf32>)
-// CHECK-SAME: outs(%{{.+}} : tensor<1x1x2xf32>) -> tensor<1x1x2xf32>
+// CHECK-SAME: outs(%{{.+}} : tensor<1x1x2xf32>)
 func.func @pooling_ncw_sum_tensor(%input: tensor<1x1x4xf32>) -> tensor<1x1x2xf32> {
   %fake = tensor.empty() : tensor<3xf32>
   %init = tensor.empty() : tensor<1x1x2xf32>
   %cst = arith.constant 0.000000e+00 : f32
-  %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x1x2xf32>) -> tensor<1x1x2xf32>
+  %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x1x2xf32>)
   %res = linalg.pooling_ncw_sum {dilations = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
     ins(%input, %fake: tensor<1x1x4xf32>, tensor<3xf32>)
-    outs(%fill: tensor<1x1x2xf32>) -> tensor<1x1x2xf32>
+    outs(%fill: tensor<1x1x2xf32>)
   return %res : tensor<1x1x2xf32>
 }
@@ -544,15 +544,15 @@
 // CHECK-SAME: dilations = dense<1> : tensor<2xi64>
 // CHECK-SAME: strides = dense<1> : tensor<2xi64>
 // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<1x4x4x1xf32>, tensor<3x3xf32>)
-// CHECK-SAME: outs(%{{.+}} : tensor<1x2x2x1xf32>) -> tensor<1x2x2x1xf32>
+// CHECK-SAME: outs(%{{.+}} : tensor<1x2x2x1xf32>)
 func.func @pooling_nhwc_max_tensor(%input: tensor<1x4x4x1xf32>) -> tensor<1x2x2x1xf32> {
   %fake = tensor.empty() : tensor<3x3xf32>
   %init = tensor.empty() : tensor<1x2x2x1xf32>
   %cst = arith.constant 0.000000e+00 : f32
-  %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x2x2x1xf32>) -> tensor<1x2x2x1xf32>
+  %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x2x2x1xf32>)
   %res = linalg.pooling_nhwc_max {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
     ins(%input, %fake: tensor<1x4x4x1xf32>, tensor<3x3xf32>)
-    outs(%fill: tensor<1x2x2x1xf32>) -> tensor<1x2x2x1xf32>
+    outs(%fill: tensor<1x2x2x1xf32>)
   return %res : tensor<1x2x2x1xf32>
 }
@@ -562,15 +562,15 @@
 // CHECK-SAME: dilations = dense<1> : tensor<1xi64>
 // CHECK-SAME: strides = dense<1> : tensor<1xi64>
 // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<1x4x1xf32>, tensor<3xf32>)
-// CHECK-SAME: outs(%{{.+}} : tensor<1x2x1xf32>) -> tensor<1x2x1xf32>
+// CHECK-SAME: outs(%{{.+}} : tensor<1x2x1xf32>)
 func.func @pooling_nwc_max_tensor(%input: tensor<1x4x1xf32>) -> tensor<1x2x1xf32> {
   %fake = tensor.empty() : tensor<3xf32>
   %init = tensor.empty() : tensor<1x2x1xf32>
   %cst = arith.constant 0.000000e+00 : f32
-  %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x2x1xf32>) -> tensor<1x2x1xf32>
+  %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x2x1xf32>)
   %res = linalg.pooling_nwc_max {dilations = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
     ins(%input, %fake: tensor<1x4x1xf32>, tensor<3xf32>)
-    outs(%fill: tensor<1x2x1xf32>) -> tensor<1x2x1xf32>
+    outs(%fill: tensor<1x2x1xf32>)
   return %res : tensor<1x2x1xf32>
 }
@@ -580,16 +580,16 @@
 // CHECK-SAME: dilations = dense<1> : tensor<2xi64>
 // CHECK-SAME: strides = dense<1> : tensor<2xi64>
 // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<1x1x4x4xf32>, tensor<3x3xf32>)
-// CHECK-SAME: outs(%{{.+}} : tensor<1x1x2x2xf32>) -> tensor<1x1x2x2xf32>
+// CHECK-SAME: outs(%{{.+}} : tensor<1x1x2x2xf32>)
 func.func @pooling_nchw_max_tensor(%input: tensor<1x1x4x4xf32>) -> tensor<1x1x2x2xf32> {
   %fake = tensor.empty() : tensor<3x3xf32>
   %init = tensor.empty() : tensor<1x1x2x2xf32>
   %cst = arith.constant 0.000000e+00 : f32
-  %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x1x2x2xf32>) -> tensor<1x1x2x2xf32>
+  %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x1x2x2xf32>)
   %res = linalg.pooling_nchw_max {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
     ins(%input, %fake: tensor<1x1x4x4xf32>, tensor<3x3xf32>)
-    outs(%fill: tensor<1x1x2x2xf32>) -> tensor<1x1x2x2xf32>
+    outs(%fill: tensor<1x1x2x2xf32>)
   return %res : tensor<1x1x2x2xf32>
 }
@@ -599,16 +599,16 @@
 // CHECK-SAME: dilations = dense<1> : tensor<1xi64>
 // CHECK-SAME: strides = dense<1> : tensor<1xi64>
 // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<1x1x4xf32>, tensor<3xf32>)
-// CHECK-SAME: outs(%{{.+}} : tensor<1x1x2xf32>) -> tensor<1x1x2xf32>
+// CHECK-SAME: outs(%{{.+}} : tensor<1x1x2xf32>)
 func.func @pooling_ncw_max_tensor(%input: tensor<1x1x4xf32>) -> tensor<1x1x2xf32> {
   %fake = tensor.empty() : tensor<3xf32>
   %init = tensor.empty() : tensor<1x1x2xf32>
   %cst = arith.constant 0.000000e+00 : f32
-  %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x1x2xf32>) -> tensor<1x1x2xf32>
+  %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x1x2xf32>)
   %res = linalg.pooling_ncw_max {dilations = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
     ins(%input, %fake: tensor<1x1x4xf32>, tensor<3xf32>)
-    outs(%fill: tensor<1x1x2xf32>) -> tensor<1x1x2xf32>
+    outs(%fill: tensor<1x1x2xf32>)
   return %res : tensor<1x1x2xf32>
 }
@@ -649,15 +649,15 @@
 // CHECK-SAME: dilations = dense<1> : tensor<2xi64>
 // CHECK-SAME: strides = dense<1> : tensor<2xi64>
 // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<1x4x4x1xi8>, tensor<3x3xi8>)
-// CHECK-SAME: outs(%{{.+}} : tensor<1x2x2x1xi8>) -> tensor<1x2x2x1xi8>
+// CHECK-SAME: outs(%{{.+}} : tensor<1x2x2x1xi8>)
 func.func @pooling_nhwc_i8_max_tensor(%input: tensor<1x4x4x1xi8>) -> tensor<1x2x2x1xi8> {
   %fake = tensor.empty() : tensor<3x3xi8>
   %init = tensor.empty() : tensor<1x2x2x1xi8>
   %cst = arith.constant 0 : i8
-  %fill = linalg.fill ins(%cst : i8) outs(%init : tensor<1x2x2x1xi8>) -> tensor<1x2x2x1xi8>
+  %fill = linalg.fill ins(%cst : i8) outs(%init : tensor<1x2x2x1xi8>)
   %res = linalg.pooling_nhwc_max {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
     ins(%input, %fake: tensor<1x4x4x1xi8>, tensor<3x3xi8>)
-    outs(%fill: tensor<1x2x2x1xi8>) -> tensor<1x2x2x1xi8>
+    outs(%fill: tensor<1x2x2x1xi8>)
   return %res : tensor<1x2x2x1xi8>
 }
@@ -668,15 +668,15 @@
 // CHECK-SAME: dilations = dense<1> : tensor<1xi64>
 // CHECK-SAME: strides = dense<1> : tensor<1xi64>
 // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<1x4x1xi8>, tensor<3xi8>)
-// CHECK-SAME: outs(%{{.+}} : tensor<1x2x1xi8>) -> tensor<1x2x1xi8>
+// CHECK-SAME: outs(%{{.+}} : tensor<1x2x1xi8>)
 func.func @pooling_nwc_i8_max_tensor(%input: tensor<1x4x1xi8>) -> tensor<1x2x1xi8> {
   %fake = tensor.empty() : tensor<3xi8>
   %init = tensor.empty() : tensor<1x2x1xi8>
   %cst = arith.constant 0 : i8
-  %fill = linalg.fill ins(%cst : i8) outs(%init : tensor<1x2x1xi8>) -> tensor<1x2x1xi8>
+  %fill = linalg.fill ins(%cst : i8) outs(%init : tensor<1x2x1xi8>)
   %res = linalg.pooling_nwc_max {dilations = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
     ins(%input, %fake: tensor<1x4x1xi8>, tensor<3xi8>)
-    outs(%fill: tensor<1x2x1xi8>) -> tensor<1x2x1xi8>
+    outs(%fill: tensor<1x2x1xi8>)
   return %res : tensor<1x2x1xi8>
 }
@@ -717,15 +717,15 @@
 // CHECK-SAME: dilations = dense<1> : tensor<2xi64>
 // CHECK-SAME: strides = dense<1> : tensor<2xi64>
 // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<1x4x4x1xi16>, tensor<3x3xi16>)
-// CHECK-SAME: outs(%{{.+}} : tensor<1x2x2x1xi16>) -> tensor<1x2x2x1xi16>
+// CHECK-SAME: outs(%{{.+}} : tensor<1x2x2x1xi16>)
 func.func @pooling_nhwc_i16_max_tensor(%input: tensor<1x4x4x1xi16>) -> tensor<1x2x2x1xi16> {
   %fake = tensor.empty() : tensor<3x3xi16>
   %init = tensor.empty() : tensor<1x2x2x1xi16>
   %cst = arith.constant 0 : i16
-  %fill = linalg.fill ins(%cst : i16) outs(%init : tensor<1x2x2x1xi16>) -> tensor<1x2x2x1xi16>
+  %fill = linalg.fill ins(%cst : i16) outs(%init : tensor<1x2x2x1xi16>)
   %res = linalg.pooling_nhwc_max {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
     ins(%input, %fake: tensor<1x4x4x1xi16>, tensor<3x3xi16>)
-    outs(%fill: tensor<1x2x2x1xi16>) -> tensor<1x2x2x1xi16>
+    outs(%fill: tensor<1x2x2x1xi16>)
   return %res : tensor<1x2x2x1xi16>
 }
@@ -736,15 +736,15 @@
 // CHECK-SAME: dilations = dense<1> : tensor<1xi64>
 // CHECK-SAME: strides = dense<1> : tensor<1xi64>
 // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<1x4x1xi16>, tensor<3xi16>)
-// CHECK-SAME: outs(%{{.+}} : tensor<1x2x1xi16>) -> tensor<1x2x1xi16>
+// CHECK-SAME: outs(%{{.+}} : tensor<1x2x1xi16>)
 func.func @pooling_nwc_i16_max_tensor(%input: tensor<1x4x1xi16>) -> tensor<1x2x1xi16> {
   %fake = tensor.empty() : tensor<3xi16>
  %init = tensor.empty() : tensor<1x2x1xi16>
   %cst = arith.constant 0 : i16
-  %fill = linalg.fill ins(%cst : i16) outs(%init : tensor<1x2x1xi16>) -> tensor<1x2x1xi16>
+  %fill = linalg.fill ins(%cst : i16) outs(%init : tensor<1x2x1xi16>)
   %res = linalg.pooling_nwc_max {dilations = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
     ins(%input, %fake: tensor<1x4x1xi16>, tensor<3xi16>)
-    outs(%fill: tensor<1x2x1xi16>) -> tensor<1x2x1xi16>
+    outs(%fill: tensor<1x2x1xi16>)
   return %res : tensor<1x2x1xi16>
 }
@@ -785,15 +785,15 @@
 // CHECK-SAME: dilations = dense<1> : tensor<2xi64>
 // CHECK-SAME: strides = dense<1> : tensor<2xi64>
 // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<1x4x4x1xi32>, tensor<3x3xi32>)
-// CHECK-SAME: outs(%{{.+}} : tensor<1x2x2x1xi32>) -> tensor<1x2x2x1xi32>
+// CHECK-SAME: outs(%{{.+}} : tensor<1x2x2x1xi32>)
 func.func @pooling_nhwc_i32_max_tensor(%input: tensor<1x4x4x1xi32>) -> tensor<1x2x2x1xi32> {
   %fake = tensor.empty() : tensor<3x3xi32>
   %init = tensor.empty() : tensor<1x2x2x1xi32>
   %cst = arith.constant 0 : i32
-  %fill = linalg.fill ins(%cst : i32) outs(%init : tensor<1x2x2x1xi32>) -> tensor<1x2x2x1xi32>
+  %fill = linalg.fill ins(%cst : i32) outs(%init : tensor<1x2x2x1xi32>)
   %res = linalg.pooling_nhwc_max {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
     ins(%input, %fake: tensor<1x4x4x1xi32>, tensor<3x3xi32>)
-    outs(%fill: tensor<1x2x2x1xi32>) -> tensor<1x2x2x1xi32>
+    outs(%fill: tensor<1x2x2x1xi32>)
   return %res : tensor<1x2x2x1xi32>
 }
@@ -804,15 +804,15 @@
 // CHECK-SAME: dilations = dense<1> : tensor<1xi64>
 // CHECK-SAME: strides = dense<1> : tensor<1xi64>
 // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<1x4x1xi32>, tensor<3xi32>)
-// CHECK-SAME: outs(%{{.+}} : tensor<1x2x1xi32>) -> tensor<1x2x1xi32>
+// CHECK-SAME: outs(%{{.+}} : tensor<1x2x1xi32>)
 func.func @pooling_nwc_i32_max_tensor(%input: tensor<1x4x1xi32>) -> tensor<1x2x1xi32> {
   %fake = tensor.empty() : tensor<3xi32>
   %init = tensor.empty() : tensor<1x2x1xi32>
   %cst = arith.constant 0 : i32
-  %fill = linalg.fill ins(%cst : i32) outs(%init : tensor<1x2x1xi32>) -> tensor<1x2x1xi32>
+  %fill = linalg.fill ins(%cst : i32) outs(%init : tensor<1x2x1xi32>)
   %res = linalg.pooling_nwc_max {dilations = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
     ins(%input, %fake: tensor<1x4x1xi32>, tensor<3xi32>)
-    outs(%fill: tensor<1x2x1xi32>) -> tensor<1x2x1xi32>
+    outs(%fill: tensor<1x2x1xi32>)
   return %res : tensor<1x2x1xi32>
 }
@@ -854,15 +854,15 @@
 // CHECK-SAME: dilations = dense<1> : tensor<2xi64>
 // CHECK-SAME: strides = dense<1> : tensor<2xi64>
 // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<1x4x4x1xf32>, tensor<3x3xf32>)
-// CHECK-SAME: outs(%{{.+}} : tensor<1x2x2x1xf32>) -> tensor<1x2x2x1xf32>
+// CHECK-SAME: outs(%{{.+}} : tensor<1x2x2x1xf32>)
 func.func @pooling_nhwc_min_tensor(%input: tensor<1x4x4x1xf32>) -> tensor<1x2x2x1xf32> {
   %fake = tensor.empty() : tensor<3x3xf32>
   %init = tensor.empty() : tensor<1x2x2x1xf32>
   %cst = arith.constant 0.000000e+00 : f32
-  %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x2x2x1xf32>) -> tensor<1x2x2x1xf32>
+  %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x2x2x1xf32>)
   %res = linalg.pooling_nhwc_min {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
     ins(%input, %fake: tensor<1x4x4x1xf32>, tensor<3x3xf32>)
-    outs(%fill: tensor<1x2x2x1xf32>) -> tensor<1x2x2x1xf32>
+    outs(%fill: tensor<1x2x2x1xf32>)
   return %res : tensor<1x2x2x1xf32>
 }
@@ -873,15 +873,15 @@
 // CHECK-SAME: dilations = dense<1> : tensor<1xi64>
 // CHECK-SAME: strides = dense<1> : tensor<1xi64>
 // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<1x4x1xf32>, tensor<3xf32>)
-// CHECK-SAME: outs(%{{.+}} : tensor<1x2x1xf32>) -> tensor<1x2x1xf32>
+// CHECK-SAME: outs(%{{.+}} : tensor<1x2x1xf32>)
 func.func @pooling_nwc_min_tensor(%input: tensor<1x4x1xf32>) -> tensor<1x2x1xf32> {
   %fake = tensor.empty() : tensor<3xf32>
   %init = tensor.empty() : tensor<1x2x1xf32>
   %cst = arith.constant 0.000000e+00 : f32
-  %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x2x1xf32>) -> tensor<1x2x1xf32>
+  %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x2x1xf32>)
   %res = linalg.pooling_nwc_min {dilations = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
     ins(%input, %fake: tensor<1x4x1xf32>, tensor<3xf32>)
-    outs(%fill: tensor<1x2x1xf32>) -> tensor<1x2x1xf32>
+    outs(%fill: tensor<1x2x1xf32>)
   return %res : tensor<1x2x1xf32>
 }
@@ -922,15 +922,15 @@
 // CHECK-SAME: dilations = dense<1> : tensor<3xi64>
 // CHECK-SAME: strides = dense<1> : tensor<3xi64>
 // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<1x4x4x4x1xf32>, tensor<3x3x3xf32>)
-// CHECK-SAME: outs(%{{.+}} : tensor<1x2x2x2x1xf32>) -> tensor<1x2x2x2x1xf32>
+// CHECK-SAME: outs(%{{.+}} : tensor<1x2x2x2x1xf32>)
 func.func @pooling_ndhwc_sum_tensor(%input: tensor<1x4x4x4x1xf32>) -> tensor<1x2x2x2x1xf32> {
   %fake = tensor.empty() : tensor<3x3x3xf32>
   %init = tensor.empty() : tensor<1x2x2x2x1xf32>
   %cst = arith.constant 0.000000e+00 : f32
-  %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x2x2x2x1xf32>) -> tensor<1x2x2x2x1xf32>
+  %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x2x2x2x1xf32>)
   %res = linalg.pooling_ndhwc_sum {dilations = dense<1> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>}
     ins(%input, %fake: tensor<1x4x4x4x1xf32>, tensor<3x3x3xf32>)
-    outs(%fill: tensor<1x2x2x2x1xf32>) -> tensor<1x2x2x2x1xf32>
+    outs(%fill: tensor<1x2x2x2x1xf32>)
   return %res : tensor<1x2x2x2x1xf32>
 }
@@ -956,15 +956,15 @@
 // CHECK-SAME: dilations = dense<1> : tensor<3xi64>
 // CHECK-SAME: strides = dense<1> : tensor<3xi64>
 // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<1x4x4x4x1xf32>, tensor<3x3x3xf32>)
-// CHECK-SAME: outs(%{{.+}} : tensor<1x2x2x2x1xf32>) -> tensor<1x2x2x2x1xf32>
+// CHECK-SAME: outs(%{{.+}} : tensor<1x2x2x2x1xf32>)
 func.func @pooling_ndhwc_max_tensor(%input: tensor<1x4x4x4x1xf32>) -> tensor<1x2x2x2x1xf32> {
   %fake = tensor.empty() : tensor<3x3x3xf32>
   %init = tensor.empty() : tensor<1x2x2x2x1xf32>
   %cst = arith.constant 0.000000e+00 : f32
-  %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x2x2x2x1xf32>) -> tensor<1x2x2x2x1xf32>
+  %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x2x2x2x1xf32>)
   %res = linalg.pooling_ndhwc_max {dilations = dense<1> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>}
     ins(%input, %fake: tensor<1x4x4x4x1xf32>, tensor<3x3x3xf32>)
-    outs(%fill: tensor<1x2x2x2x1xf32>) -> tensor<1x2x2x2x1xf32>
+    outs(%fill: tensor<1x2x2x2x1xf32>)
   return %res : tensor<1x2x2x2x1xf32>
 }
@@ -990,15 +990,15 @@
 // CHECK-SAME: dilations = dense<1> : tensor<3xi64>
 // CHECK-SAME: strides = dense<1> : tensor<3xi64>
 // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<1x4x4x4x1xf32>, tensor<3x3x3xf32>)
-// CHECK-SAME: outs(%{{.+}} : tensor<1x2x2x2x1xf32>) -> tensor<1x2x2x2x1xf32>
+// CHECK-SAME: outs(%{{.+}} : tensor<1x2x2x2x1xf32>)
 func.func @pooling_ndhwc_min_tensor(%input: tensor<1x4x4x4x1xf32>) -> tensor<1x2x2x2x1xf32> {
   %fake = tensor.empty() : tensor<3x3x3xf32>
   %init = tensor.empty() : tensor<1x2x2x2x1xf32>
   %cst = arith.constant 0.000000e+00 : f32
-  %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x2x2x2x1xf32>) -> tensor<1x2x2x2x1xf32>
+  %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x2x2x2x1xf32>)
   %res = linalg.pooling_ndhwc_min {dilations = dense<1> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>}
     ins(%input, %fake: tensor<1x4x4x4x1xf32>, tensor<3x3x3xf32>)
-    outs(%fill: tensor<1x2x2x2x1xf32>) -> tensor<1x2x2x2x1xf32>
+    outs(%fill: tensor<1x2x2x2x1xf32>)
   return %res : tensor<1x2x2x2x1xf32>
 }
@@ -1056,8 +1056,8 @@
 func.func @batch_reduce_matmul(%arg0: tensor<8x128x256xf32>, %arg1: tensor<8x256x512xf32>, %arg2: tensor<128x512xf32>) -> tensor<128x512xf32> {
   // CHECK: %{{.+}} = linalg.batch_reduce_matmul
   // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<8x128x256xf32>, tensor<8x256x512xf32>)
-  // CHECK-SAME: outs(%{{.+}} : tensor<128x512xf32>) -> tensor<128x512xf32>
-  %0 = linalg.batch_reduce_matmul ins(%arg0, %arg1 : tensor<8x128x256xf32>, tensor<8x256x512xf32>) outs(%arg2: tensor<128x512xf32>) -> tensor<128x512xf32>
+  // CHECK-SAME: outs(%{{.+}} : tensor<128x512xf32>)
+  %0 = linalg.batch_reduce_matmul ins(%arg0, %arg1 : tensor<8x128x256xf32>, tensor<8x256x512xf32>) outs(%arg2: tensor<128x512xf32>)
   return %0: tensor<128x512xf32>
 }
diff --git a/mlir/test/Dialect/Linalg/namedop_conversion.mlir b/mlir/test/Dialect/Linalg/namedop_conversion.mlir
--- a/mlir/test/Dialect/Linalg/namedop_conversion.mlir
+++ b/mlir/test/Dialect/Linalg/namedop_conversion.mlir
@@ -6,7 +6,7 @@
   // CHECK-DAG: %[[INIT:.+]] = tensor.collapse_shape %arg2 {{\[\[}}0], [1], [2], [3, 4]]
   // CHECK-DAG: %[[CONV:.+]] = linalg.depthwise_conv_2d_nhwc_hwc {_someattr, dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%arg0, %[[KERNEL]] : tensor, tensor) outs(%[[INIT]] : tensor)
   // CHECK: %[[OUT:.+]] = tensor.expand_shape %[[CONV]] {{\[\[}}0], [1], [2], [3, 4]]
-  %0 = linalg.depthwise_conv_2d_nhwc_hwcm {_someattr, dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%arg0, %arg1 : tensor, tensor) outs(%arg2 : tensor) -> tensor
+  %0 = linalg.depthwise_conv_2d_nhwc_hwcm {_someattr, dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%arg0, %arg1 : tensor, tensor) outs(%arg2 : tensor)
   return %0 : tensor
 }
@@ -19,6 +19,6 @@
   // CHECK-DAG: %[[INIT:.+]] = tensor.collapse_shape %arg2 {{\[\[}}0], [1], [2], [3, 4]]
   // CHECK-DAG: %[[CONV:.+]] = linalg.depthwise_conv_2d_nhwc_hwc_q {_someattr, dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%arg0, %[[KERNEL]], %arg3, %arg4 : tensor, tensor, i32, i32) outs(%[[INIT]] : tensor)
   // CHECK: %[[OUT:.+]] = tensor.expand_shape %[[CONV]] {{\[\[}}0], [1], [2], [3, 4]]
-  %0 = linalg.depthwise_conv_2d_nhwc_hwcm_q {_someattr, dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%arg0, %arg1, %arg3, %arg4 : tensor, tensor, i32, i32) outs(%arg2 : tensor) -> tensor
+  %0 = linalg.depthwise_conv_2d_nhwc_hwcm_q {_someattr, dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%arg0, %arg1, %arg3, %arg4 : tensor, tensor, i32, i32) outs(%arg2 : tensor)
   return %0 : tensor
 }
diff --git a/mlir/test/Dialect/Linalg/one-shot-bufferize-analysis-2fill-extract-matmul-all-perms.mlir b/mlir/test/Dialect/Linalg/one-shot-bufferize-analysis-2fill-extract-matmul-all-perms.mlir
--- a/mlir/test/Dialect/Linalg/one-shot-bufferize-analysis-2fill-extract-matmul-all-perms.mlir
+++ b/mlir/test/Dialect/Linalg/one-shot-bufferize-analysis-2fill-extract-matmul-all-perms.mlir
@@ -18,15 +18,15 @@
   %0 = bufferization.alloc_tensor() : tensor<256x256xf32>

   // CHECK: {__inplace_operands_attr__ = ["none", "false"]}
-  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<256x256xf32>)
   // CHECK: {__inplace_operands_attr__ = ["none", "true"]}
-  %2 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  %2 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<256x256xf32>)
   // CHECK: {__inplace_operands_attr__ = ["true"]}
   %3 = tensor.extract_slice %1[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
   // CHECK: {__inplace_operands_attr__ = ["true"]}
   %4 = tensor.extract_slice %2[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
   // CHECK: {__inplace_operands_attr__ = ["true", "true", "true"]}
-  %5 = linalg.matmul ins(%3, %4 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  %5 = linalg.matmul ins(%3, %4 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>)
   return %5 : tensor<256x256xf32>
 }
@@ -45,15 +45,15 @@
   %0 = bufferization.alloc_tensor() : tensor<256x256xf32>

   // CHECK: {__inplace_operands_attr__ = ["none", "false"]}
-  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<256x256xf32>)
   // CHECK: {__inplace_operands_attr__ = ["none", "true"]}
-  %2 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  %2 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<256x256xf32>)
   // CHECK: {__inplace_operands_attr__ = ["true"]}
   %4 = tensor.extract_slice %2[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
   // CHECK: {__inplace_operands_attr__ = ["true"]}
   %3 = tensor.extract_slice %1[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
   // CHECK: {__inplace_operands_attr__ = ["true", "true", "true"]}
-  %5 = linalg.matmul ins(%3, %4 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  %5 = linalg.matmul ins(%3, %4 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>)
   return %5 : tensor<256x256xf32>
 }
@@ -72,15 +72,15 @@
   %0 = bufferization.alloc_tensor() : tensor<256x256xf32>

   // CHECK: {__inplace_operands_attr__ = ["none", "false"]}
-  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<256x256xf32>)
   // CHECK: {__inplace_operands_attr__ = ["true"]}
   %3 = tensor.extract_slice %1[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
   // CHECK: {__inplace_operands_attr__ = ["none", "true"]}
-  %2 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  %2 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<256x256xf32>)
   // CHECK: {__inplace_operands_attr__ = ["true"]}
   %4 = tensor.extract_slice %2[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
   // CHECK: {__inplace_operands_attr__ = ["true", "true", "true"]}
-  %5 = linalg.matmul ins(%3, %4 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  %5 = linalg.matmul ins(%3, %4 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>)
   return %5 : tensor<256x256xf32>
 }
@@ -99,15 +99,15 @@
   %0 = bufferization.alloc_tensor() : tensor<256x256xf32>

   // CHECK: {__inplace_operands_attr__ = ["none", "false"]}
-  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<256x256xf32>)
   // CHECK: {__inplace_operands_attr__ = ["true"]}
   %3 = tensor.extract_slice %1[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
   // CHECK: {__inplace_operands_attr__ = ["true"]}
   %4 = tensor.extract_slice %0[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
   // CHECK: {__inplace_operands_attr__ = ["none", "true"]}
-  %2 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<16x256xf32>) -> tensor<16x256xf32>
+  %2 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<16x256xf32>)
   // CHECK: {__inplace_operands_attr__ = ["true", "true", "true"]}
-  %5 = linalg.matmul ins(%3, %2 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  %5 = linalg.matmul ins(%3, %2 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>)
   return %5 : tensor<256x256xf32>
 }
@@ -126,15 +126,15 @@
   %0 = bufferization.alloc_tensor() : tensor<256x256xf32>

   // CHECK: {__inplace_operands_attr__ = ["none", "false"]}
-  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<256x256xf32>)
   // CHECK: {__inplace_operands_attr__ = ["true"]}
   %4 = tensor.extract_slice %0[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
   // CHECK: {__inplace_operands_attr__ = ["none", "true"]}
-  %2 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<16x256xf32>) -> tensor<16x256xf32>
+  %2 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<16x256xf32>)
   // CHECK: {__inplace_operands_attr__ = ["true"]}
   %3 = tensor.extract_slice %1[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
   // CHECK: {__inplace_operands_attr__ = ["true", "true", "true"]}
-  %5 = linalg.matmul ins(%3, %2 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  %5 = linalg.matmul ins(%3, %2 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>)
   return %5 : tensor<256x256xf32>
 }
@@ -153,15 +153,15 @@
   %0 = bufferization.alloc_tensor() : tensor<256x256xf32>

   // CHECK: {__inplace_operands_attr__ = ["none", "false"]}
-  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<256x256xf32>)
   // CHECK: {__inplace_operands_attr__ = ["true"]}
   %4 = tensor.extract_slice %0[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
   // CHECK: {__inplace_operands_attr__ = ["true"]}
   %3 = tensor.extract_slice %1[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
   // CHECK: {__inplace_operands_attr__ = ["none", "true"]}
-  %2 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<16x256xf32>) -> tensor<16x256xf32>
+  %2 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<16x256xf32>)
   // CHECK: {__inplace_operands_attr__ = ["true", "true", "true"]}
-  %5 = linalg.matmul ins(%3, %2 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  %5 = linalg.matmul ins(%3, %2 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>)
   return %5 : tensor<256x256xf32>
 }
@@ -180,15 +180,15 @@
   %0 = bufferization.alloc_tensor() : tensor<256x256xf32>

   // CHECK: {__inplace_operands_attr__ = ["none", "false"]}
-  %2 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  %2 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<256x256xf32>)
   // CHECK: {__inplace_operands_attr__ = ["none", "true"]}
-  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<256x256xf32>)
   // CHECK: {__inplace_operands_attr__ = ["true"]}
   %3 = tensor.extract_slice %1[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
   // CHECK: {__inplace_operands_attr__ = ["true"]}
   %4 = tensor.extract_slice %2[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
   // CHECK: {__inplace_operands_attr__ = ["true", "true", "true"]}
-  %5 = linalg.matmul ins(%3, %4 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  %5 = linalg.matmul ins(%3, %4 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>)
   return %5 : tensor<256x256xf32>
 }
@@ -207,15 +207,15 @@
   %0 = bufferization.alloc_tensor() : tensor<256x256xf32>

   // CHECK: {__inplace_operands_attr__ = ["none", "false"]}
-  %2 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  %2 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<256x256xf32>)
   // CHECK: {__inplace_operands_attr__ = ["none", "true"]}
-  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<256x256xf32>)
   // CHECK: {__inplace_operands_attr__ = ["true"]}
   %4 = tensor.extract_slice %2[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
   // CHECK: {__inplace_operands_attr__ = ["true"]}
   %3 = tensor.extract_slice %1[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
   // CHECK: {__inplace_operands_attr__ = ["true", "true", "true"]}
-  %5 = linalg.matmul ins(%3, %4 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  %5 = linalg.matmul ins(%3, %4 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>)
   return %5 : tensor<256x256xf32>
 }
@@ -234,15 +234,15 @@
   %0 = bufferization.alloc_tensor() : tensor<256x256xf32>

   // CHECK: {__inplace_operands_attr__ = ["none", "false"]}
-  %2 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  %2 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<256x256xf32>)
   // CHECK: {__inplace_operands_attr__ = ["true"]}
   %3 = tensor.extract_slice %0[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
   // CHECK: {__inplace_operands_attr__ = ["none", "true"]}
-  %1 = linalg.fill ins(%cst : f32) outs(%3 : tensor<256x16xf32>) -> tensor<256x16xf32>
+  %1 = linalg.fill ins(%cst : f32) outs(%3 : tensor<256x16xf32>)
   // CHECK: {__inplace_operands_attr__ = ["true"]}
   %4 = tensor.extract_slice %2[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
   // CHECK: {__inplace_operands_attr__ = ["true", "true", "true"]}
-  %5 = linalg.matmul ins(%1, %4 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  %5 = linalg.matmul ins(%1, %4 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>)
   return %5 : tensor<256x256xf32>
 }
@@ -261,15 +261,15 @@
   %0 = bufferization.alloc_tensor() : tensor<256x256xf32>

   // CHECK: {__inplace_operands_attr__ = ["none", "false"]}
-  %2 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  %2 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<256x256xf32>)
   // CHECK: {__inplace_operands_attr__ = ["true"]}
   %3 = tensor.extract_slice %0[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
   // CHECK: {__inplace_operands_attr__ = ["true"]}
   %4 = tensor.extract_slice %2[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
   // CHECK: {__inplace_operands_attr__ = ["none", "true"]}
-  %1 = linalg.fill ins(%cst : f32) outs(%3 : tensor<256x16xf32>) -> tensor<256x16xf32>
+  %1 = linalg.fill ins(%cst : f32) outs(%3 : tensor<256x16xf32>)
   // CHECK: {__inplace_operands_attr__ = ["true", "true", "true"]}
-  %5 = linalg.matmul ins(%1, %4 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  %5 = linalg.matmul ins(%1, %4 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>)
   return %5 : tensor<256x256xf32>
 }
@@ -288,15 +288,15 @@
   %0 = bufferization.alloc_tensor() : tensor<256x256xf32>

   // CHECK: {__inplace_operands_attr__ = ["none", "false"]}
-  %2 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  %2 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<256x256xf32>)
   // CHECK: {__inplace_operands_attr__ = ["true"]}
   %4 = tensor.extract_slice %2[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
   // CHECK: {__inplace_operands_attr__ = ["none", "true"]}
-  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<256x256xf32>)
   // CHECK: {__inplace_operands_attr__ = ["true"]}
   %3 = tensor.extract_slice %1[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
   // CHECK: {__inplace_operands_attr__ = ["true", "true", "true"]}
-  %5 = linalg.matmul ins(%3, %4 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  %5 = linalg.matmul ins(%3, %4 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>)
   return %5 : tensor<256x256xf32>
 }
@@ -315,15 +315,15 @@
   %0 = bufferization.alloc_tensor() : tensor<256x256xf32>

   // CHECK: {__inplace_operands_attr__ = ["none", "false"]}
-  %2 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  %2 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<256x256xf32>)
   // CHECK: {__inplace_operands_attr__ = ["true"]}
   %4 = tensor.extract_slice %2[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
   // CHECK: {__inplace_operands_attr__ = ["true"]}
   %3 = tensor.extract_slice %0[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
   // CHECK: {__inplace_operands_attr__ = ["none", "true"]}
-  %1 = linalg.fill ins(%cst : f32) outs(%3 : tensor<256x16xf32>) -> tensor<256x16xf32>
+  %1 = linalg.fill ins(%cst : f32) outs(%3 : tensor<256x16xf32>)
   // CHECK: {__inplace_operands_attr__ = ["true", "true", "true"]}
-  %5 = linalg.matmul ins(%1, %4 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  %5 = linalg.matmul ins(%1, %4 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>)
   return %5 : tensor<256x256xf32>
 }
@@ -344,13 +344,13 @@
   // CHECK: {__inplace_operands_attr__ = ["false"]}
   %3 = tensor.extract_slice %0[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
   // CHECK: {__inplace_operands_attr__ = ["none", "true"]}
-  %1 = linalg.fill ins(%cst : f32) outs(%3 : tensor<256x16xf32>) -> tensor<256x16xf32>
+  %1 = linalg.fill ins(%cst : f32) outs(%3 : tensor<256x16xf32>)
   // CHECK: {__inplace_operands_attr__ = ["none", "true"]}
-  %2 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  %2 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<256x256xf32>)
   // CHECK: {__inplace_operands_attr__ = ["true"]}
   %4 = tensor.extract_slice %2[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
   // CHECK: {__inplace_operands_attr__ = ["true", "true", "true"]}
-  %5 = linalg.matmul ins(%1, %4 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  %5 = linalg.matmul ins(%1, %4 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>)
   return %5 : tensor<256x256xf32>
 }
@@ -371,13 +371,13 @@
   // CHECK: {__inplace_operands_attr__ = ["false"]}
   %3 = tensor.extract_slice %0[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
   // CHECK: {__inplace_operands_attr__ = ["none", "true"]}
-  %1 = linalg.fill ins(%cst : f32) outs(%3 : tensor<256x16xf32>) -> tensor<256x16xf32>
+  %1 = linalg.fill ins(%cst : f32) outs(%3 : tensor<256x16xf32>)
   // CHECK: {__inplace_operands_attr__ = ["true"]}
   %4 = tensor.extract_slice %0[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
   // CHECK: {__inplace_operands_attr__ = ["none", "true"]}
-  %2 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<16x256xf32>) -> tensor<16x256xf32>
+  %2 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<16x256xf32>)
   // CHECK: {__inplace_operands_attr__ = ["true", "true", "true"]}
-  %5 = linalg.matmul ins(%1, %2 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  %5 = linalg.matmul ins(%1, %2 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>)
   return %5 : tensor<256x256xf32>
 }
@@ -397,13 +397,13 @@
   // CHECK: {__inplace_operands_attr__ = ["false"]}
   %3 = tensor.extract_slice %0[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
   // CHECK: {__inplace_operands_attr__ = ["none", "true"]}
-  %2 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  %2 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<256x256xf32>)
   // CHECK: {__inplace_operands_attr__ = ["none", "true"]}
-  %1 = linalg.fill ins(%cst : f32) outs(%3 : tensor<256x16xf32>) -> tensor<256x16xf32>
+  %1 = linalg.fill ins(%cst : f32) outs(%3 : tensor<256x16xf32>)
   // CHECK: {__inplace_operands_attr__ = ["true"]}
   %4 = tensor.extract_slice %2[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
   // CHECK:
{__inplace_operands_attr__ = ["true", "true", "true"]} - %5 = linalg.matmul ins(%1, %4 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32> + %5 = linalg.matmul ins(%1, %4 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) return %5 : tensor<256x256xf32> } @@ -424,13 +424,13 @@ // CHECK: {__inplace_operands_attr__ = ["false"]} %3 = tensor.extract_slice %0[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32> // CHECK: {__inplace_operands_attr__ = ["none", "true"]} - %2 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<256x256xf32>) -> tensor<256x256xf32> + %2 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<256x256xf32>) // CHECK: {__inplace_operands_attr__ = ["true"]} %4 = tensor.extract_slice %2[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32> // CHECK: {__inplace_operands_attr__ = ["none", "true"]} - %1 = linalg.fill ins(%cst : f32) outs(%3 : tensor<256x16xf32>) -> tensor<256x16xf32> + %1 = linalg.fill ins(%cst : f32) outs(%3 : tensor<256x16xf32>) // CHECK: {__inplace_operands_attr__ = ["true", "true", "true"]} - %5 = linalg.matmul ins(%1, %4 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32> + %5 = linalg.matmul ins(%1, %4 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) return %5 : tensor<256x256xf32> } @@ -453,11 +453,11 @@ // CHECK: {__inplace_operands_attr__ = ["true"]} %4 = tensor.extract_slice %0[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32> // CHECK: {__inplace_operands_attr__ = ["none", "true"]} - %1 = linalg.fill ins(%cst : f32) outs(%3 : tensor<256x16xf32>) -> tensor<256x16xf32> + %1 = linalg.fill ins(%cst : f32) outs(%3 : tensor<256x16xf32>) // CHECK: {__inplace_operands_attr__ = ["none", "true"]} - %2 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<16x256xf32>) -> tensor<16x256xf32> + %2 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<16x256xf32>) // CHECK: {__inplace_operands_attr__ = ["true", "true", "true"]} - %5 = linalg.matmul ins(%1, %2 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32> + %5 = linalg.matmul ins(%1, %2 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) return %5 : tensor<256x256xf32> } @@ -480,11 +480,11 @@ // CHECK: {__inplace_operands_attr__ = ["true"]} %4 = tensor.extract_slice %0[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32> // CHECK: {__inplace_operands_attr__ = ["none", "true"]} - %2 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<16x256xf32>) -> tensor<16x256xf32> + %2 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<16x256xf32>) // CHECK: {__inplace_operands_attr__ = ["none", "true"]} - %1 = linalg.fill ins(%cst : f32) outs(%3 : tensor<256x16xf32>) -> tensor<256x16xf32> + %1 = linalg.fill ins(%cst : f32) outs(%3 : tensor<256x16xf32>) // CHECK: {__inplace_operands_attr__ = ["true", "true", "true"]} - %5 = linalg.matmul ins(%1, %2 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32> + %5 = linalg.matmul ins(%1, %2 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) return %5 : tensor<256x256xf32> } @@ -505,13 +505,13 @@ // CHECK: {__inplace_operands_attr__ = ["false"]} %4 = tensor.extract_slice %0[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32> // CHECK: {__inplace_operands_attr__ = ["none", "true"]} - %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<256x256xf32>) -> 
tensor<256x256xf32> + %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<256x256xf32>) // CHECK: {__inplace_operands_attr__ = ["none", "true"]} - %2 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<16x256xf32>) -> tensor<16x256xf32> + %2 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<16x256xf32>) // CHECK: {__inplace_operands_attr__ = ["true"]} %3 = tensor.extract_slice %1[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32> // CHECK: {__inplace_operands_attr__ = ["true", "true", "true"]} - %5 = linalg.matmul ins(%3, %2 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32> + %5 = linalg.matmul ins(%3, %2 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) return %5 : tensor<256x256xf32> } @@ -532,13 +532,13 @@ // CHECK: {__inplace_operands_attr__ = ["false"]} %4 = tensor.extract_slice %0[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32> // CHECK: {__inplace_operands_attr__ = ["none", "true"]} - %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<256x256xf32>) -> tensor<256x256xf32> + %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<256x256xf32>) // CHECK: {__inplace_operands_attr__ = ["true"]} %3 = tensor.extract_slice %1[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32> // CHECK: {__inplace_operands_attr__ = ["none", "true"]} - %2 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<16x256xf32>) -> tensor<16x256xf32> + %2 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<16x256xf32>) // CHECK: {__inplace_operands_attr__ = ["true", "true", "true"]} - %5 = linalg.matmul ins(%3, %2 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32> + %5 = linalg.matmul ins(%3, %2 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) return %5 : tensor<256x256xf32> } @@ -559,13 +559,13 @@ // CHECK: {__inplace_operands_attr__ = ["false"]} %4 = tensor.extract_slice %0[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32> // CHECK: {__inplace_operands_attr__ = ["none", "true"]} - %2 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<16x256xf32>) -> tensor<16x256xf32> + %2 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<16x256xf32>) // CHECK: {__inplace_operands_attr__ = ["none", "true"]} - %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<256x256xf32>) -> tensor<256x256xf32> + %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<256x256xf32>) // CHECK: {__inplace_operands_attr__ = ["true"]} %3 = tensor.extract_slice %1[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32> // CHECK: {__inplace_operands_attr__ = ["true", "true", "true"]} - %5 = linalg.matmul ins(%3, %2 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32> + %5 = linalg.matmul ins(%3, %2 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) return %5 : tensor<256x256xf32> } @@ -586,13 +586,13 @@ // CHECK: {__inplace_operands_attr__ = ["false"]} %4 = tensor.extract_slice %0[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32> // CHECK: {__inplace_operands_attr__ = ["none", "true"]} - %2 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<16x256xf32>) -> tensor<16x256xf32> + %2 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<16x256xf32>) // CHECK: {__inplace_operands_attr__ = ["true"]} %3 = tensor.extract_slice %0[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32> // CHECK: {__inplace_operands_attr__ = ["none", "true"]} - %1 = linalg.fill ins(%cst : f32) 
outs(%3 : tensor<256x16xf32>) -> tensor<256x16xf32> + %1 = linalg.fill ins(%cst : f32) outs(%3 : tensor<256x16xf32>) // CHECK: {__inplace_operands_attr__ = ["true", "true", "true"]} - %5 = linalg.matmul ins(%1, %2 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32> + %5 = linalg.matmul ins(%1, %2 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) return %5 : tensor<256x256xf32> } @@ -615,11 +615,11 @@ // CHECK: {__inplace_operands_attr__ = ["true"]} %3 = tensor.extract_slice %0[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32> // CHECK: {__inplace_operands_attr__ = ["none", "true"]} - %1 = linalg.fill ins(%cst : f32) outs(%3 : tensor<256x16xf32>) -> tensor<256x16xf32> + %1 = linalg.fill ins(%cst : f32) outs(%3 : tensor<256x16xf32>) // CHECK: {__inplace_operands_attr__ = ["none", "true"]} - %2 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<16x256xf32>) -> tensor<16x256xf32> + %2 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<16x256xf32>) // CHECK: {__inplace_operands_attr__ = ["true", "true", "true"]} - %5 = linalg.matmul ins(%1, %2 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32> + %5 = linalg.matmul ins(%1, %2 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) return %5 : tensor<256x256xf32> } @@ -642,10 +642,10 @@ // CHECK: {__inplace_operands_attr__ = ["true"]} %3 = tensor.extract_slice %0[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32> // CHECK: {__inplace_operands_attr__ = ["none", "true"]} - %2 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<16x256xf32>) -> tensor<16x256xf32> + %2 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<16x256xf32>) // CHECK: {__inplace_operands_attr__ = ["none", "true"]} - %1 = linalg.fill ins(%cst : f32) outs(%3 : tensor<256x16xf32>) -> tensor<256x16xf32> + %1 = linalg.fill ins(%cst : f32) outs(%3 : tensor<256x16xf32>) // CHECK: {__inplace_operands_attr__ = ["true", "true", "true"]} - %5 = linalg.matmul ins(%1, %2 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32> + %5 = linalg.matmul ins(%1, %2 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) return %5 : tensor<256x256xf32> } diff --git a/mlir/test/Dialect/Linalg/one-shot-bufferize.mlir b/mlir/test/Dialect/Linalg/one-shot-bufferize.mlir --- a/mlir/test/Dialect/Linalg/one-shot-bufferize.mlir +++ b/mlir/test/Dialect/Linalg/one-shot-bufferize.mlir @@ -23,7 +23,7 @@ /// Inplaceable, no alloc // CHECK-NOT: alloc // CHECK: linalg.fill ins(%[[F0]] : f32) outs(%[[A]] : memref>) - %r = linalg.fill ins(%f0 : f32) outs(%A : tensor) -> tensor + %r = linalg.fill ins(%f0 : f32) outs(%A : tensor) // CHECK: return // CHECK-NOT: tensor @@ -46,7 +46,7 @@ // CHECK: %[[D0:.*]] = memref.dim %[[A]], {{.*}} : memref> // CHECK: %[[ALLOC:.*]] = memref.alloc(%[[D0]]) {alignment = 64 : i64} : memref // CHECK: linalg.fill ins(%[[F0]] : f32) outs(%[[ALLOC]] : memref) - %r = linalg.fill ins(%f0 : f32) outs(%A : tensor) -> tensor + %r = linalg.fill ins(%f0 : f32) outs(%A : tensor) // CHECK-NOT: dealloc // CHECK: return %[[ALLOC]] : memref @@ -68,14 +68,13 @@ /// Cross-op multiple uses of %A, the first op which has interfering reads must alloc. 
// CHECK: %[[ALLOC:.*]] = memref.alloc
// CHECK: linalg.fill ins({{.*}}{{.*}}outs(%[[ALLOC]]
- %f = linalg.fill ins(%f0 : f32) outs(%A : tensor) -> tensor
+ %f = linalg.fill ins(%f0 : f32) outs(%A : tensor)
/// The second op has no interfering reads and can reuse.
// CHECK-NOT: alloc
// CHECK: linalg.matmul ins(%[[ALLOC]], %[[ALLOC]]{{.*}}) outs(%[[A]]
%r = linalg.matmul ins(%f, %f: tensor, tensor) outs(%A: tensor)
- -> tensor
// CHECK: memref.dealloc %[[ALLOC]]
// CHECK: return
@@ -92,7 +91,6 @@
// CHECK: alloc
%r = linalg.matmul ins(%A, %A: tensor, tensor) outs(%A: tensor)
- -> tensor
// CHECK-NOT: dealloc
return %r: tensor
}
@@ -182,7 +180,7 @@
// linalg.fill is inplace.
// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[C_SLICE]]
- %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<8x16xf32>) -> tensor<8x16xf32>
+ %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<8x16xf32>)
// CHECK: scf.for %[[K:.*]] =
%6 = scf.for %arg7 = %c0 to %c256 step %c32 iter_args(%arg8 = %5) -> (tensor<8x16xf32>) {
@@ -195,7 +193,6 @@
// CHECK: linalg.matmul ins({{.*}} outs(%[[C_SLICE]]
%10 = linalg.matmul ins(%8, %9 : tensor<8x32xf32>, tensor<32x16xf32>) outs(%arg8 : tensor<8x16xf32>)
- -> tensor<8x16xf32>
scf.yield %10 : tensor<8x16xf32>
}
@@ -231,7 +228,7 @@
%sA = tensor.extract_slice %A[0, 0][%idx, %idx][1, 1] : tensor to tensor
%ssA = tensor.extract_slice %sA[0, 0][4, 4][1, 1] : tensor to tensor<4x4xf32>
- %FA = linalg.fill ins(%f0 : f32) outs(%ssA : tensor<4x4xf32>) -> tensor<4x4xf32>
+ %FA = linalg.fill ins(%f0 : f32) outs(%ssA : tensor<4x4xf32>)
%rsA = tensor.insert_slice %FA into %sA[0, 0][4, 4][1, 1] : tensor<4x4xf32> into tensor
%rA = tensor.insert_slice %rsA into %A[0, 0][%idx, %idx][1, 1] : tensor into tensor
diff --git a/mlir/test/Dialect/Linalg/reshape_control_fusion.mlir b/mlir/test/Dialect/Linalg/reshape_control_fusion.mlir
--- a/mlir/test/Dialect/Linalg/reshape_control_fusion.mlir
+++ b/mlir/test/Dialect/Linalg/reshape_control_fusion.mlir
@@ -50,7 +50,7 @@
} -> tensor
%0 = tensor.expand_shape %fill [[0, 1], [2]] : tensor into tensor<1x?x?xf32>
%1 = linalg.batch_matmul ins(%arg0, %arg1 : tensor<1x?x?xf32>, tensor<1x?x?xf32>)
- outs(%0 : tensor<1x?x?xf32>) -> tensor<1x?x?xf32>
+ outs(%0 : tensor<1x?x?xf32>)
return %1 : tensor<1x?x?xf32>
}
// CHECK-DAG: #[[MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)
diff --git a/mlir/test/Dialect/Linalg/resolve-shaped-type-result-dims.mlir b/mlir/test/Dialect/Linalg/resolve-shaped-type-result-dims.mlir
--- a/mlir/test/Dialect/Linalg/resolve-shaped-type-result-dims.mlir
+++ b/mlir/test/Dialect/Linalg/resolve-shaped-type-result-dims.mlir
@@ -111,7 +111,7 @@
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%0 = linalg.matmul ins(%arg0, %arg1 : tensor, tensor)
- outs(%arg2 : tensor) -> tensor
+ outs(%arg2 : tensor)
%1 = tensor.dim %0, %c0 : tensor
%2 = tensor.dim %0, %c1 : tensor
%3 = linalg.generic
diff --git a/mlir/test/Dialect/Linalg/roundtrip.mlir b/mlir/test/Dialect/Linalg/roundtrip.mlir
--- a/mlir/test/Dialect/Linalg/roundtrip.mlir
+++ b/mlir/test/Dialect/Linalg/roundtrip.mlir
@@ -183,9 +183,9 @@
-> (tensor, tensor) {
%c0 = arith.constant 0 : index
%0 = tensor.empty() : tensor
- %1 = linalg.fill ins(%arg2 : i32) outs(%0 : tensor) -> tensor
+ %1 = linalg.fill ins(%arg2 : i32) outs(%0 : tensor)
%2 = tensor.empty() : tensor
- %3 = linalg.fill ins(%arg2 : i32) outs(%2 : tensor) -> tensor
+ %3 = linalg.fill ins(%arg2 : i32) outs(%2 : tensor)
%4:2 = linalg.generic {
indexing_maps = [affine_map<(d0) -> (d0)>,
affine_map<(d0) -> (d0)>,
affine_map<(d0) -> ()>,
affine_map<(d0) -> ()>],
iterator_types = ["reduction"]}
@@ -287,7 +287,6 @@
%res1 = linalg.batch_matmul ins(%ta3, %tb3: tensor, tensor)
outs(%tc3: tensor)
- -> tensor
return %res1 : tensor
}
// CHECK-LABEL: func @named_ops
@@ -298,10 +297,10 @@
func.func @fill_tensor(%arg0 : index, %arg1 : index, %arg2 : f32) -> tensor {
%0 = tensor.empty(%arg0, %arg1) : tensor
- %1 = linalg.fill ins(%arg2 : f32) outs(%0 : tensor) -> tensor
+ %1 = linalg.fill ins(%arg2 : f32) outs(%0 : tensor)
return %1 : tensor
}
-// CHECK: %{{.+}} = linalg.fill ins(%{{.+}} : f32) outs(%{{.+}} : tensor) -> tensor
+// CHECK: %{{.+}} = linalg.fill ins(%{{.+}} : f32) outs(%{{.+}} : tensor)
// -----
diff --git a/mlir/test/Dialect/Linalg/swap-extract-slice-with-fill.mlir b/mlir/test/Dialect/Linalg/swap-extract-slice-with-fill.mlir
--- a/mlir/test/Dialect/Linalg/swap-extract-slice-with-fill.mlir
+++ b/mlir/test/Dialect/Linalg/swap-extract-slice-with-fill.mlir
@@ -4,11 +4,11 @@
// CHECK-SAME: (%[[INIT:.+]]: tensor, %[[OFFSET0:.+]]: index, %[[SIZE1:.+]]: index)
// CHECK: %[[F0:.+]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[EXT:.+]] = tensor.extract_slice %[[INIT]][%[[OFFSET0]], 8, 4] [1, %[[SIZE1]], 6] [1, 3, 1]
-// CHECK: %[[FILL:.+]] = linalg.fill ins(%[[F0]] : f32) outs(%[[EXT]] : tensor) -> tensor
+// CHECK: %[[FILL:.+]] = linalg.fill ins(%[[F0]] : f32) outs(%[[EXT]] : tensor)
// CHECK: return %[[FILL]]
func.func @swap_fill_insert_slice(%init : tensor, %offset0: index, %size1: index) -> tensor {
%f0 = arith.constant 0.000000e+00 : f32
- %0 = linalg.fill ins(%f0 : f32) outs(%init : tensor) -> tensor
+ %0 = linalg.fill ins(%f0 : f32) outs(%init : tensor)
%1 = tensor.extract_slice %0[%offset0, 8, 4] [1, %size1, 6] [1, 3, 1] : tensor to tensor
return %1: tensor
@@ -21,7 +21,7 @@
// CHECK: tensor.extract_slice
func.func @dont_swap_fill_insert_slice_multi_user(%init : tensor, %offset0: index, %size1: index) -> (tensor, tensor<2x?x6xf32>) {
%f0 = arith.constant 0.000000e+00 : f32
- %0 = linalg.fill ins(%f0 : f32) outs(%init : tensor) -> tensor
+ %0 = linalg.fill ins(%f0 : f32) outs(%init : tensor)
%1 = tensor.extract_slice %0[%offset0, 8, 4] [2, %size1, 6] [1, 3, 1] : tensor to tensor<2x?x6xf32>
return %0, %1: tensor, tensor<2x?x6xf32>
diff --git a/mlir/test/Dialect/Linalg/tile-and-fuse-tensors.mlir b/mlir/test/Dialect/Linalg/tile-and-fuse-tensors.mlir
--- a/mlir/test/Dialect/Linalg/tile-and-fuse-tensors.mlir
+++ b/mlir/test/Dialect/Linalg/tile-and-fuse-tensors.mlir
@@ -3,7 +3,6 @@
func.func @matmul_tensors(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor {
%t0 = linalg.matmul ins(%arg0, %arg1: tensor, tensor) outs(%arg2: tensor)
- -> tensor
%c4 = arith.constant 4 : index
%c2 = arith.constant 2 : index
@@ -19,7 +18,7 @@
%6 = tensor.extract_slice %t0[%arg3, %arg7][%c2, 4][1, 1] : tensor to tensor
%7 = tensor.extract_slice %arg1[%arg7, %arg5][4, %c3][1, 1] : tensor to tensor<4x?xf32>
%8 = tensor.extract_slice %arg8[%arg3, %arg5][%c2, %c3][1, 1] : tensor to tensor
- %9 = linalg.matmul ins(%6, %7 : tensor, tensor<4x?xf32>) outs(%8 : tensor) -> tensor
+ %9 = linalg.matmul ins(%6, %7 : tensor, tensor<4x?xf32>) outs(%8 : tensor)
%10 = tensor.insert_slice %9 into %arg8[%arg3, %arg5] [%c2, %c3] [1, 1] : tensor into tensor
scf.yield %10 : tensor
}
@@ -50,8 +49,8 @@
// slices of the producing matmul.
// CHECK-DAG: %[[stB2:.*]] = tensor.extract_slice %[[B]][0, %[[K]]] [%[[dB0]], 4] [1, 1] : tensor to tensor
// CHECK-DAG: %[[stC:.*]] = tensor.extract_slice %[[C]][%[[I]], %[[K]]] [2, 4] [1, 1] : tensor to tensor<2x4xf32>
-// CHECK: %[[stD:.*]] = linalg.matmul ins(%[[stA]], %[[stB2]] : tensor<2x?xf32>, tensor) outs(%[[stC]] : tensor<2x4xf32>) -> tensor<2x4xf32>
-// CHECK-NEXT: %[[stG:.*]] = linalg.matmul ins(%[[stD]], %[[stB1]] : tensor<2x4xf32>, tensor<4x3xf32>) outs(%[[stF]] : tensor<2x3xf32>) -> tensor<2x3xf32>
+// CHECK: %[[stD:.*]] = linalg.matmul ins(%[[stA]], %[[stB2]] : tensor<2x?xf32>, tensor) outs(%[[stC]] : tensor<2x4xf32>)
+// CHECK-NEXT: %[[stG:.*]] = linalg.matmul ins(%[[stD]], %[[stB1]] : tensor<2x4xf32>, tensor<4x3xf32>) outs(%[[stF]] : tensor<2x3xf32>)
// CHECK-NEXT: tensor.insert_slice %[[stG]] into %[[RES]][%[[I]], %[[J]]]
// -----
@@ -66,12 +65,12 @@
%cst = arith.constant 0.0 : f32
%init = tensor.empty() : tensor<1x112x112x32xf32>
- %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
+ %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x112x112x32xf32>)
%conv = linalg.conv_2d_nhwc_hwcf
{dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%input, %filter : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>)
- outs(%fill : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
+ outs(%fill : tensor<1x112x112x32xf32>)
%for0 = scf.for %iv0 = %c0 to %c112 step %c8 iter_args(%arg0 = %fill) -> tensor<1x112x112x32xf32> {
%for1 = scf.for %iv1 = %c0 to %c112 step %c16 iter_args(%arg1 = %arg0) -> tensor<1x112x112x32xf32> {
@@ -110,7 +109,7 @@
// CHECK-SAME: (%[[INPUT:.+]]: tensor<1x225x225x3xf32>, %[[FILTER:.+]]: tensor<3x3x3x32xf32>, %[[ELEM:.+]]: tensor<1x112x112x32xf32>)
// CHECK: %[[INIT:.+]] = tensor.empty() : tensor<1x112x112x32xf32>
-// CHECK-NEXT: %[[FILL:.+]] = linalg.fill ins(%cst : f32) outs(%[[INIT]] : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
+// CHECK-NEXT: %[[FILL:.+]] = linalg.fill ins(%cst : f32) outs(%[[INIT]] : tensor<1x112x112x32xf32>)
// CHECK-NEXT: scf.for %[[IV0:.+]] = %{{.+}} to %{{.+}} step %{{.+}} iter_args(%[[ARG0:.+]] = %[[FILL]])
// CHECK-NEXT: %[[OFFSET_H:.+]] = affine.apply #[[MAP0]](%[[IV0]])
@@ -148,12 +147,12 @@
%oc = tensor.dim %elementwise, %c3 : tensor
%init = tensor.empty(%n, %oh, %ow, %oc) : tensor
- %fill = linalg.fill ins(%cst : f32) outs(%init : tensor) -> tensor
+ %fill = linalg.fill ins(%cst : f32) outs(%init : tensor)
%conv = linalg.conv_2d_nhwc_hwcf
{dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%input, %filter : tensor, tensor)
- outs(%fill : tensor) -> tensor
+ outs(%fill : tensor)
%for0 = scf.for %iv0 = %c0 to %n step %c8 iter_args(%arg0 = %fill) -> tensor {
%for1 = scf.for %iv1 = %c0 to %oh step %c16 iter_args(%arg1 = %arg0) -> tensor {
@@ -217,7 +216,7 @@
// CHECK-DAG: %[[ELEM_OC:.+]] = tensor.dim %[[ELEM]], %[[C3]] : tensor
// CHECK: %[[INIT:.+]] = tensor.empty(%[[ELEM_N]], %[[ELEM_OH]], %[[ELEM_OW]], %[[ELEM_OC]]) : tensor
-// CHECK: %[[FILL:.+]] = linalg.fill ins(%cst : f32) outs(%[[INIT]] : tensor) -> tensor
+// CHECK: %[[FILL:.+]] = linalg.fill ins(%cst : f32) outs(%[[INIT]] : tensor)
// CHECK-DAG: %[[FILTER_H:.+]] = tensor.dim %[[FILTER]], %[[C0]] : tensor
// CHECK-DAG: %[[FILTER_W:.+]] = tensor.dim %[[FILTER]], %[[C1]] : tensor
@@ -256,7 +255,7 @@
// CHECK-SAME: [%[[SIZE_INPUT_N]], %[[SIZE_ELEM_OH_2]], %[[SIZE_ELEM_OW_2]], %[[SIZE_ELEM_OC_2]]]
// CHECK-NEXT: %[[ST_CONV:.+]] = linalg.conv_2d_nhwc_hwcf
// CHECK-SAME: ins(%[[ST_INPUT]], %[[ST_FILTER]] : tensor, tensor)
-// CHECK-SAME: outs(%[[ST_FILL]] : tensor) -> tensor
+// CHECK-SAME: outs(%[[ST_FILL]] : tensor)
// CHECK-NEXT: %[[ST_ADD:.+]] = linalg.generic
// CHECK-SAME: ins(%[[ST_CONV]], %[[ST_ELEM]] : tensor, tensor)
// CHECK-SAME: outs(%[[ST_ARG]] : tensor)
@@ -301,7 +300,7 @@
tensor.yield %zero : f32
} : tensor<58x1xf32> to tensor<64x128xf32>
- %fill = linalg.fill ins(%zero : f32) outs(%large_input : tensor<64x128xf32>) -> tensor<64x128xf32>
+ %fill = linalg.fill ins(%zero : f32) outs(%large_input : tensor<64x128xf32>)
%for0 = scf.for %iv0 = %c0 to %d0 step %c16 iter_args(%arg0 = %fill) -> tensor<64x128xf32> {
%for1 = scf.for %iv1 = %c0 to %d1 step %c32 iter_args(%arg1 = %arg0) -> tensor<64x128xf32> {
diff --git a/mlir/test/Dialect/Linalg/tile-tensors.mlir b/mlir/test/Dialect/Linalg/tile-tensors.mlir
--- a/mlir/test/Dialect/Linalg/tile-tensors.mlir
+++ b/mlir/test/Dialect/Linalg/tile-tensors.mlir
@@ -14,14 +14,13 @@
// CHECK: %[[sTB:.*]] = tensor.extract_slice %[[TB]][{{.*}}] : tensor to tensor
// CHECK: %[[sTC:.*]] = tensor.extract_slice %[[TC2]][{{.*}}] : tensor to tensor
// CHECK: %[[sTD:.*]] = linalg.matmul ins(%[[sTA]], %[[sTB]] : tensor, tensor)
-// CHECK-SAME: outs(%[[sTC]] : tensor) -> tensor
+// CHECK-SAME: outs(%[[sTC]] : tensor)
// CHECK: %[[TD:.*]] = tensor.insert_slice %[[sTD]] into %[[TC2]][{{.*}}] : tensor into tensor
// CHECK: scf.yield %[[TD]] : tensor
// CHECK: scf.yield %[[TD2]] : tensor
// CHECK: scf.yield %[[TD1]] : tensor
%0 = linalg.matmul ins(%arg0, %arg1: tensor, tensor) outs(%arg2: tensor)
- -> tensor
// CHECK: return %[[TD0]] : tensor
return %0 : tensor
diff --git a/mlir/test/Dialect/Linalg/tile-to-foreach-thread.mlir b/mlir/test/Dialect/Linalg/tile-to-foreach-thread.mlir
--- a/mlir/test/Dialect/Linalg/tile-to-foreach-thread.mlir
+++ b/mlir/test/Dialect/Linalg/tile-to-foreach-thread.mlir
@@ -21,14 +21,14 @@
// CHECK: %[[tC:.*]] = tensor.extract_slice %[[C_BLK]]{{.*}} : tensor to tensor
// CHECK: %[[RES:.*]] = linalg.matmul
// CHECK-SAME: ins(%[[tA]], %[[tB]] : tensor, tensor)
-// CHECK-SAME: outs(%[[tC]] : tensor) -> tensor
+// CHECK-SAME: outs(%[[tC]] : tensor)
// CHECK: scf.foreach_thread.perform_concurrently {
// CHECK-NEXT: tensor.parallel_insert_slice %[[RES]] into %[[C_BLK]]{{.*}} :
// CHECK-SAME: tensor into tensor
// CHECK-NEXT: }
// CHECK-NEXT: } {mapping = [#gpu.thread, #gpu.thread]}
%0 = linalg.matmul ins(%A, %B : tensor, tensor)
- outs(%C : tensor) -> (tensor)
+ outs(%C : tensor)
return %0 : tensor
}
@@ -70,7 +70,7 @@
%tile_size_1 = "test.dummy"() : () -> (index)
%tile_size_2 = "test.dummy"() : () -> (index)
%0 = linalg.matmul ins(%A, %B : tensor, tensor)
- outs(%C : tensor) -> (tensor)
+ outs(%C : tensor)
return %0 : tensor
}
@@ -111,7 +111,7 @@
// CHECK: scf.foreach_thread.perform_concurrently
// CHECK-NEXT: tensor.parallel_insert_slice
%0 = linalg.matmul ins(%A, %B : tensor<100x200xf32>, tensor<200x300xf32>)
- outs(%C : tensor<100x300xf32>) -> (tensor<100x300xf32>)
+ outs(%C : tensor<100x300xf32>)
return %0 : tensor<100x300xf32>
}
@@ -152,7 +152,7 @@
// CHECK: scf.foreach_thread.perform_concurrently
// CHECK-NEXT: tensor.parallel_insert_slice
%0 = linalg.matmul ins(%A, %B : tensor, tensor)
- outs(%C : tensor) -> (tensor)
+ outs(%C : tensor)
return %0 : tensor
}
@@ -190,7 +190,7 @@
// CHECK: scf.foreach_thread.perform_concurrently
// CHECK-NEXT: tensor.parallel_insert_slice
%0 = linalg.matmul ins(%A, %B : tensor<100x200xf32>, tensor<200x300xf32>)
- outs(%C : tensor<100x300xf32>) -> (tensor<100x300xf32>)
+ outs(%C : tensor<100x300xf32>)
return %0 : tensor<100x300xf32>
}
@@ -263,7 +263,7 @@
// CHECK-NEXT: tensor.parallel_insert_slice
%tile_size = "test.dummy"() : () -> (index)
%0 = linalg.matmul ins(%A, %B : tensor, tensor)
- outs(%C : tensor) -> (tensor)
+ outs(%C : tensor)
return %0 : tensor
}
diff --git a/mlir/test/Dialect/Linalg/transform-op-decompose.mlir b/mlir/test/Dialect/Linalg/transform-op-decompose.mlir
--- a/mlir/test/Dialect/Linalg/transform-op-decompose.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-decompose.mlir
@@ -13,7 +13,7 @@
%0 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>,
strides = dense<1> : tensor<2xi64>}
ins (%input, %filter: tensor, tensor<1x?x?x?xf32>)
- outs (%init: tensor) -> tensor
+ outs (%init: tensor)
// CHECK: return %[[RES]]
return %0 : tensor
}
@@ -31,7 +31,7 @@
%0 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : tensor<2xi64>,
strides = dense<1> : tensor<2xi64>}
ins (%input, %filter: tensor, tensor)
- outs (%init: tensor) -> tensor
+ outs (%init: tensor)
// CHECK: return %[[RES]]
return %0 : tensor
}
@@ -51,7 +51,7 @@
// CHECK: %[[INSERTED:.+]] = tensor.insert_slice %[[OPRES]] into %[[RES]]
%0 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>}
ins(%input, %filter: tensor<1x1x113x96xf32>, tensor<1x3x96xf32>)
- outs(%init: tensor<1x1x56x96xf32>) -> tensor<1x1x56x96xf32>
+ outs(%init: tensor<1x1x56x96xf32>)
// CHECK: %[[INSERTED]]
return %0: tensor<1x1x56x96xf32>
}
@@ -69,7 +69,7 @@
%0 = linalg.pooling_nhwc_sum {dilations = dense<1> : tensor<2xi64>,
strides = dense<1> : tensor<2xi64>}
ins (%input, %filter: tensor, tensor<1x?xf32>)
- outs (%init: tensor) -> tensor
+ outs (%init: tensor)
// CHECK: return %[[RES]]
return %0 : tensor
}
@@ -87,7 +87,7 @@
%0 = linalg.pooling_nchw_sum {dilations = dense<1> : tensor<2xi64>,
strides = dense<1> : tensor<2xi64>}
ins (%input, %filter: tensor, tensor<1x?xf32>)
- outs (%init: tensor) -> tensor
+ outs (%init: tensor)
// CHECK: return %[[RES]]
return %0 : tensor
}
@@ -105,7 +105,7 @@
%0 = linalg.pooling_nhwc_max {dilations = dense<1> : tensor<2xi64>,
strides = dense<1> : tensor<2xi64>}
ins (%input, %filter: tensor, tensor<1x?xf32>)
- outs (%init: tensor) -> tensor
+ outs (%init: tensor)
// CHECK: return %[[RES]]
return %0 : tensor
}
@@ -123,7 +123,7 @@
%0 = linalg.pooling_nhwc_max_unsigned {dilations = dense<1> : tensor<2xi64>,
strides = dense<1> : tensor<2xi64>}
ins (%input, %filter: tensor, tensor<1x?xf32>)
- outs (%init: tensor) -> tensor
+ outs (%init: tensor)
// CHECK: return %[[RES]]
return %0 : tensor
}
@@ -141,7 +141,7 @@
%0 = linalg.pooling_nhwc_min {dilations = dense<1> : tensor<2xi64>,
strides = dense<1> : tensor<2xi64>}
ins (%input, %filter: tensor, tensor<1x?xf32>)
- outs (%init: tensor) -> tensor
+ outs (%init: tensor)
// CHECK: return %[[RES]]
return %0 : tensor
}
@@ -159,7 +159,7 @@
%0 = linalg.pooling_nhwc_min_unsigned {dilations = dense<1> : tensor<2xi64>,
strides = dense<1> : tensor<2xi64>}
ins (%input, %filter: tensor, tensor<1x?xf32>)
- outs (%init: tensor) -> tensor
+ outs (%init: tensor)
// CHECK: return %[[RES]]
return %0 : tensor
}
@@ -177,7 +177,7 @@
%0 = linalg.pooling_nchw_max {dilations = dense<1> : tensor<2xi64>,
strides = dense<1> : tensor<2xi64>}
ins (%input, %filter: tensor, tensor<1x?xf32>)
- outs (%init: tensor) -> tensor
+ outs (%init: tensor)
// CHECK: return %[[RES]]
return %0 : tensor
}
diff --git a/mlir/test/Dialect/Linalg/transform-op-fuse-into-containing.mlir b/mlir/test/Dialect/Linalg/transform-op-fuse-into-containing.mlir
--- a/mlir/test/Dialect/Linalg/transform-op-fuse-into-containing.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-fuse-into-containing.mlir
@@ -12,7 +12,7 @@
func.func @fuse_tileable_op(%arg0: index, %arg1: tensor, %arg2: tensor) -> tensor {
%cst = arith.constant 4.200000e+01 : f32
%c0 = arith.constant 0 : index
- %0 = linalg.fill ins(%cst : f32) outs(%arg1 : tensor) -> tensor
+ %0 = linalg.fill ins(%cst : f32) outs(%arg1 : tensor)
%d0 = tensor.dim %arg1, %c0 : tensor
%1 = affine.apply #map0()[%d0, %arg0]
@@ -27,7 +27,7 @@
%6 = tensor.extract_slice %0[%3] [%4] [1] : tensor to tensor
// CHECK: %[[T2:.*]] = linalg.elemwise_unary ins(%[[T1]]
- %7 = linalg.elemwise_unary ins(%6 : tensor) outs(%5 : tensor) -> tensor
+ %7 = linalg.elemwise_unary ins(%6 : tensor) outs(%5 : tensor)
scf.foreach_thread.perform_concurrently {
tensor.parallel_insert_slice %7 into %o[%3] [%4] [1] : tensor into tensor
}
@@ -74,7 +74,7 @@
%5 = tensor.extract_slice %o[%3] [%4] [1] : tensor<64xf32> to tensor
// CHECK: %[[T2:.*]] = linalg.elemwise_unary ins(%[[INIT_TENSOR]]
- %7 = linalg.elemwise_unary ins(%0 : tensor) outs(%5 : tensor) -> tensor
+ %7 = linalg.elemwise_unary ins(%0 : tensor) outs(%5 : tensor)
scf.foreach_thread.perform_concurrently {
tensor.parallel_insert_slice %7 into %o[%3] [%4] [1] : tensor into tensor<64xf32>
}
@@ -108,7 +108,7 @@
func.func @fuse_tileable_op_rank_reducing(%arg0: index, %arg1: tensor, %arg2: tensor) -> tensor {
%cst = arith.constant 4.200000e+01 : f32
%c0 = arith.constant 0 : index
- %0 = linalg.fill ins(%cst : f32) outs(%arg2 : tensor) -> tensor
+ %0 = linalg.fill ins(%cst : f32) outs(%arg2 : tensor)
%d0 = tensor.dim %arg1, %c0 : tensor
// CHECK: scf.foreach_thread {{.*}} -> (tensor) {
@@ -116,7 +116,7 @@
%5 = tensor.extract_slice %o[%arg3] [1] [1] : tensor to tensor
// CHECK: tensor.extract_slice %{{.*}}[%{{.*}}] [1] [1] : tensor to tensor<1xf32>
-// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%{{.*}} : tensor<1xf32>) -> tensor<1xf32>
+// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%{{.*}} : tensor<1xf32>)
// CHECK: tensor.extract_slice %{{.*}}[0] [1] [1] : tensor<1xf32> to tensor
// CHECK: func.call @foo(%{{.*}}) : (tensor) -> tensor
%7 = func.call @foo(%5) : (tensor) -> tensor
@@ -154,7 +154,7 @@
func.func @fuse_tileable_op_through_bbarg(%arg0: index, %arg1: tensor, %arg2: tensor) -> tensor {
%cst = arith.constant 4.200000e+01 : f32
%c0 = arith.constant 0 : index
- %0 = linalg.fill ins(%cst : f32) outs(%arg2 : tensor) -> tensor
+ %0 = linalg.fill ins(%cst : f32) outs(%arg2 : tensor)
%d0 = tensor.dim %arg1, %c0 : tensor
%1 = affine.apply #map0()[%d0, %arg0]
@@ -169,7 +169,7 @@
%6 = tensor.extract_slice %arg1[%3] [%4] [1] : tensor to tensor
// CHECK: %[[T2:.*]] = linalg.elemwise_unary {{.*}} outs(%[[T1]]
- %7 = linalg.elemwise_unary ins(%6 : tensor) outs(%5 : tensor) -> tensor
+ %7 = linalg.elemwise_unary ins(%6 : tensor) outs(%5 : tensor)
scf.foreach_thread.perform_concurrently {
tensor.parallel_insert_slice %7 into %o[%3] [%4] [1] : tensor into tensor
}
@@ -229,7 +229,7 @@
%6 = tensor.extract_slice %0#0[%3] [%4] [1] : tensor to tensor
// CHECK: %[[T2:.*]] = linalg.elemwise_unary ins(%[[T1]]#0
- %7 = linalg.elemwise_unary ins(%6 : tensor) outs(%5 : tensor) -> tensor
+ %7 = linalg.elemwise_unary ins(%6 : tensor) outs(%5 : tensor)
scf.foreach_thread.perform_concurrently {
tensor.parallel_insert_slice %7 into %o[%3] [%4] [1] : tensor into tensor
}
diff --git a/mlir/test/Dialect/Linalg/transform-op-fuse.mlir b/mlir/test/Dialect/Linalg/transform-op-fuse.mlir
--- a/mlir/test/Dialect/Linalg/transform-op-fuse.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-fuse.mlir
@@ -9,9 +9,9 @@
// CHECK: linalg.elemwise_binary
// CHECK: return %[[RES]]
%0 = linalg.elemwise_unary ins(%arg0 : tensor)
- outs(%arg1: tensor) -> tensor
+ outs(%arg1: tensor)
%1 = linalg.elemwise_binary ins(%0, %arg0 : tensor, tensor)
- outs(%arg1: tensor) -> tensor
+ outs(%arg1: tensor)
return %1 : tensor
}
@@ -36,9 +36,9 @@
// CHECK: linalg.elemwise_binary
// CHECK: return %[[RES]]
%0 = linalg.elemwise_unary ins(%arg0 : tensor)
- outs(%arg1: tensor) -> tensor
+ outs(%arg1: tensor)
%1 = linalg.elemwise_binary ins(%0, %arg0 : tensor, tensor)
- outs(%arg1: tensor) -> tensor
+ outs(%arg1: tensor)
return %1 : tensor
}
@@ -73,7 +73,7 @@
// CHECK: linalg.generic {{.+}} ins(%[[IN_SLICE]] : tensor) outs(%[[OUT_SLICE2]] : tensor)
// CHECK: return %[[RES]]
- %fill = linalg.fill ins(%five : f32) outs(%init : tensor<12x25xf32>) -> tensor<12x25xf32>
+ %fill = linalg.fill ins(%five : f32) outs(%init : tensor<12x25xf32>)
%0 = linalg.generic {
indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>],
iterator_types = ["parallel", "reduction", "parallel"]
@@ -105,7 +105,7 @@
%1 = tensor.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %0 : tensor<16x48x8x8xf32> -> tensor<128x384xf32>
%2 = linalg.elemwise_unary ins(%1: tensor<128x384xf32>)
- outs(%arg1: tensor<128x384xf32>) -> tensor<128x384xf32>
+ outs(%arg1: tensor<128x384xf32>)
return %2 : tensor<128x384xf32>
}
diff --git a/mlir/test/Dialect/Linalg/transform-op-generalize.mlir b/mlir/test/Dialect/Linalg/transform-op-generalize.mlir
--- a/mlir/test/Dialect/Linalg/transform-op-generalize.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-generalize.mlir
@@ -6,7 +6,7 @@
// CHECK-NOT: linalg.elemwise_unary
// CHECK: linalg.generic
%0 = linalg.elemwise_unary ins(%arg0 : tensor)
- outs(%arg1: tensor) -> tensor
+ outs(%arg1: tensor)
return %0 : tensor
}
diff --git a/mlir/test/Dialect/Linalg/transform-op-interchange.mlir b/mlir/test/Dialect/Linalg/transform-op-interchange.mlir
--- a/mlir/test/Dialect/Linalg/transform-op-interchange.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-interchange.mlir
@@ -28,7 +28,7 @@
func.func @interchange_matmul(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor {
// expected-note @below {{when applied to this op}}
- %0 = linalg.matmul ins(%arg0, %arg1 : tensor, tensor) outs(%arg2 : tensor) -> tensor
+ %0 = linalg.matmul ins(%arg0, %arg1 : tensor, tensor) outs(%arg2 : tensor)
return %0 : tensor
}
diff --git a/mlir/test/Dialect/Linalg/transform-op-multitile-sizes.mlir b/mlir/test/Dialect/Linalg/transform-op-multitile-sizes.mlir
--- a/mlir/test/Dialect/Linalg/transform-op-multitile-sizes.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-multitile-sizes.mlir
@@ -14,7 +14,6 @@
-> tensor<13x42xf32> {
%0 = linalg.matmul ins(%arg0, %arg1: tensor<13x34xf32>, tensor<34x42xf32>)
outs(%arg2: tensor<13x42xf32>)
- -> tensor<13x42xf32>
// The first application computes the total size.
// CHECK: %{{.*}} = affine.apply #[[$MAP13]]()
// CHECK: %[[SIZE:.+]] = affine.apply #[[$MAP13]]()
@@ -60,7 +59,6 @@
-> tensor {
%0 = linalg.matmul ins(%arg0, %arg1: tensor, tensor)
outs(%arg2: tensor)
- -> tensor
return %0 : tensor
}
diff --git a/mlir/test/Dialect/Linalg/transform-op-pad.mlir b/mlir/test/Dialect/Linalg/transform-op-pad.mlir
--- a/mlir/test/Dialect/Linalg/transform-op-pad.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-pad.mlir
@@ -26,7 +26,7 @@
// CHECK: %[[T5:.*]] = linalg.matmul
// CHECK-SAME: ins(%[[T3]], %[[T4]] : tensor<4x7xf32>, tensor<7x5xf32>)
// CHECK-SAME: outs(%[[T2]] : tensor<4x5xf32>)
- %4 = linalg.matmul ins(%1, %2 : tensor<4x?xf32>, tensor) outs(%3 : tensor<4x5xf32>) -> tensor<4x5xf32>
+ %4 = linalg.matmul ins(%1, %2 : tensor<4x?xf32>, tensor) outs(%3 : tensor<4x5xf32>)
%5 = tensor.insert_slice %4 into %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<4x5xf32> into tensor<24x25xf32>
func.return %5 : tensor<24x25xf32>
}
@@ -43,7 +43,7 @@
%arg1: tensor<12x25xf32>,
%arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
// expected-note @below {{when applied to this op}}
- %0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
+ %0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>)
func.return %0 : tensor<24x25xf32>
}
@@ -60,7 +60,7 @@
%arg1: tensor<12x25xf32>,
%arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
// expected-note @below {{when applied to this op}}
- %0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
+ %0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>)
func.return %0 : tensor<24x25xf32>
}
@@ -78,7 +78,7 @@
%arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
// This is attached to an error that is silenceable and is not reported by this transform
// {{when applied to this op}}
- %0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
+ %0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>)
func.return %0 : tensor<24x25xf32>
}
diff --git a/mlir/test/Dialect/Linalg/transform-op-scalarize.mlir b/mlir/test/Dialect/Linalg/transform-op-scalarize.mlir
--- a/mlir/test/Dialect/Linalg/transform-op-scalarize.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-scalarize.mlir
@@ -12,7 +12,7 @@
// CHECK: scf.yield %[[INS_2]] : tensor
// CHECK: %[[INS_1:.*]] = tensor.insert_slice %[[RES_LOOP_2]] into %{{.*}}, 25] [1, 1] : tensor into tensor<24x25xf32>
// CHECK: scf.yield %[[INS_1]] : tensor<24x25xf32>
- %0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
+ %0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>)
// CHECK: return %[[RES_LOOP_1]] : tensor<24x25xf32>
func.return %0 : tensor<24x25xf32>
diff --git a/mlir/test/Dialect/Linalg/transform-op-split-reduction-by-scaling.mlir b/mlir/test/Dialect/Linalg/transform-op-split-reduction-by-scaling.mlir
--- a/mlir/test/Dialect/Linalg/transform-op-split-reduction-by-scaling.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-split-reduction-by-scaling.mlir
@@ -14,7 +14,7 @@
// CHECK-SAME: ins(%{{[a-zA-Z0-9]*}} : tensor)
// CHECK-SAME: outs(%{{[a-zA-Z0-9]*}} : tensor) {
%0 = linalg.matmul ins(%A, %B: tensor, tensor<256x32xf32>)
- outs(%C: tensor) -> tensor
+ outs(%C: tensor)
return %0: tensor
}
diff --git a/mlir/test/Dialect/Linalg/transform-op-split-reduction.mlir b/mlir/test/Dialect/Linalg/transform-op-split-reduction.mlir
--- a/mlir/test/Dialect/Linalg/transform-op-split-reduction.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-split-reduction.mlir
@@ -2,7 +2,7 @@
func.func @matmul_split(%A : tensor<16x256xf32>, %B: tensor<256x32xf32>, %C: tensor<16x32xf32>) -> tensor<16x32xf32> {
%0 = linalg.matmul ins(%A, %B: tensor<16x256xf32>, tensor<256x32xf32>)
- outs(%C: tensor<16x32xf32>) -> tensor<16x32xf32>
+ outs(%C: tensor<16x32xf32>)
return %0: tensor<16x32xf32>
}
@@ -16,7 +16,7 @@
// CHECK-DAG: %[[I1:.*]] = tensor.expand_shape %{{.*}}[0], [1, 2]] : tensor<16x256xf32> into tensor<16x4x64xf32>
// CHECK-DAG: %[[I2:.*]] = tensor.expand_shape %{{.*}}[0, 1], [2]] : tensor<256x32xf32> into tensor<4x64x32xf32>
// CHECK-DAG: %[[INI:.*]] = tensor.empty() : tensor<16x32x4xf32>
-// CHECK: %[[F:.*]] = linalg.fill ins(%[[ID]] : f32) outs(%[[INI]] : tensor<16x32x4xf32>) -> tensor<16x32x4xf32>
+// CHECK: %[[F:.*]] = linalg.fill ins(%[[ID]] : f32) outs(%[[INI]] : tensor<16x32x4xf32>)
// CHECK: %[[G:.*]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP2]]]
// CHECK-SAME: , iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
// CHECK-SAME: ins(%[[I1]], %[[I2]] : tensor<16x4x64xf32>, tensor<4x64x32xf32>) outs(%[[F]] : tensor<16x32x4xf32>) {
@@ -64,7 +64,7 @@
// CHECK-DAG: %[[ID:.*]] = arith.constant 1.000000e+00 : f32
// CHECK-DAG: %[[I1:.*]] = tensor.expand_shape %{{.*}}[0, 1]] : tensor<32xf32> into tensor<4x8xf32>
// CHECK-DAG: %[[INI:.*]] = tensor.empty() : tensor<4xf32>
-// CHECK: %[[F:.*]] = linalg.fill ins(%[[ID]] : f32) outs(%[[INI]] : tensor<4xf32>) -> tensor<4xf32>
+// CHECK: %[[F:.*]] = linalg.fill ins(%[[ID]] : f32) outs(%[[INI]] : tensor<4xf32>)
// CHECK: %[[G:.*]] = linalg.generic
// CHECK: {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP2]]],
// CHECK: iterator_types = ["parallel", "reduction"]} ins(%[[I1]], %{{.*}} : tensor<4x8xf32>, tensor) outs(%[[F]] : tensor<4xf32>) {
@@ -116,7 +116,7 @@
// CHECK-DAG: %[[I1:.*]] = tensor.expand_shape %{{.*}}[0, 1], [2]] : tensor<32x2xf32> into tensor<4x8x2xf32>
// CHECK-DAG: %[[I2:.*]] = tensor.expand_shape %{{.*}}[0], [1, 2]] : tensor<5x32xf32> into tensor<5x4x8xf32>
// CHECK-DAG: %[[INI:.*]] = tensor.empty() : tensor<5x2x4xf32>
-// CHECK: %[[F:.*]] = linalg.fill ins(%[[ID]] : f32) outs(%[[INI]] : tensor<5x2x4xf32>) -> tensor<5x2x4xf32>
+// CHECK: %[[F:.*]] = linalg.fill ins(%[[ID]] : f32) outs(%[[INI]] : tensor<5x2x4xf32>)
// CHECK: %[[G:.*]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "reduction", "parallel", "parallel"]}
// CHECK-SAME: ins(%[[I1]], %[[I2]] : tensor<4x8x2xf32>, tensor<5x4x8xf32>) outs(%[[F]] : tensor<5x2x4xf32>) {
// CHECK: arith.addf
@@ -140,7 +140,7 @@
func.func @matmul_split(%A : tensor<16x256xf32>, %B: tensor<256x32xf32>, %C: tensor<16x32xf32>) -> tensor<16x32xf32> {
%0 = linalg.matmul ins(%A, %B: tensor<16x256xf32>, tensor<256x32xf32>)
- outs(%C: tensor<16x32xf32>) -> tensor<16x32xf32>
+ outs(%C: tensor<16x32xf32>)
return %0: tensor<16x32xf32>
}
@@ -154,7 +154,7 @@
// CHECK-DAG: %[[I1:.*]] = tensor.expand_shape %{{.*}}[0], [1, 2]] : tensor<16x256xf32> into tensor<16x64x4xf32>
// CHECK-DAG: %[[I2:.*]] = tensor.expand_shape %{{.*}}[0, 1], [2]] : tensor<256x32xf32> into tensor<64x4x32xf32>
// CHECK-DAG: %[[INI:.*]] = tensor.empty() : tensor<16x32x4xf32>
-// CHECK: %[[F:.*]] = linalg.fill ins(%[[ID]] : f32) outs(%[[INI]] : tensor<16x32x4xf32>) -> tensor<16x32x4xf32>
+// CHECK: %[[F:.*]] = linalg.fill ins(%[[ID]] : f32) outs(%[[INI]] : tensor<16x32x4xf32>)
// CHECK: %[[G:.*]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP2]]]
// CHECK-SAME: , iterator_types = ["parallel", "parallel", "reduction", "parallel"]}
// CHECK-SAME: ins(%[[I1]], %[[I2]] : tensor<16x64x4xf32>, tensor<64x4x32xf32>) outs(%[[F]] : tensor<16x32x4xf32>) {
@@ -202,7 +202,7 @@
// CHECK-DAG: %[[ID:.*]] = arith.constant 1.000000e+00 : f32
// CHECK-DAG: %[[I1:.*]] = tensor.expand_shape %{{.*}}[0, 1]] : tensor<32xf32> into tensor<8x4xf32>
// CHECK-DAG: %[[INI:.*]] = tensor.empty() : tensor<4xf32>
-// CHECK: %[[F:.*]] = linalg.fill ins(%[[ID]] : f32) outs(%[[INI]] : tensor<4xf32>) -> tensor<4xf32>
+// CHECK: %[[F:.*]] = linalg.fill ins(%[[ID]] : f32) outs(%[[INI]] : tensor<4xf32>)
// CHECK: %[[G:.*]] = linalg.generic
// CHECK: {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP2]]],
// CHECK: iterator_types = ["reduction", "parallel"]} ins(%[[I1]], %{{.*}} : tensor<8x4xf32>, tensor) outs(%[[F]] : tensor<4xf32>) {
@@ -254,7 +254,7 @@
// CHECK-DAG: %[[I1:.*]] = tensor.expand_shape %{{.*}}[0, 1], [2]] : tensor<32x2xf32> into tensor<8x4x2xf32>
// CHECK-DAG: %[[I2:.*]] = tensor.expand_shape %{{.*}}[0], [1, 2]] : tensor<5x32xf32> into tensor<5x8x4xf32>
// CHECK-DAG: %[[INI:.*]] = tensor.empty() : tensor<5x2x4xf32>
-// CHECK: %[[F:.*]] = linalg.fill ins(%[[ID]] : f32) outs(%[[INI]] : tensor<5x2x4xf32>) -> tensor<5x2x4xf32>
+// CHECK: %[[F:.*]] = linalg.fill ins(%[[ID]] : f32) outs(%[[INI]] : tensor<5x2x4xf32>)
// CHECK: %[[G:.*]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "reduction", "parallel", "parallel"]}
// CHECK-SAME: ins(%[[I1]], %[[I2]] : tensor<8x4x2xf32>, tensor<5x8x4xf32>) outs(%[[F]] : tensor<5x2x4xf32>) {
// CHECK: arith.addf
diff --git a/mlir/test/Dialect/Linalg/transform-op-tile.mlir b/mlir/test/Dialect/Linalg/transform-op-tile.mlir
--- a/mlir/test/Dialect/Linalg/transform-op-tile.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-tile.mlir
@@ -21,14 +21,13 @@
// CHECK: %[[sTB:.*]] = tensor.extract_slice %[[TB]][{{.*}}] : tensor<128x128xf32> to tensor<4x4xf32>
// CHECK: %[[sTC:.*]] = tensor.extract_slice %[[TC2]][{{.*}}] : tensor<128x128xf32> to tensor<4x4xf32>
// CHECK: %[[sTD:.*]] = linalg.matmul ins(%[[sTA]], %[[sTB]] : tensor<4x4xf32>, tensor<4x4xf32>)
-// CHECK-SAME: outs(%[[sTC]] : tensor<4x4xf32>) -> tensor<4x4xf32>
+// CHECK-SAME: outs(%[[sTC]] : tensor<4x4xf32>)
// CHECK: %[[TD:.*]] = tensor.insert_slice %[[sTD]] into %[[TC2]][{{.*}}] : tensor<4x4xf32> into tensor<128x128xf32>
// CHECK: scf.yield %[[TD]] : tensor<128x128xf32>
// CHECK: scf.yield %[[TD2]] : tensor<128x128xf32>
// CHECK: scf.yield %[[TD1]] : tensor<128x128xf32>
%0 = linalg.matmul ins(%arg0, %arg1: tensor<128x128xf32>, tensor<128x128xf32>)
outs(%arg2: tensor<128x128xf32>)
- -> tensor<128x128xf32>
// CHECK: return %[[TD0]] : tensor<128x128xf32>
return %0 : tensor<128x128xf32>
@@ -60,7 +59,7 @@
// CHECK: %[[sTB:.*]] = tensor.extract_slice %[[TB]][{{.*}}] : tensor<128x128xf32> to tensor<4x?xf32>
// CHECK: %[[sTC:.*]] = tensor.extract_slice %[[TC2]][{{.*}}] : tensor<128x128xf32> to tensor
// CHECK: %[[sTD:.*]] = linalg.matmul ins(%[[sTA]], %[[sTB]] : tensor, tensor<4x?xf32>)
-// CHECK-SAME: outs(%[[sTC]] : tensor) -> tensor
+// CHECK-SAME: outs(%[[sTC]] : tensor)
// CHECK: %[[TD:.*]] = tensor.insert_slice %[[sTD]] into %[[TC2]][{{.*}}] : tensor into tensor<128x128xf32>
// CHECK: scf.yield %[[TD]] : tensor<128x128xf32>
// CHECK: scf.yield %[[TD2]] : tensor<128x128xf32>
@@ -68,7 +67,6 @@
%sz = func.call @get_dynamic_tile_size() : () -> index
%0 = linalg.matmul ins(%arg0, %arg1: tensor<128x128xf32>, tensor<128x128xf32>)
outs(%arg2: tensor<128x128xf32>)
- -> tensor<128x128xf32>
// CHECK: return %[[TD0]] : tensor<128x128xf32>
return %0 : tensor<128x128xf32>
diff --git a/mlir/test/Dialect/Linalg/transform-op-vectorize.mlir b/mlir/test/Dialect/Linalg/transform-op-vectorize.mlir
--- a/mlir/test/Dialect/Linalg/transform-op-vectorize.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-vectorize.mlir
@@ -12,7 +12,7 @@
// CHECK: %[[vC:.+]] = vector.transfer_read %[[C]]
// CHECK: %[[vR:.+]] = vector.contract {{.*}} %[[vA]], %[[vB]], %[[vC]]
// CHECK: vector.transfer_write %[[vR]], %[[C]]
- %0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
+ %0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>)
func.return %0 : tensor<24x25xf32>
}
@@ -57,7 +57,7 @@
// CHECK: %[[vC:.+]] = vector.transfer_read %[[C]]
// CHECK: %[[vR:.+]] = vector.contract {{.*}} %[[vA]], %[[vB]], %[[vC]]
// CHECK: vector.transfer_write %[[vR]], %[[C]]
- %8 = linalg.matmul ins(%5, %7 : tensor<4x7xf32>, tensor<7x5xf32>) outs(%3 : tensor<4x5xf32>) -> tensor<4x5xf32>
+ %8 = linalg.matmul ins(%5, %7 : tensor<4x7xf32>, tensor<7x5xf32>) outs(%3 : tensor<4x5xf32>)
%9 = tensor.insert_slice %8 into %arg2[%arg3, %arg4] [4, 5] [1, 1] : tensor<4x5xf32> into tensor<24x25xf32>
return %9 : tensor<24x25xf32>
}
@@ -105,7 +105,7 @@
// CHECK: %[[vC:.+]] = vector.transfer_read %[[C]]
// CHECK: %[[vR:.+]] = vector.contract {{.*}} %[[vA]], %[[vB]], %[[vC]]
// CHECK: vector.transfer_write %[[vR]], %[[C]]
- %8 = linalg.matmul ins(%5, %7 : tensor<4x7xf32>, tensor<7x5xf32>) outs(%3 : tensor<4x5xf32>) -> tensor<4x5xf32>
+ %8 = linalg.matmul ins(%5, %7 : tensor<4x7xf32>, tensor<7x5xf32>) outs(%3 : tensor<4x5xf32>)
%9 = tensor.insert_slice %8 into %arg2[%arg3, %arg4] [4, 5] [1, 1] : tensor<4x5xf32> into tensor<24x25xf32>
return %9 : tensor<24x25xf32>
}
@@ -123,7 +123,7 @@
%arg1: tensor<12x25xf32>,
%arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
// expected-note @below {{non-isolated target}}
- %0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
+ %0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>)
func.return %0 : tensor<24x25xf32>
}
diff --git a/mlir/test/Dialect/Linalg/transform-tile-and-fuse.mlir b/mlir/test/Dialect/Linalg/transform-tile-and-fuse.mlir
--- a/mlir/test/Dialect/Linalg/transform-tile-and-fuse.mlir
+++ b/mlir/test/Dialect/Linalg/transform-tile-and-fuse.mlir
@@ -17,11 +17,11 @@
%5 = linalg.fill {__producer__}
ins(%cst : f32)
- outs(%D : tensor) -> tensor
+ outs(%D : tensor)
%6 = linalg.matmul {__producer__}
ins(%A, %B : tensor, tensor)
- outs(%5 : tensor) -> tensor
+ outs(%5 : tensor)
%7 = linalg.generic {__root__,
indexing_maps = [affine_map<(d0, d1) -> (d0)>,
@@ -74,11 +74,11 @@
%5 = linalg.fill {__producer__}
ins(%cst : f32)
- outs(%D : tensor) -> tensor
+ outs(%D : tensor)
%6 = linalg.matmul {__producer__}
ins(%A, %B : tensor, tensor)
- outs(%5 : tensor) -> tensor
+ outs(%5 : tensor)
%7 = linalg.generic {__root__,
indexing_maps = [affine_map<(d0, d1) -> (d0)>,
diff --git a/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir b/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir
--- a/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir
+++ b/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir
@@ -33,7 +33,7 @@
// CHECK-DAG: %[[D1:.*]] = tensor.dim %[[ARG0]], %[[C1]] : tensor
// CHECK-DAG: %[[D2:.*]] = tensor.dim %[[ARG1]], %[[C0]] : tensor
// CHECK: %[[E:.*]] = tensor.empty(%[[D2]]) : tensor
-// CHECK: %[[F:.*]] = linalg.fill ins(%[[I]] : f32) outs(%[[E]] : tensor) -> tensor
+// CHECK: %[[F:.*]] = linalg.fill ins(%[[I]] : f32) outs(%[[E]] : tensor)
// CHECK: %[[L:.*]] = scf.for %[[K:.*]] = %[[C0]] to %[[D1]] step %[[C5]] iter_args(%[[ARG3:.*]] = %[[F]]) -> (tensor) {
// CHECK: %[[PS:.*]] = affine.min #[[MAP2]](%[[K]])[%[[D1]]]
// CHECK: %[[EXT2:.*]] = tensor.extract_slice %[[ARG0]][0, %[[K:.*]]] [%[[D0]], %[[PS]]] [1, 1] : tensor to tensor
@@ -78,7 +78,7 @@
// CHECK: func @reduction_tile_transpose
// CHECK: tensor.empty(%{{.*}}) : tensor<5x?xf32>
-// CHECK: linalg.fill {{.*}} : tensor<5x?xf32>) -> tensor<5x?xf32>
+// CHECK: linalg.fill {{.*}} : tensor<5x?xf32>)
// CHECK: scf.for
// CHECK: linalg.generic
// CHECK: %[[D3:.*]] = tensor.dim %{{.*}}, %[[C0]] : tensor
@@ -127,7 +127,7 @@
// CHECK-DAG: %[[D1:.*]] = tensor.dim %[[ARG0]], %[[C1]] : tensor
// CHECK-DAG: %[[D2:.*]] = tensor.dim %[[ARG1]], %[[C0]] : tensor
// CHECK: %[[E:.*]] = tensor.empty(%[[D2]]) : tensor
-// CHECK: %[[F:.*]] = linalg.fill ins(%[[I]] : f32) outs(%[[E]] : tensor) -> tensor
+// CHECK: %[[F:.*]] = linalg.fill ins(%[[I]] : f32) outs(%[[E]] : tensor)
// CHECK: %[[L:.*]] = scf.foreach_thread (%[[IV:.+]]) in (%[[C5]]) shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor) {
// CHECK-DAG: %[[TS0:.+]] = affine.min #[[MAP0]](%[[IV]])[%[[D1]]]
// CHECK-DAG: %[[TS1:.+]] = affine.max #[[MAP1]](%[[TS0]])
@@ -155,7 +155,7 @@
func.func @matmul_tile_parallel(
%A: tensor, %B: tensor, %out: tensor) -> tensor {
%matmul = linalg.matmul ins(%A, %B: tensor, tensor)
- outs(%out: tensor) -> tensor
+ outs(%out: tensor)
return %matmul : tensor
}
@@ -182,7 +182,7 @@
// CHECK-DAG: %[[D3:.*]] = tensor.dim %[[ARG2]], %[[C0]] : tensor
// CHECK-DAG: %[[D4:.*]] = tensor.dim %[[ARG2]], %[[C1]] : tensor
// CHECK: %[[E:.*]] = tensor.empty(%[[D3]], %[[D4]]) : tensor
-// CHECK: %[[F:.*]] = linalg.fill ins(%[[I]] : f32) outs(%[[E]] : tensor) -> tensor
+// CHECK: %[[F:.*]] = linalg.fill ins(%[[I]] : f32) outs(%[[E]] : tensor)
// CHECK: %[[L:.*]] = scf.foreach_thread (%[[IV:.+]]) in (%[[C5]]) shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor) {
// CHECK-DAG: %[[TS0:.+]] = affine.min #[[MAP0]](%[[IV]])[%[[D1]]]
// CHECK-DAG: %[[TS1:.+]] = affine.max #[[MAP1]](%[[TS0]])
@@ -191,7 +191,7 @@
// CHECK: %[[INCHUNKA:.+]] = tensor.extract_slice %[[ARG0]][0, %[[TINDEX]]] [%[[D0]], %[[TS1]]] [1, 1] : tensor to tensor
// CHECK: %[[INCHUNKB:.+]] = tensor.extract_slice %[[ARG1]][%[[TINDEX]], 0] [%[[TS1]], %[[D2]]] [1, 1] : tensor to tensor
// CHECK: %[[TEMPEXT:.+]] = tensor.extract_slice %[[ET]][0, 0] [%[[D0]], %[[D2]]] [1, 1] : tensor to tensor
-// CHECK: %[[PARTIAL:.+]] = linalg.matmul ins(%[[INCHUNKA]], %[[INCHUNKB]] : tensor, tensor) outs(%[[TEMPEXT]] : tensor) -> tensor
+// CHECK: %[[PARTIAL:.+]] = linalg.matmul ins(%[[INCHUNKA]], %[[INCHUNKB]] : tensor, tensor) outs(%[[TEMPEXT]] : tensor)
// CHECK: scf.foreach_thread.perform_concurrently {
// CHECK: tensor.parallel_insert_slice %[[PARTIAL]] into %[[ARG3]][0, 0, %[[IV]]] [%[[D0]], %[[D2]], 1] [1, 1, 1] : tensor into tensor
// CHECK: }
@@ -240,7 +240,7 @@
// CHECK-DAG: %[[D0:.*]] = tensor.dim %[[ARG0]], %[[C0]] : tensor
// CHECK-DAG: %[[D2:.*]] = tensor.dim %[[ARG1]], %[[C0]] : tensor
// CHECK: %[[E:.*]] = tensor.empty(%[[D2]]) : tensor
-// CHECK: %[[F:.*]] = linalg.fill ins(%[[I]] : f32) outs(%[[E]] : tensor) -> tensor
+// CHECK: %[[F:.*]] = linalg.fill ins(%[[I]] : f32) outs(%[[E]] : tensor)
// CHECK: %[[L:.*]] = scf.foreach_thread (%[[IV:.+]]) in (%[[C5]]) shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor) {
// CHECK: %[[ET:.+]] = tensor.extract_slice %[[ARG3:.+]][0, %[[IV]]] [%[[D0]], 1] [1, 1] : tensor to tensor
// CHECK: %[[D1:.*]] = tensor.dim %[[ARG0]], %[[C1]] : tensor
diff --git a/mlir/test/Dialect/Linalg/vectorization.mlir b/mlir/test/Dialect/Linalg/vectorization.mlir
--- a/mlir/test/Dialect/Linalg/vectorization.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization.mlir
@@ -699,7 +699,6 @@
// CHECK: %[[W:.*]] = vector.transfer_write %[[R]], %[[ARG2]][%[[C0]], %[[C0]]] {in_bounds = [true, true]} : vector<8x12xf32>, tensor<8x12xf32>
%0 = linalg.matmul ins(%arg0, %arg1: tensor<8x4xf32>, tensor<4x12xf32>)
outs(%arg2: tensor<8x12xf32>)
- -> tensor<8x12xf32>
// CHECK: return %[[W]] : tensor<8x12xf32>
return %0 : tensor<8x12xf32>
}
@@ -785,7 +784,7 @@
// CHECK: %[[V4:.*]] = arith.addi %[[DIM3]], %[[C3]] : index
// CHECK: %[[V5:.*]] = arith.addi %[[V4]], %[[C2]] : index
// CHECK: %[[INIT:.*]] = tensor.empty(%[[V1]], %[[V2]], %[[V5]]) : tensor<6x?x?x?xf32>
-// CHECK: %[[FILL:.*]] = linalg.fill ins(%{{.*}} : f32) outs(%[[INIT]] : tensor<6x?x?x?xf32>) -> tensor<6x?x?x?xf32>
+// CHECK: %[[FILL:.*]] = linalg.fill ins(%{{.*}} : f32) outs(%[[INIT]] : tensor<6x?x?x?xf32>)
// CHECK: %[[SRCDIM:.*]] = tensor.dim %[[SRC]], %[[C3]] : tensor<1x2x2x?xf32>
// CHECK: %[[RESULT:.*]] = tensor.insert_slice %[[SRC]] into %[[FILL]][2, %[[LOW]], 3, 3] [1, 2, 2, %[[SRCDIM]]] [1, 1, 1, 1] : tensor<1x2x2x?xf32> into tensor<6x?x?x?xf32>
// CHECK: return %[[RESULT]]
@@ -1096,7 +1095,7 @@
// CHECK: vector.transfer_write {{.*}} : vector<4xf32>, tensor<4xf32>
%ident = arith.constant -3.40282e+38 : f32
%init = tensor.empty() : tensor<4xf32>
- %fill = linalg.fill ins(%ident : f32) outs(%init : tensor<4xf32>) -> tensor<4xf32>
+ %fill = linalg.fill ins(%ident : f32) outs(%init : tensor<4xf32>)
%red = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
affine_map<(d0, d1) -> (d0)>],
iterator_types = ["parallel", "reduction"]}
@@ -1127,7 +1126,7 @@
// CHECK: vector.transfer_write {{.*}} : vector<4xf32>, tensor<4xf32>
%maxf32 = arith.constant 3.40282e+38 : f32
%init = tensor.empty() : tensor<4xf32>
- %fill = linalg.fill ins(%maxf32 : f32) outs(%init : tensor<4xf32>) -> tensor<4xf32>
+ %fill = linalg.fill ins(%maxf32 : f32) outs(%init : tensor<4xf32>)
%red = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
affine_map<(d0, d1) -> (d0)>],
iterator_types = ["parallel", "reduction"]}
@@ -1157,7 +1156,7 @@
// CHECK: vector.transfer_write {{.*}} : vector<4xf32>, tensor<4xf32>
%ident = arith.constant 1.0 : f32
%init = tensor.empty() : tensor<4xf32>
- %fill = linalg.fill ins(%ident : f32) outs(%init : tensor<4xf32>) -> tensor<4xf32>
+ %fill = linalg.fill ins(%ident : f32) outs(%init : tensor<4xf32>)
%red = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
affine_map<(d0, d1) -> (d0)>],
iterator_types = ["parallel", "reduction"]}
@@ -1187,7 +1186,7 @@
// CHECK: vector.transfer_write {{.*}} : vector<4xi1>, tensor<4xi1>
%ident = arith.constant false
%init = tensor.empty() : tensor<4xi1>
- %fill = linalg.fill ins(%ident : i1) outs(%init : tensor<4xi1>) -> tensor<4xi1>
+ %fill = linalg.fill ins(%ident : i1) outs(%init : tensor<4xi1>)
%red = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
affine_map<(d0, d1) -> (d0)>],
iterator_types = ["parallel", "reduction"]}
@@ -1217,7 +1216,7 @@
// CHECK: vector.transfer_write {{.*}} : vector<4xi1>, tensor<4xi1>
%ident = arith.constant true
%init = tensor.empty() : tensor<4xi1>
- %fill = linalg.fill ins(%ident : i1) outs(%init : tensor<4xi1>) -> tensor<4xi1>
+ %fill = linalg.fill ins(%ident : i1) outs(%init : tensor<4xi1>)
%red = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
affine_map<(d0, d1) -> (d0)>],
iterator_types = ["parallel", "reduction"]}
@@ -1247,7 +1246,7 @@
// CHECK: vector.transfer_write {{.*}} : vector<4xi1>, tensor<4xi1>
%ident = arith.constant false
%init = tensor.empty() : tensor<4xi1>
- %fill = linalg.fill ins(%ident : i1) outs(%init : tensor<4xi1>) -> tensor<4xi1>
+ %fill = linalg.fill ins(%ident : i1) outs(%init : tensor<4xi1>)
%red = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
affine_map<(d0, d1) -> (d0)>],
iterator_types = ["parallel", "reduction"]}
@@ -1279,7 +1278,7 @@
// CHECK: vector.transfer_write {{.*}} {in_bounds = [true, true]} : vector<4x4xf32>, tensor<4x4xf32>
%c0 = arith.constant 0.0 : f32
%init = tensor.empty() : tensor<4x4xf32>
- %fill = linalg.fill ins(%c0 : f32) outs(%init : tensor<4x4xf32>) -> tensor<4x4xf32>
+ %fill = linalg.fill ins(%c0 : f32) outs(%init : tensor<4x4xf32>)
%red = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
affine_map<(d0, d1) -> (d0, 0)>,
affine_map<(d0, d1) -> (d0, d1)>],
@@ -1315,7 +1314,7 @@
// CHECK: vector.transfer_write {{.*}} {in_bounds = [true]} : vector<4xf32>, tensor<4xf32>
%c0 = arith.constant 0.0 : f32
%init = tensor.empty() : tensor<4xf32>
- %fill = linalg.fill ins(%c0 : f32) outs(%init : tensor<4xf32>) -> tensor<4xf32>
+ %fill = linalg.fill ins(%c0 : f32) outs(%init : tensor<4xf32>)
%red = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
affine_map<(d0, d1) -> (d0, 0)>,
affine_map<(d0, d1) -> (d0)>],
@@ -1356,7 +1355,7 @@
// CHECK: %[[init:.*]] = tensor.empty() : tensor
%0 = tensor.empty() : tensor
- %1 = linalg.fill ins(%f0 : f32) outs(%0 : tensor) -> tensor
+ %1 = linalg.fill ins(%f0 : f32) outs(%0 : tensor)
// CHECK: %[[r:.*]] = vector.transfer_read %[[A]][%[[C0]]]
// CHECK-SAME: : tensor<32xf32>, vector<32xf32>
// CHECK: %[[f0:.*]] = vector.extractelement %[[vF0]][] : vector
@@ -1397,7 +1396,7 @@
func.func @not_projected_permutation(%arg0: tensor<8x8xf32>) -> tensor<6x6x3x3xf32> {
%c0 = arith.constant 0.0 : f32
%init = tensor.empty() : tensor<6x6x3x3xf32>
- %fill = linalg.fill ins(%c0 : f32) outs(%init : tensor<6x6x3x3xf32>) -> tensor<6x6x3x3xf32>
+ %fill = linalg.fill ins(%c0 : f32) outs(%init : tensor<6x6x3x3xf32>)
// CHECK: linalg.generic
%result = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0 + d2, d1 + d3)>,
affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>],
@@ -1866,7 +1865,7 @@
%cst_6 = arith.constant 4.000000e+00 : f32
%1 = scf.for %arg0 = %c0 to %c64 step %c4 iter_args(%arg1 = %input) -> (tensor<120x64xf32>) {
%extracted_slice = tensor.extract_slice %arg1[%c0, %arg0] [1, 4] [1, 1] : tensor<120x64xf32> to tensor<1x4xf32>
- %10 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%cst_6 : f32) outs(%extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32>
+ %10 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%cst_6 : f32) outs(%extracted_slice : tensor<1x4xf32>)
%11 = linalg.generic {indexing_maps =
[affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%10 : tensor<1x4xf32>) { ^bb0(%out: f32): %12 = linalg.index 0 : index diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir --- a/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir +++ b/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir @@ -620,9 +620,9 @@ // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true", "none"]} %6 = tensor.extract_slice %arg1[%arg0] [1] [1] : tensor<320xf32> to tensor<1xf32> // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]} - %7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<1xf32>) -> tensor<1xf32> + %7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<1xf32>) // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]} - %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<1xf32>) -> tensor<1xf32> + %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<1xf32>) scf.foreach_thread.perform_concurrently { // CHECK: tensor.parallel_insert_slice {{.*}} {__inplace_operands_attr__ = ["true", "true", "none"]} @@ -647,14 +647,14 @@ %0 = bufferization.alloc_tensor() : tensor<4xf32> // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "false"]} - %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<4xf32>) -> tensor<4xf32> + %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<4xf32>) %2 = scf.for %arg5 = %arg2 to %arg3 step %arg4 iter_args(%arg6 = %arg1) -> (tensor<4xf32>) { // CHECK: tensor.extract {{.*}} {__inplace_operands_attr__ = ["true", "none"]} %4 = tensor.extract %1[%arg4] : tensor<4xf32> vector.print %4 : f32 // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]} - %5 = linalg.fill ins(%cst2 : f32) outs(%0 : tensor<4xf32>) -> tensor<4xf32> + %5 = linalg.fill ins(%cst2 : f32) outs(%0 : tensor<4xf32>) scf.yield %5 : tensor<4xf32> } @@ -677,14 +677,14 @@ %0 = bufferization.alloc_tensor() : tensor<4xf32> // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]} - %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<4xf32>) -> tensor<4xf32> + %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<4xf32>) %2 = scf.for %arg5 = %arg2 to %arg3 step %arg4 iter_args(%arg6 = %arg1) -> (tensor<4xf32>) { // CHECK: tensor.extract {{.*}} {__inplace_operands_attr__ = ["true", "none"]} %4 = tensor.extract %1[%arg4] : tensor<4xf32> vector.print %4 : f32 // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "false"]} - %5 = linalg.fill ins(%cst2 : f32) outs(%1 : tensor<4xf32>) -> tensor<4xf32> + %5 = linalg.fill ins(%cst2 : f32) outs(%1 : tensor<4xf32>) scf.yield %5 : tensor<4xf32> } @@ -693,7 +693,7 @@ %6 = tensor.extract %1[%arg4] : tensor<4xf32> vector.print %6 : f32 // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]} - %7 = linalg.fill ins(%cst3 : f32) outs(%1 : tensor<4xf32>) -> tensor<4xf32> + %7 = linalg.fill ins(%cst3 : f32) outs(%1 : tensor<4xf32>) return %2, %7 : tensor<4xf32>, tensor<4xf32> } diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize.mlir --- a/mlir/test/Dialect/SCF/one-shot-bufferize.mlir +++ b/mlir/test/Dialect/SCF/one-shot-bufferize.mlir @@ -54,7 +54,7 @@ // CHECK: %[[clone:.*]] = bufferization.clone %[[alloc]] // CHECK: scf.for {{.*}} iter_args(%{{.*}} = %[[clone]]) %0 = scf.for %iv = %lb to %ub step %c1 iter_args(%1 = %A) -> tensor { - %r = linalg.fill ins(%cst : f32) outs(%1 : tensor) -> tensor + %r = linalg.fill ins(%cst : f32) outs(%1 : tensor) scf.yield %B : tensor 
 }
   %1 = tensor.extract %0[%c1] : tensor
@@ -547,7 +547,7 @@
 // CHECK: %[[subview:.*]] = memref.subview %[[arg2]][5] [%[[idx]]] [1]
   %6 = tensor.extract_slice %o[5] [%idx] [%c1] : tensor to tensor
 // CHECK: linalg.fill ins(%{{.*}}) outs(%[[subview]] : memref
-  %8 = linalg.fill ins(%cst : f32) outs(%6 : tensor) -> tensor
+  %8 = linalg.fill ins(%cst : f32) outs(%6 : tensor)
   // Self-copy will DCE away later.
 // CHECK: memref.copy %[[subview]], %[[subview]]
@@ -594,7 +594,7 @@
   %6 = tensor.extract_slice %o[5] [%idx] [%c1] : tensor to tensor
 // CHECK: linalg.fill ins(%{{.*}}) outs(%[[subview1]] : memref
-  %8 = linalg.fill ins(%cst : f32) outs(%6 : tensor) -> tensor
+  %8 = linalg.fill ins(%cst : f32) outs(%6 : tensor)
   // Now the copy of the actual insert_slice. (It will fold away.)
 // CHECK: memref.copy %[[subview1]], %[[subview1]]
@@ -637,7 +637,7 @@
   %7 = tensor.extract_slice %o[%1, %4] [4, 4] [1, 1] : tensor<8x8xf32> to tensor<4x4xf32>
 // CHECK: linalg.matmul ins({{.*}}memref<4x8xf32, strided<[?, ?], offset: ?>>, memref<8x4xf32, strided<[?, ?], offset: ?>>) outs({{.*}} : memref<4x4xf32, strided<[?, ?], offset: ?>>)
-  %8 = linalg.matmul ins(%3, %6 : tensor<4x8xf32>, tensor<8x4xf32>) outs(%7 : tensor<4x4xf32>) -> tensor<4x4xf32>
+  %8 = linalg.matmul ins(%3, %6 : tensor<4x8xf32>, tensor<8x4xf32>) outs(%7 : tensor<4x4xf32>)
   scf.foreach_thread.perform_concurrently {
     tensor.parallel_insert_slice %8 into %o[%1, %4] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<8x8xf32>
   }
diff --git a/mlir/test/Dialect/SparseTensor/sparse_expand.mlir b/mlir/test/Dialect/SparseTensor/sparse_expand.mlir
--- a/mlir/test/Dialect/SparseTensor/sparse_expand.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_expand.mlir
@@ -124,7 +124,7 @@
   %C = bufferization.alloc_tensor() : tensor<8x4xf64, #CSR>
   %D = linalg.matmul ins(%A, %B: tensor<8x2xf64, #CSR>, tensor<2x4xf64, #CSR>)
-                     outs(%C: tensor<8x4xf64, #CSR>) -> tensor<8x4xf64, #CSR>
+                     outs(%C: tensor<8x4xf64, #CSR>)
   return %D: tensor<8x4xf64, #CSR>
 }
@@ -172,6 +172,6 @@
   %C = bufferization.alloc_tensor() : tensor<8x4xf64, #CSC>
   %D = linalg.matmul ins(%A, %B: tensor<8x2xf64, #CSC>, tensor<2x4xf64, #CSC>)
-                     outs(%C: tensor<8x4xf64, #CSC>) -> tensor<8x4xf64, #CSC>
+                     outs(%C: tensor<8x4xf64, #CSC>)
   return %D: tensor<8x4xf64, #CSC>
 }
diff --git a/mlir/test/Dialect/SparseTensor/sparse_fill_zero.mlir b/mlir/test/Dialect/SparseTensor/sparse_fill_zero.mlir
--- a/mlir/test/Dialect/SparseTensor/sparse_fill_zero.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_fill_zero.mlir
@@ -124,8 +124,8 @@
   %0 = bufferization.alloc_tensor() : tensor<100x300xf64, #DCSR>
   %cst = arith.constant 0.000000e+00 : f64
   %1 = linalg.fill ins(%cst : f64)
-    outs(%0 : tensor<100x300xf64, #DCSR>) -> tensor<100x300xf64, #DCSR>
+    outs(%0 : tensor<100x300xf64, #DCSR>)
   %2 = linalg.matmul ins(%arg0, %arg1 : tensor<100x200xf64, #DCSR>, tensor<200x300xf64, #DCSR>)
-    outs(%1 : tensor<100x300xf64, #DCSR>) -> tensor<100x300xf64, #DCSR>
+    outs(%1 : tensor<100x300xf64, #DCSR>)
   return %2 : tensor<100x300xf64, #DCSR>
 }
diff --git a/mlir/test/Dialect/SparseTensor/sparse_kernels.mlir b/mlir/test/Dialect/SparseTensor/sparse_kernels.mlir
--- a/mlir/test/Dialect/SparseTensor/sparse_kernels.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_kernels.mlir
@@ -47,7 +47,7 @@
   %c: tensor<10x30xf32>) -> tensor<10x30xf32> {
   %0 = linalg.matmul ins(%a, %b: tensor<10x20xf32, #DCSR>, tensor<20x30xf32>)
-                     outs(%c: tensor<10x30xf32>) -> tensor<10x30xf32>
+                     outs(%c: tensor<10x30xf32>)
   return %0 : tensor<10x30xf32>
 }
@@ -144,7 +144,7 @@
   %C = bufferization.alloc_tensor() : tensor<4x4xf64, #DCSR>
   %D = linalg.matmul ins(%A, %B: tensor<4x8xf64, #DCSR>)
-                     outs(%C: tensor<4x4xf64, #DCSR>) -> tensor<4x4xf64, #DCSR>
+                     outs(%C: tensor<4x4xf64, #DCSR>)
   return %D: tensor<4x4xf64, #DCSR>
 }
@@ -193,7 +193,7 @@
   %output: tensor<6x6xi32>) -> tensor<6x6xi32> {
   %0 = linalg.conv_2d
     ins (%input, %filter: tensor<8x8xi32>, tensor<3x3xi32, #DCSR>)
-    outs (%output: tensor<6x6xi32>) -> tensor<6x6xi32>
+    outs (%output: tensor<6x6xi32>)
   return %0 : tensor<6x6xi32>
 }
@@ -244,7 +244,7 @@
   %c2 = arith.constant 2 : i32
   %0 = linalg.quantized_matmul
    ins(%input1, %input2, %c2, %c0 : tensor<5x3xi8>, tensor<3x6xi8, #DCSR>, i32, i32)
-   outs(%output : tensor<5x6xi64>) -> tensor<5x6xi64>
+   outs(%output : tensor<5x6xi64>)
   return %0: tensor<5x6xi64>
 }
@@ -306,6 +306,6 @@
   %x: tensor) -> tensor {
   %dot = linalg.dot ins(%a, %b: tensor<1024xf32, #SparseVector>, tensor<1024xf32, #SparseVector>)
-                    outs(%x: tensor) -> tensor
+                    outs(%x: tensor)
   return %dot : tensor
 }
diff --git a/mlir/test/Dialect/SparseTensor/sparse_matmul_codegen.mlir b/mlir/test/Dialect/SparseTensor/sparse_matmul_codegen.mlir
--- a/mlir/test/Dialect/SparseTensor/sparse_matmul_codegen.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_matmul_codegen.mlir
@@ -157,6 +157,6 @@
   %C = bufferization.alloc_tensor() : tensor<4x4xf64, #CSR>
   %D = linalg.matmul ins(%A, %B: tensor<4x8xf64, #CSR>, tensor<8x4xf64, #CSR>)
-                     outs(%C: tensor<4x4xf64, #CSR>) -> tensor<4x4xf64, #CSR>
+                     outs(%C: tensor<4x4xf64, #CSR>)
   return %D: tensor<4x4xf64, #CSR>
 }
diff --git a/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir b/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir
--- a/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir
+++ b/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir
@@ -71,7 +71,7 @@
   /// Overwrite A inplace.
 // CHECK: linalg.fill ins({{.*}}{{.*}}outs(%[[A]]
-  %r1 = linalg.fill ins(%f0 : f32) outs(%r0 : tensor) -> tensor
+  %r1 = linalg.fill ins(%f0 : f32) outs(%r0 : tensor)
 // CHECK: return
 // CHECK-NOT: tensor
@@ -91,7 +91,7 @@
   %f0 = arith.constant 0.0 : f32
 // CHECK: linalg.fill ins({{.*}}{{.*}}outs(%[[A]]
-  %r0 = linalg.fill ins(%f0 : f32) outs(%A : tensor) -> tensor
+  %r0 = linalg.fill ins(%f0 : f32) outs(%A : tensor)
 // CHECK-NOT: alloc
 // CHECK: %[[SV_A:.*]] = memref.subview %[[A]]
@@ -255,7 +255,7 @@
 // CHECK: memref.alloc
   %cst = arith.constant 4.200000e+01 : f32
 // CHECK: linalg.fill
-  %0 = linalg.fill ins(%cst : f32) outs(%t : tensor<10xf32>) -> tensor<10xf32>
+  %0 = linalg.fill ins(%cst : f32) outs(%t : tensor<10xf32>)
 // CHECK: memref.copy
   %1 = tensor.insert_slice %0 into %t[0][10][1] : tensor<10xf32> into tensor<10xf32>
   return %1 : tensor<10xf32>
@@ -298,7 +298,7 @@
   %c0 = arith.constant 0 : index
 // CHECK: %[[alloc:.*]] = memref.alloc() {{.*}} : memref<10xf32>
 // CHECK: linalg.fill {{.*}} outs(%[[alloc]] : memref<10xf32>)
-  %1 = linalg.fill ins(%cst : f32) outs(%t : tensor<10xf32>) -> tensor<10xf32>
+  %1 = linalg.fill ins(%cst : f32) outs(%t : tensor<10xf32>)
   // Read %1 so that it does not DCE away.
   %vec = vector.transfer_read %1[%c0], %cst : tensor<10xf32>, vector<10xf32>
@@ -319,7 +319,7 @@
   %cst = arith.constant 0.0 : f32
   %c0 = arith.constant 0 : index
 // CHECK: linalg.fill {{.*}} outs(%[[t]] : memref<10xf32,{{.*}}>)
-  %1 = linalg.fill ins(%cst : f32) outs(%t : tensor<10xf32>) -> tensor<10xf32>
+  %1 = linalg.fill ins(%cst : f32) outs(%t : tensor<10xf32>)
   // Read %1 so that it does not DCE away.
   %vec = vector.transfer_read %1[%c0], %cst : tensor<10xf32>, vector<10xf32>
diff --git a/mlir/test/Dialect/Transform/selective-targeting.mlir b/mlir/test/Dialect/Transform/selective-targeting.mlir
--- a/mlir/test/Dialect/Transform/selective-targeting.mlir
+++ b/mlir/test/Dialect/Transform/selective-targeting.mlir
@@ -9,11 +9,10 @@
 // CHECK-COUNT-3: scf.for
 // CHECK-COUNT-3: tensor.extract_slice
 // CHECK: linalg.matmul
-// CHECK-SAME: -> tensor<4x4xf32>
+// CHECK-SAME: tensor<4x4xf32>
   %0 = linalg.matmul { test.attrA }
          ins(%arg0, %arg1: tensor<128x128xf32>, tensor<128x128xf32>)
          outs(%arg2: tensor<128x128xf32>)
-         -> tensor<128x128xf32>
   func.return %0 : tensor<128x128xf32>
 }
@@ -31,7 +30,6 @@
   %0 = linalg.matmul { test.attrA, test.attrC }
          ins(%arg0, %arg1: tensor<128x128xf32>, tensor<128x128xf32>)
          outs(%arg2: tensor<128x128xf32>)
-         -> tensor<128x128xf32>
   func.return %0 : tensor<128x128xf32>
 }
@@ -48,7 +46,6 @@
   %0 = linalg.matmul { test.attrC }
          ins(%arg0, %arg1: tensor<128x128xf32>, tensor<128x128xf32>)
          outs(%arg2: tensor<128x128xf32>)
-         -> tensor<128x128xf32>
   func.return %0 : tensor<128x128xf32>
 }
@@ -95,7 +92,6 @@
   %0 = linalg.matmul {test.attrA}
          ins(%arg0, %arg1: tensor<128x128xf32>, tensor<128x128xf32>)
          outs(%arg2: tensor<128x128xf32>)
-         -> tensor<128x128xf32>
   func.return %0 : tensor<128x128xf32>
 }
@@ -106,7 +102,6 @@
 // CHECK: linalg.matmul
   %0 = linalg.matmul ins(%arg0, %arg1: tensor<128x128xf32>, tensor<128x128xf32>)
                      outs(%arg2: tensor<128x128xf32>)
-                     -> tensor<128x128xf32>
   func.return %0 : tensor<128x128xf32>
 }
@@ -140,11 +135,9 @@
   %0 = linalg.matmul {test.attrA}
          ins(%arg0, %arg1: tensor<128x128xf32>, tensor<128x128xf32>)
          outs(%arg2: tensor<128x128xf32>)
-         -> tensor<128x128xf32>
 // CHECK: vector.contract
   %1 = linalg.matmul ins(%arg0, %0: tensor<128x128xf32>, tensor<128x128xf32>)
                      outs(%arg3: tensor<128x128xf32>)
-                     -> tensor<128x128xf32>
   return %1 : tensor<128x128xf32>
 }
diff --git a/mlir/test/Dialect/Vector/transform-vector.mlir b/mlir/test/Dialect/Vector/transform-vector.mlir
--- a/mlir/test/Dialect/Vector/transform-vector.mlir
+++ b/mlir/test/Dialect/Vector/transform-vector.mlir
@@ -9,7 +9,6 @@
 // CHECK: vector.store {{.*}} : memref<8x32xf32>, vector<4xf32>
   %0 = linalg.matmul ins(%arg0, %arg1: tensor<8x16xf32>, tensor<16x32xf32>)
                      outs(%arg2: tensor<8x32xf32>)
-                     -> tensor<8x32xf32>
   return %0 : tensor<8x32xf32>
 }
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-one-shot-bufferize.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-one-shot-bufferize.mlir
--- a/mlir/test/Integration/Dialect/Linalg/CPU/test-one-shot-bufferize.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-one-shot-bufferize.mlir
@@ -14,7 +14,7 @@
   %cst = arith.constant 0.000000e+00 : f32
   %c2 = arith.constant 2 : index
   %c0 = arith.constant 0 : index
-  %0 = linalg.fill ins(%cst : f32) outs(%arg2 : tensor) -> tensor
+  %0 = linalg.fill ins(%cst : f32) outs(%arg2 : tensor)
   %1 = affine.apply #map0(%c0, %c64)[%c2]
   %2 = bufferization.alloc_tensor(%1) : tensor
   %3 = scf.for %arg3 = %c0 to %c64 step %c2 iter_args(%arg4 = %2) -> (tensor) {
@@ -61,7 +61,7 @@
   %13 = tensor.extract_slice %6[%12, 0] [1, 2] [1, 1] : tensor to tensor<2xf32>
   %14 = affine.apply #map1(%arg3, %c0)[%c2]
   %15 = tensor.extract_slice %3[%14, 0] [1, 2] [1, 1] : tensor to tensor<2xf32>
-  %16 = linalg.dot ins(%13, %15 : tensor<2xf32>, tensor<2xf32>) outs(%arg4 : tensor) -> tensor
+  %16 = linalg.dot ins(%13, %15 : tensor<2xf32>, tensor<2xf32>) outs(%arg4 : tensor)
   // %AA = tensor.cast %13 : tensor<2xf32> to tensor<*xf32>
   // call @printMemrefF32(%AA) : (tensor<*xf32>) -> ()
@@ -83,9 +83,9 @@
   %A = bufferization.alloc_tensor() : tensor<64xf32>
   %B = bufferization.alloc_tensor() : tensor<64xf32>
   %C = bufferization.alloc_tensor() : tensor
-  %AA = linalg.fill ins(%v1 : f32) outs(%A : tensor<64xf32>) -> tensor<64xf32>
-  %BB = linalg.fill ins(%v2 : f32) outs(%B : tensor<64xf32>) -> tensor<64xf32>
-  %CC = linalg.fill ins(%v0 : f32) outs(%C : tensor) -> tensor
+  %AA = linalg.fill ins(%v1 : f32) outs(%A : tensor<64xf32>)
+  %BB = linalg.fill ins(%v2 : f32) outs(%B : tensor<64xf32>)
+  %CC = linalg.fill ins(%v0 : f32) outs(%C : tensor)
   %res = call @init_and_dot(%AA, %BB, %CC)
     : (tensor<64xf32>, tensor<64xf32>, tensor) -> tensor
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-matmul.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-matmul.mlir
--- a/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-matmul.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-matmul.mlir
@@ -23,7 +23,7 @@
   %C = arith.constant dense<1000.0> : tensor<2x4xf32>
   %D = linalg.matmul ins(%A, %B: tensor<2x3xf32>, tensor<3x4xf32>)
-                     outs(%C: tensor<2x4xf32>) -> tensor<2x4xf32>
+                     outs(%C: tensor<2x4xf32>)
   %unranked = tensor.cast %D : tensor<2x4xf32> to tensor<*xf32>
   call @printMemrefF32(%unranked) : (tensor<*xf32>) -> ()
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_1d_nwc_wcf.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_1d_nwc_wcf.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_1d_nwc_wcf.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_1d_nwc_wcf.mlir
@@ -27,7 +27,7 @@
 // Creates and returns 3-D buffer of size (%s1, %s2, %s3) filled with the value %f
 func.func @alloc_3d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %f : f32) -> tensor {
   %buf = bufferization.alloc_tensor(%s1, %s2, %s3) : tensor
-  %ret = linalg.fill ins(%f : f32) outs(%buf : tensor) -> tensor
+  %ret = linalg.fill ins(%f : f32) outs(%buf : tensor)
   return %ret : tensor
 }
@@ -35,7 +35,7 @@
   %ret = linalg.conv_1d_nwc_wcf {dilations = dense<1> : tensor<1xi64>,
                                  strides = dense<1> : tensor<1xi64>}
     ins (%arg0, %arg1: tensor, tensor)
-    outs (%arg2: tensor) -> tensor
+    outs (%arg2: tensor)
   return %ret : tensor
 }
@@ -47,7 +47,7 @@
   %ret = linalg.conv_1d_nwc_wcf {dilations = dense<1> : tensor<1xi64>,
                                  strides = dense<1> : tensor<1xi64>}
     ins (%arg0, %arg1: tensor, tensor)
-    outs (%s: tensor) -> tensor
+    outs (%s: tensor)
   return %ret : tensor
 }
@@ -59,7 +59,7 @@
   %ret = linalg.conv_1d_nwc_wcf {dilations = dense<1> : tensor<1xi64>,
                                  strides = dense<1> : tensor<1xi64>}
     ins (%arg0, %arg1: tensor, tensor)
-    outs (%s: tensor) -> tensor
+    outs (%s: tensor)
   return %ret : tensor
 }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir
@@ -30,7 +30,7 @@
   %output: tensor<6x6xi32>) -> tensor<6x6xi32> {
   %0 = linalg.conv_2d
     ins (%input, %filter: tensor<8x8xi32>, tensor<3x3xi32, #DCSR>)
-    outs (%output: tensor<6x6xi32>) -> tensor<6x6xi32>
+    outs (%output: tensor<6x6xi32>)
   return %0 : tensor<6x6xi32>
 }
@@ -39,7 +39,7 @@
   %s = bufferization.alloc_tensor() : tensor<6x6xi32, #DCSR>
   %0 = linalg.conv_2d
     ins (%input, %filter: tensor<8x8xi32>, tensor<3x3xi32, #DCSR>)
-    outs (%s: tensor<6x6xi32, #DCSR>) -> tensor<6x6xi32, #DCSR>
+    outs (%s: tensor<6x6xi32, #DCSR>)
   return %0 : tensor<6x6xi32, #DCSR>
 }
@@ -48,7 +48,7 @@
   %s = bufferization.alloc_tensor() : tensor<6x6xi32, #DCSR>
   %0 = linalg.conv_2d
     ins (%input, %filter: tensor<8x8xi32, #DCSR>, tensor<3x3xi32, #DCSR>)
-    outs (%s: tensor<6x6xi32, #DCSR>) -> tensor<6x6xi32, #DCSR>
+    outs (%s: tensor<6x6xi32, #DCSR>)
   return %0 : tensor<6x6xi32, #DCSR>
 }
@@ -57,7 +57,7 @@
   %s = bufferization.alloc_tensor() : tensor<6x6xi32, #CSR>
   %0 = linalg.conv_2d
     ins (%input, %filter: tensor<8x8xi32, #CSR>, tensor<3x3xi32, #CSR>)
-    outs (%s: tensor<6x6xi32, #CSR>) -> tensor<6x6xi32, #CSR>
+    outs (%s: tensor<6x6xi32, #CSR>)
   return %0 : tensor<6x6xi32, #CSR>
 }
@@ -66,7 +66,7 @@
   %s = bufferization.alloc_tensor() : tensor<6x6xi32, #CSC>
   %0 = linalg.conv_2d
     ins (%input, %filter: tensor<8x8xi32, #CSC>, tensor<3x3xi32, #CSC>)
-    outs (%s: tensor<6x6xi32, #CSC>) -> tensor<6x6xi32, #CSC>
+    outs (%s: tensor<6x6xi32, #CSC>)
   return %0 : tensor<6x6xi32, #CSC>
 }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nhwc_hwcf.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nhwc_hwcf.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nhwc_hwcf.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nhwc_hwcf.mlir
@@ -26,7 +26,7 @@
 // Creates and returns 4-D buffer of size (%s1, %s2, %s3, %s4) filled with the value %f
 func.func @alloc_4d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %s4 : index, %f : f32) -> tensor {
   %buf = bufferization.alloc_tensor(%s1, %s2, %s3, %s4) : tensor
-  %ret = linalg.fill ins(%f : f32) outs(%buf : tensor) -> tensor
+  %ret = linalg.fill ins(%f : f32) outs(%buf : tensor)
   return %ret : tensor
 }
@@ -34,7 +34,7 @@
   %ret = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>,
                                    strides = dense<1> : tensor<2xi64>}
     ins (%arg0, %arg1: tensor, tensor)
-    outs (%arg2: tensor) -> tensor
+    outs (%arg2: tensor)
   return %ret : tensor
 }
@@ -46,7 +46,7 @@
   %ret = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>,
                                    strides = dense<1> : tensor<2xi64>}
     ins (%arg0, %arg1: tensor, tensor)
-    outs (%s: tensor) -> tensor
+    outs (%s: tensor)
   return %ret : tensor
 }
@@ -58,7 +58,7 @@
   %ret = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>,
                                    strides = dense<1> : tensor<2xi64>}
     ins (%arg0, %arg1: tensor, tensor)
-    outs (%s: tensor) -> tensor
+    outs (%s: tensor)
   return %ret : tensor
 }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d.mlir
@@ -26,14 +26,14 @@
 // Creates and returns 3-D buffer of size (%s1, %s2, %s3) filled with the value %f
 func.func @alloc_3d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %f : f32) -> tensor {
   %buf = bufferization.alloc_tensor(%s1, %s2, %s3) : tensor
-  %ret = linalg.fill ins(%f : f32) outs(%buf : tensor) -> tensor
+  %ret = linalg.fill ins(%f : f32) outs(%buf : tensor)
   return %ret : tensor
 }

 func.func @conv_3d(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor {
   %ret = linalg.conv_3d
     ins (%arg0, %arg1: tensor, tensor)
-    outs (%arg2: tensor) -> tensor
+    outs (%arg2: tensor)
   return %ret : tensor
 }
@@ -42,7 +42,7 @@
   %s = bufferization.alloc_tensor(%c6, %c6, %c6) : tensor
   %ret = linalg.conv_3d
     ins (%arg0, %arg1: tensor, tensor)
-    outs (%s: tensor) -> tensor
+    outs (%s: tensor)
   return %ret : tensor
 }
@@ -51,7 +51,7 @@
   %s = bufferization.alloc_tensor(%c6, %c6, %c6) : tensor
   %ret = linalg.conv_3d
     ins (%arg0, %arg1: tensor, tensor)
-    outs (%s: tensor) -> tensor
+    outs (%s: tensor)
   return %ret : tensor
 }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d_ndhwc_dhwcf.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d_ndhwc_dhwcf.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d_ndhwc_dhwcf.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d_ndhwc_dhwcf.mlir
@@ -26,7 +26,7 @@
 // Creates and returns 5-D buffer of size (%s1, %s2, %s3, %s4, %s5) filled with the value %f
 func.func @alloc_5d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %s4 : index, %s5 : index, %f : f32) -> tensor {
   %buf = bufferization.alloc_tensor(%s1, %s2, %s3, %s4, %s5) : tensor
-  %ret = linalg.fill ins(%f : f32) outs(%buf : tensor) -> tensor
+  %ret = linalg.fill ins(%f : f32) outs(%buf : tensor)
   return %ret : tensor
 }
@@ -36,7 +36,7 @@
   %ret = linalg.conv_3d_ndhwc_dhwcf {dilations = dense<1> : tensor<3xi64>,
                                      strides = dense<1> : tensor<3xi64>}
     ins (%arg0, %arg1: tensor, tensor)
-    outs (%arg2: tensor) -> tensor
+    outs (%arg2: tensor)
   return %ret : tensor
 }
@@ -50,7 +50,7 @@
   %ret = linalg.conv_3d_ndhwc_dhwcf {dilations = dense<1> : tensor<3xi64>,
                                      strides = dense<1> : tensor<3xi64>}
     ins (%arg0, %arg1: tensor, tensor)
-    outs (%s: tensor) -> tensor
+    outs (%s: tensor)
   return %ret : tensor
 }
@@ -64,7 +64,7 @@
   %ret = linalg.conv_3d_ndhwc_dhwcf {dilations = dense<1> : tensor<3xi64>,
                                      strides = dense<1> : tensor<3xi64>}
     ins (%arg0, %arg1: tensor, tensor)
-    outs (%s: tensor) -> tensor
+    outs (%s: tensor)
   return %ret : tensor
 }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_dot.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_dot.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_dot.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_dot.mlir
@@ -27,7 +27,7 @@
   %x: tensor) -> tensor {
   %dot = linalg.dot ins(%a, %b: tensor<1024xf32, #SparseVector>,
                                 tensor<1024xf32, #SparseVector>)
-                    outs(%x: tensor) -> tensor
+                    outs(%x: tensor)
   return %dot : tensor
 }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_expand.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_expand.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_expand.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_expand.mlir
@@ -33,7 +33,7 @@
   %C = bufferization.alloc_tensor() : tensor<8x4xf64, #CSC>
   %D = linalg.matmul ins(%A, %B: tensor<8x2xf64, #CSC>, tensor<2x4xf64, #CSC>)
-                     outs(%C: tensor<8x4xf64, #CSC>) -> tensor<8x4xf64, #CSC>
+                     outs(%C: tensor<8x4xf64, #CSC>)
   return %D: tensor<8x4xf64, #CSC>
 }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_filter_conv2d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_filter_conv2d.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_filter_conv2d.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_filter_conv2d.mlir
@@ -25,7 +25,7 @@
   %output: tensor<6x6xi32>) -> tensor<6x6xi32> {
   %0 = linalg.conv_2d
     ins (%input, %filter: tensor<8x8xi32>, tensor<3x3xi32, #DCSR>)
-    outs (%output: tensor<6x6xi32>) -> tensor<6x6xi32>
+    outs (%output: tensor<6x6xi32>)
   return %0 : tensor<6x6xi32>
 }
@@ -34,7 +34,7 @@
   %s = bufferization.alloc_tensor() : tensor<6x6xi32, #DCSR>
   %0 = linalg.conv_2d
     ins (%input, %filter: tensor<8x8xi32>, tensor<3x3xi32, #DCSR>)
-    outs (%s: tensor<6x6xi32, #DCSR>) -> tensor<6x6xi32, #DCSR>
+    outs (%s: tensor<6x6xi32, #DCSR>)
   return %0 : tensor<6x6xi32, #DCSR>
 }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir
@@ -44,7 +44,7 @@
   %C: tensor<4x4xf64>) -> tensor<4x4xf64> {
   %D = linalg.matmul ins(%A, %B: tensor<4x8xf64>, tensor<8x4xf64>)
-                     outs(%C: tensor<4x4xf64>) -> tensor<4x4xf64>
+                     outs(%C: tensor<4x4xf64>)
   return %D: tensor<4x4xf64>
 }
@@ -56,7 +56,7 @@
   %C = bufferization.alloc_tensor() : tensor<4x4xf64, #CSR>
   %D = linalg.matmul ins(%A, %B: tensor<4x8xf64, #CSR>, tensor<8x4xf64, #CSR>)
-                     outs(%C: tensor<4x4xf64, #CSR>) -> tensor<4x4xf64, #CSR>
+                     outs(%C: tensor<4x4xf64, #CSR>)
   return %D: tensor<4x4xf64, #CSR>
 }
@@ -68,7 +68,7 @@
   %C = bufferization.alloc_tensor() : tensor<4x4xf64, #DCSR>
   %D = linalg.matmul ins(%A, %B: tensor<4x8xf64, #DCSR>, tensor<8x4xf64, #DCSR>)
-                     outs(%C: tensor<4x4xf64, #DCSR>) -> tensor<4x4xf64, #DCSR>
+                     outs(%C: tensor<4x4xf64, #DCSR>)
   return %D: tensor<4x4xf64, #DCSR>
 }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_quantized_matmul.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_quantized_matmul.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_quantized_matmul.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_quantized_matmul.mlir
@@ -30,7 +30,7 @@
   %c2 = arith.constant 2 : i32
   %0 = linalg.quantized_matmul
    ins(%input1, %input2, %c2, %c0 : tensor<5x3xi8>, tensor<3x6xi8, #DCSR>, i32, i32)
-   outs(%output : tensor<5x6xi32>) -> tensor<5x6xi32>
+   outs(%output : tensor<5x6xi32>)
   return %0: tensor<5x6xi32>
 }
diff --git a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir
--- a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir
+++ b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir
@@ -7,10 +7,10 @@
   %d0 = tensor.dim %arg0, %c0 : tensor
   %d1 = tensor.dim %arg1, %c1 : tensor
   %init = tensor.empty(%d0, %d1) : tensor
-  %fill = linalg.fill ins(%cst : f32) outs(%init : tensor) -> tensor
+  %fill = linalg.fill ins(%cst : f32) outs(%init : tensor)
   %gemm = linalg.matmul {__internal_linalg_transform__ = "fusion"}
       ins(%arg0, %arg1 : tensor, tensor)
-      outs(%fill : tensor) -> tensor
+      outs(%fill : tensor)
   return %gemm : tensor
 }
 // CHECK: func.func @gemm_fill_fusion(
@@ -42,10 +42,10 @@
   %d0 = tensor.dim %arg0, %c0 : tensor
   %d1 = tensor.dim %arg1, %c1 : tensor
   %init = tensor.empty(%d0, %d1) : tensor
-  %fill = linalg.fill ins(%cst : f32) outs(%init : tensor) -> tensor
+  %fill = linalg.fill ins(%cst : f32) outs(%init : tensor)
   %gemm = linalg.matmul
       ins(%arg0, %arg1 : tensor, tensor)
-      outs(%fill : tensor) -> tensor
+      outs(%fill : tensor)
   %generic = linalg.generic {
       __internal_linalg_transform__ = "fusion",
       indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>],
@@ -91,14 +91,14 @@
   %d0 = tensor.dim %lhs0, %c0 : tensor
   %d1 = tensor.dim %rhs0, %c1 : tensor
   %init0 = tensor.empty(%d0, %d1) : tensor
-  %fill0 = linalg.fill ins(%cst : f32) outs(%init0 : tensor) -> tensor
+  %fill0 = linalg.fill ins(%cst : f32) outs(%init0 : tensor)
   %gemm0 = linalg.matmul
-      ins(%lhs0, %rhs0 : tensor, tensor) outs(%fill0 : tensor) -> tensor
+      ins(%lhs0, %rhs0 : tensor, tensor) outs(%fill0 : tensor)
   %d2 = tensor.dim %rhs1, %c1 : tensor
   %init1 = tensor.empty(%d0, %d2) : tensor
-  %fill1 = linalg.fill ins(%cst : f32) outs(%init1 : tensor) -> tensor
+  %fill1 = linalg.fill ins(%cst : f32) outs(%init1 : tensor)
   %gemm1 = linalg.matmul {__internal_linalg_transform__ = "gemm_fusion"}
-      ins(%gemm0, %rhs1 : tensor, tensor) outs(%fill1 : tensor) -> tensor
+      ins(%gemm0, %rhs1 : tensor, tensor) outs(%fill1 : tensor)
   return %gemm1 : tensor
 }
 // CHECK: func.func @gemm_gemm_fusion(
@@ -141,10 +141,10 @@
   %d0 = tensor.dim %arg0, %c0 : tensor
   %d1 = tensor.dim %arg1, %c1 : tensor
   %init0 = tensor.empty(%d0, %d1) : tensor
-  %fill = linalg.fill ins(%cst : f32) outs(%init0 : tensor) -> tensor
+  %fill = linalg.fill ins(%cst : f32) outs(%init0 : tensor)
   %gemm = linalg.matmul
       ins(%arg0, %arg1 : tensor, tensor)
-      outs(%fill : tensor) -> tensor
+      outs(%fill : tensor)
   %init1 = tensor.empty(%d1, %d0) : tensor
   %transpose = linalg.generic {
       __internal_linalg_transform__ = "fusion",
@@ -193,10 +193,10 @@
   %d1 = tensor.dim %arg1, %c1 : tensor
   %cst = arith.constant 0.0 : f32
   %0 = tensor.empty(%d0, %d1) : tensor
-  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor) -> tensor
+  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor)
   %2 = linalg.matmul
       ins(%arg0, %arg1 : tensor, tensor)
-      outs(%1 : tensor) -> tensor
+      outs(%1 : tensor)
   %3 = linalg.generic {
       __internal_linalg_transform__ = "gemm_interchange_fusion",
       indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
@@ -240,7 +240,7 @@
   %0 = tensor.dim %arg2, %c0 : tensor
   %1 = tensor.dim %arg2, %c1 : tensor
   %2 = linalg.matmul
      ins(%arg0, %arg1 : tensor, tensor)
-     outs(%arg2 : tensor) -> tensor
+     outs(%arg2 : tensor)
   %3 = tensor.dim %2, %c0 : tensor
   %4 = tensor.dim %2, %c1 : tensor
   %5 = tensor.empty(%3, %4) : tensor
@@ -293,7 +293,7 @@
   %0 = tensor.dim %arg2, %c0 : tensor
   %1 = tensor.dim %arg2, %c1 : tensor
   %2 = linalg.matmul
      ins(%arg0, %arg1 : tensor, tensor)
-     outs(%arg2 : tensor) -> tensor
+     outs(%arg2 : tensor)
   %3 = tensor.dim %2, %c0 : tensor
   %4 = tensor.dim %2, %c1 : tensor
   %5 = tensor.empty(%3, %4) : tensor
@@ -348,13 +348,13 @@
   %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor) -> tensor {
   %0 = linalg.matmul
      ins(%arg0, %arg1 : tensor, tensor)
-     outs(%arg2 : tensor) -> tensor // [M, N0] * [N0, N1]
+     outs(%arg2 : tensor)
   %1 = linalg.matmul
      ins(%0, %arg3 : tensor, tensor)
-     outs(%arg4 : tensor) -> tensor // [M, N1] * [N1, N2]
+     outs(%arg4 : tensor)
   %2 = linalg.matmul {__internal_linalg_transform__ = "gemm_sequence_fusion"}
      ins(%1, %arg5 : tensor, tensor)
-     outs(%arg6 : tensor) -> tensor // [M, N2] * [N2, N3]
+     outs(%arg6 : tensor)
   return %2 : tensor
 }
@@ -402,7 +402,7 @@
   %cst = arith.constant 0.000000e+00 : f32
   %cst_0 = arith.constant 0xFF800000 : f32
   %0 = tensor.empty() : tensor<30xf32>
-  %1 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<30xf32>) -> tensor<30xf32>
+  %1 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<30xf32>)
   %2 = linalg.generic {
       indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>],
       iterator_types = ["parallel", "reduction"]}
@@ -412,7 +412,7 @@
     linalg.yield %8 : f32
   } -> tensor<30xf32>
   %3 = tensor.empty() : tensor<30x3xf32>
-  %4 = linalg.fill ins(%cst : f32) outs(%0 : tensor<30xf32>) -> tensor<30xf32>
+  %4 = linalg.fill ins(%cst : f32) outs(%0 : tensor<30xf32>)
   %5:2 = linalg.generic {
       indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>],
diff --git a/mlir/test/Interfaces/TilingInterface/tile-using-interface.mlir b/mlir/test/Interfaces/TilingInterface/tile-using-interface.mlir
--- a/mlir/test/Interfaces/TilingInterface/tile-using-interface.mlir
+++ b/mlir/test/Interfaces/TilingInterface/tile-using-interface.mlir
@@ -4,7 +4,7 @@
   %arg2 : tensor) -> tensor {
   %0 = linalg.matmul {__internal_linalg_transform__ = "simple_gemm"}
       ins(%arg0, %arg1 : tensor, tensor)
-      outs(%arg2 : tensor) -> tensor
+      outs(%arg2 : tensor)
   return %0 : tensor
 }
 // CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (10, -d0 + s0)>
@@ -141,7 +141,7 @@
   dilation = dense<[4, 5]> : tensor<2xi64>,
   __internal_linalg_transform__ = "simple_conv"}
       ins(%arg0, %arg1 : tensor, tensor)
-      outs(%arg2 : tensor) -> tensor
+      outs(%arg2 : tensor)
   return %0 : tensor
 }
 // CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (10, -d0 + s0)>
@@ -231,7 +231,7 @@
   %arg2 : tensor) -> tensor {
   %0 = linalg.matmul {__internal_linalg_transform__ = "gemm_interchange"}
       ins(%arg0, %arg1 : tensor, tensor)
-      outs(%arg2 : tensor) -> tensor
+      outs(%arg2 : tensor)
   return %0 : tensor
 }
 // CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (20, -d0 + s0)>
diff --git a/mlir/test/python/dialects/linalg/ops.py b/mlir/test/python/dialects/linalg/ops.py
--- a/mlir/test/python/dialects/linalg/ops.py
+++ b/mlir/test/python/dialects/linalg/ops.py
@@ -21,7 +21,7 @@
   # CHECK-LABEL: func @fill_tensor
   # CHECK-SAME: %[[OUT:[0-9a-z]+]]: tensor<12x?xf32>
   # CHECK-NEXT: %[[CST:.*]] = arith.constant 0.0{{.*}} : f32
-  # CHECK-NEXT: %[[RES:.*]] = linalg.fill ins(%[[CST]] : f32) outs(%[[OUT]] : tensor<12x?xf32>) -> tensor<12x?xf32>
+  # CHECK-NEXT: %[[RES:.*]] = linalg.fill ins(%[[CST]] : f32) outs(%[[OUT]] : tensor<12x?xf32>)
   # CHECK-NEXT: return %[[RES]] : tensor<12x?xf32>
   @func.FuncOp.from_py_func(
       RankedTensorType.get((12, ShapedType.get_dynamic_size()), f32))
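Note: every test hunk in this patch makes the same syntactic change, so a single before/after sketch summarizes it. This is a hypothetical snippet for illustration (the values %a, %b, %c and the static shapes are made up, not taken from any test above): named structured ops whose outputs are tensors drop the trailing "-> <result-type>" from their assembly, and the result type is instead implied by the type of the outs operand.

  // Old assembly, with the explicit trailing result type:
  %d = linalg.matmul ins(%a, %b : tensor<4x8xf32>, tensor<8x4xf32>)
                     outs(%c : tensor<4x4xf32>) -> tensor<4x4xf32>

  // New assembly after this patch; %d still has type tensor<4x4xf32>,
  // now taken from the outs operand rather than spelled out:
  %d = linalg.matmul ins(%a, %b : tensor<4x8xf32>, tensor<8x4xf32>)
                     outs(%c : tensor<4x4xf32>)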