diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.h b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.h
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.h
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.h
@@ -72,7 +72,7 @@
 ///
 /// Examples:
 ///
-/// 1. linalg.fill(%A, %f) : memref<f32>, f32
+/// 1. linalg.fill(%f, %A) : f32, memref<f32>
 ///    name mangles into `linalg_fill_viewf32_f32_impl`
 ///
 /// 2. linalg.dot %A, %B, %C :
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
@@ -211,9 +211,9 @@
   }];

   let assemblyFormat = [{
-    `(` $output `,` $value `)` attr-dict `:`
-        type($output) `,` type($value) (`->` type($result)^)?
-        custom<FillOpRegion>($region, ref(type($output)), ref(type($value)))
+    `(` $value `,` $output `)` attr-dict `:`
+        type($value) `,` type($output) (`->` type($result)^)?
+        custom<FillOpRegion>($region, ref(type($value)), ref(type($output)))
   }];

   let builders = [
diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -435,8 +435,8 @@
                          TypeRange{output.getType()}, {});
 }

-ParseResult parseFillOpRegion(OpAsmParser &parser, Region &r, Type outputType,
-                              Type valueType) {
+ParseResult parseFillOpRegion(OpAsmParser &parser, Region &r, Type valueType,
+                              Type outputType) {
   OpBuilder opBuilder(parser.getBuilder().getContext());
   fillStructuredOpRegion(opBuilder, r, TypeRange{valueType},
                          TypeRange{outputType});
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp b/mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp
@@ -648,7 +648,7 @@
 ///   parallel loops and appear in the result of the map
 ///
 /// Example 1:
-///   linalg.fill(%c, %cst)
+///   linalg.fill(%cst, %c)
 ///   linalg.matmul ins(%a, %b) outs(%c)
 ///   Number of parallel loops : 2
 ///   producerIndexMap = affine_map<(i, j) -> (i, j)>
diff --git a/mlir/lib/Dialect/Vector/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/VectorTransforms.cpp
--- a/mlir/lib/Dialect/Vector/VectorTransforms.cpp
+++ b/mlir/lib/Dialect/Vector/VectorTransforms.cpp
@@ -2413,7 +2413,7 @@
 ///      memref.cast %A: memref to compatibleMemRefType
 ///      scf.yield %view, ... : compatibleMemRefType, index, index
 ///    } else {
-///      %2 = linalg.fill(%alloc, %pad)
+///      %2 = linalg.fill(%pad, %alloc)
 ///      %3 = subview %view [...][...][...]
 ///      linalg.copy(%3, %alloc)
 ///      memref.cast %alloc: memref to compatibleMemRefType
diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
--- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
+++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
@@ -544,7 +544,7 @@
 func @reduce_float(%arg0: tensor<5x4xf32>) -> () {
   // CHECK: [[INIT:%.+]] = linalg.init_tensor [4]
   // CHECK: [[CST0:%.+]] = constant 0.0
-  // CHECK: [[FILL:%.+]] = linalg.fill([[INIT]], [[CST0]])
+  // CHECK: [[FILL:%.+]] = linalg.fill([[CST0]], [[INIT]])
   // CHECK: [[GENERIC:%.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["reduction", "parallel"]} ins([[ARG0]] : tensor<5x4xf32>) outs([[FILL]] : tensor<4xf32>)
   // CHECK: ^bb0(%arg1: f32, %arg2: f32)
   // CHECK: [[RES:%.+]] = addf %arg1, %arg2 : f32
@@ -554,7 +554,7 @@
   // CHECK: [[INIT:%.+]] = linalg.init_tensor [5]
   // CHECK: [[CST0:%.+]] = constant 0.0
-  // CHECK: [[FILL:%.+]] = linalg.fill([[INIT]], [[CST0]])
+  // CHECK: [[FILL:%.+]] = linalg.fill([[CST0]], [[INIT]])
   // CHECK: [[GENERIC:%.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP2]]], iterator_types = ["parallel", "reduction"]} ins([[ARG0]] : tensor<5x4xf32>) outs([[FILL]] : tensor<5xf32>)
   // CHECK: ^bb0(%arg1: f32, %arg2: f32)
   // CHECK: [[RES:%.+]] = addf %arg1, %arg2 : f32
@@ -595,7 +595,7 @@
 func @reduce_int(%arg0: tensor<5x4xi32>) -> () {
   // CHECK: [[INIT:%.+]] = linalg.init_tensor [4]
   // CHECK: [[CST0:%.+]] = constant 0
-  // CHECK: [[FILL:%.+]] = linalg.fill([[INIT]], [[CST0]])
+  // CHECK: [[FILL:%.+]] = linalg.fill([[CST0]], [[INIT]])
   // CHECK: [[GENERIC:%.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["reduction", "parallel"]} ins([[ARG0]] : tensor<5x4xi32>) outs([[FILL]] : tensor<4xi32>)
   // CHECK: ^bb0(%arg1: i32, %arg2: i32)
   // CHECK: [[RES:%.+]] = addi %arg1, %arg2 : i32
@@ -605,7 +605,7 @@
   // CHECK: [[INIT:%.+]] = linalg.init_tensor [5]
   // CHECK: [[CST0:%.+]] = constant 0
-  // CHECK: [[FILL:%.+]] = linalg.fill([[INIT]], [[CST0]])
+  // CHECK: [[FILL:%.+]] = linalg.fill([[CST0]], [[INIT]])
   // CHECK: [[GENERIC:%.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP2]]], iterator_types = ["parallel", "reduction"]} ins([[ARG0]] : tensor<5x4xi32>) outs([[FILL]] : tensor<5xi32>)
   // CHECK: ^bb0(%arg1: i32, %arg2: i32)
   // CHECK: [[RES:%.+]] = addi %arg1, %arg2 : i32
@@ -645,7 +645,7 @@
 func @reduce_bool(%arg0: tensor<5x4xi1>) -> () {
   // CHECK: [[INIT:%.+]] = linalg.init_tensor [4]
   // CHECK: [[CST0:%.+]] = constant true
-  // CHECK: [[FILL:%.+]] = linalg.fill([[INIT]], [[CST0]])
+  // CHECK: [[FILL:%.+]] = linalg.fill([[CST0]], [[INIT]])
   // CHECK: [[GENERIC:%.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["reduction", "parallel"]} ins([[ARG0]] : tensor<5x4xi1>) outs([[FILL]] : tensor<4xi1>)
   // CHECK: ^bb0(%arg1: i1, %arg2: i1)
   // CHECK: [[RES:%.+]] = and %arg1, %arg2 : i1
@@ -677,7 +677,7 @@
   // CHECK: [[RESULT_AXIS:%.+]] = addi [[ARG0_DIM0]], [[ARG1_AXIS]]
   // CHECK: [[INIT:%.+]] = linalg.init_tensor [11, 1]
   // CHECK: [[CST:%.+]] = constant 0.0
-  // CHECK: [[FILL:%.+]] = linalg.fill([[INIT]], [[CST]])
+  // CHECK: [[FILL:%.+]] = linalg.fill([[CST]], [[INIT]])
   // CHECK: [[ARG0_DIM0:%.+]] = memref.dim %arg0, [[AXIS]]
   // CHECK: [[INSERT0:%.+]] = tensor.insert_slice %arg0 into [[FILL]]{{\[}}[[OFFSET]], [[OFFSET]]] {{\[}}[[ARG0_DIM0]], [[ARG0_DIM1]]] {{\[}}[[STRIDE]], [[STRIDE]]]
   // CHECK: [[NEW_OFFSET:%.+]] = addi [[OFFSET]], [[ARG0_DIM0]]
@@ -696,7 +696,7 @@
   // CHECK: [[RESULT_AXIS:%.+]] = addi [[ARG0_DIM1]], [[ARG1_AXIS]]
   // CHECK: [[INIT:%.+]] = linalg.init_tensor [5, 2]
   // CHECK: [[CST:%.+]] = constant 0.0
-  // CHECK: [[FILL:%.+]] = linalg.fill([[INIT]], [[CST]])
+  // CHECK: [[FILL:%.+]] = linalg.fill([[CST]], [[INIT]])
   // CHECK: [[ARG0_DIM1:%.+]] = memref.dim %arg0, [[AXIS]]
   // CHECK: [[INSERT0:%.+]] = tensor.insert_slice %arg0 into [[FILL]]{{\[}}[[OFFSET]], [[OFFSET]]] {{\[}}[[ARG0_DIM0]], [[ARG0_DIM1]]] {{\[}}[[STRIDE]], [[STRIDE]]]
   // CHECK: [[NEW_OFFSET:%.+]] = addi [[OFFSET]], [[ARG0_DIM1]]
@@ -847,7 +847,7 @@
 func @matmul(%arg0: tensor<1x5x3xf32>, %arg1: tensor<1x3x6xf32>, %arg2: tensor<1x6xf32>) -> (tensor<1x5x6xf32>) {
   // CHECK: [[C0:%.+]] = constant 0
   // CHECK: [[INIT:%.+]] = linalg.init_tensor [1, 5, 6]
-  // CHECK: [[FILLED:%.+]] = linalg.fill([[INIT]], [[C0]]) : tensor<1x5x6xf32>, f32 -> tensor<1x5x6xf32>
+  // CHECK: [[FILLED:%.+]] = linalg.fill([[C0]], [[INIT]]) : f32, tensor<1x5x6xf32> -> tensor<1x5x6xf32>
   // CHECK: linalg.batch_matmul ins(%arg0, %arg1 : tensor<1x5x3xf32>, tensor<1x3x6xf32>) outs([[FILLED]] : tensor<1x5x6xf32>) -> tensor<1x5x6xf32>
   %0 = "tosa.matmul"(%arg0, %arg1) : (tensor<1x5x3xf32>, tensor<1x3x6xf32>) -> (tensor<1x5x6xf32>)
   return %0 : tensor<1x5x6xf32>
@@ -928,10 +928,10 @@
 func @argmax(%arg0 : tensor<3x2xi32>, %arg1 : tensor<6xf32>) -> () {
   // CHECK: [[IDX_INIT:%.+]] = linalg.init_tensor [2]
   // CHECK: [[IDX_MIN:%.+]] = constant 0 : i32
-  // CHECK: [[IDX_FILL:%.+]] = linalg.fill([[IDX_INIT]], [[IDX_MIN]])
+  // CHECK: [[IDX_FILL:%.+]] = linalg.fill([[IDX_MIN]], [[IDX_INIT]])
   // CHECK: [[VAL_INIT:%.+]] = linalg.init_tensor [2]
   // CHECK: [[VAL_MIN:%.+]] = constant -2147483648
-  // CHECK: [[VAL_FILL:%.+]] = linalg.fill([[VAL_INIT]], [[VAL_MIN]])
+  // CHECK: [[VAL_FILL:%.+]] = linalg.fill([[VAL_MIN]], [[VAL_INIT]])
   // CHECK: linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP1]]], iterator_types = ["reduction", "parallel"]} ins(%arg0 : tensor<3x2xi32>) outs([[IDX_FILL]], [[VAL_FILL]] : tensor<2xi32>, tensor<2xi32>)
   // CHECK: [[IDX:%.+]] = linalg.index 0
   // CHECK: [[CAST:%.+]] = index_cast [[IDX]]
@@ -943,10 +943,10 @@
   // CHECK: [[IDX_INIT:%.+]] = linalg.init_tensor [3]
   // CHECK: [[IDX_MIN:%.+]] = constant 0 : i32
-  // CHECK: [[IDX_FILL:%.+]] = linalg.fill([[IDX_INIT]], [[IDX_MIN]])
+  // CHECK: [[IDX_FILL:%.+]] = linalg.fill([[IDX_MIN]], [[IDX_INIT]])
   // CHECK: [[VAL_INIT:%.+]] = linalg.init_tensor [3]
   // CHECK: [[VAL_MIN:%.+]] = constant -2147483648
-  // CHECK: [[VAL_FILL:%.+]] = linalg.fill([[VAL_INIT]], [[VAL_MIN]])
+  // CHECK: [[VAL_FILL:%.+]] = linalg.fill([[VAL_MIN]], [[VAL_INIT]])
   // CHECK: linalg.generic {indexing_maps = [#map0, #map2, #map2], iterator_types = ["parallel", "reduction"]} ins(%arg0 : tensor<3x2xi32>) outs([[IDX_FILL]], [[VAL_FILL]] : tensor<3xi32>, tensor<3xi32>)
   // CHECK: [[IDX:%.+]] = linalg.index 1
   // CHECK: [[CAST:%.+]] = index_cast [[IDX]]
@@ -1047,7 +1047,7 @@
 func @max_pool(%arg0: tensor<1x6x34x62xf32>) -> () {
   // CHECK-DAG: [[CONST:%.+]] = constant -3.40282347E+38
   // CHECK-DAG: [[INIT:%.+]] = linalg.init_tensor [1, 4, 32, 62]
-  // CHECK-DAG: [[FILL:%.+]] = linalg.fill([[INIT]], [[CONST]])
+  // CHECK-DAG: [[FILL:%.+]] = linalg.fill([[CONST]], [[INIT]])
   // CHECK-DAG: [[KERNEL:%.+]] = linalg.init_tensor [3, 3]
   // CHECK: linalg.pooling_nhwc_max {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, [[KERNEL]] : tensor<1x6x34x62xf32>, tensor<3x3xf32>) outs([[FILL]] : tensor<1x4x32x62xf32>)
   %0 = "tosa.max_pool2d"(%arg0) {pad = [0, 0, 0, 0], kernel = [3, 3], stride = [1, 1]} : (tensor<1x6x34x62xf32>) -> (tensor<1x4x32x62xf32>)
@@ -1061,7 +1061,7 @@
   // CHECK-DAG: linalg.yield [[CONST]]
   // CHECK-DAG: [[INITVAL:%.+]] = constant -3.40282347E+38 : f32
   // CHECK-DAG: [[INIT:%.+]] = linalg.init_tensor [1, 4, 33, 62]
-  // CHECK-DAG: [[FILL:%.+]] = linalg.fill([[INIT]], [[INITVAL]])
+  // CHECK-DAG: [[FILL:%.+]] = linalg.fill([[INITVAL]], [[INIT]])
   // CHECK-DAG: [[KERNEL:%.+]] = linalg.init_tensor [3, 3]
   // CHECK: linalg.pooling_nhwc_max {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins([[PAD]], [[KERNEL]] : tensor<1x6x35x62xf32>, tensor<3x3xf32>) outs([[FILL]] : tensor<1x4x33x62xf32>)
   %0 = "tosa.max_pool2d"(%arg0) {pad = [0, 0, 0, 1], kernel = [3, 3], stride = [1, 1]} : (tensor<1x6x34x62xf32>) -> (tensor<1x4x33x62xf32>)
@@ -1100,7 +1100,7 @@
   // CHECK: [[PAD:%.+]] = linalg.pad_tensor %arg0 low[0, 1, 1, 0] high[0, 1, 1, 0]
   // CHECK: [[CONST:%.+]] = constant 0
   // CHECK: [[INIT:%.+]] = linalg.init_tensor [1, 5, 33, 62]
-  // CHECK: [[FILL:%.+]] = linalg.fill([[INIT]], [[CONST]])
+  // CHECK: [[FILL:%.+]] = linalg.fill([[CONST]], [[INIT]])
   // CHECK: [[KERNEL:%.+]] = linalg.init_tensor [4, 4]
   // CHECK: [[POOL:%.+]] = linalg.pooling_nhwc_sum {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins([[PAD]], [[KERNEL]] : tensor<1x8x36x62xf32>, tensor<4x4xf32>) outs([[FILL]] : tensor<1x5x33x62xf32>)
   // CHECK: [[GENERIC:%.+]] = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs([[POOL]] : tensor<1x5x33x62xf32>)
diff --git a/mlir/test/Dialect/Linalg/bufferize.mlir b/mlir/test/Dialect/Linalg/bufferize.mlir
--- a/mlir/test/Dialect/Linalg/bufferize.mlir
+++ b/mlir/test/Dialect/Linalg/bufferize.mlir
@@ -244,10 +244,10 @@
 func @bufferize_fill(%arg0: tensor) -> tensor {
   %c0 = constant 0.0 : f32
   // CHECK: %[[MEMREF:.*]] = memref.buffer_cast %[[IN]] : memref
-  // CHECK: linalg.fill(%[[MEMREF]], %cst) : memref, f32
+  // CHECK: linalg.fill(%cst, %[[MEMREF]]) : f32, memref
   // CHECK: %[[TENSOR:.*]] = memref.tensor_load %[[MEMREF]] : memref
   // CHECK: return %[[TENSOR]]
-  %0 = linalg.fill(%arg0, %c0) : tensor, f32 -> tensor
+  %0 = linalg.fill(%c0, %arg0) : f32, tensor -> tensor
   return %0 : tensor
 }
diff --git a/mlir/test/Dialect/Linalg/canonicalize.mlir b/mlir/test/Dialect/Linalg/canonicalize.mlir
--- a/mlir/test/Dialect/Linalg/canonicalize.mlir
+++ b/mlir/test/Dialect/Linalg/canonicalize.mlir
@@ -35,7 +35,7 @@
     %14 = affine.min affine_map<(d0) -> (-d0 + 192, 24)>(%arg3)
     %16 = memref.subview %out[%arg3] [%14] [1] : memref<192xf32, #map> to memref
-    linalg.fill(%16, %cst) : memref, f32
+    linalg.fill(%cst, %16) : f32, memref
     linalg.yield
   }
   return
@@ -684,7 +684,7 @@
   %c0_i32 = constant 0 : i32
   %c0 = constant 0 : index
   %cst = constant 0.000000e+00 : f32
-  %0 = linalg.fill(%arg0, %c0_i32) : tensor<7x7xi32>, i32 -> tensor<7x7xi32>
+  %0 = linalg.fill(%c0_i32, %arg0) : i32, tensor<7x7xi32> -> tensor<7x7xi32>
   %1 = linalg.matmul ins(%arg1, %arg1: tensor<7x7xf32>, tensor<7x7xf32>)
                      outs(%arg1: tensor<7x7xf32>) -> tensor<7x7xf32>
   %2 = linalg.generic #trait outs(%arg0 : tensor<7x7xi32>) {
@@ -720,7 +720,7 @@
   %c21 = constant 21 : index
   %c42 = constant 42 : index
   %0 = linalg.init_tensor [%c21, %c42] : tensor
-  %1 = linalg.fill(%0, %arg1) : tensor, f32 -> tensor
+  %1 = linalg.fill(%arg1, %0) : f32, tensor -> tensor
   %2 = memref.dim %arg0, %c0 : tensor
   %3 = memref.dim %arg0, %c1 : tensor
   %4 = tensor.insert_slice %arg0 into %1[%arg2, %arg3] [%2, %3] [1, 1] : tensor into tensor
@@ -728,7 +728,7 @@
 }
 // CHECK-LABEL: func @propogate_casts
 // CHECK: %[[INIT:.+]] = linalg.init_tensor [21, 42]
-// CHECK: %[[FILL:.+]] = linalg.fill(%[[INIT]], %{{.+}})
+// CHECK: %[[FILL:.+]] = linalg.fill(%{{.+}}, %[[INIT]])
 // CHECK: %[[INSERTED:.+]] = tensor.insert_slice %{{.+}} into %[[FILL]]
 // CHECK: %[[RESULT:.+]] = tensor.cast %[[INSERTED]]
 // CHECK: return %[[RESULT]]
@@ -752,8 +752,8 @@
   %zero = constant 0.0 : f32
   // CHECK: %[[INIT:.+]] = linalg.init_tensor [6, 4] : tensor<6x4xf32>
   %init = linalg.init_tensor [1, 2, 3, 4] : tensor<1x2x3x4xf32>
-  // CHECK: %[[FILL:.+]] = linalg.fill(%[[INIT]], %cst) : tensor<6x4xf32>, f32 -> tensor<6x4xf32>
-  %fill = linalg.fill(%init, %zero) : tensor<1x2x3x4xf32>, f32 -> tensor<1x2x3x4xf32>
+  // CHECK: %[[FILL:.+]] = linalg.fill(%cst, %[[INIT]]) : f32, tensor<6x4xf32> -> tensor<6x4xf32>
+  %fill = linalg.fill(%zero, %init) : f32, tensor<1x2x3x4xf32> -> tensor<1x2x3x4xf32>
   %reshape = linalg.tensor_collapse_shape %fill [[0, 1, 2], [3]]
       : tensor<1x2x3x4xf32> into tensor<6x4xf32>
   // CHECK: return %[[FILL]] : tensor<6x4xf32>
@@ -767,8 +767,8 @@
 func @fold_fill_reshape_dynamic(%arg0 : tensor) -> tensor {
   %zero = constant 0.0 : f32
   // CHECK: %[[RESHAPE:.+]] = linalg.tensor_collapse_shape %[[ARG0]]
-  %0 = linalg.fill(%arg0, %zero) : tensor, f32 -> tensor
-  // CHECK: %[[RESULT:.+]] = linalg.fill(%[[RESHAPE]], %{{.+}})
+  %0 = linalg.fill(%zero, %arg0) : f32, tensor -> tensor
+  // CHECK: %[[RESULT:.+]] = linalg.fill(%{{.+}}, %[[RESHAPE]])
   %1 = linalg.tensor_collapse_shape %0 [[0, 1, 2], [3, 4]]
       : tensor into tensor
   // CHECK: return %[[RESULT]]
diff --git a/mlir/test/Dialect/Linalg/comprehensive-func-bufferize-analysis.mlir b/mlir/test/Dialect/Linalg/comprehensive-func-bufferize-analysis.mlir
--- a/mlir/test/Dialect/Linalg/comprehensive-func-bufferize-analysis.mlir
+++ b/mlir/test/Dialect/Linalg/comprehensive-func-bufferize-analysis.mlir
@@ -355,7 +355,7 @@
   // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
   %sA = tensor.extract_slice %A[0, 0][%idx, %idx][1, 1] : tensor to tensor
   %ssA = tensor.extract_slice %sA[0, 0][4, 4][1, 1] : tensor to tensor<4x4xf32>
-  %FA = linalg.fill(%ssA, %f0) : tensor<4x4xf32>, f32 -> tensor<4x4xf32>
+  %FA = linalg.fill(%f0, %ssA) : f32, tensor<4x4xf32> -> tensor<4x4xf32>
   %rsA = tensor.insert_slice %FA into %sA[0, 0][4, 4][1, 1] : tensor<4x4xf32> into tensor
   %rA = tensor.insert_slice %rsA into %A[0, 0][%idx, %idx][1, 1] : tensor into tensor
@@ -382,7 +382,7 @@
   %sB = tensor.extract_slice %B[0, 0][%idx, %idx][1, 1] : tensor to tensor
   %ssB = tensor.extract_slice %sB[0, 0][4, %idx][1, 1] : tensor to tensor<4x?xf32>
   %sssB = tensor.extract_slice %ssB[0, 0][4, 4][1, 1] : tensor<4x?xf32> to tensor<4x4xf32>
-  %FB = linalg.fill(%sssB, %f0) : tensor<4x4xf32>, f32 -> tensor<4x4xf32>
+  %FB = linalg.fill(%f0, %sssB) : f32, tensor<4x4xf32> -> tensor<4x4xf32>
   %rssB = tensor.insert_slice %FB into %ssB[0, 0][4, 4][1, 1] : tensor<4x4xf32> into tensor<4x?xf32>
   %rsB = tensor.insert_slice %rssB into %sB[0, 0][4, %idx][1, 1] : tensor<4x?xf32> into tensor
   %rB = tensor.insert_slice %rsB into %B[0, 0][%idx, %idx][1, 1] : tensor into tensor
@@ -405,7 +405,7 @@
   // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
   %sC = tensor.extract_slice %C[0, 0][%idx, %idx][1, 1] : tensor to tensor
   %ssC = tensor.extract_slice %sC[0, 0][4, 4][1, 1] : tensor to tensor<4x4xf32>
-  %FC = linalg.fill(%ssC, %f0) : tensor<4x4xf32>, f32 -> tensor<4x4xf32>
+  %FC = linalg.fill(%f0, %ssC) : f32, tensor<4x4xf32> -> tensor<4x4xf32>
   %rsC = tensor.insert_slice %FC into %sC[0, 0][12345, 67890][1, 1] : tensor<4x4xf32> into tensor
   %rC = tensor.insert_slice %rsC into %C[0, 0][%idx, %idx][1, 1] : tensor into tensor
diff --git a/mlir/test/Dialect/Linalg/comprehensive-func-bufferize.mlir b/mlir/test/Dialect/Linalg/comprehensive-func-bufferize.mlir
--- a/mlir/test/Dialect/Linalg/comprehensive-func-bufferize.mlir
+++ b/mlir/test/Dialect/Linalg/comprehensive-func-bufferize.mlir
@@ -12,8 +12,8 @@
   /// Inplaceable, no alloc
   // CHECK-NOT: alloc
-  // CHECK: linalg.fill(%[[I]], %[[F0]]) : memref, f32
-  %r = linalg.fill(%A, %f0) : tensor, f32 -> tensor
+  // CHECK: linalg.fill(%[[F0]], %[[I]]) : f32, memref
+  %r = linalg.fill(%f0, %A) : f32, tensor -> tensor
   // CHECK: %[[R:.*]] = memref.tensor_load %[[I]] : memref
   // CHECK: return %[[R]] : tensor
@@ -37,8 +37,8 @@
   // CHECK: %[[F0:.*]] = constant 0.000000e+00 : f32
   %f0 = constant 0.0 : f32
-  // CHECK: linalg.fill(%[[I2]], %[[F0]]) : memref, f32
-  %r = linalg.fill(%A, %f0) : tensor, f32 -> tensor
+  // CHECK: linalg.fill(%[[F0]], %[[I2]]) : f32, memref
+  %r = linalg.fill(%f0, %A) : f32, tensor -> tensor
   // CHECK: dealloc %[[ALLOC]] : memref
   // CHECK: %[[R:.*]] = memref.tensor_load %[[I2]] : memref
@@ -58,8 +58,8 @@
   /// Cross-op multiple uses of %A, the first op which has interfering reads must alloc.
   // CHECK: %[[ALLOC:.*]] = memref.alloc
   // CHECK: %[[CAST:.*]] = memref.cast %[[ALLOC]]
-  // CHECK: linalg.fill(%[[CAST]]
-  %f = linalg.fill(%A, %f0) : tensor, f32 -> tensor
+  // CHECK: linalg.fill({{.*}}, %[[CAST]]
+  %f = linalg.fill(%f0, %A) : f32, tensor -> tensor
   /// The second op has no interfering reads and can reuse.
   // CHECK-NOT: alloc
@@ -175,8 +175,8 @@
   %r0 = tensor.insert_slice %t into %A[0][4][1] : tensor<4xf32> into tensor

   /// Overwrite BUFFER_CAST_A inplace.
-  // CHECK: linalg.fill(%[[BUFFER_CAST_A]]
-  %r1 = linalg.fill(%r0, %f0) : tensor, f32 -> tensor
+  // CHECK: linalg.fill({{.*}}, %[[BUFFER_CAST_A]]
+  %r1 = linalg.fill(%f0, %r0) : f32, tensor -> tensor
   return %r1: tensor
 }
@@ -191,8 +191,8 @@
   // CHECK: %[[BUFFER_CAST_A:.*]] = memref.buffer_cast {{.*}} : memref
-  // CHECK: linalg.fill(%[[BUFFER_CAST_A]]
-  %r0 = linalg.fill(%A, %f0) : tensor, f32 -> tensor
+  // CHECK: linalg.fill({{.*}}, %[[BUFFER_CAST_A]]
+  %r0 = linalg.fill(%f0, %A) : f32, tensor -> tensor
   // CHECK-NOT: alloc
   // CHECK: %[[SV:.*]] = memref.subview %[[BUFFER_CAST_A]]
@@ -241,9 +241,9 @@
   // So we need to bufferize it out of place and make a new alloc.
   // CHECK-DAG: %[[ALLOC:.*]] = memref.alloc({{.*}}) : memref
   // CHECK-DAG: %[[ALLOC_CAST_DYNAMIC:.*]] = memref.cast %[[ALLOC]] : memref to memref
-  %r1 = linalg.fill(%A, %f0) : tensor, f32 -> tensor
+  %r1 = linalg.fill(%f0, %A) : f32, tensor -> tensor
   // CHECK-DAG: %[[RET_A:.*]] = memref.tensor_load %[[BUFFER_CAST_A]] : memref) -> tensor<1xf32> {
   %cst = constant 0.0 : f32
   %init = linalg.init_tensor [1] : tensor<1xf32>
-  %fill = linalg.fill(%init, %cst) : tensor<1xf32>, f32 -> tensor<1xf32>
+  %fill = linalg.fill(%cst, %init) : f32, tensor<1xf32> -> tensor<1xf32>
   %add = linalg.generic {
       indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>],
       iterator_types = ["parallel", "reduction"]}
@@ -287,7 +287,7 @@
 // CHECK: %[[INPUT_RESHAPE:.+]] = linalg.tensor_collapse_shape %{{.+}} {{\[}}[0, 1]] : tensor<1x1000xf32> into tensor<1000xf32>
 // CHECK: %[[INIT:.+]] = linalg.init_tensor [] : tensor
-// CHECK: %[[FILL:.+]] = linalg.fill(%[[INIT]], %cst) : tensor, f32 -> tensor
+// CHECK: %[[FILL:.+]] = linalg.fill(%cst, %[[INIT]]) : f32, tensor -> tensor
 // CHECK: %[[GENERIC:.+]] = linalg.generic
 // CHECK-SAME: indexing_maps = [#[[MAP1]], #[[MAP2]]]
 // CHECK-SAME: iterator_types = ["reduction"]
@@ -331,7 +331,7 @@
   %c3 = constant 3 : index
   %0 = memref.dim %arg0, %c3 : tensor<1x?x1x?xf32>
   %1 = linalg.init_tensor [1, %0] : tensor<1x?xf32>
-  %2 = linalg.fill(%1, %cst) : tensor<1x?xf32>, f32 -> tensor<1x?xf32>
+  %2 = linalg.fill(%cst, %1) : f32, tensor<1x?xf32> -> tensor<1x?xf32>
   %3 = linalg.generic {
       indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>],
@@ -350,7 +350,7 @@
 // CHECK-SAME: %[[ARG0:.+]]: tensor<1x?x1x?xf32>
 // CHECK-DAG: %[[RESHAPE:.+]] = linalg.tensor_collapse_shape %[[ARG0]] {{\[}}[0, 1, 2], [3]]
 // CHECK: %[[INIT:.+]] = linalg.init_tensor [%{{.+}}] : tensor
-// CHECK: %[[FILL:.+]] = linalg.fill(%[[INIT]], %{{.+}})
+// CHECK: %[[FILL:.+]] = linalg.fill(%{{.+}}, %[[INIT]])
 // CHECK: %[[RESULT:.+]] = linalg.generic
 // CHECK-SAME: indexing_maps = [#[[MAP2]], #[[MAP3]]]
 // CHECK-SAME: iterator_types = ["parallel", "reduction"]
@@ -365,7 +365,7 @@
   %cst = constant 1.000000e+00 : f32
   %c3 = constant 3 : index
   %1 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
-  %2 = linalg.fill(%1, %cst) : tensor<1x1xf32>, f32 -> tensor<1x1xf32>
+  %2 = linalg.fill(%cst, %1) : f32, tensor<1x1xf32> -> tensor<1x1xf32>
   %3 = linalg.generic {
       indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>],
@@ -384,7 +384,7 @@
 // CHECK-SAME: %[[ARG0:.+]]: tensor<1x?x1x1xf32>
 // CHECK-DAG: %[[RESHAPE:.+]] = linalg.tensor_collapse_shape %[[ARG0]] {{\[}}[0, 1, 2], [3]]
 // CHECK: %[[INIT:.+]] = linalg.init_tensor [1] : tensor<1xf32>
-// CHECK: %[[FILL:.+]] = linalg.fill(%[[INIT]], %{{.+}})
+// CHECK: %[[FILL:.+]] = linalg.fill(%{{.+}}, %[[INIT]])
 // CHECK: %[[RESULT:.+]] = linalg.generic
 // CHECK-SAME: indexing_maps = [#[[MAP2]], #[[MAP3]]]
 // CHECK-SAME: iterator_types = ["parallel", "reduction"]
@@ -400,7 +400,7 @@
   %c2 = constant 2 : index
   %0 = memref.dim %arg0, %c2 : tensor
   %1 = linalg.init_tensor [%0, 1] : tensor
-  %2 = linalg.fill(%1, %cst) : tensor, f32 -> tensor
+  %2 = linalg.fill(%cst, %1) : f32, tensor -> tensor
   %3 = linalg.generic {
       indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>],
@@ -419,7 +419,7 @@
 // CHECK-SAME: %[[ARG0:.+]]: tensor
 // CHECK-DAG: %[[RESHAPE:.+]] = linalg.tensor_collapse_shape %[[ARG0]] {{\[}}[0, 1], [2, 3]]
 // CHECK: %[[INIT:.+]] = linalg.init_tensor [%{{.+}}] : tensor
-// CHECK: %[[FILL:.+]] = linalg.fill(%[[INIT]], %{{.+}})
+// CHECK: %[[FILL:.+]] = linalg.fill(%{{.+}}, %[[INIT]])
 // CHECK: %[[RESULT:.+]] = linalg.generic
 // CHECK-SAME: indexing_maps = [#[[MAP2]], #[[MAP3]]]
 // CHECK-SAME: iterator_types = ["parallel", "reduction"]
diff --git a/mlir/test/Dialect/Linalg/forward-vector-transfers.mlir b/mlir/test/Dialect/Linalg/forward-vector-transfers.mlir
--- a/mlir/test/Dialect/Linalg/forward-vector-transfers.mlir
+++ b/mlir/test/Dialect/Linalg/forward-vector-transfers.mlir
@@ -29,7 +29,7 @@
   %c0 = constant 0: index
   %f0 = constant 0.0: f32
   %alloc = memref.alloc() : memref<32 x f32>
-  linalg.fill(%alloc, %f0): memref<32 x f32>, f32
+  linalg.fill(%f0, %alloc) : f32, memref<32 x f32>
   %subview = memref.subview %alloc[0][16][1] : memref<32 x f32> to memref<16 x f32>
   linalg.copy(%in, %subview): memref, memref<16 x f32>
   %0 = vector.transfer_read %alloc[%c0], %f0 {in_bounds = [true]} : memref<32 x f32>, vector<32 x f32>
@@ -69,7 +69,7 @@
   %alloc = memref.alloc() : memref<128 x i8>
   %view = memref.view %alloc[%c0][] : memref<128 x i8> to memref<32 x f32>
   %subview = memref.subview %view[0][16][1] : memref<32 x f32> to memref<16 x f32>
-  linalg.fill(%view, %f0): memref<32 x f32>, f32
+  linalg.fill(%f0, %view) : f32, memref<32 x f32>
   linalg.copy(%in, %subview): memref, memref<16 x f32>
   %0 = vector.transfer_read %view[%c0], %f0 {in_bounds = [true]} : memref<32 x f32>, vector<32 x f32>
   memref.dealloc %alloc : memref<128 x i8>
@@ -129,7 +129,7 @@
   %f0 = constant 0.0: f32
   %f1 = constant 1.0: f32
   %alloc = memref.alloc() : memref<32 x f32>
-  linalg.fill(%alloc, %f0): memref<32 x f32>, f32
+  linalg.fill(%f0, %alloc) : f32, memref<32 x f32>
   %subview = memref.subview %alloc[0][16][1] : memref<32 x f32> to memref<16 x f32>
   linalg.copy(%in, %subview): memref, memref<16 x f32>
   "some_interleaved_use"(%subview) : (memref<16 x f32>) -> ()
diff --git a/mlir/test/Dialect/Linalg/fusion-pattern.mlir b/mlir/test/Dialect/Linalg/fusion-pattern.mlir
--- a/mlir/test/Dialect/Linalg/fusion-pattern.mlir
+++ b/mlir/test/Dialect/Linalg/fusion-pattern.mlir
@@ -4,7 +4,7 @@
 func @basic_fusion(%arg0: memref, %arg1: memref, %arg2: memref) {
   %cst = constant 0.000000e+00 : f32
-  linalg.fill(%arg2, %cst) : memref, f32
+  linalg.fill(%cst, %arg2) : f32, memref
   linalg.matmul {__internal_linalg_transform__ = "basic_fusion"}
     ins(%arg0, %arg1 : memref, memref)
     outs(%arg2 : memref)
@@ -28,7 +28,7 @@
 // CHECK-DAG: %[[C64:.+]] = constant 64 : index
 // CHECK-DAG: %[[C16:.+]] = constant 16 : index
 // CHECK-DAG: %[[CST:.+]] = constant 0.0{{.*}} : f32
-// CHECK-DAG: linalg.fill(%[[ARG2]], %[[CST]])
+// CHECK-DAG: linalg.fill(%[[CST]], %[[ARG2]])
 // CHECK-SAME: __internal_linalg_transform__ = "after_basic_fusion_original"
 // CHECK-DAG: %[[M:.+]] = memref.dim %[[ARG0]], %[[C0]]
 // CHECK-DAG: %[[N:.+]] = memref.dim %[[ARG1]], %[[C1]]
@@ -53,7 +53,7 @@
 // CHECK: %[[TILE_N_3:.+]] = affine.min #[[MAP5]](%[[IV1]])[%[[N_2]], %[[N]]]
 // CHECK: %[[SV3_2:.+]] = memref.subview %[[ARG2]][%[[IV0]], %[[IV1]]]
 // CHECK-SAME: [%[[TILE_M_3]], %[[TILE_N_3]]]
-// CHECK: linalg.fill(%[[SV3_2]], %[[CST]])
+// CHECK: linalg.fill(%[[CST]], %[[SV3_2]])
 // CHECK-SAME: __internal_linalg_transform__ = "after_basic_fusion_producer"
 // CHECK: scf.for %[[IV2:.+]] = %[[C0]] to %[[K]] step %[[C16]] {
 // CHECK: %[[TILE_K:.+]] = affine.min #[[MAP3]](%[[IV2]])[%[[K]]]
@@ -79,7 +79,7 @@
     %arg2: memref, %arg3: memref) {
   %cst = constant 0.000000e+00 : f32
   linalg.copy(%arg1, %arg2) : memref, memref
-  linalg.fill(%arg3, %cst) : memref, f32
+  linalg.fill(%cst, %arg3) : f32, memref
   linalg.matmul {__internal_linalg_transform__ = "rhs_fusion"}
     ins(%arg0, %arg2 : memref, memref)
     outs(%arg3 : memref)
@@ -161,7 +161,7 @@
     %arg2: memref, %arg3: memref) {
   %cst = constant 0.000000e+00 : f32
   linalg.copy(%arg0, %arg1) : memref, memref
-  linalg.fill(%arg3, %cst) : memref, f32
+  linalg.fill(%cst, %arg3) : f32, memref
   linalg.matmul {__internal_linalg_transform__ = "two_operand_fusion"}
     ins(%arg1, %arg2 : memref, memref)
     outs(%arg3 : memref)
@@ -186,7 +186,7 @@
 // CHECK-DAG: %[[CST:.+]] = constant 0.0{{.*}} : f32
 // CHECK: linalg.copy(%[[ARG0]], %[[ARG1]])
 // CHECK-SAME: __internal_linalg_transform__ = "after_two_operand_fusion_original"
-// CHECK: linalg.fill(%[[ARG3]], %[[CST]])
+// CHECK: linalg.fill(%[[CST]], %[[ARG3]])
 // CHECK-SAME: __internal_linalg_transform__ = "after_two_operand_fusion_original"
 // CHECK-DAG: %[[M:.+]] = memref.dim %[[ARG1]], %[[C0]]
 // CHECK: scf.parallel (%[[IV0:.+]]) =
@@ -213,7 +213,7 @@
 // CHECK-SAME: [%[[TILE_M_5]], %[[K]]]
 // CHECK: linalg.copy(%[[SV3]], %[[SV3_2]])
 // CHECK-SAME: __internal_linalg_transform__ = "after_two_operand_fusion_producer"
-// CHECK: linalg.fill(%[[SV2_2]], %[[CST]])
+// CHECK: linalg.fill(%[[CST]], %[[SV2_2]])
 // CHECK-SAME: __internal_linalg_transform__ = "after_two_operand_fusion_producer"
 // CHECK-DAG: %[[N_2:.+]] = memref.dim %[[ARG2]], %[[C1]]
 // CHECK: scf.parallel (%[[IV1:.+]]) =
@@ -428,7 +428,7 @@
   %c64 = constant 64 : index
   %c16 = constant 16 : index
   %cst = constant 0.000000e+00 : f32
-  linalg.fill(%arg2, %cst) : memref, f32
+  linalg.fill(%cst, %arg2) : f32, memref
   %0 = memref.dim %arg0, %c0 : memref
   %1 = memref.dim %arg1, %c1 : memref
   %2 = memref.dim %arg0, %c1 : memref
@@ -463,7 +463,7 @@
 func @basic_conv_fusion(%arg0: memref, %arg1: memref, %arg2: memref) {
   %cst = constant 0.000000e+00 : f32
-  linalg.fill(%arg2, %cst) : memref, f32
+  linalg.fill(%cst, %arg2) : f32, memref
   linalg.conv(%arg0, %arg1, %arg2) {
     dilations = [1, 1], strides = [1, 1],
     __internal_linalg_transform__ = "basic_fusion"} :
diff --git a/mlir/test/Dialect/Linalg/fusion-sequence.mlir b/mlir/test/Dialect/Linalg/fusion-sequence.mlir
--- a/mlir/test/Dialect/Linalg/fusion-sequence.mlir
+++ b/mlir/test/Dialect/Linalg/fusion-sequence.mlir
@@ -9,7 +9,7 @@
   %d0 = memref.dim %arg0, %c0 : memref
   %d1 = memref.dim %arg1, %c1 : memref
   %0 = memref.alloc(%d0, %d1) : memref
-  linalg.fill(%0, %cst) : memref, f32
+  linalg.fill(%cst, %0) : f32, memref
   linalg.matmul ins(%arg0, %arg1 : memref, memref)
     outs(%0 : memref)
   linalg.generic
@@ -42,7 +42,7 @@
 // CHECK-DAG: %[[SV_ARG0:.+]] = memref.subview %[[ARG0]][%[[IV0]], 0]
 // CHECK-DAG: %[[SV_ARG1:.+]] = memref.subview %[[ARG1]][0, %[[IV1]]]
 // CHECK: %[[SV_TEMP_2:.+]] = memref.subview %[[TEMP]][%[[IV0]], %[[IV1]]]
-// CHECK: linalg.fill(%[[SV_TEMP_2]], %{{.+}})
+// CHECK: linalg.fill(%{{.+}}, %[[SV_TEMP_2]])
 // CHECK: linalg.matmul
 // CHECK-SAME: ins(%[[SV_ARG0]], %[[SV_ARG1]]
 // CHECK-SAME: : memref, memref)
@@ -69,13 +69,13 @@
   %n3 = memref.dim %arg3, %c1 : memref
   %0 = memref.alloc(%m, %n1) : memref
   %1 = memref.alloc(%m, %n2) : memref
-  linalg.fill(%0, %cst) : memref, f32
+  linalg.fill(%cst, %0) : f32, memref
   linalg.matmul ins(%arg0, %arg1 : memref, memref)
     outs(%0 : memref)
-  linalg.fill(%1, %cst) : memref, f32
+  linalg.fill(%cst, %1) : f32, memref
   linalg.matmul ins(%0, %arg2 : memref, memref)
     outs(%1 : memref)
-  linalg.fill(%arg4, %cst) : memref, f32
+  linalg.fill(%cst, %arg4) : f32, memref
   linalg.matmul ins(%1, %arg3 : memref, memref)
     outs(%arg4 : memref)
   return
@@ -124,15 +124,15 @@
 // CHECK: %[[N0:.+]] = memref.dim %[[ARG0]], %[[C1]]
 // CHECK: %[[SV_ARG0:.+]] = memref.subview %[[ARG0]][%[[IV0]], 0]
 // CHECK-SAME: [%[[TILE_M_5]], %[[N0]]]
-// CHECK: linalg.fill(%[[SV_ALLOC1]], %{{.+}})
+// CHECK: linalg.fill(%{{.+}}, %[[SV_ALLOC1]])
 // CHECK: linalg.matmul ins(%[[SV_ARG0]], %[[ARG1]]
 // CHECK-SAME: : memref, memref)
 // CHECK-SAME: outs(%[[SV_ALLOC1]] : memref)
-// CHECK: linalg.fill(%[[SV_ALLOC2]], %{{.+}})
+// CHECK: linalg.fill(%{{.+}}, %[[SV_ALLOC2]])
 // CHECK: linalg.matmul ins(%[[SV_ALLOC1]], %[[ARG2]]
 // CHECK-SAME: : memref, memref)
 // CHECK-SAME: outs(%[[SV_ALLOC2]] : memref)
-// CHECK: linalg.fill(%[[SV_ARG4_2]], %{{.+}})
+// CHECK: linalg.fill(%{{.+}}, %[[SV_ARG4_2]])
 // CHECK: linalg.matmul ins(%[[SV_ALLOC3]], %[[ARG3]]
 // CHECK-SAME: : memref, memref)
 // CHECK-SAME: outs(%[[SV_ARG4]] : memref)
diff --git a/mlir/test/Dialect/Linalg/fusion-tensor-pattern.mlir b/mlir/test/Dialect/Linalg/fusion-tensor-pattern.mlir
--- a/mlir/test/Dialect/Linalg/fusion-tensor-pattern.mlir
+++ b/mlir/test/Dialect/Linalg/fusion-tensor-pattern.mlir
@@ -251,7 +251,7 @@
 func @matmul_out_fusion(%arg0: tensor, %arg1: tensor,
                         %arg2: tensor) -> tensor {
   %c0 = constant 0.0 : f32
-  %0 = linalg.fill(%arg0, %c0) : tensor, f32 -> tensor
+  %0 = linalg.fill(%c0, %arg0) : f32, tensor -> tensor
   %1 = linalg.matmul {__internal_linalg_transform__ = "out_fusion"}
     ins(%arg1, %arg2 : tensor, tensor)
     outs(%0 : tensor) -> tensor
@@ -268,7 +268,7 @@
 // CHECK: scf.for %[[I:.*]]{{.*}}iter_args(%{{.*}} = %[[ARG0]]) -> (tensor) {
 // CHECK: scf.for %[[J:.*]]
 // CHECK: %[[ST:.*]] = tensor.extract_slice %[[ARG0]]
-// CHECK: %[[ST_FILL:.*]] = linalg.fill(%[[ST]], %[[C0]]) {__internal_linalg_transform__ = "after_out_fusion_producer"} : tensor, f32 -> tensor
+// CHECK: %[[ST_FILL:.*]] = linalg.fill(%[[C0]], %[[ST]]) {__internal_linalg_transform__ = "after_out_fusion_producer"} : f32, tensor -> tensor
 // CHECK: %[[ST_MM_RES:.*]] = scf.for %[[K:.*]]{{.*}}iter_args(%[[BB:.*]] = %[[ST_FILL]]) -> (tensor) {
 // CHECK-NOT: fill
 // CHECK: %[[ST_MM:.*]] = linalg.matmul {__internal_linalg_transform__ = "after_out_fusion"} ins(%{{.*}}, %{{.*}} : tensor, tensor) outs(%[[BB]] : tensor) -> tensor
@@ -304,7 +304,7 @@
 // TLOOP: %[[A_SUB:.*]] = tensor.extract_slice %[[A_]][%[[I]], 0]
 // TLOOP: %[[B_SUB:.*]] = tensor.extract_slice %[[B_]][0, %[[J]]]
 // TLOOP: %[[OUT_SUB:.*]] = tensor.extract_slice %[[OUT_]][%[[I]], %[[J]]]
-// TLOOP: %[[INIT_SUB:.*]] = linalg.fill(%[[OUT_SUB]], %[[C0_F32_]])
+// TLOOP: %[[INIT_SUB:.*]] = linalg.fill(%[[C0_F32_]], %[[OUT_SUB]])
 // TLOOP: %[[AB_SUB:.*]] = linalg.tiled_loop (%[[K:.*]]) = (%[[C0]])
 // TLOOP-SAME: to (%[[DIM_A__1]]) step (%[[C16]])
diff --git a/mlir/test/Dialect/Linalg/fusion.mlir b/mlir/test/Dialect/Linalg/fusion.mlir
--- a/mlir/test/Dialect/Linalg/fusion.mlir
+++ b/mlir/test/Dialect/Linalg/fusion.mlir
@@ -678,7 +678,7 @@
 func @fill_and_conv(%arg0: memref, %arg1: memref<2x3x1x1xf32>, %arg2: memref) {
   %cst = constant 0.000000e+00 : f32
-  linalg.fill(%arg2, %cst) : memref, f32
+  linalg.fill(%cst, %arg2) : f32, memref
   %c4 = constant 4 : index
   %c1 = constant 1 : index
diff --git a/mlir/test/Dialect/Linalg/generalize-named-ops.mlir b/mlir/test/Dialect/Linalg/generalize-named-ops.mlir
--- a/mlir/test/Dialect/Linalg/generalize-named-ops.mlir
+++ b/mlir/test/Dialect/Linalg/generalize-named-ops.mlir
@@ -472,7 +472,7 @@
 // -----

 func @generalize_fill(%output: memref, %value : f32) {
-  linalg.fill(%output, %value) : memref, f32
+  linalg.fill(%value, %output) : f32, memref
   return
 }
diff --git a/mlir/test/Dialect/Linalg/invalid.mlir b/mlir/test/Dialect/Linalg/invalid.mlir
--- a/mlir/test/Dialect/Linalg/invalid.mlir
+++ b/mlir/test/Dialect/Linalg/invalid.mlir
@@ -641,7 +641,7 @@
 {
   %0 = linalg.init_tensor [%arg0, %arg1] : tensor
   // expected-error @+1 {{expected fill op with no result value to use memref type}}
-  linalg.fill(%0, %arg2) : tensor, f32
+  linalg.fill(%arg2, %0) : f32, tensor
 }

 // -----
@@ -649,7 +649,7 @@
 func @illegal_fill_memref_with_return(%arg0 : memref, %arg1 : f32) -> memref
 {
   // expected-error @+1 {{unexpected #results > #outputs}}
-  %0 = linalg.fill(%arg0, %arg1) : memref, f32 -> memref
+  %0 = linalg.fill(%arg1, %arg0) : f32, memref -> memref
   return %0 : memref
 }

 // -----
@@ -659,7 +659,7 @@
     (%arg0 : memref, %arg1 : f32) -> tensor
 {
   // expected-error @+1 {{unexpected #results > #outputs}}
-  %0 = linalg.fill(%arg0, %arg1) : memref, f32 -> tensor
+  %0 = linalg.fill(%arg1, %arg0) : f32, memref -> tensor
   return %0 : tensor
 }

 // -----
@@ -669,7 +669,7 @@
     (%arg0 : tensor, %arg1 : f32) -> memref
 {
   // expected-error @+1 {{expected type of operand #1 ('tensor') to match type of corresponding result ('memref')}}
-  %0 = linalg.fill(%arg0, %arg1) : tensor, f32 -> memref
+  %0 = linalg.fill(%arg1, %arg0) : f32, tensor -> memref
   return %0 : memref
 }
diff --git a/mlir/test/Dialect/Linalg/loops.mlir b/mlir/test/Dialect/Linalg/loops.mlir
--- a/mlir/test/Dialect/Linalg/loops.mlir
+++ b/mlir/test/Dialect/Linalg/loops.mlir
@@ -182,7 +182,7 @@
 // CHECKPARALLEL: store %[[res]], %{{.*}}[] : memref

 func @fill_view(%arg0: memref, %arg1: f32) {
-  linalg.fill(%arg0, %arg1) : memref, f32
+  linalg.fill(%arg1, %arg0) : f32, memref
   return
 }
 // CHECK-LABEL: func @fill_view(
@@ -196,7 +196,7 @@
 // CHECKPARALLEL: store %{{.*}}, %{{.*}}[%{{.*}}] : memref

 func @fill_view0(%arg0: memref, %arg1: f32) {
-  linalg.fill(%arg0, %arg1) : memref, f32
+  linalg.fill(%arg1, %arg0) : f32, memref
   return
 }
 // CHECK-LABEL: func @fill_view0(%{{.*}}: memref, %{{.*}}: f32) {
@@ -206,7 +206,7 @@
 // CHECKPARALLEL: store %{{.*}}, %{{.*}}[] : memref

 func @fill_view3(%arg0: memref, %arg1: f32) {
-  linalg.fill(%arg0, %arg1) : memref, f32
+  linalg.fill(%arg1, %arg0) : f32, memref
   return
 }
 // CHECK-LABEL: func @fill_view3(
diff --git a/mlir/test/Dialect/Linalg/named-ops.mlir b/mlir/test/Dialect/Linalg/named-ops.mlir
--- a/mlir/test/Dialect/Linalg/named-ops.mlir
+++ b/mlir/test/Dialect/Linalg/named-ops.mlir
@@ -4,7 +4,7 @@
 func @depthwise_conv_2d_input_nhwc_filter_hwcf_tensor(%input: tensor<2x4x5x2xf32>, %filter: tensor<2x2x2x3xf32>) -> tensor<2x3x4x2x3xf32> {
   %zero = constant 0.000000e+00 : f32
   %init = linalg.init_tensor [2, 3, 4, 2, 3] : tensor<2x3x4x2x3xf32>
-  %fill = linalg.fill(%init, %zero) : tensor<2x3x4x2x3xf32>, f32 -> tensor<2x3x4x2x3xf32>
+  %fill = linalg.fill(%zero, %init) : f32, tensor<2x3x4x2x3xf32> -> tensor<2x3x4x2x3xf32>
   // CHECK: %{{.+}} = linalg.depthwise_conv_2d_input_nhwc_filter_hwcf
   // CHECK-SAME: {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
   // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<2x4x5x2xf32>, tensor<2x2x2x3xf32>)
@@ -57,7 +57,7 @@
 func @depthwise_conv_2d_input_nhwc_filter_hwcf_tensor_dilated(%input: tensor<2x8x9x2xf32>, %filter: tensor<2x2x2x3xf32>) -> tensor<2x6x7x2x3xf32> {
   %zero = constant 0.000000e+00 : f32
   %init = linalg.init_tensor [2, 6, 7, 2, 3] : tensor<2x6x7x2x3xf32>
-  %fill = linalg.fill(%init, %zero) : tensor<2x6x7x2x3xf32>, f32 -> tensor<2x6x7x2x3xf32>
+  %fill = linalg.fill(%zero, %init) : f32, tensor<2x6x7x2x3xf32> -> tensor<2x6x7x2x3xf32>
   // CHECK: %{{.+}} = linalg.depthwise_conv_2d_input_nhwc_filter_hwcf
   // CHECK-SAME: {dilations = dense<2> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
   // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<2x8x9x2xf32>, tensor<2x2x2x3xf32>)
@@ -326,7 +326,7 @@
   %fake = linalg.init_tensor [3, 3] : tensor<3x3xf32>
   %init = linalg.init_tensor [1, 2, 2, 1] : tensor<1x2x2x1xf32>
   %cst = constant 0.000000e+00 : f32
-  %fill = linalg.fill(%init, %cst) : tensor<1x2x2x1xf32>, f32 -> tensor<1x2x2x1xf32>
+  %fill = linalg.fill(%cst, %init) : f32, tensor<1x2x2x1xf32> -> tensor<1x2x2x1xf32>
   %res = linalg.pooling_nhwc_sum {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
     ins(%input, %fake: tensor<1x4x4x1xf32>, tensor<3x3xf32>)
     outs(%fill: tensor<1x2x2x1xf32>) -> tensor<1x2x2x1xf32>
@@ -360,7 +360,7 @@
   %fake = linalg.init_tensor [3, 3] : tensor<3x3xf32>
   %init = linalg.init_tensor [1, 2, 2, 1] : tensor<1x2x2x1xf32>
   %cst = constant 0.000000e+00 : f32
-  %fill = linalg.fill(%init, %cst) : tensor<1x2x2x1xf32>, f32 -> tensor<1x2x2x1xf32>
+  %fill = linalg.fill(%cst, %init) : f32, tensor<1x2x2x1xf32> -> tensor<1x2x2x1xf32>
   %res = linalg.pooling_nhwc_max {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
     ins(%input, %fake: tensor<1x4x4x1xf32>, tensor<3x3xf32>)
     outs(%fill: tensor<1x2x2x1xf32>) -> tensor<1x2x2x1xf32>
@@ -394,7 +394,7 @@
   %fake = linalg.init_tensor [3, 3] : tensor<3x3xi8>
   %init = linalg.init_tensor [1, 2, 2, 1] : tensor<1x2x2x1xi8>
   %cst = constant 0 : i8
-  %fill = linalg.fill(%init, %cst) : tensor<1x2x2x1xi8>, i8 -> tensor<1x2x2x1xi8>
+  %fill = linalg.fill(%cst, %init) : i8, tensor<1x2x2x1xi8> -> tensor<1x2x2x1xi8>
   %res = linalg.pooling_nhwc_i8_max {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
     ins(%input, %fake: tensor<1x4x4x1xi8>, tensor<3x3xi8>)
     outs(%fill: tensor<1x2x2x1xi8>) -> tensor<1x2x2x1xi8>
@@ -428,7 +428,7 @@
   %fake = linalg.init_tensor [3, 3] : tensor<3x3xi16>
   %init = linalg.init_tensor [1, 2, 2, 1] : tensor<1x2x2x1xi16>
   %cst = constant 0 : i16
-  %fill = linalg.fill(%init, %cst) : tensor<1x2x2x1xi16>, i16 -> tensor<1x2x2x1xi16>
+  %fill = linalg.fill(%cst, %init) : i16, tensor<1x2x2x1xi16> -> tensor<1x2x2x1xi16>
   %res = linalg.pooling_nhwc_i16_max {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
     ins(%input, %fake: tensor<1x4x4x1xi16>, tensor<3x3xi16>)
     outs(%fill: tensor<1x2x2x1xi16>) -> tensor<1x2x2x1xi16>
@@ -462,7 +462,7 @@
   %fake = linalg.init_tensor [3, 3] : tensor<3x3xi32>
   %init = linalg.init_tensor [1, 2, 2, 1] : tensor<1x2x2x1xi32>
   %cst = constant 0 : i32
-  %fill = linalg.fill(%init, %cst) : tensor<1x2x2x1xi32>, i32 -> tensor<1x2x2x1xi32>
+  %fill = linalg.fill(%cst, %init) : i32, tensor<1x2x2x1xi32> -> tensor<1x2x2x1xi32>
   %res = linalg.pooling_nhwc_i32_max {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
     ins(%input, %fake: tensor<1x4x4x1xi32>, tensor<3x3xi32>)
     outs(%fill: tensor<1x2x2x1xi32>) -> tensor<1x2x2x1xi32>
@@ -497,7 +497,7 @@
   %fake = linalg.init_tensor [3, 3] : tensor<3x3xf32>
   %init = linalg.init_tensor [1, 2, 2, 1] : tensor<1x2x2x1xf32>
   %cst = constant 0.000000e+00 : f32
-  %fill = linalg.fill(%init, %cst) : tensor<1x2x2x1xf32>, f32 -> tensor<1x2x2x1xf32>
+  %fill = linalg.fill(%cst, %init) : f32, tensor<1x2x2x1xf32> -> tensor<1x2x2x1xf32>
   %res = linalg.pooling_nhwc_min {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
     ins(%input, %fake: tensor<1x4x4x1xf32>, tensor<3x3xf32>)
     outs(%fill: tensor<1x2x2x1xf32>) -> tensor<1x2x2x1xf32>
diff --git a/mlir/test/Dialect/Linalg/promotion_options.mlir b/mlir/test/Dialect/Linalg/promotion_options.mlir
--- a/mlir/test/Dialect/Linalg/promotion_options.mlir
+++ b/mlir/test/Dialect/Linalg/promotion_options.mlir
@@ -23,9 +23,9 @@
 // CHECK: %[[T19:.+]] = memref.subview %[[T18]]
 // CHECK: %[[T20:.+]] = memref.alloc(%{{.*}}, %{{.*}}) : memref
 // CHECK: %[[T21:.+]] = memref.subview %[[T20]]
-// CHECK: linalg.fill(%[[T19]], %[[C42]])
+// CHECK: linalg.fill(%[[C42]], %[[T19]])
 // CHECK: linalg.copy(%[[T7]], %[[T19]])
-// CHECK: linalg.fill(%[[T21]], %[[C42]])
+// CHECK: linalg.fill(%[[C42]], %[[T21]])
 // CHECK: linalg.copy(%[[T17]], %[[T21]])
 // CHECK: linalg.matmul ins(%[[T19]], %[[T12]]{{.*}} outs(%[[T21]]
 // CHECK-NOT: linalg.fill
diff --git a/mlir/test/Dialect/Linalg/roundtrip.mlir b/mlir/test/Dialect/Linalg/roundtrip.mlir
--- a/mlir/test/Dialect/Linalg/roundtrip.mlir
+++ b/mlir/test/Dialect/Linalg/roundtrip.mlir
@@ -153,12 +153,12 @@

 func @fill_view(%arg0: memref, %arg1: f32) {
-  linalg.fill(%arg0, %arg1) : memref, f32
+  linalg.fill(%arg1, %arg0) : f32, memref
   return
 }
 // CHECK-LABEL: func @fill_view(
 // CHECK: %{{.*}}: memref, %{{.*}}: f32) {
-// CHECK: linalg.fill(%{{.*}}, %{{.*}}) : memref, f32
+// CHECK: linalg.fill(%{{.*}}, %{{.*}}) : f32, memref

 // -----
@@ -174,12 +174,12 @@

 func @fill_view3(%arg0: memref, %arg1: f32) {
-  linalg.fill(%arg0, %arg1) : memref, f32
+  linalg.fill(%arg1, %arg0) : f32, memref
   return
 }
 // CHECK-LABEL: func @fill_view3(
 // CHECK: %{{.*}}: memref, %{{.*}}: f32) {
-// CHECK: linalg.fill(%{{.*}}, %{{.*}}) : memref, f32
+// CHECK: linalg.fill(%{{.*}}, %{{.*}}) : f32, memref

 // -----
@@ -429,9 +429,9 @@
     -> (tensor, tensor)
 {
   %c0 = constant 0 : index
   %0 = linalg.init_tensor [] : tensor
-  %1 = linalg.fill(%0, %arg2) : tensor, i32 -> tensor
+  %1 = linalg.fill(%arg2, %0) : i32, tensor -> tensor
   %2 = linalg.init_tensor [] : tensor
-  %3 = linalg.fill(%2, %arg2) : tensor, i32 -> tensor
+  %3 = linalg.fill(%arg2, %2) : i32, tensor -> tensor
   %4:2 = linalg.generic {
     indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>,
                      affine_map<(d0) -> ()>, affine_map<(d0) -> ()>],
    iterator_types = ["reduction"]}
@@ -704,10 +704,10 @@

 func @fill_tensor(%arg0 : index, %arg1 : index, %arg2 : f32) -> tensor {
   %0 = linalg.init_tensor [%arg0, %arg1] : tensor
-  %1 = linalg.fill(%0, %arg2) : tensor, f32 -> tensor
+  %1 = linalg.fill(%arg2, %0) : f32, tensor -> tensor
   return %1 : tensor
 }
-// CHECK: %{{.+}} = linalg.fill(%{{.+}}, %{{.+}}) : tensor, f32 -> tensor
+// CHECK: %{{.+}} = linalg.fill(%{{.+}}, %{{.+}}) : f32, tensor -> tensor

 // -----
diff --git a/mlir/test/Dialect/Linalg/tile-and-fuse-tensors.mlir b/mlir/test/Dialect/Linalg/tile-and-fuse-tensors.mlir
--- a/mlir/test/Dialect/Linalg/tile-and-fuse-tensors.mlir
+++ b/mlir/test/Dialect/Linalg/tile-and-fuse-tensors.mlir
@@ -77,7 +77,7 @@
   %cst = constant 0.0 : f32
   %init = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
-  %fill = linalg.fill(%init, %cst) : tensor<1x112x112x32xf32>, f32 -> tensor<1x112x112x32xf32>
+  %fill = linalg.fill(%cst, %init) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
   %conv = linalg.conv_2d_input_nhwc_filter_hwcf
     {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
@@ -121,7 +121,7 @@
 // CHECK-SAME: (%[[INPUT:.+]]: tensor<1x225x225x3xf32>, %[[FILTER:.+]]: tensor<3x3x3x32xf32>, %[[ELEM:.+]]: tensor<1x112x112x32xf32>)
 // CHECK:      %[[INIT:.+]] = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
-// CHECK-NEXT: %[[FILL:.+]] = linalg.fill(%[[INIT]], %cst) : tensor<1x112x112x32xf32>, f32 -> tensor<1x112x112x32xf32>
+// CHECK-NEXT: %[[FILL:.+]] = linalg.fill(%cst, %[[INIT]]) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
 // CHECK-NEXT: scf.for %[[IV0:.+]] = %{{.+}} to %{{.+}} step %{{.+}} iter_args(%[[ARG0:.+]] = %[[FILL]])
 // CHECK-NEXT: %[[OFFSET_H:.+]] = affine.apply #[[MAP0]](%[[IV0]])
@@ -159,7 +159,7 @@
   %oc = memref.dim %elementwise, %c3 : tensor
   %init = linalg.init_tensor [%n, %oh, %ow, %oc] : tensor
-  %fill = linalg.fill(%init, %cst) : tensor, f32 -> tensor
+  %fill = linalg.fill(%cst, %init) : f32, tensor -> tensor
   %conv = linalg.conv_2d_input_nhwc_filter_hwcf
     {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
@@ -228,7 +228,7 @@
 // CHECK-DAG: %[[ELEM_OC:.+]] = memref.dim %[[ELEM]], %[[C3]] : tensor
 // CHECK: %[[INIT:.+]] = linalg.init_tensor [%[[ELEM_N]], %[[ELEM_OH]], %[[ELEM_OW]], %[[ELEM_OC]]] : tensor
-// CHECK: %[[FILL:.+]] = linalg.fill(%[[INIT]], %cst) : tensor, f32 -> tensor
+// CHECK: %[[FILL:.+]] = linalg.fill(%cst, %[[INIT]]) : f32, tensor -> tensor
 // CHECK-DAG: %[[FILTER_H:.+]] = memref.dim %[[FILTER]], %[[C0]] : tensor
 // CHECK-DAG: %[[FILTER_W:.+]] = memref.dim %[[FILTER]], %[[C1]] : tensor
@@ -319,7 +319,7 @@
     linalg.yield %zero : f32
   } : tensor<58x1xf32> to tensor<64x128xf32>

-  %fill = linalg.fill(%large_input, %zero) : tensor<64x128xf32>, f32 -> tensor<64x128xf32>
+  %fill = linalg.fill(%zero, %large_input) : f32, tensor<64x128xf32> -> tensor<64x128xf32>

   %for0 = scf.for %iv0 = %c0 to %d0 step %c16 iter_args(%arg0 = %fill) -> tensor<64x128xf32> {
     %for1 = scf.for %iv1 = %c0 to %d1 step %c32 iter_args(%arg1 = %arg0) -> tensor<64x128xf32> {
diff --git a/mlir/test/Dialect/Linalg/tile.mlir b/mlir/test/Dialect/Linalg/tile.mlir
--- a/mlir/test/Dialect/Linalg/tile.mlir
+++ b/mlir/test/Dialect/Linalg/tile.mlir
@@ -288,35 +288,35 @@
 // TILE-234: linalg.dot ins(%[[sAi]], %[[sBi]]{{.*}} outs(

 func @fill_static(%arg0: memref<127x99xf32>, %arg1: f32) {
-  linalg.fill(%arg0, %arg1) : memref<127x99xf32>, f32
+  linalg.fill(%arg1, %arg0) : f32, memref<127x99xf32>
   return
 }
 // TILE-2-LABEL: func @fill_static
 // TILE-2:   for
 // TILE-2-NOT: for
 // TILE-2:   memref.subview{{.*}} : memref<127x99xf32>
-// TILE-2:   linalg.fill{{.*}} : memref, f32
+// TILE-2:   linalg.fill{{.*}} : f32, memref

 // TILE-02-LABEL: func @fill_static
 // TILE-02:   for
 // TILE-02-NOT: for
 // TILE-02:   memref.subview{{.*}} : memref<127x99xf32>
-// TILE-02:   linalg.fill{{.*}} : memref<127x?xf32, #[[$stride_99_1_layout_map]]>, f32
+// TILE-02:   linalg.fill{{.*}} : f32, memref<127x?xf32, #[[$stride_99_1_layout_map]]>

 // TILE-002-LABEL: func @fill_static
 // TILE-002-NOT: for
-// TILE-002:   linalg.fill{{.*}} memref<127x99xf32>, f32
+// TILE-002:   linalg.fill{{.*}} f32, memref<127x99xf32>

 // TILE-234-LABEL: func @fill_static
 // TILE-234:   for
 // TILE-234:   for
 // TILE-234-NOT: for
 // TILE-234:   memref.subview{{.*}} : memref<127x99xf32>
-// TILE-234:   linalg.fill{{.*}} : memref, f32
+// TILE-234:   linalg.fill{{.*}} : f32, memref

 func @fill(%arg0: memref, %arg1: f32) {
-  linalg.fill(%arg0, %arg1) : memref, f32
+  linalg.fill(%arg1, %arg0) : f32, memref
   return
 }
 // TILE-2-LABEL: func @fill
diff --git a/mlir/test/Dialect/Linalg/tiled-loops.mlir b/mlir/test/Dialect/Linalg/tiled-loops.mlir
--- a/mlir/test/Dialect/Linalg/tiled-loops.mlir
+++ b/mlir/test/Dialect/Linalg/tiled-loops.mlir
@@ -25,7 +25,7 @@
         : memref<192x192xf32> to memref<192x?xf32, #map1>
     %4 = memref.subview %C_[%i, %j] [%0, %2] [1, 1]
         : memref<192x192xf32> to memref
-    linalg.fill(%4, %cst) : memref, f32
+    linalg.fill(%cst, %4) : f32, memref
     linalg.matmul ins(%1, %3 : memref,
                               memref<192x?xf32, #map1>)
                   outs(%4 : memref)
@@ -63,7 +63,7 @@
       ins (%A_ = %A: memref<192x192xf32>, %B_ = %B: memref<192x192xf32>)
      outs (%C_ = %C: memref)
      iterators["reduction", "reduction"] {
-    linalg.fill(%A_, %cst) : memref<192x192xf32>, f32
+    linalg.fill(%cst, %A_) : f32, memref<192x192xf32>
     linalg.yield
   }
   return
diff --git a/mlir/test/Dialect/Linalg/transform-patterns.mlir b/mlir/test/Dialect/Linalg/transform-patterns.mlir
--- a/mlir/test/Dialect/Linalg/transform-patterns.mlir
+++ b/mlir/test/Dialect/Linalg/transform-patterns.mlir
@@ -296,8 +296,8 @@
   %cf = constant 1.0 : f32
   %3 = memref.subview %arg0[%c0, %c0][%c2000, %c4000][%c1, %c1]
       : memref to memref
-  linalg.fill(%3, %cf) { __internal_linalg_transform__ = "_promote_views_aligned_"}
-    : memref, f32
+  linalg.fill(%cf, %3) { __internal_linalg_transform__ = "_promote_views_aligned_"}
+    : f32, memref
   return
 }
 // CHECK-LABEL: func @aligned_promote_fill
@@ -306,9 +306,9 @@
 // CHECK: %[[a0:.*]] = memref.alloc() {alignment = 32 : i64} : memref<32000000xi8>
 // CHECK: %[[v0:.*]] = memref.view %[[a0]]{{.*}} : memref<32000000xi8> to memref
 // CHECK: %[[l0:.*]] = memref.subview %[[v0]][0, 0] [%{{.*}}, %{{.*}}] [1, 1] : memref to memref
-// CHECK: linalg.fill(%[[v0]], {{.*}}) : memref, f32
+// CHECK: linalg.fill({{.*}}, %[[v0]]) : f32, memref
 // CHECK: linalg.copy(%[[s0]], %[[l0]]) : memref, memref
-// CHECK: linalg.fill(%[[v0]], %[[cf]]) : memref, f32
+// CHECK: linalg.fill(%[[cf]], %[[v0]]) : f32, memref

 func @aligned_promote_fill_complex(%arg0: memref, offset: ?, strides: [?, 1]>) {
   %c2000 = constant 2000 : index
@@ -319,8 +319,8 @@
   %cc = complex.create %cf, %cf : complex
   %3 = memref.subview %arg0[%c0, %c0][%c2000, %c4000][%c1, %c1]
      : memref, offset: ?, strides: [?, 1]> to memref, offset: ?, strides: [?, ?]>
-  linalg.fill(%3, %cc) { __internal_linalg_transform__ = "_promote_views_aligned_"}
-    : memref, offset: ?, strides: [?, ?]>, complex
+  linalg.fill(%cc, %3) { __internal_linalg_transform__ = "_promote_views_aligned_"}
+    : complex, memref, offset: ?, strides: [?, ?]>
   return
 }
 // CHECK-LABEL: func @aligned_promote_fill_complex
@@ -329,9 +329,9 @@
 // CHECK: %[[a0:.*]] = memref.alloc() {alignment = 32 : i64} : memref<64000000xi8>
 // CHECK: %[[v0:.*]] = memref.view %[[a0]]{{.*}} : memref<64000000xi8> to memref>
 // CHECK: %[[l0:.*]] = memref.subview %[[v0]][0, 0] [%{{.*}}, %{{.*}}] [1, 1] : memref> to memref, #[[$STRIDED_2D_u_1]]>
-// CHECK: linalg.fill(%[[v0]], {{.*}}) : memref>, complex
+// CHECK: linalg.fill({{.*}}, %[[v0]]) : complex, memref>
 // CHECK: linalg.copy(%[[s0]], %[[l0]]) : memref, #map{{.*}}>, memref, #map{{.*}}>
-// CHECK: linalg.fill(%[[v0]], %[[cc]]) : memref>, complex
+// CHECK: linalg.fill(%[[cc]], %[[v0]]) : complex, memref>

 func @tile_permute_parallel_loop(%arg0: memref, %arg1: memref,
diff --git a/mlir/test/Dialect/Linalg/vectorization.mlir b/mlir/test/Dialect/Linalg/vectorization.mlir
--- a/mlir/test/Dialect/Linalg/vectorization.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization.mlir
@@ -157,7 +157,7 @@
 func @test_vectorize_fill(%A : memref<8x16xf32>, %arg0 : f32) {
   // CHECK: %[[V:.*]] = vector.broadcast {{.*}} : f32 to vector<8x16xf32>
   // CHECK: vector.transfer_write %[[V]], {{.*}} : vector<8x16xf32>, memref<8x16xf32>
-  linalg.fill(%A, %arg0) : memref<8x16xf32>, f32
+  linalg.fill(%arg0, %A) : f32, memref<8x16xf32>
   return
 }
@@ -167,7 +167,7 @@
 func @test_vectorize_fill_scalar(%A : memref, %arg0 : f32) {
   // CHECK-SAME: (%[[M:.*]]: memref, %[[V:.*]]: f32)
   // CHECK: store %[[V]], %[[M]][] : memref
-  linalg.fill(%A, %arg0) : memref, f32
+  linalg.fill(%arg0, %A) : f32, memref
   return
 }
@@ -584,7 +584,7 @@
 // CHECK: %[[V4:.*]] = addi %[[DIM3]], %[[C3]] : index
 // CHECK: %[[V5:.*]] = addi %[[V4]], %[[C2]] : index
 // CHECK: %[[INIT:.*]] = linalg.init_tensor [6, %[[V1]], %[[V2]], %[[V5]]] : tensor<6x?x?x?xf32>
-// CHECK: %[[FILL:.*]] = linalg.fill(%[[INIT]], %{{.*}}) : tensor<6x?x?x?xf32>, f32 -> tensor<6x?x?x?xf32>
+// CHECK: %[[FILL:.*]] = linalg.fill(%{{.*}}, %[[INIT]]) : f32, tensor<6x?x?x?xf32> -> tensor<6x?x?x?xf32>
 // CHECK: %[[SRCDIM:.*]] = memref.dim %[[SRC]], %[[C3]] : tensor<1x2x2x?xf32>
 // CHECK: %[[RESULT:.*]] = tensor.insert_slice %[[SRC]] into %[[FILL]][2, %[[LOW]], 3, 3] [1, 2, 2, %[[SRCDIM]]] [1, 1, 1, 1] : tensor<1x2x2x?xf32> into tensor<6x?x?x?xf32>
 // CHECK: return %[[RESULT]]
diff --git a/mlir/test/Dialect/Vector/vector-transfer-full-partial-split.mlir b/mlir/test/Dialect/Vector/vector-transfer-full-partial-split.mlir
--- a/mlir/test/Dialect/Vector/vector-transfer-full-partial-split.mlir
+++ b/mlir/test/Dialect/Vector/vector-transfer-full-partial-split.mlir
@@ -75,7 +75,7 @@
   // LINALG: scf.yield %[[A]], %[[i]], %[[j]] : memref, index, index
   // LINALG: } else {
   // slow path, fill tmp alloc and yield a memref_casted version of it
-  // LINALG: linalg.fill(%[[alloc]], %cst) : memref<4x8xf32>, f32
+  // LINALG: linalg.fill(%cst, %[[alloc]]) : f32, memref<4x8xf32>
   // LINALG: %[[d0:.*]] = memref.dim %[[A]], %[[c0]] : memref
   // LINALG: %[[sv0:.*]] = affine.min #[[$bounds_map_4]](%[[d0]], %[[i]], %[[c4]])
   // LINALG: %[[sv1:.*]] = affine.min #[[$bounds_map_8]](%[[c8]], %[[j]], %[[c8]])
@@ -167,7 +167,7 @@
   // LINALG-SAME: memref, index, index
   // LINALG: } else {
   // slow path, fill tmp alloc and yield a memref_casted version of it
-  // LINALG: linalg.fill(%[[alloc]], %cst) : memref<4x8xf32>, f32
+  // LINALG: linalg.fill(%cst, %[[alloc]]) : f32, memref<4x8xf32>
   // LINALG: %[[sv0:.*]] = affine.min #[[$bounds_map_4]](%[[c7]], %[[i]], %[[c4]])
   // LINALG: %[[sv1:.*]] = affine.min #[[$bounds_map_8]](%[[c8]], %[[j]], %[[c8]])
   // LINALG: %[[sv:.*]] = memref.subview %[[A]][%[[i]], %[[j]]] [%[[sv0]], %[[sv1]]] [1, 1]
diff --git a/mlir/test/Integration/Dialect/Async/CPU/microbench-linalg-async-parallel-for.mlir b/mlir/test/Integration/Dialect/Async/CPU/microbench-linalg-async-parallel-for.mlir
--- a/mlir/test/Integration/Dialect/Async/CPU/microbench-linalg-async-parallel-for.mlir
+++ b/mlir/test/Integration/Dialect/Async/CPU/microbench-linalg-async-parallel-for.mlir
@@ -65,8 +65,8 @@
   %RHS10 = memref.alloc() {alignment = 64} : memref<1x10xf32>
   %DST10 = memref.alloc() {alignment = 64} : memref<1x10xf32>

-  linalg.fill(%LHS10, %f1) : memref<1x10xf32>, f32
-  linalg.fill(%RHS10, %f1) : memref<1x10xf32>, f32
+  linalg.fill(%f1, %LHS10) : f32, memref<1x10xf32>
+  linalg.fill(%f1, %RHS10) : f32, memref<1x10xf32>

   %LHS = memref.cast %LHS10 : memref<1x10xf32> to memref
   %RHS = memref.cast %RHS10 : memref<1x10xf32> to memref
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/benchmark_matmul.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/benchmark_matmul.mlir
--- a/mlir/test/Integration/Dialect/Linalg/CPU/benchmark_matmul.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/benchmark_matmul.mlir
@@ -59,9 +59,9 @@
   %B = memref.alloc() : !row_major_B
   %C = memref.alloc() : !row_major_C

-  linalg.fill(%A, %v1) : !row_major_A, !elem_type_a
-  linalg.fill(%B, %v1) : !row_major_B, !elem_type_b
-  linalg.fill(%C, %v0) : !row_major_C, !elem_type_c
+  linalg.fill(%v1, %A) : !elem_type_a, !row_major_A
+  linalg.fill(%v1, %B) : !elem_type_b, !row_major_B
+  linalg.fill(%v0, %C) : !elem_type_c, !row_major_C

   %c0 = constant 0: index
   %c1 = constant 1: index
@@ -71,7 +71,7 @@
   /// Preheating run:
   scf.for %arg0 = %c0 to %iters step %c1 {
     %z = constant 0.0 : !elem_type_c
-    linalg.fill(%C, %z) : !row_major_C, !elem_type_c
+    linalg.fill(%z, %C) : !elem_type_c, !row_major_C
     call @matmul(%A, %B, %C) : (!row_major_A, !row_major_B, !row_major_C) -> ()
   }
   %t_start_matmul = call @rtclock() : () -> f64
@@ -81,7 +81,7 @@
     // Once linalg on tensors is ready, fusing fill at the register level will
    // be easy.
     %z = constant 0.0 : !elem_type_c
-    linalg.fill(%C, %z) : !row_major_C, !elem_type_c
+    linalg.fill(%z, %C) : !elem_type_c, !row_major_C
     call @matmul(%A, %B, %C) : (!row_major_A, !row_major_B, !row_major_C) -> ()
   }
   %t_end_matmul = call @rtclock() : () -> f64
@@ -90,7 +90,7 @@
   // CHECK: {{^0$}}

   %C_ref = memref.alloc() : !row_major_C
-  linalg.fill(%C_ref, %v0) : !row_major_C, !elem_type_c
+  linalg.fill(%v0, %C_ref) : !elem_type_c, !row_major_C
   linalg.matmul ins(%A, %B : !row_major_A, !row_major_B)
     outs(%C_ref: !row_major_C)
   %act = memref.cast %C : !row_major_C to memref<*xf32>
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/benchmark_matmul_column_major_as_row_major.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/benchmark_matmul_column_major_as_row_major.mlir
--- a/mlir/test/Integration/Dialect/Linalg/CPU/benchmark_matmul_column_major_as_row_major.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/benchmark_matmul_column_major_as_row_major.mlir
@@ -69,9 +69,9 @@
   %cB = memref.alloc() : !column_major_B
   %cC = memref.alloc() : !column_major_C

-  linalg.fill(%cA, %f1) : !column_major_A, !elem_type_a
-  linalg.fill(%cB, %f1) : !column_major_B, !elem_type_b
-  linalg.fill(%cC, %f0) : !column_major_C, !elem_type_c
+  linalg.fill(%f1, %cA) : !elem_type_a, !column_major_A
+  linalg.fill(%f1, %cB) : !elem_type_b, !column_major_B
+  linalg.fill(%f0, %cC) : !elem_type_c, !column_major_C

   %c0 = constant 0: index
   %c1 = constant 1: index
@@ -87,7 +87,7 @@
     // This is accounts for about 10-15% perf hit on small sizes.
     // Once linalg on tensors is ready, fusing fill at the register level will
     // be easy.
- linalg.fill(%C, %f0) : !row_major_C, !elem_type_c + linalg.fill(%f0, %C) : !elem_type_c, !row_major_C call @matmul_column_major_as_row_major(%cA, %cB, %cC, %A, %B, %C) : (!column_major_A, !column_major_B, !column_major_C, !row_major_A, !row_major_B, !row_major_C) -> () @@ -98,7 +98,7 @@ // CHECK: {{^0$}} %cC_ref = memref.alloc() : !column_major_C - linalg.fill(%cC_ref, %f0) : !column_major_C, !elem_type_c + linalg.fill(%f0, %cC_ref) : !elem_type_c, !column_major_C linalg.matmul_column_major ins(%cA, %cB : !column_major_A, !column_major_B) outs(%cC_ref: !column_major_C) %act1 = memref.cast %cC : !column_major_C to memref<*xf32> @@ -109,7 +109,7 @@ // CHECK: {{^0$}} %C_ref = memref.alloc() : !row_major_C - linalg.fill(%C_ref, %f0) : !row_major_C, !elem_type_c + linalg.fill(%f0, %C_ref) : !elem_type_c, !row_major_C linalg.matmul ins(%A, %B : !row_major_A, !row_major_B) outs(%C_ref: !row_major_C) %act2 = memref.cast %C : !row_major_C to memref<*xf32> diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/benchmark_matmul_i8_i8_i32.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/benchmark_matmul_i8_i8_i32.mlir --- a/mlir/test/Integration/Dialect/Linalg/CPU/benchmark_matmul_i8_i8_i32.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/benchmark_matmul_i8_i8_i32.mlir @@ -59,9 +59,9 @@ %B = memref.alloc() : !row_major_B %C = memref.alloc() : !row_major_C - linalg.fill(%A, %v1) : !row_major_A, !elem_type_a - linalg.fill(%B, %v1) : !row_major_B, !elem_type_b - linalg.fill(%C, %v0) : !row_major_C, !elem_type_c + linalg.fill(%v1, %A) : !elem_type_a, !row_major_A + linalg.fill(%v1, %B) : !elem_type_b, !row_major_B + linalg.fill(%v0, %C) : !elem_type_c, !row_major_C %c0 = constant 0: index %c1 = constant 1: index @@ -70,7 +70,7 @@ /// Run and dump performance for matmul. /// Preheating run: scf.for %arg0 = %c0 to %iters step %c1 { - linalg.fill(%C, %v0) : !row_major_C, !elem_type_c + linalg.fill(%v0, %C) : !elem_type_c, !row_major_C call @matmul(%A, %B, %C) : (!row_major_A, !row_major_B, !row_major_C) -> () } %t_start_matmul = call @rtclock() : () -> f64 @@ -79,7 +79,7 @@ // This is accounts for about 10-15% perf hit on small sizes. // Once linalg on tensors is ready, fusing fill at the register level will // be easy. 
- linalg.fill(%C, %v0) : !row_major_C, !elem_type_c + linalg.fill(%v0, %C) : !elem_type_c, !row_major_C call @matmul(%A, %B, %C) : (!row_major_A, !row_major_B, !row_major_C) -> () } %t_end_matmul = call @rtclock() : () -> f64 @@ -88,7 +88,7 @@ // CHECK: {{^0$}} %C_ref = memref.alloc() : !row_major_C - linalg.fill(%C_ref, %v0) : !row_major_C, !elem_type_c + linalg.fill(%v0, %C_ref) : !elem_type_c, !row_major_C linalg.matmul_i8_i8_i32 ins(%A, %B : !row_major_A, !row_major_B) outs(%C_ref: !row_major_C) %res = memref.cast %C : !row_major_C to memref<*xi32> diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/matmul-vs-matvec.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/matmul-vs-matvec.mlir --- a/mlir/test/Integration/Dialect/Linalg/CPU/matmul-vs-matvec.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/matmul-vs-matvec.mlir @@ -12,7 +12,7 @@ %x = memref.dim %A, %c0 : memref %y = memref.dim %B, %c1 : memref %C = memref.alloc(%x, %y) : memref - linalg.fill(%C, %f0) : memref, f32 + linalg.fill(%f0, %C) : f32, memref linalg.matmul ins(%A, %B: memref, memref) outs(%C: memref) return %C : memref @@ -26,7 +26,7 @@ %x = memref.dim %A, %c1 : memref %n = memref.dim %B, %c1 : memref %C = memref.alloc(%m, %n) : memref - linalg.fill(%C, %f0) : memref, f32 + linalg.fill(%f0, %C) : f32, memref scf.for %i = %c0 to %n step %c1 { %b = memref.subview %B[0, %i][%x, 1][1, 1] : memref to memref %c = memref.subview %C[0, %i][%m, 1][1, 1] : memref to memref @@ -46,8 +46,8 @@ %val2 = constant 17.0 : f32 %A = memref.alloc(%m, %x) : memref %B = memref.alloc(%x, %n) : memref - linalg.fill(%A, %val1) : memref, f32 - linalg.fill(%B, %val2) : memref, f32 + linalg.fill(%val1, %A) : f32, memref + linalg.fill(%val2, %B) : f32, memref memref.store %val1, %B[%c0, %c0] : memref %C1 = call @matmul(%A, %B) : (memref, memref) -> memref %C2 = call @matvec(%A, %B) : (memref, memref) -> memref diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-call.mlir --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-call.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-call.mlir @@ -25,7 +25,7 @@ // Creates and returns a 1-D buffer of size %s1 filled with the value %f func @alloc_1d_filled_f32(%s1 : index, %f : f32) -> memref { %buf = memref.alloc(%s1) : memref - linalg.fill(%buf, %f) : memref, f32 + linalg.fill(%f, %buf) : f32, memref return %buf : memref } diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-input-ncw-filter-wcf-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-input-ncw-filter-wcf-call.mlir --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-input-ncw-filter-wcf-call.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-input-ncw-filter-wcf-call.mlir @@ -25,7 +25,7 @@ // Creates and returns 3-D buffer of size (%s1, %s2, %s3) filled with the value %f func @alloc_3d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %f : f32) -> memref { %buf = memref.alloc(%s1, %s2, %s3) : memref - linalg.fill(%buf, %f) : memref, f32 + linalg.fill(%f, %buf) : f32, memref return %buf : memref } diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-input-nwc-filter-wcf-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-input-nwc-filter-wcf-call.mlir --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-input-nwc-filter-wcf-call.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-input-nwc-filter-wcf-call.mlir @@ -25,7 +25,7 @@ // Creates and 
returns 3-D buffer of size (%s1, %s2, %s3) filled with the value %f func @alloc_3d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %f : f32) -> memref { %buf = memref.alloc(%s1, %s2, %s3) : memref - linalg.fill(%buf, %f) : memref, f32 + linalg.fill(%f, %buf) : f32, memref return %buf : memref } diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-ncw-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-ncw-call.mlir --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-ncw-call.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-ncw-call.mlir @@ -25,7 +25,7 @@ // Creates and returns 3-D buffer of size (%s1, %s2, %s3) filled with the value %f func @alloc_3d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %f : f32) -> memref { %buf = memref.alloc(%s1, %s2, %s3) : memref - linalg.fill(%buf, %f) : memref, f32 + linalg.fill(%f, %buf) : f32, memref return %buf : memref } diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-nwc-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-nwc-call.mlir --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-nwc-call.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-nwc-call.mlir @@ -25,7 +25,7 @@ // Creates and returns 3-D buffer of size (%s1, %s2, %s3) filled with the value %f func @alloc_3d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %f : f32) -> memref { %buf = memref.alloc(%s1, %s2, %s3) : memref - linalg.fill(%buf, %f) : memref, f32 + linalg.fill(%f, %buf) : f32, memref return %buf : memref } diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-call.mlir --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-call.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-call.mlir @@ -25,7 +25,7 @@ // Creates and returns a 2-D buffer of size (%s1, %s2) filled with the value %f func @alloc_2d_filled_f32(%s1 : index, %s2 : index, %f : f32) -> memref { %buf = memref.alloc(%s1, %s2) : memref - linalg.fill(%buf, %f) : memref, f32 + linalg.fill(%f, %buf) : f32, memref return %buf : memref } diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-input-nchw-filter-hwcf-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-input-nchw-filter-hwcf-call.mlir --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-input-nchw-filter-hwcf-call.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-input-nchw-filter-hwcf-call.mlir @@ -25,7 +25,7 @@ // Creates and returns 4-D buffer of size (%s1, %s2, %s3, %s4) filled with the value %f func @alloc_4d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %s4 : index, %f : f32) -> memref { %buf = memref.alloc(%s1, %s2, %s3, %s4) : memref - linalg.fill(%buf, %f) : memref, f32 + linalg.fill(%f, %buf) : f32, memref return %buf : memref } diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-input-nhwc-filter-hwcf-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-input-nhwc-filter-hwcf-call.mlir --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-input-nhwc-filter-hwcf-call.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-input-nhwc-filter-hwcf-call.mlir @@ -25,7 +25,7 @@ // Creates and returns 4-D buffer of size (%s1, %s2, %s3, %s4) filled with the value %f func @alloc_4d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %s4 : index, %f : f32) -> memref { %buf = memref.alloc(%s1, %s2, %s3, %s4) : memref - linalg.fill(%buf, %f) : memref, f32 + 
linalg.fill(%f, %buf) : f32, memref return %buf : memref } diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-nchw-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-nchw-call.mlir --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-nchw-call.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-nchw-call.mlir @@ -25,7 +25,7 @@ // Creates and returns 4-D buffer of size (%s1, %s2, %s3, %s4) filled with the value %f func @alloc_4d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %s4 : index, %f : f32) -> memref { %buf = memref.alloc(%s1, %s2, %s3, %s4) : memref - linalg.fill(%buf, %f) : memref, f32 + linalg.fill(%f, %buf) : f32, memref return %buf : memref } diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-nhwc-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-nhwc-call.mlir --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-nhwc-call.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-nhwc-call.mlir @@ -25,7 +25,7 @@ // Creates and returns 4-D buffer of size (%s1, %s2, %s3, %s4) filled with the value %f func @alloc_4d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %s4 : index, %f : f32) -> memref { %buf = memref.alloc(%s1, %s2, %s3, %s4) : memref - linalg.fill(%buf, %f) : memref, f32 + linalg.fill(%f, %buf) : f32, memref return %buf : memref } diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-call.mlir --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-call.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-call.mlir @@ -25,7 +25,7 @@ // Creates and returns 3-D buffer of size (%s1, %s2, %s3) filled with the value %f func @alloc_3d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %f : f32) -> memref { %buf = memref.alloc(%s1, %s2, %s3) : memref - linalg.fill(%buf, %f) : memref, f32 + linalg.fill(%f, %buf) : f32, memref return %buf : memref } diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-input-ncdhw-filter-dhwcf-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-input-ncdhw-filter-dhwcf-call.mlir --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-input-ncdhw-filter-dhwcf-call.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-input-ncdhw-filter-dhwcf-call.mlir @@ -25,7 +25,7 @@ // Creates and returns 5-D buffer of size (%s1, %s2, %s3, %s4, %s5) filled with the value %f func @alloc_5d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %s4 : index, %s5 : index, %f : f32) -> memref { %buf = memref.alloc(%s1, %s2, %s3, %s4, %s5) : memref - linalg.fill(%buf, %f) : memref, f32 + linalg.fill(%f, %buf) : f32, memref return %buf : memref } diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-input-ndhwc-filter-dhwcf-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-input-ndhwc-filter-dhwcf-call.mlir --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-input-ndhwc-filter-dhwcf-call.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-input-ndhwc-filter-dhwcf-call.mlir @@ -25,7 +25,7 @@ // Creates and returns 5-D buffer of size (%s1, %s2, %s3, %s4, %s5) filled with the value %f func @alloc_5d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %s4 : index, %s5 : index, %f : f32) -> memref { %buf = memref.alloc(%s1, %s2, %s3, %s4, %s5) : memref - linalg.fill(%buf, %f) : memref, f32 + linalg.fill(%f, %buf) : f32, memref return %buf : memref } diff --git 
a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-ncdhw-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-ncdhw-call.mlir --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-ncdhw-call.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-ncdhw-call.mlir @@ -25,7 +25,7 @@ // Creates and returns 5-D buffer of size (%s1, %s2, %s3, %s4, %s5) filled with the value %f func @alloc_5d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %s4 : index, %s5 : index, %f : f32) -> memref { %buf = memref.alloc(%s1, %s2, %s3, %s4, %s5) : memref - linalg.fill(%buf, %f) : memref, f32 + linalg.fill(%f, %buf) : f32, memref return %buf : memref } diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-ndhwc-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-ndhwc-call.mlir --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-ndhwc-call.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-ndhwc-call.mlir @@ -25,7 +25,7 @@ // Creates and returns 5-D buffer of size (%s1, %s2, %s3, %s4, %s5) filled with the value %f func @alloc_5d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %s4 : index, %s5 : index, %f : f32) -> memref { %buf = memref.alloc(%s1, %s2, %s3, %s4, %s5) : memref - linalg.fill(%buf, %f) : memref, f32 + linalg.fill(%f, %buf) : f32, memref return %buf : memref } diff --git a/mlir/test/mlir-cpu-runner/async.mlir b/mlir/test/mlir-cpu-runner/async.mlir --- a/mlir/test/mlir-cpu-runner/async.mlir +++ b/mlir/test/mlir-cpu-runner/async.mlir @@ -26,7 +26,7 @@ %c4 = constant 4.0 : f32 %A = memref.alloc() : memref<4xf32> - linalg.fill(%A, %c0) : memref<4xf32>, f32 + linalg.fill(%c0, %A) : f32, memref<4xf32> // CHECK: [0, 0, 0, 0] %U = memref.cast %A : memref<4xf32> to memref<*xf32> diff --git a/mlir/test/mlir-cpu-runner/sgemm_naive_codegen.mlir b/mlir/test/mlir-cpu-runner/sgemm_naive_codegen.mlir --- a/mlir/test/mlir-cpu-runner/sgemm_naive_codegen.mlir +++ b/mlir/test/mlir-cpu-runner/sgemm_naive_codegen.mlir @@ -7,14 +7,14 @@ %cf1 = constant 1.00000e+00 : f32 - linalg.fill(%A, %cf1) : memref<16x16xf32>, f32 - linalg.fill(%B, %cf1) : memref<16x16xf32>, f32 + linalg.fill(%cf1, %A) : f32, memref<16x16xf32> + linalg.fill(%cf1, %B) : f32, memref<16x16xf32> %reps = constant 1 : index %t_start = call @rtclock() : () -> f64 affine.for %arg0 = 0 to 5 { - linalg.fill(%C, %cf1) : memref<16x16xf32>, f32 + linalg.fill(%cf1, %C) : f32, memref<16x16xf32> call @sgemm_naive(%A, %B, %C) : (memref<16x16xf32>, memref<16x16xf32>, memref<16x16xf32>) -> () } %t_end = call @rtclock() : () -> f64 diff --git a/mlir/test/mlir-cpu-runner/unranked_memref.mlir b/mlir/test/mlir-cpu-runner/unranked_memref.mlir --- a/mlir/test/mlir-cpu-runner/unranked_memref.mlir +++ b/mlir/test/mlir-cpu-runner/unranked_memref.mlir @@ -45,18 +45,18 @@ %f10 = constant 10.00000e+00 : f32 %V = memref.cast %A : memref<10x3xf32, 0> to memref - linalg.fill(%V, %f10) : memref, f32 + linalg.fill(%f10, %V) : f32, memref %U = memref.cast %A : memref<10x3xf32, 0> to memref<*xf32> call @print_memref_f32(%U) : (memref<*xf32>) -> () %V2 = memref.cast %U : memref<*xf32> to memref - linalg.fill(%V2, %f5) : memref, f32 + linalg.fill(%f5, %V2) : f32, memref %U2 = memref.cast %V2 : memref to memref<*xf32> call @print_memref_f32(%U2) : (memref<*xf32>) -> () %V3 = memref.cast %V2 : memref to memref<*xf32> %V4 = memref.cast %V3 : memref<*xf32> to memref - linalg.fill(%V4, %f2) : memref, f32 + linalg.fill(%f2, %V4) : f32, memref %U3 = memref.cast %V2 : memref to memref<*xf32> call 
@print_memref_f32(%U3) : (memref<*xf32>) -> () @@ -81,7 +81,7 @@ func @return_two_var_memref_caller() { %0 = memref.alloca() : memref<4x3xf32> %c0f32 = constant 1.0 : f32 - linalg.fill(%0, %c0f32) : memref<4x3xf32>, f32 + linalg.fill(%c0f32, %0) : f32, memref<4x3xf32> %1:2 = call @return_two_var_memref(%0) : (memref<4x3xf32>) -> (memref<*xf32>, memref<*xf32>) call @print_memref_f32(%1#0) : (memref<*xf32>) -> () call @print_memref_f32(%1#1) : (memref<*xf32>) -> () @@ -96,7 +96,7 @@ func @return_var_memref_caller() { %0 = memref.alloca() : memref<4x3xf32> %c0f32 = constant 1.0 : f32 - linalg.fill(%0, %c0f32) : memref<4x3xf32>, f32 + linalg.fill(%c0f32, %0) : f32, memref<4x3xf32> %1 = call @return_var_memref(%0) : (memref<4x3xf32>) -> memref<*xf32> call @print_memref_f32(%1) : (memref<*xf32>) -> () return diff --git a/mlir/test/mlir-cpu-runner/utils.mlir b/mlir/test/mlir-cpu-runner/utils.mlir --- a/mlir/test/mlir-cpu-runner/utils.mlir +++ b/mlir/test/mlir-cpu-runner/utils.mlir @@ -19,7 +19,7 @@ %f = constant 2.00000e+00 : f32 %A = memref.alloc() : memref<16xf32> %B = memref.cast %A: memref<16xf32> to memref - linalg.fill(%B, %f) : memref, f32 + linalg.fill(%f, %B) : f32, memref %U = memref.cast %B : memref to memref<*xf32> call @print_memref_f32(%U): (memref<*xf32>) -> () memref.dealloc %A : memref<16xf32> @@ -33,7 +33,7 @@ %f4 = constant 4.00000e+00 : f32 %A = memref.alloc() : memref<3x4x5xf32> %B = memref.cast %A: memref<3x4x5xf32> to memref - linalg.fill(%B, %f) : memref, f32 + linalg.fill(%f, %B) : f32, memref %c2 = constant 2 : index memref.store %f4, %B[%c2, %c2, %c2]: memref diff --git a/mlir/test/python/dialects/linalg/ops.py b/mlir/test/python/dialects/linalg/ops.py --- a/mlir/test/python/dialects/linalg/ops.py +++ b/mlir/test/python/dialects/linalg/ops.py @@ -59,7 +59,7 @@ # CHECK-LABEL: func @fill_tensor # CHECK-SAME: %[[OUT:[0-9a-z]+]]: tensor<12x?xf32> # CHECK-NEXT: %[[CST:.*]] = constant 0.0{{.*}} : f32 - # CHECK-NEXT: %[[RES:.*]] = linalg.fill(%[[OUT]], %[[CST]]) : tensor<12x?xf32>, f32 -> tensor<12x?xf32> + # CHECK-NEXT: %[[RES:.*]] = linalg.fill(%[[CST]], %[[OUT]]) : f32, tensor<12x?xf32> -> tensor<12x?xf32> # CHECK-NEXT: return %[[RES]] : tensor<12x?xf32> @builtin.FuncOp.from_py_func( RankedTensorType.get((12, -1), f32)) @@ -73,7 +73,7 @@ # CHECK-LABEL: func @fill_buffer # CHECK-SAME: %[[OUT:[0-9a-z]+]]: memref<12x?xf32> # CHECK-NEXT: %[[CST:.*]] = constant 0.0{{.*}} : f32 - # CHECK-NEXT: linalg.fill(%[[OUT]], %[[CST]]) : memref<12x?xf32>, f32 + # CHECK-NEXT: linalg.fill(%[[CST]], %[[OUT]]) : f32, memref<12x?xf32> # CHECK-NEXT: return @builtin.FuncOp.from_py_func( MemRefType.get((12, -1), f32)) diff --git a/mlir/test/python/dialects/linalg/opsrun.py b/mlir/test/python/dialects/linalg/opsrun.py --- a/mlir/test/python/dialects/linalg/opsrun.py +++ b/mlir/test/python/dialects/linalg/opsrun.py @@ -25,9 +25,9 @@ %A = memref.alloc() : memref<4x16xf32> %B = memref.alloc() : memref<16x8xf32> %C = memref.alloc() : memref<4x8xf32> - linalg.fill(%A, %v1) : memref<4x16xf32>, f32 - linalg.fill(%B, %v2) : memref<16x8xf32>, f32 - linalg.fill(%C, %v0) : memref<4x8xf32>, f32 + linalg.fill(%v1, %A) : f32, memref<4x16xf32> + linalg.fill(%v2, %B) : f32, memref<16x8xf32> + linalg.fill(%v0, %C) : f32, memref<4x8xf32> call @matmul_on_buffers(%A, %B, %C) : (memref<4x16xf32>, memref<16x8xf32>, memref<4x8xf32>) -> ()
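For reference, the two assembly forms of `linalg.fill` after this change, with the value first and the output second, exactly as exercised by the tests above. This is a minimal standalone sketch, not part of the patch; the function and value names are illustrative:

func @fill_example(%out : tensor<12x?xf32>) -> tensor<12x?xf32> {
  %cst = constant 0.0 : f32
  %buf = memref.alloc() : memref<4x8xf32>
  // Buffer form: fills %buf in place and produces no result.
  linalg.fill(%cst, %buf) : f32, memref<4x8xf32>
  // Tensor form: yields the filled tensor as a new SSA value.
  %res = linalg.fill(%cst, %out) : f32, tensor<12x?xf32> -> tensor<12x?xf32>
  memref.dealloc %buf : memref<4x8xf32>
  return %res : tensor<12x?xf32>
}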