diff --git a/mlir/test/Dialect/Linalg/pad.mlir b/mlir/test/Dialect/Linalg/pad.mlir
--- a/mlir/test/Dialect/Linalg/pad.mlir
+++ b/mlir/test/Dialect/Linalg/pad.mlir
@@ -1,235 +1,182 @@
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul pad pack-paddings=1,1,0 run-enable-pass=false" -cse -canonicalize -split-input-file | FileCheck %s
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.fill pad pack-paddings=1,1,0 run-enable-pass=false" -cse -canonicalize -split-input-file | FileCheck %s --check-prefix=CHECK-FILL
-
-// CHECK-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<(d0) -> (7, -d0 + 12)>
-// CHECK-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<(d0) -> (-d0 + 7)>
-#map = affine_map<(d0) -> (7, -d0 + 12)>
-
-// CHECK: static_sizes_output_divisible
-// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>
-// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32>
+// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul pad pack-paddings=1,1,0 run-enable-pass=false" -cse -canonicalize -split-input-file | FileCheck %s --check-prefix=MATMUL
+// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.fill pad pack-paddings=1,1,0 run-enable-pass=false" -cse -canonicalize -split-input-file | FileCheck %s --check-prefix=FILL
+
+// MATMUL-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<()[s0] -> (7, -s0 + 12)>
+// MATMUL-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 7)>
+#map = affine_map<()[s0] -> (7, -s0 + 12)>
+
+// MATMUL: static_sizes_output_divisible
+// MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>
+// MATMUL-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32>
+// MATMUL-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>
+// MATMUL-SAME: %[[IV0:[0-9a-zA-Z]*]]: index
+// MATMUL-SAME: %[[IV1:[0-9a-zA-Z]*]]: index
+// MATMUL-SAME: %[[IV2:[0-9a-zA-Z]*]]: index
 func @static_sizes_output_divisible(%arg0: tensor<24x12xf32>, %arg1: tensor<12x25xf32>,
-                                    %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
-  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
-  // CHECK-DAG: %[[C7:.*]] = arith.constant 7
-  %c0 = arith.constant 0 : index
-  %c12 = arith.constant 12 : index
-  %c25 = arith.constant 25 : index
-  %c24 = arith.constant 24 : index
-  %c7 = arith.constant 7 : index
-  %c5 = arith.constant 5 : index
-  %c4 = arith.constant 4 : index
-
-  // CHECK: scf.for %[[IV0:[0-9a-zA-Z]*]] =
-  %0 = scf.for %arg3 = %c0 to %c24 step %c4 iter_args(%arg4 = %arg2) -> (tensor<24x25xf32>) {
-
-    // CHECK: scf.for %[[IV1:[0-9a-zA-Z]*]] =
-    %1 = scf.for %arg5 = %c0 to %c25 step %c5 iter_args(%arg6 = %arg4) -> (tensor<24x25xf32>) {
-
-      // CHECK: scf.for %[[IV2:[0-9a-zA-Z]*]] = {{.*}} iter_args(%[[ARG4:.*]] =
-      %2 = scf.for %arg7 = %c0 to %c12 step %c7 iter_args(%arg8 = %arg6) -> (tensor<24x25xf32>) {
-
-        // CHECK: %[[TS2:.*]] = affine.min #[[MAP0]](%[[IV2]])
-        %3 = affine.min #map(%arg7)
-
-        // CHECK: %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
-        // CHECK: %[[T1:.*]] = tensor.extract_slice %[[ARG1]]
-        // CHECK: %[[T2:.*]] = tensor.extract_slice %[[ARG4]]
-        %4 = tensor.extract_slice %arg0[%arg3, %arg7] [4, %3] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32>
-        %5 = tensor.extract_slice %arg1[%arg7, %arg5] [%3, 5] [1, 1] : tensor<12x25xf32> to tensor<?x5xf32>
-        %6 = tensor.extract_slice %arg8[%arg3, %arg5] [4, 5] [1, 1] : tensor<24x25xf32> to tensor<4x5xf32>
-
-        // Check statically sized matmul inputs with partially divisible sizes are padded.
-        // CHECK: %[[V0:.*]] = affine.apply #[[MAP1]](%[[TS2]])
-        // CHECK: %[[T3:.*]] = linalg.pad_tensor %[[T0]] nofold
-        // CHECK-SAME: [%[[C0]], %[[C0]]]
-        // CHECK-SAME: [%[[C0]], %[[V0]]
-        // CHECK: %[[T4:.*]] = linalg.pad_tensor %[[T1]] nofold
-
-        // Check the statically sized matmul output with fully divisible sizes is not padded.
-        // CHECK: %[[T5:.*]] = linalg.matmul
-        // CHECK-SAME: ins(%[[T3]], %[[T4]] : tensor<4x7xf32>, tensor<7x5xf32>)
-        // CHECK-SAME: outs(%[[T2]] : tensor<4x5xf32>)
-        // CHECK: %[[T6:.*]] = tensor.insert_slice %[[T5]]
-        %7 = linalg.matmul ins(%4, %5 : tensor<4x?xf32>, tensor<?x5xf32>) outs(%6 : tensor<4x5xf32>) -> tensor<4x5xf32>
-        %8 = tensor.insert_slice %7 into %arg8[%arg3, %arg5] [4, 5] [1, 1] : tensor<4x5xf32> into tensor<24x25xf32>
-
-        // CHECK: scf.yield %[[T6]]
-        scf.yield %8 : tensor<24x25xf32>
-      }
-      scf.yield %2 : tensor<24x25xf32>
-    }
-    scf.yield %1 : tensor<24x25xf32>
-  }
-  return %0 : tensor<24x25xf32>
+                                    %arg2: tensor<24x25xf32>,
+                                    %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> {
+  // MATMUL-DAG: %[[C0:.*]] = arith.constant 0 : index
+
+  // MATMUL: %[[TS2:.*]] = affine.min #[[MAP0]]()[%[[IV2]]]
+  %0 = affine.min #map()[%iv2]
+
+  // MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
+  // MATMUL: %[[T1:.*]] = tensor.extract_slice %[[ARG1]]
+  // MATMUL: %[[T2:.*]] = tensor.extract_slice %[[ARG2]]
+  %1 = tensor.extract_slice %arg0[%iv0, %iv2] [4, %0] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32>
+  %2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, 5] [1, 1] : tensor<12x25xf32> to tensor<?x5xf32>
+  %3 = tensor.extract_slice %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<24x25xf32> to tensor<4x5xf32>
+
+  // Check statically sized matmul inputs with partially divisible sizes are padded.
+  // MATMUL: %[[V0:.*]] = affine.apply #[[MAP1]]()[%[[TS2]]]
+  // MATMUL: %[[T3:.*]] = linalg.pad_tensor %[[T0]] nofold
+  // MATMUL-SAME: [%[[C0]], %[[C0]]]
+  // MATMUL-SAME: [%[[C0]], %[[V0]]
+  // MATMUL: %[[T4:.*]] = linalg.pad_tensor %[[T1]] nofold
+
+  // Check the statically sized matmul output with fully divisible sizes is not padded.
+  // MATMUL: %[[T5:.*]] = linalg.matmul
+  // MATMUL-SAME: ins(%[[T3]], %[[T4]] : tensor<4x7xf32>, tensor<7x5xf32>)
+  // MATMUL-SAME: outs(%[[T2]] : tensor<4x5xf32>)
+  // MATMUL: %[[T6:.*]] = tensor.insert_slice %[[T5]]
+  %4 = linalg.matmul ins(%1, %2 : tensor<4x?xf32>, tensor<?x5xf32>) outs(%3 : tensor<4x5xf32>) -> tensor<4x5xf32>
+  %5 = tensor.insert_slice %4 into %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<4x5xf32> into tensor<24x25xf32>
+  return %5 : tensor<24x25xf32>
 }

 // -----

-// CHECK-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<(d0) -> (7, -d0 + 25)>
-// CHECK-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<(d0) -> (-d0 + 7)>
-#map = affine_map<(d0) -> (7, -d0 + 25)>
+// MATMUL-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<()[s0] -> (7, -s0 + 25)>
+// MATMUL-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 7)>
+#map = affine_map<()[s0] -> (7, -s0 + 25)>

-// CHECK: static_sizes_input_divisible
+// MATMUL: static_sizes_input_divisible
+// MATMUL-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>
+// MATMUL-SAME: %[[IV0:[0-9a-zA-Z]*]]: index
+// MATMUL-SAME: %[[IV1:[0-9a-zA-Z]*]]: index
+// MATMUL-SAME: %[[IV2:[0-9a-zA-Z]*]]: index
 func @static_sizes_input_divisible(%arg0: tensor<24x12xf32>, %arg1: tensor<12x25xf32>,
-                                   %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
-  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
-  // CHECK-DAG: %[[C7:.*]] = arith.constant 7
-  %c0 = arith.constant 0 : index
-  %c12 = arith.constant 12 : index
-  %c25 = arith.constant 25 : index
-  %c24 = arith.constant 24 : index
-  %c6 = arith.constant 6 : index
-  %c7 = arith.constant 7 : index
-  %c4 = arith.constant 4 : index
-
-  // CHECK: scf.for %[[IV0:[0-9a-zA-Z]*]] =
-  %0 = scf.for %arg3 = %c0 to %c24 step %c4 iter_args(%arg4 = %arg2) -> (tensor<24x25xf32>) {
-
-    // CHECK: scf.for %[[IV1:[0-9a-zA-Z]*]] =
-    %1 = scf.for %arg5 = %c0 to %c25 step %c7 iter_args(%arg6 = %arg4) -> (tensor<24x25xf32>) {
-
-      // CHECK: scf.for %[[IV2:[0-9a-zA-Z]*]] = {{.*}} iter_args(%[[ARG4:.*]] =
-      %2 = scf.for %arg7 = %c0 to %c12 step %c6 iter_args(%arg8 = %arg6) -> (tensor<24x25xf32>) {
-        %3 = tensor.extract_slice %arg0[%arg3, %arg7] [4, 6] [1, 1] : tensor<24x12xf32> to tensor<4x6xf32>
-
-        // CHECK: %[[TS1:.*]] = affine.min #[[MAP0]](%[[IV1]])
-        %4 = affine.min #map(%arg5)
-        %5 = tensor.extract_slice %arg1[%arg7, %arg5] [6, %4] [1, 1] : tensor<12x25xf32> to tensor<6x?xf32>
-
-        // CHECK: %[[T0:.*]] = tensor.extract_slice %[[ARG4]]
-        %6 = tensor.extract_slice %arg8[%arg3, %arg5] [4, %4] [1, 1] : tensor<24x25xf32> to tensor<4x?xf32>
-
-        // Check the statically sized matmul output with partially divisible sizes is padded.
-        // CHECK: %[[V0:.*]] = affine.apply #[[MAP1]](%[[TS1]])
-        // CHECK: %[[T1:.*]] = linalg.pad_tensor %[[T0]] low
-        // CHECK-SAME: [%[[C0]], %[[C0]]]
-        // CHECK-SAME: [%[[C0]], %[[V0]]
-
-        // CHECK: %[[T2:.*]] = linalg.matmul
-        // CHECK-SAME: outs(%[[T1]] : tensor<4x7xf32>)
-        // CHECK: %[[T3:.*]] = tensor.extract_slice %[[T2]]
-        // CHECK: %[[T4:.*]] = tensor.insert_slice %[[T3]]
-        %7 = linalg.matmul ins(%3, %5 : tensor<4x6xf32>, tensor<6x?xf32>) outs(%6 : tensor<4x?xf32>) -> tensor<4x?xf32>
-        %8 = tensor.insert_slice %7 into %arg8[%arg3, %arg5] [4, %4] [1, 1] : tensor<4x?xf32> into tensor<24x25xf32>
-
-        // CHECK: scf.yield %[[T4]]
-        scf.yield %8 : tensor<24x25xf32>
-      }
-      scf.yield %2 : tensor<24x25xf32>
-    }
-    scf.yield %1 : tensor<24x25xf32>
-  }
-  return %0 : tensor<24x25xf32>
+                                   %arg2: tensor<24x25xf32>,
+                                   %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> {
+  // MATMUL-DAG: %[[C0:.*]] = arith.constant 0 : index
+
+  %3 = tensor.extract_slice %arg0[%iv0, %iv2] [4, 6] [1, 1] : tensor<24x12xf32> to tensor<4x6xf32>
+
+  // MATMUL: %[[TS1:.*]] = affine.min #[[MAP0]]()[%[[IV1]]]
+  %4 = affine.min #map()[%iv1]
+  %5 = tensor.extract_slice %arg1[%iv2, %iv1] [6, %4] [1, 1] : tensor<12x25xf32> to tensor<6x?xf32>
+
+  // MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG2]]
+  %6 = tensor.extract_slice %arg2[%iv0, %iv1] [4, %4] [1, 1] : tensor<24x25xf32> to tensor<4x?xf32>
+
+  // Check the statically sized matmul output with partially divisible sizes is padded.
+  // MATMUL: %[[V0:.*]] = affine.apply #[[MAP1]]()[%[[TS1]]]
+  // MATMUL: %[[T1:.*]] = linalg.pad_tensor %[[T0]] low
+  // MATMUL-SAME: [%[[C0]], %[[C0]]]
+  // MATMUL-SAME: [%[[C0]], %[[V0]]
+
+  // MATMUL: %[[T2:.*]] = linalg.matmul
+  // MATMUL-SAME: outs(%[[T1]] : tensor<4x7xf32>)
+  // MATMUL: %[[T3:.*]] = tensor.extract_slice %[[T2]]
+  // MATMUL: %[[T4:.*]] = tensor.insert_slice %[[T3]]
+  %7 = linalg.matmul ins(%3, %5 : tensor<4x6xf32>, tensor<6x?xf32>) outs(%6 : tensor<4x?xf32>) -> tensor<4x?xf32>
+  %8 = tensor.insert_slice %7 into %arg2[%iv0, %iv1] [4, %4] [1, 1] : tensor<4x?xf32> into tensor<24x25xf32>
+
+  // MATMUL: return %[[T4]]
+  return %8 : tensor<24x25xf32>
 }

 // -----

-// CHECK-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<(d0)[s0] -> (5, -d0 + s0)>
-// CHECK-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<(d0)[s0] -> (7, -d0 + s0)>
-// CHECK-DAG: #[[MAP2:[0-9a-z]+]] = affine_map<(d0)[s0] -> (6, -d0 + s0)>
-// CHECK-DAG: #[[MAP3:[0-9a-z]+]] = affine_map<(d0) -> (-d0 + 5)>
-// CHECK-DAG: #[[MAP4:[0-9a-z]+]] = affine_map<(d0) -> (-d0 + 6)>
-
-#map0 = affine_map<(d0)[s0] -> (5, -d0 + s0)>
-#map1 = affine_map<(d0)[s0] -> (6, -d0 + s0)>
-#map2 = affine_map<(d0)[s0] -> (7, -d0 + s0)>
-
-// CHECK: dynamic_sizes
-// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<?x?xf32>
-// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<?x?xf32>
-// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<?x?xf32>
+// MATMUL-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<()[s0, s1] -> (5, -s0 + s1)>
+// MATMUL-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<()[s0, s1] -> (7, -s0 + s1)>
+// MATMUL-DAG: #[[MAP2:[0-9a-z]+]] = affine_map<()[s0, s1] -> (6, -s0 + s1)>
+// MATMUL-DAG: #[[MAP3:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 5)>
+// MATMUL-DAG: #[[MAP4:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 6)>
+
+#map0 = affine_map<()[s0, s1] -> (5, -s0 + s1)>
+#map1 = affine_map<()[s0, s1] -> (6, -s0 + s1)>
+#map2 = affine_map<()[s0, s1] -> (7, -s0 + s1)>
+
+// MATMUL: dynamic_sizes
+// MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<?x?xf32>
+// MATMUL-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<?x?xf32>
+// MATMUL-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<?x?xf32>
+// MATMUL-SAME: %[[IV0:[0-9a-zA-Z]*]]: index
+// MATMUL-SAME: %[[IV1:[0-9a-zA-Z]*]]: index
+// MATMUL-SAME: %[[IV2:[0-9a-zA-Z]*]]: index
 func @dynamic_sizes(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>,
-                    %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> {
-  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
-  // CHECK-DAG: %[[C1:.*]] = arith.constant 1
-  // CHECK-DAG: %[[C5:.*]] = arith.constant 5
-  // CHECK-DAG: %[[C6:.*]] = arith.constant 6
+                    %arg2: tensor<?x?xf32>,
+                    %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<?x?xf32> {
+  // MATMUL-DAG: %[[C0:.*]] = arith.constant 0 : index
+  // MATMUL-DAG: %[[C1:.*]] = arith.constant 1
   %c1 = arith.constant 1 : index
   %c0 = arith.constant 0 : index
-  %c6 = arith.constant 6 : index
-  %c7 = arith.constant 7 : index
-  %c5 = arith.constant 5 : index
-  // CHECK-DAG: %[[D0:.*]] = tensor.dim %[[ARG0]], %[[C0]]
-  // CHECK-DAG: %[[D2:.*]] = tensor.dim %[[ARG0]], %[[C1]]
-  // CHECK-DAG: %[[D1:.*]] = tensor.dim %[[ARG1]], %[[C1]]
+  // MATMUL-DAG: %[[D0:.*]] = tensor.dim %[[ARG0]], %[[C0]]
+  // MATMUL-DAG: %[[D2:.*]] = tensor.dim %[[ARG0]], %[[C1]]
+  // MATMUL-DAG: %[[D1:.*]] = tensor.dim %[[ARG1]], %[[C1]]
   %0 = tensor.dim %arg0, %c0 : tensor<?x?xf32>
   %1 = tensor.dim %arg0, %c1 : tensor<?x?xf32>
   %2 = tensor.dim %arg1, %c1 : tensor<?x?xf32>

-  // CHECK: scf.for %[[IV0:[0-9a-zA-Z]*]] =
-  %3 = scf.for %arg3 = %c0 to %0 step %c5 iter_args(%arg4 = %arg2) -> (tensor<?x?xf32>) {
-
-    // CHECK: scf.for %[[IV1:[0-9a-zA-Z]*]] =
-    %4 = scf.for %arg5 = %c0 to %2 step %c7 iter_args(%arg6 = %arg4) -> (tensor<?x?xf32>) {
-
-      // CHECK: scf.for %[[IV2:[0-9a-zA-Z]*]] = {{.*}} iter_args(%[[ARG4:.*]] =
-      %5 = scf.for %arg7 = %c0 to %1 step %c6 iter_args(%arg8 = %arg6) -> (tensor<?x?xf32>) {
-
-        // CHECK: %[[TS0:.*]] = affine.min #[[MAP0]](%[[IV0]])[%[[D0]]]
-        // CHECK: %[[TS2:.*]] = affine.min #[[MAP2]](%[[IV2]])[%[[D2]]]
-        // CHECK: %[[TS1:.*]] = affine.min #[[MAP1]](%[[IV1]])[%[[D1]]]
-        %6 = affine.min #map0(%arg3)[%0]
-        %7 = affine.min #map1(%arg7)[%1]
-        %8 = tensor.extract_slice %arg0[%arg3, %arg7] [%6, %7] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
-        %9 = affine.min #map2(%arg5)[%2]
-        %10 = tensor.extract_slice %arg1[%arg7, %arg5] [%7, %9] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
-        %11 = tensor.extract_slice %arg8[%arg3, %arg5] [%6, %9] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
-
-        // Check all matmul operands are padded.
-        // CHECK: %[[V0:.*]] = affine.apply #[[MAP3]](%[[TS0]])
-        // CHECK: %[[V1:.*]] = affine.apply #[[MAP4]](%[[TS2]])
-        // CHECK: %[[T3:.*]] = linalg.pad_tensor %{{.*}} nofold
-        // CHECK-SAME: [%[[C0]], %[[C0]]]
-        // CHECK-SAME: [%[[V0]], %[[V1]]
-        // CHECK: %[[T4:.*]] = linalg.pad_tensor %{{.*}} nofold
-        // CHECK: %[[T5:.*]] = linalg.pad_tensor %{{.*}} low
-
-        // Check the dynamic matmul has been erased.
-        // CHECK-NOT: = linalg.matmul {{.*}} tensor<?x?xf32>
-
-        // Check all padded matmul operands are statically sized.
-        // CHECK: %[[T6:.*]] = linalg.matmul
-        // CHECK-SAME: ins(%[[T3]], %[[T4]] : tensor<5x6xf32>, tensor<6x7xf32>)
-        // CHECK-SAME: outs(%[[T5]] : tensor<5x7xf32>)
-        // CHECK: %[[T7:.*]] = tensor.extract_slice %[[T6]][0, 0] [%[[TS0]], %[[TS1]]]
-        // CHECK: %[[T8:.*]] = tensor.insert_slice %[[T7]]
-        %12 = linalg.matmul ins(%8, %10 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%11 : tensor<?x?xf32>) -> tensor<?x?xf32>
-        %13 = tensor.insert_slice %12 into %arg8[%arg3, %arg5] [%6, %9] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
-
-        // CHECK: scf.yield %[[T8]]
-        scf.yield %13 : tensor<?x?xf32>
-      }
-      scf.yield %5 : tensor<?x?xf32>
-    }
-    scf.yield %4 : tensor<?x?xf32>
-  }
-  return %3 : tensor<?x?xf32>
+  // MATMUL: %[[TS0:.*]] = affine.min #[[MAP0]]()[%[[IV0]], %[[D0]]]
+  // MATMUL: %[[TS2:.*]] = affine.min #[[MAP2]]()[%[[IV2]], %[[D2]]]
+  // MATMUL: %[[TS1:.*]] = affine.min #[[MAP1]]()[%[[IV1]], %[[D1]]]
+  %6 = affine.min #map0()[%iv0, %0]
+  %7 = affine.min #map1()[%iv2, %1]
+  %8 = tensor.extract_slice %arg0[%iv0, %iv2] [%6, %7] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+  %9 = affine.min #map2()[%iv1, %2]
+  %10 = tensor.extract_slice %arg1[%iv2, %iv1] [%7, %9] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+  %11 = tensor.extract_slice %arg2[%iv0, %iv1] [%6, %9] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+
+  // Check all matmul operands are padded.
+  // MATMUL: %[[V0:.*]] = affine.apply #[[MAP3]]()[%[[TS0]]]
+  // MATMUL: %[[V1:.*]] = affine.apply #[[MAP4]]()[%[[TS2]]]
+  // MATMUL: %[[T3:.*]] = linalg.pad_tensor %{{.*}} nofold
+  // MATMUL-SAME: [%[[C0]], %[[C0]]]
+  // MATMUL-SAME: [%[[V0]], %[[V1]]
+  // MATMUL: %[[T4:.*]] = linalg.pad_tensor %{{.*}} nofold
+  // MATMUL: %[[T5:.*]] = linalg.pad_tensor %{{.*}} low
+
+  // Check the dynamic matmul has been erased.
+  // MATMUL-NOT: = linalg.matmul {{.*}} tensor<?x?xf32>
+
+  // Check all padded matmul operands are statically sized.
+  // MATMUL: %[[T6:.*]] = linalg.matmul
+  // MATMUL-SAME: ins(%[[T3]], %[[T4]] : tensor<5x6xf32>, tensor<6x7xf32>)
+  // MATMUL-SAME: outs(%[[T5]] : tensor<5x7xf32>)
+  // MATMUL: %[[T7:.*]] = tensor.extract_slice %[[T6]][0, 0] [%[[TS0]], %[[TS1]]]
+  // MATMUL: %[[T8:.*]] = tensor.insert_slice %[[T7]]
+  %12 = linalg.matmul ins(%8, %10 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%11 : tensor<?x?xf32>) -> tensor<?x?xf32>
+  %13 = tensor.insert_slice %12 into %arg2[%iv0, %iv1] [%6, %9] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
+
+  // MATMUL: return %[[T8]]
+  return %13 : tensor<?x?xf32>
 }

 // -----

 #map0 = affine_map<(d0) -> (64, d0)>

-// CHECK: compose_padding
-// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<64x64xf32>
+// MATMUL: compose_padding
+// MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<64x64xf32>
 func @compose_padding(%arg0: tensor<64x64xf32>,
                       %iv0 : index) -> tensor<?x?xf32> {
   %cst = arith.constant 0.0 : f32

-  // CHECK: %[[SIZE:.*]] = affine.min
+  // MATMUL: %[[SIZE:.*]] = affine.min
   %size = affine.min #map0(%iv0)

-  // CHECK: %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
-  // CHECK-SAME: [0, 0]
-  // CHECK-SAME: [%[[SIZE]], %[[SIZE]]]
-  // CHECK: %[[T1:.*]] = linalg.pad_tensor %[[T0]]
-  // CHECK: %[[T2:.*]] = linalg.fill(%{{.*}}, %[[T1]]
-  // CHECK: %[[T3:.*]] = linalg.fill(%{{.*}}, %[[T2]]
+  // MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
+  // MATMUL-SAME: [0, 0]
+  // MATMUL-SAME: [%[[SIZE]], %[[SIZE]]]
+  // MATMUL: %[[T1:.*]] = linalg.pad_tensor %[[T0]]
+  // MATMUL: %[[T2:.*]] = linalg.fill(%{{.*}}, %[[T1]]
+  // MATMUL: %[[T3:.*]] = linalg.fill(%{{.*}}, %[[T2]]
   %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
   %1 = linalg.pad_tensor %0 low[0, 0] high[%iv0, %iv0] {
   ^bb0(%arg3: index, %arg4: index): // no predecessors
@@ -240,16 +187,16 @@
   %4 = tensor.extract_slice %3[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>

   // Check there are no additional pad tensor operations.
-  // CHECK-NOT: linalg.pad_tensor
+  // MATMUL-NOT: linalg.pad_tensor

   // Check the matmul directly uses the result of the fill operation.
-  // CHECK: %[[T4:.*]] = linalg.matmul ins(%[[T3]]
-  // CHECK: %[[T5:.*]] = tensor.extract_slice %[[T4]]
-  // CHECK-SAME: [0, 0]
-  // CHECK-SAME: [%[[SIZE]], %[[SIZE]]]
+  // MATMUL: %[[T4:.*]] = linalg.matmul ins(%[[T3]]
+  // MATMUL: %[[T5:.*]] = tensor.extract_slice %[[T4]]
+  // MATMUL-SAME: [0, 0]
+  // MATMUL-SAME: [%[[SIZE]], %[[SIZE]]]
   %5 = linalg.matmul ins(%4, %4 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>

-  // CHECK: return %[[T5]]
+  // MATMUL: return %[[T5]]
   return %5 : tensor<?x?xf32>
 }

@@ -257,7 +204,7 @@

 #map0 = affine_map<(d0) -> (64, d0)>

-// CHECK: different_padding_values
+// MATMUL: different_padding_values
 func @different_padding_values(%arg0: tensor<64x64xf32>,
                                %iv0 : index) -> tensor<?x?xf32> {
   %cst = arith.constant 42.0 : f32
@@ -271,9 +218,9 @@
   %4 = tensor.extract_slice %2[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>

   // Different padding values prevent composing the paddings (42.0 vs. 0.0).
-  // CHECK: = linalg.fill
-  // CHECK: = linalg.pad_tensor
-  // CHECK: = linalg.matmul
+  // MATMUL: = linalg.fill
+  // MATMUL: = linalg.pad_tensor
+  // MATMUL: = linalg.matmul
   %5 = linalg.matmul ins(%4, %4 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>
   return %5 : tensor<?x?xf32>
 }

@@ -282,7 +229,7 @@

 #map0 = affine_map<(d0) -> (64, d0)>

-// CHECK: different_padding_dynamic_sizes
+// MATMUL: different_padding_dynamic_sizes
 func @different_padding_dynamic_sizes(%arg0: tensor<64x64xf32>,
                                       %iv0 : index) -> tensor<?x?xf32> {
   %cst = arith.constant 0.0 : f32
@@ -296,9 +243,9 @@
   %4 = tensor.extract_slice %2[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>

   // Different dynamic sizes prevent composing the paddings (%iv0 vs %size).
-  // CHECK: = linalg.fill
-  // CHECK: = linalg.pad_tensor
-  // CHECK: = linalg.matmul
+  // MATMUL: = linalg.fill
+  // MATMUL: = linalg.pad_tensor
+  // MATMUL: = linalg.matmul
   %5 = linalg.matmul ins(%4, %4 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>
   return %5 : tensor<?x?xf32>
 }

@@ -307,7 +254,7 @@

 #map0 = affine_map<(d0) -> (64, d0)>

-// CHECK: different_padding_static_sizes
+// MATMUL: different_padding_static_sizes
 func @different_padding_static_sizes(%arg0: tensor<62x62xf32>,
                                      %iv0 : index) -> tensor<?x?xf32> {
   %cst = arith.constant 0.0 : f32
@@ -321,45 +268,32 @@
   %4 = tensor.extract_slice %2[0, 0] [%size, %size] [1, 1] : tensor<62x62xf32> to tensor<?x?xf32>

   // Different static sizes prevent composing the paddings (62 vs 64 derived from #map0).
-  // CHECK: = linalg.fill
-  // CHECK: = linalg.pad_tensor
-  // CHECK: = linalg.matmul
+  // MATMUL: = linalg.fill
+  // MATMUL: = linalg.pad_tensor
+  // MATMUL: = linalg.matmul
   %5 = linalg.matmul ins(%4, %4 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>
   return %5 : tensor<?x?xf32>
 }

 // -----

-#map = affine_map<(d0) -> (7, -d0 + 12)>
-
-// CHECK-FILL: scalar_operand
-// CHECK-FILL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: f32
-// CHECK-FILL-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<24x12xf32>
-func @scalar_operand(%arg0: f32, %arg1: tensor<24x12xf32>) -> tensor<24x12xf32> {
-  %c0 = arith.constant 0 : index
-  %c12 = arith.constant 12 : index
-  %c24 = arith.constant 24 : index
-  %c7 = arith.constant 7 : index
-  %c4 = arith.constant 4 : index
-
-  // CHECK-FILL: scf.for %[[IV0:[0-9a-zA-Z]*]] =
-  %0 = scf.for %arg2 = %c0 to %c24 step %c4 iter_args(%arg3 = %arg1) -> (tensor<24x12xf32>) {
-
-    // CHECK-FILL: scf.for %[[IV1:[0-9a-zA-Z]*]] = {{.*}} iter_args(%[[ARG2:.*]] =
-    %1 = scf.for %arg4 = %c0 to %c12 step %c7 iter_args(%arg5 = %arg3) -> (tensor<24x12xf32>) {
-      %2 = affine.min #map(%arg4)
-
-      // CHECK-FILL: %[[T0:.*]] = tensor.extract_slice %[[ARG2]]
-      // CHECK-FILL: %[[T1:.*]] = linalg.pad_tensor %[[T0]] nofold
-      %3 = tensor.extract_slice %arg5[%arg2, %arg4] [4, %2] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32>
-
-      // Check only the fill output operand is padded.
-      // CHECK-FILL: %[[T6:.*]] = linalg.fill(%[[ARG0]], %[[T1]]
-      %4 = linalg.fill(%arg0, %3) : f32, tensor<4x?xf32> -> tensor<4x?xf32>
-      %5 = tensor.insert_slice %4 into %arg5[%arg2, %arg4] [4, %2] [1, 1] : tensor<4x?xf32> into tensor<24x12xf32>
-      scf.yield %5 : tensor<24x12xf32>
-    }
-    scf.yield %1 : tensor<24x12xf32>
-  }
-  return %0 : tensor<24x12xf32>
+#map = affine_map<(d0) -> (7, d0)>
+
+// FILL: scalar_operand
+// FILL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: f32
+// FILL-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<24x12xf32>
+func @scalar_operand(%arg0: f32,
+                     %arg1: tensor<24x12xf32>,
+                     %iv0 : index) -> tensor<24x12xf32> {
+  %0 = affine.min #map(%iv0)
+
+  // FILL: %[[T0:.*]] = tensor.extract_slice %[[ARG1]]
+  // FILL: %[[T1:.*]] = linalg.pad_tensor %[[T0]] nofold
+  %1 = tensor.extract_slice %arg1[0, 0] [4, %0] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32>
+
+  // Check only the fill output operand is padded.
+  // FILL: %[[T6:.*]] = linalg.fill(%[[ARG0]], %[[T1]]
+  %2 = linalg.fill(%arg0, %1) : f32, tensor<4x?xf32> -> tensor<4x?xf32>
+  %3 = tensor.insert_slice %2 into %arg1[0, 0] [4, %0] [1, 1] : tensor<4x?xf32> into tensor<24x12xf32>
+  return %3 : tensor<24x12xf32>
 }