diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
@@ -182,9 +182,16 @@
   if (failed(paddingValue))
     return failure(hasDynamicShape);
 
-  // Cannot construct a static bounding box if the operand is not defined by an
-  // ExtractSliceOp.
-  auto sliceOp = opOperand->get().getDefiningOp<tensor::ExtractSliceOp>();
+  // Follow the use-def chain if `currOpOperand` is defined by a LinalgOp.
+  OpOperand *currOpOperand = opOperand;
+  while (auto linalgOp = currOpOperand->get().getDefiningOp<LinalgOp>()) {
+    OpResult result = currOpOperand->get().cast<OpResult>();
+    currOpOperand = linalgOp.getOutputOperand(result.getResultNumber());
+  }
+
+  // Cannot construct a static bounding box if the `currOpOperand` is not
+  // defined by an ExtractSliceOp.
+  auto sliceOp = currOpOperand->get().getDefiningOp<tensor::ExtractSliceOp>();
   if (!sliceOp)
     return failure(hasDynamicShape);
 
diff --git a/mlir/test/Dialect/Linalg/pad.mlir b/mlir/test/Dialect/Linalg/pad.mlir
--- a/mlir/test/Dialect/Linalg/pad.mlir
+++ b/mlir/test/Dialect/Linalg/pad.mlir
@@ -1,10 +1,11 @@
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul pad pack-paddings=1,1,0 run-enable-pass=false" -cse -canonicalize -split-input-file | FileCheck %s --check-prefix=MATMUL
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.fill pad pack-paddings=1,1,0 run-enable-pass=false" -cse -canonicalize -split-input-file | FileCheck %s --check-prefix=FILL
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul pad pack-paddings=1,1,0 pad-inputs-only run-enable-pass=false" -cse -canonicalize -split-input-file | FileCheck %s --check-prefix=INPUTS-ONLY
+// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul pad pack-paddings=1,1,0 run-enable-pass=false" -cse -split-input-file | FileCheck %s --check-prefix=MATMUL
+// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.fill pad pack-paddings=1,1 run-enable-pass=false" -cse -split-input-file | FileCheck %s --check-prefix=FILL
+// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.fill pad pack-paddings=1,0 run-enable-pass=false" -test-linalg-codegen-strategy="anchor-op=linalg.matmul pad pack-paddings=1,0 run-enable-pass=false" -cse -split-input-file | FileCheck %s --check-prefix=FILL-MATMUL
+// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul pad pack-paddings=1,1,0 pad-inputs-only run-enable-pass=false" -cse -split-input-file | FileCheck %s --check-prefix=INPUTS-ONLY
 
 // MATMUL-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 12, 7)>
 // MATMUL-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 7)>
-#map = affine_map<()[s0] -> (7, -s0 + 12)>
+#map = affine_map<()[s0] -> (-s0 + 12, 7)>
 
 // MATMUL: static_sizes_output_divisible
 // MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>
@@ -13,10 +14,10 @@
 // MATMUL-SAME: %[[IV0:[0-9a-zA-Z]*]]: index
 // MATMUL-SAME: %[[IV1:[0-9a-zA-Z]*]]: index
 // MATMUL-SAME: %[[IV2:[0-9a-zA-Z]*]]: index
-func @static_sizes_output_divisible(%arg0: tensor<24x12xf32>,
-                                    %arg1: tensor<12x25xf32>,
-                                    %arg2: tensor<24x25xf32>,
-                                    %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> {
+func.func @static_sizes_output_divisible(%arg0: tensor<24x12xf32>,
+                                         %arg1: tensor<12x25xf32>,
+                                         %arg2: tensor<24x25xf32>,
+                                         %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> {
   // MATMUL-DAG: %[[C0:.*]] = arith.constant 0 : index
   // MATMUL: %[[TS2:.*]] = affine.min #[[MAP0]]()[%[[IV2]]]
@@ -43,24 +44,24 @@
   // MATMUL: %[[T6:.*]] = tensor.insert_slice %[[T5]]
   %4 = linalg.matmul ins(%1, %2 : tensor<4x?xf32>, tensor<?x5xf32>) outs(%3 : tensor<4x5xf32>) -> tensor<4x5xf32>
   %5 = tensor.insert_slice %4 into %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<4x5xf32> into tensor<24x25xf32>
-  return %5 : tensor<24x25xf32>
+  func.return %5 : tensor<24x25xf32>
 }
 
 // -----
 
 // MATMUL-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 25, 7)>
 // MATMUL-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 7)>
-#map = affine_map<()[s0] -> (7, -s0 + 25)>
+#map = affine_map<()[s0] -> (-s0 + 25, 7)>
 
 // MATMUL: static_sizes_input_divisible
 // MATMUL-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>
 // MATMUL-SAME: %[[IV0:[0-9a-zA-Z]*]]: index
 // MATMUL-SAME: %[[IV1:[0-9a-zA-Z]*]]: index
 // MATMUL-SAME: %[[IV2:[0-9a-zA-Z]*]]: index
-func @static_sizes_input_divisible(%arg0: tensor<24x12xf32>,
-                                   %arg1: tensor<12x25xf32>,
-                                   %arg2: tensor<24x25xf32>,
-                                   %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> {
+func.func @static_sizes_input_divisible(%arg0: tensor<24x12xf32>,
+                                        %arg1: tensor<12x25xf32>,
+                                        %arg2: tensor<24x25xf32>,
+                                        %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> {
   // MATMUL-DAG: %[[C0:.*]] = arith.constant 0 : index
 
   %3 = tensor.extract_slice %arg0[%iv0, %iv2] [4, 6] [1, 1] : tensor<24x12xf32> to tensor<4x6xf32>
@@ -86,7 +87,7 @@
   %8 = tensor.insert_slice %7 into %arg2[%iv0, %iv1] [4, %4] [1, 1] : tensor<4x?xf32> into tensor<24x25xf32>
 
   // MATMUL: return %[[T4]]
-  return %8 : tensor<24x25xf32>
+  func.return %8 : tensor<24x25xf32>
 }
 
 // -----
 
@@ -97,9 +98,9 @@
 // MATMUL-DAG: #[[MAP3:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 5)>
 // MATMUL-DAG: #[[MAP4:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 6)>
 
-#map0 = affine_map<()[s0, s1] -> (5, -s0 + s1)>
-#map1 = affine_map<()[s0, s1] -> (6, -s0 + s1)>
-#map2 = affine_map<()[s0, s1] -> (7, -s0 + s1)>
+#map0 = affine_map<()[s0, s1] -> (-s0 + s1, 5)>
+#map1 = affine_map<()[s0, s1] -> (-s0 + s1, 6)>
+#map2 = affine_map<()[s0, s1] -> (-s0 + s1, 7)>
 
 // MATMUL: dynamic_sizes
 // MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<?x?xf32>
@@ -108,10 +109,10 @@
 // MATMUL-SAME: %[[IV0:[0-9a-zA-Z]*]]: index
 // MATMUL-SAME: %[[IV1:[0-9a-zA-Z]*]]: index
 // MATMUL-SAME: %[[IV2:[0-9a-zA-Z]*]]: index
-func @dynamic_sizes(%arg0: tensor<?x?xf32>,
-                    %arg1: tensor<?x?xf32>,
-                    %arg2: tensor<?x?xf32>,
-                    %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<?x?xf32> {
+func.func @dynamic_sizes(%arg0: tensor<?x?xf32>,
+                         %arg1: tensor<?x?xf32>,
+                         %arg2: tensor<?x?xf32>,
+                         %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<?x?xf32> {
   // MATMUL-DAG: %[[C0:.*]] = arith.constant 0 : index
   // MATMUL-DAG: %[[C1:.*]] = arith.constant 1
   %c1 = arith.constant 1 : index
@@ -156,29 +157,53 @@
   %13 = tensor.insert_slice %12 into %arg2[%iv0, %iv1] [%6, %9] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
 
   // MATMUL: return %[[T8]]
-  return %13 : tensor<?x?xf32>
+  func.return %13 : tensor<?x?xf32>
 }
 
 // -----
 
 #map0 = affine_map<()[s0] -> (64, s0)>
 
-// FILL: pad_multiple
-// FILL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<64x64xf32>
-func @pad_multiple(%arg0: tensor<64x64xf32>,
-                   %iv0 : index) -> tensor<?x?xf32> {
+// FILL-MATMUL: pad_multiple
+// FILL-MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<64x64xf32>
+func.func @pad_multiple(%arg0: tensor<64x64xf32>,
+                        %iv0 : index) -> tensor<?x?xf32> {
   %cst = arith.constant 0.0 : f32
   %size = affine.min #map0()[%iv0]
+
+  // FILL-MATMUL: %[[T0:.*]] = tensor.extract_slice
   %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
 
-  // Check both fill operations are padded by the same pad tensor operation.
-  // FILL: %[[T0:.*]] = tensor.pad
-  // FILL: %[[T1:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T0]]
-  // FILL: %[[T2:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T1]]
-  // FILL: = tensor.extract_slice %[[T2]]
+  // Check the two operations are padded by the same pad tensor operation.
+  // FILL-MATMUL: %[[T1:.*]] = tensor.pad %[[T0]]
+  // FILL-MATMUL: %[[T2:.*]] = linalg.fill {{.*}} outs(%[[T1]]
+  // FILL-MATMUL: %[[T3:.*]] = linalg.matmul {{.*}} outs(%[[T2]]
+  // FILL-MATMUL: = tensor.extract_slice %[[T3]]
+  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<?x?xf32>) -> tensor<?x?xf32>
+  %2 = linalg.matmul ins(%0, %0 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%1 : tensor<?x?xf32>) -> tensor<?x?xf32>
+  func.return %2 : tensor<?x?xf32>
+}
+
+// -----
+
+#map0 = affine_map<()[s0] -> (64, s0)>
+
+// MATMUL: pad_chain
+// MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<64x64xf32>
+func.func @pad_chain(%arg0: tensor<64x64xf32>,
+                     %iv0 : index) -> tensor<?x?xf32> {
+  %cst = arith.constant 0.0 : f32
+  %size = affine.min #map0()[%iv0]
+  %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
+
+  // Check the matmul at the end of the use-def chain is padded.
+  // MATMUL: %[[T0:.*]] = linalg.fill
+  // MATMUL: %[[T1:.*]] = tensor.pad %[[T0]]
+  // MATMUL: %[[T2:.*]] = linalg.matmul {{.*}} outs(%[[T1]]
+  // MATMUL: = tensor.extract_slice %[[T2]]
   %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<?x?xf32>) -> tensor<?x?xf32>
-  %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<?x?xf32>) -> tensor<?x?xf32>
-  return %2 : tensor<?x?xf32>
+  %2 = linalg.matmul ins(%0, %0 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%1 : tensor<?x?xf32>) -> tensor<?x?xf32>
+  func.return %2 : tensor<?x?xf32>
 }
 
 // -----
 
@@ -187,8 +212,8 @@
 
 // MATMUL: compose_padding
 // MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<64x64xf32>
-func @compose_padding(%arg0: tensor<64x64xf32>,
-                      %iv0 : index) -> tensor<?x?xf32> {
+func.func @compose_padding(%arg0: tensor<64x64xf32>,
+                           %iv0 : index) -> tensor<?x?xf32> {
   %cst = arith.constant 0.0 : f32
 
   // MATMUL: %[[SIZE:.*]] = affine.min
@@ -220,7 +245,7 @@
   %5 = linalg.matmul ins(%4, %4 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>
 
   // MATMUL: return %[[T5]]
-  return %5 : tensor<?x?xf32>
+  func.return %5 : tensor<?x?xf32>
 }
 
 // -----
 
@@ -228,8 +253,8 @@
 #map0 = affine_map<()[s0] -> (64, s0)>
 
 // MATMUL: different_padding_values
-func @different_padding_values(%arg0: tensor<64x64xf32>,
-                               %iv0 : index) -> tensor<?x?xf32> {
+func.func @different_padding_values(%arg0: tensor<64x64xf32>,
+                                    %iv0 : index) -> tensor<?x?xf32> {
   %cst = arith.constant 42.0 : f32
   %size = affine.min #map0()[%iv0]
   %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
@@ -245,7 +270,7 @@
   // MATMUL: = tensor.pad
   // MATMUL: = linalg.matmul
   %5 = linalg.matmul ins(%4, %4 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>
-  return %5 : tensor<?x?xf32>
+  func.return %5 : tensor<?x?xf32>
 }
 
 // -----
 
@@ -253,8 +278,8 @@
 #map0 = affine_map<()[s0] -> (64, s0)>
 
 // MATMUL: different_padding_dynamic_sizes
-func @different_padding_dynamic_sizes(%arg0: tensor<64x64xf32>,
-                                      %iv0 : index) -> tensor<?x?xf32> {
+func.func @different_padding_dynamic_sizes(%arg0: tensor<64x64xf32>,
+                                           %iv0 : index) -> tensor<?x?xf32> {
  %cst = arith.constant 0.0 : f32
  %size = affine.min #map0()[%iv0]
  %0 = tensor.extract_slice %arg0[0, 0] [%iv0, %iv0] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
@@ -270,7 +295,7 @@
   // MATMUL: = tensor.pad
   // MATMUL: = linalg.matmul
   %5 = linalg.matmul ins(%4, %4 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>
-  return %5 : tensor<?x?xf32>
+  func.return %5 : tensor<?x?xf32>
 }
 
 // -----
 
@@ -278,8 +303,8 @@
 #map0 = affine_map<()[s0] -> (64, s0)>
 
 // MATMUL: different_padding_dynamic_rank
-func @different_padding_dynamic_rank(%arg0: tensor<64x64x1xf32>,
-                                     %iv0 : index) -> tensor<?x?xf32> {
+func.func @different_padding_dynamic_rank(%arg0: tensor<64x64x1xf32>,
+                                          %iv0 : index) -> tensor<?x?xf32> {
   %cst = arith.constant 0.0 : f32
   %size = affine.min #map0()[%iv0]
   %0 = tensor.extract_slice %arg0[0, 0, 0] [%size, %size, 1] [1, 1, 1] : tensor<64x64x1xf32> to tensor<?x?xf32>
@@ -295,7 +320,7 @@
   // MATMUL: = tensor.pad
   // MATMUL: = linalg.matmul
   %4 = linalg.matmul ins(%3, %3 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%3 : tensor<?x?xf32>) -> tensor<?x?xf32>
-  return %4 : tensor<?x?xf32>
+  func.return %4 : tensor<?x?xf32>
 }
 
 // -----
 
@@ -303,8 +328,8 @@
 #map0 = affine_map<()[s0] -> (64, s0)>
 
 // MATMUL: different_padding_static_sizes
-func @different_padding_static_sizes(%arg0: tensor<62x62xf32>,
-                                     %iv0 : index) -> tensor<?x?xf32> {
+func.func @different_padding_static_sizes(%arg0: tensor<62x62xf32>,
+                                          %iv0 : index) -> tensor<?x?xf32> {
   %cst = arith.constant 0.0 : f32
   %size = affine.min #map0()[%iv0]
   %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<62x62xf32> to tensor<?x?xf32>
@@ -320,7 +345,7 @@
   // MATMUL: = tensor.pad
   // MATMUL: = linalg.matmul
   %5 = linalg.matmul ins(%4, %4 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>
-  return %5 : tensor<?x?xf32>
+  func.return %5 : tensor<?x?xf32>
 }
 
 // -----
 
@@ -330,9 +355,9 @@
 
 // FILL: scalar_operand
 // FILL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: f32
 // FILL-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<24x12xf32>
-func @scalar_operand(%arg0: f32,
-                     %arg1: tensor<24x12xf32>,
-                     %iv0 : index) -> tensor<24x12xf32> {
+func.func @scalar_operand(%arg0: f32,
+                          %arg1: tensor<24x12xf32>,
+                          %iv0 : index) -> tensor<24x12xf32> {
   %0 = affine.min #map0()[%iv0]
 
   // FILL: %[[T0:.*]] = tensor.extract_slice %[[ARG1]]
@@ -343,7 +368,7 @@
   // FILL: %[[T6:.*]] = linalg.fill ins(%[[ARG0]]{{.*}}outs(%[[T1]]
   %2 = linalg.fill ins(%arg0 : f32) outs(%1 : tensor<4x?xf32>) -> tensor<4x?xf32>
   %3 = tensor.insert_slice %2 into %arg1[0, 0] [4, %0] [1, 1] : tensor<4x?xf32> into tensor<24x12xf32>
-  return %3 : tensor<24x12xf32>
+  func.return %3 : tensor<24x12xf32>
 }
 
 // -----
 
@@ -352,10 +377,10 @@
 
 // MATMUL: static_extract_slice_missing
 // MATMUL-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<4x5xf32>,
-func @static_extract_slice_missing(%arg0: tensor<24x12xf32>,
-                                   %arg1: tensor<12x25xf32>,
-                                   %arg2: tensor<4x5xf32>,
-                                   %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<4x5xf32> {
+func.func @static_extract_slice_missing(%arg0: tensor<24x12xf32>,
+                                        %arg1: tensor<12x25xf32>,
+                                        %arg2: tensor<4x5xf32>,
+                                        %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<4x5xf32> {
   %0 = affine.min #map0()[%iv2]
   %1 = tensor.extract_slice %arg0[%iv0, %iv2] [4, %0] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32>
   %2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, 5] [1, 1] : tensor<12x25xf32> to tensor<?x5xf32>
@@ -366,7 +391,7 @@
   // MATMUL: = linalg.matmul ins(%[[T0]], %[[T1]]
   // MATMUL-SAME: outs(%[[ARG2]]
   %3 = linalg.matmul ins(%1, %2 : tensor<4x?xf32>, tensor<?x5xf32>) outs(%arg2 : tensor<4x5xf32>) -> tensor<4x5xf32>
-  return %3 : tensor<4x5xf32>
+  func.return %3 : tensor<4x5xf32>
 }
 
 // -----
 
@@ -377,10 +402,10 @@
 
 // MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<4x?xf32>,
 // MATMUL-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32>,
 // MATMUL-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>,
-func @dynamic_extract_slice_missing(%arg0: tensor<4x?xf32>,
-                                    %arg1: tensor<12x25xf32>,
-                                    %arg2: tensor<24x25xf32>,
-                                    %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> {
+func.func @dynamic_extract_slice_missing(%arg0: tensor<4x?xf32>,
+                                         %arg1: tensor<12x25xf32>,
+                                         %arg2: tensor<24x25xf32>,
+                                         %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> {
   %0 = affine.min #map0()[%iv2]
 
   // MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG1]]
@@ -393,7 +418,7 @@
   // MATMUL-SAME: outs(%[[T1]]
   %4 = linalg.matmul ins(%arg0, %2 : tensor<4x?xf32>, tensor<?x5xf32>) outs(%3 : tensor<4x5xf32>) -> tensor<4x5xf32>
   %5 = tensor.insert_slice %4 into %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<4x5xf32> into tensor<24x25xf32>
-  return %5 : tensor<24x25xf32>
+  func.return %5 : tensor<24x25xf32>
 }
 
 // -----
 
@@ -402,10 +427,10 @@
 
 // INPUTS-ONLY: static_input_padding_only
 // INPUTS-ONLY-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>,
-func @static_input_padding_only(%arg0: tensor<24x12xf32>,
-                                %arg1: tensor<12x25xf32>,
-                                %arg2: tensor<24x25xf32>,
-                                %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> {
+func.func @static_input_padding_only(%arg0: tensor<24x12xf32>,
+                                     %arg1: tensor<12x25xf32>,
+                                     %arg2: tensor<24x25xf32>,
+                                     %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> {
   %0 = affine.min #map0()[%iv2]
   %1 = tensor.extract_slice %arg0[%iv0, %iv2] [4, %0] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32>
   %2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, 5] [1, 1] : tensor<12x25xf32> to tensor<?x5xf32>
@@ -420,7 +445,7 @@
   // INPUTS-ONLY-SAME: outs(%[[T0]]
   %4 = linalg.matmul ins(%1, %2 : tensor<4x?xf32>, tensor<?x5xf32>) outs(%3 : tensor<4x5xf32>) -> tensor<4x5xf32>
   %5 = tensor.insert_slice %4 into %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<4x5xf32> into tensor<24x25xf32>
-  return %5 : tensor<24x25xf32>
+  func.return %5 : tensor<24x25xf32>
 }
 
 // -----
 
@@ -431,10 +456,10 @@
 
 // INPUTS-ONLY-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>,
 // INPUTS-ONLY-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32>,
 // INPUTS-ONLY-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>,
-func @dynamic_input_padding_only(%arg0: tensor<24x12xf32>,
-                                 %arg1: tensor<12x25xf32>,
-                                 %arg2: tensor<24x25xf32>,
-                                 %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> {
+func.func @dynamic_input_padding_only(%arg0: tensor<24x12xf32>,
+                                      %arg1: tensor<12x25xf32>,
+                                      %arg2: tensor<24x25xf32>,
+                                      %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> {
   %0 = affine.min #map0()[%iv2]
 
   // INPUTS-ONLY: %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
@@ -449,7 +474,7 @@
   // INPUTS-ONLY-SAME: outs(%[[T2]]
   %4 = linalg.matmul ins(%1, %2 : tensor<4x?xf32>, tensor<?x?xf32>) outs(%3 : tensor<4x?xf32>) -> tensor<4x?xf32>
   %5 = tensor.insert_slice %4 into %arg2[%iv0, %iv1] [4, %0] [1, 1] : tensor<4x?xf32> into tensor<24x25xf32>
-  return %5 : tensor<24x25xf32>
+  func.return %5 : tensor<24x25xf32>
 }
 
 // -----
 
@@ -458,8 +483,8 @@
 
 // FILL: rank_reducing
 // FILL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<1x64x1x64xf32>
-func @rank_reducing(%arg0: tensor<1x64x1x64xf32>,
-                    %iv0 : index) -> tensor<1x?x?xf32> {
+func.func @rank_reducing(%arg0: tensor<1x64x1x64xf32>,
+                         %iv0 : index) -> tensor<1x?x?xf32> {
   %cst = arith.constant 0.0 : f32
   %size = affine.min #map0()[%iv0]
   %0 = tensor.extract_slice %arg0[0, 0, 0, 0] [1, %size, 1, %size] [1, 1, 1, 1] : tensor<1x64x1x64xf32> to tensor<1x?x?xf32>
@@ -470,5 +495,5 @@
   // FILL-SAME: tensor<1x64x64xf32>
   // FILL: = tensor.extract_slice %[[T1]]
   %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x?x?xf32>) -> tensor<1x?x?xf32>
-  return %1 : tensor<1x?x?xf32>
+  func.return %1 : tensor<1x?x?xf32>
 }
diff --git a/mlir/test/lib/Dialect/Linalg/TestLinalgCodegenStrategy.cpp b/mlir/test/lib/Dialect/Linalg/TestLinalgCodegenStrategy.cpp
--- a/mlir/test/lib/Dialect/Linalg/TestLinalgCodegenStrategy.cpp
+++ b/mlir/test/lib/Dialect/Linalg/TestLinalgCodegenStrategy.cpp
@@ -184,6 +184,7 @@
     LinalgPaddingOptions paddingOptions,
     vector::VectorContractLowering vectorContractLowering,
     vector::VectorTransferSplit vectorTransferSplit) {
+  std::string anchorOpNameOrWildcard = fuse ? "" : anchorOpName.getValue();
   CodegenStrategy strategy;
   strategy
       .tileAndFuseIf(fuse && !tileSizes.empty(), anchorOpName,
@@ -199,11 +200,11 @@
               LinalgPromotionOptions()
                   .setAlignment(16)
                   .setUseFullTileBuffersByDefault(registerPromoteFullTile))
-      .padIf(pad, "", std::move(paddingOptions))
+      .padIf(pad, anchorOpNameOrWildcard, std::move(paddingOptions))
       .decomposeIf(decompose)
-      .generalizeIf(generalize, "")
+      .generalizeIf(generalize, anchorOpNameOrWildcard)
      .interchangeIf(!iteratorInterchange.empty(), iteratorInterchange)
-      .vectorizeIf(vectorize, "", nullptr, vectorizePadding)
+      .vectorizeIf(vectorize, anchorOpNameOrWildcard, nullptr, vectorizePadding)
       .vectorLowering(
           LinalgVectorLoweringOptions()
               .setVectorTransformsOptions(
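
Editor's note on the new use-def walk, with a minimal hand-written sketch (not FileCheck output from this patch; the pad bounds and the padded-input names %pad0/%pad1 are assumptions for illustration). Previously, the matmul in @pad_chain could not be padded because its output operand %1 is produced by a linalg.fill rather than directly by a tensor.extract_slice. The loop added in Transforms.cpp traces %1 through the fill's output operand back to the slice, so a static 64x64 bounding box can still be derived, roughly yielding:

// Sketch of the padded @pad_chain body (illustrative only):
//   %0    = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1]
//           : tensor<64x64xf32> to tensor<?x?xf32>
//   %1    = linalg.fill ins(%cst : f32) outs(%0 : tensor<?x?xf32>) -> tensor<?x?xf32>
//   %pad  = tensor.pad %1 low[0, 0] high[%h0, %h1] { ... yield %cst ... }
//           : tensor<?x?xf32> to tensor<64x64xf32>
//   %mm   = linalg.matmul ins(%pad0, %pad1 : tensor<64x64xf32>, tensor<64x64xf32>)
//                         outs(%pad : tensor<64x64xf32>) -> tensor<64x64xf32>
//   %res  = tensor.extract_slice %mm[0, 0] [%size, %size] [1, 1]
//           : tensor<64x64xf32> to tensor<?x?xf32>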