diff --git a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp @@ -166,23 +166,65 @@ if (analysisFailure || backwardSlice.empty()) return; - // Backward slice is a topologically sorted list of ops starting at - // `outermostEnclosingForOp`. - assert(outermostEnclosingForOp == backwardSlice.front()); - - // Filter out the loops whose induction variable is not used to compute the - // padded result. As a first approximation, just look for IVs that have no use - // in the backwardSlice. - // These are the dimensions of reuse that we can exploit to reduce the amount - // of copy / memory. - for (scf::ForOp forOp : llvm::reverse(reverseEnclosingLoops)) { - for (Operation *user : forOp.getInductionVar().getUsers()) { - if (backwardSlice.contains(user)) { - packingLoops.insert(forOp); - break; - } + // Compute the index slice of `padTensorOp` and filter out any loops not + // used for packing. A loop is not used for packing if all loop iterations + // access the same data. As a result, all loop iterations can share the same + // buffer which reduces the memory footprint and increases the cache reuse. + // While computing the index slice, we keep track of all index and data + // values consumed by the padding and add only operations producing one of + // these values. The computation overapproximates the index slice since, + // with the exception of ForOp, it is unaware of op semantics. + SetVector<Value> indexAndDataEdges; + indexAndDataEdges.insert(padTensorOp.getResult()); + SetVector<Operation *> indexSlice; + getBackwardSlice(padTensorOp.getOperation(), &indexSlice, [&](Operation *op) { + if (!domInfo.dominates(outermostEnclosingForOp, op)) + return false; + // Adding a loop iteration argument to the index slice makes all enclosing + // loops packing loops possibly leading to over allocation.
At the same + // time, a padding of an output operand passed in by iteration argument + // cannot be hoisted. We thus do not add the iteration arguments to the + // index and data edges to avoid spurious use-def relations when computing + // the index slice. + if (auto forOp = dyn_cast(op)) { + assert(llvm::none_of(forOp.getIterOpOperands(), + [&](OpOperand &opOperand) { + return indexAndDataEdges.contains( + forOp.getRegionIterArgForOpOperand(opOperand)); + }) && + "expect the padding does not depend on an iteration argument"); + assert(llvm::none_of(forOp.getResults(), + [&](Value value) { + return indexAndDataEdges.contains(value); + }) && + "expect the padding does not depend on a loop result"); + if (!indexAndDataEdges.contains(forOp.getInductionVar())) + return false; + indexAndDataEdges.insert(forOp.lowerBound()); + indexAndDataEdges.insert(forOp.upperBound()); + indexAndDataEdges.insert(forOp.step()); + return true; } + // Otherwise, we assume all operands of an operation with a result part of + // the index and data edges are true dependencies of padding and should be + // included in the index slice. This is a conservative overapproximation + // favoring correctness over performance. + if (llvm::none_of(op->getResults(), [&](Value value) { + return indexAndDataEdges.contains(value); + })) + return false; + for (Value value : op->getOperands()) + indexAndDataEdges.insert(value); + return true; + }); + + // Add only the loops part of the index slice to the packing loops. + for (scf::ForOp forOp : llvm::reverse(reverseEnclosingLoops)) { + if (indexSlice.contains(forOp)) + packingLoops.insert(forOp); } + if (packingLoops.empty()) + valid = false; // The analysis is valid and hoisting can occur. 
valid = true; diff --git a/mlir/test/Dialect/Linalg/pad-and-hoist.mlir b/mlir/test/Dialect/Linalg/pad-and-hoist.mlir --- a/mlir/test/Dialect/Linalg/pad-and-hoist.mlir +++ b/mlir/test/Dialect/Linalg/pad-and-hoist.mlir @@ -1,25 +1,24 @@ // RUN: mlir-opt %s -test-linalg-transform-patterns="test-pad-pattern pack-paddings=1,1,0 hoist-paddings=2,1,0" -cse -canonicalize -split-input-file | FileCheck %s -// RUN: mlir-opt %s -test-linalg-transform-patterns="test-pad-pattern pack-paddings=1,1,0 hoist-paddings=4,3,0" -cse -canonicalize -split-input-file | FileCheck %s --check-prefix=CHECK-DOUBLE +// RUN: mlir-opt %s -test-linalg-transform-patterns="test-pad-pattern pack-paddings=1,1,0 hoist-paddings=3,2,0" -cse -canonicalize -split-input-file | FileCheck %s --check-prefix=CHECK-DOUBLE // CHECK-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<(d0) -> (5, -d0 + 24)> -// CHECK-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<(d0) -> (8, -d0 + 12)> +// CHECK-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<(d0) -> (7, -d0 + 25)> // CHECK-DAG: #[[DIV6:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 6)> #map0 = affine_map<(d0) -> (5, -d0 + 24)> -#map1 = affine_map<(d0) -> (8, -d0 + 12)> -#map2 = affine_map<(d0) -> (7, -d0 + 25)> +#map1 = affine_map<(d0) -> (7, -d0 + 25)> -// CHECK: single_tiling -// CHECK-DOUBLE: single_tiling +// CHECK: static_sizes +// CHECK-DOUBLE: static_sizes // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32> // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32> // CHECK-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32> -func @single_tiling(%arg0: tensor<24x12xf32>, - %arg1: tensor<12x25xf32>, - %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> { +func @static_sizes(%arg0: tensor<24x12xf32>, + %arg1: tensor<12x25xf32>, + %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> { // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C5:.*]] = arith.constant 5 - // CHECK-DAG: %[[C8:.*]] = arith.constant 8 + // CHECK-DAG: %[[C7:.*]] = arith.constant 7 %c0 = arith.constant 0 : index %c12 = 
arith.constant 12 : index %c25 = arith.constant 25 : index @@ -33,11 +32,11 @@ // Packing the first input operand for all values of IV2 (IV2x5x6). // CHECK: = linalg.init_tensor [2, 5, 6] - // CHECK: %[[PT0:.*]] = scf.for %[[P0IV2:[0-9a-z]+]] = - // CHECK: %[[PIDX0:.*]] = affine.apply #[[DIV6]](%[[P0IV2]]) + // CHECK: %[[PT0:.*]] = scf.for %[[PIV0:[0-9a-z]+]] = + // CHECK: %[[PIDX0:.*]] = affine.apply #[[DIV6]](%[[PIV0]]) // CHECK: %[[TS0:.*]] = affine.min #[[MAP0]](%[[IV0]]) // CHECK: %[[T0:.*]] = tensor.extract_slice %[[ARG0]] - // CHECK-SAME: %[[IV0]], %[[P0IV2]] + // CHECK-SAME: %[[IV0]], %[[PIV0]] // CHECK-SAME: %[[TS0]], 6 // CHECK: %[[V0:.*]] = arith.subi %[[C5]], %[[TS0]] // CHECK: %[[T1:.*]] = linalg.pad_tensor %[[T0]] nofold {{.*}} high[%[[V0]] @@ -47,15 +46,15 @@ // CHECK: scf.for %[[IV1:[0-9a-zA-Z]*]] = %1 = scf.for %arg5 = %c0 to %c25 step %c7 iter_args(%arg6 = %arg4) -> (tensor<24x25xf32>) { - // Packing the second input operand for all values of IV2 (IV2x6x8). - // CHECK: = linalg.init_tensor [2, 6, 8] - // CHECK: %[[PT1:.*]] = scf.for %[[P1IV2:[0-9a-z]+]] = - // CHECK: %[[PIDX1:.*]] = affine.apply #[[DIV6]](%[[P1IV2]]) + // Packing the second input operand for all values of IV2 (IV2x6x7). 
+ // CHECK: = linalg.init_tensor [2, 6, 7] + // CHECK: %[[PT1:.*]] = scf.for %[[PIV1:[0-9a-z]+]] = + // CHECK: %[[PIDX1:.*]] = affine.apply #[[DIV6]](%[[PIV1]]) // CHECK: %[[TS1:.*]] = affine.min #[[MAP1]](%[[IV1]]) // CHECK: %[[T3:.*]] = tensor.extract_slice %[[ARG1]] - // CHECK-SAME: %[[P1IV2]], %[[IV1]] + // CHECK-SAME: %[[PIV1]], %[[IV1]] // CHECK-SAME: 6, %[[TS1]] - // CHECK: %[[V1:.*]] = arith.subi %[[C8]], %[[TS1]] + // CHECK: %[[V1:.*]] = arith.subi %[[C7]], %[[TS1]] // CHECK: %[[T4:.*]] = linalg.pad_tensor %[[T3]] nofold {{.*}} high[%[[C0]], %[[V1]] // CHECK: %[[T5:.*]] = tensor.insert_slice %[[T4:.*]] into %{{.*}}[%[[PIDX1]], 0, 0] // CHECK: scf.yield %[[T5:.*]] @@ -63,6 +62,7 @@ // CHECK: scf.for %[[IV2:[0-9a-zA-Z]*]] = {{.*}} iter_args(%[[ARG4:.*]] = %2 = scf.for %arg7 = %c0 to %c12 step %c6 iter_args(%arg8 = %arg6) -> (tensor<24x25xf32>) { %3 = affine.min #map0(%arg3) + // Index the packed operands. // CHECK-DAG: %[[IDX:.*]] = affine.apply #[[DIV6]](%[[IV2]]) // CHECK-DAG: %[[T6:.*]] = tensor.extract_slice %[[PT0]][%[[IDX]] @@ -91,6 +91,109 @@ // ----- + +// CHECK-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<(d0)[s0] -> (5, -d0 + s0)> +// CHECK-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<(d0)[s0] -> (6, -d0 + s0)> +// CHECK-DAG: #[[MAP2:[0-9a-z]+]] = affine_map<(d0)[s0] -> (7, -d0 + s0)> +// CHECK-DAG: #[[SDIV6:[0-9a-z]+]] = affine_map<()[s0] -> (s0 ceildiv 6)> +// CHECK-DAG: #[[DDIV6:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 6)> +#map0 = affine_map<(d0)[s0] -> (5, -d0 + s0)> +#map1 = affine_map<(d0)[s0] -> (6, -d0 + s0)> +#map2 = affine_map<(d0)[s0] -> (7, -d0 + s0)> + +// CHECK: dynamic_sizes +// CHECK-DOUBLE: dynamic_sizes + +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<?x?xf32> +// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<?x?xf32> +// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<?x?xf32> +func @dynamic_sizes(%arg0: tensor<?x?xf32>, + %arg1: tensor<?x?xf32>, + %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> { + // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index + // CHECK-DAG: %[[C1:.*]] = arith.constant 1 + // CHECK-DAG: %[[C5:.*]] = arith.constant 5 + // CHECK-DAG: %[[C6:.*]] = arith.constant 6 + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c6 = arith.constant 6 : index + %c7 = arith.constant 7 : index + %c5 = arith.constant 5 : index + + // CHECK-DAG: %[[D0:.*]] = tensor.dim %[[ARG0]], %[[C0]] + // CHECK-DAG: %[[D1:.*]] = tensor.dim %[[ARG0]], %[[C1]] + // CHECK-DAG: %[[D2:.*]] = tensor.dim %[[ARG1]], %[[C1]] + %0 = tensor.dim %arg0, %c0 : tensor<?x?xf32> + %1 = tensor.dim %arg0, %c1 : tensor<?x?xf32> + %2 = tensor.dim %arg1, %c1 : tensor<?x?xf32> + + // CHECK: scf.for %[[IV0:[0-9a-zA-Z]*]] = + %3 = scf.for %arg3 = %c0 to %0 step %c5 iter_args(%arg4 = %arg2) -> (tensor<?x?xf32>) { + + // Packing the first input operand for all values of IV2 (IV2x5x6). + // CHECK: %[[PS0:.*]] = affine.apply #[[SDIV6]]()[%[[D1]] + // CHECK: = linalg.init_tensor [%[[PS0]], 5, 6] + // CHECK: %[[PT0:.*]] = scf.for %[[PIV0:[0-9a-z]+]] = + // CHECK: %[[PIDX0:.*]] = affine.apply #[[DDIV6]](%[[PIV0]]) + // CHECK: %[[TS0:.*]] = affine.min #[[MAP0]](%[[IV0]])[%[[D0]] + // CHECK: %[[TS1:.*]] = affine.min #[[MAP1]](%[[PIV0]])[%[[D1]] + // CHECK: %[[T0:.*]] = tensor.extract_slice %[[ARG0]] + // CHECK-SAME: %[[IV0]], %[[PIV0]] + // CHECK-SAME: %[[TS0]], %[[TS1]] + // CHECK: %[[V0:.*]] = arith.subi %[[C5]], %[[TS0]] + // CHECK: %[[V1:.*]] = arith.subi %[[C6]], %[[TS1]] + // CHECK: %[[T1:.*]] = linalg.pad_tensor %[[T0]] nofold {{.*}} high[%[[V0]], %[[V1]] + // CHECK: %[[T2:.*]] = tensor.insert_slice %[[T1:.*]] into %{{.*}}[%[[PIDX0]], 0, 0] + // CHECK: scf.yield %[[T2:.*]] + + // CHECK: scf.for %[[IV1:[0-9a-zA-Z]*]] = + %4 = scf.for %arg5 = %c0 to %2 step %c7 iter_args(%arg6 = %arg4) -> (tensor<?x?xf32>) { + + // Packing the second input operand for all values of IV2 (IV2x6x7). + // CHECK: = linalg.init_tensor [%[[PS0]], 6, 7] + // CHECK: %[[PT1:.*]] = scf.for %[[PIV1:[0-9a-z]+]] = + // CHECK: %[[PIDX1:.*]] = affine.apply #[[DDIV6]](%[[PIV1]]) + // CHECK: %[[TS2:.*]] = affine.min #[[MAP1]](%[[PIV1]])[%[[D1]] + // CHECK: %[[TS3:.*]] = affine.min #[[MAP2]](%[[IV1]])[%[[D2]] + // CHECK: %[[T3:.*]] = tensor.extract_slice %[[ARG1]] + // CHECK-SAME: %[[PIV1]], %[[IV1]] + // CHECK-SAME: %[[TS2]], %[[TS3]] + // CHECK: %[[V2:.*]] = arith.subi %[[C6]], %[[TS2]] + // CHECK: %[[V3:.*]] = arith.subi %[[C7]], %[[TS3]] + // CHECK: %[[T4:.*]] = linalg.pad_tensor %[[T3]] nofold {{.*}} high[%[[V2]], %[[V3]] + // CHECK: %[[T5:.*]] = tensor.insert_slice %[[T4:.*]] into %{{.*}}[%[[PIDX1]], 0, 0] + // CHECK: scf.yield %[[T5:.*]] + + // CHECK: scf.for %[[IV2:[0-9a-zA-Z]*]] = {{.*}} iter_args(%[[ARG4:.*]] = + %5 = scf.for %arg7 = %c0 to %1 step %c6 iter_args(%arg8 = %arg6) -> (tensor<?x?xf32>) { + %6 = affine.min #map0(%arg3)[%0] + %7 = affine.min #map1(%arg7)[%1] + + // Index the packed operands. + // CHECK-DAG: %[[IDX:.*]] = affine.apply #[[DDIV6]](%[[IV2]]) + // CHECK-DAG: %[[T6:.*]] = tensor.extract_slice %[[PT0]][%[[IDX]] + // CHECK-DAG: %[[T7:.*]] = tensor.extract_slice %[[PT1]][%[[IDX]] + %8 = tensor.extract_slice %arg0[%arg3, %arg7] [%6, %7] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32> + %9 = affine.min #map2(%arg5)[%2] + %10 = tensor.extract_slice %arg1[%arg7, %arg5] [%7, %9] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32> + %11 = tensor.extract_slice %arg8[%arg3, %arg5] [%6, %9] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32> + + // Check matmul uses the packed input operands. + // CHECK: = linalg.matmul ins(%[[T6]], %[[T7]] + %12 = linalg.matmul {__internal_linalg_transform__ = "pad"} ins(%8, %10 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%11 : tensor<?x?xf32>) -> tensor<?x?xf32> + %13 = tensor.insert_slice %12 into %arg8[%arg3, %arg5] [%6, %9] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32> + scf.yield %13 : tensor<?x?xf32> + } + scf.yield %5 : tensor<?x?xf32> + } + scf.yield %4 : tensor<?x?xf32> + } + return %3 : tensor<?x?xf32> +} + +// ----- + +// CHECK-DOUBLE-DAG: #[[DIV5:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 5)> +// CHECK-DOUBLE-DAG: #[[DIV6:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 6)> #map0 = affine_map<(d0) -> (15, -d0 + 24)> #map1 = affine_map<(d0) -> (16, -d0 + 25)> #map2 = affine_map<(d0, d1) -> (5, -d0 + d1)> @@ -114,16 +217,17 @@ %c5 = arith.constant 5 : index %c6 = arith.constant 6 : index - // Packing the first input operand. - // CHECK-DOUBLE: = linalg.init_tensor - // CHECK-DOUBLE: = linalg.pad_tensor {{.*}} nofold - // CHECK-DOUBLE: scf.for %[[IV0:[0-9a-zA-Z]*]] = %0 = scf.for %arg3 = %c0 to %c24 step %c15 iter_args(%arg4 = %arg2) -> (tensor<24x25xf32>) { - // Packing the second input operand. - // CHECK-DOUBLE: = linalg.init_tensor - // CHECK-DOUBLE: = linalg.pad_tensor {{.*}} nofold + // Packing the first input operand. + // CHECK-DOUBLE: = linalg.init_tensor [3, 5, 12] + // CHECK-DOUBLE: %[[PT0:.*]] = scf.for %[[PIV0:[0-9a-z]+]] = + // CHECK-DOUBLE: %[[PIDX0:.*]] = affine.apply #[[DIV5]](%[[PIV0]]) + // CHECK-DOUBLE: %[[T0:.*]] = tensor.extract_slice %[[ARG0]] + // CHECK-DOUBLE: %[[T1:.*]] = linalg.pad_tensor %[[T0]] nofold + // CHECK-DOUBLE: %[[T2:.*]] = tensor.insert_slice %[[T1:.*]] into %{{.*}}[%[[PIDX0]], 0, 0] + // CHECK-DOUBLE: scf.yield %[[T2:.*]] // CHECK-DOUBLE: scf.for %[[IV1:[0-9a-zA-Z]*]] = %1 = scf.for %arg5 = %c0 to %c25 step %c16 iter_args(%arg6 = %arg4) -> (tensor<24x25xf32>) { @@ -131,6 +235,15 @@ %3 = affine.min #map1(%arg5) %4 = tensor.extract_slice %arg6[%arg3, %arg5] [%2, %3] [1, 1] : tensor<24x25xf32> to tensor<?x?xf32> + // Packing the second input operand.
+ // CHECK-DOUBLE: = linalg.init_tensor [%{{.*}}, 12, 6] + // CHECK-DOUBLE: %[[PT1:.*]] = scf.for %[[PIV1:[0-9a-z]+]] = + // CHECK-DOUBLE: %[[PIDX1:.*]] = affine.apply #[[DIV6]](%[[PIV1]]) + // CHECK-DOUBLE: %[[T3:.*]] = tensor.extract_slice %[[ARG1]] + // CHECK-DOUBLE: %[[T4:.*]] = linalg.pad_tensor %[[T3]] nofold + // CHECK-DOUBLE: %[[T5:.*]] = tensor.insert_slice %[[T4:.*]] into %{{.*}}[%[[PIDX1]], 0, 0] + // CHECK-DOUBLE: scf.yield %[[T5:.*]] + // CHECK-DOUBLE: scf.for %[[IV2:[0-9a-zA-Z]*]] = %5 = scf.for %arg7 = %c0 to %2 step %c5 iter_args(%arg8 = %4) -> (tensor<?x?xf32>) { @@ -138,6 +251,12 @@ %7 = scf.for %arg9 = %c0 to %3 step %c6 iter_args(%arg10 = %arg8) -> (tensor<?x?xf32>) { %8 = affine.min #map2(%arg7, %2) %9 = affine.apply #map3(%arg7, %arg3) + + // Index the packed operands. + // CHECK-DOUBLE-DAG: %[[IDX0:.*]] = affine.apply #[[DIV5]](%[[IV2]]) + // CHECK-DOUBLE-DAG: %[[T6:.*]] = tensor.extract_slice %[[PT0]][%[[IDX0]] + // CHECK-DOUBLE-DAG: %[[IDX1:.*]] = affine.apply #[[DIV6]](%[[IV3]]) + // CHECK-DOUBLE-DAG: %[[T7:.*]] = tensor.extract_slice %[[PT1]][%[[IDX1]] %10 = tensor.extract_slice %arg0[%9, 0] [%8, 12] [1, 1] : tensor<24x12xf32> to tensor<?x12xf32> %11 = affine.min #map4(%arg9, %3) %12 = affine.apply #map3(%arg9, %arg5) @@ -146,9 +265,8 @@ %15 = affine.min #map4(%arg9, %3) %16 = tensor.extract_slice %arg10[%arg7, %arg9] [%14, %15] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32> - // Pad the output operand and perform the multiplication. - // CHECK-DOUBLE: = linalg.pad_tensor - // CHECK-DOUBLE: = linalg.matmul + // Check matmul uses the packed input operands. + // CHECK-DOUBLE: = linalg.matmul ins(%[[T6]], %[[T7]] %17 = linalg.matmul {__internal_linalg_transform__ = "pad"} ins(%10, %13 : tensor<?x12xf32>, tensor<12x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32> %18 = tensor.insert_slice %17 into %arg10[%arg7, %arg9] [%14, %15] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32> scf.yield %18 : tensor<?x?xf32>