diff --git a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
@@ -48,6 +48,10 @@
 ///      contains an unknown op with a region.
 ///   4. The backward slice from the pad op to the scf::ForOp to hoist above is
 ///      empty.
+///   5. The source tensor of the pad op is not defined by an extract slice op.
+///   6. The source tensor of the extract slice op is not defined outside of
+///      the outermost enclosing scf::ForOp.
+///   7. There is no enclosing scf::ForOp that indexes the padded data.
 /// Other cases succeed and will trigger hoisting of the pad op.
 struct HoistingAnalysis {
   HoistingAnalysis(PadTensorOp padTensorOp, int nLevels);
@@ -82,6 +86,26 @@
       packingLoops;
 
 private:
+  /// Returns the loops in `backwardSlice` used to index the padded data. The
+  /// method starts from `padTensorOp` and `sliceOp`, follows the use-def
+  /// chains of their index operands, and stores any enclosing loop whose
+  /// induction variable is part of the walked index computation.
+  ///
+  /// Example:
+  /// ```
+  /// %source = linalg.fill(%cst, %arg0)
+  /// scf.for %i
+  ///   scf.for %j
+  ///     scf.for %k // not used to index %source!
+  ///       %ubi = affine.min #map(%i)
+  ///       %ubj = affine.min #map(%j)
+  ///       %slice = tensor.extract_slice %source [%i, %j] [%ubi, %ubj]
+  ///       %padded_slice = linalg.pad_tensor %slice
+  /// ```
+  /// getIndexingLoops(%padded_slice, %slice) returns [scf.for %i, scf.for %j]
+  SetVector<Operation *> getIndexingLoops(PadTensorOp padTensorOp,
+                                          tensor::ExtractSliceOp sliceOp);
+
   /// Encodes whether the analysis is valid and hoisting can proceed.
   bool valid;
 };
@@ -166,28 +190,114 @@
   if (analysisFailure || backwardSlice.empty())
     return;
 
-  // Backward slice is a topologically sorted list of ops starting at
-  // `outermostEnclosingForOp`.
-  assert(outermostEnclosingForOp == backwardSlice.front());
+  // Get the `sliceOp` that defines the source tensor of `padTensorOp` and
+  // check that its source is defined outside of the outermost loop. This
+  // check ensures the padded data is available for packing before entering
+  // the outermost enclosing loop.
+  //
+  // Example:
+  // ```
+  // %source = linalg.fill(%cst, %arg0)
+  // // %source is available for packing here!
+  // scf.for %i
+  //   scf.for %j
+  //     scf.for %k
+  //       %slice = tensor.extract_slice %source [%i, %j]
+  //       %padded_slice = linalg.pad_tensor %slice
+  // ```
+  auto sliceOp = padTensorOp.source().getDefiningOp<tensor::ExtractSliceOp>();
+  if (!sliceOp) {
+    LLVM_DEBUG(DBGS() << "Cannot find the extract slice op -> skip\n");
+    return;
+  }
+  if (!outermostEnclosingForOp.isDefinedOutsideOfLoop(sliceOp.source())) {
+    LLVM_DEBUG(DBGS() << "Source not defined outside of loops -> skip\n");
+    return;
+  }
 
-  // Filter out the loops whose induction variable is not used to compute the
-  // padded result. As a first approximation, just look for IVs that have no use
-  // in the backwardSlice.
-  // These are the dimensions of reuse that we can exploit to reduce the amount
-  // of copy / memory.
+  // Search the loops found in `backwardSlice` that are used to index the
+  // padded data.
+  SetVector<Operation *> indexingLoops =
+      getIndexingLoops(padTensorOp, sliceOp);
+
+  // Add only the loops that are part of `indexingLoops` to the packing loops.
+  // All other loops are not used to index the padded data and consequently
+  // access the same data in every loop iteration. Adding them to the packing
+  // loops would increase the cache footprint of the packed data by storing
+  // the same data multiple times.
   for (scf::ForOp forOp : llvm::reverse(reverseEnclosingLoops)) {
-    for (Operation *user : forOp.getInductionVar().getUsers()) {
-      if (backwardSlice.contains(user)) {
-        packingLoops.insert(forOp);
-        break;
-      }
-    }
+    if (indexingLoops.contains(forOp))
+      packingLoops.insert(forOp);
   }
+  assert(indexingLoops.size() == packingLoops.size() &&
+         "expect all indexing loops to be enclosing loops");
+  if (packingLoops.empty()) {
+    LLVM_DEBUG(DBGS() << "Cannot find a packing loop -> skip\n");
+    return;
+  }
 
   // The analysis is valid and hoisting can occur.
   valid = true;
 }
 
+SetVector<Operation *>
+HoistingAnalysis::getIndexingLoops(PadTensorOp padTensorOp,
+                                   tensor::ExtractSliceOp sliceOp) {
+  // Set of all values used for index computation.
+  SetVector<Value> indexEdges;
+
+  // Helper function that adds all index operands of an operation to
+  // `indexEdges`. An operand is an index operand if it is of index type.
+  auto addIndexOperandsToIndexEdges = [&](Operation *op) {
+    for (Value operand : op->getOperands())
+      if (operand.getType().isIndex())
+        indexEdges.insert(operand);
+  };
+
+  // Starting from `padTensorOp` and `sliceOp`, walk the use-def edges of
+  // index type in `backwardSlice`. Add the index operands of an operation to
+  // `indexEdges` if one of its results is an index edge found so far, and
+  // store all loops that are part of the index computation in
+  // `indexingLoops`.
+  //
+  // Example:
+  // ```
+  // %source = linalg.fill(%cst, %arg0)
+  // scf.for %i
+  //   scf.for %j
+  //     scf.for %k // not used to index %source!
+  //       %ubi = affine.min #map(%i)
+  //       %ubj = affine.min #map(%j)
+  //       %slice = tensor.extract_slice %source [%i, %j] [%ubi, %ubj]
+  //       %padded_slice = linalg.pad_tensor %slice
+  // ```
+  // After iterating `backwardSlice` we obtain:
+  //   indexEdges = [%i, %j, %ubi, %ubj]
+  //   indexingLoops = [scf.for %i, scf.for %j]
+  SetVector<Operation *> indexingLoops;
+  for (Operation *op : llvm::reverse(backwardSlice)) {
+    // Add the index operands of `padTensorOp` and `sliceOp` to start the
+    // exploration of the index computation.
+    if (op == padTensorOp || op == sliceOp) {
+      addIndexOperandsToIndexEdges(op);
+      continue;
+    }
+    // Add the index operands of the loop if its induction variable is used
+    // for index computation. Additionally, insert the loop into
+    // `indexingLoops`.
+    if (auto forOp = dyn_cast<scf::ForOp>(op)) {
+      if (indexEdges.contains(forOp.getInductionVar())) {
+        addIndexOperandsToIndexEdges(op);
+        indexingLoops.insert(forOp);
+        continue;
+      }
+    }
+    // Add the index operands of all other operations if at least one result
+    // is used for index computation.
+    if (llvm::any_of(op->getResults(),
+                     [&](Value result) { return indexEdges.contains(result); }))
+      addIndexOperandsToIndexEdges(op);
+  }
+  return indexingLoops;
+}
+
 static bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v) {
   return outer.isDefinedOutsideOfLoop(v) || v.getDefiningOp<arith::ConstantOp>();
 }
diff --git a/mlir/test/Dialect/Linalg/hoist-padding.mlir b/mlir/test/Dialect/Linalg/hoist-padding.mlir
--- a/mlir/test/Dialect/Linalg/hoist-padding.mlir
+++ b/mlir/test/Dialect/Linalg/hoist-padding.mlir
@@ -141,29 +141,25 @@
 
 // -----
 
-
 // CHECK-DAG: #[[$MIN_REST8:[0-9a-z]+]] = affine_map<(d0)[s0] -> (8, -d0 + s0)>
-// CHECK-DAG: #[[$MIN_REST4:[0-9a-z]+]] = affine_map<(d0, d1) -> (4, d0 - d1)>
-// CHECK-DAG: #[[$MIN_REST2:[0-9a-z]+]] = affine_map<(d0, d1) -> (2, d0 - d1)>
 // CHECK-DAG: #[[$DIV4:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 4)>
 // CHECK-DAG: #[[$DIV2:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 2)>
 #map0 = affine_map<(d0)[s0] -> (8, -d0 + s0)>
-#map1 = affine_map<(d0, d1) -> (4, d0 - d1)>
-#map2 = affine_map<(d0, d1) -> (2, d0 - d1)>
+#map1 = affine_map<(d0, d1) -> (4, -d0 + d1)>
+#map2 = affine_map<(d0, d1) -> (2, -d0 + d1)>
+#map3 = affine_map<(d0, d1, d2) -> (d0 + d1 + d2)>
 
 // CHECK-LABEL: func @dot
 // VERIFIER-ONLY-LABEL: func @dot
 func @dot(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>, %arg2: tensor<f32>)
     -> tensor<f32> {
+  %cst = arith.constant 0.000000e+00 : f32
   %c8 = arith.constant 8 : index
+  %c0 = arith.constant 0 : index
   %c4 = arith.constant 4 : index
-  %cst = arith.constant 0.000000e+00 : f32
   %c2 = arith.constant 2 : index
-  %c0 = arith.constant 0 : index
-  %1 = tensor.dim %arg0, %c0 : tensor<?xf32>
-  %2 = tensor.dim %arg0, %c0 : tensor<?xf32>
-  %3 = tensor.dim %arg1, %c0 : tensor<?xf32>
+  %0 = tensor.dim %arg0, %c0 : tensor<?xf32>
 
   // CHECK: scf.for %[[I:[0-9a-z]+]] =
   //
@@ -194,39 +190,32 @@
 // CHECK:        %[[B:.*]] = tensor.extract_slice %[[PACKED_B]][%[[IDX0_2]], %[[IDX1_2]], 0] [1, 1, 2] [1, 1, 1] : tensor<?x?x2xf32> to tensor<2xf32>
 // CHECK:        linalg.dot ins(%[[A]], %[[B]] : tensor<2xf32>, tensor<2xf32>) outs(%[[C]] : tensor<f32>) -> tensor<f32>
 
-  %4 = scf.for %arg3 = %c0 to %1 step %c8 iter_args(%arg4 = %arg2) -> (tensor<f32>) {
-    %5 = affine.min #map0(%arg3)[%2]
-    %6 = tensor.extract_slice %arg0[%arg3] [%5] [1] : tensor<?xf32> to tensor<?xf32>
-    %7 = affine.min #map0(%arg3)[%3]
-    %8 = tensor.extract_slice %arg1[%arg3] [%7] [1] : tensor<?xf32> to tensor<?xf32>
-    %9 = scf.for %arg5 = %c0 to %5 step %c4 iter_args(%arg6 = %arg4) -> (tensor<f32>) {
-      %10 = affine.min #map1(%5, %arg5)
-      %11 = tensor.extract_slice %6[%arg5] [%10] [1] : tensor<?xf32> to tensor<?xf32>
-      %12 = affine.min #map1(%7, %arg5)
-      %13 = tensor.extract_slice %8[%arg5] [%12] [1] : tensor<?xf32> to tensor<?xf32>
-      %14 = scf.for %arg7 = %c0 to %10 step %c2 iter_args(%arg8 = %arg6) -> (tensor<f32>) {
-        %15 = affine.min #map2(%10, %arg7)
-        %16 = tensor.extract_slice %11[%arg7] [%15] [1] : tensor<?xf32> to tensor<?xf32>
-        %17 = affine.min #map2(%12, %arg7)
-        %18 = tensor.extract_slice %13[%arg7] [%17] [1] : tensor<?xf32> to tensor<?xf32>
-        %19 = arith.subi %c2, %15 : index
-        %20 = linalg.pad_tensor %16 low[%c0] high[%19] {
+  %1 = scf.for %arg3 = %c0 to %0 step %c8 iter_args(%arg4 = %arg2) -> (tensor<f32>) {
+    %2 = affine.min #map0(%arg3)[%0]
+    %3 = scf.for %arg5 = %c0 to %2 step %c4 iter_args(%arg6 = %arg4) -> (tensor<f32>) {
+      %4 = affine.min #map1(%arg5, %2)
+      %5 = scf.for %arg7 = %c0 to %4 step %c2 iter_args(%arg8 = %arg6) -> (tensor<f32>) {
+        %6 = affine.min #map2(%arg7, %4)
+        %7 = affine.apply #map3(%arg7, %arg5, %arg3)
+        %8 = tensor.extract_slice %arg0[%7] [%6] [1] : tensor<?xf32> to tensor<?xf32>
+        %9 = tensor.extract_slice %arg1[%7] [%6] [1] : tensor<?xf32> to tensor<?xf32>
+        %10 = arith.subi %c2, %6 : index
+        %11 = linalg.pad_tensor %8 low[%c0] high[%10] {
         ^bb0(%arg9: index):  // no predecessors
           linalg.yield %cst : f32
         } : tensor<?xf32> to tensor<2xf32>
-        %21 = arith.subi %c2, %17 : index
-        %22 = linalg.pad_tensor %18 low[%c0] high[%21] {
+        %12 = linalg.pad_tensor %9 low[%c0] high[%10] {
        ^bb0(%arg9: index):  // no predecessors
          linalg.yield %cst : f32
        } : tensor<?xf32> to tensor<2xf32>
-        %23 = linalg.dot ins(%20, %22 : tensor<2xf32>, tensor<2xf32>) outs(%arg8 : tensor<f32>) -> tensor<f32>
-        scf.yield %23 : tensor<f32>
+        %13 = linalg.dot ins(%11, %12 : tensor<2xf32>, tensor<2xf32>) outs(%arg8 : tensor<f32>) -> tensor<f32>
+        scf.yield %13 : tensor<f32>
       }
-      scf.yield %14 : tensor<f32>
+      scf.yield %5 : tensor<f32>
     }
-    scf.yield %9 : tensor<f32>
+    scf.yield %3 : tensor<f32>
   }
-  return %4 : tensor<f32>
+  return %1 : tensor<f32>
 }
 
 // -----
diff --git a/mlir/test/Dialect/Linalg/pad-and-hoist.mlir b/mlir/test/Dialect/Linalg/pad-and-hoist.mlir
--- a/mlir/test/Dialect/Linalg/pad-and-hoist.mlir
+++ b/mlir/test/Dialect/Linalg/pad-and-hoist.mlir
@@ -1,25 +1,23 @@
 // RUN: mlir-opt %s -test-linalg-transform-patterns="test-pad-pattern pack-paddings=1,1,0 hoist-paddings=2,1,0" -cse -canonicalize -split-input-file | FileCheck %s
-// RUN: mlir-opt %s -test-linalg-transform-patterns="test-pad-pattern pack-paddings=1,1,0 hoist-paddings=4,3,0" -cse -canonicalize -split-input-file | FileCheck %s --check-prefix=CHECK-DOUBLE
+// RUN: mlir-opt %s -test-linalg-transform-patterns="test-pad-pattern pack-paddings=1,1,0 hoist-paddings=3,2,0" -cse -canonicalize -split-input-file | FileCheck %s --check-prefix=CHECK-DOUBLE
 
 // CHECK-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<(d0) -> (5, -d0 + 24)>
-// CHECK-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<(d0) -> (8, -d0 + 12)>
+// CHECK-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<(d0) -> (7, -d0 + 25)>
 // CHECK-DAG: #[[DIV6:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 6)>
 #map0 = affine_map<(d0) -> (5, -d0 + 24)>
-#map1 = affine_map<(d0) -> (8, -d0 + 12)>
-#map2 = affine_map<(d0) -> (7, -d0 + 25)>
-
-// CHECK: single_tiling
-// CHECK-DOUBLE: single_tiling
+#map1 = affine_map<(d0) -> (7, -d0 + 25)>
 
+// CHECK: static_sizes
+// CHECK-DOUBLE: static_sizes
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>
 // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32>
 // CHECK-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>
-func @single_tiling(%arg0: tensor<24x12xf32>,
-                    %arg1: tensor<12x25xf32>,
-                    %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
+func @static_sizes(%arg0: tensor<24x12xf32>,
+                   %arg1: tensor<12x25xf32>,
+                   %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
   // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
   // CHECK-DAG: %[[C5:.*]] = arith.constant 5
-  // CHECK-DAG: %[[C8:.*]] = arith.constant 8
+  // CHECK-DAG: %[[C7:.*]] = arith.constant 7
   %c0 = arith.constant 0 : index
   %c12 = arith.constant 12 : index
   %c25 = arith.constant 25 : index
@@ -33,11 +31,11 @@
 
   // Packing the first input operand for all values of IV2 (IV2x5x6).
   // CHECK:      = linalg.init_tensor [2, 5, 6]
-  // CHECK:      %[[PT0:.*]] = scf.for %[[P0IV2:[0-9a-z]+]] =
-  // CHECK:        %[[PIDX0:.*]] = affine.apply #[[DIV6]](%[[P0IV2]])
+  // CHECK:      %[[PT0:.*]] = scf.for %[[PIV0:[0-9a-z]+]] =
+  // CHECK:        %[[PIDX0:.*]] = affine.apply #[[DIV6]](%[[PIV0]])
   // CHECK:        %[[TS0:.*]] = affine.min #[[MAP0]](%[[IV0]])
   // CHECK:        %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
-  // CHECK-SAME:                                     %[[IV0]], %[[P0IV2]]
+  // CHECK-SAME:                                     %[[IV0]], %[[PIV0]]
   // CHECK-SAME:                                     %[[TS0]], 6
   // CHECK:        %[[V0:.*]] = arith.subi %[[C5]], %[[TS0]]
   // CHECK:        %[[T1:.*]] = linalg.pad_tensor %[[T0]] nofold {{.*}} high[%[[V0]]
@@ -47,15 +45,15 @@
 
   // CHECK: scf.for %[[IV1:[0-9a-zA-Z]*]] =
   %1 = scf.for %arg5 = %c0 to %c25 step %c7 iter_args(%arg6 = %arg4) -> (tensor<24x25xf32>) {
-    // Packing the second input operand for all values of IV2 (IV2x6x8).
-    // CHECK:      = linalg.init_tensor [2, 6, 8]
-    // CHECK:      %[[PT1:.*]] = scf.for %[[P1IV2:[0-9a-z]+]] =
-    // CHECK:        %[[PIDX1:.*]] = affine.apply #[[DIV6]](%[[P1IV2]])
+    // Packing the second input operand for all values of IV2 (IV2x6x7).
+    // CHECK:      = linalg.init_tensor [2, 6, 7]
+    // CHECK:      %[[PT1:.*]] = scf.for %[[PIV1:[0-9a-z]+]] =
+    // CHECK:        %[[PIDX1:.*]] = affine.apply #[[DIV6]](%[[PIV1]])
     // CHECK:        %[[TS1:.*]] = affine.min #[[MAP1]](%[[IV1]])
     // CHECK:        %[[T3:.*]] = tensor.extract_slice %[[ARG1]]
-    // CHECK-SAME:                                     %[[P1IV2]], %[[IV1]]
+    // CHECK-SAME:                                     %[[PIV1]], %[[IV1]]
     // CHECK-SAME:                                     6, %[[TS1]]
-    // CHECK:        %[[V1:.*]] = arith.subi %[[C8]], %[[TS1]]
+    // CHECK:        %[[V1:.*]] = arith.subi %[[C7]], %[[TS1]]
     // CHECK:        %[[T4:.*]] = linalg.pad_tensor %[[T3]] nofold {{.*}} high[%[[C0]], %[[V1]]
     // CHECK:        %[[T5:.*]] = tensor.insert_slice %[[T4:.*]] into %{{.*}}[%[[PIDX1]], 0, 0]
     // CHECK:        scf.yield %[[T5:.*]]
@@ -63,6 +61,7 @@
 
    // CHECK:   scf.for %[[IV2:[0-9a-zA-Z]*]] = {{.*}} iter_args(%[[ARG4:.*]] =
    %2 = scf.for %arg7 = %c0 to %c12 step %c6 iter_args(%arg8 = %arg6) -> (tensor<24x25xf32>) {
      %3 = affine.min #map0(%arg3)
+      // Index the packed operands.
      // CHECK-DAG: %[[IDX:.*]] = affine.apply #[[DIV6]](%[[IV2]])
      // CHECK-DAG: %[[T6:.*]] = tensor.extract_slice %[[PT0]][%[[IDX]]
@@ -91,6 +90,108 @@
 
 // -----
 
+// CHECK-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<(d0)[s0] -> (5, -d0 + s0)>
+// CHECK-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<(d0)[s0] -> (6, -d0 + s0)>
+// CHECK-DAG: #[[MAP2:[0-9a-z]+]] = affine_map<(d0)[s0] -> (7, -d0 + s0)>
+// CHECK-DAG: #[[SDIV6:[0-9a-z]+]] = affine_map<()[s0] -> (s0 ceildiv 6)>
+// CHECK-DAG: #[[DDIV6:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 6)>
+#map0 = affine_map<(d0)[s0] -> (5, -d0 + s0)>
+#map1 = affine_map<(d0)[s0] -> (6, -d0 + s0)>
+#map2 = affine_map<(d0)[s0] -> (7, -d0 + s0)>
+
+// CHECK: dynamic_sizes
+// CHECK-DOUBLE: dynamic_sizes
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<?x?xf32>
+// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<?x?xf32>
+// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<?x?xf32>
+func @dynamic_sizes(%arg0: tensor<?x?xf32>,
+                    %arg1: tensor<?x?xf32>,
+                    %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> {
+  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK-DAG: %[[C1:.*]] = arith.constant 1
+  // CHECK-DAG: %[[C5:.*]] = arith.constant 5
+  // CHECK-DAG: %[[C6:.*]] = arith.constant 6
+  %c1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  %c6 = arith.constant 6 : index
+  %c7 = arith.constant 7 : index
+  %c5 = arith.constant 5 : index
+
+  // CHECK-DAG: %[[D0:.*]] = tensor.dim %[[ARG0]], %[[C0]]
+  // CHECK-DAG: %[[D1:.*]] = tensor.dim %[[ARG0]], %[[C1]]
+  // CHECK-DAG: %[[D2:.*]] = tensor.dim %[[ARG1]], %[[C1]]
+  %0 = tensor.dim %arg0, %c0 : tensor<?x?xf32>
+  %1 = tensor.dim %arg0, %c1 : tensor<?x?xf32>
+  %2 = tensor.dim %arg1, %c1 : tensor<?x?xf32>
+
+  // CHECK: scf.for %[[IV0:[0-9a-zA-Z]*]] =
+  %3 = scf.for %arg3 = %c0 to %0 step %c5 iter_args(%arg4 = %arg2) -> (tensor<?x?xf32>) {
+
+    // Packing the first input operand for all values of IV2 (IV2x5x6).
+    // CHECK:      %[[PS0:.*]] = affine.apply #[[SDIV6]]()[%[[D1]]
+    // CHECK:      = linalg.init_tensor [%[[PS0]], 5, 6]
+    // CHECK:      %[[PT0:.*]] = scf.for %[[PIV0:[0-9a-z]+]] =
+    // CHECK:        %[[PIDX0:.*]] = affine.apply #[[DDIV6]](%[[PIV0]])
+    // CHECK:        %[[TS0:.*]] = affine.min #[[MAP0]](%[[IV0]])[%[[D0]]
+    // CHECK:        %[[TS1:.*]] = affine.min #[[MAP1]](%[[PIV0]])[%[[D1]]
+    // CHECK:        %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
+    // CHECK-SAME:                                     %[[IV0]], %[[PIV0]]
+    // CHECK-SAME:                                     %[[TS0]], %[[TS1]]
+    // CHECK:        %[[V0:.*]] = arith.subi %[[C5]], %[[TS0]]
+    // CHECK:        %[[V1:.*]] = arith.subi %[[C6]], %[[TS1]]
+    // CHECK:        %[[T1:.*]] = linalg.pad_tensor %[[T0]] nofold {{.*}} high[%[[V0]], %[[V1]]
+    // CHECK:        %[[T2:.*]] = tensor.insert_slice %[[T1:.*]] into %{{.*}}[%[[PIDX0]], 0, 0]
+    // CHECK:        scf.yield %[[T2:.*]]
+
+    // CHECK: scf.for %[[IV1:[0-9a-zA-Z]*]] =
+    %4 = scf.for %arg5 = %c0 to %2 step %c7 iter_args(%arg6 = %arg4) -> (tensor<?x?xf32>) {
+
+      // Packing the second input operand for all values of IV2 (IV2x6x7).
+      // CHECK:      = linalg.init_tensor [%[[PS0]], 6, 7]
+      // CHECK:      %[[PT1:.*]] = scf.for %[[PIV1:[0-9a-z]+]] =
+      // CHECK:        %[[PIDX1:.*]] = affine.apply #[[DDIV6]](%[[PIV1]])
+      // CHECK:        %[[TS2:.*]] = affine.min #[[MAP1]](%[[PIV1]])[%[[D1]]
+      // CHECK:        %[[TS3:.*]] = affine.min #[[MAP2]](%[[IV1]])[%[[D2]]
+      // CHECK:        %[[T3:.*]] = tensor.extract_slice %[[ARG1]]
+      // CHECK-SAME:                                     %[[PIV1]], %[[IV1]]
+      // CHECK-SAME:                                     %[[TS2]], %[[TS3]]
+      // CHECK:        %[[V2:.*]] = arith.subi %[[C6]], %[[TS2]]
+      // CHECK:        %[[V3:.*]] = arith.subi %[[C7]], %[[TS3]]
+      // CHECK:        %[[T4:.*]] = linalg.pad_tensor %[[T3]] nofold {{.*}} high[%[[V2]], %[[V3]]
+      // CHECK:        %[[T5:.*]] = tensor.insert_slice %[[T4:.*]] into %{{.*}}[%[[PIDX1]], 0, 0]
+      // CHECK:        scf.yield %[[T5:.*]]
+
+      // CHECK: scf.for %[[IV2:[0-9a-zA-Z]*]] = {{.*}} iter_args(%[[ARG4:.*]] =
+      %5 = scf.for %arg7 = %c0 to %1 step %c6 iter_args(%arg8 = %arg6) -> (tensor<?x?xf32>) {
+        %6 = affine.min #map0(%arg3)[%0]
+        %7 = affine.min #map1(%arg7)[%1]
+
+        // Index the packed operands.
+        // CHECK-DAG: %[[IDX:.*]] = affine.apply #[[DDIV6]](%[[IV2]])
+        // CHECK-DAG: %[[T6:.*]] = tensor.extract_slice %[[PT0]][%[[IDX]]
+        // CHECK-DAG: %[[T7:.*]] = tensor.extract_slice %[[PT1]][%[[IDX]]
+        %8 = tensor.extract_slice %arg0[%arg3, %arg7] [%6, %7] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+        %9 = affine.min #map2(%arg5)[%2]
+        %10 = tensor.extract_slice %arg1[%arg7, %arg5] [%7, %9] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+        %11 = tensor.extract_slice %arg8[%arg3, %arg5] [%6, %9] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+
+        // Check matmul uses the packed input operands.
+        // CHECK: = linalg.matmul ins(%[[T6]], %[[T7]]
+        %12 = linalg.matmul {__internal_linalg_transform__ = "pad"} ins(%8, %10 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%11 : tensor<?x?xf32>) -> tensor<?x?xf32>
+        %13 = tensor.insert_slice %12 into %arg8[%arg3, %arg5] [%6, %9] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
+        scf.yield %13 : tensor<?x?xf32>
+      }
+      scf.yield %5 : tensor<?x?xf32>
+    }
+    scf.yield %4 : tensor<?x?xf32>
+  }
+  return %3 : tensor<?x?xf32>
+}
+
+// -----
+
+// CHECK-DOUBLE-DAG: #[[DIV5:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 5)>
+// CHECK-DOUBLE-DAG: #[[DIV6:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 6)>
 #map0 = affine_map<(d0) -> (15, -d0 + 24)>
 #map1 = affine_map<(d0) -> (16, -d0 + 25)>
 #map2 = affine_map<(d0, d1) -> (5, -d0 + d1)>
@@ -99,7 +200,6 @@
 
 // CHECK: double_tiling
 // CHECK-DOUBLE: double_tiling
-
 // CHECK-DOUBLE-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>
 // CHECK-DOUBLE-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32>
 // CHECK-DOUBLE-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>
@@ -114,16 +214,17 @@
   %c5 = arith.constant 5 : index
   %c6 = arith.constant 6 : index
 
-  // Packing the first input operand.
-  // CHECK-DOUBLE: = linalg.init_tensor
-  // CHECK-DOUBLE: = linalg.pad_tensor {{.*}} nofold
-
   // CHECK-DOUBLE: scf.for %[[IV0:[0-9a-zA-Z]*]] =
   %0 = scf.for %arg3 = %c0 to %c24 step %c15 iter_args(%arg4 = %arg2) -> (tensor<24x25xf32>) {
 
-    // Packing the second input operand.
-    // CHECK-DOUBLE: = linalg.init_tensor
-    // CHECK-DOUBLE: = linalg.pad_tensor {{.*}} nofold
+    // Packing the first input operand.
+    // CHECK-DOUBLE: = linalg.init_tensor [3, 5, 12]
+    // CHECK-DOUBLE: %[[PT0:.*]] = scf.for %[[PIV0:[0-9a-z]+]] =
+    // CHECK-DOUBLE:   %[[PIDX0:.*]] = affine.apply #[[DIV5]](%[[PIV0]])
+    // CHECK-DOUBLE:   %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
+    // CHECK-DOUBLE:   %[[T1:.*]] = linalg.pad_tensor %[[T0]] nofold
+    // CHECK-DOUBLE:   %[[T2:.*]] = tensor.insert_slice %[[T1:.*]] into %{{.*}}[%[[PIDX0]], 0, 0]
+    // CHECK-DOUBLE:   scf.yield %[[T2:.*]]
 
     // CHECK-DOUBLE: scf.for %[[IV1:[0-9a-zA-Z]*]] =
     %1 = scf.for %arg5 = %c0 to %c25 step %c16 iter_args(%arg6 = %arg4) -> (tensor<24x25xf32>) {
@@ -131,6 +232,15 @@
       %3 = affine.min #map1(%arg5)
       %4 = tensor.extract_slice %arg6[%arg3, %arg5] [%2, %3] [1, 1] : tensor<24x25xf32> to tensor<?x?xf32>
 
+      // Packing the second input operand.
+      // CHECK-DOUBLE: = linalg.init_tensor [%{{.*}}, 12, 6]
+      // CHECK-DOUBLE: %[[PT1:.*]] = scf.for %[[PIV1:[0-9a-z]+]] =
+      // CHECK-DOUBLE:   %[[PIDX1:.*]] = affine.apply #[[DIV6]](%[[PIV1]])
+      // CHECK-DOUBLE:   %[[T3:.*]] = tensor.extract_slice %[[ARG1]]
+      // CHECK-DOUBLE:   %[[T4:.*]] = linalg.pad_tensor %[[T3]] nofold
+      // CHECK-DOUBLE:   %[[T5:.*]] = tensor.insert_slice %[[T4:.*]] into %{{.*}}[%[[PIDX1]], 0, 0]
+      // CHECK-DOUBLE:   scf.yield %[[T5:.*]]
+
       // CHECK-DOUBLE: scf.for %[[IV2:[0-9a-zA-Z]*]] =
       %5 = scf.for %arg7 = %c0 to %2 step %c5 iter_args(%arg8 = %4) -> (tensor<?x?xf32>) {
 
@@ -138,6 +248,12 @@
       %7 = scf.for %arg9 = %c0 to %3 step %c6 iter_args(%arg10 = %arg8) -> (tensor<?x?xf32>) {
         %8 = affine.min #map2(%arg7, %2)
         %9 = affine.apply #map3(%arg7, %arg3)
+
+        // Index the packed operands.
+        // CHECK-DOUBLE-DAG: %[[IDX0:.*]] = affine.apply #[[DIV5]](%[[IV2]])
+        // CHECK-DOUBLE-DAG: %[[T6:.*]] = tensor.extract_slice %[[PT0]][%[[IDX0]]
+        // CHECK-DOUBLE-DAG: %[[IDX1:.*]] = affine.apply #[[DIV6]](%[[IV3]])
+        // CHECK-DOUBLE-DAG: %[[T7:.*]] = tensor.extract_slice %[[PT1]][%[[IDX1]]
         %10 = tensor.extract_slice %arg0[%9, 0] [%8, 12] [1, 1] : tensor<24x12xf32> to tensor<?x12xf32>
         %11 = affine.min #map4(%arg9, %3)
         %12 = affine.apply #map3(%arg9, %arg5)
@@ -146,9 +262,8 @@
         %15 = affine.min #map4(%arg9, %3)
         %16 = tensor.extract_slice %arg10[%arg7, %arg9] [%14, %15] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
 
-        // Pad the output operand and perform the multiplication.
-        // CHECK-DOUBLE: = linalg.pad_tensor
-        // CHECK-DOUBLE: = linalg.matmul
+        // Check matmul uses the packed input operands.
+        // CHECK-DOUBLE: = linalg.matmul ins(%[[T6]], %[[T7]]
         %17 = linalg.matmul {__internal_linalg_transform__ = "pad"} ins(%10, %13 : tensor<?x12xf32>, tensor<12x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32>
         %18 = tensor.insert_slice %17 into %arg10[%arg7, %arg9] [%14, %15] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
         scf.yield %18 : tensor<?x?xf32>
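---

For readers who want to experiment with the traversal outside of MLIR, below is a minimal, self-contained C++ sketch of the reverse use-def walk that `HoistingAnalysis::getIndexingLoops` performs. It is an illustration under simplifying assumptions, not the MLIR implementation: ops and SSA values are modeled as plain structs and integer ids, all names in it are hypothetical, and a single seed op stands in for the `padTensorOp`/`sliceOp` pair that seeds the walk in the patch.

```cpp
// Illustrative sketch only: models the reverse use-def walk of
// HoistingAnalysis::getIndexingLoops with plain data structures instead of
// the MLIR API. All names below are hypothetical.
#include <algorithm>
#include <iostream>
#include <set>
#include <string>
#include <vector>

struct Op {
  std::string name;
  bool isLoop = false;
  int inductionVar = -1;          // SSA value id of the IV; loops only.
  std::vector<int> results;       // SSA value ids produced by this op.
  std::vector<int> indexOperands; // Operands of index type.
};

// Walks `slice` (topologically sorted, seed op last) backwards: the seed's
// index operands start the edge set; any op whose result (or induction
// variable, for loops) is an index edge contributes its own index operands;
// the loops reached this way are the indexing loops.
std::vector<std::string> getIndexingLoops(const std::vector<Op> &slice) {
  std::set<int> indexEdges(slice.back().indexOperands.begin(),
                           slice.back().indexOperands.end());
  std::vector<std::string> loops;
  for (auto it = slice.rbegin() + 1; it != slice.rend(); ++it) {
    if (it->isLoop && indexEdges.count(it->inductionVar)) {
      indexEdges.insert(it->indexOperands.begin(), it->indexOperands.end());
      loops.push_back(it->name);
      continue;
    }
    if (std::any_of(it->results.begin(), it->results.end(),
                    [&](int r) { return indexEdges.count(r) != 0; }))
      indexEdges.insert(it->indexOperands.begin(), it->indexOperands.end());
  }
  return loops;
}

int main() {
  // Mirrors the doc-comment example: IVs %i=0, %j=1, %k=2; %ubi=3, %ubj=4.
  // The slice op indexes with [%i, %j] [%ubi, %ubj], so %k never becomes an
  // index edge and scf.for %k is not reported.
  std::vector<Op> slice = {
      {"scf.for %i", true, 0, {}, {}},
      {"scf.for %j", true, 1, {}, {}},
      {"scf.for %k", true, 2, {}, {}},
      {"affine.min #map(%i)", false, -1, {3}, {0}},
      {"affine.min #map(%j)", false, -1, {4}, {1}},
      {"tensor.extract_slice", false, -1, {5}, {0, 1, 3, 4}},
  };
  for (const std::string &name : getIndexingLoops(slice))
    std::cout << name << "\n"; // Prints scf.for %j, then scf.for %i.
  return 0;
}
```

Running the sketch reports the two indexing loops and skips `scf.for %k`, matching the `[scf.for %i, scf.for %j]` result in the patch's doc comment; the actual implementation additionally intersects this set with the enclosing loops in program order to populate `packingLoops`.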