diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/HoistPadding.h b/mlir/include/mlir/Dialect/Linalg/Transforms/HoistPadding.h
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/HoistPadding.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/HoistPadding.h
@@ -17,7 +17,7 @@
 namespace linalg {
 class PadTensorOp;
 
-/// Mechanically hoist padding operations on tensors by `nLoops` into a new,
+/// Mechanically hoist padding operations on tensors by `numLoops` into a new,
 /// generally larger tensor. This achieves packing of multiple padding ops into
 /// a larger tensor. On success, `padTensorOp` is replaced by the cloned version
 /// in the packing loop so the caller can continue reasoning about the padding
diff --git a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
@@ -54,7 +54,7 @@
 ///   7. There is no enclosing scf::ForOp that indexes the padded data.
 /// Other cases succeed and will trigger hoisting of the pad op.
 struct HoistingAnalysis {
-  HoistingAnalysis(PadTensorOp padTensorOp, int nLevels);
+  HoistingAnalysis(PadTensorOp padTensorOp, int numLoops);
 
   bool isValid() { return valid; }
 
@@ -62,12 +62,6 @@
   /// `backwardSlice`.
   FailureOr<SmallVector<Value>> getPackedTensorSizes(ImplicitLocOpBuilder &b);
 
-  /// The padTensorOp that needs to be hoisted.
-  PadTensorOp padTensorOp;
-
-  /// The maximum number of immediately enclosing scf::ForOp to hoist over.
-  int nLevels;
-
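-  /// The outermost loop, determined by `nLevels` above which `padTensorOp` will
-  /// be hoisted.
+  /// The outermost loop, determined by the `numLoops` constructor argument,
+  /// above which `padTensorOp` will be hoisted.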
   scf::ForOp outermostEnclosingForOp;
@@ -81,9 +75,7 @@
   ///  2. whose induction variable is used, directly or indirectly, in the
   ///     computation of `padTensorOp`.
   /// The span of these loops determines the footprint of the packed tensor.
-  /// SmallSetVector<scf::ForOp> packingLoops;
-  SetVector<scf::ForOp, SmallVector<scf::ForOp>, DenseSet<Operation *>>
-      packingLoops;
+  SmallVector<scf::ForOp> packingLoops;
 
 private:
   /// Returns the loops in `backwardSlice` used to index the padded data. The
@@ -103,8 +95,8 @@
   ///       %padded_slice = linalg.pad_tensor %slice
   /// ```
   /// getIndexingLoops(%padded_slice, %slice) returns [scf.for %i, scf.for %j]
-  SetVector<Operation *> getIndexingLoops(PadTensorOp padTensorOp,
-                                          tensor::ExtractSliceOp sliceOp);
+  SmallVector<scf::ForOp> getIndexingLoops(PadTensorOp padTensorOp,
+                                           tensor::ExtractSliceOp sliceOp);
 
   /// Encodes whether the analysis is valid and hoisting can proceed.
   bool valid;
@@ -148,10 +140,8 @@
   }
 }
 
-HoistingAnalysis::HoistingAnalysis(PadTensorOp padTensorOp, int nLevels)
-    : padTensorOp(padTensorOp), nLevels(nLevels), valid(false) {
-  AsmState state(padTensorOp->getParentOfType<mlir::FuncOp>());
-  (void)state;
+HoistingAnalysis::HoistingAnalysis(PadTensorOp padTensorOp, int numLoops) {
+  valid = false;
 
   // Bail on any use that isn't an input of a Linalg op.
   // Hoisting of inplace updates happens after vectorization.
@@ -160,7 +150,7 @@
 
-  // Get at most nLevels of immediately enclosing loops.
+  // Get at most `numLoops` immediately enclosing loops.
   SmallVector<scf::ForOp> reverseEnclosingLoops;
-  getAtMostNEnclosingLoops(padTensorOp, nLevels, reverseEnclosingLoops);
+  getAtMostNEnclosingLoops(padTensorOp, numLoops, reverseEnclosingLoops);
   if (reverseEnclosingLoops.empty()) {
     LLVM_DEBUG(DBGS() << "No immediately enclosing loop -> skip\n");
     return;
@@ -216,19 +206,20 @@
   }
 
   // Search the loops found in `backwardSlice` used to index the padded data.
-  SetVector<Operation *> indexingLoops = getIndexingLoops(padTensorOp, sliceOp);
+  SmallVector<scf::ForOp> indexingLoops =
+      getIndexingLoops(padTensorOp, sliceOp);
 
   // Add only the loops part of `indexingLoops` to the packing loops. All other
   // loops are not used to index the padded data and consequently access the
   // same data in every loop iteration. Adding them to the packing loops would
   // increase the cache footprint of the packed data by storing the same data
   // multiple times.
-  for (scf::ForOp forOp : llvm::reverse(reverseEnclosingLoops)) {
-    if (indexingLoops.contains(forOp))
-      packingLoops.insert(forOp);
-  }
-  assert(indexingLoops.size() == packingLoops.size() &&
+  for (scf::ForOp forOp : llvm::reverse(reverseEnclosingLoops))
+    if (!indexingLoops.empty() && indexingLoops.back() == forOp)
+      packingLoops.push_back(indexingLoops.pop_back_val());
+  assert(indexingLoops.empty() &&
          "expect the all indexing loops are enclosing loops");
+
   if (packingLoops.empty()) {
     LLVM_DEBUG(DBGS() << "Cannot find a packing loop -> skip\n");
     return;
@@ -247,7 +238,7 @@
       indexEdges.insert(operand);
 }
 
-SetVector<Operation *>
+SmallVector<scf::ForOp>
 HoistingAnalysis::getIndexingLoops(PadTensorOp padTensorOp,
                                    tensor::ExtractSliceOp sliceOp) {
   // Set of all values used for index computation.
@@ -272,7 +263,7 @@
   // After iterating `backwardSlice` we obtain:
   // indexEdges = [%i, %j, %ubi, %ubj]
   // indexingLoops = [scf.for %i, scf.for %j]
-  SetVector<Operation *> indexingLoops;
+  SmallVector<scf::ForOp> indexingLoops;
   for (Operation *op : llvm::reverse(backwardSlice)) {
     // Add the index operands of `padTensorOp` and `sliceOp` to start the
     // exploration of the index computation.
@@ -286,7 +277,7 @@
     if (auto forOp = dyn_cast<scf::ForOp>(op)) {
       if (indexEdges.contains(forOp.getInductionVar())) {
         addIndexOperandsToIndexEdges(op, indexEdges);
-        indexingLoops.insert(forOp);
+        indexingLoops.push_back(forOp);
         continue;
       }
     }
@@ -442,7 +433,7 @@
 
   // Iteratively try to fold the upper bounds into the constraints set.
   if (failed(foldUpperBoundsIntoConstraintsSet(
-          constraints, outermostEnclosingForOp, packingLoops.getArrayRef())))
+          constraints, outermostEnclosingForOp, packingLoops)))
     return failure();
 
   int nPackedLoops = packingLoops.size();
@@ -577,7 +568,7 @@
     auto forOp = dyn_cast<scf::ForOp>(op);
     assert(forOp && "Expected scf::ForOp when hoisting pad ops");
     // Unused loop, just skip it.
-    if (!analysis.packingLoops.contains(forOp))
+    if (!llvm::is_contained(analysis.packingLoops, forOp))
       continue;
 
     auto clonedForOp =