diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
--- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
@@ -794,9 +794,51 @@
 // HoistPadOp
 //===----------------------------------------------------------------------===//
 
+def HoistPadBuildPackingLoopNestOp :
+    Op<Transform_Dialect,
+       "structured.hoist_pad.build_packing_loop_nest",
+       [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
+        DeclareOpInterfaceMethods<TransformOpInterface>]> {
+  let description = [{
+    Helper transform used to hoist a tensor.pad target operation. This operation
+    creates the packing loop nest required by the hoist_pad operation and makes
+    that functionality available independently.
+
+    TODO: In the future, we should consider rewriting as a tensor.pack after
+    hoisting since this abstraction is now available.
+
+    #### Return modes
+
+    This operation ignores non-tensor.pad ops and drops them in the result.
+    If any non-tensor.pad is passed, the transform emits a silenceable failure.
+
+    The return handle points to only the subset of successfully created packing
+    loop nests, which can be empty.
+  }];
+
+  // Also allow any !pdl.operation for simpler composition. Non-tensor.pad ops
+  // will be dropped from the results.
+  let arguments =
+    (ins TransformHandleTypeInterface:$target,
+         TransformHandleTypeInterface:$loop,
+         DefaultValuedAttr<DenseI64ArrayAttr, "{}">:$transpose);
+  let results = (outs TransformHandleTypeInterface:$packing_loop);
+
+  let assemblyFormat = [{
+    $target
+    `above` $loop
+    (`,` `transpose` `by` $transpose^)?
+    attr-dict
+    `:` functional-type(operands, results)
+  }];
+  let hasVerifier = 1;
+}
+
 def HoistPadOp : Op<Transform_Dialect, "structured.hoist_pad",
-    [FunctionalStyleTransformOpTrait, MemoryEffectsOpInterface,
-     TransformOpInterface, TransformEachOpTrait]> {
+    [FunctionalStyleTransformOpTrait,
+     MemoryEffectsOpInterface,
+     TransformOpInterface,
+     TransformEachOpTrait]> {
   let description = [{
     Hoist the tensor.pad target operation by at most the given number of loops.
     Optionally apply the transpose attribute to the inner dimensions.
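For orientation before the C++ changes below: the declared assembly format corresponds to transform IR of roughly the following shape. This is an illustrative sketch only; `%pad` and `%loop` are placeholder handles, and the two operand types follow `functional-type(operands, results)`:

```mlir
// %pad should resolve to a single tensor.pad payload op and %loop to the
// scf.for to hoist above; the `transpose by` clause is optional and must be
// a permutation of the padded tensor's rank.
%packing_loop = transform.structured.hoist_pad.build_packing_loop_nest
    %pad above %loop, transpose by [1, 0]
  : (!pdl.operation, !pdl.operation) -> !pdl.operation
```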
diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -362,6 +362,25 @@
                   ArrayRef<Attribute> paddingValues,
                   ArrayRef<bool> packPaddings, LinalgOp &paddedOp);
 
+namespace detail {
+
+/// Helper struct to hold the results of building a packing loop nest.
+struct PackingLoopNestResult {
+  SmallVector<OpFoldResult> offsets, sizes, strides;
+  SmallVector<Value> clonedLoopIvs, leadingPackedTensorIndexings;
+  GenericOp maybeTransposeOp;
+};
+
+/// Build the packing loop nest required to hoist `opToHoist` above
+/// `outermostEnclosingForOp`.
+/// The loop nest is built just before `outermostEnclosingForOp`.
+FailureOr<PackingLoopNestResult>
+buildPackingLoopNest(RewriterBase &rewriter, tensor::PadOp opToHoist,
+                     scf::ForOp outermostEnclosingForOp,
+                     ArrayRef<int64_t> transposeVector);
+
+} // namespace detail
+
 /// Mechanically hoist padding operations on tensors by `numLoops` into a new,
 /// generally larger tensor. This achieves packing of multiple padding ops into
 /// a larger tensor. On success, `opToHoist` is replaced by the cloned version
diff --git a/mlir/include/mlir/Dialect/SCF/Utils/Utils.h b/mlir/include/mlir/Dialect/SCF/Utils/Utils.h
--- a/mlir/include/mlir/Dialect/SCF/Utils/Utils.h
+++ b/mlir/include/mlir/Dialect/SCF/Utils/Utils.h
@@ -54,6 +54,18 @@
     ValueRange newIterOperands, const NewYieldValueFn &newYieldValuesFn,
     bool replaceIterOperandsUsesInLoop = true);
 
+// Simpler API if the new yields are just a list of values that can be
+// determined ahead of time.
+inline scf::ForOp
+replaceLoopWithNewYields(OpBuilder &builder, scf::ForOp loop,
+                         ValueRange newIterOperands, ValueRange newYields,
+                         bool replaceIterOperandsUsesInLoop = true) {
+  auto fn = [&](OpBuilder &b, Location loc,
+                ArrayRef<BlockArgument> newBBArgs) {
+    return SmallVector<Value>(newYields.begin(), newYields.end());
+  };
+  return replaceLoopWithNewYields(builder, loop, newIterOperands, fn,
+                                  replaceIterOperandsUsesInLoop);
+}
 
 /// Update a perfectly nested loop nest to yield new values from the innermost
 /// loop and propagating it up through the loop nest. This function
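A sketch of what the new `replaceLoopWithNewYields` convenience overload does at the IR level (hedged; the SSA names `%extra`, `%init`, etc. are invented). Because the callback ignores the new block arguments and returns `newYields` directly, the new iter_arg is passed through untouched:

```mlir
// Before: replaceLoopWithNewYields(builder, loop, {%extra}, {%extra}).
%r = scf.for %i = %lb to %ub step %s iter_args(%acc = %init) -> (f32) {
  %next = arith.addf %acc, %acc : f32
  scf.yield %next : f32
}

// After: the loop is rebuilt with one more iter_arg; the new yield is the
// ahead-of-time value %extra, not the new block argument %e.
%r:2 = scf.for %i = %lb to %ub step %s
    iter_args(%acc = %init, %e = %extra) -> (f32, f32) {
  %next = arith.addf %acc, %acc : f32
  scf.yield %next, %extra : f32, f32
}
```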
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -1757,6 +1757,52 @@
 // HoistPadOp
 //===---------------------------------------------------------------------===//
 
+DiagnosedSilenceableFailure transform::HoistPadBuildPackingLoopNestOp::apply(
+    transform::TransformResults &transformResults,
+    transform::TransformState &state) {
+  ArrayRef<Operation *> targetOps = state.getPayloadOps(getTarget());
+  ArrayRef<Operation *> loopOps = state.getPayloadOps(getLoop());
+  if (targetOps.size() != 1 || loopOps.size() != 1) {
+    return emitDefiniteFailure()
+           << "requires exactly one target and one loop handle (got "
+           << targetOps.size() << " and " << loopOps.size() << ")";
+  }
+
+  auto padOp = dyn_cast_or_null<tensor::PadOp>(targetOps.front());
+  auto loopOp = dyn_cast_or_null<scf::ForOp>(loopOps.front());
+  if (!padOp || !loopOp)
+    return emitSilenceableError() << "requires exactly 2 non-null handles";
+
+  IRRewriter rewriter(getContext());
+  FailureOr<linalg::detail::PackingLoopNestResult> result =
+      linalg::detail::buildPackingLoopNest(rewriter, padOp, loopOp,
+                                           getTranspose());
+  if (failed(result))
+    return emitDefiniteFailure() << "could not build packing loop nest";
+
+  auto outerPackedLoop =
+      scf::getForInductionVarOwner(result->clonedLoopIvs.front());
+  transformResults.set(getPackingLoop().cast<OpResult>(),
+                       outerPackedLoop.getOperation());
+  return DiagnosedSilenceableFailure::success();
+}
+
+LogicalResult transform::HoistPadBuildPackingLoopNestOp::verify() {
+  ArrayRef<int64_t> transpose = getTranspose();
+  auto sequence = llvm::to_vector(llvm::seq<int64_t>(0, transpose.size()));
+  if (!std::is_permutation(sequence.begin(), sequence.end(), transpose.begin(),
+                           transpose.end())) {
+    return emitOpError() << "expects transpose to be a permutation, found "
+                         << getTranspose();
+  }
+  return success();
+}
+
+void transform::HoistPadBuildPackingLoopNestOp::getEffects(
+    SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
+  transform::onlyReadsHandle(getTarget(), effects);
+  transform::onlyReadsHandle(getLoop(), effects);
+  transform::producesHandle(getPackingLoop(), effects);
+  transform::modifiesPayload(effects);
+}
+
 DiagnosedSilenceableFailure
 transform::HoistPadOp::applyToOne(tensor::PadOp target,
                                   transform::ApplyToEachResultList &results,
diff --git a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
@@ -17,9 +17,11 @@
 #include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/SCF/Utils/Utils.h"
 #include "mlir/Dialect/Tensor/Utils/Utils.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/IR/AsmState.h"
+#include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Dominance.h"
 #include "mlir/IR/Matchers.h"
 #include "mlir/Transforms/RegionUtils.h"
@@ -33,6 +35,7 @@
 
 using namespace mlir;
 using namespace mlir::linalg;
+using namespace mlir::linalg::detail;
 
 #ifndef NDEBUG
 static bool debugPrintLoopInShortForm(Operation *op) {
@@ -61,6 +64,73 @@
              DBGS() << "\n";);
 }
 
+/// Return at most nLevels of immediately enclosing scf::ForOp loops.
+/// Stops at the first parent that is not an scf::ForOp.
+/// Multi-loops such as scf.parallel or linalg.tiled_loop are not modeled atm.
+/// Control-flow and other containing ops with regions are not modeled atm.
+static void
+getAtMostNEnclosingLoops(tensor::PadOp padOp, int nLevels,
+                         SmallVector<scf::ForOp> &reverseEnclosingLoops) {
+  scf::ForOp outermostEnclosingForOp = nullptr;
+  Operation *nextEnclosingOp = padOp->getParentOp();
+  while (nLevels-- > 0 &&
+         (outermostEnclosingForOp = dyn_cast<scf::ForOp>(nextEnclosingOp))) {
+    LLVM_DEBUG(DBGS() << "loops: ";
+               debugPrintLoopInShortForm(outermostEnclosingForOp);
+               dbgs() << "\n");
+    reverseEnclosingLoops.push_back(outermostEnclosingForOp);
+    nextEnclosingOp = outermostEnclosingForOp->getParentOp();
+  }
+}
+
+/// Return the immediately enclosing scf::ForOp loops of `padOp`, up to and
+/// including `untilLoop`.
+/// Stops at the first parent that is not an scf::ForOp.
+/// Multi-loops such as scf.parallel or linalg.tiled_loop are not modeled atm.
+/// Control-flow and other containing ops with regions are not modeled atm.
+static void
+getEnclosingLoopsUntil(tensor::PadOp padOp, scf::ForOp untilLoop,
+                       SmallVector<scf::ForOp> &reverseEnclosingLoops) {
+  scf::ForOp outermostEnclosingForOp = nullptr;
+  Operation *nextEnclosingOp = padOp->getParentOp();
+  while (outermostEnclosingForOp != untilLoop &&
+         (outermostEnclosingForOp = dyn_cast<scf::ForOp>(nextEnclosingOp))) {
+    LLVM_DEBUG(DBGS() << "loops: ";
+               debugPrintLoopInShortForm(outermostEnclosingForOp);
+               dbgs() << "\n");
+    reverseEnclosingLoops.push_back(outermostEnclosingForOp);
+    nextEnclosingOp = outermostEnclosingForOp->getParentOp();
+  }
+}
+
+// Get all the ops in the backwards slice starting from `padOp` and that
+// are dominated by the outermost enclosing loop.
+// This also requires tracking ops defining values used in the region but
+// defined above.
+static void computeBackwardSlice(tensor::PadOp padOp,
+                                 scf::ForOp outermostEnclosingForOp,
+                                 SetVector<Operation *> &backwardSlice) {
+  DominanceInfo domInfo(outermostEnclosingForOp);
+  auto filter = [&](Operation *op) {
+    return domInfo.dominates(outermostEnclosingForOp, op) &&
+           !padOp->isProperAncestor(op);
+  };
+  // First, add the ops required to compute the region to the backwardSlice.
+  SetVector<Value> valuesDefinedAbove;
+  getUsedValuesDefinedAbove(padOp.getRegion(), padOp.getRegion(),
+                            valuesDefinedAbove);
+  for (Value v : valuesDefinedAbove) {
+    getBackwardSlice(v, &backwardSlice, filter, /*inclusive=*/true);
+  }
+  // Then, add the backward slice from padOp itself.
+  getBackwardSlice(padOp.getOperation(), &backwardSlice, filter,
+                   /*inclusive=*/true);
+}
+
+//===----------------------------------------------------------------------===//
+// HoistPaddingAnalysis Implementation.
+//===----------------------------------------------------------------------===//
+
+namespace {
+
 /// Analysis class to support tensor::PadOp hoisting across multiple enclosing
 /// loops. The failure conditions are:
 ///   1. Pad op has a use that is not an input of a LinalgOp.
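To make the difference between the two traversals above concrete, here is a loose sketch in the style of the IR examples used elsewhere in this file (names invented). For the nest below, `getAtMostNEnclosingLoops(%padded_slice, /*nLevels=*/2, ...)` records the `%k` and `%j` loops, while `getEnclosingLoopsUntil(%padded_slice, %i-loop, ...)` keeps walking and records the `%i` loop as well. Both record loops innermost-first, which is why the out-parameter is called `reverseEnclosingLoops`:

```mlir
scf.for %i        // reached only by getEnclosingLoopsUntil(.., %i-loop, ..)
  scf.for %j      // recorded second by both traversals
    scf.for %k    // recorded first: the walk starts at the pad's parent
      %slice = tensor.extract_slice %source [%i, %j]
      %padded_slice = tensor.pad %slice
```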
@@ -76,36 +146,47 @@
 ///   the outermost enclosing scf::ForOp.
 ///   8. There is no enclosing scf::ForOp that indexes the padded data.
 /// Other cases succeed and will trigger hoisting of the pad op.
-struct HoistingAnalysis {
-  HoistingAnalysis(tensor::PadOp padOp, int numLoops);
+struct HoistPaddingAnalysis {
+  HoistPaddingAnalysis(tensor::PadOp padOp, int numLoops);
+  HoistPaddingAnalysis(tensor::PadOp padOp, scf::ForOp outermostEnclosingForOp);
 
   bool isValid() { return valid; }
 
   /// Footprint of the packedTensor, computed from the packingLoops.
-  SmallVector<Value> getPackedTensorSizes(RewriterBase &b, Location loc);
-
-  /// The outermost loop, determined by `nLevels` above which `padOp` will
-  /// be hoisted.
-  scf::ForOp outermostEnclosingForOp;
+  SmallVector<Value> getPackedTensorSizes(RewriterBase &rewriter,
+                                          Location loc) const;
 
-  /// Backward slice rooted at `padOp` and nested under
-  /// `outermostEnclosingForOp`.
-  SetVector<Operation *> backwardSlice;
+  /// Performs optional hoisting to enable hoist padding to occur. This may be
+  /// necessary when `sliceOp` is not defined outside of the outermost
+  /// enclosing loop we want to hoist above.
+  ///
+  /// Example:
+  /// ```
+  ///    %source = linalg.fill(%cst, %arg0)
+  ///    // %source is available for packing here!
+  ///    scf.for %i
+  ///      scf.for %j
+  ///        scf.for %k
+  ///          %slice = tensor.extract_slice %source [%i, %j]
+  ///          %padded_slice = tensor.pad %slice
+  /// ```
+  void enableHoistPadding(RewriterBase &rewriter);
 
-  /// The scf::ForOp immediately enclosing `padOp` such that:
-  ///   1. they are nested under `outermostEnclosingForOp` (inclusive)
-  ///   2. whose induction variable is used, directly or indirectly, in the
-  ///      computation of `padOp`.
-  /// The span of these loops determines the footprint of the packed tensor.
-  SmallVector<scf::ForOp> packingLoops;
+  /// Common analysis builder to finalize the construction of the analysis
+  /// once optional `enableHoistPadding` has run.
+  /// `reverseEnclosingLoops.back()` is the loop to hoist above.
+  void finalizeHoistPaddingAnalysis();
+
+private:
+  /// Encodes whether the analysis is valid and hoisting can proceed.
+  bool valid;
 
-  /// The ExtractSliceOp that feeds the PadOp we want to hoist.
-  tensor::ExtractSliceOp sliceOp;
+  /// The padOp to hoist.
+  tensor::PadOp opToHoist;
 
-  /// If non-empty, this is the unique scf::ForOp that consumes the `sliceOp`.
-  scf::ForOp padConsumingForOp;
+  /// Immediately enclosing loops considered for hoisting padding.
+  SmallVector<scf::ForOp> reverseEnclosingLoops;
 
-private:
   /// Drop any non-index dependencies of `padOp` and `sliceOp` from
   /// `backwardSlice`. The method follows the use-def chains of the index
   /// operands consumed by `padOp` and `sliceOp` and drops the operations
@@ -130,96 +211,79 @@
   /// ```
   /// dropNonIndexDependencies(%padded_slice, %slice)
   /// removes [scf.for %k, linalg.fill(%cst, %arg1)] from backwardSlice.
-  LogicalResult dropNonIndexDependencies(tensor::PadOp padOp);
+  LogicalResult dropNonIndexDependencies();
 
-  /// Encodes whether the analysis is valid and hoisting can proceed.
-  bool valid;
-};
+public:
+  /// The outermost loop, determined by `nLevels` above which `padOp` will
+  /// be hoisted.
+  scf::ForOp outermostEnclosingForOp;
 
-/// Return at most nLevels of immediately enclosing scf::ForOp loops.
-/// Stops at the first parent that is not an scf::ForOp.
-/// Multi-loops such as scf.parallel or linalg.tiled_loop are not modeled atm.
-/// Control-flow and other containing ops with regions are not modeled atm.
-static void
-getAtMostNEnclosingLoops(tensor::PadOp padOp, int nLevels,
-                         SmallVector<scf::ForOp> &reverseEnclosingLoops) {
-  scf::ForOp outermostEnclosingForOp = nullptr;
-  Operation *nextEnclosingOp = padOp->getParentOp();
-  while (nLevels-- > 0 &&
-         (outermostEnclosingForOp = dyn_cast<scf::ForOp>(nextEnclosingOp))) {
-    LLVM_DEBUG(DBGS() << "loops: ";
-               debugPrintLoopInShortForm(outermostEnclosingForOp);
-               dbgs() << "\n");
-    reverseEnclosingLoops.push_back(outermostEnclosingForOp);
-    nextEnclosingOp = outermostEnclosingForOp->getParentOp();
-  }
-}
+  /// Backward slice rooted at `padOp` and nested under
+  /// `outermostEnclosingForOp`.
+  SetVector<Operation *> backwardSlice;
 
-// Get all the ops in the backwards slice starting from `padOp` and that
-// are dominated by the outermost enclosing loop.
-// This also requires tracking ops defining values used in the region but
-// defined above.
-static void computeBackwardSlice(tensor::PadOp padOp,
-                                 scf::ForOp outermostEnclosingForOp,
-                                 SetVector<Operation *> &backwardSlice) {
-  DominanceInfo domInfo(outermostEnclosingForOp);
-  auto filter = [&](Operation *op) {
-    return domInfo.dominates(outermostEnclosingForOp, op) &&
-           !padOp->isProperAncestor(op);
-  };
-  // First, add the ops required to compute the region to the backwardSlice.
-  SetVector<Value> valuesDefinedAbove;
-  getUsedValuesDefinedAbove(padOp.getRegion(), padOp.getRegion(),
-                            valuesDefinedAbove);
-  for (Value v : valuesDefinedAbove) {
-    getBackwardSlice(v, &backwardSlice, filter, /*inclusive=*/true);
-  }
-  // Then, add the backward slice from padOp itself.
-  getBackwardSlice(padOp.getOperation(), &backwardSlice, filter,
-                   /*inclusive=*/true);
-}
+  /// The scf::ForOp immediately enclosing `padOp` such that:
+  ///   1. they are nested under `outermostEnclosingForOp` (inclusive)
+  ///   2. whose induction variable is used, directly or indirectly, in the
+  ///      computation of `padOp`.
+  /// The span of these loops determines the footprint of the packed tensor.
+  SmallVector<scf::ForOp> packingLoops;
+
+  /// The ExtractSliceOp that feeds the PadOp we want to hoist.
+  tensor::ExtractSliceOp sliceOp;
+
+  /// If non-empty, this is the unique scf::ForOp that consumes the `sliceOp`.
+  scf::ForOp padConsumingForOp;
+};
 
-HoistingAnalysis::HoistingAnalysis(tensor::PadOp padOp, int numLoops) {
-  valid = false;
+} // namespace
 
+HoistPaddingAnalysis::HoistPaddingAnalysis(tensor::PadOp padOp, int numLoops)
+    : valid(false), opToHoist(padOp) {
   // Get at most `numLoops` of immediately enclosing loops.
-  SmallVector<scf::ForOp> reverseEnclosingLoops;
-  getAtMostNEnclosingLoops(padOp, numLoops, reverseEnclosingLoops);
+  getAtMostNEnclosingLoops(opToHoist, numLoops, reverseEnclosingLoops);
   if (reverseEnclosingLoops.empty()) {
     LLVM_DEBUG(DBGS() << "--No immediately enclosing loop -> Skip\n");
     return;
   }
 
-  outermostEnclosingForOp = reverseEnclosingLoops.back();
-
-  // Get the `sliceOp` that defines the source tensor of `padOp` and
-  // check its source is defined outside of the outermost loop. This check
-  // ensures the padded data is available for packing before entering the
-  // outermost enclosing loop.
-  //
-  // Example:
-  // ```
-  // %source = linalg.fill(%cst, %arg0)
-  // // %source is available for packing here!
-  // scf.for %i
-  //   scf.for %j
-  //     scf.for %k
-  //       %slice = tensor.extract_slice %source [%i, %j]
-  //       %padded_slice = tensor.pad %slice
-  // ```
-  sliceOp = padOp.getSource().getDefiningOp<tensor::ExtractSliceOp>();
+  outermostEnclosingForOp = reverseEnclosingLoops.back();
+  sliceOp = opToHoist.getSource().getDefiningOp<tensor::ExtractSliceOp>();
   if (!sliceOp) {
     LLVM_DEBUG(DBGS() << "--Cannot find the extract slice op -> Skip\n");
     return;
   }
+}
+
+HoistPaddingAnalysis::HoistPaddingAnalysis(tensor::PadOp padOp,
+                                           scf::ForOp outermostEnclosingForOp)
+    : valid(false), opToHoist(padOp) {
+  // Get enclosing loops until outermostEnclosingForOp.
+  getEnclosingLoopsUntil(padOp, outermostEnclosingForOp,
+                         reverseEnclosingLoops);
+  if (reverseEnclosingLoops.empty()) {
+    LLVM_DEBUG(DBGS() << "--No immediately enclosing loop -> Skip\n");
+    return;
+  }
+  this->outermostEnclosingForOp = reverseEnclosingLoops.back();
+  if (this->outermostEnclosingForOp != outermostEnclosingForOp) {
+    LLVM_DEBUG(DBGS() << "--Unexpected outermost enclosing loop -> Skip\n");
+    return;
+  }
+}
+
+void HoistPaddingAnalysis::enableHoistPadding(RewriterBase &rewriter) {
   // If the padded data is not yet available before entering the outermost
   // enclosing loop, try to apply hoisting on this outermost loop.
-  // TODO: we may want finer-grained hoisting of only that particular `sliceOp`.
-  IRRewriter rewriter(outermostEnclosingForOp->getContext());
+  // TODO: we may want finer-grained hoisting of only that particular
+  // `sliceOp`.
   if (!outermostEnclosingForOp.isDefinedOutsideOfLoop(sliceOp.getSource())) {
     outermostEnclosingForOp =
         hoistRedundantSubsetExtractInsert(rewriter, outermostEnclosingForOp);
   }
+}
+
+void HoistPaddingAnalysis::finalizeHoistPaddingAnalysis() {
   if (!outermostEnclosingForOp.isDefinedOutsideOfLoop(sliceOp.getSource())) {
     LLVM_DEBUG(DBGS() << "--outermostEnclosingForOp:\n"
                       << outermostEnclosingForOp << "\n"
@@ -236,14 +300,14 @@
   // Check the region of `padOp` depends on a constant only. Adding hoisting
   // support for arbitrary padding regions would require cloning all
   // dependencies captured by the padding region.
-  Value paddingValue = padOp.getConstantPaddingValue();
+  Value paddingValue = opToHoist.getConstantPaddingValue();
   if (!paddingValue ||
       !isa_and_nonnull<arith::ConstantOp>(paddingValue.getDefiningOp())) {
     LLVM_DEBUG(DBGS() << "Cannot find constant padding value -> Skip\n");
     return;
   }
 
-  computeBackwardSlice(padOp, outermostEnclosingForOp, backwardSlice);
+  computeBackwardSlice(opToHoist, outermostEnclosingForOp, backwardSlice);
   if (backwardSlice.size() <= 1)
     return;
 
@@ -251,7 +315,7 @@
   // Remove all ops in the backward slice that are not used to index
   // the padded tensor. In particular, keep `padOp`, `sliceOp`, and
   // the loop and affine operations used for the index computation.
-  if (failed(dropNonIndexDependencies(padOp))) {
+  if (failed(dropNonIndexDependencies())) {
     LLVM_DEBUG(DBGS() << "--Cannot dropNonIndexDependencies -> Skip\n");
     return;
   }
@@ -281,7 +345,8 @@
   valid = true;
 }
 
-LogicalResult HoistingAnalysis::dropNonIndexDependencies(tensor::PadOp padOp) {
+LogicalResult
+HoistPaddingAnalysis::dropNonIndexDependencies() {
   // Set of all values used for index computation.
   SetVector<Value> indexEdges;
 
@@ -300,7 +365,7 @@
     });
   };
 
-  // Starting from `padOp` and `sliceOp` walk the use-def edges of index
+  // Starting from `opToHoist` and `sliceOp` walk the use-def edges of index
   // type in `backwardSlice`. Add the index operands of an operation to
   // `indexEdges` and remove all operations from `backwardSlice` that are not
   // part of the index computation.
@@ -322,9 +387,9 @@
   //      backwardSlice = backwardSlice / [linalg.fill(%cst, %arg1), scf.for %k]
   SetVector<Operation *> operationsToRemove;
   for (Operation *op : llvm::reverse(backwardSlice)) {
-    // Add the index operands of `padOp` and `sliceOp` to start the
+    // Add the index operands of `opToHoist` and `sliceOp` to start the
     // exploration of the index computation.
-    if (op == padOp || op == sliceOp) {
+    if (op == opToHoist || op == sliceOp) {
       addIndexOperandsToIndexEdges(op);
       continue;
     }
@@ -358,7 +423,7 @@
       continue;
     }
     // Remove all other operations not used by the index computation. An
-    // exception are constant operations that may be used by `padOp`.
+    // exception are constant operations that may be used by `opToHoist`.
     if (!isa<arith::ConstantOp>(op))
       operationsToRemove.insert(op);
   }
@@ -367,7 +432,8 @@
 }
 
 SmallVector<Value>
-HoistingAnalysis::getPackedTensorSizes(RewriterBase &rewriter, Location loc) {
+HoistPaddingAnalysis::getPackedTensorSizes(RewriterBase &rewriter,
+                                           Location loc) const {
   SmallVector<Value> dynamicTensorSizes;
 
   // Upper bound the packing loop lengths to size the packed tensor. Taking
@@ -403,6 +469,10 @@
   return outer.isDefinedOutsideOfLoop(v) || matchPattern(v, m_Constant());
 }
 
+//===----------------------------------------------------------------------===//
+// buildPackingLoopNest Implementation.
+//===----------------------------------------------------------------------===//
+
 /// Return the current iteration number in the loop (iv - lb).ceilDiv(step).
 /// The returned Value is guaranteed not to depend on any loop comprised in
 /// [`outer`, `forOp`].
@@ -423,12 +493,6 @@
       loc, (iv - lb).ceilDiv(step), ValueRange{ivVal, lbVal, stepVal});
 }
 
-struct PackingLoopNestResult {
-  SmallVector<OpFoldResult> offsets, sizes, strides;
-  SmallVector<Value> clonedLoopIvs, leadingPackedTensorIndexings;
-  GenericOp maybeTransposeOp;
-};
-
 // Build a packing loop nest by iteratively traversing the backward slice and
 // clone the operations, iteratively stepping into the loops that we encounter.
 // The implementation proceeds in a stack-like fashion:
@@ -439,10 +503,10 @@
 //   3. At the innermost loop level, create a InsertSliceOp.
 //   4. Iteratively pop and yield the result of the InsertSliceOp across the
 //      cloned loops.
-static PackingLoopNestResult buildPackingLoopNest(
+static PackingLoopNestResult buildPackingLoopNestImpl(
     RewriterBase &rewriter, IRMapping &bvm, tensor::PadOp opToHoist,
     ArrayRef<int64_t> transposeVector, RankedTensorType transposedTensorType,
-    tensor::EmptyOp emptyOp, const HoistingAnalysis &analysis) {
+    tensor::EmptyOp emptyOp, const HoistPaddingAnalysis &analysis) {
   SmallVector<OpFoldResult> offsets, sizes, strides;
   SmallVector<Value> clonedLoopIvs, leadingPackedTensorIndexings;
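For intuition, the nest produced by `buildPackingLoopNestImpl` for a single packing loop and no transpose looks roughly as follows. This is a loose sketch in the style of the in-file IR examples, with shapes borrowed from the `pad_and_hoist_lhs` test added further down; it is not verbatim output:

```mlir
%empty = tensor.empty() : tensor<5x5x12xf32>
%packed = scf.for %i ... iter_args(%iter = %empty) -> (tensor<5x5x12xf32>) {
  // Cloned backward slice computing the slice to pad.
  %slice = tensor.extract_slice %source [%i, 0] ... to tensor<?x12xf32>
  %padded = tensor.pad %slice ... : tensor<?x12xf32> to tensor<5x12xf32>
  // Leading offset is the iteration number of %i, i.e. (iv - lb).ceilDiv(step).
  %inserted = tensor.insert_slice %padded into %iter[%iteration, 0, 0]
      [1, 5, 12] [1, 1, 1] : tensor<5x12xf32> into tensor<5x5x12xf32>
  scf.yield %inserted : tensor<5x5x12xf32>
}
```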
@@ -552,6 +616,73 @@
                                  maybeTransposeOp};
 }
 
+/// Build the packing loop nest required to hoist `opToHoist` above
+/// `outermostEnclosingForOp`.
+/// The loop nest is built just before `outermostEnclosingForOp`.
+static FailureOr<PackingLoopNestResult> buildPackingLoopNestImpl(
+    RewriterBase &rewriter, IRMapping &bvm, tensor::PadOp opToHoist,
+    ArrayRef<int64_t> transposeVector, const HoistPaddingAnalysis &analysis) {
+  // Update actual number of loops, which may be smaller.
+  int nPackedLoops = analysis.packingLoops.size();
+  LLVM_DEBUG(DBGS() << "\n";
+             DBGS() << "Func:\n"
+                    << *opToHoist->getParentOfType<func::FuncOp>() << "\n";
+             DBGS() << "Start hoisting above " << nPackedLoops << " loops\n");
+
+  Location loc = opToHoist->getLoc();
+  RankedTensorType paddedTensorType = opToHoist.getResultType();
+
+  // Compute the type of the transposed padded tensor.
+  FailureOr<RankedTensorType> transposedTensorType =
+      tensor::computeTransposedType(paddedTensorType, transposeVector);
+  if (failed(transposedTensorType)) {
+    LLVM_DEBUG(DBGS() << "--Could not compute transposed type -> Skip\n");
+    return failure();
+  }
+
+  // Create the packed tensor.
+  SmallVector<int64_t> packedShape(nPackedLoops, ShapedType::kDynamic);
+  // TODO: go grab dims when needed, atm tensor::PadOp yields a static tensor.
+  llvm::append_range(packedShape, transposedTensorType->getShape());
+  auto packedTensorType = RankedTensorType::get(
+      packedShape, transposedTensorType->getElementType());
+
+  // Set the insertion point right before the outer loop and start packing.
+  scf::ForOp outerLoop = analysis.outermostEnclosingForOp;
+  OpBuilder::InsertionGuard g(rewriter);
+  rewriter.setInsertionPoint(outerLoop);
+  SmallVector<Value> dynamicTensorSizes =
+      analysis.getPackedTensorSizes(rewriter, loc);
+  auto emptyOp = rewriter.create<tensor::EmptyOp>(
+      loc, packedTensorType.getShape(), packedTensorType.getElementType(),
+      dynamicTensorSizes);
+
+  return buildPackingLoopNestImpl(rewriter, bvm, opToHoist, transposeVector,
+                                  *transposedTensorType, emptyOp, analysis);
+}
+
+/// Build the packing loop nest required to hoist `opToHoist` above
+/// `outermostEnclosingForOp`.
+/// The loop nest is built just before `outermostEnclosingForOp`.
+FailureOr<PackingLoopNestResult> mlir::linalg::detail::buildPackingLoopNest(
+    RewriterBase &rewriter, tensor::PadOp opToHoist,
+    scf::ForOp outermostEnclosingForOp, ArrayRef<int64_t> transposeVector) {
+  HoistPaddingAnalysis analysis(opToHoist, outermostEnclosingForOp);
+  analysis.enableHoistPadding(rewriter);
+  analysis.finalizeHoistPaddingAnalysis();
+  if (!analysis.isValid()) {
+    LLVM_DEBUG(DBGS() << "--Analysis failed -> Skip\n");
+    return failure();
+  }
+  IRMapping bvm;
+  return buildPackingLoopNestImpl(rewriter, bvm, opToHoist, transposeVector,
+                                  analysis);
+}
+
+//===----------------------------------------------------------------------===//
+// hoistPaddingOnTensors Implementation.
+//===----------------------------------------------------------------------===//
+
 // If the original consumer of `sliceOp` was a `forOp` (i.e. through an iter
 // arg), propagate the `packedTensor` value through the same iter arg.
 // TODO: for multiple loops we need to track the use to the innermost loop.
@@ -574,6 +705,7 @@
   std::optional<unsigned> operandNumber =
       forOp.getIterArgNumberForOpOperand(*pUse);
   assert(operandNumber.has_value() && "expected a proper iter arg number");
+
   SmallVector<Value> initArgs = forOp.getInitArgs();
   initArgs[operandNumber.value()] = casted;
   rewriter.startRootUpdate(forOp);
@@ -586,7 +718,7 @@
 /// replacement for `opToHoist` in callers.
 static Value replaceByPackingLoopNestResult(
     RewriterBase &rewriter, const IRMapping &bvm, tensor::PadOp opToHoist,
-    RankedTensorType transposedTensorType, const HoistingAnalysis &analysis,
+    RankedTensorType transposedTensorType, const HoistPaddingAnalysis &analysis,
     const PackingLoopNestResult &packingResult) {
   // The replacement occurs under a single insertion point within the original
   // loop, just before opToHoist.
@@ -657,59 +789,38 @@
                                  SmallVectorImpl<GenericOp> &transposeOps) {
   LLVM_DEBUG(DBGS() << "\n"; DBGS() << " Try to hoist " << *(opToHoist) << "\n";
              DBGS() << " by " << numLoops << " loops\n");
-  HoistingAnalysis analysis(opToHoist, numLoops);
+
+  HoistPaddingAnalysis analysis(opToHoist, numLoops);
+  analysis.enableHoistPadding(rewriter);
+  analysis.finalizeHoistPaddingAnalysis();
   if (!analysis.isValid()) {
     LLVM_DEBUG(DBGS() << "--Analysis failed -> Skip\n");
     return failure();
   }
 
-  // Update actual number of loops, which may be smaller.
-  int nPackedLoops = analysis.packingLoops.size();
-  LLVM_DEBUG(DBGS() << "\n";
-             DBGS() << "Func:\n"
-                    << *opToHoist->getParentOfType<func::FuncOp>() << "\n";
-             DBGS() << "Start hoisting above " << nPackedLoops << " loops\n");
-
-  Location loc = opToHoist->getLoc();
-  RankedTensorType paddedTensorType = opToHoist.getResultType();
-
-  // Compute the type of the transposed padded tensor.
-  FailureOr<RankedTensorType> transposedTensorType =
-      tensor::computeTransposedType(paddedTensorType, transposeVector);
-  if (failed(transposedTensorType)) {
-    LLVM_DEBUG(DBGS() << "--Could not compute transposed type -> Skip\n");
+  /// Construct the packing loop nest.
+  IRMapping bvm;
+  FailureOr<PackingLoopNestResult> packingResult = buildPackingLoopNestImpl(
+      rewriter, bvm, opToHoist, transposeVector, analysis);
+  if (failed(packingResult)) {
+    LLVM_DEBUG(DBGS() << "--buildPackingLoopNestImpl failed -> Skip\n");
     return failure();
   }
 
-  // Create the packed tensor.
-  SmallVector<int64_t> packedShape(nPackedLoops, ShapedType::kDynamic);
-  // TODO: go grab dims when needed, atm tensor::PadOp yields a static tensor.
-  llvm::append_range(packedShape, transposedTensorType->getShape());
-  auto packedTensorType = RankedTensorType::get(
-      packedShape, transposedTensorType->getElementType());
-
-  // Set the insertion point right before the outer loop and start packing.
-  scf::ForOp outerLoop = analysis.outermostEnclosingForOp;
-  OpBuilder::InsertionGuard g(rewriter);
-  rewriter.setInsertionPoint(outerLoop);
-  SmallVector<Value> dynamicTensorSizes =
-      analysis.getPackedTensorSizes(rewriter, loc);
-  auto emptyOp = rewriter.create<tensor::EmptyOp>(
-      loc, packedTensorType.getShape(), packedTensorType.getElementType(),
-      dynamicTensorSizes);
-
-  /// Construct the packing loop nest.
-  IRMapping bvm;
-  PackingLoopNestResult packingResult =
-      buildPackingLoopNest(rewriter, bvm, opToHoist, transposeVector,
-                           *transposedTensorType, emptyOp, analysis);
   if (!transposeVector.empty())
-    transposeOps.push_back(packingResult.maybeTransposeOp);
+    transposeOps.push_back(packingResult->maybeTransposeOp);
 
   // Now the packed tensor is ready, replace the original padding op by a
   // 1x..x1 slice [originalLoopIvs, 0 .. 0][1 .. 1, paddedShape][1 .. 1].
+  auto outerPackedLoop =
+      scf::getForInductionVarOwner(packingResult->clonedLoopIvs.front());
+  auto packedTensorType =
+      outerPackedLoop.getResult(0).getType().cast<RankedTensorType>();
   Value newResult = replaceByPackingLoopNestResult(
-      rewriter, bvm, opToHoist, *transposedTensorType, analysis, packingResult);
+      rewriter, bvm, opToHoist, packedTensorType, analysis, *packingResult);
+
+  Location loc = opToHoist->getLoc();
+  RankedTensorType paddedTensorType = opToHoist.getResultType();
   if (!transposeVector.empty()) {
     OpBuilder::InsertionGuard g(rewriter);
     rewriter.setInsertionPointAfter(newResult.getDefiningOp());
diff --git a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp
--- a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp
@@ -87,6 +87,7 @@
                                ArrayRef<int64_t> transposeVector) {
   if (transposeVector.empty())
     return rankedTensorType;
+
   if (!isPermutationVector(transposeVector) ||
       transposeVector.size() !=
           static_cast<size_t>(rankedTensorType.getRank()))
    return failure();
diff --git a/mlir/test/Dialect/Linalg/transform-op-hoist-pad.mlir b/mlir/test/Dialect/Linalg/transform-op-hoist-pad.mlir
--- a/mlir/test/Dialect/Linalg/transform-op-hoist-pad.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-hoist-pad.mlir
@@ -186,8 +186,201 @@
 }
 
   %pad = transform.get_producer_of_operand %matmul_padded[2]
-    : (!pdl.operation) -> !transform.op<"tensor.pad">
+    : (!pdl.operation) -> !pdl.operation
 
-  transform.structured.hoist_pad %pad by 1 loops
-    : (!transform.op<"tensor.pad">) -> !pdl.operation
+  transform.structured.hoist_pad.build_packing_loop_nest %pad above %loops_l1#1
+    : (!pdl.operation, !pdl.operation) -> !pdl.operation
 }
+
+// -----
+
+func.func @pad_and_hoist_rhs(
+  %arg0: tensor<24x12xf32>, %arg1: tensor<12x25xf32>, %arg2: tensor<24x25xf32>)
+     -> tensor<24x25xf32>
+{
+  // expected-note @below {{payload operation}}
+  %0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
+  func.return %0 : tensor<24x25xf32>
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg1: !pdl.operation):
+  %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1
+    : (!pdl.operation) -> !pdl.operation
+
+  %matmul_l1, %loops_l1 = transform.structured.tile_to_scf_for %matmul [5]
+
+  %matmul_padded = transform.structured.pad %matmul_l1 {
+    padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
+    padding_dimensions=[0, 1, 2]
+  }
+
+  // In this case, the pad op is actually empty: we only tile the first
+  // dimension and it does not have an impact on the RHS operand.
+  // expected-error @below {{incompatible payload operation name}}
+  %pad = transform.get_producer_of_operand %matmul_padded[1]
+    : (!pdl.operation) -> !transform.op<"tensor.pad">
+
+  transform.structured.hoist_pad.build_packing_loop_nest %pad above %loops_l1
+    : (!transform.op<"tensor.pad">, !pdl.operation) -> !pdl.operation
+}
+
+// -----
+
+func.func @pad_and_hoist_init(
+  %arg0: tensor<24x12xf32>, %arg1: tensor<12x25xf32>, %arg2: tensor<24x25xf32>)
+     -> tensor<24x25xf32>
+{
+  // expected-note @below {{when applied to this op}}
+  %0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
+  func.return %0 : tensor<24x25xf32>
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg1: !pdl.operation):
+  %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1
+    : (!pdl.operation) -> !pdl.operation
+
+  %matmul_l1, %loops_l1 = transform.structured.tile_to_scf_for %matmul [5]
+
+  %matmul_padded = transform.structured.pad %matmul_l1 {
+    padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
+    padding_dimensions=[0, 1, 2]
+  }
+
+  %pad = transform.get_producer_of_operand %matmul_padded[2]
+    : (!pdl.operation) -> !pdl.operation
+
+  // We do not know yet how to hoist the init.
+  // expected-error @below {{could not build packing loop nest}}
+  transform.structured.hoist_pad.build_packing_loop_nest %pad above %loops_l1
+    : (!pdl.operation, !pdl.operation) -> !pdl.operation
+}
+
+// -----
+
+// CHECK-LABEL: pad_and_hoist_lhs
+func.func @pad_and_hoist_lhs(
+  %arg0: tensor<24x12xf32>, %arg1: tensor<12x25xf32>, %arg2: tensor<24x25xf32>)
+     -> tensor<24x25xf32>
+{
+  // CHECK: %[[PACKED:.*]] = scf.for %{{.*}} -> (tensor<5x5x12xf32>) {
+  // CHECK:   tensor.pad %{{.*}}
+  // CHECK:     : tensor<?x12xf32> to tensor<5x12xf32>
+  // CHECK:   tensor.insert_slice %{{.*}} into %{{.*}}[%{{.*}}, 0, 0] [1, 5, 12] [1, 1, 1]
+  // CHECK-SAME:   : tensor<5x12xf32> into tensor<5x5x12xf32>
+  // CHECK: scf.for %{{.*}} -> (tensor<24x25xf32>) {
+  // CHECK:   %[[PADDED:.*]] = tensor.extract_slice %[[PACKED]][%{{.*}}, 0, 0] [1, 5, 12] [1, 1, 1]
+  // CHECK-SAME:   : tensor<5x5x12xf32> to tensor<5x12xf32>
+  // CHECK:   linalg.matmul ins(%[[PADDED]]
+  %0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
+  func.return %0 : tensor<24x25xf32>
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg1: !pdl.operation):
+  %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1
+    : (!pdl.operation) -> !pdl.operation
+
+  %matmul_l1, %loops_l1 = transform.structured.tile_to_scf_for %matmul [5]
+
+  %matmul_padded = transform.structured.pad %matmul_l1 {
+    padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
+    padding_dimensions=[0, 1, 2]
+  }
+
+  %pad = transform.get_producer_of_operand %matmul_padded[0]
+    : (!pdl.operation) -> !pdl.operation
+
+  transform.structured.hoist_pad.build_packing_loop_nest %pad above %loops_l1
+    : (!pdl.operation, !pdl.operation) -> !pdl.operation
+}
+
+// -----
+
+// CHECK-LABEL: pad_and_hoist_lhs_transpose
+func.func @pad_and_hoist_lhs_transpose(
+  %arg0: tensor<24x12xf32>, %arg1: tensor<12x25xf32>, %arg2: tensor<24x25xf32>)
+     -> tensor<24x25xf32>
+{
+  // CHECK: %[[PACKED:.*]] = scf.for %{{.*}} -> (tensor<5x12x5xf32>) {
+  // CHECK:   tensor.pad %{{.*}}
+  // CHECK:     : tensor<?x12xf32> to tensor<5x12xf32>
+  // CHECK:   linalg.generic
+  // CHECK:     -> tensor<12x5xf32>
+  // CHECK:   tensor.insert_slice %{{.*}} into %{{.*}}[%{{.*}}, 0, 0] [1, 12, 5] [1, 1, 1]
+  // CHECK-SAME:   : tensor<12x5xf32> into tensor<5x12x5xf32>
+  // CHECK: scf.for %{{.*}} -> (tensor<24x25xf32>) {
+  // CHECK:   %[[PADDED:.*]] = tensor.extract_slice %[[PACKED]][%{{.*}}, 0, 0] [1, 12, 5] [1, 1, 1]
+  // CHECK-SAME:   : tensor<5x12x5xf32> to tensor<12x5xf32>
+  // CHECK:   %[[TRANSPOSED:.*]] = linalg.generic
+  // CHECK:     -> tensor<5x12xf32>
+  // CHECK:   linalg.matmul ins(%[[TRANSPOSED]]
+  %0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
+  func.return %0 : tensor<24x25xf32>
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg1: !pdl.operation):
+  %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1
+    : (!pdl.operation) -> !pdl.operation
+
+  %matmul_l1, %loops_l1 = transform.structured.tile_to_scf_for %matmul [5]
+
+  %matmul_padded = transform.structured.pad %matmul_l1 {
+    padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
+    padding_dimensions=[0, 1, 2]
+  }
+
+  %pad = transform.get_producer_of_operand %matmul_padded[0]
+    : (!pdl.operation) -> !pdl.operation
+
+  transform.structured.hoist_pad.build_packing_loop_nest %pad above %loops_l1, transpose by [1, 0]
+    : (!pdl.operation, !pdl.operation) -> !pdl.operation
+}
+
+// -----
+
+// CHECK-LABEL: pad_and_hoist_init
+func.func @pad_and_hoist_init(
+  %arg0: tensor<24x12xf32>, %arg1: tensor<12x25xf32>, %arg2: tensor<24x25xf32>)
+     -> tensor<24x25xf32>
+{
+  // CHECK: scf.for %{{.*}} -> (tensor<24x25xf32>) {
+  // CHECK:   %[[PADDED:.*]] = tensor.pad %{{.*}}
+  // CHECK:     : tensor<?x25xf32> to tensor<5x25xf32>
+  // CHECK:   scf.for %{{.*}} iter_args(%[[INNER_PADDED:[0-9a-zA-Z]*]] = %[[PADDED]]) -> (tensor<5x25xf32>)
+  // CHECK:     %[[RES:.*]] = linalg.matmul {{.*}} outs(%[[INNER_PADDED]]
+  // CHECK-SAME:   : tensor<5x25xf32>
+  // CHECK:     scf.yield %[[RES]] : tensor<5x25xf32>
+  // CHECK:   %[[CAST:.*]] = tensor.cast %{{.*}} : tensor<5x25xf32> to tensor<?x25xf32>
+  // CHECK:   tensor.insert_slice %[[CAST]] into %{{.*}}[%{{.*}}, 0] [%{{.*}}, 25] [1, 1]
+  // CHECK-SAME:   : tensor<?x25xf32> into tensor<24x25xf32>
+  %0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
+  func.return %0 : tensor<24x25xf32>
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg1: !pdl.operation):
+  %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1
+    : (!pdl.operation) -> !pdl.operation
+
+  %matmul_l1, %loops_l1:2 = transform.structured.tile_to_scf_for %matmul [5, 0, 7]
+
+  %matmul_padded = transform.structured.pad %matmul_l1 {
+    padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
+    padding_dimensions=[0, 1, 2]
+  }
+
+  %pad = transform.get_producer_of_operand %matmul_padded[2]
+    : (!pdl.operation) -> !pdl.operation
+
+  transform.structured.hoist_pad.build_packing_loop_nest %pad above %loops_l1#1
+    : (!pdl.operation, !pdl.operation) -> !pdl.operation
+}
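A final note on the `transpose` attribute, since both the verifier added in LinalgTransformOps.cpp and `tensor::computeTransposedType` enforce it: the array must be a permutation of the padded tensor's rank. Illustrative sketch with placeholder handles:

```mlir
// OK: [1, 0] is a permutation; tensor<5x12xf32> is transposed to
// tensor<12x5xf32> by a linalg.generic before insertion into the packed
// tensor, as checked in pad_and_hoist_lhs_transpose above.
transform.structured.hoist_pad.build_packing_loop_nest %pad above %loop, transpose by [1, 0]
  : (!pdl.operation, !pdl.operation) -> !pdl.operation

// Rejected by the verifier: "expects transpose to be a permutation".
transform.structured.hoist_pad.build_packing_loop_nest %pad above %loop, transpose by [0, 0]
  : (!pdl.operation, !pdl.operation) -> !pdl.operation
```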