diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h
@@ -11,8 +11,10 @@
 namespace mlir {
 class FuncOp;
+struct LogicalResult;

 namespace linalg {
+class SimplePadOp;

 /// Hoist alloc/dealloc pairs and alloca op out of immediately enclosing
 /// scf::ForOp if both conditions are true:
@@ -40,6 +42,44 @@
 /// instead of buffers.
 void hoistRedundantVectorTransfersOnTensor(FuncOp func);

+/// Mechanically hoist padding operations on tensors by at most `nLoops` into a
+/// new, generally larger tensor. This achieves packing of multiple padding ops
+/// into a larger tensor. On success, `simplePadOp` is replaced by the cloned
+/// version in the packing loop so the caller can continue reasoning about the
+/// padding operation.
+///
+/// Example in pseudo-mlir:
+/// =======================
+///
+/// If hoistPaddingOnTensors is called with `nLoops` = 2 on the following IR.
+/// ```
+///    scf.for (%i, %j, %k)
+///      %st0 = subtensor f(%i, %k) : ... to tensor<?x?xf32>
+///      %0 = linalg.simple_pad %st0 pad %pad :
+///             tensor<?x?xf32> to tensor<4x8xf32>
+///      compute(%0)
+/// ```
+///
+/// IR resembling the following is produced:
+///
+/// ```
+///    scf.for (%i) {
+///      %packed_init = linalg.init_tensor range(%j) : tensor<?x4x8xf32>
+///      %packed = scf.for (%k) iter_args(%p : %packed_init)
+///        %st0 = subtensor f(%i, %k) : ... to tensor<?x?xf32>
+///        %0 = linalg.simple_pad %st0 pad %pad :
+///               tensor<?x?xf32> to tensor<4x8xf32>
+///        %1 = subtensor_insert %0 into %p[%k, 0, 0][1, 4, 8][1, 1, 1] :
+///               tensor<4x8xf32> into tensor<?x4x8xf32>
+///        scf.yield %1 : tensor<?x4x8xf32>
+///      } -> tensor<?x4x8xf32>
+///      scf.for (%j, %k) {
+///        %st0 = subtensor %packed [%k, 0, 0][1, 4, 8][1, 1, 1] :
+///                 tensor<?x4x8xf32> to tensor<4x8xf32>
+///        compute(%st0)
+///      }
+///    }
+/// ```
+LogicalResult hoistPaddingOnTensors(SimplePadOp &simplePadOp, unsigned nLoops);
+
 } // namespace linalg
 } // namespace mlir
diff --git a/mlir/include/mlir/Dialect/StandardOps/IR/Ops.td b/mlir/include/mlir/Dialect/StandardOps/IR/Ops.td
--- a/mlir/include/mlir/Dialect/StandardOps/IR/Ops.td
+++ b/mlir/include/mlir/Dialect/StandardOps/IR/Ops.td
@@ -3058,6 +3058,17 @@
     // Build a SubTensorOp with all dynamic entries and custom result type.
     OpBuilderDAG<(ins "RankedTensorType":$resultType, "Value":$source,
       "ValueRange":$offsets, "ValueRange":$sizes, "ValueRange":$strides,
+      CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>,
+    // Build a SubTensorOp with mixed static and dynamic entries and inferred
+    // result type.
+    OpBuilderDAG<(ins "Value":$source, "ArrayRef<OpFoldResult>":$offsets,
+      "ArrayRef<OpFoldResult>":$sizes, "ArrayRef<OpFoldResult>":$strides,
+      CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>,
+    // Build a SubTensorOp with mixed static and dynamic entries and custom
+    // result type. If the type passed is nullptr, it is inferred.
+    OpBuilderDAG<(ins "RankedTensorType":$resultType, "Value":$source,
+      "ArrayRef<OpFoldResult>":$offsets, "ArrayRef<OpFoldResult>":$sizes,
+      "ArrayRef<OpFoldResult>":$strides,
       CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>
   ];

@@ -3154,6 +3165,11 @@
     // Build a SubTensorInsertOp with all dynamic entries.
     OpBuilderDAG<(ins "Value":$source, "Value":$dest, "ValueRange":$offsets,
       "ValueRange":$sizes, "ValueRange":$strides,
+      CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>,
+    // Build a SubTensorInsertOp with mixed static and dynamic entries.
+    OpBuilderDAG<(ins "Value":$source, "Value":$dest,
+      "ArrayRef<OpFoldResult>":$offsets, "ArrayRef<OpFoldResult>":$sizes,
+      "ArrayRef<OpFoldResult>":$strides,
       CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>
   ];
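
An aside, not part of the patch: the `ArrayRef<OpFoldResult>` builders added above let callers pass a single operand list that mixes SSA values and constant index attributes. A minimal sketch of how a client might use the inferred-result-type SubTensorOp builder; the function name `extractTile`, the 4x8 tile shape, and the argument names are made up for illustration:

```
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/Builders.h"

// Extract a 4x8 tile whose row offset is dynamic and whose column offset,
// sizes and strides are static. The mixed builder splits these into the
// static/dynamic operand lists internally; the result type is inferred.
static mlir::Value extractTile(mlir::OpBuilder &b, mlir::Location loc,
                               mlir::Value source, mlir::Value rowOffset) {
  llvm::SmallVector<mlir::OpFoldResult, 2> offsets{rowOffset,
                                                   b.getIndexAttr(0)};
  llvm::SmallVector<mlir::OpFoldResult, 2> sizes{b.getIndexAttr(4),
                                                 b.getIndexAttr(8)};
  llvm::SmallVector<mlir::OpFoldResult, 2> strides{b.getIndexAttr(1),
                                                   b.getIndexAttr(1)};
  return b.create<mlir::SubTensorOp>(loc, source, offsets, sizes, strides);
}
```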
+ OpBuilderDAG<(ins "Value":$source, "Value":$dest, + "ArrayRef":$offsets, "ArrayRef":$sizes, + "ArrayRef":$strides, CArg<"ArrayRef", "{}">:$attrs)> ]; diff --git a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp @@ -334,3 +334,253 @@ }); } } + +/// Ensure prerequisites that guarantee pad op hoisting can occur. +/// Return failure in the cases when we cannot perform hoisting; i.e. if either: +/// 1. There exists a use of `simplePadOp` that is not a linalg input operand. +/// 2. There isn't an enclosing `outermostEnclosingForOp` loop. +/// 3. There exists an op with a region that is dominated by +/// `outermostEnclosingForOp` and that isn't a LoopLikeInterface or a +/// LinalgOp. +/// +/// While ensuring prerequisites: +/// 1. Fill the `backwardSlice` to contain the topologically sorted ops +/// dominated by `outermostEnclosingForOp`. +/// 2. Fill the `packingLoops` to contain only the enclosing loops of +/// `backwardSlice` whose IV is actually used in computing padding. Loops that +/// remain in `backwardSlice` but that are not in `packingLoops` are +/// dimensions of reuse. +static LogicalResult +hoistPaddingOnTensorsPrerequisites(linalg::SimplePadOp simplePadOp, int nLevels, + llvm::SetVector &backwardSlice, + llvm::SetVector &packingLoops) { + // Bail on any use that isn't an input of a Linalg op. + // Hoisting of inplace updates happens after vectorization. + for (OpOperand &use : simplePadOp.result().getUses()) { + auto linalgUser = dyn_cast(use.getOwner()); + if (!linalgUser || !linalgUser.isInputTensor(&use)) + return failure(); + } + + // Get at most nLevels of enclosing loops. + SmallVector reverseEnclosingLoops; + Operation *outermostEnclosingForOp = nullptr, + *nextEnclosingForOp = + simplePadOp->getParentOfType(); + while (nLevels-- > 0 && nextEnclosingForOp) { + outermostEnclosingForOp = nextEnclosingForOp; + reverseEnclosingLoops.push_back(outermostEnclosingForOp); + nextEnclosingForOp = + nextEnclosingForOp->getParentOfType(); + } + if (!outermostEnclosingForOp) + return failure(); + + // Get the backwards slice from `simplePadOp` that is dominated by the + // outermost enclosing loop. + DominanceInfo domInfo(outermostEnclosingForOp); + getBackwardSlice(simplePadOp, &backwardSlice, [&](Operation *op) { + return domInfo.dominates(outermostEnclosingForOp, op); + }); + + // Bail on any op with a region that is not a LoopLikeInterface or a LinalgOp. + if (llvm::any_of(backwardSlice, [](Operation *op) { + return op->getNumRegions() > 0 && !isa(op) && + !isa(op); + })) + return failure(); + + // Filter out the loops whose induction variable is not used to compute the + // padded result. As a first approximation, just look for IVs that have no use + // in the backwardSlice. + // These are the dimensions of reuse that we can exploit to reduce the amount + // of work / memory. + // TODO: would this optimization compose better as a canonicalization? + for (LoopLikeOpInterface loop : reverseEnclosingLoops) { + auto forOp = dyn_cast(loop.getOperation()); + if (!forOp) + continue; + for (Operation *user : forOp.getInductionVar().getUsers()) { + if (backwardSlice.contains(user)) { + packingLoops.insert(forOp); + break; + } + } + } + + // Backward slice is a topologically sorted list of ops starting at + // `outermostEnclosingForOp`. 
+  assert(outermostEnclosingForOp == backwardSlice.front());
+
+  return success();
+}
+
+static Value buildLoopTripCount(OpBuilder &b, Operation *op) {
+  MLIRContext *ctx = op->getContext();
+  AffineExpr lb, ub, step = getAffineSymbolExpr(0, ctx);
+  bindDims(ctx, lb, ub);
+  scf::ForOp forOp = cast<scf::ForOp>(op);
+  return b.create<AffineApplyOp>(
+      op->getLoc(), AffineMap::get(2, 1, {(ub - lb).ceilDiv(step)}, ctx),
+      ValueRange{forOp.lowerBound(), forOp.upperBound(), forOp.step()});
+}
+
+/// Mechanically hoist padding operations on tensors by at most `nLoops` into a
+/// new, generally larger tensor. This achieves packing of multiple padding ops
+/// into a larger tensor. On success, `simplePadOp` is replaced by the cloned
+/// version in the packing loop so the caller can continue reasoning about the
+/// padding operation.
+///
+/// Example in pseudo-mlir:
+/// =======================
+///
+/// If hoistPaddingOnTensors is called with `nLoops` = 2 on the following IR.
+/// ```
+///    scf.for (%i, %j, %k)
+///      %st0 = subtensor f(%i, %k) : ... to tensor<?x?xf32>
+///      %0 = linalg.simple_pad %st0 pad %pad :
+///             tensor<?x?xf32> to tensor<4x8xf32>
+///      compute(%0)
+/// ```
+///
+/// IR resembling the following is produced:
+///
+/// ```
+///    scf.for (%i) {
+///      %packed_init = linalg.init_tensor range(%j) : tensor<?x4x8xf32>
+///      %packed = scf.for (%k) iter_args(%p : %packed_init)
+///        %st0 = subtensor f(%i, %k) : ... to tensor<?x?xf32>
+///        %0 = linalg.simple_pad %st0 pad %pad :
+///               tensor<?x?xf32> to tensor<4x8xf32>
+///        %1 = subtensor_insert %0 into %p[%k, 0, 0][1, 4, 8][1, 1, 1] :
+///               tensor<4x8xf32> into tensor<?x4x8xf32>
+///        scf.yield %1 : tensor<?x4x8xf32>
+///      } -> tensor<?x4x8xf32>
+///      scf.for (%j, %k) {
+///        %st0 = subtensor %packed [%k, 0, 0][1, 4, 8][1, 1, 1] :
+///                 tensor<?x4x8xf32> to tensor<4x8xf32>
+///        compute(%st0)
+///      }
+///    }
+/// ```
+LogicalResult mlir::linalg::hoistPaddingOnTensors(SimplePadOp &simplePadOp,
+                                                  unsigned nLoops) {
+  llvm::SetVector<Operation *> backwardSlice, packingLoops;
+  if (failed(hoistPaddingOnTensorsPrerequisites(simplePadOp, nLoops,
+                                                backwardSlice, packingLoops)))
+    return failure();
+
+  // Update actual number of loops, which may be smaller.
+  nLoops = packingLoops.size();
+
+  Location loc = simplePadOp->getLoc();
+  RankedTensorType paddedTensorType = simplePadOp.getResultType();
+  unsigned paddedRank = paddedTensorType.getRank();
+
+  // Backward slice is a topologically sorted list of ops starting at
+  // `outermostEnclosingForOp`.
+  Operation *outermostEnclosingForOp = backwardSlice.front();
+  // IP just before the outermost loop considered that we hoist above.
+  OpBuilder b(outermostEnclosingForOp);
+
+  // Create the packed tensor<?x?x..?xpadded_shape> into which we amortize
+  // padding.
+  SmallVector<int64_t, 4> packedShape(nLoops, ShapedType::kDynamicSize);
+  // TODO: go grab dims when necessary, for now SimplePadOp returns a static
+  // tensor.
+  llvm::append_range(packedShape, paddedTensorType.getShape());
+  auto packedTensorType =
+      RankedTensorType::get(packedShape, paddedTensorType.getElementType());
+  auto dynamicSizes = llvm::to_vector<4>(llvm::map_range(
+      packingLoops, [&](Operation *op) { return buildLoopTripCount(b, op); }));
+  Value packedTensor = b.create<linalg::InitTensorOp>(
+      loc, dynamicSizes, packedTensorType.getShape(),
+      packedTensorType.getElementType());
+
+  // Clone the operations involved in the backward slice, iteratively stepping
+  // into the loops that we encounter.
+  // The implementation proceeds in a stack-like fashion:
+  //   1. Iteratively clone and step into the loops, pushing the `packedTensor`
+  //      deeper in the stack.
+  //   2. Create a SubTensorInsertOp at the top of the stack.
+  //   3. Iteratively pop and yield the result of the SubTensorInsertOp across
+  //      the cloned loops.
+  SmallVector<Value, 4> clonedLoopIvs;
+  clonedLoopIvs.reserve(nLoops);
+  BlockAndValueMapping bvm;
+  // Stack step 1. iteratively clone loops and push `packedTensor`.
+  // Insert `simplePadOp` into the backwardSlice so we clone it too.
+  backwardSlice.insert(simplePadOp);
+  for (Operation *op : backwardSlice) {
+    if (op->getNumRegions() == 0) {
+      b.clone(*op, bvm);
+      continue;
+    }
+    // TODO: support more cases as they appear.
+    auto forOp = dyn_cast<scf::ForOp>(op);
+    assert(forOp && "Expected scf::ForOp when hoisting pad ops");
+    // Unused loop, just skip it.
+    if (!packingLoops.contains(forOp))
+      continue;
+    auto clonedForOp =
+        b.create<scf::ForOp>(loc, forOp.lowerBound(), forOp.upperBound(),
+                             forOp.step(), packedTensor);
+    assert(clonedForOp->getNumRegions() == 1);
+    clonedLoopIvs.push_back(clonedForOp.getInductionVar());
+    b.setInsertionPointToStart(&clonedForOp->getRegion(0).front());
+    bvm.map(forOp.getInductionVar(), clonedLoopIvs.back());
+    packedTensor = clonedForOp.getRegionIterArgs().front();
+  }
+
+  // Stack step 2. create SubTensorInsertOp at the top of the stack.
+  // offsets = [clonedLoopIvs, 0 .. 0].
+  SmallVector<OpFoldResult, 4> offsets(clonedLoopIvs.begin(),
+                                       clonedLoopIvs.end());
+  offsets.append(paddedRank, b.getIndexAttr(0));
+  // sizes = [1 .. 1, paddedShape].
+  SmallVector<OpFoldResult, 4> sizes(nLoops, b.getIndexAttr(1));
+  for (int64_t sz : paddedTensorType.getShape()) {
+    // TODO: go grab dims when necessary, for now SimplePadOp returns a static
+    // tensor.
+    assert(!ShapedType::isDynamic(sz) && "padded tensor needs static sizes");
+    sizes.push_back(b.getIndexAttr(sz));
+  }
+  // strides = [1 .. 1].
+  SmallVector<OpFoldResult, 4> strides(nLoops + paddedRank, b.getIndexAttr(1));
+
+  Value inserted =
+      b.create<SubTensorInsertOp>(loc, bvm.lookup(simplePadOp.result()),
+                                  packedTensor, offsets, sizes, strides);
+
+  // Stack step 3. iteratively pop the stack and propagate the yield.
+  Value valueToYield = inserted;
+  for (Value iv : llvm::reverse(clonedLoopIvs)) {
+    auto forOp = scf::getForInductionVarOwner(iv);
+    b.setInsertionPointToEnd(&forOp.getRegion().front());
+    b.create<scf::YieldOp>(loc, valueToYield);
+    valueToYield = forOp.getResult(0);
+  }
+
+  // Now the packed tensor is ready, replace the original padding op by a
+  // 1x..x1 SubTensor [originalLoopIvs, 0 .. 0][1 .. 1, paddedShape][1 .. 1].
+  b.setInsertionPoint(simplePadOp);
+  SmallVector<Value, 4> originalLoopIvs =
+      llvm::to_vector<4>(llvm::map_range(packingLoops, [](Operation *loop) {
+        return cast<scf::ForOp>(loop).getInductionVar();
+      }));
+  // offsets = [originalLoopIvs, 0 .. 0].
+  offsets.assign(originalLoopIvs.begin(), originalLoopIvs.end());
+  offsets.append(paddedRank, b.getIndexAttr(0));
+  // sizes = [1 .. 1, paddedShape] (defined above).
+  // strides = [1 .. 1] (defined above).
+  packedTensor =
+      scf::getForInductionVarOwner(clonedLoopIvs.front())->getResult(0);
+
+  // Grab the cloned `simplePadOp` before erasing the original one, so it can
+  // be made available to the caller afterwards.
+  auto newSimplePadOp =
+      cast<SimplePadOp>(bvm.lookup(simplePadOp.result()).getDefiningOp());
+  simplePadOp.replaceAllUsesWith(
+      b.create<SubTensorOp>(loc, simplePadOp.getResultType(), packedTensor,
+                            offsets, sizes, strides)
+          ->getResult(0));
+  simplePadOp.erase();
+
+  // Make the newly cloned `simplePadOp` available to the caller.
+  simplePadOp = newSimplePadOp;
+
+  return success();
+}
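
An aside, not part of the patch: `hoistPaddingOnTensors` is meant to be driven from a pass or pattern that visits the pad ops in a function, which is what the test pass at the end of this patch does. A minimal sketch of such a driver under that assumption, with header paths as of this revision of MLIR; the helper name `hoistAllPadOps` is hypothetical:

```
#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
#include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
#include "mlir/IR/BuiltinOps.h"

// Walk all linalg.simple_pad ops in `func` and try to hoist each one out of
// (at most) `nLoops` enclosing scf.for loops. When the prerequisites are not
// met the transformation fails without touching the IR, so the result can be
// ignored here.
static void hoistAllPadOps(mlir::FuncOp func, unsigned nLoops) {
  func.walk([&](mlir::linalg::SimplePadOp simplePadOp) {
    (void)mlir::linalg::hoistPaddingOnTensors(simplePadOp, nLoops);
  });
}
```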
diff --git a/mlir/lib/Dialect/StandardOps/IR/Ops.cpp b/mlir/lib/Dialect/StandardOps/IR/Ops.cpp
--- a/mlir/lib/Dialect/StandardOps/IR/Ops.cpp
+++ b/mlir/lib/Dialect/StandardOps/IR/Ops.cpp
@@ -3505,6 +3505,69 @@
                         staticStridesVector, offsets, sizes, strides, attrs);
 }

+/// Dispatch `ofr` into either `dynamicVec` if it is a Value, or into
+/// `staticVec` otherwise. In the dynamic case, `sentinel` is appended to
+/// `staticVec` to represent the dynamic value `?`.
+static void unpackOpFoldResult(OpFoldResult ofr,
+                               SmallVectorImpl<Value> &dynamicVec,
+                               SmallVectorImpl<int64_t> &staticVec,
+                               int64_t sentinel) {
+  Value v = ofr.dyn_cast<Value>();
+  if (v) {
+    dynamicVec.push_back(v);
+    staticVec.push_back(sentinel);
+  } else {
+    APInt apInt = ofr.dyn_cast<Attribute>().cast<IntegerAttr>().getValue();
+    staticVec.push_back(apInt.getSExtValue());
+  }
+}
+
+static void unpackOpFoldResults(ArrayRef<OpFoldResult> ofrs,
+                                SmallVector<Value> &dynamicVec,
+                                SmallVector<int64_t> &staticVec,
+                                int64_t sentinel) {
+  for (auto ofr : ofrs)
+    unpackOpFoldResult(ofr, dynamicVec, staticVec, sentinel);
+}
+
+// Build a SubTensorOp with mixed static and dynamic entries and custom result
+// type. If the type passed is nullptr, it is inferred.
+void mlir::SubTensorOp::build(OpBuilder &b, OperationState &result,
+                              RankedTensorType resultType, Value source,
+                              ArrayRef<OpFoldResult> offsets,
+                              ArrayRef<OpFoldResult> sizes,
+                              ArrayRef<OpFoldResult> strides,
+                              ArrayRef<NamedAttribute> attrs) {
+  SmallVector<int64_t> staticOffsets, staticSizes, staticStrides;
+  SmallVector<Value> dynamicOffsets, dynamicSizes, dynamicStrides;
+  unpackOpFoldResults(offsets, dynamicOffsets, staticOffsets,
+                      ShapedType::kDynamicStrideOrOffset);
+  unpackOpFoldResults(sizes, dynamicSizes, staticSizes,
+                      ShapedType::kDynamicSize);
+  unpackOpFoldResults(strides, dynamicStrides, staticStrides,
+                      ShapedType::kDynamicStrideOrOffset);
+  auto sourceRankedTensorType = source.getType().cast<RankedTensorType>();
+  // Structuring implementation this way avoids duplication between builders.
+  if (!resultType) {
+    resultType =
+        SubTensorOp::inferResultType(sourceRankedTensorType, staticOffsets,
+                                     staticSizes, staticStrides)
+            .cast<RankedTensorType>();
+  }
+  build(b, result, resultType, source, staticOffsets, staticSizes,
+        staticStrides, dynamicOffsets, dynamicSizes, dynamicStrides, attrs);
+}
+
+// Build a SubTensorOp with mixed static and dynamic entries and inferred
+// result type.
+void mlir::SubTensorOp::build(OpBuilder &b, OperationState &result,
+                              Value source, ArrayRef<OpFoldResult> offsets,
+                              ArrayRef<OpFoldResult> sizes,
+                              ArrayRef<OpFoldResult> strides,
+                              ArrayRef<NamedAttribute> attrs) {
+  build(b, result, RankedTensorType(), source, offsets, sizes, strides, attrs);
+}
+
 /// Verifier for SubTensorOp.
 static LogicalResult verify(SubTensorOp op) {
   // Verify result type against inferred type.
@@ -3600,6 +3663,25 @@
                         staticStridesVector, offsets, sizes, strides, attrs);
 }

+// Build a SubTensorInsertOp with mixed static and dynamic entries.
+void mlir::SubTensorInsertOp::build(OpBuilder &b, OperationState &result,
+                                    Value source, Value dest,
+                                    ArrayRef<OpFoldResult> offsets,
+                                    ArrayRef<OpFoldResult> sizes,
+                                    ArrayRef<OpFoldResult> strides,
+                                    ArrayRef<NamedAttribute> attrs) {
+  SmallVector<int64_t> staticOffsets, staticSizes, staticStrides;
+  SmallVector<Value> dynamicOffsets, dynamicSizes, dynamicStrides;
+  unpackOpFoldResults(offsets, dynamicOffsets, staticOffsets,
+                      ShapedType::kDynamicStrideOrOffset);
+  unpackOpFoldResults(sizes, dynamicSizes, staticSizes,
+                      ShapedType::kDynamicSize);
+  unpackOpFoldResults(strides, dynamicStrides, staticStrides,
+                      ShapedType::kDynamicStrideOrOffset);
+  build(b, result, source, dest, staticOffsets, staticSizes, staticStrides,
+        dynamicOffsets, dynamicSizes, dynamicStrides, attrs);
+}
+
 /// Verifier for SubViewOp.
 static LogicalResult verify(SubTensorInsertOp op) {
   if (op.getType() != op.dest().getType())
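
An aside, not part of the patch, restating what the `unpackOpFoldResult` dispatch above amounts to: for a mixed offsets list `[%iv, 0]`, the static vector receives `[kDynamicStrideOrOffset, 0]` and the dynamic vector receives `[%iv]`. A small hedged sketch of the same split, with a hypothetical helper name `splitMixedValues`:

```
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/OpDefinition.h"
#include "llvm/ADT/SmallVector.h"

// Split a mixed static/dynamic operand list: SSA values go to `dynamicVec`
// with a sentinel placed in `staticVec`; constant attributes go straight to
// `staticVec`.
static void splitMixedValues(llvm::ArrayRef<mlir::OpFoldResult> mixed,
                             llvm::SmallVectorImpl<mlir::Value> &dynamicVec,
                             llvm::SmallVectorImpl<int64_t> &staticVec) {
  for (mlir::OpFoldResult ofr : mixed) {
    if (auto v = ofr.dyn_cast<mlir::Value>()) {
      dynamicVec.push_back(v);
      staticVec.push_back(mlir::ShapedType::kDynamicStrideOrOffset);
      continue;
    }
    staticVec.push_back(
        ofr.get<mlir::Attribute>().cast<mlir::IntegerAttr>().getInt());
  }
}
```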
diff --git a/mlir/test/Dialect/Linalg/hoist-padding.mlir b/mlir/test/Dialect/Linalg/hoist-padding.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/hoist-padding.mlir
@@ -0,0 +1,85 @@
+// RUN: mlir-opt %s -test-linalg-transform-patterns=test-hoist-padding-2-level -canonicalize | FileCheck %s
+
+#map0 = affine_map<(d0)[s0] -> (2, -d0 + s0)>
+#map1 = affine_map<(d0)[s0] -> (4, -d0 + s0)>
+#map2 = affine_map<(d0)[s0] -> (3, -d0 + s0)>
+#map3 = affine_map<(d0, d1) -> (2, d0 - d1)>
+#map4 = affine_map<(d0, d1) -> (3, d0 - d1)>
+
+// CHECK-LABEL: func @matmul_tensors
+func @matmul_tensors(
+  %arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>)
+    -> tensor<?x?xf32>
+{
+  %c2 = constant 2 : index
+  %c3 = constant 3 : index
+  %c4 = constant 4 : index
+  %cst = constant 0.000000e+00 : f32
+  %c0 = constant 0 : index
+  %c1 = constant 1 : index
+  %0 = dim %arg0, %c0 : tensor<?x?xf32>
+  %1 = dim %arg0, %c1 : tensor<?x?xf32>
+  %2 = dim %arg1, %c1 : tensor<?x?xf32>
+
+  // CHECK: scf.for
+  // CHECK:   linalg.init_tensor [%{{.*}}, 2, 4] : tensor<?x2x4xf32>
+  // 1-D loop
+  // CHECK:   %[[A:.*]] = scf.for
+  // CHECK-NOT: scf.for
+  // CHECK:     subtensor %{{.*}} [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+  // CHECK:     linalg.simple_pad %{{.*}} : tensor<?x?xf32> to tensor<2x4xf32> pad f32
+  // CHECK:     subtensor_insert %{{.*}} into %{{.*}}[%{{.*}}, 0, 0]
+  // CHECK-SAME:  [1, 2, 4] [1, 1, 1] : tensor<2x4xf32> into tensor<?x2x4xf32>
+  // 2-D loop
+  // CHECK:   linalg.init_tensor [%{{.*}}, %{{.*}}, 4, 3] : tensor<?x?x4x3xf32>
+  // CHECK:   %[[B:.*]] = scf.for
+  // CHECK:     scf.for
+  // CHECK-NOT: scf.for
+  // CHECK:       subtensor %{{.*}} [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+  // CHECK:       linalg.simple_pad %{{.*}} : tensor<?x?xf32> to tensor<4x3xf32> pad f32
+  // CHECK:       subtensor_insert %{{.*}} into %{{.*}}[%{{.*}}, %{{.*}}, 0, 0]
+  // CHECK-SAME:    [1, 1, 4, 3] [1, 1, 1, 1] : tensor<4x3xf32> into tensor<?x?x4x3xf32>
+  // 2-D loop
+  // CHECK:   scf.for %[[J:[0-9a-zA-Z]+]]
+  // CHECK:     scf.for %[[K:[0-9a-zA-Z]+]]
+  // CHECK-NOT: scf.for
+  // CHECK:       %[[stA:.*]] = subtensor %[[A]][%[[K]], 0, 0] [1, 2, 4] [1, 1, 1] :
+  // CHECK-SAME:    tensor<?x2x4xf32> to tensor<2x4xf32>
+  // CHECK:       %[[stB:.*]] = subtensor %[[B]][%[[K]], %[[J]], 0, 0] [1, 1, 4, 3] [1, 1, 1, 1] :
+  // CHECK-SAME:    tensor<?x?x4x3xf32> to tensor<4x3xf32>
+  // CHECK:       %[[stC:.*]] = linalg.simple_pad %{{.*}} pad %{{.*}} :
+  // CHECK-SAME:    tensor<?x?xf32> to tensor<2x3xf32> pad f32
+  // CHECK:       linalg.matmul ins(%[[stA]], %[[stB]] : tensor<2x4xf32>, tensor<4x3xf32>)
+  // CHECK-SAME:    outs(%[[stC]] : tensor<2x3xf32>) -> tensor<2x3xf32>
+  %3 = scf.for %arg3 = %c0 to %0 step %c2 iter_args(%arg4 = %arg2) -> (tensor<?x?xf32>) {
+    %4 = scf.for %arg5 = %c0 to %2 step %c3 iter_args(%arg6 = %arg4) -> (tensor<?x?xf32>) {
+      %5 = scf.for %arg7 = %c0 to %1 step %c4 iter_args(%arg8 = %arg6) -> (tensor<?x?xf32>) {
+        %6 = dim %arg0, %c0 : tensor<?x?xf32>
+        %7 = affine.min #map0(%arg3)[%6]
+        %8 = dim %arg0, %c1 : tensor<?x?xf32>
+        %9 = affine.min #map1(%arg7)[%8]
+        %10 = subtensor %arg0[%arg3, %arg7] [%7, %9] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+        %11 = dim %arg1, %c0 : tensor<?x?xf32>
+        %12 = affine.min #map1(%arg7)[%11]
+        %13 = dim %arg1, %c1 : tensor<?x?xf32>
+        %14 = affine.min #map2(%arg5)[%13]
+        %15 = subtensor %arg1[%arg7, %arg5] [%12, %14] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+        %16 = dim %arg8, %c0 : tensor<?x?xf32>
+        %17 = affine.min #map3(%16, %arg3)
+        %18 = dim %arg8, %c1 : tensor<?x?xf32>
+        %19 = affine.min #map4(%18, %arg5)
+        %20 = subtensor %arg8[%arg3, %arg5] [%17, %19] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+        %21 = linalg.simple_pad %10 pad %cst : tensor<?x?xf32> to tensor<2x4xf32> pad f32
+        %22 = linalg.simple_pad %15 pad %cst : tensor<?x?xf32> to tensor<4x3xf32> pad f32
+        %23 = linalg.simple_pad %20 pad %cst : tensor<?x?xf32> to tensor<2x3xf32> pad f32
+        %24 = linalg.matmul ins(%21, %22 : tensor<2x4xf32>, tensor<4x3xf32>) outs(%23 : tensor<2x3xf32>) -> tensor<2x3xf32>
+        %25 = subtensor %24[0, 0] [%7, %14] [1, 1] : tensor<2x3xf32> to tensor<?x?xf32>
+        %26 = subtensor_insert %25 into %arg8[%arg3, %arg5] [%17, %19] [%c1, %c1] : tensor<?x?xf32> into tensor<?x?xf32>
+        scf.yield %26 : tensor<?x?xf32>
+      }
+      scf.yield %5 : tensor<?x?xf32>
+    }
+    scf.yield %4 : tensor<?x?xf32>
+  }
+  return %3 : tensor<?x?xf32>
+}
diff --git a/mlir/test/lib/Transforms/TestLinalgTransforms.cpp b/mlir/test/lib/Transforms/TestLinalgTransforms.cpp
--- a/mlir/test/lib/Transforms/TestLinalgTransforms.cpp
+++ b/mlir/test/lib/Transforms/TestLinalgTransforms.cpp
@@ -13,6 +13,7 @@
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/GPU/GPUDialect.h"
 #include "mlir/Dialect/Linalg/IR/LinalgOps.h"
+#include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
 #include "mlir/Dialect/Linalg/Utils/Utils.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
@@ -82,6 +83,9 @@
   Option<bool> testTileAndPadPattern{
       *this, "test-tile-and-pad-pattern",
      llvm::cl::desc("Test tile and pad pattern"), llvm::cl::init(false)};
+  Option<bool> testHoistPadding2Levels{*this, "test-hoist-padding-2-level",
+                                       llvm::cl::desc("Test hoist padding"),
+                                       llvm::cl::init(false)};
 };
 } // end anonymous namespace

@@ -546,6 +550,11 @@
     return applyAffineMinSCFCanonicalizationPatterns(getFunction());
   if (testTileAndPadPattern)
     return applyTileAndPadPattern(getFunction());
+  if (testHoistPadding2Levels) {
+    getFunction().walk([](linalg::SimplePadOp simplePadOp) {
+      linalg::hoistPaddingOnTensors(simplePadOp, 2);
+    });
+  }
 }

 namespace mlir {
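
A final aside, not part of the patch: the leading dynamic sizes of the packed tensors are the trip counts of the packing loops, computed by `buildLoopTripCount` in Hoisting.cpp as ceildiv(ub - lb, step) via an affine.apply. In the matmul test above, only the k loop (step 4, upper bound %1) is a packing loop for the LHS, so its packed tensor has a leading size of ceildiv(%1, 4), while the RHS packs over both the j and k loops. A standalone sketch of the same computation, with header paths as of this revision; the helper name `buildTripCount` is hypothetical:

```
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/SCF/SCF.h"

// Trip count of an scf.for loop as an SSA value: ceildiv(ub - lb, step),
// expressed with an affine map over two dims (lb, ub) and one symbol (step).
static mlir::Value buildTripCount(mlir::OpBuilder &b, mlir::scf::ForOp forOp) {
  mlir::MLIRContext *ctx = forOp.getContext();
  mlir::AffineExpr lb, ub, step = mlir::getAffineSymbolExpr(0, ctx);
  mlir::bindDims(ctx, lb, ub);
  auto map = mlir::AffineMap::get(/*dimCount=*/2, /*symbolCount=*/1,
                                  {(ub - lb).ceilDiv(step)}, ctx);
  return b.create<mlir::AffineApplyOp>(
      forOp.getLoc(), map,
      mlir::ValueRange{forOp.lowerBound(), forOp.upperBound(), forOp.step()});
}
```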