diff --git a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
@@ -13,7 +13,9 @@
 #include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
 #include "mlir/Analysis/SliceAnalysis.h"
+#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Linalg/IR/LinalgOps.h"
+#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
 #include "mlir/Dialect/SCF/SCF.h"
 #include "mlir/Dialect/SCF/Utils.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
@@ -503,6 +505,73 @@
 }
 }
 
+bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v) {
+  return outer.isDefinedOutsideOfLoop(v) || v.getDefiningOp<ConstantOp>();
+}
+
+/// Compute the tightest lower bound with quantities that are all defined
+/// outside of `outer`.
+Value computeLoopIndependentLowerBound(OpBuilder &b, scf::ForOp outer,
+                                       Value v) {
+  if (isDefinedOutsideOrConstant(outer, v))
+    return v;
+  v.dump();
+  llvm_unreachable("Unsupported loop independent lower bound case");
+}
+
+/// Compute the tightest upper bound with quantities that are all defined
+/// outside of `outer`.
+Value computeLoopIndependentUpperBound(OpBuilder &b, scf::ForOp outer,
+                                       Value v) {
+  if (isDefinedOutsideOrConstant(outer, v))
+    return v;
+
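+  // Only the `affine.min` case is handled below: rebuild the min above
+  // `outer`, substituting every quantity defined inside `outer` with a
+  // loop-independent bound.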
+  auto minOp = v.getDefiningOp<AffineMinOp>();
+  if (minOp) {
+    LLVM_DEBUG(DBGS() << "Begin loopIndependentUpperBound for: " << v << "\n");
+    // Compute a backward slice up to, but not including, `outer`.
+    llvm::SetVector<Operation *> backwardSlice;
+    getBackwardSlice(minOp.getOperation(), &backwardSlice, [&](Operation *op) {
+      return outer->isProperAncestor(op);
+    });
+    backwardSlice.insert(minOp);
+
+    OpBuilder::InsertionGuard g(b);
+    b.setInsertionPoint(outer);
+    Value res;
+    BlockAndValueMapping bvm;
+    for (Operation *op : backwardSlice) {
+      // For now, skip op with regions. This avoids pulling in the whole loops
+      // for which we should only indirectly depend on the induction variables.
+      // TODO: revisit this assumption when new data comes in.
+      if (op->getNumRegions() > 0)
+        continue;
+      auto sliceMinOp = dyn_cast<AffineMinOp>(op);
+      if (!sliceMinOp) {
+        b.clone(*op, bvm);
+        continue;
+      }
+      auto mapAndOperands = substituteMin(
+          sliceMinOp, [&](Operation *op) { return outer->isAncestor(op); });
+      SmallVector<Value> resultOperands = mapAndOperands.dims;
+      llvm::append_range(resultOperands, mapAndOperands.symbols);
+      AffineMap map = mapAndOperands.map;
+      canonicalizeMapAndOperands(&map, &resultOperands);
+      map = simplifyAffineMap(map);
+      res = b.create<AffineMinOp>(outer->getLoc(), map,
+                                  llvm::to_vector<4>(llvm::map_range(
+                                      resultOperands, [&](Value operand) {
+                                        return bvm.lookupOrDefault(operand);
+                                      })));
+      bvm.map(sliceMinOp, res);
+    }
+    LLVM_DEBUG(DBGS() << "End loopIndependentUpperBound with: " << res << "\n");
+    return res;
+  }
+  v.dump();
+  llvm_unreachable("Unsupported loop independent upper bound case");
+}
+
 /// Ensure prerequisites that guarantee pad op hoisting can occur.
 /// Return failure in the cases when we cannot perform hoisting; i.e. if either:
 /// 1. There exists a use of `padTensorOp` that is not a linalg input operand.
@@ -510,7 +579,7 @@
 /// 3. There exists an op with a region that is dominated by
 ///    `outermostEnclosingForOp` and that isn't a LoopLikeInterface or a
 ///    LinalgOp.
-/// 3. There exists an op with side effects that is dominated by
+/// 4. There exists an op with side effects that is dominated by
 ///    `outermostEnclosingForOp` and that isn't a LoopLikeInterface.
 ///
 /// While ensuring prerequisites:
@@ -587,25 +656,39 @@
 }
 
 /// Return the number of iterations in the loop (ub - lb).ceilDiv(step).
-static Value buildLoopTripCount(OpBuilder &b, scf::ForOp forOp) {
+static Value buildLoopTripCount(OpBuilder &b, scf::ForOp outer,
+                                scf::ForOp forOp) {
   MLIRContext *ctx = forOp->getContext();
   AffineExpr lb, ub, step;
   bindDims(ctx, lb, ub);
   bindSymbols(ctx, step);
-  return b.create<AffineApplyOp>(
+  Value lbVal = computeLoopIndependentLowerBound(b, outer, forOp.lowerBound()),
+        ubVal = computeLoopIndependentUpperBound(b, outer, forOp.upperBound()),
+        stepVal = forOp.step();
+  assert(isDefinedOutsideOrConstant(outer, lbVal));
+  assert(isDefinedOutsideOrConstant(outer, ubVal));
+  assert(isDefinedOutsideOrConstant(outer, stepVal));
+  Value res = b.create<AffineApplyOp>(
       forOp->getLoc(), AffineMap::get(2, 1, {(ub - lb).ceilDiv(step)}, ctx),
-      ValueRange{forOp.lowerBound(), forOp.upperBound(), forOp.step()});
+      ValueRange{lbVal, ubVal, stepVal});
+  return res;
 }
 
 /// Return the current iteration number in the loop (iv - lb).ceilDiv(step).
-static Value buildLoopIterationCount(OpBuilder &b, scf::ForOp forOp) {
+static Value buildLoopIterationCount(OpBuilder &b, scf::ForOp outer,
+                                     scf::ForOp forOp) {
   MLIRContext *ctx = forOp->getContext();
   AffineExpr iv, lb, step;
   bindDims(ctx, iv, lb);
   bindSymbols(ctx, step);
+  Value ivVal = forOp.getInductionVar(),
+        lbVal = computeLoopIndependentLowerBound(b, outer, forOp.lowerBound()),
+        stepVal = forOp.step();
+  assert(isDefinedOutsideOrConstant(outer, lbVal));
+  assert(isDefinedOutsideOrConstant(outer, stepVal));
   return b.create<AffineApplyOp>(
       forOp->getLoc(), AffineMap::get(2, 1, {(iv - lb).ceilDiv(step)}, ctx),
-      ValueRange{forOp.getInductionVar(), forOp.lowerBound(), forOp.step()});
+      ValueRange{ivVal, lbVal, stepVal});
 }
 
 LogicalResult mlir::linalg::hoistPaddingOnTensors(PadTensorOp &padTensorOp,
@@ -634,11 +717,13 @@
   // TODO: go grab dims when necessary, for now PadTensorOp returns a static
   // tensor.
   llvm::append_range(packedShape, paddedTensorType.getShape());
+
   auto packedTensorType =
       RankedTensorType::get(packedShape, paddedTensorType.getElementType());
   auto dynamicSizes =
       llvm::to_vector<4>(llvm::map_range(packingLoops, [&](Operation *op) {
-        return buildLoopTripCount(b, cast<scf::ForOp>(op));
+        return buildLoopTripCount(b, cast<scf::ForOp>(outermostEnclosingForOp),
+                                  cast<scf::ForOp>(op));
       }));
   Value packedTensor = b.create<linalg::InitTensorOp>(
       loc, dynamicSizes, packedTensorType.getShape(),
@@ -656,9 +741,9 @@
   clonedLoopIvs.reserve(nLoops);
   leadingPackedTensorIndexings.reserve(nLoops);
   BlockAndValueMapping bvm;
-  // Stack step 1. iteratively clone loops and push `packedTensor`.
   // Insert `padTensorOp` into the backwardSlice so we clone it too.
   backwardSlice.insert(padTensorOp);
+  // Stack step 1. iteratively clone loops and push `packedTensor`.
   for (Operation *op : backwardSlice) {
     if (op->getNumRegions() == 0 || isa<PadTensorOp>(op)) {
       b.clone(*op, bvm);
@@ -670,15 +755,19 @@
     // Unused loop, just skip it.
     if (!packingLoops.contains(forOp))
       continue;
+
     auto clonedForOp =
-        b.create<scf::ForOp>(loc, forOp.lowerBound(), forOp.upperBound(),
-                             forOp.step(), packedTensor);
+        b.create<scf::ForOp>(loc, bvm.lookupOrDefault(forOp.lowerBound()),
+                             bvm.lookupOrDefault(forOp.upperBound()),
+                             bvm.lookupOrDefault(forOp.step()), packedTensor);
+
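+    // Eagerly map the induction variable: ops cloned below, including the
+    // bounds of nested loops, must resolve to the cloned IV.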
+    bvm.map(forOp.getInductionVar(), clonedForOp.getInductionVar());
     assert(clonedForOp->getNumRegions() == 1);
     clonedLoopIvs.push_back(clonedForOp.getInductionVar());
+
     b.setInsertionPointToStart(&clonedForOp->getRegion(0).front());
-    leadingPackedTensorIndexings.push_back(
-        buildLoopIterationCount(b, clonedForOp));
-    bvm.map(forOp.getInductionVar(), clonedLoopIvs.back());
+    leadingPackedTensorIndexings.push_back(buildLoopIterationCount(
+        b, cast<scf::ForOp>(outermostEnclosingForOp), clonedForOp));
     packedTensor = clonedForOp.getRegionIterArgs().front();
   }
@@ -716,7 +805,9 @@
   b.setInsertionPoint(padTensorOp);
   SmallVector<Value> loopIterationCounts =
       llvm::to_vector<4>(llvm::map_range(packingLoops, [&](Operation *loop) {
-        return buildLoopIterationCount(b, cast<scf::ForOp>(loop));
+        return buildLoopIterationCount(
+            b, cast<scf::ForOp>(outermostEnclosingForOp),
+            cast<scf::ForOp>(loop));
       }));
   // offsets = [originalLoopIvs, 0 .. 0].
   offsets.assign(loopIterationCounts.begin(), loopIterationCounts.end());
diff --git a/mlir/test/Dialect/Linalg/hoist-padding.mlir b/mlir/test/Dialect/Linalg/hoist-padding.mlir
--- a/mlir/test/Dialect/Linalg/hoist-padding.mlir
+++ b/mlir/test/Dialect/Linalg/hoist-padding.mlir
@@ -1,16 +1,15 @@
-// RUN: mlir-opt %s -test-linalg-transform-patterns=test-hoist-padding-2-level -canonicalize | FileCheck %s
+// RUN: mlir-opt %s -split-input-file -test-linalg-transform-patterns=test-hoist-padding-2-level -canonicalize | FileCheck %s
+
+// CHECK-DAG: #[[$DIV3:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 3)>
+// CHECK-DAG: #[[$DIV4:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 4)>
+// CHECK-DAG: #[[$DIVS3:[0-9a-z]+]] = affine_map<()[s0] -> (s0 ceildiv 3)>
+// CHECK-DAG: #[[$DIVS4:[0-9a-z]+]] = affine_map<()[s0] -> (s0 ceildiv 4)>
 #map0 = affine_map<(d0)[s0] -> (2, -d0 + s0)>
 #map1 = affine_map<(d0)[s0] -> (4, -d0 + s0)>
 #map2 = affine_map<(d0)[s0] -> (3, -d0 + s0)>
 #map3 = affine_map<(d0, d1) -> (2, d0 - d1)>
 #map4 = affine_map<(d0, d1) -> (3, d0 - d1)>
-// CHECK-DAG: #[[$DIV3:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 3)>
-// CHECK-DAG: #[[$DIV4:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 4)>
-// CHECK-DAG: #[[$DIVS3:[0-9a-z]+]] = affine_map<()[s0] -> (s0 ceildiv 3)>
-// CHECK-DAG: #[[$DIVS4:[0-9a-z]+]] = affine_map<()[s0] -> (s0 ceildiv 4)>
-
 // CHECK-LABEL: func @matmul_tensors
 // CHECK-SAME:    %[[TA:[0-9a-z]+]]: tensor<?x?xf32>
 // CHECK-SAME:    %[[TB:[0-9a-z]+]]: tensor<?x?xf32>
@@ -129,3 +128,92 @@
   }
   return %3 : tensor<?x?xf32>
 }
+
+// -----
+
+// CHECK-DAG: #[[$MIN_REST8:[0-9a-z]+]] = affine_map<(d0)[s0] -> (8, -d0 + s0)>
+// CHECK-DAG: #[[$MIN_MOD4:[0-9a-z]+]] = affine_map<(d0) -> (4, d0 - ((d0 - 1) floordiv 4) * 4)>
+// CHECK-DAG: #[[$DIV4:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 4)>
+// CHECK-DAG: #[[$DIV2:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 2)>
+#map0 = affine_map<(d0)[s0] -> (8, -d0 + s0)>
+#map1 = affine_map<(d0, d1) -> (4, d0 - d1)>
+#map2 = affine_map<(d0, d1) -> (2, d0 - d1)>
+
+// CHECK-LABEL: func @dot
+func @dot(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>, %arg2: tensor<f32>)
+    -> tensor<f32>
+{
+  %c8 = constant 8 : index
+  %c4 = constant 4 : index
+  %cst = constant 0.000000e+00 : f32
+  %c2 = constant 2 : index
+  %c0 = constant 0 : index
+  %1 = memref.dim %arg0, %c0 : tensor<?xf32>
+  %2 = memref.dim %arg0, %c0 : tensor<?xf32>
+  %3 = memref.dim %arg1, %c0 : tensor<?xf32>
+
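+  // The two linalg.pad_tensor ops are hoisted above the packing loops:
+  // each packed tensor is allocated and filled once per outer iteration,
+  // then only read from inside the compute loops.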
+  // CHECK: scf.for %[[I:[0-9a-z]+]] =
+  //
+  // CHECK: %[[MR8:.*]] = affine.min #[[$MIN_REST8]](%[[I]])
+  // CHECK: %[[D0:.*]] = affine.apply #[[$DIV4]](%[[MR8]])
+  // CHECK: %[[MM4:.*]] = affine.min #[[$MIN_MOD4]](%[[MR8]])
+  // CHECK: %[[D1:.*]] = affine.apply #[[$DIV2]](%[[MM4]])
+  // Init tensor and pack.
+  // CHECK: %[[INIT_PACKED_A:.*]] = linalg.init_tensor [%[[D0]], %[[D1]], 2] : tensor<?x?x2xf32>
+  // CHECK: %[[PACKED_A:.*]] = scf.for %[[II:[0-9a-z]+]] = {{.*}} iter_args(%{{.*}} = %[[INIT_PACKED_A]]) -> (tensor<?x?x2xf32>) {
+  // CHECK: scf.for %[[III:[0-9a-z]+]] =
+  // CHECK: subtensor_insert %{{.*}} into %{{.*}}[%{{.*}}, %{{.*}}, 0] [1, 1, 2] [1, 1, 1] : tensor<2xf32> into tensor<?x?x2xf32>
+  //
+  // CHECK: %[[D0_2:.*]] = affine.apply #[[$DIV4]](%[[MR8]])
+  // CHECK: %[[MM4_2:.*]] = affine.min #[[$MIN_MOD4]](%[[MR8]])
+  // CHECK: %[[D1_2:.*]] = affine.apply #[[$DIV2]](%[[MM4_2]])
+  // Init tensor and pack.
+  // CHECK: %[[INIT_PACKED_B:.*]] = linalg.init_tensor [%[[D0_2]], %[[D1_2]], 2] : tensor<?x?x2xf32>
+  // CHECK: %[[PACKED_B:.*]] = scf.for %[[II_2:[0-9a-z]+]] = {{.*}} iter_args(%{{.*}} = %[[INIT_PACKED_B]]) -> (tensor<?x?x2xf32>) {
+  // CHECK: scf.for %[[III_2:[0-9a-z]+]] =
+  // CHECK: subtensor_insert %{{.*}} into %{{.*}}[%{{.*}}, %{{.*}}, 0] [1, 1, 2] [1, 1, 1] : tensor<2xf32> into tensor<?x?x2xf32>
+  // Compute.
+  // CHECK: scf.for %[[II_3:[0-9a-z]+]] =
+  // CHECK: scf.for %[[III_3:[0-9a-z]+]] = {{.*}} iter_args(%[[C:.*]] = %{{.*}}) -> (tensor<f32>) {
+  // CHECK: %[[IDX0:.*]] = affine.apply #[[$DIV4]](%[[II_3]])
+  // CHECK: %[[IDX1:.*]] = affine.apply #[[$DIV2]](%[[III_3]])
+  // CHECK: %[[A:.*]] = subtensor %[[PACKED_A]][%[[IDX0]], %[[IDX1]], 0] [1, 1, 2] [1, 1, 1] : tensor<?x?x2xf32> to tensor<2xf32>
+  // CHECK: %[[IDX0_2:.*]] = affine.apply #[[$DIV4]](%[[II_3]])
+  // CHECK: %[[IDX1_2:.*]] = affine.apply #[[$DIV2]](%[[III_3]])
+  // CHECK: %[[B:.*]] = subtensor %[[PACKED_B]][%[[IDX0_2]], %[[IDX1_2]], 0] [1, 1, 2] [1, 1, 1] : tensor<?x?x2xf32> to tensor<2xf32>
+  // CHECK: linalg.dot ins(%[[A]], %[[B]] : tensor<2xf32>, tensor<2xf32>) outs(%[[C]] : tensor<f32>) -> tensor<f32>
+
+  %4 = scf.for %arg3 = %c0 to %1 step %c8 iter_args(%arg4 = %arg2) -> (tensor<f32>) {
+    %5 = affine.min #map0(%arg3)[%2]
+    %6 = subtensor %arg0[%arg3] [%5] [1] : tensor<?xf32> to tensor<?xf32>
+    %7 = affine.min #map0(%arg3)[%3]
+    %8 = subtensor %arg1[%arg3] [%7] [1] : tensor<?xf32> to tensor<?xf32>
+    %9 = scf.for %arg5 = %c0 to %5 step %c4 iter_args(%arg6 = %arg4) -> (tensor<f32>) {
+      %10 = affine.min #map1(%5, %arg5)
+      %11 = subtensor %6[%arg5] [%10] [1] : tensor<?xf32> to tensor<?xf32>
+      %12 = affine.min #map1(%7, %arg5)
+      %13 = subtensor %8[%arg5] [%12] [1] : tensor<?xf32> to tensor<?xf32>
+      %14 = scf.for %arg7 = %c0 to %10 step %c2 iter_args(%arg8 = %arg6) -> (tensor<f32>) {
+        %15 = affine.min #map2(%10, %arg7)
+        %16 = subtensor %11[%arg7] [%15] [1] : tensor<?xf32> to tensor<?xf32>
+        %17 = affine.min #map2(%12, %arg7)
+        %18 = subtensor %13[%arg7] [%17] [1] : tensor<?xf32> to tensor<?xf32>
+        %19 = subi %c2, %15 : index
+        %20 = linalg.pad_tensor %16 low[%c0] high[%19] {
+        ^bb0(%arg9: index):  // no predecessors
+          linalg.yield %cst : f32
+        } : tensor<?xf32> to tensor<2xf32>
+        %21 = subi %c2, %17 : index
+        %22 = linalg.pad_tensor %18 low[%c0] high[%21] {
+        ^bb0(%arg9: index):  // no predecessors
+          linalg.yield %cst : f32
+        } : tensor<?xf32> to tensor<2xf32>
+        %23 = linalg.dot ins(%20, %22 : tensor<2xf32>, tensor<2xf32>) outs(%arg8 : tensor<f32>) -> tensor<f32>
+        scf.yield %23 : tensor<f32>
+      }
+      scf.yield %14 : tensor<f32>
+    }
+    scf.yield %9 : tensor<f32>
+  }
+  return %4 : tensor<f32>
+}