diff --git a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
--- a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
+++ b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
@@ -75,6 +75,11 @@
     OpBuilder<(ins "AffineMap":$map, "ValueRange":$mapOperands),
     [{
       build($_builder, $_state, $_builder.getIndexType(), map, mapOperands);
+    }]>,
+    OpBuilder<(ins "ArrayRef<AffineExpr> ":$exprList, "ValueRange":$mapOperands),
+    [{
+      build($_builder, $_state, $_builder.getIndexType(),
+            AffineMap::inferFromExprList(exprList).front(), mapOperands);
     }]>
   ];
 
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
@@ -13,7 +13,9 @@
 #include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
 
 #include "mlir/Analysis/SliceAnalysis.h"
+#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Linalg/IR/LinalgOps.h"
+#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
 #include "mlir/Dialect/SCF/SCF.h"
 #include "mlir/Dialect/SCF/Utils.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
@@ -503,6 +505,134 @@
   }
 }
 
+/// Return true if `v` is a value that is only transitively defined by ops of
+/// type in `OpTypeList`.
+template <typename... OpTypeList>
+static bool backwardsSliceOnlyHasOpsOfType(scf::ForOp outerLimit, Value v) {
+  // Compute a backward slice up to, but not including, `outerLimit`.
+  llvm::SetVector<Operation *> backwardSlice;
+  getBackwardSlice(v, &backwardSlice, [&](Operation *op) {
+    return outerLimit->isProperAncestor(op);
+  });
+  // Traverse the backward slice and ensure we can perform the computation to
+  // hoist.
+  for (Operation *op : backwardSlice) {
+    if (isa<OpTypeList...>(op))
+      continue;
+    LLVM_DEBUG(DBGS() << "Abort: unadmissible op in slice " << *op << "\n");
+    return false;
+  }
+  return true;
+}
+
+bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v) {
+  return outer.isDefinedOutsideOfLoop(v) || v.getDefiningOp<ConstantOp>();
+}
+
+/// Compute the tightest lower bound with quantities that are all defined
+/// outside of `outer`.
+/// Return null if such a bound cannot be computed.
+Value computeLoopIndependentLowerBound(OpBuilder &b, scf::ForOp outer,
+                                       Value v) {
+  if (isDefinedOutsideOrConstant(outer, v))
+    return v;
+  return Value();
+}
+
+/// Compute the tightest upper bound with quantities that are all defined
+/// outside of `outer`.
+/// Expects all ops in the backward slice of `v` up to `outer` to be either
+/// scf.for, affine.min or affine.apply.
+static Value computeLoopIndependentUpperBound(OpBuilder &b, scf::ForOp outer,
+                                              Value v) {
+  if (isDefinedOutsideOrConstant(outer, v))
+    return v;
+
+  LLVM_DEBUG(DBGS() << "Begin loopIndependentUpperBound for: " << v << "\n");
+
+  bool ok =
+      backwardsSliceOnlyHasOpsOfType<scf::ForOp, AffineMinOp, AffineApplyOp>(
+          outer, v);
+  assert(ok && "expected to only be defined by scf::ForOp and AffineMinOp");
+
+  // Compute a backward slice up to, but not including, `outer`.
+  llvm::SetVector<Operation *> backwardSlice;
+  getBackwardSlice(v, &backwardSlice,
+                   [&](Operation *op) { return outer->isProperAncestor(op); });
+  backwardSlice.insert(v.getDefiningOp());
+
+  OpBuilder::InsertionGuard g(b);
+  b.setInsertionPoint(outer);
+  Value res = v;
+  BlockAndValueMapping bvm;
+  for (Operation *op : backwardSlice) {
+    if (isa<scf::ForOp>(op))
+      continue;
+    if (isa<AffineApplyOp>(op)) {
+      b.clone(*op, bvm);
+      continue;
+    }
+    auto sliceMinOp = cast<AffineMinOp>(op);
+    // Perform the substitution of the operands of AffineMinOp.
+    auto mapAndOperands = substituteMin(
+        sliceMinOp, [&](Operation *op) { return outer->isAncestor(op); });
+    SmallVector<Value> resultOperands = mapAndOperands.dims;
+    llvm::append_range(resultOperands, mapAndOperands.symbols);
+    AffineMap map = mapAndOperands.map;
+    canonicalizeMapAndOperands(&map, &resultOperands);
+    map = simplifyAffineMap(map);
+    res = b.create<AffineApplyOp>(
+        outer->getLoc(), map,
+        llvm::to_vector<4>(llvm::map_range(resultOperands, [&](Value operand) {
+          return bvm.lookupOrDefault(operand);
+        })));
+    bvm.map(sliceMinOp, res);
+  }
+  LLVM_DEBUG(DBGS() << "End loopIndependentUpperBound with: " << res << "\n");
+  return res;
+}
+
+/// Return the number of iterations in the loop (ub - lb).ceilDiv(step).
+/// The returned Value is guaranteed not to depend on any loop comprised in
+/// [`outer`, `forOp`].
+/// Return null if such a loop-independent quantity cannot be computed.
+static Value buildLoopTripCount(OpBuilder &b, scf::ForOp outer,
+                                scf::ForOp forOp) {
+  MLIRContext *ctx = forOp->getContext();
+  AffineExpr lb, ub, step;
+  bindDims(ctx, lb, ub);
+  bindSymbols(ctx, step);
+  Value lbVal = computeLoopIndependentLowerBound(b, outer, forOp.lowerBound()),
+        ubVal = computeLoopIndependentUpperBound(b, outer, forOp.upperBound()),
+        stepVal = forOp.step();
+  if (!lbVal || !ubVal || !stepVal)
+    return Value();
+  auto loc = forOp->getLoc();
+  Value res = b.create<AffineApplyOp>(loc, (ub - lb).ceilDiv(step),
+                                      ValueRange{lbVal, ubVal, stepVal});
+  return res;
+}
+
+/// Return the current iteration number in the loop (iv - lb).ceilDiv(step).
+/// The returned Value is guaranteed not to depend on any loop comprised in
+/// [`outer`, `forOp`].
+/// Return null if such a loop-independent quantity cannot be computed.
+static Value buildLoopIterationCount(OpBuilder &b, scf::ForOp outer,
+                                     scf::ForOp forOp) {
+  MLIRContext *ctx = forOp->getContext();
+  AffineExpr iv, lb, step;
+  bindDims(ctx, iv, lb);
+  bindSymbols(ctx, step);
+  Value ivVal = forOp.getInductionVar(),
+        lbVal = computeLoopIndependentLowerBound(b, outer, forOp.lowerBound()),
+        stepVal = forOp.step();
+  if (!ivVal || !lbVal || !stepVal)
+    return Value();
+  auto loc = forOp->getLoc();
+  return b.create<AffineApplyOp>(loc, (iv - lb).ceilDiv(step),
+                                 ValueRange{ivVal, lbVal, stepVal});
+}
+
 /// Ensure prerequisites that guarantee pad op hoisting can occur.
 /// Return failure in the cases when we cannot perform hoisting; i.e. if either:
 /// 1. There exists a use of `padTensorOp` that is not a linalg input operand.
@@ -510,8 +640,10 @@
 /// 3. There exists an op with a region that is dominated by
 ///    `outermostEnclosingForOp` and that isn't a LoopLikeInterface or a
 ///    LinalgOp.
-/// 3. There exists an op with side effects that is dominated by
-///    `outermostEnclosingForOp` and that isn't a LoopLikeInterface.
+/// 4. There exists an op with side effects that is dominated by
+///    `outermostEnclosingForOp` and that isn't a LoopLikeInterface.
+/// 5. The lower bound, upper bound and step of all the loops involved in the
+///    hoisting can be computed outside of the outermost enclosing loop.
 ///
 /// While ensuring prerequisites:
 /// 1. Fill the `backwardSlice` to contain the topologically sorted ops
@@ -523,7 +655,8 @@
 static LogicalResult
 hoistPaddingOnTensorsPrerequisites(linalg::PadTensorOp padTensorOp, int nLevels,
                                    llvm::SetVector<Operation *> &backwardSlice,
-                                   llvm::SetVector<Operation *> &packingLoops) {
+                                   llvm::SetVector<Operation *> &packingLoops,
+                                   SmallVector<Value> &dynamicTensorSizes) {
   // Bail on any use that isn't an input of a Linalg op.
   // Hoisting of inplace updates happens after vectorization.
   for (OpOperand &use : padTensorOp.result().getUses()) {
@@ -583,36 +716,39 @@
   // `outermostEnclosingForOp`.
   assert(outermostEnclosingForOp == backwardSlice.front());
 
-  return success();
-}
-
-/// Return the number of iterations in the loop (ub - lb).ceilDiv(step).
-static Value buildLoopTripCount(OpBuilder &b, scf::ForOp forOp) {
-  MLIRContext *ctx = forOp->getContext();
-  AffineExpr lb, ub, step;
-  bindDims(ctx, lb, ub);
-  bindSymbols(ctx, step);
-  return b.create<AffineApplyOp>(
-      forOp->getLoc(), AffineMap::get(2, 1, {(ub - lb).ceilDiv(step)}, ctx),
-      ValueRange{forOp.lowerBound(), forOp.upperBound(), forOp.step()});
-}
+  scf::ForOp outer = cast<scf::ForOp>(outermostEnclosingForOp);
+  if (llvm::any_of(packingLoops, [&](Operation *op) {
+        scf::ForOp forOp = cast<scf::ForOp>(op);
+        Value lb = forOp.lowerBound(), ub = forOp.upperBound(),
+              step = forOp.step();
+        return !isDefinedOutsideOrConstant(outer, lb) ||
+               !(isDefinedOutsideOrConstant(outer, ub) ||
+                 backwardsSliceOnlyHasOpsOfType<scf::ForOp, AffineMinOp,
                                                 AffineApplyOp>(outer, ub)) ||
+               !isDefinedOutsideOrConstant(outer, step);
+      }))
+    return failure();
 
-/// Return the current iteration number in the loop (iv - lb).ceilDiv(step).
-static Value buildLoopIterationCount(OpBuilder &b, scf::ForOp forOp) {
-  MLIRContext *ctx = forOp->getContext();
-  AffineExpr iv, lb, step;
-  bindDims(ctx, iv, lb);
-  bindSymbols(ctx, step);
-  return b.create<AffineApplyOp>(
-      forOp->getLoc(), AffineMap::get(2, 1, {(iv - lb).ceilDiv(step)}, ctx),
-      ValueRange{forOp.getInductionVar(), forOp.lowerBound(), forOp.step()});
+  // IP just before the outermost loop considered that we hoist above.
+  OpBuilder b(outermostEnclosingForOp);
+  dynamicTensorSizes =
+      llvm::to_vector<4>(llvm::map_range(packingLoops, [&](Operation *op) {
+        return buildLoopTripCount(b, cast<scf::ForOp>(outermostEnclosingForOp),
+                                  cast<scf::ForOp>(op));
+      }));
+  // Assert all loop trip counts can be computed.
+  if (!llvm::all_of(dynamicTensorSizes, [](Value v) { return v; }))
+    llvm_unreachable("loop independence prerequisite not met");
+  return success();
 }
 
 LogicalResult mlir::linalg::hoistPaddingOnTensors(PadTensorOp &padTensorOp,
                                                   unsigned nLoops) {
+  SmallVector<Value> dynamicTensorSizes;
   llvm::SetVector<Operation *> backwardSlice, packingLoops;
   if (failed(hoistPaddingOnTensorsPrerequisites(padTensorOp, nLoops,
-                                                backwardSlice, packingLoops)))
+                                                backwardSlice, packingLoops,
+                                                dynamicTensorSizes)))
     return failure();
 
   // Update actual number of loops, which may be smaller.
@@ -636,12 +772,8 @@
   llvm::append_range(packedShape, paddedTensorType.getShape());
   auto packedTensorType =
       RankedTensorType::get(packedShape, paddedTensorType.getElementType());
-  auto dynamicSizes =
-      llvm::to_vector<4>(llvm::map_range(packingLoops, [&](Operation *op) {
-        return buildLoopTripCount(b, cast<scf::ForOp>(op));
-      }));
   Value packedTensor = b.create<linalg::InitTensorOp>(
-      loc, dynamicSizes, packedTensorType.getShape(),
+      loc, dynamicTensorSizes, packedTensorType.getShape(),
       packedTensorType.getElementType());
 
   // Clone the operations involved in the backward slice, iteratively stepping
@@ -656,9 +788,9 @@
   clonedLoopIvs.reserve(nLoops);
   leadingPackedTensorIndexings.reserve(nLoops);
   BlockAndValueMapping bvm;
-  // Stack step 1. iteratively clone loops and push `packedTensor`.
   // Insert `padTensorOp` into the backwardSlice so we clone it too.
   backwardSlice.insert(padTensorOp);
+  // Stack step 1. iteratively clone loops and push `packedTensor`.
   for (Operation *op : backwardSlice) {
     if (op->getNumRegions() == 0 || isa<linalg::PadTensorOp>(op)) {
       b.clone(*op, bvm);
@@ -670,15 +802,23 @@
       // Unused loop, just skip it.
       if (!packingLoops.contains(forOp))
         continue;
+
       auto clonedForOp =
-          b.create<scf::ForOp>(loc, forOp.lowerBound(), forOp.upperBound(),
-                               forOp.step(), packedTensor);
+          b.create<scf::ForOp>(loc, bvm.lookupOrDefault(forOp.lowerBound()),
+                               bvm.lookupOrDefault(forOp.upperBound()),
+                               bvm.lookupOrDefault(forOp.step()), packedTensor);
+
+      bvm.map(forOp.getInductionVar(), clonedForOp.getInductionVar());
       assert(clonedForOp->getNumRegions() == 1);
       clonedLoopIvs.push_back(clonedForOp.getInductionVar());
+
       b.setInsertionPointToStart(&clonedForOp->getRegion(0).front());
-      leadingPackedTensorIndexings.push_back(
-          buildLoopIterationCount(b, clonedForOp));
-      bvm.map(forOp.getInductionVar(), clonedLoopIvs.back());
+      Value loopIndependentIterationCount = buildLoopIterationCount(
+          b, cast<scf::ForOp>(outermostEnclosingForOp), clonedForOp);
+      // Assert the loop-independent iteration count can be computed.
+      if (!loopIndependentIterationCount)
+        llvm_unreachable("loop independence prerequisite not met");
+      leadingPackedTensorIndexings.push_back(loopIndependentIterationCount);
       packedTensor = clonedForOp.getRegionIterArgs().front();
     }
 
@@ -716,8 +856,13 @@
   b.setInsertionPoint(padTensorOp);
   SmallVector<Value> loopIterationCounts =
       llvm::to_vector<4>(llvm::map_range(packingLoops, [&](Operation *loop) {
-        return buildLoopIterationCount(b, cast<scf::ForOp>(loop));
+        return buildLoopIterationCount(
+            b, cast<scf::ForOp>(outermostEnclosingForOp),
+            cast<scf::ForOp>(loop));
       }));
+  // Assert all loop iteration counts can be computed.
+  if (llvm::any_of(loopIterationCounts, [](Value v) { return !v; }))
+    llvm_unreachable("loop independence prerequisite not met");
   // offsets = [originalLoopIvs, 0 .. 0].
   offsets.assign(loopIterationCounts.begin(), loopIterationCounts.end());
   offsets.append(paddedRank, b.getIndexAttr(0));
diff --git a/mlir/test/Dialect/Linalg/hoist-padding.mlir b/mlir/test/Dialect/Linalg/hoist-padding.mlir
--- a/mlir/test/Dialect/Linalg/hoist-padding.mlir
+++ b/mlir/test/Dialect/Linalg/hoist-padding.mlir
@@ -1,16 +1,15 @@
-// RUN: mlir-opt %s -test-linalg-transform-patterns=test-hoist-padding-2-level -canonicalize | FileCheck %s
+// RUN: mlir-opt %s -split-input-file -test-linalg-transform-patterns=test-hoist-padding-2-level -canonicalize | FileCheck %s
 
+// CHECK-DAG: #[[$DIV3:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 3)>
+// CHECK-DAG: #[[$DIV4:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 4)>
+// CHECK-DAG: #[[$DIVS3:[0-9a-z]+]] = affine_map<()[s0] -> (s0 ceildiv 3)>
+// CHECK-DAG: #[[$DIVS4:[0-9a-z]+]] = affine_map<()[s0] -> (s0 ceildiv 4)>
 #map0 = affine_map<(d0)[s0] -> (2, -d0 + s0)>
 #map1 = affine_map<(d0)[s0] -> (4, -d0 + s0)>
 #map2 = affine_map<(d0)[s0] -> (3, -d0 + s0)>
 #map3 = affine_map<(d0, d1) -> (2, d0 - d1)>
 #map4 = affine_map<(d0, d1) -> (3, d0 - d1)>
 
-// CHECK-DAG: #[[$DIV3:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 3)>
-// CHECK-DAG: #[[$DIV4:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 4)>
-// CHECK-DAG: #[[$DIVS3:[0-9a-z]+]] = affine_map<()[s0] -> (s0 ceildiv 3)>
-// CHECK-DAG: #[[$DIVS4:[0-9a-z]+]] = affine_map<()[s0] -> (s0 ceildiv 4)>
-
 // CHECK-LABEL: func @matmul_tensors
 //  CHECK-SAME:   %[[TA:[0-9a-z]+]]: tensor<?x?xf32>
 //  CHECK-SAME:   %[[TB:[0-9a-z]+]]: tensor<?x?xf32>
@@ -129,3 +128,92 @@
   }
   return %3 : tensor<?x?xf32>
 }
+
+// -----
+
+// CHECK-DAG: #[[$MIN_REST8:[0-9a-z]+]] = affine_map<(d0)[s0] -> (8, -d0 + s0)>
+// CHECK-DAG: #[[$MIN_MOD4:[0-9a-z]+]] = affine_map<(d0) -> (4, d0 - ((d0 - 1) floordiv 4) * 4)>
+// CHECK-DAG: #[[$DIV4:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 4)>
+// CHECK-DAG: #[[$DIV2:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 2)>
+#map0 = affine_map<(d0)[s0] -> (8, -d0 + s0)>
+#map1 = affine_map<(d0, d1) -> (4, d0 - d1)>
+#map2 = affine_map<(d0, d1) -> (2, d0 - d1)>
+
+// CHECK-LABEL: func @dot
+func @dot(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>, %arg2: tensor<f32>)
+    -> tensor<f32>
+{
+  %c8 = constant 8 : index
+  %c4 = constant 4 : index
+  %cst = constant 0.000000e+00 : f32
+  %c2 = constant 2 : index
+  %c0 = constant 0 : index
+  %1 = memref.dim %arg0, %c0 : tensor<?xf32>
+  %2 = memref.dim %arg0, %c0 : tensor<?xf32>
+  %3 = memref.dim %arg1, %c0 : tensor<?xf32>
+
+  // CHECK: scf.for %[[I:[0-9a-z]+]] =
+  //
+  // CHECK:   %[[MR8:.*]] = affine.min #[[$MIN_REST8]](%[[I]])
+  // CHECK:   %[[D0:.*]] = affine.apply #[[$DIV4]](%[[MR8]])
+  // CHECK:   %[[MM4:.*]] = affine.min #[[$MIN_MOD4]](%[[MR8]])
+  // CHECK:   %[[D1:.*]] = affine.apply #[[$DIV2]](%[[MM4]])
+  // Init tensor and pack.
+  // CHECK:   %[[INIT_PACKED_A:.*]] = linalg.init_tensor [%[[D0]], %[[D1]], 2] : tensor<?x?x2xf32>
+  // CHECK:   %[[PACKED_A:.*]] = scf.for %[[II:[0-9a-z]+]] = {{.*}} iter_args(%{{.*}} = %[[INIT_PACKED_A]]) -> (tensor<?x?x2xf32>) {
+  // CHECK:     scf.for %[[III:[0-9a-z]+]] =
+  // CHECK:       subtensor_insert %{{.*}} into %{{.*}}[%{{.*}}, %{{.*}}, 0] [1, 1, 2] [1, 1, 1] : tensor<2xf32> into tensor<?x?x2xf32>
+  //
+  // CHECK:   %[[D0_2:.*]] = affine.apply #[[$DIV4]](%[[MR8]])
+  // CHECK:   %[[MM4_2:.*]] = affine.min #[[$MIN_MOD4]](%[[MR8]])
+  // CHECK:   %[[D1_2:.*]] = affine.apply #[[$DIV2]](%[[MM4_2]])
+  // Init tensor and pack.
+  // CHECK:   %[[INIT_PACKED_B:.*]] = linalg.init_tensor [%[[D0_2]], %[[D1_2]], 2] : tensor<?x?x2xf32>
+  // CHECK:   %[[PACKED_B:.*]] = scf.for %[[II_2:[0-9a-z]+]] = {{.*}} iter_args(%{{.*}} = %[[INIT_PACKED_B]]) -> (tensor<?x?x2xf32>) {
+  // CHECK:     scf.for %[[III_2:[0-9a-z]+]] =
+  // CHECK:       subtensor_insert %{{.*}} into %{{.*}}[%{{.*}}, %{{.*}}, 0] [1, 1, 2] [1, 1, 1] : tensor<2xf32> into tensor<?x?x2xf32>
+  // Compute.
+  // CHECK:   scf.for %[[II_3:[0-9a-z]+]] =
+  // CHECK:     scf.for %[[III_3:[0-9a-z]+]] = {{.*}} iter_args(%[[C:.*]] = %{{.*}}) -> (tensor<f32>) {
+  // CHECK:       %[[IDX0:.*]] = affine.apply #[[$DIV4]](%[[II_3]])
+  // CHECK:       %[[IDX1:.*]] = affine.apply #[[$DIV2]](%[[III_3]])
+  // CHECK:       %[[A:.*]] = subtensor %[[PACKED_A]][%[[IDX0]], %[[IDX1]], 0] [1, 1, 2] [1, 1, 1] : tensor<?x?x2xf32> to tensor<2xf32>
+  // CHECK:       %[[IDX0_2:.*]] = affine.apply #[[$DIV4]](%[[II_3]])
+  // CHECK:       %[[IDX1_2:.*]] = affine.apply #[[$DIV2]](%[[III_3]])
+  // CHECK:       %[[B:.*]] = subtensor %[[PACKED_B]][%[[IDX0_2]], %[[IDX1_2]], 0] [1, 1, 2] [1, 1, 1] : tensor<?x?x2xf32> to tensor<2xf32>
+  // CHECK:       linalg.dot ins(%[[A]], %[[B]] : tensor<2xf32>, tensor<2xf32>) outs(%[[C]] : tensor<f32>) -> tensor<f32>
+
+  %4 = scf.for %arg3 = %c0 to %1 step %c8 iter_args(%arg4 = %arg2) -> (tensor<f32>) {
+    %5 = affine.min #map0(%arg3)[%2]
+    %6 = subtensor %arg0[%arg3] [%5] [1] : tensor<?xf32> to tensor<?xf32>
+    %7 = affine.min #map0(%arg3)[%3]
+    %8 = subtensor %arg1[%arg3] [%7] [1] : tensor<?xf32> to tensor<?xf32>
+    %9 = scf.for %arg5 = %c0 to %5 step %c4 iter_args(%arg6 = %arg4) -> (tensor<f32>) {
+      %10 = affine.min #map1(%5, %arg5)
+      %11 = subtensor %6[%arg5] [%10] [1] : tensor<?xf32> to tensor<?xf32>
+      %12 = affine.min #map1(%7, %arg5)
+      %13 = subtensor %8[%arg5] [%12] [1] : tensor<?xf32> to tensor<?xf32>
+      %14 = scf.for %arg7 = %c0 to %10 step %c2 iter_args(%arg8 = %arg6) -> (tensor<f32>) {
+        %15 = affine.min #map2(%10, %arg7)
+        %16 = subtensor %11[%arg7] [%15] [1] : tensor<?xf32> to tensor<?xf32>
+        %17 = affine.min #map2(%12, %arg7)
+        %18 = subtensor %13[%arg7] [%17] [1] : tensor<?xf32> to tensor<?xf32>
+        %19 = subi %c2, %15 : index
+        %20 = linalg.pad_tensor %16 low[%c0] high[%19] {
+        ^bb0(%arg9: index):  // no predecessors
+          linalg.yield %cst : f32
+        } : tensor<?xf32> to tensor<2xf32>
+        %21 = subi %c2, %17 : index
+        %22 = linalg.pad_tensor %18 low[%c0] high[%21] {
+        ^bb0(%arg9: index):  // no predecessors
+          linalg.yield %cst : f32
+        } : tensor<?xf32> to tensor<2xf32>
+        %23 = linalg.dot ins(%20, %22 : tensor<2xf32>, tensor<2xf32>) outs(%arg8 : tensor<f32>) -> tensor<f32>
+        scf.yield %23 : tensor<f32>
+      }
+      scf.yield %14 : tensor<f32>
+    }
+    scf.yield %9 : tensor<f32>
+  }
+  return %4 : tensor<f32>
+}
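
Note (illustrative sketch, not part of the patch): the `ArrayRef<AffineExpr>` builder added to AffineOps.td above lets callers build an `affine.apply` directly from an expression, as the new `buildLoopTripCount`/`buildLoopIterationCount` helpers do. The standalone helper below only demonstrates that builder for a trip count `(ub - lb) ceildiv step`; the function name `buildTripCountSketch` is hypothetical and the snippet assumes the API vintage of this patch (e.g. `scf::ForOp::lowerBound()`).

  #include "mlir/Dialect/Affine/IR/AffineOps.h"
  #include "mlir/Dialect/SCF/SCF.h"

  using namespace mlir;

  // Materialize `(ub - lb) ceildiv step` as an affine.apply right before
  // `forOp`, using the ArrayRef<AffineExpr> builder introduced in this patch.
  static Value buildTripCountSketch(OpBuilder &b, scf::ForOp forOp) {
    MLIRContext *ctx = forOp->getContext();
    AffineExpr lb, ub, step;
    bindDims(ctx, lb, ub);   // lb -> d0, ub -> d1.
    bindSymbols(ctx, step);  // step -> s0.
    OpBuilder::InsertionGuard g(b);
    b.setInsertionPoint(forOp);
    // A single AffineExpr converts to a one-element ArrayRef<AffineExpr>; the
    // builder infers the map (d0, d1)[s0] -> ((d1 - d0) ceildiv s0).
    return b.create<AffineApplyOp>(
        forOp.getLoc(), (ub - lb).ceilDiv(step),
        ValueRange{forOp.lowerBound(), forOp.upperBound(), forOp.step()});
  }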