diff --git a/mlir/include/mlir/Dialect/SCF/Transforms.h b/mlir/include/mlir/Dialect/SCF/Transforms.h
--- a/mlir/include/mlir/Dialect/SCF/Transforms.h
+++ b/mlir/include/mlir/Dialect/SCF/Transforms.h
@@ -68,8 +68,8 @@
 /// Rewrite a for loop with bounds/step that potentially do not divide evenly
 /// into a for loop where the step divides the iteration space evenly, followed
-/// by an scf.if for the last (partial) iteration (if any; returned via `ifOp`).
-/// This transformation is called "loop peeling".
+/// by another scf.for for the last (partial) iteration (if any; returned via
+/// `partialIteration`). This transformation is called "loop peeling".
 ///
 /// This transformation is beneficial for a wide range of transformations such
 /// as vectorization or loop tiling: It enables additional canonicalizations
@@ -88,16 +88,16 @@
 ///   scf.for %iv = %c0 to %newUb step %c4 {
 ///     (loop body)
 ///   }
-///   scf.if %newUb < %ub {
+///   scf.for %iv2 = %newUb to %ub {
 ///     (loop body)
 ///   }
 /// ```
 ///
 /// After loop peeling, this function tries to simplify/canonicalize affine.min
-/// and affine.max ops in the body of the loop and the scf.if, taking advantage
-/// of the fact that the peeled loop has only "full" iterations and the scf.if
-/// is always a partial iteration (if any). This canonicalization is expected to
-/// enable further canonicalization opportunities through other patterns.
+/// and affine.max ops in the body of the peeled loop and in the body of the
+/// partial iteration loop, taking advantage of the fact that the peeled loop
+/// has only "full" iterations. This canonicalization is expected to enable
+/// further canonicalization opportunities through other patterns.
 ///
 /// The return value indicates whether the loop was rewritten or not. Loops are
 /// not rewritten if:
@@ -106,10 +106,10 @@
 ///   iteration space evenly.
 ///
 /// Note: This function rewrites the given scf.for loop in-place and creates a
-/// new scf.if operation for the last iteration. It replaces all uses of the
-/// unpeeled loop with the results of the newly generated scf.if.
+/// new scf.for operation for the last iteration. It replaces all uses of the
+/// unpeeled loop with the results of the newly generated scf.for.
 LogicalResult peelAndCanonicalizeForLoop(RewriterBase &rewriter, ForOp forOp,
-                                         scf::IfOp &ifOp);
+                                         scf::ForOp &partialIteration);
 
 /// Try to simplify a min/max operation `op` after loop peeling. This function
 /// can simplify min/max operations such as (ub is the previous upper bound of
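For context, a minimal sketch (not part of the patch) of how a client rewrite pattern might drive the updated entry point. `PeelClientPattern` is a hypothetical name; the actual pass in LoopSpecialization.cpp below additionally attaches label attributes so the same loop is not peeled twice:

```c++
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/SCF/Transforms.h"
#include "mlir/IR/PatternMatch.h"

using namespace mlir;

/// Hypothetical client pattern illustrating the new out-parameter type.
struct PeelClientPattern : public OpRewritePattern<scf::ForOp> {
  using OpRewritePattern<scf::ForOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(scf::ForOp forOp,
                                PatternRewriter &rewriter) const override {
    // Receives the scf.for that runs the (at most one) partial iteration.
    scf::ForOp partialIteration;
    if (failed(scf::peelAndCanonicalizeForLoop(rewriter, forOp,
                                               partialIteration)))
      return failure();
    // `forOp` now has an upper bound that its step divides evenly; all uses
    // of the original loop were replaced with `partialIteration`'s results.
    return success();
  }
};
```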
diff --git a/mlir/lib/Dialect/SCF/Transforms/LoopSpecialization.cpp b/mlir/lib/Dialect/SCF/Transforms/LoopSpecialization.cpp
--- a/mlir/lib/Dialect/SCF/Transforms/LoopSpecialization.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/LoopSpecialization.cpp
@@ -106,8 +106,9 @@
-/// The newly generated scf.if operation is returned via `ifOp`. The boundary
-/// at which the loop is split (new upper bound) is returned via `splitBound`.
-/// The return value indicates whether the loop was rewritten or not.
-static LogicalResult peelForLoop(RewriterBase &b, ForOp forOp, scf::IfOp &ifOp,
-                                 Value &splitBound) {
+/// The newly generated scf.for operation for the partial iteration is
+/// returned via `partialIteration`. The boundary at which the loop is split
+/// (new upper bound) is returned via `splitBound`. The return value indicates
+/// whether the loop was rewritten or not.
+static LogicalResult peelForLoop(RewriterBase &b, ForOp forOp,
+                                 ForOp &partialIteration, Value &splitBound) {
   RewriterBase::InsertionGuard guard(b);
   auto lbInt = getConstantIntValue(forOp.lowerBound());
   auto ubInt = getConstantIntValue(forOp.upperBound());
@@ -130,34 +130,16 @@
       loc, modMap, ValueRange{forOp.lowerBound(), forOp.upperBound(),
                               forOp.step()});
+  // Create ForOp for partial iteration.
+  b.setInsertionPointAfter(forOp);
+  partialIteration = cast<ForOp>(b.clone(*forOp.getOperation()));
+  partialIteration.lowerBoundMutable().assign(splitBound);
+  forOp.replaceAllUsesWith(partialIteration->getResults());
+  partialIteration.initArgsMutable().assign(forOp->getResults());
+
   // Set new upper loop bound.
-  Value previousUb = forOp.upperBound();
   b.updateRootInPlace(
       forOp, [&]() { forOp.upperBoundMutable().assign(splitBound); });
-  b.setInsertionPointAfter(forOp);
-
-  // Do we need one more iteration?
-  Value hasMoreIter =
-      b.create<CmpIOp>(loc, CmpIPredicate::slt, splitBound, previousUb);
-
-  // Create IfOp for last iteration.
-  auto resultTypes = forOp.getResultTypes();
-  ifOp = b.create<scf::IfOp>(loc, resultTypes, hasMoreIter,
-                             /*withElseRegion=*/!resultTypes.empty());
-  forOp.replaceAllUsesWith(ifOp->getResults());
-
-  // Build then case.
-  BlockAndValueMapping bvm;
-  bvm.map(forOp.region().getArgument(0), splitBound);
-  for (auto it : llvm::zip(forOp.getRegionIterArgs(), forOp->getResults())) {
-    bvm.map(std::get<0>(it), std::get<1>(it));
-  }
-  b.cloneRegionBefore(forOp.region(), ifOp.thenRegion(),
-                      ifOp.thenRegion().begin(), bvm);
-  // Build else case.
-  if (!resultTypes.empty())
-    ifOp.getElseBodyBuilder(b.getListener())
-        .create<scf::YieldOp>(loc, forOp->getResults());
 
   return success();
 }
@@ -370,37 +352,43 @@
 }
 
 template <typename OpTy, bool IsMin>
-static void
-rewriteAffineOpAfterPeeling(RewriterBase &rewriter, ForOp forOp, scf::IfOp ifOp,
-                            Value iv, Value splitBound, Value ub, Value step) {
+static void rewriteAffineOpAfterPeeling(RewriterBase &rewriter, ForOp forOp,
+                                        ForOp partialIteration,
+                                        Value previousUb) {
+  Value mainIv = forOp.getInductionVar();
+  Value partialIv = partialIteration.getInductionVar();
+  assert(forOp.step() == partialIteration.step() &&
+         "expected same step in main and partial loop");
+  Value step = forOp.step();
+
   forOp.walk([&](OpTy affineOp) {
     AffineMap map = affineOp.getAffineMap();
     (void)scf::rewritePeeledMinMaxOp(rewriter, affineOp, map,
-                                     affineOp.operands(), IsMin, iv, ub, step,
+                                     affineOp.operands(), IsMin, mainIv,
+                                     previousUb, step,
                                      /*insideLoop=*/true);
   });
-  ifOp.walk([&](OpTy affineOp) {
+  partialIteration.walk([&](OpTy affineOp) {
     AffineMap map = affineOp.getAffineMap();
     (void)scf::rewritePeeledMinMaxOp(rewriter, affineOp, map,
-                                     affineOp.operands(), IsMin, splitBound, ub,
-                                     step, /*insideLoop=*/false);
+                                     affineOp.operands(), IsMin, partialIv,
+                                     previousUb, step, /*insideLoop=*/false);
   });
 }
 
 LogicalResult mlir::scf::peelAndCanonicalizeForLoop(RewriterBase &rewriter,
                                                     ForOp forOp,
-                                                    scf::IfOp &ifOp) {
-  Value ub = forOp.upperBound();
+                                                    ForOp &partialIteration) {
+  Value previousUb = forOp.upperBound();
   Value splitBound;
-  if (failed(peelForLoop(rewriter, forOp, ifOp, splitBound)))
+  if (failed(peelForLoop(rewriter, forOp, partialIteration, splitBound)))
    return failure();
 
   // Rewrite affine.min and affine.max ops.
-  Value iv = forOp.getInductionVar(), step = forOp.step();
   rewriteAffineOpAfterPeeling<AffineMinOp, /*IsMin=*/true>(
-      rewriter, forOp, ifOp, iv, splitBound, ub, step);
+      rewriter, forOp, partialIteration, previousUb);
   rewriteAffineOpAfterPeeling<AffineMaxOp, /*IsMin=*/false>(
-      rewriter, forOp, ifOp, iv, splitBound, ub, step);
+      rewriter, forOp, partialIteration, previousUb);
 
   return success();
 }
@@ -491,23 +479,24 @@
     if (forOp->hasAttr(kPeeledLoopLabel))
       return failure();
     if (skipPartial) {
-      // No peeling of loops inside the partial iteration (scf.if) of another
-      // peeled loop.
+      // No peeling of loops inside the partial iteration of another peeled
+      // loop.
       Operation *op = forOp.getOperation();
-      while ((op = op->getParentOfType<scf::IfOp>())) {
+      while ((op = op->getParentOfType<scf::ForOp>())) {
        if (op->hasAttr(kPartialIterationLabel))
          return failure();
      }
    }
    // Apply loop peeling.
-    scf::IfOp ifOp;
-    if (failed(peelAndCanonicalizeForLoop(rewriter, forOp, ifOp)))
+    scf::ForOp partialIteration;
+    if (failed(peelAndCanonicalizeForLoop(rewriter, forOp, partialIteration)))
      return failure();
    // Apply label, so that the same loop is not rewritten a second time.
+    partialIteration->setAttr(kPeeledLoopLabel, rewriter.getUnitAttr());
    rewriter.updateRootInPlace(forOp, [&]() {
      forOp->setAttr(kPeeledLoopLabel, rewriter.getUnitAttr());
    });
-    ifOp->setAttr(kPartialIterationLabel, rewriter.getUnitAttr());
+    partialIteration->setAttr(kPartialIterationLabel, rewriter.getUnitAttr());
    return success();
  }
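The effect on a loop-carried reduction, sketched in MLIR (value names are illustrative; the FileCheck tests below pin down the exact output). The split bound matches #[[MAP0]] in the tests, i.e. `%ub - ((%ub - %lb) mod %step)`; the partial loop starts where the main loop stops and seeds its iter_args with the main loop's results, which is what the `replaceAllUsesWith` + `initArgsMutable().assign` pair above implements:

```mlir
// Before peeling: (%ub - %lb) need not be a multiple of %step.
%r = scf.for %iv = %lb to %ub step %step
    iter_args(%acc = %init) -> (i32) {
  // loop body
  scf.yield %next : i32
}

// After peeling:
%split = affine.apply
    affine_map<()[s0, s1, s2] -> (s1 - (s1 - s0) mod s2)>()[%lb, %ub, %step]
%r0 = scf.for %iv = %lb to %split step %step
    iter_args(%acc = %init) -> (i32) {
  // loop body: only "full" iterations
  scf.yield %next : i32
}
%r = scf.for %iv2 = %split to %ub step %step
    iter_args(%acc2 = %r0) -> (i32) {
  // loop body: at most one, possibly partial, iteration
  scf.yield %next2 : i32
}
```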
diff --git a/mlir/test/Dialect/SCF/for-loop-peeling.mlir b/mlir/test/Dialect/SCF/for-loop-peeling.mlir
--- a/mlir/test/Dialect/SCF/for-loop-peeling.mlir
+++ b/mlir/test/Dialect/SCF/for-loop-peeling.mlir
@@ -2,7 +2,7 @@
 // RUN: mlir-opt %s -for-loop-peeling=skip-partial=false -canonicalize -split-input-file | FileCheck %s -check-prefix=CHECK-NO-SKIP
 
 // CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0, s1, s2] -> (s1 - (s1 - s0) mod s2)>
-// CHECK-DAG: #[[MAP1:.*]] = affine_map<()[s0, s1, s2] -> (-(s0 - (s0 - s1) mod s2) + s0)>
+// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0)[s0] -> (-d0 + s0)>
 // CHECK: func @fully_dynamic_bounds(
 // CHECK-SAME: %[[LB:.*]]: index, %[[UB:.*]]: index, %[[STEP:.*]]: index
 // CHECK: %[[C0_I32:.*]] = constant 0 : i32
@@ -13,14 +13,12 @@
 // CHECK: %[[ADD:.*]] = addi %[[ACC]], %[[CAST]] : i32
 // CHECK: scf.yield %[[ADD]]
 // CHECK: }
-// CHECK: %[[HAS_MORE:.*]] = cmpi slt, %[[NEW_UB]], %[[UB]]
-// CHECK: %[[RESULT:.*]] = scf.if %[[HAS_MORE]] -> (i32) {
-// CHECK: %[[REM:.*]] = affine.apply #[[MAP1]]()[%[[UB]], %[[LB]], %[[STEP]]]
+// CHECK: %[[RESULT:.*]] = scf.for %[[IV2:.*]] = %[[NEW_UB]] to %[[UB]]
+// CHECK-SAME: step %[[STEP]] iter_args(%[[ACC2:.*]] = %[[LOOP]]) -> (i32) {
+// CHECK: %[[REM:.*]] = affine.apply #[[MAP1]](%[[IV2]])[%[[UB]]]
 // CHECK: %[[CAST2:.*]] = index_cast %[[REM]]
-// CHECK: %[[ADD2:.*]] = addi %[[LOOP]], %[[CAST2]]
+// CHECK: %[[ADD2:.*]] = addi %[[ACC2]], %[[CAST2]]
 // CHECK: scf.yield %[[ADD2]]
-// CHECK: } else {
-// CHECK: scf.yield %[[LOOP]]
 // CHECK: }
 // CHECK: return %[[RESULT]]
 #map = affine_map<(d0, d1)[s0] -> (s0, d0 - d1)>
@@ -70,7 +68,7 @@
 // -----
 
 // CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> ((s0 floordiv 4) * 4)>
-// CHECK-DAG: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 mod 4)>
+// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0)[s0] -> (-d0 + s0)>
 // CHECK: func @dynamic_upper_bound(
 // CHECK-SAME: %[[UB:.*]]: index
 // CHECK-DAG: %[[C0_I32:.*]] = constant 0 : i32
@@ -83,14 +81,12 @@
 // CHECK: %[[ADD:.*]] = addi %[[ACC]], %[[C4_I32]] : i32
 // CHECK: scf.yield %[[ADD]]
 // CHECK: }
-// CHECK: %[[HAS_MORE:.*]] = cmpi slt, %[[NEW_UB]], %[[UB]]
-// CHECK: %[[RESULT:.*]] = scf.if %[[HAS_MORE]] -> (i32) {
-// CHECK: %[[REM:.*]] = affine.apply #[[MAP1]]()[%[[UB]]]
+// CHECK: %[[RESULT:.*]] = scf.for %[[IV2:.*]] = %[[NEW_UB]] to %[[UB]]
+// CHECK-SAME: step %[[C4]] iter_args(%[[ACC2:.*]] = %[[LOOP]]) -> (i32) {
+// CHECK: %[[REM:.*]] = affine.apply #[[MAP1]](%[[IV2]])[%[[UB]]]
 // CHECK: %[[CAST2:.*]] = index_cast %[[REM]]
-// CHECK: %[[ADD2:.*]] = addi %[[LOOP]], %[[CAST2]]
+// CHECK: %[[ADD2:.*]] = addi %[[ACC2]], %[[CAST2]]
 // CHECK: scf.yield %[[ADD2]]
-// CHECK: } else {
-// CHECK: scf.yield %[[LOOP]]
 // CHECK: }
 // CHECK: return %[[RESULT]]
 #map = affine_map<(d0, d1)[s0] -> (s0, d0 - d1)>
@@ -111,7 +107,7 @@
 // -----
 
 // CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> ((s0 floordiv 4) * 4)>
-// CHECK-DAG: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 mod 4)>
+// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0)[s0] -> (-d0 + s0)>
 // CHECK: func @no_loop_results(
 // CHECK-SAME: %[[UB:.*]]: index, %[[MEMREF:.*]]: memref<i32>
 // CHECK-DAG: %[[C4_I32:.*]] = constant 4 : i32
@@ -123,9 +119,8 @@
 // CHECK: %[[ADD:.*]] = addi %[[LOAD]], %[[C4_I32]] : i32
 // CHECK: memref.store %[[ADD]], %[[MEMREF]]
 // CHECK: }
-// CHECK: %[[HAS_MORE:.*]] = cmpi slt, %[[NEW_UB]], %[[UB]]
-// CHECK: scf.if %[[HAS_MORE]] {
-// CHECK: %[[REM:.*]] = affine.apply #[[MAP1]]()[%[[UB]]]
+// CHECK: scf.for %[[IV2:.*]] = %[[NEW_UB]] to %[[UB]] step %[[C4]] {
+// CHECK: %[[REM:.*]] = affine.apply #[[MAP1]](%[[IV2]])[%[[UB]]]
 // CHECK: %[[LOAD2:.*]] = memref.load %[[MEMREF]][]
 // CHECK: %[[CAST2:.*]] = index_cast %[[REM]]
 // CHECK: %[[ADD2:.*]] = addi %[[LOAD2]], %[[CAST2]]
@@ -157,11 +152,10 @@
 // CHECK-DAG: #[[MAP2:.*]] = affine_map<(d0)[s0, s1] -> (s0, -d0 + s1 - 1)>
 // CHECK-DAG: #[[MAP3:.*]] = affine_map<(d0)[s0, s1, s2] -> (s0, -d0 + s1, s2)>
 // CHECK-DAG: #[[MAP4:.*]] = affine_map<()[s0] -> (-s0)>
-// CHECK-DAG: #[[MAP5:.*]] = affine_map<()[s0, s1, s2] -> (-(s0 - (s0 - s1) mod s2) + s0)>
-// CHECK-DAG: #[[MAP6:.*]] = affine_map<()[s0, s1, s2] -> (-(s0 - (s0 - s1) mod s2) + s0 + 1)>
-// CHECK-DAG: #[[MAP7:.*]] = affine_map<()[s0, s1, s2] -> (-(s0 - (s0 - s1) mod s2) + s0 - 1)>
-// CHECK-DAG: #[[MAP8:.*]] = affine_map<()[s0, s1, s2, s3] -> (s0, s2 - (s2 - (s2 - s1) mod s0), s3)>
-// CHECK-DAG: #[[MAP9:.*]] = affine_map<()[s0, s1, s2] -> (s0 - (s0 - s1) mod s2 - s0)>
+// CHECK-DAG: #[[MAP5:.*]] = affine_map<(d0)[s0] -> (-d0 + s0)>
+// CHECK-DAG: #[[MAP6:.*]] = affine_map<(d0)[s0] -> (-d0 + s0 + 1)>
+// CHECK-DAG: #[[MAP7:.*]] = affine_map<(d0)[s0] -> (-d0 + s0 - 1)>
+// CHECK-DAG: #[[MAP8:.*]] = affine_map<(d0)[s0] -> (d0 - s0)>
 // CHECK: func @test_affine_op_rewrite(
 // CHECK-SAME: %[[LB:.*]]: index, %[[UB:.*]]: index, %[[STEP:.*]]: index,
 // CHECK-SAME: %[[MEMREF:.*]]: memref<?xindex>, %[[SOME_VAL:.*]]: index
@@ -179,18 +173,18 @@
 // CHECK: %[[RES5:.*]] = affine.apply #[[MAP4]]()[%[[STEP]]]
 // CHECK: memref.store %[[RES5]]
 // CHECK: }
-// CHECK: scf.if {{.*}} {
-// CHECK: %[[RES_IF_0:.*]] = affine.apply #[[MAP5]]()[%[[UB]], %[[LB]], %[[STEP]]]
+// CHECK: scf.for %[[IV2:.*]] = {{.*}} to %[[UB]] step %[[STEP]] {
+// CHECK: %[[RES_IF_0:.*]] = affine.apply #[[MAP5]](%[[IV2]])[%[[UB]]]
 // CHECK: memref.store %[[RES_IF_0]]
-// CHECK: %[[RES_IF_1:.*]] = affine.apply #[[MAP6]]()[%[[UB]], %[[LB]], %[[STEP]]]
+// CHECK: %[[RES_IF_1:.*]] = affine.apply #[[MAP6]](%[[IV2]])[%[[UB]]]
 // CHECK: memref.store %[[RES_IF_1]]
-// CHECK: %[[RES_IF_2:.*]] = affine.apply #[[MAP6]]()[%[[UB]], %[[LB]], %[[STEP]]]
+// CHECK: %[[RES_IF_2:.*]] = affine.apply #[[MAP6]](%[[IV2]])[%[[UB]]]
 // CHECK: memref.store %[[RES_IF_2]]
-// CHECK: %[[RES_IF_3:.*]] = affine.apply #[[MAP7]]()[%[[UB]], %[[LB]], %[[STEP]]]
+// CHECK: %[[RES_IF_3:.*]] = affine.apply #[[MAP7]](%[[IV2]])[%[[UB]]]
 // CHECK: memref.store %[[RES_IF_3]]
-// CHECK: %[[RES_IF_4:.*]] = affine.min #[[MAP8]]()[%[[STEP]], %[[LB]], %[[UB]], %[[SOME_VAL]]]
+// CHECK: %[[RES_IF_4:.*]] = affine.min #[[MAP3]](%[[IV2]])[%[[STEP]], %[[UB]], %[[SOME_VAL]]]
 // CHECK: memref.store %[[RES_IF_4]]
-// CHECK: %[[RES_IF_5:.*]] = affine.apply #[[MAP9]]()[%[[UB]], %[[LB]], %[[STEP]]]
+// CHECK: %[[RES_IF_5:.*]] = affine.apply #[[MAP8]](%[[IV2]])[%[[UB]]]
 // CHECK: memref.store %[[RES_IF_5]]
 #map0 = affine_map<(d0, d1)[s0] -> (s0, d0 - d1)>
 #map1 = affine_map<(d0, d1)[s0] -> (d0 - d1 + 1, s0)>
@@ -243,26 +237,26 @@
 // CHECK: scf.for {{.*}} {
 // CHECK: scf.for {{.*}} {
 // CHECK: }
-// CHECK: scf.if {{.*}} {
+// CHECK: scf.for {{.*}} {
 // CHECK: }
 // CHECK: }
-// CHECK: scf.if {{.*}} {
+// CHECK: scf.for {{.*}} {
 // CHECK: scf.for {{.*}} {
 // CHECK: }
-// CHECK-NOT: scf.if
+// CHECK-NOT: scf.for
 // CHECK: }
 
 // CHECK-NO-SKIP: func @nested_loops
 // CHECK-NO-SKIP: scf.for {{.*}} {
 // CHECK-NO-SKIP: scf.for {{.*}} {
 // CHECK-NO-SKIP: }
-// CHECK-NO-SKIP: scf.if {{.*}} {
+// CHECK-NO-SKIP: scf.for {{.*}} {
 // CHECK-NO-SKIP: }
 // CHECK-NO-SKIP: }
-// CHECK-NO-SKIP: scf.if {{.*}} {
+// CHECK-NO-SKIP: scf.for {{.*}} {
 // CHECK-NO-SKIP: scf.for {{.*}} {
 // CHECK-NO-SKIP: }
-// CHECK-NO-SKIP: scf.if {{.*}} {
+// CHECK-NO-SKIP: scf.for {{.*}} {
 // CHECK-NO-SKIP: }
 // CHECK-NO-SKIP: }
 #map = affine_map<(d0, d1)[s0] -> (s0, d0 - d1)>
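Why the maps above simplify: these tests compute `affine.min #map(%ub, %iv)[%step]` with `#map = affine_map<(d0, d1)[s0] -> (s0, d0 - d1)>`, i.e. `min(%step, %ub - %iv)`. A sketch of the reasoning `rewritePeeledMinMaxOp` relies on:

```mlir
// Main loop (insideLoop=true): every iteration is "full", i.e.
// %iv + %step <= %split <= %ub, so %ub - %iv >= %step and
//   %m = affine.min #map(%ub, %iv)[%step]    // min(%step, %ub - %iv)
// folds to %step.
//
// Partial-iteration loop (insideLoop=false): %ub - %iv2 < %step, so the
// same op rewrites to the remainder
//   %m = affine.apply affine_map<(d0)[s0] -> (-d0 + s0)>(%iv2)[%ub]
// which is exactly the #[[MAP1]] / #[[MAP5]] checked above.
```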
diff --git a/mlir/test/Dialect/SparseTensor/sparse_vector_peeled.mlir b/mlir/test/Dialect/SparseTensor/sparse_vector_peeled.mlir
--- a/mlir/test/Dialect/SparseTensor/sparse_vector_peeled.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_vector_peeled.mlir
@@ -18,7 +18,7 @@
 }
 
 // CHECK-DAG: #[[$map0:.*]] = affine_map<()[s0, s1] -> (s0 + ((-s0 + s1) floordiv 16) * 16)>
-// CHECK-DAG: #[[$map1:.*]] = affine_map<()[s0, s1] -> ((s0 - s1) mod 16)>
+// CHECK-DAG: #[[$map1:.*]] = affine_map<(d0)[s0] -> (-d0 + s0)>
 // CHECK-LABEL: func @mul_s
 // CHECK-DAG: %[[c0:.*]] = constant 0 : index
 // CHECK-DAG: %[[c1:.*]] = constant 1 : index
@@ -39,13 +39,12 @@
 // CHECK: %[[m:.*]] = mulf %[[la]], %[[lb]] : vector<16xf32>
 // CHECK: vector.scatter %{{.*}}[%[[c0]]] [%[[zi]]], %[[mask]], %[[m]] : memref<1024xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32>
 // CHECK: }
-// CHECK: %[[has_more:.*]] = cmpi slt, %[[boundary]], %[[s]] : index
-// CHECK: scf.if %[[has_more]] {
-// CHECK: %[[sub:.*]] = affine.apply #[[$map1]]()[%[[s]], %[[q]]]
+// CHECK: scf.for %[[i2:.*]] = %[[boundary]] to %[[s]] step %[[c16]] {
+// CHECK: %[[sub:.*]] = affine.apply #[[$map1]](%[[i2]])[%[[s]]]
 // CHECK: %[[mask2:.*]] = vector.create_mask %[[sub]] : vector<16xi1>
-// CHECK: %[[li2:.*]] = vector.maskedload %{{.*}}[%[[boundary]]], %[[mask2]], %{{.*}} : memref<?xi32>, vector<16xi1>, vector<16xi32> into vector<16xi32>
+// CHECK: %[[li2:.*]] = vector.maskedload %{{.*}}[%[[i2]]], %[[mask2]], %{{.*}} : memref<?xi32>, vector<16xi1>, vector<16xi32> into vector<16xi32>
 // CHECK: %[[zi2:.*]] = zexti %[[li2]] : vector<16xi32> to vector<16xi64>
-// CHECK: %[[la2:.*]] = vector.maskedload %{{.*}}[%[[boundary]]], %[[mask2]], %{{.*}} : memref<?xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
+// CHECK: %[[la2:.*]] = vector.maskedload %{{.*}}[%[[i2]]], %[[mask2]], %{{.*}} : memref<?xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
 // CHECK: %[[lb2:.*]] = vector.gather %{{.*}}[%[[c0]]] [%[[zi2]]], %[[mask2]], %{{.*}} : memref<1024xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32> into vector<16xf32>
 // CHECK: %[[m2:.*]] = mulf %[[la2]], %[[lb2]] : vector<16xf32>
 // CHECK: vector.scatter %{{.*}}[%[[c0]]] [%[[zi2]]], %[[mask2]], %[[m2]] : memref<1024xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32>
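One payoff visible in this last test: because the tail is itself an scf.for, its induction variable `%i2` directly supplies the memory offset that previously had to be the loop-invariant `%boundary`, and the `cmpi`/`scf.if` guard disappears entirely; a zero-trip partial loop simply does not execute. The resulting masked-tail shape, sketched with the test's names:

```mlir
// Masked tail after peeling a 16-wide vectorized loop (sketch).
scf.for %i2 = %boundary to %s step %c16 {
  // Remaining element count %s - %i2; this is #[[$map1]] above.
  %sub = affine.apply affine_map<(d0)[s0] -> (-d0 + s0)>(%i2)[%s]
  %mask = vector.create_mask %sub : vector<16xi1>
  // Masked loads/stores at offset %i2 touch only the %sub valid lanes.
}
```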