diff --git a/mlir/include/mlir/Dialect/SCF/Transforms.h b/mlir/include/mlir/Dialect/SCF/Transforms.h
--- a/mlir/include/mlir/Dialect/SCF/Transforms.h
+++ b/mlir/include/mlir/Dialect/SCF/Transforms.h
@@ -68,8 +68,8 @@
 /// Rewrite a for loop with bounds/step that potentially do not divide evenly
 /// into a for loop where the step divides the iteration space evenly, followed
-/// by an scf.if for the last (partial) iteration (if any; returned via `ifOp`).
-/// This transformation is called "loop peeling".
+/// by another scf.for for the last (partial) iteration (if any; returned via
+/// `partialIteration`). This transformation is called "loop peeling".
 ///
 /// This transformation is beneficial for a wide range of transformations such
 /// as vectorization or loop tiling: It enables additional canonicalizations
@@ -88,16 +88,16 @@
 ///   scf.for %iv = %c0 to %newUb step %c4 {
 ///     (loop body)
 ///   }
-///   scf.if %newUb < %ub {
+///   scf.for %iv2 = %newUb to %ub {
 ///     (loop body)
 ///   }
 /// ```
 ///
 /// After loop peeling, this function tries to simplify/canonicalize affine.min
-/// and affine.max ops in the body of the loop and the scf.if, taking advantage
-/// of the fact that the peeled loop has only "full" iterations and the scf.if
-/// is always a partial iteration (if any). This canonicalization is expected to
-/// enable further canonicalization opportunities through other patterns.
+/// and affine.max ops in the body of the peeled loop and in the body of the
+/// partial iteration loop, taking advantage of the fact that the peeled loop
+/// has only "full" iterations. This canonicalization is expected to enable
+/// further canonicalization opportunities through other patterns.
 ///
 /// The return value indicates whether the loop was rewritten or not. Loops are
 /// not rewritten if:
@@ -106,10 +106,10 @@
 ///   iteration space evenly.
 ///
 /// Note: This function rewrites the given scf.for loop in-place and creates a
-/// new scf.if operation for the last iteration. It replaces all uses of the
-/// unpeeled loop with the results of the newly generated scf.if.
+/// new scf.for operation for the last iteration. It replaces all uses of the
+/// unpeeled loop with the results of the newly generated scf.for.
 LogicalResult peelAndCanonicalizeForLoop(RewriterBase &rewriter, ForOp forOp,
-                                         scf::IfOp &ifOp);
+                                         scf::ForOp &partialIteration);
 
 /// Try to simplify a min/max operation `op` after loop peeling. This function
 /// can simplify min/max operations such as (ub is the previous upper bound of
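For context, a minimal sketch (not part of the patch) of how a client rewrite pattern might drive the updated entry point. `PeelClientPattern` is a hypothetical name; the actual pass in LoopSpecialization.cpp below additionally attaches label attributes so the same loop is not peeled twice:

```c++
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/SCF/Transforms.h"
#include "mlir/IR/PatternMatch.h"

using namespace mlir;

/// Hypothetical client pattern illustrating the new out-parameter type.
struct PeelClientPattern : public OpRewritePattern<scf::ForOp> {
  using OpRewritePattern<scf::ForOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(scf::ForOp forOp,
                                PatternRewriter &rewriter) const override {
    // Receives the scf.for that runs the (at most one) partial iteration.
    scf::ForOp partialIteration;
    if (failed(scf::peelAndCanonicalizeForLoop(rewriter, forOp,
                                               partialIteration)))
      return failure();
    // `forOp` now has an upper bound that its step divides evenly; all uses
    // of the original loop were replaced with `partialIteration`'s results.
    return success();
  }
};
```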
diff --git a/mlir/lib/Dialect/SCF/Transforms/LoopSpecialization.cpp b/mlir/lib/Dialect/SCF/Transforms/LoopSpecialization.cpp
--- a/mlir/lib/Dialect/SCF/Transforms/LoopSpecialization.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/LoopSpecialization.cpp
@@ -106,8 +106,9 @@
-/// The newly generated scf.if operation is returned via `ifOp`. The boundary
-/// at which the loop is split (new upper bound) is returned via `splitBound`.
-/// The return value indicates whether the loop was rewritten or not.
-static LogicalResult peelForLoop(RewriterBase &b, ForOp forOp, scf::IfOp &ifOp,
-                                 Value &splitBound) {
+/// The newly generated scf.for operation for the partial iteration is
+/// returned via `partialIteration`. The boundary at which the loop is split
+/// (new upper bound) is returned via `splitBound`. The return value indicates
+/// whether the loop was rewritten or not.
+static LogicalResult peelForLoop(RewriterBase &b, ForOp forOp,
+                                 ForOp &partialIteration, Value &splitBound) {
   RewriterBase::InsertionGuard guard(b);
   auto lbInt = getConstantIntValue(forOp.lowerBound());
   auto ubInt = getConstantIntValue(forOp.upperBound());
@@ -130,34 +130,16 @@
       loc, modMap, ValueRange{forOp.lowerBound(), forOp.upperBound(),
                               forOp.step()});
+  // Create ForOp for partial iteration.
+  b.setInsertionPointAfter(forOp);
+  partialIteration = cast<ForOp>(b.clone(*forOp.getOperation()));
+  partialIteration.lowerBoundMutable().assign(splitBound);
+  forOp.replaceAllUsesWith(partialIteration->getResults());
+  partialIteration.initArgsMutable().assign(forOp->getResults());
+
   // Set new upper loop bound.
-  Value previousUb = forOp.upperBound();
   b.updateRootInPlace(
       forOp, [&]() { forOp.upperBoundMutable().assign(splitBound); });
-  b.setInsertionPointAfter(forOp);
-
-  // Do we need one more iteration?
-  Value hasMoreIter =
-      b.create<CmpIOp>(loc, CmpIPredicate::slt, splitBound, previousUb);
-
-  // Create IfOp for last iteration.
-  auto resultTypes = forOp.getResultTypes();
-  ifOp = b.create<scf::IfOp>(loc, resultTypes, hasMoreIter,
-                             /*withElseRegion=*/!resultTypes.empty());
-  forOp.replaceAllUsesWith(ifOp->getResults());
-
-  // Build then case.
-  BlockAndValueMapping bvm;
-  bvm.map(forOp.region().getArgument(0), splitBound);
-  for (auto it : llvm::zip(forOp.getRegionIterArgs(), forOp->getResults())) {
-    bvm.map(std::get<0>(it), std::get<1>(it));
-  }
-  b.cloneRegionBefore(forOp.region(), ifOp.thenRegion(),
-                      ifOp.thenRegion().begin(), bvm);
-  // Build else case.
-  if (!resultTypes.empty())
-    ifOp.getElseBodyBuilder(b.getListener())
-        .create<scf::YieldOp>(loc, forOp->getResults());
 
   return success();
 }
@@ -370,37 +352,43 @@
 }
 
 template <typename OpTy, bool IsMin>
-static void
-rewriteAffineOpAfterPeeling(RewriterBase &rewriter, ForOp forOp, scf::IfOp ifOp,
-                            Value iv, Value splitBound, Value ub, Value step) {
+static void rewriteAffineOpAfterPeeling(RewriterBase &rewriter, ForOp forOp,
+                                        ForOp partialIteration,
+                                        Value previousUb) {
+  Value mainIv = forOp.getInductionVar();
+  Value partialIv = partialIteration.getInductionVar();
+  assert(forOp.step() == partialIteration.step() &&
+         "expected same step in main and partial loop");
+  Value step = forOp.step();
+
   forOp.walk([&](OpTy affineOp) {
     AffineMap map = affineOp.getAffineMap();
     (void)scf::rewritePeeledMinMaxOp(rewriter, affineOp, map,
-                                     affineOp.operands(), IsMin, iv, ub, step,
+                                     affineOp.operands(), IsMin, mainIv,
+                                     previousUb, step,
                                      /*insideLoop=*/true);
   });
-  ifOp.walk([&](OpTy affineOp) {
+  partialIteration.walk([&](OpTy affineOp) {
     AffineMap map = affineOp.getAffineMap();
     (void)scf::rewritePeeledMinMaxOp(rewriter, affineOp, map,
-                                     affineOp.operands(), IsMin, splitBound, ub,
-                                     step, /*insideLoop=*/false);
+                                     affineOp.operands(), IsMin, partialIv,
+                                     previousUb, step, /*insideLoop=*/false);
   });
 }
 
 LogicalResult mlir::scf::peelAndCanonicalizeForLoop(RewriterBase &rewriter,
                                                     ForOp forOp,
-                                                    scf::IfOp &ifOp) {
-  Value ub = forOp.upperBound();
+                                                    ForOp &partialIteration) {
+  Value previousUb = forOp.upperBound();
   Value splitBound;
-  if (failed(peelForLoop(rewriter, forOp, ifOp, splitBound)))
+  if (failed(peelForLoop(rewriter, forOp, partialIteration, splitBound)))
    return failure();
 
   // Rewrite affine.min and affine.max ops.
-  Value iv = forOp.getInductionVar(), step = forOp.step();
   rewriteAffineOpAfterPeeling<AffineMinOp, /*IsMin=*/true>(
-      rewriter, forOp, ifOp, iv, splitBound, ub, step);
+      rewriter, forOp, partialIteration, previousUb);
   rewriteAffineOpAfterPeeling<AffineMaxOp, /*IsMin=*/false>(
-      rewriter, forOp, ifOp, iv, splitBound, ub, step);
+      rewriter, forOp, partialIteration, previousUb);
 
   return success();
 }
@@ -491,23 +479,24 @@
     if (forOp->hasAttr(kPeeledLoopLabel))
       return failure();
     if (skipPartial) {
-      // No peeling of loops inside the partial iteration (scf.if) of another
-      // peeled loop.
+      // No peeling of loops inside the partial iteration of another peeled
+      // loop.
       Operation *op = forOp.getOperation();
-      while ((op = op->getParentOfType<scf::IfOp>())) {
+      while ((op = op->getParentOfType<scf::ForOp>())) {
        if (op->hasAttr(kPartialIterationLabel))
          return failure();
      }
    }
    // Apply loop peeling.
-    scf::IfOp ifOp;
-    if (failed(peelAndCanonicalizeForLoop(rewriter, forOp, ifOp)))
+    scf::ForOp partialIteration;
+    if (failed(peelAndCanonicalizeForLoop(rewriter, forOp, partialIteration)))
      return failure();
    // Apply label, so that the same loop is not rewritten a second time.
+    partialIteration->setAttr(kPeeledLoopLabel, rewriter.getUnitAttr());
    rewriter.updateRootInPlace(forOp, [&]() {
      forOp->setAttr(kPeeledLoopLabel, rewriter.getUnitAttr());
    });
-    ifOp->setAttr(kPartialIterationLabel, rewriter.getUnitAttr());
+    partialIteration->setAttr(kPartialIterationLabel, rewriter.getUnitAttr());
    return success();
  }
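The effect on a loop-carried reduction, sketched in MLIR (value names are illustrative; the FileCheck tests below pin down the exact output). The split bound matches #[[MAP0]] in the tests, i.e. `%ub - ((%ub - %lb) mod %step)`; the partial loop starts where the main loop stops and seeds its iter_args with the main loop's results, which is what the `replaceAllUsesWith` + `initArgsMutable().assign` pair above implements:

```mlir
// Before peeling: (%ub - %lb) need not be a multiple of %step.
%r = scf.for %iv = %lb to %ub step %step
    iter_args(%acc = %init) -> (i32) {
  // loop body
  scf.yield %next : i32
}

// After peeling:
%split = affine.apply
    affine_map<()[s0, s1, s2] -> (s1 - (s1 - s0) mod s2)>()[%lb, %ub, %step]
%r0 = scf.for %iv = %lb to %split step %step
    iter_args(%acc = %init) -> (i32) {
  // loop body: only "full" iterations
  scf.yield %next : i32
}
%r = scf.for %iv2 = %split to %ub step %step
    iter_args(%acc2 = %r0) -> (i32) {
  // loop body: at most one, possibly partial, iteration
  scf.yield %next2 : i32
}
```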
diff --git a/mlir/test/Dialect/SCF/for-loop-peeling.mlir b/mlir/test/Dialect/SCF/for-loop-peeling.mlir
--- a/mlir/test/Dialect/SCF/for-loop-peeling.mlir
+++ b/mlir/test/Dialect/SCF/for-loop-peeling.mlir
@@ -2,7 +2,7 @@
 // RUN: mlir-opt %s -for-loop-peeling=skip-partial=false -canonicalize -split-input-file | FileCheck %s -check-prefix=CHECK-NO-SKIP
 
 // CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0, s1, s2] -> (s1 - (s1 - s0) mod s2)>
-// CHECK-DAG: #[[MAP1:.*]] = affine_map<()[s0, s1, s2] -> (-(s0 - (s0 - s1) mod s2) + s0)>
+// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0)[s0] -> (-d0 + s0)>
 // CHECK: func @fully_dynamic_bounds(
 // CHECK-SAME: %[[LB:.*]]: index, %[[UB:.*]]: index, %[[STEP:.*]]: index
 // CHECK: %[[C0_I32:.*]] = constant 0 : i32
@@ -13,14 +13,12 @@
 // CHECK: %[[ADD:.*]] = addi %[[ACC]], %[[CAST]] : i32
 // CHECK: scf.yield %[[ADD]]
 // CHECK: }
-// CHECK: %[[HAS_MORE:.*]] = cmpi slt, %[[NEW_UB]], %[[UB]]
-// CHECK: %[[RESULT:.*]] = scf.if %[[HAS_MORE]] -> (i32) {
-// CHECK: %[[REM:.*]] = affine.apply #[[MAP1]]()[%[[UB]], %[[LB]], %[[STEP]]]
+// CHECK: %[[RESULT:.*]] = scf.for %[[IV2:.*]] = %[[NEW_UB]] to %[[UB]]
+// CHECK-SAME: step %[[STEP]] iter_args(%[[ACC2:.*]] = %[[LOOP]]) -> (i32) {
+// CHECK: %[[REM:.*]] = affine.apply #[[MAP1]](%[[IV2]])[%[[UB]]]
 // CHECK: %[[CAST2:.*]] = index_cast %[[REM]]
-// CHECK: %[[ADD2:.*]] = addi %[[LOOP]], %[[CAST2]]
+// CHECK: %[[ADD2:.*]] = addi %[[ACC2]], %[[CAST2]]
 // CHECK: scf.yield %[[ADD2]]
-// CHECK: } else {
-// CHECK: scf.yield %[[LOOP]]
 // CHECK: }
 // CHECK: return %[[RESULT]]
 #map = affine_map<(d0, d1)[s0] -> (s0, d0 - d1)>
@@ -70,7 +68,7 @@
 // -----
 
 // CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> ((s0 floordiv 4) * 4)>
-// CHECK-DAG: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 mod 4)>
+// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0)[s0] -> (-d0 + s0)>
 // CHECK: func @dynamic_upper_bound(
 // CHECK-SAME: %[[UB:.*]]: index
 // CHECK-DAG: %[[C0_I32:.*]] = constant 0 : i32
@@ -83,14 +81,12 @@
 // CHECK: %[[ADD:.*]] = addi %[[ACC]], %[[C4_I32]] : i32
 // CHECK: scf.yield %[[ADD]]
 // CHECK: }
-// CHECK: %[[HAS_MORE:.*]] = cmpi slt, %[[NEW_UB]], %[[UB]]
-// CHECK: %[[RESULT:.*]] = scf.if %[[HAS_MORE]] -> (i32) {
-// CHECK: %[[REM:.*]] = affine.apply #[[MAP1]]()[%[[UB]]]
+// CHECK: %[[RESULT:.*]] = scf.for %[[IV2:.*]] = %[[NEW_UB]] to %[[UB]]
+// CHECK-SAME: step %[[C4]] iter_args(%[[ACC2:.*]] = %[[LOOP]]) -> (i32) {
+// CHECK: %[[REM:.*]] = affine.apply #[[MAP1]](%[[IV2]])[%[[UB]]]
 // CHECK: %[[CAST2:.*]] = index_cast %[[REM]]
-// CHECK: %[[ADD2:.*]] = addi %[[LOOP]], %[[CAST2]]
+// CHECK: %[[ADD2:.*]] = addi %[[ACC2]], %[[CAST2]]
 // CHECK: scf.yield %[[ADD2]]
-// CHECK: } else {
-// CHECK: scf.yield %[[LOOP]]
 // CHECK: }
 // CHECK: return %[[RESULT]]
 #map = affine_map<(d0, d1)[s0] -> (s0, d0 - d1)>
@@ -111,7 +107,7 @@
 // -----
 
 // CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> ((s0 floordiv 4) * 4)>
-// CHECK-DAG: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 mod 4)>
+// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0)[s0] -> (-d0 + s0)>
 // CHECK: func @no_loop_results(
 // CHECK-SAME: %[[UB:.*]]: index, %[[MEMREF:.*]]: memref<i32>
 // CHECK-DAG: %[[C4_I32:.*]] = constant 4 : i32
@@ -123,9 +119,8 @@
 // CHECK: %[[ADD:.*]] = addi %[[LOAD]], %[[C4_I32]] : i32
 // CHECK: memref.store %[[ADD]], %[[MEMREF]]
 // CHECK: }
-// CHECK: %[[HAS_MORE:.*]] = cmpi slt, %[[NEW_UB]], %[[UB]]
-// CHECK: scf.if %[[HAS_MORE]] {
-// CHECK: %[[REM:.*]] = affine.apply #[[MAP1]]()[%[[UB]]]
+// CHECK: scf.for %[[IV2:.*]] = %[[NEW_UB]] to %[[UB]] step %[[C4]] {
+// CHECK: %[[REM:.*]] = affine.apply #[[MAP1]](%[[IV2]])[%[[UB]]]
 // CHECK: %[[LOAD2:.*]] = memref.load %[[MEMREF]][]
 // CHECK: %[[CAST2:.*]] = index_cast %[[REM]]
 // CHECK: %[[ADD2:.*]] = addi %[[LOAD2]], %[[CAST2]]
@@ -157,11 +152,10 @@
 // CHECK-DAG: #[[MAP2:.*]] = affine_map<(d0)[s0, s1] -> (s0, -d0 + s1 - 1)>
 // CHECK-DAG: #[[MAP3:.*]] = affine_map<(d0)[s0, s1, s2] -> (s0, -d0 + s1, s2)>
 // CHECK-DAG: #[[MAP4:.*]] = affine_map<()[s0] -> (-s0)>
-// CHECK-DAG: #[[MAP5:.*]] = affine_map<()[s0, s1, s2] -> (-(s0 - (s0 - s1) mod s2) + s0)>
-// CHECK-DAG: #[[MAP6:.*]] = affine_map<()[s0, s1, s2] -> (-(s0 - (s0 - s1) mod s2) + s0 + 1)>
-// CHECK-DAG: #[[MAP7:.*]] = affine_map<()[s0, s1, s2] -> (-(s0 - (s0 - s1) mod s2) + s0 - 1)>
-// CHECK-DAG: #[[MAP8:.*]] = affine_map<()[s0, s1, s2, s3] -> (s0, s2 - (s2 - (s2 - s1) mod s0), s3)>
-// CHECK-DAG: #[[MAP9:.*]] = affine_map<()[s0, s1, s2] -> (s0 - (s0 - s1) mod s2 - s0)>
+// CHECK-DAG: #[[MAP5:.*]] = affine_map<(d0)[s0] -> (-d0 + s0)>
+// CHECK-DAG: #[[MAP6:.*]] = affine_map<(d0)[s0] -> (-d0 + s0 + 1)>
+// CHECK-DAG: #[[MAP7:.*]] = affine_map<(d0)[s0] -> (-d0 + s0 - 1)>
+// CHECK-DAG: #[[MAP8:.*]] = affine_map<(d0)[s0] -> (d0 - s0)>
 // CHECK: func @test_affine_op_rewrite(
 // CHECK-SAME: %[[LB:.*]]: index, %[[UB:.*]]: index, %[[STEP:.*]]: index,
 // CHECK-SAME: %[[MEMREF:.*]]: memref<?xindex>, %[[SOME_VAL:.*]]: index
@@ -179,18 +173,18 @@
 // CHECK: %[[RES5:.*]] = affine.apply #[[MAP4]]()[%[[STEP]]]
 // CHECK: memref.store %[[RES5]]
 // CHECK: }
-// CHECK: scf.if {{.*}} {
-// CHECK: %[[RES_IF_0:.*]] = affine.apply #[[MAP5]]()[%[[UB]], %[[LB]], %[[STEP]]]
+// CHECK: scf.for %[[IV2:.*]] = {{.*}} to %[[UB]] step %[[STEP]] {
+// CHECK: %[[RES_IF_0:.*]] = affine.apply #[[MAP5]](%[[IV2]])[%[[UB]]]
 // CHECK: memref.store %[[RES_IF_0]]
-// CHECK: %[[RES_IF_1:.*]] = affine.apply #[[MAP6]]()[%[[UB]], %[[LB]], %[[STEP]]]
+// CHECK: %[[RES_IF_1:.*]] = affine.apply #[[MAP6]](%[[IV2]])[%[[UB]]]
 // CHECK: memref.store %[[RES_IF_1]]
-// CHECK: %[[RES_IF_2:.*]] = affine.apply #[[MAP6]]()[%[[UB]], %[[LB]], %[[STEP]]]
+// CHECK: %[[RES_IF_2:.*]] = affine.apply #[[MAP6]](%[[IV2]])[%[[UB]]]
 // CHECK: memref.store %[[RES_IF_2]]
-// CHECK: %[[RES_IF_3:.*]] = affine.apply #[[MAP7]]()[%[[UB]], %[[LB]], %[[STEP]]]
+// CHECK: %[[RES_IF_3:.*]] = affine.apply #[[MAP7]](%[[IV2]])[%[[UB]]]
 // CHECK: memref.store %[[RES_IF_3]]
-// CHECK: %[[RES_IF_4:.*]] = affine.min #[[MAP8]]()[%[[STEP]], %[[LB]], %[[UB]], %[[SOME_VAL]]]
+// CHECK: %[[RES_IF_4:.*]] = affine.min #[[MAP3]](%[[IV2]])[%[[STEP]], %[[UB]], %[[SOME_VAL]]]
 // CHECK: memref.store %[[RES_IF_4]]
-// CHECK: %[[RES_IF_5:.*]] = affine.apply #[[MAP9]]()[%[[UB]], %[[LB]], %[[STEP]]]
+// CHECK: %[[RES_IF_5:.*]] = affine.apply #[[MAP8]](%[[IV2]])[%[[UB]]]
 // CHECK: memref.store %[[RES_IF_5]]
 #map0 = affine_map<(d0, d1)[s0] -> (s0, d0 - d1)>
 #map1 = affine_map<(d0, d1)[s0] -> (d0 - d1 + 1, s0)>
@@ -243,26 +237,26 @@
 // CHECK: scf.for {{.*}} {
 // CHECK: scf.for {{.*}} {
 // CHECK: }
-// CHECK: scf.if {{.*}} {
+// CHECK: scf.for {{.*}} {
 // CHECK: }
 // CHECK: }
-// CHECK: scf.if {{.*}} {
+// CHECK: scf.for {{.*}} {
 // CHECK: scf.for {{.*}} {
 // CHECK: }
-// CHECK-NOT: scf.if
+// CHECK-NOT: scf.for
 // CHECK: }
 
 // CHECK-NO-SKIP: func @nested_loops
 // CHECK-NO-SKIP: scf.for {{.*}} {
 // CHECK-NO-SKIP: scf.for {{.*}} {
 // CHECK-NO-SKIP: }
-// CHECK-NO-SKIP: scf.if {{.*}} {
+// CHECK-NO-SKIP: scf.for {{.*}} {
 // CHECK-NO-SKIP: }
 // CHECK-NO-SKIP: }
-// CHECK-NO-SKIP: scf.if {{.*}} {
+// CHECK-NO-SKIP: scf.for {{.*}} {
 // CHECK-NO-SKIP: scf.for {{.*}} {
 // CHECK-NO-SKIP: }
-// CHECK-NO-SKIP: scf.if {{.*}} {
+// CHECK-NO-SKIP: scf.for {{.*}} {
 // CHECK-NO-SKIP: }
 // CHECK-NO-SKIP: }
 #map = affine_map<(d0, d1)[s0] -> (s0, d0 - d1)>
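Why the maps above simplify: these tests compute `affine.min #map(%ub, %iv)[%step]` with `#map = affine_map<(d0, d1)[s0] -> (s0, d0 - d1)>`, i.e. `min(%step, %ub - %iv)`. A sketch of the reasoning `rewritePeeledMinMaxOp` relies on:

```mlir
// Main loop (insideLoop=true): every iteration is "full", i.e.
// %iv + %step <= %split <= %ub, so %ub - %iv >= %step and
//   %m = affine.min #map(%ub, %iv)[%step]    // min(%step, %ub - %iv)
// folds to %step.
//
// Partial-iteration loop (insideLoop=false): %ub - %iv2 < %step, so the
// same op rewrites to the remainder
//   %m = affine.apply affine_map<(d0)[s0] -> (-d0 + s0)>(%iv2)[%ub]
// which is exactly the #[[MAP1]] / #[[MAP5]] checked above.
```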
diff --git a/mlir/test/Dialect/SparseTensor/sparse_vector_peeled.mlir b/mlir/test/Dialect/SparseTensor/sparse_vector_peeled.mlir
--- a/mlir/test/Dialect/SparseTensor/sparse_vector_peeled.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_vector_peeled.mlir
@@ -18,7 +18,7 @@
 }
 
 // CHECK-DAG: #[[$map0:.*]] = affine_map<()[s0, s1] -> (s0 + ((-s0 + s1) floordiv 16) * 16)>
-// CHECK-DAG: #[[$map1:.*]] = affine_map<()[s0, s1] -> ((s0 - s1) mod 16)>
+// CHECK-DAG: #[[$map1:.*]] = affine_map<(d0)[s0] -> (-d0 + s0)>
 // CHECK-LABEL: func @mul_s
 // CHECK-DAG: %[[c0:.*]] = constant 0 : index
 // CHECK-DAG: %[[c1:.*]] = constant 1 : index
@@ -39,13 +39,12 @@
 // CHECK: %[[m:.*]] = mulf %[[la]], %[[lb]] : vector<16xf32>
 // CHECK: vector.scatter %{{.*}}[%[[c0]]] [%[[zi]]], %[[mask]], %[[m]] : memref<1024xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32>
 // CHECK: }
-// CHECK: %[[has_more:.*]] = cmpi slt, %[[boundary]], %[[s]] : index
-// CHECK: scf.if %[[has_more]] {
-// CHECK: %[[sub:.*]] = affine.apply #[[$map1]]()[%[[s]], %[[q]]]
+// CHECK: scf.for %[[i2:.*]] = %[[boundary]] to %[[s]] step %[[c16]] {
+// CHECK: %[[sub:.*]] = affine.apply #[[$map1]](%[[i2]])[%[[s]]]
 // CHECK: %[[mask2:.*]] = vector.create_mask %[[sub]] : vector<16xi1>
-// CHECK: %[[li2:.*]] = vector.maskedload %{{.*}}[%[[boundary]]], %[[mask2]], %{{.*}} : memref<?xi32>, vector<16xi1>, vector<16xi32> into vector<16xi32>
+// CHECK: %[[li2:.*]] = vector.maskedload %{{.*}}[%[[i2]]], %[[mask2]], %{{.*}} : memref<?xi32>, vector<16xi1>, vector<16xi32> into vector<16xi32>
 // CHECK: %[[zi2:.*]] = zexti %[[li2]] : vector<16xi32> to vector<16xi64>
-// CHECK: %[[la2:.*]] = vector.maskedload %{{.*}}[%[[boundary]]], %[[mask2]], %{{.*}} : memref<?xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
+// CHECK: %[[la2:.*]] = vector.maskedload %{{.*}}[%[[i2]]], %[[mask2]], %{{.*}} : memref<?xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
 // CHECK: %[[lb2:.*]] = vector.gather %{{.*}}[%[[c0]]] [%[[zi2]]], %[[mask2]], %{{.*}} : memref<1024xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32> into vector<16xf32>
 // CHECK: %[[m2:.*]] = mulf %[[la2]], %[[lb2]] : vector<16xf32>
 // CHECK: vector.scatter %{{.*}}[%[[c0]]] [%[[zi2]]], %[[mask2]], %[[m2]] : memref<1024xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32>
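One payoff visible in this last test: because the tail is itself an scf.for, its induction variable `%i2` directly supplies the memory offset that previously had to be the loop-invariant `%boundary`, and the `cmpi`/`scf.if` guard disappears entirely; a zero-trip partial loop simply does not execute. The resulting masked-tail shape, sketched with the test's names:

```mlir
// Masked tail after peeling a 16-wide vectorized loop (sketch).
scf.for %i2 = %boundary to %s step %c16 {
  // Remaining element count %s - %i2; this is #[[$map1]] above.
  %sub = affine.apply affine_map<(d0)[s0] -> (-d0 + s0)>(%i2)[%s]
  %mask = vector.create_mask %sub : vector<16xi1>
  // Masked loads/stores at offset %i2 touch only the %sub valid lanes.
}
```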