diff --git a/mlir/include/mlir/InitAllPasses.h b/mlir/include/mlir/InitAllPasses.h --- a/mlir/include/mlir/InitAllPasses.h +++ b/mlir/include/mlir/InitAllPasses.h @@ -108,6 +108,7 @@ createConvertLinalgToLLVMPass(); // LoopOps + createParallelLoopCollapsingPass(); createParallelLoopFusionPass(); createParallelLoopSpecializationPass(); createParallelLoopTilingPass(); diff --git a/mlir/include/mlir/Transforms/LoopUtils.h b/mlir/include/mlir/Transforms/LoopUtils.h --- a/mlir/include/mlir/Transforms/LoopUtils.h +++ b/mlir/include/mlir/Transforms/LoopUtils.h @@ -28,6 +28,7 @@ namespace loop { class ForOp; +class ParallelOp; } // end namespace loop /// Unrolls this for operation completely if the trip count is known to be @@ -225,6 +226,12 @@ /// independent of any loop induction variable involved in the nest. void coalesceLoops(MutableArrayRef loops); +/// Take the ParallelLoop and for each set of dimension indices, combine them +/// into a single dimension. combinedDimensions must contain each index into +/// loops exactly once. +void collapsePLoops(loop::ParallelOp loops, + std::vector> combinedDimensions); + /// Maps `forOp` for execution on a parallel grid of virtual `processorIds` of /// size given by `numProcessors`. This is achieved by embedding the SSA values /// corresponding to `processorIds` and `numProcessors` into the bounds and step diff --git a/mlir/include/mlir/Transforms/Passes.h b/mlir/include/mlir/Transforms/Passes.h --- a/mlir/include/mlir/Transforms/Passes.h +++ b/mlir/include/mlir/Transforms/Passes.h @@ -91,6 +91,10 @@ /// bounds into a single loop. std::unique_ptr> createLoopCoalescingPass(); +/// Creates a pass that transforms a single ParallelLoop over N induction +/// variables into another ParallelLoop over less than N induction variables. +std::unique_ptr> createParallelLoopCollapsingPass(); + /// Performs packing (or explicit copying) of accessed memref regions into /// buffers in the specified faster memory space through either pointwise copies /// or DMA operations. diff --git a/mlir/lib/Transforms/CMakeLists.txt b/mlir/lib/Transforms/CMakeLists.txt --- a/mlir/lib/Transforms/CMakeLists.txt +++ b/mlir/lib/Transforms/CMakeLists.txt @@ -16,6 +16,7 @@ LoopUnroll.cpp MemRefDataFlowOpt.cpp OpStats.cpp + ParallelLoopCollapsing.cpp PipelineDataTransfer.cpp SimplifyAffineStructures.cpp StripDebugInfo.cpp diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp --- a/mlir/lib/Transforms/Utils/LoopUtils.cpp +++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp @@ -973,69 +973,82 @@ } } -// Transform a loop with a strictly positive step -// for %i = %lb to %ub step %s -// into a 0-based loop with step 1 -// for %ii = 0 to ceildiv(%ub - %lb, %s) step 1 { -// %i = %ii * %s + %lb -// Insert the induction variable remapping in the body of `inner`, which is -// expected to be either `loop` or another loop perfectly nested under `loop`. -// Insert the definition of new bounds immediate before `outer`, which is -// expected to be either `loop` or its parent in the loop nest. -static void normalizeLoop(loop::ForOp loop, loop::ForOp outer, - loop::ForOp inner) { - OpBuilder builder(outer); - Location loc = loop.getLoc(); - +/// Return the new lower bound, upper bound, and step in that order. Insert any +/// additional bounds calculations before the given builder and any additional +/// conversion back to the original loop induction value inside the given Block. +static std::tuple +normalizeLoop(OpBuilder &boundsBuilder, OpBuilder &insideLoopBuilder, + Location loc, Value lowerBound, Value upperBound, Value step, + Value inductionVar) { + Value newLowerBound, newUpperBound, newStep; // Check if the loop is already known to have a constant zero lower bound or // a constant one step. bool isZeroBased = false; if (auto ubCst = - dyn_cast_or_null(loop.lowerBound().getDefiningOp())) + dyn_cast_or_null(lowerBound.getDefiningOp())) isZeroBased = ubCst.getValue() == 0; bool isStepOne = false; - if (auto stepCst = - dyn_cast_or_null(loop.step().getDefiningOp())) + if (auto stepCst = dyn_cast_or_null(step.getDefiningOp())) isStepOne = stepCst.getValue() == 1; - if (isZeroBased && isStepOne) - return; // Compute the number of iterations the loop executes: ceildiv(ub - lb, step) // assuming the step is strictly positive. Update the bounds and the step // of the loop to go from 0 to the number of iterations, if necessary. // TODO(zinenko): introduce support for negative steps or emit dynamic asserts // on step positivity, whatever gets implemented first. - Value diff = - builder.create(loc, loop.upperBound(), loop.lowerBound()); - Value numIterations = ceilDivPositive(builder, loc, diff, loop.step()); - loop.setUpperBound(numIterations); - - Value lb = loop.lowerBound(); - if (!isZeroBased) { - Value cst0 = builder.create(loc, 0); - loop.setLowerBound(cst0); - } + if (isZeroBased && isStepOne) + return {lowerBound, upperBound, step}; - Value step = loop.step(); - if (!isStepOne) { - Value cst1 = builder.create(loc, 1); - loop.setStep(cst1); - } + Value diff = boundsBuilder.create(loc, upperBound, lowerBound); + newUpperBound = ceilDivPositive(boundsBuilder, loc, diff, step); + + if (isZeroBased) + newLowerBound = lowerBound; + else + newLowerBound = boundsBuilder.create(loc, 0); + + if (isStepOne) + newStep = step; + else + newStep = boundsBuilder.create(loc, 1); // Insert code computing the value of the original loop induction variable // from the "normalized" one. - builder.setInsertionPointToStart(inner.getBody()); Value scaled = - isStepOne ? loop.getInductionVar() - : builder.create(loc, loop.getInductionVar(), step); + isStepOne ? inductionVar + : insideLoopBuilder.create(loc, inductionVar, step); Value shifted = - isZeroBased ? scaled : builder.create(loc, scaled, lb); + isZeroBased ? scaled + : insideLoopBuilder.create(loc, scaled, lowerBound); SmallPtrSet preserve{scaled.getDefiningOp(), shifted.getDefiningOp()}; - replaceAllUsesExcept(loop.getInductionVar(), shifted, preserve); + replaceAllUsesExcept(inductionVar, shifted, preserve); + return {newLowerBound, newUpperBound, newStep}; +} + +/// Transform a loop with a strictly positive step +/// for %i = %lb to %ub step %s +/// into a 0-based loop with step 1 +/// for %ii = 0 to ceildiv(%ub - %lb, %s) step 1 { +/// %i = %ii * %s + %lb +/// Insert the induction variable remapping in the body of `inner`, which is +/// expected to be either `loop` or another loop perfectly nested under `loop`. +/// Insert the definition of new bounds immediate before `outer`, which is +/// expected to be either `loop` or its parent in the loop nest. +static void normalizeLoop(loop::ForOp loop, loop::ForOp outer, + loop::ForOp inner) { + OpBuilder builder(outer); + OpBuilder innerBuilder(inner.getBody(), inner.getBody()->begin()); + auto loopPieces = + normalizeLoop(builder, innerBuilder, loop.getLoc(), loop.lowerBound(), + loop.upperBound(), loop.step(), loop.getInductionVar()); + + loop.setLowerBound(std::get<0>(loopPieces)); + loop.setStep(std::get<2>(loopPieces)); + loop.setUpperBound(std::get<1>(loopPieces)); } void mlir::coalesceLoops(MutableArrayRef loops) { @@ -1093,6 +1106,74 @@ second.erase(); } +void mlir::collapsePLoops( + loop::ParallelOp loops, + std::vector> combinedDimensions) { + OpBuilder outsideBuilder(loops); + Location loc = loops.getLoc(); + + // Normalize ParallelOp's iteration pattern. + SmallVector normalizedLowerBounds; + SmallVector normalizedSteps; + SmallVector normalizedUpperBounds; + for (unsigned i = 0, e = loops.getNumLoops(); i < e; ++i) { + OpBuilder insideLoopBuilder(loops.getBody(), loops.getBody()->begin()); + auto resultBounds = + normalizeLoop(outsideBuilder, insideLoopBuilder, loc, + loops.lowerBound()[i], loops.upperBound()[i], + loops.step()[i], loops.getBody()->getArgument(i)); + + normalizedLowerBounds.push_back(std::get<0>(resultBounds)); + normalizedUpperBounds.push_back(std::get<1>(resultBounds)); + normalizedSteps.push_back(std::get<2>(resultBounds)); + } + + // Combine iteration spaces + SmallVector lowerBounds; + SmallVector steps; + SmallVector upperBounds; + auto cst0 = outsideBuilder.create(loc, 0); + auto cst1 = outsideBuilder.create(loc, 1); + for (unsigned i = 0, e = combinedDimensions.size(); i < e; ++i) { + Value newUpperBound = outsideBuilder.create(loc, 1); + for (auto idx : combinedDimensions[i]) { + newUpperBound = outsideBuilder.create(loc, newUpperBound, + normalizedUpperBounds[idx]); + } + lowerBounds.push_back(cst0); + steps.push_back(cst1); + upperBounds.push_back(newUpperBound); + } + + // Create new ParallelLoop with conversions to the original induction values. + auto newPloop = outsideBuilder.create(loc, lowerBounds, + upperBounds, steps); + OpBuilder insideBuilder(newPloop.getBody(), newPloop.getBody()->begin()); + for (unsigned i = 0, e = combinedDimensions.size(); i < e; ++i) { + Value previous = newPloop.getBody()->getArgument(i); + for (unsigned idx = 0, e = combinedDimensions[i].size(); idx < e; ++idx) { + unsigned ivar_idx = combinedDimensions[i][idx]; + if (idx != 0) + previous = insideBuilder.create( + loc, previous, loops.upperBound()[ivar_idx]); + + Value iv = (idx == e - 1) + ? previous + : insideBuilder.create( + loc, previous, loops.upperBound()[ivar_idx]); + replaceAllUsesInRegionWith(loops.getBody()->getArgument(ivar_idx), iv, + loops.region()); + } + } + + // Replace the old loop with the new loop. + loops.getBody()->back().erase(); + newPloop.getBody()->getOperations().splice( + Block::iterator(newPloop.getBody()->back()), + loops.getBody()->getOperations()); + loops.erase(); +} + void mlir::mapLoopToProcessorIds(loop::ForOp forOp, ArrayRef processorId, ArrayRef numProcessors) { assert(processorId.size() == numProcessors.size());