diff --git a/mlir/include/mlir/Transforms/LoopUtils.h b/mlir/include/mlir/Transforms/LoopUtils.h --- a/mlir/include/mlir/Transforms/LoopUtils.h +++ b/mlir/include/mlir/Transforms/LoopUtils.h @@ -28,6 +28,7 @@ namespace loop { class ForOp; +class ParallelOp; } // end namespace loop /// Unrolls this for operation completely if the trip count is known to be @@ -225,6 +226,9 @@ /// independent of any loop induction variable involved in the nest. void coalesceLoops(MutableArrayRef loops); +void coalescePLoops(loop::ParallelOp loops, + std::vector> combinedDimensions); + /// Maps `forOp` for execution on a parallel grid of virtual `processorIds` of /// size given by `numProcessors`. This is achieved by embedding the SSA values /// corresponding to `processorIds` and `numProcessors` into the bounds and step diff --git a/mlir/include/mlir/Transforms/Passes.h b/mlir/include/mlir/Transforms/Passes.h --- a/mlir/include/mlir/Transforms/Passes.h +++ b/mlir/include/mlir/Transforms/Passes.h @@ -91,6 +91,11 @@ /// bounds into a single loop. std::unique_ptr> createLoopCoalescingPass(); +/// Creates a pass that transforms a single ParallelLoop over N induction +/// variables into another ParallelLoop over less than N induction variables. +std::unique_ptr> createParallelLoopCoalescingPass(); + + /// Performs packing (or explicit copying) of accessed memref regions into /// buffers in the specified faster memory space through either pointwise copies /// or DMA operations. diff --git a/mlir/lib/Transforms/CMakeLists.txt b/mlir/lib/Transforms/CMakeLists.txt --- a/mlir/lib/Transforms/CMakeLists.txt +++ b/mlir/lib/Transforms/CMakeLists.txt @@ -16,6 +16,7 @@ LoopUnroll.cpp MemRefDataFlowOpt.cpp OpStats.cpp + ParallelLoopCoalescing.cpp PipelineDataTransfer.cpp SimplifyAffineStructures.cpp StripDebugInfo.cpp diff --git a/mlir/lib/Transforms/ParallelLoopCoalescing.cpp b/mlir/lib/Transforms/ParallelLoopCoalescing.cpp new file mode 100644 --- /dev/null +++ b/mlir/lib/Transforms/ParallelLoopCoalescing.cpp @@ -0,0 +1,49 @@ +//===- ParallelLoopCoalescing.cpp - Pass coalescing parallel loop indices -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/LoopOps/LoopOps.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/LoopUtils.h" +#include "mlir/Transforms/Passes.h" +#include "mlir/Transforms/RegionUtils.h" +#include "llvm/Support/Debug.h" + +#define PASS_NAME "parallel-loop-coalescing" +#define DEBUG_TYPE PASS_NAME + +using namespace mlir; + +namespace { +class ParallelLoopCoalescingPass + : public FunctionPass { +public: + void runOnFunction() override { + FuncOp func = getFunction(); + + // The common case for GPU dialect will be simplifying the ParallelOp to 3 + // arguments, so we do that here to simplify things. + func.walk([&](loop::ParallelOp op) { + std::vector> combinedLoops(3); + for (unsigned i = 0; i < op.getNumInductionVars(); i++) { + combinedLoops[i % 3].push_back(i); + } + coalescePLoops(op, combinedLoops); + func.dump(); + }); + } +}; + +} // namespace + +std::unique_ptr> mlir::createParallelLoopCoalescingPass() { + return std::make_unique(); +} + +static PassRegistration + reg(PASS_NAME, "coalesce parallel loops to use less induction variables."); diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp --- a/mlir/lib/Transforms/Utils/LoopUtils.cpp +++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp @@ -973,69 +973,83 @@ } } -// Transform a loop with a strictly positive step -// for %i = %lb to %ub step %s -// into a 0-based loop with step 1 -// for %ii = 0 to ceildiv(%ub - %lb, %s) step 1 { -// %i = %ii * %s + %lb -// Insert the induction variable remapping in the body of `inner`, which is -// expected to be either `loop` or another loop perfectly nested under `loop`. -// Insert the definition of new bounds immediate before `outer`, which is -// expected to be either `loop` or its parent in the loop nest. -static void normalizeLoop(loop::ForOp loop, loop::ForOp outer, - loop::ForOp inner) { - OpBuilder builder(outer); - Location loc = loop.getLoc(); - +static std::tuple +normalizeLoop(OpBuilder &builder, Block *body, Value lowerBound, Value step, + Value upperBound, Value inductionVar) { + Value newLowerBound, newUpperBound, newStep; + auto loc = builder.getUnknownLoc(); // Check if the loop is already known to have a constant zero lower bound or // a constant one step. bool isZeroBased = false; if (auto ubCst = - dyn_cast_or_null(loop.lowerBound().getDefiningOp())) + dyn_cast_or_null(lowerBound.getDefiningOp())) isZeroBased = ubCst.getValue() == 0; bool isStepOne = false; - if (auto stepCst = - dyn_cast_or_null(loop.step().getDefiningOp())) + if (auto stepCst = dyn_cast_or_null(step.getDefiningOp())) isStepOne = stepCst.getValue() == 1; - if (isZeroBased && isStepOne) - return; // Compute the number of iterations the loop executes: ceildiv(ub - lb, step) // assuming the step is strictly positive. Update the bounds and the step // of the loop to go from 0 to the number of iterations, if necessary. // TODO(zinenko): introduce support for negative steps or emit dynamic asserts // on step positivity, whatever gets implemented first. - Value diff = - builder.create(loc, loop.upperBound(), loop.lowerBound()); - Value numIterations = ceilDivPositive(builder, loc, diff, loop.step()); - loop.setUpperBound(numIterations); - - Value lb = loop.lowerBound(); - if (!isZeroBased) { - Value cst0 = builder.create(loc, 0); - loop.setLowerBound(cst0); + + if (isZeroBased && isStepOne) { + return {lowerBound, step, upperBound}; } - Value step = loop.step(); - if (!isStepOne) { - Value cst1 = builder.create(loc, 1); - loop.setStep(cst1); + Value diff = builder.create(loc, upperBound, lowerBound); + newUpperBound = ceilDivPositive(builder, loc, diff, step); + + if (isZeroBased) { + newLowerBound = lowerBound; + } else { + newLowerBound = builder.create(loc, 0); + } + + if (isStepOne) { + newStep = step; + } else { + newStep = builder.create(loc, 1); } // Insert code computing the value of the original loop induction variable // from the "normalized" one. - builder.setInsertionPointToStart(inner.getBody()); - Value scaled = - isStepOne ? loop.getInductionVar() - : builder.create(loc, loop.getInductionVar(), step); - Value shifted = - isZeroBased ? scaled : builder.create(loc, scaled, lb); + OpBuilder insideBuilder(body, body->begin()); + Value scaled = isStepOne + ? inductionVar + : insideBuilder.create(loc, inductionVar, step); + Value shifted = isZeroBased + ? scaled + : insideBuilder.create(loc, scaled, lowerBound); SmallPtrSet preserve{scaled.getDefiningOp(), shifted.getDefiningOp()}; - replaceAllUsesExcept(loop.getInductionVar(), shifted, preserve); + replaceAllUsesExcept(inductionVar, shifted, preserve); + return {newLowerBound, newStep, newUpperBound}; +} + +// Transform a loop with a strictly positive step +// for %i = %lb to %ub step %s +// into a 0-based loop with step 1 +// for %ii = 0 to ceildiv(%ub - %lb, %s) step 1 { +// %i = %ii * %s + %lb +// Insert the induction variable remapping in the body of `inner`, which is +// expected to be either `loop` or another loop perfectly nested under `loop`. +// Insert the definition of new bounds immediate before `outer`, which is +// expected to be either `loop` or its parent in the loop nest. +static void normalizeLoop(loop::ForOp loop, loop::ForOp outer, + loop::ForOp inner) { + OpBuilder builder(outer); + auto loopPieces = + normalizeLoop(builder, inner.getBody(), loop.lowerBound(), loop.step(), + loop.upperBound(), loop.getInductionVar()); + + loop.setLowerBound(std::get<0>(loopPieces)); + loop.setStep(std::get<1>(loopPieces)); + loop.setUpperBound(std::get<2>(loopPieces)); } void mlir::coalesceLoops(MutableArrayRef loops) { @@ -1093,6 +1107,118 @@ second.erase(); } +void mlir::coalescePLoops( + loop::ParallelOp loops, + std::vector> combinedDimensions) { + OpBuilder outsideBuilder(loops); + + SmallVector normalizedLowerBounds; + SmallVector normalizedSteps; + SmallVector normalizedUpperBounds; + // Normalize ParallelOp's iteration pattern. + for (unsigned i = 0; i < loops.getNumLoops(); i++) { + auto resultBounds = normalizeLoop( + outsideBuilder, loops.getBody(), loops.lowerBound()[i], loops.step()[i], + loops.upperBound()[i], loops.getBody()->getArgument(i)); + + normalizedLowerBounds.push_back(std::get<0>(resultBounds)); + normalizedSteps.push_back(std::get<1>(resultBounds)); + normalizedUpperBounds.push_back(std::get<2>(resultBounds)); + } + + // Combine iteration spaces + SmallVector lowerBounds; + SmallVector steps; + SmallVector upperBounds; + auto cst0 = outsideBuilder.create(loops.getLoc(), 0); + ; + auto cst1 = outsideBuilder.create(loops.getLoc(), 1); + ; + for (unsigned i = 0; i < combinedDimensions.size(); i++) { + Value newUpperBound = + outsideBuilder.create(loops.getLoc(), 1); + for (auto idx : combinedDimensions[i]) { + newUpperBound = outsideBuilder.create( + loops.getLoc(), newUpperBound, normalizedUpperBounds[idx]); + } + lowerBounds.push_back(cst0); + steps.push_back(cst1); + upperBounds.push_back(newUpperBound); + } + auto newPloop = outsideBuilder.create( + loops.getLoc(), lowerBounds, upperBounds, steps); + OpBuilder insideBuilder(newPloop.getBody(), newPloop.getBody()->begin()); + // TODO: iterate the other way and mod to the get the original values and + // then replace all uses of the original induction vars with the new moded + // values of the one induction variable + for (unsigned i = 0; i < combinedDimensions.size(); i++) { + Value previous = newPloop.getBody()->getArgument(i); + for (unsigned idx = 0, e = combinedDimensions[i].size(); idx < e; ++idx) { + unsigned ivar_idx = combinedDimensions[i][idx]; + if (idx != 0) + previous = insideBuilder.create( + loops.getLoc(), previous, loops.upperBound()[ivar_idx]); + + Value iv = (idx == e - 1) ? previous + : insideBuilder.create( + loops.getLoc(), previous, + loops.upperBound()[ivar_idx]); + replaceAllUsesInRegionWith(loops.getBody()->getArgument(ivar_idx), iv, + loops.region()); + } + } + + loops.getBody()->back().erase(); + newPloop.getBody()->getOperations().splice( + Block::iterator(newPloop.getBody()->back()), + loops.getBody()->getOperations()); + loops.erase(); + + /* + // 2. Emit code computing the upper bound of the coalesced loop as product + // of the number of iterations of all loops. + OpBuilder builder(outermost); + Location loc = outermost.getLoc(); + Value upperBound = outermost.upperBound(); + for (auto loop : loops.drop_front()) + upperBound = builder.create(loc, upperBound, loop.upperBound()); + outermost.setUpperBound(upperBound); + + builder.setInsertionPointToStart(outermost.getBody()); + + // 3. Remap induction variables. For each original loop, the value of the + // induction variable can be obtained by dividing the induction variable of + // the linearized loop by the total number of iterations of the loops nested + // in it modulo the number of iterations in this loop (remove the values + // related to the outer loops): + // iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i. + // Compute these iteratively from the innermost loop by creating a "running + // quotient" of division by the range. + Value previous = outermost.getInductionVar(); + for (unsigned i = 0, e = loops.size(); i < e; ++i) { + unsigned idx = loops.size() - i - 1; + if (i != 0) + previous = builder.create(loc, previous, + loops[idx + 1].upperBound()); + + Value iv = (i == e - 1) ? previous + : builder.create( + loc, previous, loops[idx].upperBound()); + replaceAllUsesInRegionWith(loops[idx].getInductionVar(), iv, + loops.back().region()); + } + + // 4. Move the operations from the innermost just above the second-outermost + // loop, delete the extra terminator and the second-outermost loop. + loop::ForOp second = loops[1]; + innermost.getBody()->back().erase(); + outermost.getBody()->getOperations().splice( + Block::iterator(second.getOperation()), + innermost.getBody()->getOperations()); + second.erase(); + */ +} + void mlir::mapLoopToProcessorIds(loop::ForOp forOp, ArrayRef processorId, ArrayRef numProcessors) { assert(processorId.size() == numProcessors.size()); diff --git a/mlir/test/Transforms/parallel-loop-coalescing.mlir b/mlir/test/Transforms/parallel-loop-coalescing.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Transforms/parallel-loop-coalescing.mlir @@ -0,0 +1,25 @@ +// RUN: mlir-opt -parallel-loop-coalescing %s | FileCheck %s + +// CHECK-LABEL: @parallel_many_dims +func @parallel_many_dims() { + %c0 = constant 0 : index + %c1 = constant 1 : index + %c2 = constant 2 : index + %c3 = constant 3 : index + %c4 = constant 4 : index + %c5 = constant 5 : index + %c6 = constant 6 : index + %c7 = constant 7 : index + %c8 = constant 8 : index + %c9 = constant 9 : index + %c10 = constant 10 : index + %c11 = constant 11 : index + %c12 = constant 12 : index + %c13 = constant 13 : index + %c14 = constant 14 : index + loop.parallel (%i0, %i1, %i2, %i3, %i4) = (%c0, %c3, %c6, %c9, %c12) to (%c2, %c5, %c8, %c11, %c14) + step (%c1, %c4, %c7, %c10, %c13) { + %result = "magic.op"(%i0, %i1, %i2, %i3, %i4): (index, index, index, index, index) -> index + } + return +}