diff --git a/mlir/include/mlir/InitAllPasses.h b/mlir/include/mlir/InitAllPasses.h
--- a/mlir/include/mlir/InitAllPasses.h
+++ b/mlir/include/mlir/InitAllPasses.h
@@ -111,6 +111,7 @@
   createParallelLoopFusionPass();
   createParallelLoopSpecializationPass();
   createParallelLoopTilingPass();
+  createParallelLoopCoalescingPass();
 
   // QuantOps
   quant::createConvertSimulatedQuantPass();
diff --git a/mlir/include/mlir/Transforms/LoopUtils.h b/mlir/include/mlir/Transforms/LoopUtils.h
--- a/mlir/include/mlir/Transforms/LoopUtils.h
+++ b/mlir/include/mlir/Transforms/LoopUtils.h
@@ -28,6 +28,7 @@
 
 namespace loop {
 class ForOp;
+class ParallelOp;
 } // end namespace loop
 
 /// Unrolls this for operation completely if the trip count is known to be
@@ -225,6 +226,9 @@
 /// independent of any loop induction variable involved in the nest.
 void coalesceLoops(MutableArrayRef<loop::ForOp> loops);
 
+void coalescePLoops(loop::ParallelOp loops,
+                    std::vector<std::vector<unsigned>> combinedDimensions);
+
 /// Maps `forOp` for execution on a parallel grid of virtual `processorIds` of
 /// size given by `numProcessors`. This is achieved by embedding the SSA values
 /// corresponding to `processorIds` and `numProcessors` into the bounds and step
diff --git a/mlir/include/mlir/Transforms/Passes.h b/mlir/include/mlir/Transforms/Passes.h
--- a/mlir/include/mlir/Transforms/Passes.h
+++ b/mlir/include/mlir/Transforms/Passes.h
@@ -91,6 +91,10 @@
 /// bounds into a single loop.
 std::unique_ptr<OpPassBase<FuncOp>> createLoopCoalescingPass();
 
+/// Creates a pass that transforms a single ParallelLoop over N induction
+/// variables into another ParallelLoop over less than N induction variables.
+std::unique_ptr<OpPassBase<FuncOp>> createParallelLoopCoalescingPass();
+
 /// Performs packing (or explicit copying) of accessed memref regions into
 /// buffers in the specified faster memory space through either pointwise copies
 /// or DMA operations.
diff --git a/mlir/lib/Transforms/CMakeLists.txt b/mlir/lib/Transforms/CMakeLists.txt
--- a/mlir/lib/Transforms/CMakeLists.txt
+++ b/mlir/lib/Transforms/CMakeLists.txt
@@ -16,6 +16,7 @@
   LoopUnroll.cpp
   MemRefDataFlowOpt.cpp
   OpStats.cpp
+  ParallelLoopCoalescing.cpp
   PipelineDataTransfer.cpp
   SimplifyAffineStructures.cpp
   StripDebugInfo.cpp
diff --git a/mlir/lib/Transforms/ParallelLoopCoalescing.cpp b/mlir/lib/Transforms/ParallelLoopCoalescing.cpp
new file mode 100644
--- /dev/null
+++ b/mlir/lib/Transforms/ParallelLoopCoalescing.cpp
@@ -0,0 +1,68 @@
+//===- ParallelLoopCoalescing.cpp - Pass coalescing parallel loop indices -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/LoopOps/LoopOps.h"
+#include "mlir/Dialect/StandardOps/IR/Ops.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/LoopUtils.h"
+#include "mlir/Transforms/Passes.h"
+#include "mlir/Transforms/RegionUtils.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+#define PASS_NAME "parallel-loop-coalescing"
+#define DEBUG_TYPE PASS_NAME
+
+using namespace mlir;
+
+static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options");
+
+static llvm::cl::list<unsigned> clCoalescedIndices0(
+    "coalesced-indices-0",
+    llvm::cl::desc("Which loop indices to combine 0th loop index"),
+    llvm::cl::MiscFlags::CommaSeparated, llvm::cl::cat(clOptionsCategory));
+
+static llvm::cl::list<unsigned> clCoalescedIndices1(
+    "coalesced-indices-1",
+    llvm::cl::desc(
+        "Which loop indices to combine into the position 1 loop index"),
+    llvm::cl::MiscFlags::CommaSeparated, llvm::cl::cat(clOptionsCategory));
+
+static llvm::cl::list<unsigned> clCoalescedIndices2(
+    "coalesced-indices-2",
+    llvm::cl::desc(
+        "Which loop indices to combine into the position 2 loop index"),
+    llvm::cl::MiscFlags::CommaSeparated, llvm::cl::cat(clOptionsCategory));
+
+namespace {
+class ParallelLoopCoalescingPass
+    : public FunctionPass<ParallelLoopCoalescingPass> {
+public:
+  void runOnFunction() override {
+    FuncOp func = getFunction();
+
+    // The common case for GPU dialect will be simplifying the ParallelOp to 3
+    // arguments, so we do that here to simplify things.
+    func.walk([&](loop::ParallelOp op) {
+      std::vector<std::vector<unsigned>> combinedLoops(3);
+      combinedLoops[0] = clCoalescedIndices0;
+      combinedLoops[1] = clCoalescedIndices1;
+      combinedLoops[2] = clCoalescedIndices2;
+      coalescePLoops(op, combinedLoops);
+    });
+  }
+};
+
+} // namespace
+
+std::unique_ptr<OpPassBase<FuncOp>> mlir::createParallelLoopCoalescingPass() {
+  return std::make_unique<ParallelLoopCoalescingPass>();
+}
+
+static PassRegistration<ParallelLoopCoalescingPass>
+    reg(PASS_NAME, "coalesce parallel loops to use less induction variables.");
diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp
--- a/mlir/lib/Transforms/Utils/LoopUtils.cpp
+++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp
@@ -973,69 +973,85 @@
   }
 }
 
-// Transform a loop with a strictly positive step
-//   for %i = %lb to %ub step %s
-// into a 0-based loop with step 1
-//   for %ii = 0 to ceildiv(%ub - %lb, %s) step 1 {
-//     %i = %ii * %s + %lb
-// Insert the induction variable remapping in the body of `inner`, which is
-// expected to be either `loop` or another loop perfectly nested under `loop`.
-// Insert the definition of new bounds immediate before `outer`, which is
-// expected to be either `loop` or its parent in the loop nest.
-static void normalizeLoop(loop::ForOp loop, loop::ForOp outer,
-                          loop::ForOp inner) {
-  OpBuilder builder(outer);
-  Location loc = loop.getLoc();
-
+// Return the new lower bound, upper bound, and step in that order. Insert any
+// additional bounds calculations before the given builder and any additional
+// conversion back to the original loop induction value inside the given Block.
+static std::tuple<Value, Value, Value>
+normalizeLoop(OpBuilder &boundsBuilder, OpBuilder &insideLoopBuilder,
+              Location loc, Value lowerBound, Value upperBound, Value step,
+              Value inductionVar) {
+  Value newLowerBound, newUpperBound, newStep;
   // Check if the loop is already known to have a constant zero lower bound or
   // a constant one step.
   bool isZeroBased = false;
   if (auto ubCst =
-          dyn_cast_or_null<ConstantIndexOp>(loop.lowerBound().getDefiningOp()))
+          dyn_cast_or_null<ConstantIndexOp>(lowerBound.getDefiningOp()))
     isZeroBased = ubCst.getValue() == 0;
 
   bool isStepOne = false;
-  if (auto stepCst =
-          dyn_cast_or_null<ConstantIndexOp>(loop.step().getDefiningOp()))
+  if (auto stepCst = dyn_cast_or_null<ConstantIndexOp>(step.getDefiningOp()))
     isStepOne = stepCst.getValue() == 1;
-  if (isZeroBased && isStepOne)
-    return;
 
   // Compute the number of iterations the loop executes: ceildiv(ub - lb, step)
   // assuming the step is strictly positive.  Update the bounds and the step
   // of the loop to go from 0 to the number of iterations, if necessary.
   // TODO(zinenko): introduce support for negative steps or emit dynamic asserts
   // on step positivity, whatever gets implemented first.
-  Value diff =
-      builder.create<SubIOp>(loc, loop.upperBound(), loop.lowerBound());
-  Value numIterations = ceilDivPositive(builder, loc, diff, loop.step());
-  loop.setUpperBound(numIterations);
-
-  Value lb = loop.lowerBound();
-  if (!isZeroBased) {
-    Value cst0 = builder.create<ConstantIndexOp>(loc, 0);
-    loop.setLowerBound(cst0);
+  if (isZeroBased && isStepOne) {
+    return {lowerBound, upperBound, step};
   }
 
-  Value step = loop.step();
-  if (!isStepOne) {
-    Value cst1 = builder.create<ConstantIndexOp>(loc, 1);
-    loop.setStep(cst1);
+  Value diff = boundsBuilder.create<SubIOp>(loc, upperBound, lowerBound);
+  newUpperBound = ceilDivPositive(boundsBuilder, loc, diff, step);
+
+  if (isZeroBased) {
+    newLowerBound = lowerBound;
+  } else {
+    newLowerBound = boundsBuilder.create<ConstantIndexOp>(loc, 0);
+  }
+
+  if (isStepOne) {
+    newStep = step;
+  } else {
+    newStep = boundsBuilder.create<ConstantIndexOp>(loc, 1);
   }
 
   // Insert code computing the value of the original loop induction variable
   // from the "normalized" one.
-  builder.setInsertionPointToStart(inner.getBody());
   Value scaled =
-      isStepOne ? loop.getInductionVar()
-                : builder.create<MulIOp>(loc, loop.getInductionVar(), step);
+      isStepOne ? inductionVar
+                : insideLoopBuilder.create<MulIOp>(loc, inductionVar, step);
   Value shifted =
-      isZeroBased ? scaled : builder.create<AddIOp>(loc, scaled, lb);
+      isZeroBased ? scaled
+                  : insideLoopBuilder.create<AddIOp>(loc, scaled, lowerBound);
 
   SmallPtrSet<Operation *, 2> preserve{scaled.getDefiningOp(),
                                        shifted.getDefiningOp()};
-  replaceAllUsesExcept(loop.getInductionVar(), shifted, preserve);
+  replaceAllUsesExcept(inductionVar, shifted, preserve);
+  return {newLowerBound, newUpperBound, newStep};
+}
+
+// Transform a loop with a strictly positive step
+//   for %i = %lb to %ub step %s
+// into a 0-based loop with step 1
+//   for %ii = 0 to ceildiv(%ub - %lb, %s) step 1 {
+//     %i = %ii * %s + %lb
+// Insert the induction variable remapping in the body of `inner`, which is
+// expected to be either `loop` or another loop perfectly nested under `loop`.
+// Insert the definition of new bounds immediately before `outer`, which is
+// expected to be either `loop` or its parent in the loop nest.
+static void normalizeLoop(loop::ForOp loop, loop::ForOp outer,
+                          loop::ForOp inner) {
+  OpBuilder builder(outer);
+  OpBuilder innerBuilder(inner.getBody(), inner.getBody()->begin());
+  auto loopPieces =
+      normalizeLoop(builder, innerBuilder, loop.getLoc(), loop.lowerBound(),
+                    loop.upperBound(), loop.step(), loop.getInductionVar());
+
+  loop.setLowerBound(std::get<0>(loopPieces));
+  loop.setStep(std::get<2>(loopPieces));
+  loop.setUpperBound(std::get<1>(loopPieces));
 }
 
 void mlir::coalesceLoops(MutableArrayRef<loop::ForOp> loops) {
@@ -1093,6 +1109,74 @@
   second.erase();
 }
 
+void mlir::coalescePLoops(
+    loop::ParallelOp loops,
+    std::vector<std::vector<unsigned>> combinedDimensions) {
+  OpBuilder outsideBuilder(loops);
+
+  // Normalize ParallelOp's iteration pattern.
+  SmallVector<Value, 6> normalizedLowerBounds;
+  SmallVector<Value, 6> normalizedSteps;
+  SmallVector<Value, 6> normalizedUpperBounds;
+  for (unsigned i = 0; i < loops.getNumLoops(); i++) {
+    OpBuilder insideLoopBuilder(loops.getBody(), loops.getBody()->begin());
+    auto resultBounds =
+        normalizeLoop(outsideBuilder, insideLoopBuilder, loops.getLoc(),
+                      loops.lowerBound()[i], loops.upperBound()[i],
+                      loops.step()[i], loops.getBody()->getArgument(i));
+
+    normalizedLowerBounds.push_back(std::get<0>(resultBounds));
+    normalizedUpperBounds.push_back(std::get<1>(resultBounds));
+    normalizedSteps.push_back(std::get<2>(resultBounds));
+  }
+
+  // Combine iteration spaces
+  SmallVector<Value, 3> lowerBounds;
+  SmallVector<Value, 3> steps;
+  SmallVector<Value, 3> upperBounds;
+  auto cst0 = outsideBuilder.create<ConstantIndexOp>(loops.getLoc(), 0);
+  auto cst1 = outsideBuilder.create<ConstantIndexOp>(loops.getLoc(), 1);
+  for (unsigned i = 0; i < combinedDimensions.size(); i++) {
+    Value newUpperBound =
+        outsideBuilder.create<ConstantIndexOp>(loops.getLoc(), 1);
+    for (auto idx : combinedDimensions[i]) {
+      newUpperBound = outsideBuilder.create<MulIOp>(
+          loops.getLoc(), newUpperBound, normalizedUpperBounds[idx]);
+    }
+    lowerBounds.push_back(cst0);
+    steps.push_back(cst1);
+    upperBounds.push_back(newUpperBound);
+  }
+
+  // Create new ParallelLoop with conversions to the original induction values.
+  auto newPloop = outsideBuilder.create<loop::ParallelOp>(
+      loops.getLoc(), lowerBounds, upperBounds, steps);
+  OpBuilder insideBuilder(newPloop.getBody(), newPloop.getBody()->begin());
+  for (unsigned i = 0; i < combinedDimensions.size(); i++) {
+    Value previous = newPloop.getBody()->getArgument(i);
+    for (unsigned idx = 0, e = combinedDimensions[i].size(); idx < e; ++idx) {
+      unsigned ivar_idx = combinedDimensions[i][idx];
+      if (idx != 0)
+        previous = insideBuilder.create<SignedDivIOp>(
+            loops.getLoc(), previous, loops.upperBound()[ivar_idx]);
+
+      Value iv = (idx == e - 1) ? previous
+                                : insideBuilder.create<SignedRemIOp>(
+                                      loops.getLoc(), previous,
+                                      loops.upperBound()[ivar_idx]);
+      replaceAllUsesInRegionWith(loops.getBody()->getArgument(ivar_idx), iv,
+                                 loops.region());
+    }
+  }
+
+  // Replace the old loop with the new loop.
+  loops.getBody()->back().erase();
+  newPloop.getBody()->getOperations().splice(
+      Block::iterator(newPloop.getBody()->back()),
+      loops.getBody()->getOperations());
+  loops.erase();
+}
+
 void mlir::mapLoopToProcessorIds(loop::ForOp forOp,
                                  ArrayRef<Value> processorId,
                                  ArrayRef<Value> numProcessors) {
   assert(processorId.size() == numProcessors.size());
diff --git a/mlir/test/Transforms/parallel-loop-coalescing.mlir b/mlir/test/Transforms/parallel-loop-coalescing.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Transforms/parallel-loop-coalescing.mlir
@@ -0,0 +1,96 @@
+// RUN: mlir-opt -parallel-loop-coalescing --coalesced-indices-0=0,3 --coalesced-indices-1=1,4 --coalesced-indices-2=2 %s | FileCheck %s
+
+// CHECK-LABEL: func @parallel_many_dims() {
+func @parallel_many_dims() {
+  // CHECK: [[VAL_0:%.*]] = constant 0 : index
+  // CHECK: [[VAL_1:%.*]] = constant 1 : index
+  // CHECK: [[VAL_2:%.*]] = constant 2 : index
+  // CHECK: [[VAL_3:%.*]] = constant 3 : index
+  // CHECK: [[VAL_4:%.*]] = constant 4 : index
+  // CHECK: [[VAL_5:%.*]] = constant 5 : index
+  // CHECK: [[VAL_6:%.*]] = constant 6 : index
+  // CHECK: [[VAL_7:%.*]] = constant 7 : index
+  // CHECK: [[VAL_8:%.*]] = constant 8 : index
+  // CHECK: [[VAL_9:%.*]] = constant 9 : index
+  // CHECK: [[VAL_10:%.*]] = constant 10 : index
+  // CHECK: [[VAL_11:%.*]] = constant 11 : index
+  // CHECK: [[VAL_12:%.*]] = constant 12 : index
+  // CHECK: [[VAL_13:%.*]] = constant 13 : index
+  // CHECK: [[VAL_14:%.*]] = constant 14 : index
+  // CHECK: [[VAL_15:%.*]] = subi [[VAL_5]], [[VAL_3]] : index
+  // CHECK: [[VAL_16:%.*]] = constant 1 : index
+  // CHECK: [[VAL_17:%.*]] = subi [[VAL_4]], [[VAL_16]] : index
+  // CHECK: [[VAL_18:%.*]] = addi [[VAL_15]], [[VAL_17]] : index
+  // CHECK: [[VAL_19:%.*]] = divi_signed [[VAL_18]], [[VAL_4]] : index
+  // CHECK: [[VAL_20:%.*]] = constant 0 : index
+  // CHECK: [[VAL_21:%.*]] = constant 1 : index
+  // CHECK: [[VAL_22:%.*]] = subi [[VAL_8]], [[VAL_6]] : index
+  // CHECK: [[VAL_23:%.*]] = constant 1 : index
+  // CHECK: [[VAL_24:%.*]] = subi [[VAL_7]], [[VAL_23]] : index
+  // CHECK: [[VAL_25:%.*]] = addi [[VAL_22]], [[VAL_24]] : index
+  // CHECK: [[VAL_26:%.*]] = divi_signed [[VAL_25]], [[VAL_7]] : index
+  // CHECK: [[VAL_27:%.*]] = constant 0 : index
+  // CHECK: [[VAL_28:%.*]] = constant 1 : index
+  // CHECK: [[VAL_29:%.*]] = subi [[VAL_11]], [[VAL_9]] : index
+  // CHECK: [[VAL_30:%.*]] = constant 1 : index
+  // CHECK: [[VAL_31:%.*]] = subi [[VAL_10]], [[VAL_30]] : index
+  // CHECK: [[VAL_32:%.*]] = addi [[VAL_29]], [[VAL_31]] : index
+  // CHECK: [[VAL_33:%.*]] = divi_signed [[VAL_32]], [[VAL_10]] : index
+  // CHECK: [[VAL_34:%.*]] = constant 0 : index
+  // CHECK: [[VAL_35:%.*]] = constant 1 : index
+  // CHECK: [[VAL_36:%.*]] = subi [[VAL_14]], [[VAL_12]] : index
+  // CHECK: [[VAL_37:%.*]] = constant 1 : index
+  // CHECK: [[VAL_38:%.*]] = subi [[VAL_13]], [[VAL_37]] : index
+  // CHECK: [[VAL_39:%.*]] = addi [[VAL_36]], [[VAL_38]] : index
+  // CHECK: [[VAL_40:%.*]] = divi_signed [[VAL_39]], [[VAL_13]] : index
+  // CHECK: [[VAL_41:%.*]] = constant 0 : index
+  // CHECK: [[VAL_42:%.*]] = constant 1 : index
+  // CHECK: [[VAL_43:%.*]] = constant 0 : index
+  // CHECK: [[VAL_44:%.*]] = constant 1 : index
+  // CHECK: [[VAL_45:%.*]] = constant 1 : index
+  // CHECK: [[VAL_46:%.*]] = muli [[VAL_45]], [[VAL_2]] : index
+  // CHECK: [[VAL_47:%.*]] = muli [[VAL_46]], [[VAL_33]] : index
+  // CHECK: [[VAL_48:%.*]] = constant 1 : index
+  // CHECK: [[VAL_49:%.*]] = muli [[VAL_48]], [[VAL_19]] : index
+  // CHECK: [[VAL_50:%.*]] = muli [[VAL_49]], [[VAL_40]] : index
+  // CHECK: [[VAL_51:%.*]] = constant 1 : index
+  // CHECK: [[VAL_52:%.*]] = muli [[VAL_51]], [[VAL_26]] : index
+  %c0 = constant 0 : index
+  %c1 = constant 1 : index
+  %c2 = constant 2 : index
+  %c3 = constant 3 : index
+  %c4 = constant 4 : index
+  %c5 = constant 5 : index
+  %c6 = constant 6 : index
+  %c7 = constant 7 : index
+  %c8 = constant 8 : index
+  %c9 = constant 9 : index
+  %c10 = constant 10 : index
+  %c11 = constant 11 : index
+  %c12 = constant 12 : index
+  %c13 = constant 13 : index
+  %c14 = constant 14 : index
+// CHECK: loop.parallel ([[VAL_53:%.*]], [[VAL_54:%.*]], [[VAL_55:%.*]]) = ([[VAL_43]], [[VAL_43]], [[VAL_43]]) to ([[VAL_47]], [[VAL_50]], [[VAL_52]]) step ([[VAL_44]], [[VAL_44]], [[VAL_44]]) {
+
+  loop.parallel (%i0, %i1, %i2, %i3, %i4) = (%c0, %c3, %c6, %c9, %c12) to (%c2, %c5, %c8, %c11, %c14)
+      step (%c1, %c4, %c7, %c10, %c13) {
+    // CHECK: [[VAL_56:%.*]] = remi_signed [[VAL_53]], [[VAL_2]] : index
+    // CHECK: [[VAL_57:%.*]] = divi_signed [[VAL_53]], [[VAL_11]] : index
+    // CHECK: [[VAL_58:%.*]] = remi_signed [[VAL_54]], [[VAL_5]] : index
+    // CHECK: [[VAL_59:%.*]] = divi_signed [[VAL_54]], [[VAL_14]] : index
+    // CHECK: [[VAL_60:%.*]] = muli [[VAL_59]], [[VAL_13]] : index
+    // CHECK: [[VAL_61:%.*]] = addi [[VAL_60]], [[VAL_12]] : index
+    // CHECK: [[VAL_62:%.*]] = muli [[VAL_57]], [[VAL_10]] : index
+    // CHECK: [[VAL_63:%.*]] = addi [[VAL_62]], [[VAL_9]] : index
+    // CHECK: [[VAL_64:%.*]] = muli [[VAL_55]], [[VAL_7]] : index
+    // CHECK: [[VAL_65:%.*]] = addi [[VAL_64]], [[VAL_6]] : index
+    // CHECK: [[VAL_66:%.*]] = muli [[VAL_58]], [[VAL_4]] : index
+    // CHECK: [[VAL_67:%.*]] = addi [[VAL_66]], [[VAL_3]] : index
+    // CHECK: [[VAL_68:%.*]] = "magic.op"([[VAL_56]], [[VAL_67]], [[VAL_65]], [[VAL_63]], [[VAL_61]]) : (index, index, index, index, index) -> index
+
+    %result = "magic.op"(%i0, %i1, %i2, %i3, %i4): (index, index, index, index, index) -> index
+    // CHECK: loop.yield
+  }
+  // CHECK: return
+  return
+}