diff --git a/mlir/include/mlir/Transforms/LoopUtils.h b/mlir/include/mlir/Transforms/LoopUtils.h --- a/mlir/include/mlir/Transforms/LoopUtils.h +++ b/mlir/include/mlir/Transforms/LoopUtils.h @@ -243,6 +243,14 @@ /// independent of any loop induction variable involved in the nest. void coalesceLoops(MutableArrayRef loops); +/// Replace a perfect nest of "for" loops with a single linearized loop. Assumes +/// `loops` contains a list of perfectly nested loops outermost to innermost +/// that are normalized (step one and lower bound of zero) and with bounds and +/// steps independent of any loop induction variable involved in the nest. +/// Coalescing affine.for loops is not always possible, i.e., the result may not +/// be representable using affine.for. +LogicalResult coalesceLoops(MutableArrayRef loops); + /// Take the ParallelLoop and for each set of dimension indices, combine them /// into a single dimension. combinedDimensions must contain each index into /// loops exactly once. diff --git a/mlir/lib/Transforms/LoopCoalescing.cpp b/mlir/lib/Transforms/LoopCoalescing.cpp --- a/mlir/lib/Transforms/LoopCoalescing.cpp +++ b/mlir/lib/Transforms/LoopCoalescing.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "PassDetail.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/SCF/SCF.h" #include "mlir/Transforms/LoopUtils.h" #include "mlir/Transforms/Passes.h" @@ -20,65 +21,73 @@ namespace { struct LoopCoalescingPass : public LoopCoalescingBase { - void runOnFunction() override { - FuncOp func = getFunction(); - func.walk([](scf::ForOp op) { - // Ignore nested loops. - if (op->getParentOfType()) - return; + /// Walk either an scf.for or an affine.for to find a band to coalesce. + template + static void walkLoop(LoopOpTy op) { + // Ignore nested loops. + if (op->template getParentOfType()) + return; - SmallVector loops; - getPerfectlyNestedLoops(loops, op); - LLVM_DEBUG(llvm::dbgs() - << "found a perfect nest of depth " << loops.size() << '\n'); + SmallVector loops; + getPerfectlyNestedLoops(loops, op); + LLVM_DEBUG(llvm::dbgs() + << "found a perfect nest of depth " << loops.size() << '\n'); - // Look for a band of loops that can be coalesced, i.e. perfectly nested - // loops with bounds defined above some loop. - // 1. For each loop, find above which parent loop its operands are - // defined. - SmallVector operandsDefinedAbove(loops.size()); - for (unsigned i = 0, e = loops.size(); i < e; ++i) { - operandsDefinedAbove[i] = i; - for (unsigned j = 0; j < i; ++j) { - if (areValuesDefinedAbove(loops[i].getOperands(), - loops[j].region())) { - operandsDefinedAbove[i] = j; - break; - } + // Look for a band of loops that can be coalesced, i.e. perfectly nested + // loops with bounds defined above some loop. + // 1. For each loop, find above which parent loop its operands are + // defined. + SmallVector operandsDefinedAbove(loops.size()); + for (unsigned i = 0, e = loops.size(); i < e; ++i) { + operandsDefinedAbove[i] = i; + for (unsigned j = 0; j < i; ++j) { + if (areValuesDefinedAbove(loops[i].getOperands(), loops[j].region())) { + operandsDefinedAbove[i] = j; + break; } - LLVM_DEBUG(llvm::dbgs() - << " bounds of loop " << i << " are known above depth " - << operandsDefinedAbove[i] << '\n'); } + LLVM_DEBUG(llvm::dbgs() + << " bounds of loop " << i << " are known above depth " + << operandsDefinedAbove[i] << '\n'); + } - // 2. Identify bands of loops such that the operands of all of them are - // defined above the first loop in the band. Traverse the nest bottom-up - // so that modifications don't invalidate the inner loops. - for (unsigned end = loops.size(); end > 0; --end) { - unsigned start = 0; - for (; start < end - 1; ++start) { - auto maxPos = - *std::max_element(std::next(operandsDefinedAbove.begin(), start), - std::next(operandsDefinedAbove.begin(), end)); - if (maxPos > start) - continue; + // 2. Identify bands of loops such that the operands of all of them are + // defined above the first loop in the band. Traverse the nest bottom-up + // so that modifications don't invalidate the inner loops. + for (unsigned end = loops.size(); end > 0; --end) { + unsigned start = 0; + for (; start < end - 1; ++start) { + auto maxPos = + *std::max_element(std::next(operandsDefinedAbove.begin(), start), + std::next(operandsDefinedAbove.begin(), end)); + if (maxPos > start) + continue; - assert(maxPos == start && - "expected loop bounds to be known at the start of the band"); - LLVM_DEBUG(llvm::dbgs() << " found coalesceable band from " << start - << " to " << end << '\n'); + assert(maxPos == start && + "expected loop bounds to be known at the start of the band"); + LLVM_DEBUG(llvm::dbgs() << " found coalesceable band from " << start + << " to " << end << '\n'); - auto band = - llvm::makeMutableArrayRef(loops.data() + start, end - start); - coalesceLoops(band); - break; - } - // If a band was found and transformed, keep looking at the loops above - // the outermost transformed loop. - if (start != end - 1) - end = start + 1; + auto band = + llvm::makeMutableArrayRef(loops.data() + start, end - start); + (void)coalesceLoops(band); + break; } + // If a band was found and transformed, keep looking at the loops above + // the outermost transformed loop. + if (start != end - 1) + end = start + 1; + } + } + + void runOnFunction() override { + FuncOp func = getFunction(); + func.walk([&](Operation *op) { + if (auto scfForOp = dyn_cast(op)) + walkLoop(scfForOp); + else if (auto affineForOp = dyn_cast(op)) + walkLoop(affineForOp); }); } }; diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp --- a/mlir/lib/Transforms/Utils/LoopUtils.cpp +++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp @@ -2045,7 +2045,7 @@ builder.setInsertionPointToStart(outermost.getBody()); - // 3. Remap induction variables. For each original loop, the value of the + // 3. Remap induction variables. For each original loop, the value of the // induction variable can be obtained by dividing the induction variable of // the linearized loop by the total number of iterations of the loops nested // in it modulo the number of iterations in this loop (remove the values @@ -2077,6 +2077,120 @@ second.erase(); } +LogicalResult mlir::coalesceLoops(MutableArrayRef loops) { + if (loops.size() < 2) + return success(); + + AffineForOp innermost = loops.back(); + AffineForOp outermost = loops.front(); + AffineBound ub = outermost.getUpperBound(); + AffineMap origUbMap = ub.getMap(); + Location loc = outermost.getLoc(); + OpBuilder builder(outermost); + for (AffineForOp loop : loops) { + // We only work on normalized loops. + if (loop.getStep() != 1 || !loop.hasConstantLowerBound() || + loop.getConstantLowerBound() != 0) + return failure(); + } + SmallVector upperBoundSymbols; + SmallVector ubOperands(ub.getOperands().begin(), + ub.getOperands().end()); + + // 1. Store the upper bound of the outermost loop in a variable. + Value prev; + if (!llvm::hasSingleElement(origUbMap.getResults())) + prev = builder.create(loc, origUbMap, ubOperands); + else + prev = builder.create(loc, origUbMap, ubOperands); + upperBoundSymbols.push_back(prev); + + // 2. Emit code computing the upper bound of the coalesced loop as product of + // the number of iterations of all loops. + for (AffineForOp loop : loops.drop_front()) { + ub = loop.getUpperBound(); + origUbMap = ub.getMap(); + ubOperands = ub.getOperands(); + Value upperBound; + // If upper bound map has more than one result, take their minimum. + if (!llvm::hasSingleElement(origUbMap.getResults())) + upperBound = builder.create(loc, origUbMap, ubOperands); + else + upperBound = builder.create(loc, origUbMap, ubOperands); + upperBoundSymbols.push_back(upperBound); + SmallVector operands; + operands.push_back(prev); + operands.push_back(upperBound); + // Maintain running product of loop upper bounds. + prev = builder.create( + loc, + AffineMap::get(/*numDims=*/1, + /*numSymbols=*/1, + builder.getAffineDimExpr(0) * + builder.getAffineSymbolExpr(0)), + operands); + } + // Set upper bound of the coalesced loop. + AffineMap newUbMap = AffineMap::get( + /*numDims=*/0, + /*numSymbols=*/1, builder.getAffineSymbolExpr(0), builder.getContext()); + outermost.setUpperBound(prev, newUbMap); + + builder.setInsertionPointToStart(outermost.getBody()); + + // 3. Remap induction variables. For each original loop, the value of the + // induction variable can be obtained by dividing the induction variable of + // the linearized loop by the total number of iterations of the loops nested + // in it modulo the number of iterations in this loop (remove the values + // related to the outer loops): + // iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i. + // Compute these iteratively from the innermost loop by creating a "running + // quotient" of division by the range. + Value previous = outermost.getInductionVar(); + for (unsigned idx = loops.size(); idx > 0; --idx) { + if (idx != loops.size()) { + SmallVector operands; + operands.push_back(previous); + operands.push_back(upperBoundSymbols[idx]); + previous = builder.create( + loc, + AffineMap::get( + /*numDims=*/1, /*numSymbols=*/1, + builder.getAffineDimExpr(0).floorDiv( + builder.getAffineSymbolExpr(0))), + operands); + } + // Modified value of the induction variables of the nested loops after + // coalescing. + Value inductionVariable; + if (idx == 1) { + inductionVariable = previous; + } else { + SmallVector applyOperands; + applyOperands.push_back(previous); + applyOperands.push_back(upperBoundSymbols[idx - 1]); + inductionVariable = builder.create( + loc, + AffineMap::get( + /*numDims=*/1, /*numSymbols=*/1, + builder.getAffineDimExpr(0) % builder.getAffineSymbolExpr(0)), + applyOperands); + } + replaceAllUsesInRegionWith(loops[idx - 1].getInductionVar(), + inductionVariable, loops.back().region()); + } + + // 4. Move the operations from the innermost just above the second-outermost + // loop, delete the extra terminator and the second-outermost loop. + AffineForOp secondOutermostLoop = loops[1]; + innermost.getBody()->back().erase(); + outermost.getBody()->getOperations().splice( + Block::iterator(secondOutermostLoop.getOperation()), + innermost.getBody()->getOperations()); + secondOutermostLoop.erase(); + return success(); +} + void mlir::collapseParallelLoops( scf::ParallelOp loops, ArrayRef> combinedDimensions) { OpBuilder outsideBuilder(loops); diff --git a/mlir/test/Transforms/loop-coalescing.mlir b/mlir/test/Transforms/loop-coalescing.mlir --- a/mlir/test/Transforms/loop-coalescing.mlir +++ b/mlir/test/Transforms/loop-coalescing.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -allow-unregistered-dialect -loop-coalescing %s | FileCheck %s +// RUN: mlir-opt -split-input-file -allow-unregistered-dialect -loop-coalescing %s | FileCheck %s // CHECK-LABEL: @one_3d_nest func @one_3d_nest() { @@ -191,3 +191,170 @@ } return } + +// ----- + +// Check coalescing of affine.for loops when all the loops have constant upper bound. +// CHECK-DAG: #[[SIXTEEN:.*]] = affine_map<() -> (16)> +// CHECK-DAG: #[[SIXTY_FOUR:.*]] = affine_map<() -> (64)> +// CHECK-DAG: #[[PRODUCT:.*]] = affine_map<(d0)[s0] -> (d0 * s0)> +// CHECK-DAG: #[[EIGHT:.*]] = affine_map<() -> (8)> +// CHECK-DAG: #[[MOD:.*]] = affine_map<(d0)[s0] -> (d0 mod s0)> +// CHECK-DAG: #[[DIV:.*]] = affine_map<(d0)[s0] -> (d0 floordiv s0)> +func @coalesce_affine_for() { + affine.for %i = 0 to 16 { + affine.for %j = 0 to 64 { + affine.for %k = 0 to 8 { + "test.foo"(%i, %j, %k) : (index, index, index) -> () + } + } + } + return +} +// CHECK-DAG: %[[T0:.*]] = affine.apply #[[SIXTEEN]]() +// CHECK-DAG: %[[T1:.*]] = affine.apply #[[SIXTY_FOUR]]() +// CHECK-DAG: %[[T2:.*]] = affine.apply #[[PRODUCT]](%[[T0]])[%[[T1]]] +// CHECK-DAG: %[[T3:.*]] = affine.apply #[[EIGHT]]() +// CHECK-DAG: %[[T4:.*]] = affine.apply #[[PRODUCT]](%[[T2]])[%[[T3]]] +// CHECK: affine.for %[[IV:.*]] = 0 to %[[T4]] +// CHECK-DAG: %[[K:.*]] = affine.apply #[[MOD]](%[[IV]])[%[[T3]]] +// CHECK-DAG: %[[T6:.*]] = affine.apply #[[DIV]](%[[IV]])[%[[T3]]] +// CHECK-DAG: %[[J:.*]] = affine.apply #[[MOD]](%[[T6]])[%[[T1]]] +// CHECK-DAG: %[[I:.*]] = affine.apply #[[DIV]](%[[T6]])[%[[T1]]] +// CHECK-NEXT: "test.foo"(%[[I]], %[[J]], %[[K]]) +// CHECK-NEXT: } +// CHECK-NEXT: return + +// ----- + +// Check coalescing of affine.for loops when all the loops have non constant upper bounds. +// CHECK-DAG: #[[IDENTITY:.*]] = affine_map<()[s0] -> (s0)> +// CHECK-DAG: #[[PRODUCT:.*]] = affine_map<(d0)[s0] -> (d0 * s0)> +// CHECK-DAG: #[[MOD:.*]] = affine_map<(d0)[s0] -> (d0 mod s0)> +// CHECK-DAG: #[[FLOOR:.*]] = affine_map<(d0)[s0] -> (d0 floordiv s0)> +func @coalesce_affine_for(%arg0: memref) { + %c0 = constant 0 : index + %M = memref.dim %arg0, %c0 : memref + %N = memref.dim %arg0, %c0 : memref + %K = memref.dim %arg0, %c0 : memref + affine.for %i = 0 to %M { + affine.for %j = 0 to %N { + affine.for %k = 0 to %K { + "test.foo"(%i, %j, %k) : (index, index, index) -> () + } + } + } + return +} +// CHECK: %[[T0:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref +// CHECK: %[[T1:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref +// CHECK: %[[T2:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref +// CHECK-DAG: %[[T3:.*]] = affine.apply #[[IDENTITY]]()[%[[T0]]] +// CHECK-DAG: %[[T4:.*]] = affine.apply #[[IDENTITY]]()[%[[T1]]] +// CHECK-DAG: %[[T5:.*]] = affine.apply #[[PRODUCT]](%[[T3]])[%[[T4]]] +// CHECK-DAG: %[[T6:.*]] = affine.apply #[[IDENTITY]]()[%[[T2]]] +// CHECK-DAG: %[[T7:.*]] = affine.apply #[[PRODUCT]](%[[T5]])[%[[T6]]] +// CHECK: affine.for %[[IV:.*]] = 0 to %[[T7]] +// CHECK-DAG: %[[K:.*]] = affine.apply #[[MOD]](%[[IV]])[%[[T6]]] +// CHECK-DAG: %[[T9:.*]] = affine.apply #[[FLOOR]](%[[IV]])[%[[T6]]] +// CHECK-DAG: %[[J:.*]] = affine.apply #[[MOD]](%[[T9]])[%[[T4]]] +// CHECK-DAG: %[[I:.*]] = affine.apply #[[FLOOR]](%[[T9]])[%[[T4]]] +// CHECK-NEXT: "test.foo"(%[[I]], %[[J]], %[[K]]) +// CHECK-NEXT: } +// CHECK-NEXT: return + +// ----- + +// Check coalescing of affine.for loops when some of the loop has constant upper bounds while others have nin constant upper bounds. +// CHECK-DAG: #[[IDENTITY:.*]] = affine_map<()[s0] -> (s0)> +// CHECK-DAG: #[[PRODUCT:.*]] = affine_map<(d0)[s0] -> (d0 * s0)> +// CHECK-DAG: #[[SIXTY_FOUR:.*]] = affine_map<() -> (64)> +// CHECK-DAG: #[[MOD:.*]] = affine_map<(d0)[s0] -> (d0 mod s0)> +// CHECK-DAG: #[[DIV:.*]] = affine_map<(d0)[s0] -> (d0 floordiv s0)> +func @coalesce_affine_for(%arg0: memref) { + %c0 = constant 0 : index + %M = memref.dim %arg0, %c0 : memref + %N = memref.dim %arg0, %c0 : memref + affine.for %i = 0 to %M { + affine.for %j = 0 to %N { + affine.for %k = 0 to 64 { + "test.foo"(%i, %j, %k) : (index, index, index) -> () + } + } + } + return +} +// CHECK: %[[T0:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref +// CHECK: %[[T1:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref +// CHECK-DAG: %[[T2:.*]] = affine.apply #[[IDENTITY]]()[%[[T0]]] +// CHECK-DAG: %[[T3:.*]] = affine.apply #[[IDENTITY]]()[%[[T1]]] +// CHECK-DAG: %[[T4:.*]] = affine.apply #[[PRODUCT]](%[[T2]])[%[[T3]]] +// CHECK-DAG: %[[T5:.*]] = affine.apply #[[SIXTY_FOUR]]() +// CHECK-DAG: %[[T6:.*]] = affine.apply #[[PRODUCT]](%[[T4]])[%[[T5]]] +// CHECK: affine.for %[[IV:.*]] = 0 to %[[T6]] +// CHECK-DAG: %[[K:.*]] = affine.apply #[[MOD]](%[[IV]])[%[[T5]]] +// CHECK-DAG: %[[T8:.*]] = affine.apply #[[DIV]](%[[IV]])[%[[T5]]] +// CHECK-DAG: %[[J:.*]] = affine.apply #[[MOD]](%[[T8]])[%[[T3]]] +// CHECK-DAG: %[[I:.*]] = affine.apply #[[DIV]](%[[T8]])[%[[T3]]] +// CHECK-NEXT: "test.foo"(%[[I]], %[[J]], %[[K]]) +// CHECK-NEXT: } +// CHECK-NEXT: return + +// ----- + +// Check coalescing of affine.for loops when upper bound contains multi result upper bound map. +// CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0, -s0)> +// CHECK-DAG: #[[IDENTITY:.*]] = affine_map<()[s0] -> (s0)> +// CHECK-DAG: #[[PRODUCT:.*]] = affine_map<(d0)[s0] -> (d0 * s0)> +// CHECK-DAG: #[[MOD:.*]] = affine_map<(d0)[s0] -> (d0 mod s0)> +// CHECK-DAG: #[[DIV:.*]] = affine_map<(d0)[s0] -> (d0 floordiv s0)> +#myMap = affine_map<()[s1] -> (s1, -s1)> +func @coalesce_affine_for(%arg0: memref) { + %c0 = constant 0 : index + %M = memref.dim %arg0, %c0 : memref + %N = memref.dim %arg0, %c0 : memref + %K = memref.dim %arg0, %c0 : memref + affine.for %i = 0 to min #myMap()[%M] { + affine.for %j = 0 to %N { + affine.for %k = 0 to %K { + "test.foo"(%i, %j, %k) : (index, index, index) -> () + } + } + } + return +} +// CHECK: %[[T0:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref +// CHECK: %[[T1:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref +// CHECK: %[[T2:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref +// CHECK-DAG: %[[T3:.*]] = affine.min #[[MAP0]]()[%[[T0]]] +// CHECK-DAG: %[[T4:.*]] = affine.apply #[[IDENTITY]]()[%[[T1]]] +// CHECK-DAG: %[[T5:.*]] = affine.apply #[[PRODUCT]](%[[T3]])[%[[T4]]] +// CHECK-DAG: %[[T6:.*]] = affine.apply #[[IDENTITY]]()[%[[T2]]] +// CHECK-DAG: %[[T7:.*]] = affine.apply #[[PRODUCT]](%[[T5]])[%[[T6]]] +// CHECK: affine.for %[[IV:.*]] = 0 to %[[T7]] +// CHECK-DAG: %[[K:.*]] = affine.apply #[[MOD]](%[[IV]])[%[[T6]]] +// CHECK-DAG: %[[T9:.*]] = affine.apply #[[DIV]](%[[IV]])[%[[T6]]] +// CHECK-DAG: %[[J:.*]] = affine.apply #[[MOD]](%[[T9]])[%[[T4]]] +// CHECK-DAG: %[[I:.*]] = affine.apply #[[DIV]](%[[T9]])[%[[T4]]] +// CHECK-NEXT: "test.foo"(%[[I]], %[[J]], %[[K]]) +// CHECK-NEXT: } +// CHECK-NEXT: return + +// ----- + +// CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0) -> (d0 * 110)> +// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (696, d0 * 110 + 110)> +#map0 = affine_map<(d0) -> (d0 * 110)> +#map1 = affine_map<(d0) -> (696, d0 * 110 + 110)> +func @test_loops_do_not_get_coalesced() { + affine.for %i = 0 to 7 { + affine.for %j = #map0(%i) to min #map1(%i) { + } + } + return +} +// CHECK: affine.for %[[IV0:.*]] = 0 to 7 +// CHECK-NEXT: affine.for %[[IV1:.*]] = #[[MAP0]](%[[IV0]]) to min #[[MAP1]](%[[IV0]]) +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: return