diff --git a/mlir/include/mlir/Transforms/LoopUtils.h b/mlir/include/mlir/Transforms/LoopUtils.h --- a/mlir/include/mlir/Transforms/LoopUtils.h +++ b/mlir/include/mlir/Transforms/LoopUtils.h @@ -243,6 +243,25 @@ /// independent of any loop induction variable involved in the nest. void coalesceLoops(MutableArrayRef loops); +/// Replace a perfect nest of "for" loops with a single linearized loop. Assumes +/// `loops` contains a list of perfectly nested loops that are normalized (step +/// one and lower bound of zero) and with bounds and steps independent of any +/// loop induction variable involved in the nest. Coalescing affine.for loops is +/// not always possible, i.e., the result may not be representable using +/// affine.for. +LogicalResult coalesceLoops(MutableArrayRef loops); + +/// Vectorizes a loop (either outer or inner, with a perfect or imperfectly +/// nested body). `simdWidth` is the bit width of the vectors on target. `forOp` +/// is never invalidated. This method does not check for contiguity of accesses +/// or validity of vectorization. A memref.vector_cast op is used to create a +/// vector element type version of the memref. If the memref involved with the +/// vectorization is a function argument attribute, its `memref.alignment` +/// argument attribute is checked to determine if it has the desired alignment. +/// If not, the loop isn't vectorize and this returns failure. +LogicalResult loopVectorize(AffineForOp forOp, unsigned simdWidth, + DenseMap *vecMemRefs = nullptr); + /// Take the ParallelLoop and for each set of dimension indices, combine them /// into a single dimension. combinedDimensions must contain each index into /// loops exactly once. diff --git a/mlir/lib/Transforms/LoopCoalescing.cpp b/mlir/lib/Transforms/LoopCoalescing.cpp --- a/mlir/lib/Transforms/LoopCoalescing.cpp +++ b/mlir/lib/Transforms/LoopCoalescing.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "PassDetail.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/SCF/SCF.h" #include "mlir/Transforms/LoopUtils.h" #include "mlir/Transforms/Passes.h" @@ -20,66 +21,70 @@ namespace { struct LoopCoalescingPass : public LoopCoalescingBase { - void runOnFunction() override { - FuncOp func = getFunction(); - func.walk([](scf::ForOp op) { - // Ignore nested loops. - if (op->getParentOfType()) - return; + /// Walk either an scf.for or an affine.for to find a band to coalesce. + template + static void walkLoop(LoopOpTy op) { + // Ignore nested loops. + if (op->template getParentOfType()) + return; - SmallVector loops; - getPerfectlyNestedLoops(loops, op); - LLVM_DEBUG(llvm::dbgs() - << "found a perfect nest of depth " << loops.size() << '\n'); + SmallVector loops; + getPerfectlyNestedLoops(loops, op); + LLVM_DEBUG(llvm::dbgs() + << "found a perfect nest of depth " << loops.size() << '\n'); - // Look for a band of loops that can be coalesced, i.e. perfectly nested - // loops with bounds defined above some loop. - // 1. For each loop, find above which parent loop its operands are - // defined. - SmallVector operandsDefinedAbove(loops.size()); - for (unsigned i = 0, e = loops.size(); i < e; ++i) { - operandsDefinedAbove[i] = i; - for (unsigned j = 0; j < i; ++j) { - if (areValuesDefinedAbove(loops[i].getOperands(), - loops[j].region())) { - operandsDefinedAbove[i] = j; - break; - } + // Look for a band of loops that can be coalesced, i.e. perfectly nested + // loops with bounds defined above some loop. + // 1. For each loop, find above which parent loop its operands are + // defined. + SmallVector operandsDefinedAbove(loops.size()); + for (unsigned i = 0, e = loops.size(); i < e; ++i) { + operandsDefinedAbove[i] = i; + for (unsigned j = 0; j < i; ++j) { + if (areValuesDefinedAbove(loops[i].getOperands(), loops[j].region())) { + operandsDefinedAbove[i] = j; + break; } - LLVM_DEBUG(llvm::dbgs() - << " bounds of loop " << i << " are known above depth " - << operandsDefinedAbove[i] << '\n'); } + LLVM_DEBUG(llvm::dbgs() + << " bounds of loop " << i << " are known above depth " + << operandsDefinedAbove[i] << '\n'); + } - // 2. Identify bands of loops such that the operands of all of them are - // defined above the first loop in the band. Traverse the nest bottom-up - // so that modifications don't invalidate the inner loops. - for (unsigned end = loops.size(); end > 0; --end) { - unsigned start = 0; - for (; start < end - 1; ++start) { - auto maxPos = - *std::max_element(std::next(operandsDefinedAbove.begin(), start), - std::next(operandsDefinedAbove.begin(), end)); - if (maxPos > start) - continue; + // 2. Identify bands of loops such that the operands of all of them are + // defined above the first loop in the band. Traverse the nest bottom-up + // so that modifications don't invalidate the inner loops. + for (unsigned end = loops.size(); end > 0; --end) { + unsigned start = 0; + for (; start < end - 1; ++start) { + auto maxPos = + *std::max_element(std::next(operandsDefinedAbove.begin(), start), + std::next(operandsDefinedAbove.begin(), end)); + if (maxPos > start) + continue; - assert(maxPos == start && - "expected loop bounds to be known at the start of the band"); - LLVM_DEBUG(llvm::dbgs() << " found coalesceable band from " << start - << " to " << end << '\n'); + assert(maxPos == start && + "expected loop bounds to be known at the start of the band"); + LLVM_DEBUG(llvm::dbgs() << " found coalesceable band from " << start + << " to " << end << '\n'); - auto band = - llvm::makeMutableArrayRef(loops.data() + start, end - start); - coalesceLoops(band); - break; - } - // If a band was found and transformed, keep looking at the loops above - // the outermost transformed loop. - if (start != end - 1) - end = start + 1; + auto band = + llvm::makeMutableArrayRef(loops.data() + start, end - start); + (void)coalesceLoops(band); + break; } - }); + // If a band was found and transformed, keep looking at the loops above + // the outermost transformed loop. + if (start != end - 1) + end = start + 1; + } + } + + void runOnFunction() override { + FuncOp func = getFunction(); + func.walk([&](scf::ForOp op) { walkLoop(op); }); + func.walk([&](AffineForOp op) { walkLoop(op); }); } }; diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp --- a/mlir/lib/Transforms/Utils/LoopUtils.cpp +++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp @@ -2045,7 +2045,7 @@ builder.setInsertionPointToStart(outermost.getBody()); - // 3. Remap induction variables. For each original loop, the value of the + // 3. Remap induction variables. For each original loop, the value of the // induction variable can be obtained by dividing the induction variable of // the linearized loop by the total number of iterations of the loops nested // in it modulo the number of iterations in this loop (remove the values @@ -2077,6 +2077,128 @@ second.erase(); } +LogicalResult mlir::coalesceLoops(MutableArrayRef loops) { + if (loops.size() < 2) + return success(); + + AffineForOp innermost = loops.back(); + AffineForOp outermost = loops.front(); + AffineBound ub = outermost.getUpperBound(); + AffineMap origUbMap = ub.getMap(); + Location loc = outermost.getLoc(); + OpBuilder builder(outermost); + for (AffineForOp loop : loops) { + // We only work on normalized loops. + if (loop.getStep() != 1 || loop.getConstantLowerBound() != 0) + return failure(); + } + SmallVector upperBoundSymbols; + SmallVector ubOperands(ub.getOperands().begin(), + ub.getOperands().end()); + + // 1. Store the upper bound of the outermost loop in a variable. + Value prev; + if (!llvm::hasSingleElement(origUbMap.getResults())) + prev = builder.create(loc, origUbMap, ubOperands); + else + prev = builder.create(loc, origUbMap, ubOperands); + upperBoundSymbols.push_back(prev); + + // 2. Emit code computing the upper bound of the coalesced loop as product of + // the number of iterations of all loops. + for (AffineForOp loop : loops.drop_front()) { + ub = loop.getUpperBound(); + origUbMap = ub.getMap(); + ubOperands = ub.getOperands(); + Value upperBound; + // If upper bound map has more than one result, take their minimum. + if (!(origUbMap.getResults().empty() || + llvm::hasSingleElement(origUbMap.getResults()))) + upperBound = builder.create(loc, origUbMap, ubOperands); + else + upperBound = builder.create(loc, origUbMap, ubOperands); + upperBoundSymbols.push_back(upperBound); + SmallVector operands; + operands.push_back(prev); + operands.push_back(upperBound); + // Maintain running product of loop upper bounds. + prev = builder.create( + loc, + AffineMap::get(/*numDims=*/ + 1, /*numSymbols=*/1, + builder.getAffineDimExpr(0) * + builder.getAffineSymbolExpr(0)), + operands); + } + // Set upper bound of the coalesced loop. + AffineMap newUbMap = AffineMap::get( + /*numDims=*/0, + /*numSymbols=*/1, builder.getAffineSymbolExpr(0), builder.getContext()); + outermost.setUpperBound(prev, newUbMap); + + builder.setInsertionPointToStart(outermost.getBody()); + + // 3. Remap induction variables. For each original loop, the value of the + // induction variable can be obtained by dividing the induction variable of + // the linearized loop by the total number of iterations of the loops nested + // in it modulo the number of iterations in this loop (remove the values + // related to the outer loops): + // iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i. + // Compute these iteratively from the innermost loop by creating a "running + // quotient" of division by the range. + Value previous = outermost.getInductionVar(); + for (unsigned i = 0, e = loops.size(); i < e; ++i) { + unsigned idx = loops.size() - i - 1; + if (i != 0) { + ub = loops[idx + 1].getUpperBound(); + origUbMap = ub.getMap(); + ubOperands = ub.getOperands(); + SmallVector operands; + operands.push_back(previous); + operands.push_back(upperBoundSymbols[idx + 1]); + previous = builder.create( + loc, + AffineMap::get( + /*numDims=*/1, /*numSymbols=*/1, + builder.getAffineDimExpr(0).floorDiv( + builder.getAffineSymbolExpr(0))), + operands); + } + // Modified value of the induction variables of the nested loops after + // coalescing. + Value inductionVariable; + if (i == e - 1) { + inductionVariable = previous; + } else { + ub = loops[idx].getUpperBound(); + origUbMap = ub.getMap(); + ubOperands = ub.getOperands(); + SmallVector applyOperands; + applyOperands.push_back(previous); + applyOperands.push_back(upperBoundSymbols[idx]); + inductionVariable = builder.create( + loc, + AffineMap::get( + /*numDims=*/1, /*numSymbols=*/1, + builder.getAffineDimExpr(0) % builder.getAffineSymbolExpr(0)), + applyOperands); + } + + replaceAllUsesInRegionWith(loops[idx].getInductionVar(), inductionVariable, + loops.back().region()); + } + + // 4. Move the operations from the innermost just above the second-outermost + // loop, delete the extra terminator and the second-outermost loop. + AffineForOp second = loops[1]; + innermost.getBody()->back().erase(); + outermost.getBody()->getOperations().splice( + Block::iterator(second.getOperation()), + innermost.getBody()->getOperations()); + second.erase(); + return success(); +} + void mlir::collapseParallelLoops( scf::ParallelOp loops, ArrayRef> combinedDimensions) { OpBuilder outsideBuilder(loops); diff --git a/mlir/test/Transforms/loop-coalescing.mlir b/mlir/test/Transforms/loop-coalescing.mlir --- a/mlir/test/Transforms/loop-coalescing.mlir +++ b/mlir/test/Transforms/loop-coalescing.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -allow-unregistered-dialect -loop-coalescing %s | FileCheck %s +// RUN: mlir-opt -split-input-file -allow-unregistered-dialect -loop-coalescing %s | FileCheck %s // CHECK-LABEL: @one_3d_nest func @one_3d_nest() { @@ -191,3 +191,147 @@ } return } + +// ----- + +// CHECK-DAG: #[[MAP0:.*]] = affine_map<() -> (16)> +// CHECK-DAG: #[[MAP1:.*]] = affine_map<() -> (64)> +// CHECK-DAG: #[[MAP2:.*]] = affine_map<(d0)[s0] -> (d0 * s0)> +// CHECK-DAG: #[[MAP3:.*]] = affine_map<() -> (8)> +// CHECK-DAG: #[[MAP4:.*]] = affine_map<(d0)[s0] -> (d0 mod s0)> +// CHECK-DAG: #[[MAP5:.*]] = affine_map<(d0)[s0] -> (d0 floordiv s0)> +func @coalesce_affine_for() { + affine.for %i = 0 to 16 { + affine.for %j = 0 to 64 { + affine.for %k = 0 to 8 { + "test.foo"(%i, %j, %k) : (index, index, index) -> () + } + } + } + return +} +// CHECK-DAG: %[[T0:.*]] = affine.apply #[[MAP0]]() +// CHECK-DAG: %[[T1:.*]] = affine.apply #[[MAP1]]() +// CHECK-DAG: %[[T2:.*]] = affine.apply #[[MAP2]](%[[T0]])[%[[T1]]] +// CHECK-DAG: %[[T3:.*]] = affine.apply #[[MAP3]]() +// CHECK-DAG: %[[T4:.*]] = affine.apply #[[MAP2]](%[[T2]])[%[[T3]]] +// CHECK: affine.for %[[IV:.*]] = 0 to %[[T4]] +// CHECK-DAG: %[[K:.*]] = affine.apply #[[MAP4]](%[[IV]])[%[[T3]]] +// CHECK-DAG: %[[T6:.*]] = affine.apply #[[MAP5]](%[[IV]])[%[[T3]]] +// CHECK-DAG: %[[J:.*]] = affine.apply #[[MAP4]](%[[T6]])[%[[T1]]] +// CHECK-DAG: %[[I:.*]] = affine.apply #[[MAP5]](%[[T6]])[%[[T1]]] +// CHECK-NEXT: "test.foo"(%[[I]], %[[J]], %[[K]]) +// CHECK-NEXT: } +// CHECK-NEXT: return + +// ----- + +// CHECK-DAG: #[[MAP:.*]] = affine_map<()[s0] -> (s0)> +// CHECK-DAG: #[[PRODUCT:.*]] = affine_map<(d0)[s0] -> (d0 * s0)> +// CHECK-DAG: #[[MOD:.*]] = affine_map<(d0)[s0] -> (d0 mod s0)> +// CHECK-DAG: #[[FLOOR:.*]] = affine_map<(d0)[s0] -> (d0 floordiv s0)> +func @coalesce_affine_for(%arg0: memref) { + %c0 = constant 0 : index + %M = memref.dim %arg0, %c0 : memref + %N = memref.dim %arg0, %c0 : memref + %K = memref.dim %arg0, %c0 : memref + affine.for %i = 0 to %M { + affine.for %j = 0 to %N { + affine.for %k = 0 to %K { + "test.foo"(%i, %j, %k) : (index, index, index) -> () + } + } + } + return +} +// CHECK: %[[T0:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref +// CHECK: %[[T1:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref +// CHECK: %[[T2:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref +// CHECK-DAG: %[[T3:.*]] = affine.apply #[[MAP]]()[%[[T0]]] +// CHECK-DAG: %[[T4:.*]] = affine.apply #[[MAP]]()[%[[T1]]] +// CHECK-DAG: %[[T5:.*]] = affine.apply #[[PRODUCT]](%[[T3]])[%[[T4]]] +// CHECK-DAG: %[[T6:.*]] = affine.apply #[[MAP]]()[%[[T2]]] +// CHECK-DAG: %[[T7:.*]] = affine.apply #[[PRODUCT]](%[[T5]])[%[[T6]]] +// CHECK: affine.for %[[IV:.*]] = 0 to %[[T7]] +// CHECK-DAG: %[[K:.*]] = affine.apply #[[MOD]](%[[IV]])[%[[T6]]] +// CHECK-DAG: %[[T9:.*]] = affine.apply #[[FLOOR]](%[[IV]])[%[[T6]]] +// CHECK-DAG: %[[J:.*]] = affine.apply #[[MOD]](%[[T9]])[%[[T4]]] +// CHECK-DAG: %[[I:.*]] = affine.apply #[[FLOOR]](%[[T9]])[%[[T4]]] +// CHECK-NEXT: "test.foo"(%[[I]], %[[J]], %[[K]]) +// CHECK-NEXT: } +// CHECK-NEXT: return + +// ----- + +// CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0)> +// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0)[s0] -> (d0 * s0)> +// CHECK-DAG: #[[MAP2:.*]] = affine_map<() -> (64)> +// CHECK-DAG: #[[MAP3:.*]] = affine_map<(d0)[s0] -> (d0 mod s0)> +// CHECK-DAG: #[[MAP4:.*]] = affine_map<(d0)[s0] -> (d0 floordiv s0)> +func @coalesce_affine_for(%arg0: memref) { + %c0 = constant 0 : index + %M = memref.dim %arg0, %c0 : memref + %N = memref.dim %arg0, %c0 : memref + affine.for %i = 0 to %M { + affine.for %j = 0 to %N { + affine.for %k = 0 to 64 { + "test.foo"(%i, %j, %k) : (index, index, index) -> () + } + } + } + return +} +// CHECK: %[[T0:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref +// CHECK: %[[T1:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref +// CHECK-DAG: %[[T2:.*]] = affine.apply #[[MAP0]]()[%[[T0]]] +// CHECK-DAG: %[[T3:.*]] = affine.apply #[[MAP0]]()[%[[T1]]] +// CHECK-DAG: %[[T4:.*]] = affine.apply #[[MAP1]](%[[T2]])[%[[T3]]] +// CHECK-DAG: %[[T5:.*]] = affine.apply #[[MAP2]]() +// CHECK-DAG: %[[T6:.*]] = affine.apply #[[MAP1]](%[[T4]])[%[[T5]]] +// CHECK: affine.for %[[IV:.*]] = 0 to %[[T6]] +// CHECK-DAG: %[[K:.*]] = affine.apply #[[MAP3]](%[[IV]])[%[[T5]]] +// CHECK-DAG: %[[T8:.*]] = affine.apply #[[MAP4]](%[[IV]])[%[[T5]]] +// CHECK-DAG: %[[J:.*]] = affine.apply #[[MAP3]](%[[T8]])[%[[T3]]] +// CHECK-DAG: %[[I:.*]] = affine.apply #[[MAP4]](%[[T8]])[%[[T3]]] +// CHECK-NEXT: "test.foo"(%[[I]], %[[J]], %[[K]]) +// CHECK-NEXT: } +// CHECK-NEXT: return + +// ----- + +// CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0, -s0)> +// CHECK-DAG: #[[MAP1:.*]] = affine_map<()[s0] -> (s0)> +// CHECK-DAG: #[[MAP2:.*]] = affine_map<(d0)[s0] -> (d0 * s0)> +// CHECK-DAG: #[[MAP3:.*]] = affine_map<(d0)[s0] -> (d0 mod s0)> +// CHECK-DAG: #[[MAP4:.*]] = affine_map<(d0)[s0] -> (d0 floordiv s0)> +#myMap = affine_map<()[s1] -> (s1, -s1)> +func @coalesce_affine_for(%arg0: memref) { + %c0 = constant 0 : index + %M = memref.dim %arg0, %c0 : memref + %N = memref.dim %arg0, %c0 : memref + %K = memref.dim %arg0, %c0 : memref + affine.for %i = 0 to min #myMap()[%M] { + affine.for %j = 0 to %N { + affine.for %k = 0 to %K { + "test.foo"(%i, %j, %k) : (index, index, index) -> () + } + } + } + return +} +// CHECK: %[[T0:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref +// CHECK: %[[T1:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref +// CHECK: %[[T2:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref +// CHECK-DAG: %[[T3:.*]] = affine.min #[[MAP0]]()[%[[T0]]] +// CHECK-DAG: %[[T4:.*]] = affine.apply #[[MAP1]]()[%[[T1]]] +// CHECK-DAG: %[[T5:.*]] = affine.apply #[[MAP2]](%[[T3]])[%[[T4]]] +// CHECK-DAG: %[[T6:.*]] = affine.apply #[[MAP1]]()[%[[T2]]] +// CHECK-DAG: %[[T7:.*]] = affine.apply #[[MAP2]](%[[T5]])[%[[T6]]] +// CHECK: affine.for %[[IV:.*]] = 0 to %[[T7]] +// CHECK-DAG: %[[K:.*]] = affine.apply #[[MAP3]](%[[IV]])[%[[T6]]] +// CHECK-DAG: %[[T9:.*]] = affine.apply #[[MAP4]](%[[IV]])[%[[T6]]] +// CHECK-DAG: %[[J:.*]] = affine.apply #[[MAP3]](%[[T9]])[%[[T4]]] +// CHECK-DAG: %[[I:.*]] = affine.apply #[[MAP4]](%[[T9]])[%[[T4]]] +// CHECK-NEXT: "test.foo"(%[[I]], %[[J]], %[[K]]) +// CHECK-NEXT: } +// CHECK-NEXT: return