diff --git a/mlir/include/mlir/Transforms/LoopUtils.h b/mlir/include/mlir/Transforms/LoopUtils.h --- a/mlir/include/mlir/Transforms/LoopUtils.h +++ b/mlir/include/mlir/Transforms/LoopUtils.h @@ -88,15 +88,38 @@ LogicalResult affineForOpBodySkew(AffineForOp forOp, ArrayRef shifts, bool unrollPrologueEpilogue = false); +/// Identify valid and profitable bands of loops to tile. This is currently just +/// a temporary placeholder to test the mechanics of tiled code generation. +/// Returns all maximal outermost perfect loop nests to tile. +void getTileableBands(FuncOp f, + std::vector> *bands); + /// Tiles the specified band of perfectly nested loops creating tile-space loops -/// and intra-tile loops. A band is a contiguous set of loops. `tiledNest` when -/// non-null is set to the loops of the tiled nest from outermost to innermost. -/// Loops in `input` are erased when the tiling is successful. +/// and intra-tile loops. A band is a contiguous set of loops. The last argument +/// is a user defined function which constructs and sets the loop bounds for the +/// tiled loop nest. LLVM_NODISCARD LogicalResult tilePerfectlyNested(MutableArrayRef input, ArrayRef tileSizes, - SmallVectorImpl *tiledNest = nullptr); + SmallVectorImpl &tiledNest, + std::function &input, + MutableArrayRef tiledLoops, + ArrayRef &tileSizes)> + constructAndSetLoopBounds); + +/// Tiles the specified band of perfectly nested loops creating tile-space +/// loops and intra-tile loops, using SSA values as tiling parameters. A band +/// is a contiguous set of loops. The last argument is a user defined function +/// which constructs and sets the loop bounds for the tiled loop nest. +LLVM_NODISCARD +LogicalResult tilePerfectlyNestedParametric( + MutableArrayRef input, ArrayRef tileSizes, + SmallVectorImpl &tiledNest, + std::function &input, + MutableArrayRef tiledLoops, + ArrayRef &tileSizes)> + constructAndSetLoopBounds); /// Performs loop interchange on 'forOpA' and 'forOpB'. Requires that 'forOpA' /// and 'forOpB' are part of a perfectly nested sequence of loops. diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp --- a/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp @@ -61,276 +61,76 @@ return std::make_unique(); } -// Move the loop body of AffineForOp 'src' from 'src' into the specified -// location in destination's body, ignoring the terminator. -static inline void moveLoopBody(AffineForOp src, AffineForOp dest, - Block::iterator loc) { - auto &insts = src.getBody()->getOperations(); - dest.getBody()->getOperations().splice(loc, insts, insts.begin(), - std::prev(insts.end())); +/// Set lower and upper bounds of inter-tile loops for parametric tiling. +static void setInterTileBounds(OpBuilder &b, AffineForOp origLoop, + AffineForOp newLoop, unsigned tileSize) { + OperandRange newLbOperands = origLoop.getLowerBoundOperands(); + OperandRange newUbOperands = origLoop.getUpperBoundOperands(); + newLoop.setLowerBound(newLbOperands, origLoop.getLowerBoundMap()); + newLoop.setUpperBound(newUbOperands, origLoop.getUpperBoundMap()); + newLoop.setStep(tileSize); } -// Move the loop body of AffineForOp 'src' from 'src' to the start of dest's -// body. -static inline void moveLoopBody(AffineForOp src, AffineForOp dest) { - moveLoopBody(src, dest, dest.getBody()->begin()); -} - -/// Constructs and sets new loop bounds after tiling for the case of -/// hyper-rectangular index sets, where the bounds of one dimension do not -/// depend on other dimensions. Bounds of each dimension can thus be treated -/// independently, and deriving the new bounds is much simpler and faster -/// than for the case of tiling arbitrary polyhedral shapes. -static void -constructTiledIndexSetHyperRect(MutableArrayRef origLoops, - MutableArrayRef newLoops, - ArrayRef tileSizes) { - assert(!origLoops.empty()); - assert(origLoops.size() == tileSizes.size()); - - OpBuilder b(origLoops[0].getOperation()); - unsigned width = origLoops.size(); - - // Bounds for tile space loops. - for (unsigned i = 0; i < width; i++) { - OperandRange newLbOperands = origLoops[i].getLowerBoundOperands(); - OperandRange newUbOperands = origLoops[i].getUpperBoundOperands(); - newLoops[i].setLowerBound(newLbOperands, origLoops[i].getLowerBoundMap()); - newLoops[i].setUpperBound(newUbOperands, origLoops[i].getUpperBoundMap()); - newLoops[i].setStep(tileSizes[i]); - } - // Bounds for intra-tile loops. - for (unsigned i = 0; i < width; i++) { - int64_t largestDiv = getLargestDivisorOfTripCount(origLoops[i]); - auto mayBeConstantCount = getConstantTripCount(origLoops[i]); - // The lower bound is just the tile-space loop. - AffineMap lbMap = b.getDimIdentityMap(); - newLoops[width + i].setLowerBound( - /*operands=*/newLoops[i].getInductionVar(), lbMap); - - // Set the upper bound. - if (mayBeConstantCount && mayBeConstantCount.getValue() < tileSizes[i]) { - // Trip count is less than the tile size: upper bound is lower bound + - // trip count. - auto ubMap = b.getSingleDimShiftAffineMap(mayBeConstantCount.getValue()); - newLoops[width + i].setUpperBound( - /*operands=*/newLoops[i].getInductionVar(), ubMap); - } else if (largestDiv % tileSizes[i] != 0) { - // Intra-tile loop ii goes from i to min(i + tileSize, ub_i). - // Construct the upper bound map; the operands are the original operands - // with 'i' (tile-space loop) appended to it. The new upper bound map is - // the original one with an additional expression i + tileSize appended. - - // Add dim operands from original upper bound. - SmallVector ubOperands; - auto ub = origLoops[i].getUpperBound(); - ubOperands.reserve(ub.getNumOperands() + 1); - auto origUbMap = ub.getMap(); - for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j) - ubOperands.push_back(ub.getOperand(j)); - - // Add dim operand for new loop upper bound. - ubOperands.push_back(newLoops[i].getInductionVar()); - - // Add symbol operands from original upper bound. - for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j) - ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j)); - - SmallVector boundExprs; - boundExprs.reserve(1 + origUbMap.getNumResults()); - auto dim = b.getAffineDimExpr(origUbMap.getNumDims()); - // The new upper bound map is the original one with an additional - // expression i + tileSize appended. - boundExprs.push_back(dim + tileSizes[i]); - boundExprs.append(origUbMap.getResults().begin(), - origUbMap.getResults().end()); - auto ubMap = - AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols(), - boundExprs, b.getContext()); - newLoops[width + i].setUpperBound(/*operands=*/ubOperands, ubMap); - } else { - // No need of the min expression. - auto dim = b.getAffineDimExpr(0); - auto ubMap = AffineMap::get(1, 0, dim + tileSizes[i]); - newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap); - } - } -} - -/// This function checks whether hyper-rectangular loop tiling of the nest -/// represented by `origLoops` is valid. The validity condition is from Irigoin -/// and Triolet, which states that two tiles cannot depend on each other. We -/// simplify such condition to just checking whether there is any negative -/// dependence direction, since we have the prior knowledge that the tiling -/// results will be hyper-rectangles, which are scheduled in the -/// lexicographically increasing order on the vector of loop indices. This -/// function will return failure when any dependence component is negative along -/// any of `origLoops`. -static LogicalResult -checkTilingLegality(MutableArrayRef origLoops) { - assert(!origLoops.empty() && "no original loops provided"); - - // We first find out all dependences we intend to check. - SmallVector loadAndStoreOps; - origLoops[0].getOperation()->walk([&](Operation *op) { - if (isa(op)) - loadAndStoreOps.push_back(op); - }); - - unsigned numOps = loadAndStoreOps.size(); - unsigned numLoops = origLoops.size(); - FlatAffineConstraints dependenceConstraints; - for (unsigned d = 1; d <= numLoops + 1; ++d) { - for (unsigned i = 0; i < numOps; ++i) { - Operation *srcOp = loadAndStoreOps[i]; - MemRefAccess srcAccess(srcOp); - for (unsigned j = 0; j < numOps; ++j) { - Operation *dstOp = loadAndStoreOps[j]; - MemRefAccess dstAccess(dstOp); - - SmallVector depComps; - dependenceConstraints.reset(); - DependenceResult result = checkMemrefAccessDependence( - srcAccess, dstAccess, d, &dependenceConstraints, &depComps); - - // Skip if there is no dependence in this case. - if (!hasDependence(result)) - continue; - - // Check whether there is any negative direction vector in the - // dependence components found above, which means that dependence is - // violated by the default hyper-rect tiling method. - LLVM_DEBUG(llvm::dbgs() << "Checking whether tiling legality violated " - "for dependence at depth: " - << Twine(d) << " between:\n";); - LLVM_DEBUG(srcAccess.opInst->dump();); - LLVM_DEBUG(dstAccess.opInst->dump();); - for (unsigned k = 0, e = depComps.size(); k < e; k++) { - DependenceComponent depComp = depComps[k]; - if (depComp.lb.hasValue() && depComp.ub.hasValue() && - depComp.lb.getValue() < depComp.ub.getValue() && - depComp.ub.getValue() < 0) { - LLVM_DEBUG(llvm::dbgs() - << "Dependence component lb = " - << Twine(depComp.lb.getValue()) - << " ub = " << Twine(depComp.ub.getValue()) - << " is negative at depth: " << Twine(d) - << " and thus violates the legality rule.\n"); - return failure(); - } - } - } - } +/// Set lower and upper bounds of intra-tile loops for parametric tiling. +static void setIntraTileBounds(OpBuilder &b, AffineForOp origLoop, + AffineForOp newInterTileLoop, + AffineForOp newIntraTileLoop, + unsigned tileSize) { + int64_t largestDiv = getLargestDivisorOfTripCount(origLoop); + Optional mayBeConstantCount = getConstantTripCount(origLoop); + // The lower bound is just the tile-space loop. + AffineMap lbMap = b.getDimIdentityMap(); + newIntraTileLoop.setLowerBound( + /*operands=*/newInterTileLoop.getInductionVar(), lbMap); + + // Set the upper bound. + if (mayBeConstantCount && mayBeConstantCount.getValue() < tileSize) { + // Trip count is less than the tile size: upper bound is lower bound + // + trip count. + AffineMap ubMap = + b.getSingleDimShiftAffineMap(mayBeConstantCount.getValue()); + newIntraTileLoop.setUpperBound( + /*operands=*/newInterTileLoop.getInductionVar(), ubMap); + } else if (largestDiv % tileSize != 0) { + // Intra-tile loop ii goes from i to min(i + tileSize, ub_i). + // Construct the upper bound map; the operands are the original + // operands with 'i' (tile-space loop) appended to it. The new upper + // bound map is the original one with an additional expression i + + // tileSize appended. + + // Add dim operands from original upper bound. + SmallVector ubOperands; + AffineBound ub = origLoop.getUpperBound(); + ubOperands.reserve(ub.getNumOperands() + 1); + AffineMap origUbMap = ub.getMap(); + for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j) + ubOperands.push_back(ub.getOperand(j)); + + // Add dim operand for new loop upper bound. + ubOperands.push_back(newInterTileLoop.getInductionVar()); + + // Add symbol operands from original upper bound. + for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j) + ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j)); + + SmallVector boundExprs; + boundExprs.reserve(1 + origUbMap.getNumResults()); + AffineExpr dim = b.getAffineDimExpr(origUbMap.getNumDims()); + // The new upper bound map is the original one with an additional + // expression i + tileSize appended. + boundExprs.push_back(dim + tileSize); + boundExprs.append(origUbMap.getResults().begin(), + origUbMap.getResults().end()); + AffineMap ubMap = + AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols(), + boundExprs, b.getContext()); + newIntraTileLoop.setUpperBound(/*operands=*/ubOperands, ubMap); + } else { + // No need of the min expression. + AffineExpr dim = b.getAffineDimExpr(0); + AffineMap ubMap = AffineMap::get(1, 0, dim + tileSize); + newIntraTileLoop.setUpperBound(newInterTileLoop.getInductionVar(), ubMap); } - - return success(); -} -/// Tiles the specified band of perfectly nested loops creating tile-space loops -/// and intra-tile loops. A band is a contiguous set of loops. -// TODO: handle non hyper-rectangular spaces. -LogicalResult -mlir::tilePerfectlyNested(MutableArrayRef input, - ArrayRef tileSizes, - SmallVectorImpl *tiledNest) { - // Check if the supplied for op's are all successively nested. - assert(!input.empty() && "no loops in input band"); - assert(input.size() == tileSizes.size() && "Too few/many tile sizes"); - - assert(isPerfectlyNested(input) && "input loops not perfectly nested"); - - auto origLoops = input; - - // Perform tiling legality test. - if (failed(checkTilingLegality(origLoops))) - origLoops[0].emitRemark("tiled code is illegal due to dependences"); - - AffineForOp rootAffineForOp = origLoops[0]; - auto loc = rootAffineForOp.getLoc(); - // Note that width is at least one since band isn't empty. - unsigned width = input.size(); - - SmallVector tiledLoops(2 * width); - - // The outermost among the loops as we add more.. - auto *topLoop = rootAffineForOp.getOperation(); - AffineForOp innermostPointLoop; - - // Add intra-tile (or point) loops. - for (unsigned i = 0; i < width; i++) { - OpBuilder b(topLoop); - // Loop bounds will be set later. - auto pointLoop = b.create(loc, 0, 0); - pointLoop.getBody()->getOperations().splice( - pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(), - topLoop); - tiledLoops[2 * width - 1 - i] = pointLoop; - topLoop = pointLoop.getOperation(); - if (i == 0) - innermostPointLoop = pointLoop; - } - - // Add tile space loops; - for (unsigned i = width; i < 2 * width; i++) { - OpBuilder b(topLoop); - // Loop bounds will be set later. - auto tileSpaceLoop = b.create(loc, 0, 0); - tileSpaceLoop.getBody()->getOperations().splice( - tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(), - topLoop); - tiledLoops[2 * width - i - 1] = tileSpaceLoop; - topLoop = tileSpaceLoop.getOperation(); - } - - // Move the loop body of the original nest to the new one. - moveLoopBody(origLoops.back(), innermostPointLoop); - - SmallVector origLoopIVs; - extractForInductionVars(input, &origLoopIVs); - - FlatAffineConstraints cst; - SmallVector ops; - ops.reserve(input.size()); - for (AffineForOp forOp : input) - ops.push_back(forOp); - getIndexSet(ops, &cst); - if (!cst.isHyperRectangular(0, width)) { - rootAffineForOp.emitError("tiled code generation unimplemented for the " - "non-hyperrectangular case"); - return failure(); - } - - constructTiledIndexSetHyperRect(origLoops, tiledLoops, tileSizes); - - // Replace original IVs with intra-tile loop IVs. - for (unsigned i = 0; i < width; i++) - origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar()); - - // Erase the old loop nest. - rootAffineForOp.erase(); - - if (tiledNest) - *tiledNest = std::move(tiledLoops); - - return success(); -} - -// Identify valid and profitable bands of loops to tile. This is currently just -// a temporary placeholder to test the mechanics of tiled code generation. -// Returns all maximal outermost perfect loop nests to tile. -static void getTileableBands(FuncOp f, - std::vector> *bands) { - // Get maximal perfect nest of 'affine.for' insts starting from root - // (inclusive). - auto getMaximalPerfectLoopNest = [&](AffineForOp root) { - SmallVector band; - getPerfectlyNestedLoops(band, root); - bands->push_back(band); - }; - - for (auto &block : f) - for (auto &op : block) - if (auto forOp = dyn_cast(op)) - getMaximalPerfectLoopNest(forOp); } /// Reduces each tile size to the largest divisor of the corresponding trip @@ -340,7 +140,7 @@ assert(band.size() == tileSizes->size() && "invalid tile size count"); for (unsigned i = 0, e = band.size(); i < e; i++) { unsigned &tSizeAdjusted = (*tileSizes)[i]; - auto mayConst = getConstantTripCount(band[i]); + Optional mayConst = getConstantTripCount(band[i]); if (!mayConst) continue; // Adjust the tile size to largest factor of the trip count less than @@ -379,14 +179,14 @@ tileSizes->resize(band.size()); // The first loop in the band. - auto rootForOp = band[0]; + AffineForOp rootForOp = band[0]; (void)rootForOp; // Obtain memory footprint and set tile sizes so that a tile fits in // the cache size. This is an approximation with the assumption that the // footprint increases with the tile size linearly in that dimension (i.e., // assumes one-to-one access function). - auto fp = getMemoryFootprintBytes(band[0], 0); + Optional fp = getMemoryFootprintBytes(band[0], 0); if (!fp) { // Fill with default tile sizes if footprint is unknown. std::fill(tileSizes->begin(), tileSizes->end(), @@ -433,6 +233,32 @@ } void LoopTiling::runOnFunction() { + // Constructs and sets new loop bounds after tiling for the case of + // hyper-rectangular index sets, where the bounds of one dimension do not + // depend on other dimensions. Bounds of each dimension can thus be treated + // independently, and deriving the new bounds is much simpler and faster + // than for the case of tiling arbitrary polyhedral shapes. + auto constructTiledIndexSetHyperRect = + [](MutableArrayRef &origLoops, + MutableArrayRef newLoops, ArrayRef &tileSizes) { + assert(!origLoops.empty()); + assert(origLoops.size() == tileSizes.size()); + + OpBuilder b(origLoops[0].getOperation()); + unsigned width = origLoops.size(); + + // Bounds for tile space loops. + for (unsigned i = 0; i < width; i++) { + setInterTileBounds(b, origLoops[i], newLoops[i], tileSizes[i]); + } + + // Bounds for intra-tile loops. + for (unsigned i = 0; i < width; i++) { + setIntraTileBounds(b, origLoops[i], newLoops[i], newLoops[width + i], + tileSizes[i]); + } + }; + // Bands of loops to tile. std::vector> bands; getTileableBands(getFunction(), &bands); @@ -445,12 +271,13 @@ getTileSizes(band, &tileSizes); if (llvm::DebugFlag) { auto diag = band[0].emitRemark("using tile sizes ["); - for (auto tSize : tileSizes) + for (unsigned tSize : tileSizes) diag << tSize << ' '; diag << "]\n"; } SmallVector tiledNest; - if (failed(tilePerfectlyNested(band, tileSizes, &tiledNest))) + if (failed(tilePerfectlyNested(band, tileSizes, tiledNest, + constructTiledIndexSetHyperRect))) return signalPassFailure(); // Separate full and partial tiles. diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp --- a/mlir/lib/Transforms/Utils/LoopUtils.cpp +++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp @@ -418,10 +418,225 @@ return success(); } -// Collect perfectly nested loops starting from `rootForOps`. Loops are -// perfectly nested if each loop is the first and only non-terminator operation -// in the parent loop. Collect at most `maxLoops` loops and append them to -// `forOps`. +/// Checks the legality of tiling of a hyper-rectangular loop nest by simply +/// checking if there is a 'negative' dependence in the memrefs present in +/// the loop nest. If yes then tiling is invalid. +LogicalResult +checkTilingLegalityImpl(MutableArrayRef origLoops) { + assert(!origLoops.empty() && "no original loops provided"); + + // We first find out all dependences we intend to check. + SmallVector loadAndStoreOps; + origLoops[0].getOperation()->walk([&](Operation *op) { + if (isa(op)) + loadAndStoreOps.push_back(op); + }); + + unsigned numOps = loadAndStoreOps.size(); + unsigned numLoops = origLoops.size(); + FlatAffineConstraints dependenceConstraints; + for (unsigned d = 1; d <= numLoops + 1; ++d) { + for (unsigned i = 0; i < numOps; ++i) { + Operation *srcOp = loadAndStoreOps[i]; + MemRefAccess srcAccess(srcOp); + for (unsigned j = 0; j < numOps; ++j) { + Operation *dstOp = loadAndStoreOps[j]; + MemRefAccess dstAccess(dstOp); + + SmallVector depComps; + dependenceConstraints.reset(); + DependenceResult result = checkMemrefAccessDependence( + srcAccess, dstAccess, d, &dependenceConstraints, &depComps); + + // Skip if there is no dependence in this case. + if (!hasDependence(result)) + continue; + + // Check whether there is any negative direction vector in the + // dependence components found above, which means that dependence is + // violated by the default hyper-rect tiling method. + LLVM_DEBUG(llvm::dbgs() << "Checking whether tiling legality violated " + "for dependence at depth: " + << Twine(d) << " between:\n";); + LLVM_DEBUG(srcAccess.opInst->dump();); + LLVM_DEBUG(dstAccess.opInst->dump();); + for (unsigned k = 0, e = depComps.size(); k < e; k++) { + DependenceComponent depComp = depComps[k]; + if (depComp.lb.hasValue() && depComp.ub.hasValue() && + depComp.lb.getValue() < depComp.ub.getValue() && + depComp.ub.getValue() < 0) { + LLVM_DEBUG(llvm::dbgs() + << "Dependence component lb = " + << Twine(depComp.lb.getValue()) + << " ub = " << Twine(depComp.ub.getValue()) + << " is negative at depth: " << Twine(d) + << " and thus violates the legality rule.\n"); + return failure(); + } + } + } + } + } + + return success(); +} + +/// Checks whether hyper-rectangular loop tiling of the nest +/// represented by `origLoops` is valid. The validity condition is from Irigoin +/// and Triolet, which states that two tiles cannot depend on each other. We +/// simplify such condition to just checking whether there is any negative +/// dependence direction, since we have the prior knowledge that the tiling +/// results will be hyper-rectangles, which are scheduled in the +/// lexicographically increasing order on the vector of loop indices. This +/// function will return failure when any dependence component is negative along +/// any of `origLoops`. +LogicalResult +checkTilingLegality(MutableArrayRef origLoops) { + return checkTilingLegalityImpl(origLoops); +} + +/// Move the loop body of AffineForOp 'src' from 'src' into the specified +/// location in destination's body, ignoring the terminator. +void moveLoopBodyImpl(AffineForOp src, AffineForOp dest, Block::iterator loc) { + auto &ops = src.getBody()->getOperations(); + dest.getBody()->getOperations().splice(loc, ops, ops.begin(), + std::prev(ops.end())); +} + +/// Move the loop body of AffineForOp 'src' from 'src' to the start of dest's +/// body. +void moveLoopBody(AffineForOp src, AffineForOp dest) { + moveLoopBodyImpl(src, dest, dest.getBody()->begin()); +} + +/// Generates tiled loop nest from a given perfectly nested loop nest. The +/// bounds for the tiled loop nest are set using the 'constructAndSetLoopBounds' +/// function that is supplied and is totally upto the user to implement and +/// customize it. +template +LogicalResult tilePerfectlyNestedAffineImpl( + MutableArrayRef input, ArrayRef tileSizes, + SmallVectorImpl &tiledNest, + std::function &input, + MutableArrayRef tiledNest, + ArrayRef &tileSizes)> + constructAndSetLoopBounds) { + + // Check if the supplied for op's are all successively nested. + assert(!input.empty() && "no loops in input band"); + assert(input.size() == tileSizes.size() && "Too few/many tile sizes"); + + assert(isPerfectlyNested(input) && "input loops not perfectly nested"); + + auto origLoops = input; + + // Perform tiling legality test. + if (failed(checkTilingLegality(origLoops))) + origLoops[0].emitRemark("tiled code is illegal due to dependences"); + + AffineForOp rootAffineForOp = origLoops[0]; + auto loc = rootAffineForOp.getLoc(); + // Note that width is at least one since band isn't empty. + unsigned width = input.size(); + + SmallVector tiledLoops(2 * width); + + // The outermost among the loops as we add more.. + auto *topLoop = rootAffineForOp.getOperation(); + AffineForOp innermostPointLoop; + + // Add intra-tile (or point) loops. + for (unsigned i = 0; i < width; i++) { + OpBuilder b(topLoop); + // Loop bounds will be set later. + auto pointLoop = b.create(loc, 0, 0); + pointLoop.getBody()->getOperations().splice( + pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(), + topLoop); + tiledLoops[2 * width - 1 - i] = pointLoop; + topLoop = pointLoop.getOperation(); + if (i == 0) + innermostPointLoop = pointLoop; + } + + // Add tile space loops; + for (unsigned i = width; i < 2 * width; i++) { + OpBuilder b(topLoop); + // Loop bounds will be set later. + auto tileSpaceLoop = b.create(loc, 0, 0); + tileSpaceLoop.getBody()->getOperations().splice( + tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(), + topLoop); + tiledLoops[2 * width - i - 1] = tileSpaceLoop; + topLoop = tileSpaceLoop.getOperation(); + } + + // Move the loop body of the original nest to the new one. + moveLoopBody(origLoops.back(), innermostPointLoop); + + SmallVector origLoopIVs; + extractForInductionVars(input, &origLoopIVs); + + FlatAffineConstraints cst; + SmallVector ops; + ops.reserve(input.size()); + for (AffineForOp forOp : input) + ops.push_back(forOp); + getIndexSet(ops, &cst); + if (!cst.isHyperRectangular(0, width)) { + rootAffineForOp.emitError("tiled code generation unimplemented for the " + "non-hyperrectangular case"); + return failure(); + } + + constructAndSetLoopBounds(origLoops, tiledLoops, tileSizes); + + // Replace original IVs with intra-tile loop IVs. + for (unsigned i = 0; i < width; i++) + origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar()); + + // Erase the old loop nest. + rootAffineForOp.erase(); + + tiledNest = std::move(tiledLoops); + + return success(); +} + +/// Tiles the specified band of perfectly nested loops creating tile-space loops +/// and intra-tile loops. A band is a contiguous set of loops.The last argument +/// is a user defined function which constructs and sets the loop bounds for +/// the tiled loop nest. +LogicalResult mlir::tilePerfectlyNested( + MutableArrayRef input, ArrayRef tileSizes, + SmallVectorImpl &tiledNest, + std::function &input, + MutableArrayRef tiledNest, + ArrayRef &tileSizes)> + constructAndSetLoopBounds) { + return tilePerfectlyNestedAffineImpl(input, tileSizes, tiledNest, + constructAndSetLoopBounds); +} + +/// Tiles the specified band of perfectly nested loops creating tile-space +/// loops and intra-tile loops, using SSA values as tiling parameters. A band +/// is a contiguous set of loops. The last argument is a user defined function +/// which constructs and sets the loop bounds for the tiled loop nest. +LogicalResult mlir::tilePerfectlyNestedParametric( + MutableArrayRef input, ArrayRef tileSizes, + SmallVectorImpl &tiledNest, + std::function &input, + MutableArrayRef tiledNest, + ArrayRef &tileSizes)> + constructAndSetLoopBounds) { + return tilePerfectlyNestedAffineImpl(input, tileSizes, tiledNest, + constructAndSetLoopBounds); +} + +/// Collect perfectly nested loops starting from `rootForOps`. Loops are +/// perfectly nested if each loop is the first and only non-terminator operation +/// in the parent loop. Collect at most `maxLoops` loops and append them to +/// `forOps`. template static void getPerfectlyNestedLoopsImpl( SmallVectorImpl &forOps, T rootForOp, @@ -452,6 +667,22 @@ getPerfectlyNestedLoopsImpl(nestedLoops, root); } +/// Identify valid and profitable bands of loops to tile. This is currently just +/// a temporary placeholder to test the mechanics of tiled code generation. +/// Returns all maximal outermost perfect loop nests to tile. +void mlir::getTileableBands(FuncOp f, + std::vector> *bands) { + // Get maximal perfect nest of 'affine.for' insts starting from root + // (inclusive). + for (Block &block : f) + for (Operation &op : block) + if (auto forOp = dyn_cast(op)) { + SmallVector band; + getPerfectlyNestedLoops(band, forOp); + bands->push_back(band); + } +} + /// Unrolls this loop completely. LogicalResult mlir::loopUnrollFull(AffineForOp forOp) { Optional mayBeConstantTripCount = getConstantTripCount(forOp); diff --git a/mlir/test/Dialect/Affine/loop-tiling-parametric.mlir b/mlir/test/Dialect/Affine/loop-tiling-parametric.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/Affine/loop-tiling-parametric.mlir @@ -0,0 +1,270 @@ +// RUN: mlir-opt %s -split-input-file -test-affine-parametric-tile | FileCheck %s + +// ----- + +// CHECK-DAG: [[LBI:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)> +// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0, 256)> +// CHECK-DAG: [[UBI1:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0, 512)> +// CHECK-DAG: [[UBI2:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0, 1024)> +// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0] -> (256 ceildiv s0)> +// CHECK-DAG: [[UBO1:#map[0-9]+]] = affine_map<()[s0] -> (512 ceildiv s0)> +// CHECK-DAG: [[UBO2:#map[0-9]+]] = affine_map<()[s0] -> (1024 ceildiv s0)> + +// CHECK: func @loop_tiling_3d([[ARG0:%arg[0-9]+]]: index, [[ARG1:%arg[0-9]+]]: index, [[ARG2:%arg[0-9]+]]: index) +// CHECK-NEXT: affine.for [[ARG3:%arg[0-9]+]] = 0 to [[UBO0]](){{.*}}[[ARG0]] +// CHECK-NEXT: affine.for [[ARG4:%arg[0-9]+]] = 0 to [[UBO1]](){{.*}}[[ARG1]] +// CHECK-NEXT: affine.for [[ARG5:%arg[0-9]+]] = 0 to [[UBO2]](){{.*}}[[ARG2]] +// CHECK-NEXT: affine.for %[[I:.*]] = [[LBI]]{{.*}}[[ARG3]]{{.*}}[[ARG0]]{{.*}} to min [[UBI0]]{{.*}}[[ARG3]]{{.*}}[[ARG0]] +// CHECK-NEXT: affine.for %[[J:.*]] = [[LBI]]{{.*}}[[ARG4]]{{.*}}[[ARG1]]{{.*}} to min [[UBI1]]{{.*}}[[ARG4]]{{.*}}[[ARG1]] +// CHECK-NEXT: affine.for %[[K:.*]] = [[LBI]]{{.*}}[[ARG5]]{{.*}}[[ARG2]]{{.*}} to min [[UBI2]]{{.*}}[[ARG5]]{{.*}}[[ARG2]] +// CHECK-NEXT: "test.foo"(%[[I]], %[[J]], %[[K]]) +func @loop_tiling_3d(%t0 : index, %t1 : index, %t2 : index) { + affine.for %i = 0 to 256 { + affine.for %j = 0 to 512 { + affine.for %k = 0 to 1024 { + "test.foo"(%i, %j, %k) : (index, index, index) -> () + } + } + } + return +} + +// ----- + +// CHECK-DAG: [[LBI:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)> +// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0 * 4, 256)> +// CHECK-DAG: [[UBI1:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0 * 3, 512)> +// CHECK-DAG: [[UBI2:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0 * 2, 1024)> +// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0] -> (256 ceildiv s0)> +// CHECK-DAG: [[UBO1:#map[0-9]+]] = affine_map<()[s0] -> (512 ceildiv s0)> +// CHECK-DAG: [[UBO2:#map[0-9]+]] = affine_map<()[s0] -> (1024 ceildiv s0)> + +// CHECK: func @loop_tiling_non_unit_step([[ARG0:%arg[0-9]+]]: index, [[ARG1:%arg[0-9]+]]: index, [[ARG2:%arg[0-9]+]]: index) +// CHECK-NEXT: affine.for [[ARG3:%arg[0-9]+]] = 0 to [[UBO0]](){{.*}}[[ARG0]]{{.*}}step 4 +// CHECK-NEXT: affine.for [[ARG4:%arg[0-9]+]] = 0 to [[UBO1]](){{.*}}[[ARG1]]{{.*}} step 3 +// CHECK-NEXT: affine.for [[ARG5:%arg[0-9]+]] = 0 to [[UBO2]](){{.*}}[[ARG2]]{{.*}} step 2 +// CHECK-NEXT: affine.for %[[I:.*]] = [[LBI]]{{.*}}[[ARG3]]{{.*}}[[ARG0]]{{.*}} to min [[UBI0]]{{.*}}[[ARG3]]{{.*}}[[ARG0]]{{.*}} step 4 +// CHECK-NEXT: affine.for %[[J:.*]] = [[LBI]]{{.*}}[[ARG4]]{{.*}}[[ARG1]]{{.*}} to min [[UBI1]]{{.*}}[[ARG4]]{{.*}}[[ARG1]]{{.*}} step 3 +// CHECK-NEXT: affine.for %[[K:.*]] = [[LBI]]{{.*}}[[ARG5]]{{.*}}[[ARG2]]{{.*}} to min [[UBI2]]{{.*}}[[ARG5]]{{.*}}[[ARG2]]{{.*}} step 2 +// CHECK-NEXT: "test.foo"(%[[I]], %[[J]], %[[K]]) +func @loop_tiling_non_unit_step(%t0: index, %t1: index, %t2: index){ + affine.for %i = 0 to 256 step 4 { + affine.for %j = 0 to 512 step 3 { + affine.for %k = 0 to 1024 step 2 { + "test.foo"(%i, %j, %k) : (index, index, index) -> () + } + } + } + return +} + +// ----- + +// CHECK-DAG: [[LBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)> +// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0, s1, s2] -> (d0 * s2 + s2, s0, 4096 floordiv s1)> +// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0, s1, s2] -> (s0 ceildiv s2, (4096 floordiv s1) ceildiv s2)> + +// CHECK: func @tile_loop_with_div_in_upper_bound([[ARG0:%arg[0-9]+]]: index, %{{.*}}: memref, %{{.*}}: index, %{{.*}}: index) +#ub = affine_map<()[s0, s1] -> (s0, 4096 floordiv s1)> +func @tile_loop_with_div_in_upper_bound(%t5 : index, %A : memref, %L : index, %U : index) { + %c0 = constant 0 : index + %M = dim %A, %c0 : memref + affine.for %i = 0 to min #ub()[%M, %U] { + addi %i, %i : index + } + // CHECK: affine.for [[ARG1:%arg[0-9]+]] = 0 to min [[UBO0]]()[%{{.*}}, %{{.*}}, [[ARG0]]] + // CHECK-NEXT: affine.for %[[I:.*]] = [[LBI0]]([[ARG1]]){{.*}}[[ARG0]]{{.*}} to min [[UBI0]]({{.*}})[{{.*}}, {{.*}}, [[ARG0]]] + // CHECK-NEXT: addi %[[I]], %[[I]] + return +} + +// ----- + +// CHECK-DAG: [[LBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)> +// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0, s1, s2] -> (d0 * s2 + s2 * 4, s0, 4096 floordiv s1)> +// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0, s1, s2] -> (s0 ceildiv s2, (4096 floordiv s1) ceildiv s2)> + +// CHECK: func @tile_loop_with_div_in_upper_bound_non_unit_step([[ARG0:%arg[0-9]+]]: index, %{{.*}}: memref, %{{.*}}: index, %{{.*}}: index) +#ub = affine_map<()[s0, s1] -> (s0, 4096 floordiv s1)> +func @tile_loop_with_div_in_upper_bound_non_unit_step(%t5 : index, %A : memref, %L : index, %U : index) { + %c0 = constant 0 : index + %M = dim %A, %c0 : memref + affine.for %i = 0 to min #ub()[%M, %U] step 4 { + addi %i, %i : index + } + // CHECK: affine.for [[ARG1:%arg[0-9]+]] = 0 to min [[UBO0]]()[%{{.*}}, %{{.*}}, [[ARG0]]]{{.*}} step 4{{.*}} + // CHECK-NEXT: affine.for %[[I:.*]] = [[LBI0]]([[ARG1]]){{.*}}[[ARG0]]{{.*}} to min [[UBI0]]({{.*}})[{{.*}}, {{.*}}, [[ARG0]]]{{.*}} step 4{{.*}} + // CHECK-NEXT: addi %[[I]], %[[I]] + return +} + +// ----- + +// CHECK-DAG: [[LBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> ((d0 - 8) * s0 + 8)> +// CHECK-DAG: [[UBI2:#map[0-9]+]] = affine_map<(d0)[s0, s1] -> ((d0 - 8) * s1 + s1 * 4 + 8, s0 + 16)> +// CHECK-DAG: [[UBI1:#map[0-9]+]] = affine_map<(d0)[s0, s1] -> ((d0 - 8) * s1 + s1 + 8, s0 + 16)> +// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> ((d0 - 8) * s0 + s0 + 8, 256)> +// CHECK-DAG: [[UBO1:#map[0-9]+]] = affine_map<()[s0, s1] -> ((s0 + 8) ceildiv s1 + 8)> +// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0] -> (248 ceildiv s0 + 8)> + +// CHECK: func @tile_loop_with_non_zero_lb([[ARG0:%arg[0-9]+]]: index, [[ARG1:%arg[0-9]+]]: index, [[ARG2:%arg[0-9]+]]: index, %{{.*}}: index) +// CHECK-NEXT: affine.for [[ARG3:%arg[0-9+]]] = 8 to [[UBO0]]{{.*}}[[ARG0]]{{.*}} +// CHECK-NEXT: affine.for [[ARG4:%arg[0-9+]]] = 8 to [[UBO1]]{{.*}}[[ARG1]]{{.*}} +// CHECK-NEXT: affine.for [[ARG5:%arg[0-9+]]] = 8 to [[UBO1]]{{.*}}[[ARG2]]{{.*}} step 4 +// CHECK-NEXT: affine.for %[[I:.*]] = [[LBI0]]([[ARG3]]){{.*}}[[ARG0]]{{.*}} to min [[UBI0]]([[ARG3]]){{.*}}[[ARG0]]{{.*}} +// CHECK-NEXT: affine.for %[[J:.*]] = [[LBI0]]([[ARG4]]){{.*}}[[ARG1]]{{.*}} to min [[UBI1]]([[ARG4]]){{.*}}[[ARG1]]{{.*}} +// CHECK-NEXT: affine.for %[[K:.*]] = [[LBI0]]([[ARG5]]){{.*}}[[ARG2]]{{.*}} to min [[UBI2]]([[ARG5]]){{.*}}[[ARG2]]{{.*}}step 4{{.*}} +// CHECK-NEXT: "test.foo"(%[[I]], %[[J]], %[[K]]) : (index, index, index) -> () +#ubi = affine_map<()[s0] -> (s0 + 16)> +func @tile_loop_with_non_zero_lb(%t0: index, %t1: index, %t2: index, %U: index){ + affine.for %i = 8 to 256 { + affine.for %j = 8 to #ubi()[%U] { + affine.for %k = 8 to #ubi()[%U] step 4 { + "test.foo"(%i, %j, %k) : (index, index, index) -> () + } + } + } + return +} + +// ----- + +// CHECK-DAG: [[LBI:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)> +// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0, 256)> +// CHECK-DAG: [[UBI1:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0, 250)> +// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0] -> (256 ceildiv s0)> +// CHECK-DAG: [[UBO1:#map[0-9]+]] = affine_map<()[s0] -> (250 ceildiv s0)> + +// CHECK: func @simple_matmul([[ARG0:%arg[0-9]+]]: index, [[ARG1:%arg[0-9]+]]: index, [[ARG2:%arg[0-9]+]]: index{{.*}}) +// CHECK-NEXT: affine.for [[ARG3:%arg[0-9]+]] = 0 to [[UBO0]](){{.*}}[[ARG0]]{{.*}} +// CHECK-NEXT: affine.for [[ARG4:%arg[0-9]+]] = 0 to [[UBO0]](){{.*}}[[ARG1]]{{.*}} +// CHECK-NEXT: affine.for [[ARG5:%arg[0-9]+]] = 0 to [[UBO1]](){{.*}}[[ARG2]]{{.*}} +// CHECK-NEXT: affine.for %[[I:.*]] = [[LBI]]{{.*}}[[ARG3]]{{.*}}[[ARG0]]{{.*}} to min [[UBI0]]{{.*}}[[ARG3]]{{.*}}[[ARG0]]{{.*}} +// CHECK-NEXT: affine.for %[[J:.*]] = [[LBI]]{{.*}}[[ARG4]]{{.*}}[[ARG1]]{{.*}} to min [[UBI0]]{{.*}}[[ARG4]]{{.*}}[[ARG1]]{{.*}} +// CHECK-NEXT: affine.for %[[K:.*]] = [[LBI]]{{.*}}[[ARG5]]{{.*}}[[ARG2]]{{.*}} to min [[UBI1]]{{.*}}[[ARG5]]{{.*}}[[ARG2]]{{.*}} +// CHECK-NEXT: affine.load %{{.*}}[%[[I]], %[[K]]] +// CHECK-NEXT: affine.load %{{.*}}[%[[K]], %[[J]]] +// CHECK-NEXT: affine.load %{{.*}}[%[[I]], %[[J]]] +// CHECK-NEXT: mulf %{{.*}} +// CHECK-NEXT: addf %{{.*}} +// CHECK-NEXT: affine.store %{{.*}}[%[[I]], %[[J]]] +func @simple_matmul(%t6 : index, %t7 : index, %t8 : index, %arg0: memref<256x256xvector<64xf32>>, %arg1: memref<256x256xvector<64xf32>>, %arg2: memref<256x256xvector<64xf32>>) -> memref<256x256xvector<64xf32>> { + affine.for %i = 0 to 256 { + affine.for %j = 0 to 256 { + affine.for %k = 0 to 250 { + %l = affine.load %arg0[%i, %k] : memref<256x256xvector<64xf32>> + %r = affine.load %arg1[%k, %j] : memref<256x256xvector<64xf32>> + %o = affine.load %arg2[%i, %j] : memref<256x256xvector<64xf32>> + %m = mulf %l, %r : vector<64xf32> + %a = addf %o, %m : vector<64xf32> + affine.store %a, %arg2[%i, %j] : memref<256x256xvector<64xf32>> + } + } + } + return %arg2 : memref<256x256xvector<64xf32>> +} + +// ----- + +// CHECK-DAG: [[LBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)> +// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s1, s0)> +// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0, s1] -> (s0 ceildiv s1)> + +// CHECK: func @tile_with_symbolic_loop_upper_bounds([[ARG0:%arg[0-9]+]]: index, [[ARG1:%arg[0-9]+]]: index{{.*}}){{.*}} +// CHECK: affine.for [[ARG2:%arg[0-9]+]] = 0 to [[UBO0]](){{.*}}[[ARG0]]{{.*}} +// CHECK-NEXT: affine.for [[ARG3:%arg[0-9]+]] = 0 to [[UBO0]](){{.*}}[[ARG1]]{{.*}} +// CHECK-NEXT: affine.for %[[I0:.*]] = [[LBI0]]{{.*}}[[ARG2]]{{.*}}[[ARG0]]{{.*}} to min [[UBI0]]{{.*}}[[ARG2]]{{.*}}[[ARG0]]{{.*}} +// CHECK-NEXT: affine.for %[[I1:.*]] = [[LBI0]]{{.*}}[[ARG3]]{{.*}}[[ARG1]]{{.*}} to min [[UBI0]]{{.*}}[[ARG3]]{{.*}}[[ARG1]]{{.*}} +// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%[[I0]], %[[I1]]] : memref +// CHECK-NEXT: affine.for %[[I2:.*]] = 0 to %{{.*}} { +// CHECK-NEXT: affine.load %{{.*}}%[[I0]], %[[I2]] +// CHECK-NEXT: affine.load %{{.*}}%[[I2]], %[[I1]] +// CHECK-NEXT: mulf +// CHECK-NEXT: affine.load %{{.*}}%[[I0]], %[[I1]] +// CHECK-NEXT: addf +// CHECK-NEXT: affine.store %{{.*}}%[[I0]], %[[I1]] +func @tile_with_symbolic_loop_upper_bounds(%t9 : index, %t10: index, %arg0: memref, %arg1: memref, %arg2: memref) { + %cst = constant 0.000000e+00 : f32 + %c0 = constant 0 : index + %0 = dim %arg0, %c0 : memref + affine.for %i0 = 0 to %0 { + affine.for %i1 = 0 to %0 { + affine.store %cst, %arg2[%i0, %i1] : memref + affine.for %i2 = 0 to %0 { + %1 = affine.load %arg0[%i0, %i2] : memref + %2 = affine.load %arg1[%i2, %i1] : memref + %3 = mulf %1, %2 : f32 + %4 = affine.load %arg2[%i0, %i1] : memref + %5 = addf %4, %3 : f32 + affine.store %5, %arg2[%i0, %i1] : memref + } + } + } + return +} + +// ----- + +// CHECK-DAG: [[LBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)> +// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0, s1, s2] -> (d0 * s2 + s2, s0 + s1)> +// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0, s1, s2] -> ((s0 + s1) ceildiv s2)> + +// CHECK: func @tile_with_loop_upper_bounds_in_two_symbols([[ARG0:%arg[0-9]+]]: index{{.*}}){{.*}} +func @tile_with_loop_upper_bounds_in_two_symbols(%t11 : index, %arg0: memref, %limit: index) { + %c0 = constant 0 : index + %dim0 = dim %arg0, %c0 : memref + affine.for %i0 = 0 to affine_map<()[s0, s1] -> (s0 + s1)> ()[%dim0, %limit] { + %v0 = affine.load %arg0[%i0] : memref + } + // CHECK: affine.for [[ARG1:%arg[0-9]+]] = 0 to [[UBO0]]()[%{{.*}}, %{{.*}}, [[ARG0]]] + // CHECK-NEXT: affine.for %[[I:.*]] = [[LBI0]]([[ARG1]]){{.*}}[[ARG0]]{{.*}} to min [[UBI0]]([[ARG1]])[{{.*}}, {{.*}}, [[ARG0]]] + // CHECK-NEXT: affine.load %{{.*}}[%[[I]]] + return +} + +// ----- + +// CHECK-DAG: [[LBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)> +// CHECK-DAG: [[UBI1:#map[0-9]+]] = affine_map<(d0, d1)[s0, s1] -> (d1 * s1 + s1, d0 + s0 + 4)> +// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0, d1)[s0, s1] -> (d1 * s1 + s1, d0 + s0 + 2)> +// CHECK-DAG: [[LBO0:#map[0-9]+]] = affine_map<() -> (0)> +// CHECK-DAG: [[UBO1:#map[0-9]+]] = affine_map<(d0)[s0, s1] -> ((d0 + s0 + 4) ceildiv s1)> +// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<(d0)[s0, s1] -> ((d0 + s0 + 2) ceildiv s1)> + +// CHECK: func @tile_with_upper_bounds_in_dimensions_and_symbols([[ARG0:%arg[0-9]+]]: index, [[ARG1:%arg[0-9]+]]: index, [[ARG2:%arg[0-9]+]]: index, [[ARG3:%arg[0-9]+]]: index{{.*}}){{.*}} +// CHECK-NEXT: affine.for [[ARG4:%arg[0-9]+]] = 0 to [[UBO0]]({{.*}}){{.*}}[[ARG0]] +// CHECK-NEXT: affine.for [[ARG5:%arg[0-9]+]] = 0 to [[UBO1]]({{.*}}){{.*}}[[ARG1]] +// CHECK-NEXT: affine.for {{.*}} = [[LBI0]]([[ARG4]]){{.*}}[[ARG0]]{{.*}} to min [[UBI0]]({{.*}}, [[ARG4]]){{.*}}[[ARG0]]{{.*}} +// CHECK-NEXT: affine.for {{.*}} = [[LBI0]]([[ARG5]]){{.*}}[[ARG1]]{{.*}} to min [[UBI1]]({{.*}}, [[ARG5]]){{.*}}[[ARG1]]{{.*}} +func @tile_with_upper_bounds_in_dimensions_and_symbols(%t12 : index, %t13 :index, %M: index, %N: index, %K: index) { + affine.for %i = 0 to affine_map<(d0)[s0] -> (d0 + s0 + 2)>(%M)[%K] { + affine.for %j = 0 to affine_map<(d0)[s0] -> (d0 + s0 + 4)>(%N)[%K] { + "test.foo" () : () -> () + } + } + return +} + +// ----- + +// CHECK-DAG: [[LBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)> +// CHECK-DAG: [[UBI1:#map[0-9]+]] = affine_map<(d0, d1)[s0, s1] -> (d1 * s1 + s1 * 4, d0 + s0 + 4)> +// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0, d1)[s0, s1] -> (d1 * s1 + s1 * 2, d0 + s0 + 2)> +// CHECK-DAG: [[LBO0:#map[0-9]+]] = affine_map<() -> (0)> +// CHECK-DAG: [[UBO1:#map[0-9]+]] = affine_map<(d0)[s0, s1] -> ((d0 + s0 + 4) ceildiv s1)> +// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<(d0)[s0, s1] -> ((d0 + s0 + 2) ceildiv s1)> + +// CHECK: func @tile_with_upper_bounds_in_dimensions_and_symbols_non_unit_steps +// CHECK-SAME: ([[ARG0:%arg[0-9]+]]: index, [[ARG1:%arg[0-9]+]]: index, [[ARG2:%arg[0-9]+]]: index, [[ARG3:%arg[0-9]+]]: index{{.*}}){{.*}} +// CHECK-NEXT: affine.for [[ARG4:%arg[0-9]+]] = 0 to [[UBO0]]({{.*}}){{.*}}[[ARG0]]{{.*}} step 2{{.*}} +// CHECK-NEXT: affine.for [[ARG5:%arg[0-9]+]] = 0 to [[UBO1]]({{.*}}){{.*}}[[ARG1]]{{.*}} step 4{{.*}} +// CHECK-NEXT: affine.for {{.*}} = [[LBI0]]([[ARG4]]){{.*}}[[ARG0]]{{.*}} to min [[UBI0]]({{.*}}, [[ARG4]]){{.*}}[[ARG0]]{{.*}} step 2{{.*}} +// CHECK-NEXT: affine.for {{.*}} = [[LBI0]]([[ARG5]]){{.*}}[[ARG1]]{{.*}} to min [[UBI1]]({{.*}}, [[ARG5]]){{.*}}[[ARG1]]{{.*}} step 4{{.*}} +func @tile_with_upper_bounds_in_dimensions_and_symbols_non_unit_steps(%t12 : index, %t13 :index, %M: index, %N : index, %K: index) { + affine.for %i = 0 to affine_map<(d0)[s0] -> (d0 + s0 + 2)>(%M)[%K] step 2 { + affine.for %j = 0 to affine_map<(d0)[s0] -> (d0 + s0 + 4)>(%N)[%K] step 4 { + "test.foo" () : () -> () + } + } + return +} diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt --- a/mlir/test/lib/Transforms/CMakeLists.txt +++ b/mlir/test/lib/Transforms/CMakeLists.txt @@ -1,6 +1,7 @@ # Exclude tests from libMLIR.so add_mlir_library(MLIRTestTransforms TestAllReduceLowering.cpp + TestAffineLoopParametricTiling.cpp TestBufferPlacement.cpp TestExpandTanh.cpp TestCallGraph.cpp diff --git a/mlir/test/lib/Transforms/TestAffineLoopParametricTiling.cpp b/mlir/test/lib/Transforms/TestAffineLoopParametricTiling.cpp new file mode 100644 --- /dev/null +++ b/mlir/test/lib/Transforms/TestAffineLoopParametricTiling.cpp @@ -0,0 +1,322 @@ +//= TestAffineLoopParametricTiling.cpp -- Parametric Affine loop tiling pass =// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a test pass to test parametric tiling of perfectly +// nested affine for loops. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Analysis/AffineAnalysis.h" +#include "mlir/Analysis/AffineStructures.h" +#include "mlir/Analysis/LoopAnalysis.h" +#include "mlir/Analysis/Utils.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineValueMap.h" +#include "mlir/Dialect/Affine/Passes.h" +#include "mlir/IR/Block.h" +#include "mlir/IR/Builders.h" +#include "mlir/Transforms/LoopUtils.h" +#include "llvm/Support/Debug.h" + +using namespace mlir; + +#define DEBUG_TYPE "test-affine-parametric-tile" + +namespace { + +struct TestAffineLoopParametricTiling + : public PassWrapper { + void runOnFunction() override; +}; +} // end anonymous namespace + +/// Checks if the function enclosing the loop nest has any arguments passed to +/// it, which can be used as tiling parameters. Assumes that atleast 'n' +/// arguments are passed, where 'n' is the number of loops in the loop nest. +static void checkIfTilingParametersExist(ArrayRef band) { + assert(!band.empty() && "no loops in input band"); + AffineForOp topLoop = band[0]; + + if (FuncOp funcOp = dyn_cast(topLoop.getParentOp())) + assert(funcOp.getNumArguments() >= band.size() && "Too few tile sizes"); +} + +/// Set lower and upper bounds of intra-tile loops for parametric tiling. +// TODO: Handle non-constant lower bounds. +static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop, + AffineForOp newInterTileLoop, + AffineForOp newIntraTileLoop, + Value tileSize) { + // The lower bound for the intra-tile loop is represented by an affine map + // as (%i, %t0)->((%i - %origlb) * %t0 + %origlb). Similarly, the upper bound + // for the intra-tile loop is represented by an affine map as (%i, %t0)->((%i + // - %origlb) * %t0) + (%t0 * %origLoopStep) + %origlb), where %i is loop IV + // of the corresponding inter-tile loop, %t0 is the corresponding tiling + // parameter, %origlb is lower bound and %origLoopStep is the loop step of the + // corresponding inter-tile loop. + + assert(origLoop.hasConstantLowerBound() && + "expected input loops to have constant lower bound."); + + AffineExpr origLowerBoundExpr; + // Get lower bound of original loop as an affine expression. + origLowerBoundExpr = + b.getAffineConstantExpr(origLoop.getConstantLowerBound()); + + // Add dim operands from original lower/upper bound. + SmallVector lbOperands, ubOperands; + AffineBound lb = origLoop.getLowerBound(); + AffineBound ub = origLoop.getUpperBound(); + lbOperands.reserve(lb.getNumOperands() + 2); + ubOperands.reserve(ub.getNumOperands() + 2); + AffineMap origLbMap = lb.getMap(); + AffineMap origUbMap = ub.getMap(); + for (unsigned j = 0, e = origLbMap.getNumDims(); j < e; ++j) + lbOperands.push_back(lb.getOperand(j)); + for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j) + ubOperands.push_back(ub.getOperand(j)); + + // Add a new dim operand in lb/ubOperands corresponding to the origLoop + // IV. + lbOperands.push_back(newInterTileLoop.getInductionVar()); + ubOperands.push_back(newInterTileLoop.getInductionVar()); + + // Get loop IV as an affine expression for lower/upper bound. Size of + // lb/ubOperands is guaranteed to be atleast one. + AffineExpr lbLoopIvExpr = b.getAffineDimExpr(lbOperands.size() - 1); + AffineExpr ubLoopIvExpr = b.getAffineDimExpr(ubOperands.size() - 1); + + // Add symbol operands from original lower/upper bound. + for (unsigned j = 0, e = origLbMap.getNumSymbols(); j < e; ++j) + lbOperands.push_back(lb.getOperand(origLbMap.getNumDims() + j)); + for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j) + ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j)); + + // Add a new symbol operand which is the tile size for this loop. + lbOperands.push_back(tileSize); + ubOperands.push_back(tileSize); + + SmallVector lbBoundExprs; + SmallVector ubBoundExprs; + lbBoundExprs.reserve(origLbMap.getNumResults()); + ubBoundExprs.reserve(origUbMap.getNumResults()); + + // Get tiling parameter as an affine expression for lb/ub. + AffineExpr lbTileParameter = b.getAffineSymbolExpr(origLbMap.getNumSymbols()); + AffineExpr ubTileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols()); + + // Insert lb as inter-tile ((loop IV - origlb) * tilingParameter) + origlb. + lbBoundExprs.push_back( + ((lbLoopIvExpr - origLowerBoundExpr) * lbTileParameter) + + origLowerBoundExpr); + + // Get the origLoopStep as an affine expression. + AffineExpr origLoopStep = b.getAffineConstantExpr(origLoop.getStep()); + + // Insert ub as inter-tile ((loop IV - origlb) * tilingParameter) + + // (tilingParameter * origLoopStep) + origlb. + ubBoundExprs.push_back( + ((ubLoopIvExpr - origLowerBoundExpr) * ubTileParameter) + + (ubTileParameter * origLoopStep) + origLowerBoundExpr); + + ubBoundExprs.append(origUbMap.getResults().begin(), + origUbMap.getResults().end()); + + AffineMap lbMap = + AffineMap::get(origLbMap.getNumDims() + 1, origLbMap.getNumSymbols() + 1, + lbBoundExprs, b.getContext()); + newIntraTileLoop.setLowerBound(lbOperands, lbMap); + + AffineMap ubMap = + AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols() + 1, + ubBoundExprs, b.getContext()); + newIntraTileLoop.setUpperBound(ubOperands, ubMap); + + // Original loop step must be preserved. + newIntraTileLoop.setStep(origLoop.getStep()); +} + +/// Set lower and upper bounds of inter-tile loops for parametric tiling. +// TODO: Handle non-constant lower bounds. +static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop, + AffineForOp newLoop, Value tileSize) { + OperandRange newLbOperands = origLoop.getLowerBoundOperands(); + + // The lower bound for inter-tile loops are same as the lower bounds of + // original loops. + newLoop.setLowerBound(newLbOperands, origLoop.getLowerBoundMap()); + + // The new upper bound map for inter-tile loops, assuming constant lower + // bounds, are now originalLowerBound + ceildiv((orignalUpperBound - + // originalLowerBound), tiling paramter); where tiling parameter is the + // respective tile size for that loop. For e.g. if the original ubmap was + // ()->(1024), the new map will be + // ()[s0]->(ceildiv((1024 -lb) % s0)), where s0 is the tiling parameter. + // Therefore a new symbol operand is inserted in the map and the result + // expression is overwritten. + + assert(origLoop.hasConstantLowerBound() && + "expected input loops to have constant lower bound."); + + AffineExpr origLowerBoundExpr; + + // Get lower bound of original loop as an affine expression. + origLowerBoundExpr = + b.getAffineConstantExpr(origLoop.getConstantLowerBound()); + + // Add dim operands from original upper bound. + SmallVector ubOperands; + AffineBound ub = origLoop.getUpperBound(); + ubOperands.reserve(ub.getNumOperands() + 1); + AffineMap origUbMap = ub.getMap(); + for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j) + ubOperands.push_back(ub.getOperand(j)); + + // Add symbol operands from original upper bound. + for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j) + ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j)); + + // Add a new symbol operand which is the tile size for this loop. + ubOperands.push_back(tileSize); + + // Get tiling parameter as an affine expression. + AffineExpr tileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols()); + + SmallVector boundExprs; + boundExprs.reserve(origUbMap.getNumResults()); + int64_t origUpperBound; + AffineExpr origUpperBoundExpr; + + // If upper bound for the original loop is constant, then the constant can + // be obtained as an affine expression straight away. + if (origLoop.hasConstantUpperBound()) { + origUpperBound = origLoop.getConstantUpperBound(); + + // Get original constant upper bound as an affine expression. + origUpperBoundExpr = b.getAffineConstantExpr(origUpperBound); + + // Insert the bound as originalLowerBoundceildiv((originalUpperBound - + // originalLowerBound), tilingParameter). + boundExprs.push_back( + origLowerBoundExpr + + (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter)); + } else { + // If upper bound for the original loop is not constant then two cases + // are possible. 1.) The result of ubmap has only one result expression. + // For e.g. + // affine.for %i = 5 to %ub + // + // A symbol operand is added which represents the tiling paramater. The + // new loop bounds here will be like ()[s0, s1] -> ((s0 - 5) ceildiv s1) + // where 's0' is the original upper bound and 's1' is the tiling + // parameter. 2.) When ubMap has more than one result expression. For e.g. + // #map0 = affine_map<()[s0, s1] -> (s0, s1) + // affine.for %i = 5 to min #map0()[%s0, %s1] + // + // A symbol operand is added which represents the tiling parameter. The + // new loop bounds will be like ()[s0, s1, s2] -> ((s0 - 5) ceildiv s2, + // (s1 -5) ceildiv s2), where s2 is the tiling parameter. + + // Insert the bounds as originalLowerBound + ceildiv((originalUpperBound - + // originalLowerBound), tilingParameter). + for (AffineExpr origUpperBoundExpr : origUbMap.getResults()) + boundExprs.push_back( + origLowerBoundExpr + + (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter)); + } + + AffineMap ubMap = + AffineMap::get(origUbMap.getNumDims(), origUbMap.getNumSymbols() + 1, + boundExprs, b.getContext()); + newLoop.setUpperBound(ubOperands, ubMap); + + // Original loop step must be preserved. + newLoop.setStep(origLoop.getStep()); +} + +/// Captures tiling parameters, which are expected to be passed as arguments +/// to the function enclosing the loop nest. Also checks if the required +/// parameters are of index type. This approach is temporary for testing +/// purposes. +// TODO: Find a generic way to capture way to capture tiling parameters. +static void getTilingParameters(ArrayRef band, + SmallVectorImpl &tilingParameters) { + AffineForOp topLoop = band[0]; + Region *funcOpRegion = topLoop.getParentRegion(); + unsigned nestDepth = band.size(); + + for (BlockArgument blockArgument : + funcOpRegion->getArguments().take_front(nestDepth)) { + if (blockArgument.getArgNumber() < nestDepth) { + assert(blockArgument.getType().isIndex() && + "expected tiling parameters to be of index type."); + tilingParameters.push_back(blockArgument); + } + } +} + +void TestAffineLoopParametricTiling::runOnFunction() { + // Constructs and sets new loop bounds after tiling for the case of + // hyper-rectangular index sets, where the bounds of one dimension do not + // depend on other dimensions and tiling parameters are captured from SSA + // values. Bounds of each dimension can thus be treated independently, + // and deriving the new bounds is much simpler and faster than for the case of + // tiling arbitrary polyhedral shapes. + auto constructParametricallyTiledIndexSetHyperRect = + [](MutableArrayRef &origLoops, + MutableArrayRef newLoops, ArrayRef &tileSizes) { + assert(!origLoops.empty() && "expected atleast one loop in band"); + assert(origLoops.size() == tileSizes.size() && + "expected tiling parameter for each loop in band."); + + OpBuilder b(origLoops[0].getOperation()); + unsigned width = origLoops.size(); + + // Set bounds for tile space loops. + for (unsigned i = 0; i < width; ++i) { + setInterTileBoundsParametric(b, origLoops[i], newLoops[i], + tileSizes[i]); + } + + // Set bounds for intra-tile loops. + for (unsigned i = 0; i < width; ++i) { + setIntraTileBoundsParametric(b, origLoops[i], newLoops[i], + newLoops[i + width], tileSizes[i]); + } + }; + + // Bands of loops to tile. + std::vector> bands; + getTileableBands(getFunction(), &bands); + + // Tile each band. + for (SmallVectorImpl &band : bands) { + // Capture the tiling parameters from the arguments to the function + // enclosing this loop nest. + SmallVector tiledNest; + SmallVector tilingParameters; + // Check if tiling parameters are present. + checkIfTilingParametersExist(band); + + // Get function arguments as tiling parameters. + getTilingParameters(band, tilingParameters); + + if (failed(tilePerfectlyNestedParametric( + band, tilingParameters, tiledNest, + constructParametricallyTiledIndexSetHyperRect))) + return signalPassFailure(); + } +} + +namespace mlir { +void registerTestAffineLoopParametricTilingPass() { + PassRegistration( + "test-affine-parametric-tile", + "Tile affine loops using SSA values as tile sizes"); +} +} // namespace mlir diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp --- a/mlir/tools/mlir-opt/mlir-opt.cpp +++ b/mlir/tools/mlir-opt/mlir-opt.cpp @@ -40,6 +40,7 @@ void registerSimpleParametricTilingPass(); void registerSymbolTestPasses(); void registerTestAffineDataCopyPass(); +void registerTestAffineLoopParametricTilingPass(); void registerTestAffineLoopUnswitchingPass(); void registerTestAllReduceLoweringPass(); void registerTestBufferPlacementPreparationPass(); @@ -102,6 +103,7 @@ #if MLIR_ROCM_CONVERSIONS_ENABLED registerTestConvertGPUKernelToHsacoPass(); #endif + registerTestAffineLoopParametricTilingPass(); registerTestBufferPlacementPreparationPass(); registerTestDominancePass(); registerTestFunc();