diff --git a/mlir/include/mlir/Transforms/LoopFusionUtils.h b/mlir/include/mlir/Dialect/Affine/LoopFusionUtils.h
rename from mlir/include/mlir/Transforms/LoopFusionUtils.h
rename to mlir/include/mlir/Dialect/Affine/LoopFusionUtils.h
--- a/mlir/include/mlir/Transforms/LoopFusionUtils.h
+++ b/mlir/include/mlir/Dialect/Affine/LoopFusionUtils.h
@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MLIR_TRANSFORMS_LOOPFUSIONUTILS_H
-#define MLIR_TRANSFORMS_LOOPFUSIONUTILS_H
+#ifndef MLIR_DIALECT_AFFINE_LOOPFUSIONUTILS_H
+#define MLIR_DIALECT_AFFINE_LOOPFUSIONUTILS_H
 
 #include "mlir/IR/Value.h"
 #include "mlir/Support/LLVM.h"
@@ -167,4 +167,4 @@
                                    DenseSet<Value> &producerConsumerMemrefs);
 } // namespace mlir
 
-#endif // MLIR_TRANSFORMS_LOOPFUSIONUTILS_H
+#endif // MLIR_DIALECT_AFFINE_LOOPFUSIONUTILS_H
diff --git a/mlir/include/mlir/Transforms/LoopUtils.h b/mlir/include/mlir/Dialect/Affine/LoopUtils.h
rename from mlir/include/mlir/Transforms/LoopUtils.h
rename to mlir/include/mlir/Dialect/Affine/LoopUtils.h
--- a/mlir/include/mlir/Transforms/LoopUtils.h
+++ b/mlir/include/mlir/Dialect/Affine/LoopUtils.h
@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MLIR_TRANSFORMS_LOOPUTILS_H
-#define MLIR_TRANSFORMS_LOOPUTILS_H
+#ifndef MLIR_DIALECT_AFFINE_LOOPUTILS_H
+#define MLIR_DIALECT_AFFINE_LOOPUTILS_H
 
 #include "mlir/IR/Block.h"
 #include "mlir/Support/LLVM.h"
@@ -45,9 +45,6 @@
 LogicalResult loopUnrollByFactor(
     AffineForOp forOp, uint64_t unrollFactor,
     function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn = nullptr);
-LogicalResult loopUnrollByFactor(
-    scf::ForOp forOp, uint64_t unrollFactor,
-    function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn = nullptr);
 
 /// Unrolls this loop by the specified unroll factor or its trip count,
 /// whichever is lower.
@@ -63,8 +60,6 @@
 /// AffineForOp, and the second op is a terminator).
 void getPerfectlyNestedLoops(SmallVectorImpl<AffineForOp> &nestedLoops,
                              AffineForOp root);
-void getPerfectlyNestedLoops(SmallVectorImpl<scf::ForOp> &nestedLoops,
-                             scf::ForOp root);
 
 /// Unrolls and jams this loop by the specified factor. `forOp` can be a loop
 /// with iteration arguments performing supported reductions and its inner loops
@@ -78,10 +73,9 @@
 LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp,
                                       uint64_t unrollJamFactor);
 
-/// Promotes the loop body of a AffineForOp/scf::ForOp to its containing block
-/// if the loop was known to have a single iteration.
+/// Promotes the loop body of an AffineForOp to its containing block if the
+/// loop was known to have a single iteration.
 LogicalResult promoteIfSingleIteration(AffineForOp forOp);
-LogicalResult promoteIfSingleIteration(scf::ForOp forOp);
 
 /// Promotes all single iteration AffineForOp's in the Function, i.e., moves
 /// their body into the containing Block.
@@ -146,13 +140,9 @@
 /// occurrence in `forOps`, under each of the `targets`.
 /// Returns the new AffineForOps, one per each of (`forOps`, `targets`) pair,
 /// nested immediately under each of `targets`.
-using Loops = SmallVector<scf::ForOp, 8>;
-using TileLoops = std::pair<Loops, Loops>;
 SmallVector<SmallVector<AffineForOp, 8>, 8> tile(ArrayRef<AffineForOp> forOps,
                                                  ArrayRef<uint64_t> sizes,
                                                  ArrayRef<AffineForOp> targets);
-SmallVector<Loops, 8> tile(ArrayRef<scf::ForOp> forOps, ArrayRef<Value> sizes,
-                           ArrayRef<scf::ForOp> targets);
 
 /// Performs tiling (with interchange) by strip-mining the `forOps` by `sizes`
 /// and sinking them, in their order of occurrence in `forOps`, under `target`.
@@ -160,15 +150,6 @@
 /// `target`.
 SmallVector<AffineForOp, 8> tile(ArrayRef<AffineForOp> forOps,
                                  ArrayRef<uint64_t> sizes, AffineForOp target);
-Loops tile(ArrayRef<scf::ForOp> forOps, ArrayRef<Value> sizes,
-           scf::ForOp target);
-
-/// Tile a nest of scf::ForOp loops rooted at `rootForOp` with the given
-/// (parametric) sizes. Sizes are expected to be strictly positive values at
-/// runtime. If more sizes than loops are provided, discard the trailing values
-/// in sizes. Assumes the loop nest is permutable.
-/// Returns the newly created intra-tile loops.
-Loops tilePerfectlyNested(scf::ForOp rootForOp, ArrayRef<Value> sizes);
 
 /// Explicit copy / DMA generation options for mlir::affineDataCopyGenerate.
 struct AffineCopyOptions {
@@ -236,16 +217,6 @@
                                 const AffineCopyOptions &copyOptions,
                                 CopyGenerateResult &result);
 
-/// Tile a nest of standard for loops rooted at `rootForOp` by finding such
-/// parametric tile sizes that the outer loops have a fixed number of iterations
-/// as defined in `sizes`.
-TileLoops extractFixedOuterLoops(scf::ForOp rootFOrOp, ArrayRef<int64_t> sizes);
-
-/// Replace a perfect nest of "for" loops with a single linearized loop. Assumes
-/// `loops` contains a list of perfectly nested loops with bounds and steps
-/// independent of any loop induction variable involved in the nest.
-void coalesceLoops(MutableArrayRef<scf::ForOp> loops);
-
 /// Replace a perfect nest of "for" loops with a single linearized loop. Assumes
 /// `loops` contains a list of perfectly nested loops outermost to innermost
 /// that are normalized (step one and lower bound of zero) and with bounds and
@@ -254,12 +225,6 @@
 /// be representable using affine.for.
 LogicalResult coalesceLoops(MutableArrayRef<AffineForOp> loops);
 
-/// Take the ParallelLoop and for each set of dimension indices, combine them
-/// into a single dimension. combinedDimensions must contain each index into
-/// loops exactly once.
-void collapseParallelLoops(scf::ParallelOp loops,
-                           ArrayRef<std::vector<unsigned>> combinedDimensions);
-
 /// Maps `forOp` for execution on a parallel grid of virtual `processorIds` of
 /// size given by `numProcessors`. This is achieved by embedding the SSA values
 /// corresponding to `processorIds` and `numProcessors` into the bounds and step
@@ -321,9 +286,6 @@
 separateFullTiles(MutableArrayRef<AffineForOp> nest,
                   SmallVectorImpl<AffineForOp> *fullTileNest = nullptr);
 
-/// Move loop invariant code out of `looplike`.
-LogicalResult moveLoopInvariantCode(LoopLikeOpInterface looplike);
-
 } // namespace mlir
 
-#endif // MLIR_TRANSFORMS_LOOPUTILS_H
+#endif // MLIR_DIALECT_AFFINE_LOOPUTILS_H
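The affine overloads that stay behind in this header keep their pre-move signatures, so callers only need an include-path update. As a minimal sketch (not part of the patch; the function name and unroll factor are illustrative), the relocated utilities compose like this:

```cpp
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/IR/BuiltinOps.h"

// Unroll the innermost loop of each top-level affine.for nest by 4, tagging
// every unrolled copy with the iteration it came from.
static void unrollInnermostLoops(mlir::FuncOp funcOp) {
  for (auto root : funcOp.getBody().getOps<mlir::AffineForOp>()) {
    llvm::SmallVector<mlir::AffineForOp, 4> nest;
    mlir::getPerfectlyNestedLoops(nest, root);
    (void)mlir::loopUnrollByFactor(
        nest.back(), /*unrollFactor=*/4,
        [](unsigned iteration, mlir::Operation *op, mlir::OpBuilder b) {
          op->setAttr("unrolled_iteration", b.getUI32IntegerAttr(iteration));
        });
  }
}
```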
diff --git a/mlir/include/mlir/Dialect/Affine/Passes.h b/mlir/include/mlir/Dialect/Affine/Passes.h
--- a/mlir/include/mlir/Dialect/Affine/Passes.h
+++ b/mlir/include/mlir/Dialect/Affine/Passes.h
@@ -21,6 +21,10 @@
 
 class AffineForOp;
 
+/// Fusion mode to attempt. The default mode `Greedy` does both
+/// producer-consumer and sibling fusion.
+enum FusionMode { Greedy, ProducerConsumer, Sibling };
+
 /// Creates a simplification pass for affine structures (maps and sets). In
 /// addition, this pass also normalizes memrefs to have the trivial (identity)
 /// layout map.
@@ -53,6 +57,19 @@
 /// dead allocs.
 std::unique_ptr<OperationPass<FuncOp>> createAffineScalarReplacementPass();
 
+/// Creates a pass that transforms perfectly nested loops with independent
+/// bounds into a single loop.
+std::unique_ptr<OperationPass<FuncOp>> createLoopCoalescingPass();
+
+/// Creates a loop fusion pass which fuses loops according to the type of
+/// fusion specified in `fusionMode`. Buffers of size less than or equal to
+/// `localBufSizeThreshold` are promoted to memory space `fastMemorySpace`.
+std::unique_ptr<OperationPass<FuncOp>>
+createLoopFusionPass(unsigned fastMemorySpace = 0,
+                     uint64_t localBufSizeThreshold = 0,
+                     bool maximalFusion = false,
+                     enum FusionMode fusionMode = FusionMode::Greedy);
+
 /// Creates a pass to perform tiling on loop nests.
 std::unique_ptr<OperationPass<FuncOp>>
 createLoopTilingPass(uint64_t cacheSizeBytes);
@@ -76,6 +93,10 @@
 std::unique_ptr<OperationPass<FuncOp>>
 createLoopUnrollAndJamPass(int unrollJamFactor = -1);
 
+/// Creates a pass to pipeline explicit movement of data across levels of the
+/// memory hierarchy.
+std::unique_ptr<OperationPass<FuncOp>> createPipelineDataTransferPass();
+
 /// Creates a pass to vectorize loops, operations and data types using a
 /// target-independent, n-D super-vector abstraction.
 std::unique_ptr<OperationPass<FuncOp>>
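Since the create* entry points keep their signatures across the move, re-registering them is mechanical. A hedged sketch of a pipeline built from the new header (pass ordering and option values are illustrative only):

```cpp
#include "mlir/Dialect/Affine/Passes.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/Pass/PassManager.h"

static void buildAffinePipeline(mlir::PassManager &pm) {
  // Producer-consumer fusion only; promote buffers of up to 1 KiB to memory
  // space 1 (both values are illustrative).
  pm.addNestedPass<mlir::FuncOp>(mlir::createLoopFusionPass(
      /*fastMemorySpace=*/1, /*localBufSizeThreshold=*/1024,
      /*maximalFusion=*/false, mlir::FusionMode::ProducerConsumer));
  pm.addNestedPass<mlir::FuncOp>(mlir::createPipelineDataTransferPass());
  pm.addNestedPass<mlir::FuncOp>(mlir::createLoopCoalescingPass());
}
```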
diff --git a/mlir/include/mlir/Dialect/Affine/Passes.td b/mlir/include/mlir/Dialect/Affine/Passes.td
--- a/mlir/include/mlir/Dialect/Affine/Passes.td
+++ b/mlir/include/mlir/Dialect/Affine/Passes.td
@@ -43,6 +43,138 @@
   ];
 }
 
+def AffineLoopFusion : Pass<"affine-loop-fusion", "FuncOp"> {
+  let summary = "Fuse affine loop nests";
+  let description = [{
+    This pass performs fusion of loop nests using a slicing-based approach. It
+    combines two fusion strategies: producer-consumer fusion and sibling fusion.
+    Producer-consumer fusion is aimed at fusing pairs of loops where the first
+    one writes to a memref that the second reads. Sibling fusion targets pairs
+    of loops that share no dependences between them but that load from the same
+    memref. The fused loop nests, when possible, are rewritten to access
+    significantly smaller local buffers instead of the original memref's, and
+    the latter are often either completely optimized away or contracted. This
+    transformation leads to enhanced locality and lower memory footprint through
+    the elimination or contraction of temporaries/intermediate memref's. These
+    benefits are sometimes achieved at the expense of redundant computation
+    through a cost model that evaluates available choices such as the depth at
+    which a source slice should be materialized in the destination slice.
+
+    Example 1: Producer-consumer fusion.
+    Input:
+    ```mlir
+    func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
+      %0 = memref.alloc() : memref<10xf32>
+      %1 = memref.alloc() : memref<10xf32>
+      %cst = arith.constant 0.000000e+00 : f32
+      affine.for %arg2 = 0 to 10 {
+        affine.store %cst, %0[%arg2] : memref<10xf32>
+        affine.store %cst, %1[%arg2] : memref<10xf32>
+      }
+      affine.for %arg2 = 0 to 10 {
+        %2 = affine.load %0[%arg2] : memref<10xf32>
+        %3 = arith.addf %2, %2 : f32
+        affine.store %3, %arg0[%arg2] : memref<10xf32>
+      }
+      affine.for %arg2 = 0 to 10 {
+        %2 = affine.load %1[%arg2] : memref<10xf32>
+        %3 = arith.mulf %2, %2 : f32
+        affine.store %3, %arg1[%arg2] : memref<10xf32>
+      }
+      return
+    }
+    ```
+    Output:
+    ```mlir
+    func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
+      %0 = memref.alloc() : memref<1xf32>
+      %1 = memref.alloc() : memref<1xf32>
+      %cst = arith.constant 0.000000e+00 : f32
+      affine.for %arg2 = 0 to 10 {
+        affine.store %cst, %0[0] : memref<1xf32>
+        affine.store %cst, %1[0] : memref<1xf32>
+        %2 = affine.load %1[0] : memref<1xf32>
+        %3 = arith.mulf %2, %2 : f32
+        affine.store %3, %arg1[%arg2] : memref<10xf32>
+        %4 = affine.load %0[0] : memref<1xf32>
+        %5 = arith.addf %4, %4 : f32
+        affine.store %5, %arg0[%arg2] : memref<10xf32>
+      }
+      return
+    }
+    ```
+
+    Example 2: Sibling fusion.
+    Input:
+    ```mlir
+    func @sibling_fusion(%arg0: memref<10x10xf32>, %arg1: memref<10x10xf32>,
+                         %arg2: memref<10x10xf32>, %arg3: memref<10x10xf32>,
+                         %arg4: memref<10x10xf32>) {
+      affine.for %arg5 = 0 to 3 {
+        affine.for %arg6 = 0 to 3 {
+          %0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
+          %1 = affine.load %arg1[%arg5, %arg6] : memref<10x10xf32>
+          %2 = arith.mulf %0, %1 : f32
+          affine.store %2, %arg3[%arg5, %arg6] : memref<10x10xf32>
+        }
+      }
+      affine.for %arg5 = 0 to 3 {
+        affine.for %arg6 = 0 to 3 {
+          %0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
+          %1 = affine.load %arg2[%arg5, %arg6] : memref<10x10xf32>
+          %2 = arith.addf %0, %1 : f32
+          affine.store %2, %arg4[%arg5, %arg6] : memref<10x10xf32>
+        }
+      }
+      return
+    }
+    ```
+    Output:
+    ```mlir
+    func @sibling_fusion(%arg0: memref<10x10xf32>, %arg1: memref<10x10xf32>,
+                         %arg2: memref<10x10xf32>, %arg3: memref<10x10xf32>,
+                         %arg4: memref<10x10xf32>) {
+      affine.for %arg5 = 0 to 3 {
+        affine.for %arg6 = 0 to 3 {
+          %0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
+          %1 = affine.load %arg1[%arg5, %arg6] : memref<10x10xf32>
+          %2 = arith.mulf %0, %1 : f32
+          affine.store %2, %arg3[%arg5, %arg6] : memref<10x10xf32>
+          %3 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
+          %4 = affine.load %arg2[%arg5, %arg6] : memref<10x10xf32>
+          %5 = arith.addf %3, %4 : f32
+          affine.store %5, %arg4[%arg5, %arg6] : memref<10x10xf32>
+        }
+      }
+      return
+    }
+    ```
+  }];
+  let constructor = "mlir::createLoopFusionPass()";
+  let options = [
+    Option<"computeToleranceThreshold", "fusion-compute-tolerance", "double",
+           /*default=*/"0.30f", "Fractional increase in additional computation "
+                                "tolerated while fusing">,
+    Option<"fastMemorySpace", "fusion-fast-mem-space", "unsigned",
+           /*default=*/"0",
+           "Faster memory space number to promote fusion buffers to">,
+    Option<"localBufSizeThreshold", "fusion-local-buf-threshold", "uint64_t",
+           /*default=*/"0", "Threshold size (KiB) for promoting local buffers "
+                            "to fast memory space">,
+    Option<"maximalFusion", "fusion-maximal", "bool", /*default=*/"false",
+           "Enables maximal loop fusion">,
+    Option<"affineFusionMode", "mode", "enum FusionMode",
+           "mlir::FusionMode::Greedy", "fusion mode to attempt",
+           "llvm::cl::values(clEnumValN(mlir::FusionMode::Greedy,"
+           " \"greedy\", \"Perform greedy (both producer-consumer and sibling) fusion\"), "
+           "clEnumValN( mlir::FusionMode::ProducerConsumer, "
+           "\"producer\", \"Perform only producer-consumer fusion\"), "
+           "clEnumValN( mlir::FusionMode::Sibling, "
+           "\"sibling\", \"Perform only sibling fusion\"))">,
+  ];
+  let dependentDialects = ["memref::MemRefDialect"];
+}
+
 def AffineLoopInvariantCodeMotion
     : Pass<"affine-loop-invariant-code-motion", "FuncOp"> {
   let summary = "Hoist loop invariant instructions outside of affine loops";
@@ -94,6 +226,75 @@
   ];
 }
 
+def AffinePipelineDataTransfer
+    : Pass<"affine-pipeline-data-transfer", "FuncOp"> {
+  let summary = "Pipeline non-blocking data transfers between explicitly "
+                "managed levels of the memory hierarchy";
+  let description = [{
+    This pass performs a transformation to overlap non-blocking DMA operations
+    in a loop with computations through double buffering. This is achieved by
+    advancing dma_start operations with respect to other operations.
+
+    Input
+
+    ```mlir
+    func @pipelinedatatransfer() {
+      %0 = memref.alloc() : memref<256xf32>
+      %1 = memref.alloc() : memref<32xf32, 1>
+      %2 = memref.alloc() : memref<1xf32>
+      %c0 = arith.constant 0 : index
+      %c128 = arith.constant 128 : index
+      affine.for %i0 = 0 to 8 {
+        affine.dma_start %0[%i0], %1[%i0], %2[%c0], %c128 : memref<256xf32>, memref<32xf32, 1>, memref<1xf32>
+        affine.dma_wait %2[%c0], %c128 : memref<1xf32>
+        %3 = affine.load %1[%i0] : memref<32xf32, 1>
+        %4 = "compute"(%3) : (f32) -> f32
+        affine.store %4, %1[%i0] : memref<32xf32, 1>
+      }
+      return
+    }
+    ```
+
+    Output
+
+    ```mlir
+    module {
+      func @pipelinedatatransfer() {
+        %c8 = arith.constant 8 : index
+        %c0 = arith.constant 0 : index
+        %0 = memref.alloc() : memref<256xf32>
+        %c0_0 = arith.constant 0 : index
+        %c128 = arith.constant 128 : index
+        %1 = memref.alloc() : memref<2x32xf32, 1>
+        %2 = memref.alloc() : memref<2x1xf32>
+        affine.dma_start %0[%c0], %1[%c0 mod 2, %c0], %2[%c0 mod 2, symbol(%c0_0)], %c128 : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
+        affine.for %arg0 = 1 to 8 {
+          affine.dma_start %0[%arg0], %1[%arg0 mod 2, %arg0], %2[%arg0 mod 2, symbol(%c0_0)], %c128 : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
+          %8 = affine.apply #map3(%arg0)
+          %9 = affine.apply #map4(%8)
+          %10 = affine.apply #map4(%8)
+          affine.dma_wait %2[%8 mod 2, symbol(%c0_0)], %c128 : memref<2x1xf32>
+          %11 = affine.load %1[%8 mod 2, %8] : memref<2x32xf32, 1>
+          %12 = "compute"(%11) : (f32) -> f32
+          affine.store %12, %1[%8 mod 2, %8] : memref<2x32xf32, 1>
+        }
+        %3 = affine.apply #map3(%c8)
+        %4 = affine.apply #map4(%3)
+        %5 = affine.apply #map4(%3)
+        affine.dma_wait %2[%3 mod 2, symbol(%c0_0)], %c128 : memref<2x1xf32>
+        %6 = affine.load %1[%3 mod 2, %3] : memref<2x32xf32, 1>
+        %7 = "compute"(%6) : (f32) -> f32
+        affine.store %7, %1[%3 mod 2, %3] : memref<2x32xf32, 1>
+        memref.dealloc %2 : memref<2x1xf32>
+        memref.dealloc %1 : memref<2x32xf32, 1>
+        return
+      }
+    }
+    ```
+  }];
+  let constructor = "mlir::createPipelineDataTransferPass()";
+}
+
 def AffineScalarReplacement : Pass<"affine-scalrep", "FuncOp"> {
   let summary = "Replace affine memref acceses by scalars by forwarding stores "
                 "to loads and eliminating redundant loads";
@@ -184,6 +385,13 @@
   let constructor = "mlir::createAffineLoopNormalizePass()";
 }
 
+def LoopCoalescing : Pass<"loop-coalescing", "FuncOp"> {
+  let summary = "Coalesce nested loops with independent bounds into a single "
+                "loop";
+  let constructor = "mlir::createLoopCoalescingPass()";
+  let dependentDialects = ["arith::ArithmeticDialect"];
+}
+
 def SimplifyAffineStructures : Pass<"simplify-affine-structures", "FuncOp"> {
   let summary = "Simplify affine expressions in maps/sets and normalize "
                 "memrefs";
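The option names declared above keep their textual spellings, so the relocated passes stay reachable from a textual pipeline as well. A sketch, assuming the `builtin.func` anchor that FuncOp passes used at this point in the codebase:

```cpp
#include "mlir/Pass/PassManager.h"
#include "mlir/Pass/PassRegistry.h"

static mlir::LogicalResult addFusionFromText(mlir::OpPassManager &pm) {
  // `mode` and `fusion-maximal` are the option spellings declared in the
  // AffineLoopFusion definition above.
  return mlir::parsePassPipeline(
      "builtin.func(affine-loop-fusion{mode=producer fusion-maximal=true})",
      pm);
}
```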
diff --git a/mlir/include/mlir/Dialect/Affine/Utils.h b/mlir/include/mlir/Dialect/Affine/Utils.h
--- a/mlir/include/mlir/Dialect/Affine/Utils.h
+++ b/mlir/include/mlir/Dialect/Affine/Utils.h
@@ -24,6 +24,10 @@
 class Operation;
 class PostDominanceInfo;
 
+namespace memref {
+class AllocOp;
+} // namespace memref
+
 struct LogicalResult;
 
 using ReductionLoopMap = DenseMap<Operation *, SmallVector<LoopReduction, 2>>;
@@ -168,6 +172,121 @@
 AffineExpr substWithMin(AffineExpr e, AffineExpr dim, AffineExpr min,
                         AffineExpr max, bool positivePath = true);
 
+/// Replaces all "dereferencing" uses of `oldMemRef` with `newMemRef` while
+/// optionally remapping the old memref's indices using the supplied affine
+/// map, `indexRemap`. The new memref could be of a different shape or rank.
+/// `extraIndices` provides any additional access indices to be added to the
+/// start.
+///
+/// `indexRemap` remaps indices of the old memref access to a new set of indices
+/// that are used to index the memref. Additional input operands to indexRemap
+/// can be optionally provided in `extraOperands`, and they occupy the start
+/// of its input list. `indexRemap`'s dimensional inputs are expected to
+/// correspond to the memref's indices, and its symbolic inputs if any should be
+/// provided in `symbolOperands`.
+///
+/// `domOpFilter`, if non-null, restricts the replacement to only those
+/// operations that are dominated by the former; similarly, `postDomOpFilter`
+/// restricts replacement to only those operations that are postdominated by it.
+///
+/// 'allowNonDereferencingOps', if set, allows replacement of non-dereferencing
+/// uses of a memref without any requirement for access index rewrites as long
+/// as the user operation has the MemRefsNormalizable trait. The default value
+/// of this flag is false.
+///
+/// 'replaceInDeallocOp', if set, lets DeallocOp, a non-dereferencing user,
+/// also be a candidate for replacement. The default value of this flag is
+/// false.
+///
+/// Returns true on success and false if the replacement is not possible,
+/// whenever a memref is used as an operand in a non-dereferencing context and
+/// 'allowNonDereferencingOps' is false, except for dealloc's on the memref
+/// which are left untouched. See comments at function definition for an
+/// example.
+//
+// Ex: to replace load %A[%i, %j] with load %Abuf[%t mod 2, %ii - %i, %j]:
+// The SSA value corresponding to '%t mod 2' should be in 'extraIndices', and
+// index remap will perform (%i, %j) -> (%ii - %i, %j), i.e., indexRemap = (d0,
+// d1, d2) -> (d0 - d1, d2), and %ii will be the extra operand. Without any
+// extra operands, note that 'indexRemap' would just be applied to existing
+// indices (%i, %j).
+// TODO: allow extraIndices to be added at any position.
+LogicalResult replaceAllMemRefUsesWith(
+    Value oldMemRef, Value newMemRef, ArrayRef<Value> extraIndices = {},
+    AffineMap indexRemap = AffineMap(), ArrayRef<Value> extraOperands = {},
+    ArrayRef<Value> symbolOperands = {}, Operation *domOpFilter = nullptr,
+    Operation *postDomOpFilter = nullptr, bool allowNonDereferencingOps = false,
+    bool replaceInDeallocOp = false);
+
+/// Performs the same replacement as the other version above but only for the
+/// dereferencing uses of `oldMemRef` in `op`, except in cases where
+/// 'allowNonDereferencingOps' is set to true where we replace the
+/// non-dereferencing uses as well.
+LogicalResult replaceAllMemRefUsesWith(Value oldMemRef, Value newMemRef,
+                                       Operation *op,
+                                       ArrayRef<Value> extraIndices = {},
+                                       AffineMap indexRemap = AffineMap(),
+                                       ArrayRef<Value> extraOperands = {},
+                                       ArrayRef<Value> symbolOperands = {},
+                                       bool allowNonDereferencingOps = false);
+
+/// Rewrites the memref defined by this alloc op to have an identity layout map
+/// and updates all its indexing uses. Returns failure if any of its uses
+/// escape (while leaving the IR in a valid state).
+LogicalResult normalizeMemRef(memref::AllocOp *op);
+
+/// Uses the old memref type map layout and computes the new memref type to
+/// have a new shape and a layout map, where the old layout map has been
+/// normalized to an identity layout map. It returns the old memref in case no
+/// normalization was needed or a failure occurs while transforming the old
+/// map layout to an identity layout map.
+MemRefType normalizeMemRefType(MemRefType memrefType, OpBuilder builder,
+                               unsigned numSymbolicOperands);
+
+/// Creates and inserts into 'builder' a new AffineApplyOp, with the number of
+/// its results equal to the number of operands, as a composition
+/// of all other AffineApplyOps reachable from input parameter 'operands'. If
+/// different operands were drawing results from multiple affine apply ops,
+/// these will also be collected into a single (multi-result) affine apply op.
+/// The final results of the composed AffineApplyOp are returned in output
+/// parameter 'results'. Returns the affine apply op created.
+Operation *createComposedAffineApplyOp(OpBuilder &builder, Location loc,
+                                       ArrayRef<Value> operands,
+                                       ArrayRef<AffineApplyOp> affineApplyOps,
+                                       SmallVectorImpl<Value> *results);
+
+/// Given an operation, inserts one or more single result affine apply
+/// operations, results of which are exclusively used by this operation.
+/// The operands of these newly created affine apply ops are
+/// guaranteed to be loop iterators or terminal symbols of a function.
+///
+/// Before
+///
+/// affine.for %i = 0 to #map(%N)
+///   %idx = affine.apply (d0) -> (d0 mod 2) (%i)
+///   send %A[%idx], ...
+///   %v = "compute"(%idx, ...)
+///
+/// After
+///
+/// affine.for %i = 0 to #map(%N)
+///   %idx = affine.apply (d0) -> (d0 mod 2) (%i)
+///   send %A[%idx], ...
+///   %idx_ = affine.apply (d0) -> (d0 mod 2) (%i)
+///   %v = "compute"(%idx_, ...)
+///
+/// This allows the application of different transformations on send and
+/// compute (e.g. different shifts/delays).
+///
+/// Fills `sliceOps` with the list of affine.apply operations.
+/// In the following cases, `sliceOps` remains empty:
+///   1. If none of opInst's operands were the result of an affine.apply
+///      (i.e., there was no affine computation slice to create).
+///   2. If all the affine.apply op's supplying operands to this opInst did not
+///      have any uses other than those in this opInst.
+void createAffineComputationSlice(Operation *opInst,
+                                  SmallVectorImpl<Operation *> *sliceOps);
+
 } // namespace mlir
 
 #endif // MLIR_DIALECT_AFFINE_UTILS_H
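The double-buffering example in the comment above translates directly into a call. A sketch, where `oldMemRef`, `newMemRef`, `tModTwo` (the SSA value for `%t mod 2`) and `ii` are assumed to be produced elsewhere:

```cpp
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"

static mlir::LogicalResult doubleBufferAccesses(mlir::MLIRContext *ctx,
                                                mlir::Value oldMemRef,
                                                mlir::Value newMemRef,
                                                mlir::Value tModTwo,
                                                mlir::Value ii) {
  // indexRemap = (d0, d1, d2) -> (d0 - d1, d2): d0 is the extra operand %ii;
  // d1 and d2 are the original access indices (%i, %j).
  mlir::AffineExpr d0 = mlir::getAffineDimExpr(0, ctx);
  mlir::AffineExpr d1 = mlir::getAffineDimExpr(1, ctx);
  mlir::AffineExpr d2 = mlir::getAffineDimExpr(2, ctx);
  mlir::AffineMap indexRemap = mlir::AffineMap::get(3, 0, {d0 - d1, d2}, ctx);
  // %t mod 2 becomes the leading index of every rewritten access.
  return mlir::replaceAllMemRefUsesWith(oldMemRef, newMemRef,
                                        /*extraIndices=*/{tModTwo}, indexRemap,
                                        /*extraOperands=*/{ii});
}
```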
diff --git a/mlir/include/mlir/Dialect/SCF/Passes.h b/mlir/include/mlir/Dialect/SCF/Passes.h
--- a/mlir/include/mlir/Dialect/SCF/Passes.h
+++ b/mlir/include/mlir/Dialect/SCF/Passes.h
@@ -32,6 +32,10 @@
 /// inside of scf.for loops with known lower and upper bounds.
 std::unique_ptr<Pass> createSCFForLoopCanonicalizationPass();
 
+/// Creates a pass that transforms a single ParallelLoop over N induction
+/// variables into another ParallelLoop over less than N induction variables.
+std::unique_ptr<Pass> createParallelLoopCollapsingPass();
+
 /// Creates a loop fusion pass which fuses parallel loops.
 std::unique_ptr<Pass> createParallelLoopFusionPass();
 
diff --git a/mlir/include/mlir/Dialect/SCF/Passes.td b/mlir/include/mlir/Dialect/SCF/Passes.td
--- a/mlir/include/mlir/Dialect/SCF/Passes.td
+++ b/mlir/include/mlir/Dialect/SCF/Passes.td
@@ -52,6 +52,22 @@
   let constructor = "mlir::createParallelLoopFusionPass()";
 }
 
+def SCFParallelLoopCollapsing : Pass<"parallel-loop-collapsing"> {
+  let summary = "Collapse parallel loops to use less induction variables";
+  let constructor = "mlir::createParallelLoopCollapsingPass()";
+  let options = [
+    ListOption<"clCollapsedIndices0", "collapsed-indices-0", "unsigned",
+               "Which loop indices to combine into the position 0 loop index",
+               "llvm::cl::MiscFlags::CommaSeparated">,
+    ListOption<"clCollapsedIndices1", "collapsed-indices-1", "unsigned",
+               "Which loop indices to combine into the position 1 loop index",
+               "llvm::cl::MiscFlags::CommaSeparated">,
+    ListOption<"clCollapsedIndices2", "collapsed-indices-2", "unsigned",
+               "Which loop indices to combine into the position 2 loop index",
+               "llvm::cl::MiscFlags::CommaSeparated">,
+  ];
+}
+
 def SCFParallelLoopSpecialization
     : Pass<"parallel-loop-specialization", "FuncOp"> {
   let summary = "Specialize parallel loops for vectorization";
diff --git a/mlir/include/mlir/Dialect/SCF/Utils.h b/mlir/include/mlir/Dialect/SCF/Utils.h
--- a/mlir/include/mlir/Dialect/SCF/Utils.h
+++ b/mlir/include/mlir/Dialect/SCF/Utils.h
@@ -98,5 +98,65 @@
                  SmallVectorImpl<Value> &symbols,
                  llvm::function_ref<bool(Operation *)> loopFilter = nullptr);
 
+/// Replace a perfect nest of "for" loops with a single linearized loop. Assumes
+/// `loops` contains a list of perfectly nested loops with bounds and steps
+/// independent of any loop induction variable involved in the nest.
+void coalesceLoops(MutableArrayRef<scf::ForOp> loops);
+
+/// Take the ParallelLoop and for each set of dimension indices, combine them
+/// into a single dimension. combinedDimensions must contain each index into
+/// loops exactly once.
+void collapseParallelLoops(scf::ParallelOp loops,
+                           ArrayRef<std::vector<unsigned>> combinedDimensions);
+
+/// Promotes the loop body of a scf::ForOp to its containing block if the loop
+/// was known to have a single iteration.
+LogicalResult promoteIfSingleIteration(scf::ForOp forOp);
+
+/// Unrolls this for operation by the specified unroll factor. Returns failure
+/// if the loop cannot be unrolled either due to restrictions or due to invalid
+/// unroll factors. Requires positive loop bounds and step. If specified,
+/// annotates the Ops in each unrolled iteration by applying `annotateFn`.
+LogicalResult loopUnrollByFactor(
+    scf::ForOp forOp, uint64_t unrollFactor,
+    function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn = nullptr);
+
+/// Tile a nest of standard for loops rooted at `rootForOp` by finding such
+/// parametric tile sizes that the outer loops have a fixed number of iterations
+/// as defined in `sizes`.
+using Loops = SmallVector<scf::ForOp, 8>;
+using TileLoops = std::pair<Loops, Loops>;
+TileLoops extractFixedOuterLoops(scf::ForOp rootFOrOp, ArrayRef<int64_t> sizes);
+
+/// Performs tiling of imperfectly nested loops (with interchange) by
+/// strip-mining the `forOps` by `sizes` and sinking them, in their order of
+/// occurrence in `forOps`, under each of the `targets`.
+/// Returns the new loops, one per each of (`forOps`, `targets`) pair,
+/// nested immediately under each of `targets`.
+SmallVector<Loops, 8> tile(ArrayRef<scf::ForOp> forOps, ArrayRef<Value> sizes,
+                           ArrayRef<scf::ForOp> targets);
+
+/// Performs tiling (with interchange) by strip-mining the `forOps` by `sizes`
+/// and sinking them, in their order of occurrence in `forOps`, under `target`.
+/// Returns the new loops, one per `forOps`, nested immediately under
+/// `target`.
+Loops tile(ArrayRef<scf::ForOp> forOps, ArrayRef<Value> sizes,
+           scf::ForOp target);
+
+/// Tile a nest of scf::ForOp loops rooted at `rootForOp` with the given
+/// (parametric) sizes. Sizes are expected to be strictly positive values at
+/// runtime. If more sizes than loops are provided, discard the trailing values
+/// in sizes. Assumes the loop nest is permutable.
+/// Returns the newly created intra-tile loops.
+Loops tilePerfectlyNested(scf::ForOp rootForOp, ArrayRef<Value> sizes);
+
+/// Get the perfectly nested sequence of loops starting at the root of the
+/// loop nest. A loop is perfectly nested iff the first op in the loop's body
+/// is another scf::ForOp and the second op is a terminator.
+void getPerfectlyNestedLoops(SmallVectorImpl<scf::ForOp> &nestedLoops,
+                             scf::ForOp root);
+
 } // namespace mlir
+
 #endif // MLIR_DIALECT_SCF_UTILS_H_
diff --git a/mlir/include/mlir/Interfaces/LoopLikeInterface.h b/mlir/include/mlir/Interfaces/LoopLikeInterface.h
--- a/mlir/include/mlir/Interfaces/LoopLikeInterface.h
+++ b/mlir/include/mlir/Interfaces/LoopLikeInterface.h
@@ -15,7 +15,20 @@
 
 #include "mlir/IR/OpDefinition.h"
 
+//===----------------------------------------------------------------------===//
+// LoopLike Interfaces
+//===----------------------------------------------------------------------===//
+
 /// Include the generated interface declarations.
 #include "mlir/Interfaces/LoopLikeInterface.h.inc"
 
+//===----------------------------------------------------------------------===//
+// LoopLike Utilities
+//===----------------------------------------------------------------------===//
+
+namespace mlir {
+/// Move loop invariant code out of a `looplike` operation.
+LogicalResult moveLoopInvariantCode(LoopLikeOpInterface looplike);
+} // namespace mlir
+
 #endif // MLIR_INTERFACES_LOOPLIKEINTERFACE_H_
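The scf:: overloads gathered in SCF/Utils.h compose the same way as their affine counterparts. A minimal sketch (illustrative, not part of the patch) that flattens a perfect scf.for nest using two of the utilities above:

```cpp
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/SCF/Utils.h"

// Collect the perfect nest rooted at `root` and rewrite it in place into a
// single linearized scf.for.
static void flattenPerfectNest(mlir::scf::ForOp root) {
  llvm::SmallVector<mlir::scf::ForOp, 4> nest;
  mlir::getPerfectlyNestedLoops(nest, root);
  mlir::coalesceLoops(nest);
}
```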
diff --git a/mlir/include/mlir/Transforms/ControlFlowSinkUtils.h b/mlir/include/mlir/Transforms/ControlFlowSinkUtils.h
new file
--- /dev/null
+++ b/mlir/include/mlir/Transforms/ControlFlowSinkUtils.h
@@ -0,0 +1,70 @@
+//===- ControlFlowSinkUtils.h - ControlFlow Sink Utils ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_TRANSFORMS_CONTROLFLOWSINKUTILS_H
+#define MLIR_TRANSFORMS_CONTROLFLOWSINKUTILS_H
+
+#include "mlir/Support/LLVM.h"
+
+namespace mlir {
+
+class DominanceInfo;
+class Operation;
+class Region;
+class RegionBranchOpInterface;
+
+/// Given a list of regions, perform control flow sinking on them. For each
+/// region, control-flow sinking moves operations that dominate the region but
+/// whose only users are in the region into the regions so that they aren't
+/// executed on paths where their results are not needed.
+///
+/// TODO: For the moment, this is a *simple* control-flow sink, i.e., no
+/// duplicating of ops. It should be made to accept a cost model to determine
+/// whether duplicating a particular op is profitable.
+///
+/// Example:
+///
+/// ```mlir
+/// %0 = arith.addi %arg0, %arg1
+/// scf.if %cond {
+///   scf.yield %0
+/// } else {
+///   scf.yield %arg2
+/// }
+/// ```
+///
+/// After control-flow sink:
+///
+/// ```mlir
+/// scf.if %cond {
+///   %0 = arith.addi %arg0, %arg1
+///   scf.yield %0
+/// } else {
+///   scf.yield %arg2
+/// }
+/// ```
+///
+/// Users must supply a callback `shouldMoveIntoRegion` that determines whether
+/// the given operation that only has users in the given region should be
+/// moved into that region.
+///
+/// Returns the number of operations sunk.
+size_t
+controlFlowSink(ArrayRef<Region *> regions, DominanceInfo &domInfo,
+                function_ref<bool(Operation *, Region *)> shouldMoveIntoRegion);
+
+/// Populates `regions` with regions of the provided region branch op that are
+/// executed at most once and that are reachable given the current operands of
+/// the op. These regions can be passed to `controlFlowSink` to perform sinking
+/// on the regions of the operation.
+void getSinglyExecutedRegionsToSink(RegionBranchOpInterface branch,
+                                    SmallVectorImpl<Region *> &regions);
+
+} // namespace mlir
+
+#endif // MLIR_TRANSFORMS_CONTROLFLOWSINKUTILS_H
diff --git a/mlir/include/mlir/Transforms/Passes.h b/mlir/include/mlir/Transforms/Passes.h
--- a/mlir/include/mlir/Transforms/Passes.h
+++ b/mlir/include/mlir/Transforms/Passes.h
@@ -22,13 +22,8 @@
 
 namespace mlir {
 
-class AffineForOp;
 class GreedyRewriteConfig;
 
-/// Fusion mode to attempt. The default mode `Greedy` does both
-/// producer-consumer and sibling fusion.
-enum FusionMode { Greedy, ProducerConsumer, Sibling };
-
 //===----------------------------------------------------------------------===//
 // Passes
 //===----------------------------------------------------------------------===//
@@ -56,31 +51,10 @@
 /// Creates a pass to perform common sub expression elimination.
 std::unique_ptr<Pass> createCSEPass();
 
-/// Creates a loop fusion pass which fuses loops according to type of fusion
-/// specified in `fusionMode`. Buffers of size less than or equal to
-/// `localBufSizeThreshold` are promoted to memory space `fastMemorySpace`.
-std::unique_ptr<OperationPass<FuncOp>>
-createLoopFusionPass(unsigned fastMemorySpace = 0,
-                     uint64_t localBufSizeThreshold = 0,
-                     bool maximalFusion = false,
-                     enum FusionMode fusionMode = FusionMode::Greedy);
-
 /// Creates a loop invariant code motion pass that hoists loop invariant
 /// instructions out of the loop.
 std::unique_ptr<Pass> createLoopInvariantCodeMotionPass();
 
-/// Creates a pass to pipeline explicit movement of data across levels of the
-/// memory hierarchy.
-std::unique_ptr<OperationPass<FuncOp>> createPipelineDataTransferPass();
-
-/// Creates a pass that transforms perfectly nested loops with independent
-/// bounds into a single loop.
-std::unique_ptr<OperationPass<FuncOp>> createLoopCoalescingPass();
-
-/// Creates a pass that transforms a single ParallelLoop over N induction
-/// variables into another ParallelLoop over less than N induction variables.
-std::unique_ptr<Pass> createParallelLoopCollapsingPass();
-
 /// Creates a pass to strip debug information from a function.
 std::unique_ptr<Pass> createStripDebugInfoPass();
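A sketch of how a client might drive the two new sinking utilities together (the always-true predicate is illustrative; a real caller would consult side effects before moving an op):

```cpp
#include "mlir/IR/Dominance.h"
#include "mlir/Interfaces/ControlFlowInterfaces.h"
#include "mlir/Transforms/ControlFlowSinkUtils.h"

static size_t sinkIntoBranch(mlir::RegionBranchOpInterface branch,
                             mlir::DominanceInfo &domInfo) {
  // Only regions that execute at most once are safe targets for sinking.
  llvm::SmallVector<mlir::Region *, 4> regions;
  mlir::getSinglyExecutedRegionsToSink(branch, regions);
  return mlir::controlFlowSink(
      regions, domInfo,
      [](mlir::Operation *op, mlir::Region *region) { return true; });
}
```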
diff --git a/mlir/include/mlir/Transforms/Passes.td b/mlir/include/mlir/Transforms/Passes.td
--- a/mlir/include/mlir/Transforms/Passes.td
+++ b/mlir/include/mlir/Transforms/Passes.td
@@ -16,207 +16,6 @@
 include "mlir/Pass/PassBase.td"
 include "mlir/Rewrite/PassUtil.td"
 
-def AffineLoopFusion : Pass<"affine-loop-fusion", "FuncOp"> {
-  let summary = "Fuse affine loop nests";
-  let description = [{
-    This pass performs fusion of loop nests using a slicing-based approach. It
-    combines two fusion strategies: producer-consumer fusion and sibling fusion.
-    Producer-consumer fusion is aimed at fusing pairs of loops where the first
-    one writes to a memref that the second reads. Sibling fusion targets pairs
-    of loops that share no dependences between them but that load from the same
-    memref. The fused loop nests, when possible, are rewritten to access
-    significantly smaller local buffers instead of the original memref's, and
-    the latter are often either completely optimized away or contracted. This
-    transformation leads to enhanced locality and lower memory footprint through
-    the elimination or contraction of temporaries/intermediate memref's. These
-    benefits are sometimes achieved at the expense of redundant computation
-    through a cost model that evaluates available choices such as the depth at
-    which a source slice should be materialized in the designation slice.
-
-    Example 1: Producer-consumer fusion.
-    Input:
-    ```mlir
-    func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
-      %0 = memref.alloc() : memref<10xf32>
-      %1 = memref.alloc() : memref<10xf32>
-      %cst = arith.constant 0.000000e+00 : f32
-      affine.for %arg2 = 0 to 10 {
-        affine.store %cst, %0[%arg2] : memref<10xf32>
-        affine.store %cst, %1[%arg2] : memref<10xf32>
-      }
-      affine.for %arg2 = 0 to 10 {
-        %2 = affine.load %0[%arg2] : memref<10xf32>
-        %3 = arith.addf %2, %2 : f32
-        affine.store %3, %arg0[%arg2] : memref<10xf32>
-      }
-      affine.for %arg2 = 0 to 10 {
-        %2 = affine.load %1[%arg2] : memref<10xf32>
-        %3 = arith.mulf %2, %2 : f32
-        affine.store %3, %arg1[%arg2] : memref<10xf32>
-      }
-      return
-    }
-    ```
-    Output:
-    ```mlir
-    func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
-      %0 = memref.alloc() : memref<1xf32>
-      %1 = memref.alloc() : memref<1xf32>
-      %cst = arith.constant 0.000000e+00 : f32
-      affine.for %arg2 = 0 to 10 {
-        affine.store %cst, %0[0] : memref<1xf32>
-        affine.store %cst, %1[0] : memref<1xf32>
-        %2 = affine.load %1[0] : memref<1xf32>
-        %3 = arith.mulf %2, %2 : f32
-        affine.store %3, %arg1[%arg2] : memref<10xf32>
-        %4 = affine.load %0[0] : memref<1xf32>
-        %5 = arith.addf %4, %4 : f32
-        affine.store %5, %arg0[%arg2] : memref<10xf32>
-      }
-      return
-    }
-    ```
-
-    Example 2: Sibling fusion.
-    Input:
-    ```mlir
-    func @sibling_fusion(%arg0: memref<10x10xf32>, %arg1: memref<10x10xf32>,
-                         %arg2: memref<10x10xf32>, %arg3: memref<10x10xf32>,
-                         %arg4: memref<10x10xf32>) {
-      affine.for %arg5 = 0 to 3 {
-        affine.for %arg6 = 0 to 3 {
-          %0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
-          %1 = affine.load %arg1[%arg5, %arg6] : memref<10x10xf32>
-          %2 = arith.mulf %0, %1 : f32
-          affine.store %2, %arg3[%arg5, %arg6] : memref<10x10xf32>
-        }
-      }
-      affine.for %arg5 = 0 to 3 {
-        affine.for %arg6 = 0 to 3 {
-          %0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
-          %1 = affine.load %arg2[%arg5, %arg6] : memref<10x10xf32>
-          %2 = arith.addf %0, %1 : f32
-          affine.store %2, %arg4[%arg5, %arg6] : memref<10x10xf32>
-        }
-      }
-      return
-    }
-    ```
-    Output:
-    ```mlir
-    func @sibling_fusion(%arg0: memref<10x10xf32>, %arg1: memref<10x10xf32>,
-                         %arg2: memref<10x10xf32>, %arg3: memref<10x10xf32>,
-                         %arg4: memref<10x10xf32>) {
-      affine.for %arg5 = 0 to 3 {
-        affine.for %arg6 = 0 to 3 {
-          %0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
-          %1 = affine.load %arg1[%arg5, %arg6] : memref<10x10xf32>
-          %2 = arith.mulf %0, %1 : f32
-          affine.store %2, %arg3[%arg5, %arg6] : memref<10x10xf32>
-          %3 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
-          %4 = affine.load %arg2[%arg5, %arg6] : memref<10x10xf32>
-          %5 = arith.addf %3, %4 : f32
-          affine.store %5, %arg4[%arg5, %arg6] : memref<10x10xf32>
-        }
-      }
-      return
-    }
-    ```
-  }];
-  let constructor = "mlir::createLoopFusionPass()";
-  let options = [
-    Option<"computeToleranceThreshold", "fusion-compute-tolerance", "double",
-           /*default=*/"0.30f", "Fractional increase in additional computation "
-                                "tolerated while fusing">,
-    Option<"fastMemorySpace", "fusion-fast-mem-space", "unsigned",
-           /*default=*/"0",
-           "Faster memory space number to promote fusion buffers to">,
-    Option<"localBufSizeThreshold", "fusion-local-buf-threshold", "uint64_t",
-           /*default=*/"0", "Threshold size (KiB) for promoting local buffers "
-                            "to fast memory space">,
-    Option<"maximalFusion", "fusion-maximal", "bool", /*default=*/"false",
-           "Enables maximal loop fusion">,
-    Option<"affineFusionMode", "mode", "enum FusionMode",
-           "mlir::FusionMode::Greedy", "fusion mode to attempt",
-           "llvm::cl::values(clEnumValN(mlir::FusionMode::Greedy,"
-           " \"greedy\", \"Perform greedy (both producer-consumer and sibling) fusion\"), "
-           "clEnumValN( mlir::FusionMode::ProducerConsumer, "
-           "\"producer\", \"Perform only producer-consumer fusion\"), "
-           "clEnumValN( mlir::FusionMode::Sibling, "
-           "\"sibling\", \"Perform only sibling fusion\"))">,
-  ];
-  let dependentDialects = ["memref::MemRefDialect"];
-}
-
-def AffinePipelineDataTransfer
-    : Pass<"affine-pipeline-data-transfer", "FuncOp"> {
-  let summary = "Pipeline non-blocking data transfers between explicitly "
-                "managed levels of the memory hierarchy";
-  let description = [{
-    This pass performs a transformation to overlap non-blocking DMA operations
-    in a loop with computations through double buffering. This is achieved by
-    advancing dma_start operations with respect to other operations.
-
-    Input
-
-    ```mlir
-    func @pipelinedatatransfer() {
-      %0 = memref.alloc() : memref<256xf32>
-      %1 = memref.alloc() : memref<32xf32, 1>
-      %2 = memref.alloc() : memref<1xf32>
-      %c0 = arith.constant 0 : index
-      %c128 = arith.constant 128 : index
-      affine.for %i0 = 0 to 8 {
-        affine.dma_start %0[%i0], %1[%i0], %2[%c0], %c128 : memref<256xf32>, memref<32xf32, 1>, memref<1xf32>
-        affine.dma_wait %2[%c0], %c128 : memref<1xf32>
-        %3 = affine.load %1[%i0] : memref<32xf32, 1>
-        %4 = "compute"(%3) : (f32) -> f32
-        affine.store %4, %1[%i0] : memref<32xf32, 1>
-      }
-      return
-    }
-    ```
-
-    Output
-
-    ```mlir
-    module {
-      func @pipelinedatatransfer() {
-        %c8 = arith.constant 8 : index
-        %c0 = arith.constant 0 : index
-        %0 = memref.alloc() : memref<256xf32>
-        %c0_0 = arith.constant 0 : index
-        %c128 = arith.constant 128 : index
-        %1 = memref.alloc() : memref<2x32xf32, 1>
-        %2 = memref.alloc() : memref<2x1xf32>
-        affine.dma_start %0[%c0], %1[%c0 mod 2, %c0], %2[%c0 mod 2, symbol(%c0_0)], %c128 : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
-        affine.for %arg0 = 1 to 8 {
-          affine.dma_start %0[%arg0], %1[%arg0 mod 2, %arg0], %2[%arg0 mod 2, symbol(%c0_0)], %c128 : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
-          %8 = affine.apply #map3(%arg0)
-          %9 = affine.apply #map4(%8)
-          %10 = affine.apply #map4(%8)
-          affine.dma_wait %2[%8 mod 2, symbol(%c0_0)], %c128 : memref<2x1xf32>
-          %11 = affine.load %1[%8 mod 2, %8] : memref<2x32xf32, 1>
-          %12 = "compute"(%11) : (f32) -> f32
-          affine.store %12, %1[%8 mod 2, %8] : memref<2x32xf32, 1>
-        }
-        %3 = affine.apply #map3(%c8)
-        %4 = affine.apply #map4(%3)
-        %5 = affine.apply #map4(%3)
-        affine.dma_wait %2[%3 mod 2, symbol(%c0_0)], %c128 : memref<2x1xf32>
-        %6 = affine.load %1[%3 mod 2, %3] : memref<2x32xf32, 1>
-        %7 = "compute"(%6) : (f32) -> f32
-        affine.store %7, %1[%3 mod 2, %3] : memref<2x32xf32, 1>
-        memref.dealloc %2 : memref<2x1xf32>
-        memref.dealloc %1 : memref<2x32xf32, 1>
-        return
-      }
-    }
-    ```
-  }];
-  let constructor = "mlir::createPipelineDataTransferPass()";
-}
-
 def Canonicalizer : Pass<"canonicalize"> {
   let summary = "Canonicalize operations";
   let description = [{
@@ -339,34 +138,11 @@
   ];
 }
 
-def LoopCoalescing : Pass<"loop-coalescing", "FuncOp"> {
-  let summary = "Coalesce nested loops with independent bounds into a single "
-                "loop";
-  let constructor = "mlir::createLoopCoalescingPass()";
-  let dependentDialects = ["arith::ArithmeticDialect"];
-}
-
 def LoopInvariantCodeMotion : Pass<"loop-invariant-code-motion"> {
   let summary = "Hoist loop invariant instructions outside of the loop";
   let constructor = "mlir::createLoopInvariantCodeMotionPass()";
 }
 
-def ParallelLoopCollapsing : Pass<"parallel-loop-collapsing"> {
-  let summary = "Collapse parallel loops to use less induction variables";
-  let constructor = "mlir::createParallelLoopCollapsingPass()";
-  let options = [
-    ListOption<"clCollapsedIndices0", "collapsed-indices-0", "unsigned",
-               "Which loop indices to combine 0th loop index",
-               "llvm::cl::MiscFlags::CommaSeparated">,
-    ListOption<"clCollapsedIndices1", "collapsed-indices-1", "unsigned",
-               "Which loop indices to combine into the position 1 loop index",
-               "llvm::cl::MiscFlags::CommaSeparated">,
-    ListOption<"clCollapsedIndices2", "collapsed-indices-2", "unsigned",
-               "Which loop indices to combine into the position 2 loop index",
-               "llvm::cl::MiscFlags::CommaSeparated">,
-  ];
-}
-
 def PrintOpStats : Pass<"print-op-stats"> {
   let summary = "Print statistics of operations";
   let constructor = "mlir::createPrintOpStatsPass()";
diff --git a/mlir/include/mlir/Transforms/Utils.h b/mlir/include/mlir/Transforms/Utils.h
deleted file mode 100644
--- a/mlir/include/mlir/Transforms/Utils.h
+++ /dev/null
@@ -1,200 +0,0 @@
-//===- Utils.h - General transformation utilities ---------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This header file defines prototypes for various transformation utilities for
-// memref's and non-loop IR structures. These are not passes by themselves but
-// are used either by passes, optimization sequences, or in turn by other
-// transformation utilities.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MLIR_TRANSFORMS_UTILS_H
-#define MLIR_TRANSFORMS_UTILS_H
-
-#include "mlir/Dialect/StandardOps/IR/Ops.h"
-#include "mlir/IR/AffineMap.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-
-namespace mlir {
-
-class AffineApplyOp;
-class AffineForOp;
-class DominanceInfo;
-class Location;
-class OpBuilder;
-
-namespace memref {
-class AllocOp;
-} // namespace memref
-
-/// Replaces all "dereferencing" uses of `oldMemRef` with `newMemRef` while
-/// optionally remapping the old memref's indices using the supplied affine map,
-/// `indexRemap`. The new memref could be of a different shape or rank.
-/// `extraIndices` provides any additional access indices to be added to the
-/// start.
-///
-/// `indexRemap` remaps indices of the old memref access to a new set of indices
-/// that are used to index the memref. Additional input operands to indexRemap
-/// can be optionally provided in `extraOperands`, and they occupy the start
-/// of its input list. `indexRemap`'s dimensional inputs are expected to
-/// correspond to memref's indices, and its symbolic inputs if any should be
-/// provided in `symbolOperands`.
-///
-/// `domOpFilter`, if non-null, restricts the replacement to only those
-/// operations that are dominated by the former; similarly, `postDomOpFilter`
-/// restricts replacement to only those operations that are postdominated by it.
-///
-/// 'allowNonDereferencingOps', if set, allows replacement of non-dereferencing
-/// uses of a memref without any requirement for access index rewrites as long
-/// as the user operation has the MemRefsNormalizable trait. The default value
-/// of this flag is false.
-///
-/// 'replaceInDeallocOp', if set, lets DeallocOp, a non-dereferencing user, to
-/// also be a candidate for replacement. The default value of this flag is
-/// false.
-///
-/// Returns true on success and false if the replacement is not possible,
-/// whenever a memref is used as an operand in a non-dereferencing context and
-/// 'allowNonDereferencingOps' is false, except for dealloc's on the memref
-/// which are left untouched. See comments at function definition for an
-/// example.
-//
-// Ex: to replace load %A[%i, %j] with load %Abuf[%t mod 2, %ii - %i, %j]:
-// The SSA value corresponding to '%t mod 2' should be in 'extraIndices', and
-// index remap will perform (%i, %j) -> (%ii - %i, %j), i.e., indexRemap = (d0,
-// d1, d2) -> (d0 - d1, d2), and %ii will be the extra operand. Without any
-// extra operands, note that 'indexRemap' would just be applied to existing
-// indices (%i, %j).
-// TODO: allow extraIndices to be added at any position.
-LogicalResult replaceAllMemRefUsesWith(
-    Value oldMemRef, Value newMemRef, ArrayRef<Value> extraIndices = {},
-    AffineMap indexRemap = AffineMap(), ArrayRef<Value> extraOperands = {},
-    ArrayRef<Value> symbolOperands = {}, Operation *domOpFilter = nullptr,
-    Operation *postDomOpFilter = nullptr, bool allowNonDereferencingOps = false,
-    bool replaceInDeallocOp = false);
-
-/// Performs the same replacement as the other version above but only for the
-/// dereferencing uses of `oldMemRef` in `op`, except in cases where
-/// 'allowNonDereferencingOps' is set to true where we replace the
-/// non-dereferencing uses as well.
-LogicalResult replaceAllMemRefUsesWith(Value oldMemRef, Value newMemRef,
-                                       Operation *op,
-                                       ArrayRef<Value> extraIndices = {},
-                                       AffineMap indexRemap = AffineMap(),
-                                       ArrayRef<Value> extraOperands = {},
-                                       ArrayRef<Value> symbolOperands = {},
-                                       bool allowNonDereferencingOps = false);
-
-/// Rewrites the memref defined by this alloc op to have an identity layout map
-/// and updates all its indexing uses. Returns failure if any of its uses
-/// escape (while leaving the IR in a valid state).
-LogicalResult normalizeMemRef(memref::AllocOp *op);
-
-/// Uses the old memref type map layout and computes the new memref type to have
-/// a new shape and a layout map, where the old layout map has been normalized
-/// to an identity layout map. It returns the old memref in case no
-/// normalization was needed or a failure occurs while transforming the old map
-/// layout to an identity layout map.
-MemRefType normalizeMemRefType(MemRefType memrefType, OpBuilder builder,
-                               unsigned numSymbolicOperands);
-
-/// Creates and inserts into 'builder' a new AffineApplyOp, with the number of
-/// its results equal to the number of operands, as a composition
-/// of all other AffineApplyOps reachable from input parameter 'operands'. If
-/// different operands were drawing results from multiple affine apply ops,
-/// these will also be collected into a single (multi-result) affine apply op.
-/// The final results of the composed AffineApplyOp are returned in output
-/// parameter 'results'. Returns the affine apply op created.
-Operation *createComposedAffineApplyOp(OpBuilder &builder, Location loc,
-                                       ArrayRef<Value> operands,
-                                       ArrayRef<AffineApplyOp> affineApplyOps,
-                                       SmallVectorImpl<Value> *results);
-
-/// Given an operation, inserts one or more single result affine apply
-/// operations, results of which are exclusively used by this operation.
-/// The operands of these newly created affine apply ops are
-/// guaranteed to be loop iterators or terminal symbols of a function.
-///
-/// Before
-///
-/// affine.for %i = 0 to #map(%N)
-///   %idx = affine.apply (d0) -> (d0 mod 2) (%i)
-///   send %A[%idx], ...
-///   %v = "compute"(%idx, ...)
-///
-/// After
-///
-/// affine.for %i = 0 to #map(%N)
-///   %idx = affine.apply (d0) -> (d0 mod 2) (%i)
-///   send %A[%idx], ...
-///   %idx_ = affine.apply (d0) -> (d0 mod 2) (%i)
-///   %v = "compute"(%idx_, ...)
-
-/// This allows the application of different transformations on send and
-/// compute (for eg. different shifts/delays)
-///
-/// Fills `sliceOps` with the list of affine.apply operations.
-/// In the following cases, `sliceOps` remains empty:
-///   1. If none of opInst's operands were the result of an affine.apply
-///      (i.e., there was no affine computation slice to create).
-///   2. If all the affine.apply op's supplying operands to this opInst did not
-///      have any uses other than those in this opInst.
-void createAffineComputationSlice(Operation *opInst,
-                                  SmallVectorImpl<Operation *> *sliceOps);
-
-/// Given a list of regions, perform control flow sinking on them. For each
-/// region, control-flow sinking moves operations that dominate the region but
-/// whose only users are in the region into the regions so that they aren't
-/// executed on paths where their results are not needed.
-///
-/// TODO: For the moment, this is a *simple* control-flow sink, i.e., no
-/// duplicating of ops. It should be made to accept a cost model to determine
-/// whether duplicating a particular op is profitable.
-///
-/// Example:
-///
-/// ```mlir
-/// %0 = arith.addi %arg0, %arg1
-/// scf.if %cond {
-///   scf.yield %0
-/// } else {
-///   scf.yield %arg2
-/// }
-/// ```
-///
-/// After control-flow sink:
-///
-/// ```mlir
-/// scf.if %cond {
-///   %0 = arith.addi %arg0, %arg1
-///   scf.yield %0
-/// } else {
-///   scf.yield %arg2
-/// }
-/// ```
-///
-/// Users must supply a callback `shouldMoveIntoRegion` that determines whether
-/// the given operation that only has users in the given operation should be
-/// moved into that region.
-///
-/// Returns the number of operations sunk.
-size_t
-controlFlowSink(ArrayRef<Region *> regions, DominanceInfo &domInfo,
-                function_ref<bool(Operation *, Region *)> shouldMoveIntoRegion);
-
-/// Populates `regions` with regions of the provided region branch op that are
-/// executed at most once at that are reachable given the current operands of
-/// the op. These regions can be passed to `controlFlowSink` to perform sinking
-/// on the regions of the operation.
-void getSinglyExecutedRegionsToSink(RegionBranchOpInterface branch,
-                                    SmallVectorImpl<Region *> &regions);
-
-} // namespace mlir
-
-#endif // MLIR_TRANSFORMS_UTILS_H
diff --git a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
--- a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
+++ b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
@@ -27,7 +27,6 @@
 #include "mlir/IR/Builders.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/DialectConversion.h"
-#include "mlir/Transforms/LoopUtils.h"
 #include "mlir/Transforms/Passes.h"
 #include "mlir/Transforms/RegionUtils.h"
 #include "llvm/ADT/Sequence.h"
diff --git a/mlir/lib/Conversion/SCFToStandard/SCFToStandard.cpp b/mlir/lib/Conversion/SCFToStandard/SCFToStandard.cpp
--- a/mlir/lib/Conversion/SCFToStandard/SCFToStandard.cpp
+++ b/mlir/lib/Conversion/SCFToStandard/SCFToStandard.cpp
@@ -23,7 +23,6 @@
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/Passes.h"
-#include "mlir/Transforms/Utils.h"
 
 using namespace mlir;
 using namespace mlir::scf;
diff --git a/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp b/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp
--- a/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp
+++ b/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp
@@ -33,7 +33,6 @@
 #include "mlir/Support/MathExtras.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/Passes.h"
-#include "mlir/Transforms/Utils.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/IRBuilder.h"
diff --git a/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp b/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp
--- a/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp
+++ b/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp
@@ -16,7 +16,6 @@
 #include "../PassDetail.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
-#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/SCF/SCF.h"
diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineDataCopyGeneration.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineDataCopyGeneration.cpp
--- a/mlir/lib/Dialect/Affine/Transforms/AffineDataCopyGeneration.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/AffineDataCopyGeneration.cpp
@@ -22,12 +22,12 @@
 #include "PassDetail.h"
 #include "mlir/Dialect/Affine/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/Affine/Passes.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-#include "mlir/Transforms/LoopUtils.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineLoopInvariantCodeMotion.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineLoopInvariantCodeMotion.cpp
--- a/mlir/lib/Dialect/Affine/Transforms/AffineLoopInvariantCodeMotion.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/AffineLoopInvariantCodeMotion.cpp
@@ -17,13 +17,13 @@
 #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
 #include "mlir/Dialect/Affine/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/Affine/Passes.h"
+#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/IR/AffineExpr.h"
 #include "mlir/IR/AffineMap.h"
 #include "mlir/IR/Builders.h"
-#include "mlir/Transforms/LoopUtils.h"
-#include "mlir/Transforms/Utils.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/SmallPtrSet.h"
diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp
--- a/mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp
@@ -18,10 +18,10 @@
 #include "mlir/Dialect/Affine/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/Affine/Passes.h"
 #include "mlir/Dialect/Affine/Passes.h.inc"
 #include "mlir/Dialect/Affine/Utils.h"
-#include "mlir/Transforms/LoopUtils.h"
 #include "llvm/Support/Debug.h"
 
 #include <deque>
diff --git a/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt
--- a/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt
@@ -4,9 +4,12 @@
   AffineLoopNormalize.cpp
   AffineParallelize.cpp
   AffineScalarReplacement.cpp
+  LoopCoalescing.cpp
+  LoopFusion.cpp
   LoopTiling.cpp
   LoopUnroll.cpp
   LoopUnrollAndJam.cpp
+  PipelineDataTransfer.cpp
   SuperVectorize.cpp
   SimplifyAffineStructures.cpp
diff --git a/mlir/lib/Transforms/LoopCoalescing.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopCoalescing.cpp
rename from mlir/lib/Transforms/LoopCoalescing.cpp
rename to mlir/lib/Dialect/Affine/Transforms/LoopCoalescing.cpp
--- a/mlir/lib/Transforms/LoopCoalescing.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/LoopCoalescing.cpp
@@ -8,9 +8,10 @@
 #include "PassDetail.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/Dialect/SCF/SCF.h"
-#include "mlir/Transforms/LoopUtils.h"
"mlir/Transforms/LoopUtils.h" +#include "mlir/Dialect/SCF/Utils.h" #include "mlir/Transforms/Passes.h" #include "mlir/Transforms/RegionUtils.h" #include "llvm/Support/Debug.h" diff --git a/mlir/lib/Transforms/LoopFusion.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp rename from mlir/lib/Transforms/LoopFusion.cpp rename to mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp --- a/mlir/lib/Transforms/LoopFusion.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp @@ -16,14 +16,14 @@ #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h" #include "mlir/Dialect/Affine/Analysis/Utils.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/LoopFusionUtils.h" +#include "mlir/Dialect/Affine/LoopUtils.h" +#include "mlir/Dialect/Affine/Utils.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/IR/AffineExpr.h" #include "mlir/IR/AffineMap.h" #include "mlir/IR/Builders.h" -#include "mlir/Transforms/LoopFusionUtils.h" -#include "mlir/Transforms/LoopUtils.h" #include "mlir/Transforms/Passes.h" -#include "mlir/Transforms/Utils.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SetVector.h" diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp --- a/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp @@ -17,11 +17,11 @@ #include "mlir/Dialect/Affine/Analysis/Utils.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Affine/IR/AffineValueMap.h" +#include "mlir/Dialect/Affine/LoopUtils.h" #include "mlir/Dialect/Affine/Passes.h" +#include "mlir/Dialect/Affine/Utils.h" #include "mlir/IR/BlockAndValueMapping.h" #include "mlir/IR/Builders.h" -#include "mlir/Transforms/LoopUtils.h" -#include "mlir/Transforms/Utils.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" using namespace mlir; diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp --- a/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp @@ -12,11 +12,11 @@ #include "PassDetail.h" #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/LoopUtils.h" #include "mlir/Dialect/Affine/Passes.h" #include "mlir/IR/AffineExpr.h" #include "mlir/IR/AffineMap.h" #include "mlir/IR/Builders.h" -#include "mlir/Transforms/LoopUtils.h" #include "llvm/ADT/DenseMap.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopUnrollAndJam.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopUnrollAndJam.cpp --- a/mlir/lib/Dialect/Affine/Transforms/LoopUnrollAndJam.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/LoopUnrollAndJam.cpp @@ -37,12 +37,12 @@ #include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h" #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/LoopUtils.h" #include "mlir/Dialect/Affine/Passes.h" #include "mlir/IR/AffineExpr.h" #include "mlir/IR/AffineMap.h" #include "mlir/IR/BlockAndValueMapping.h" #include "mlir/IR/Builders.h" -#include "mlir/Transforms/LoopUtils.h" #include "llvm/ADT/DenseMap.h" #include "llvm/Support/CommandLine.h" diff --git a/mlir/lib/Dialect/Affine/Transforms/PassDetail.h b/mlir/lib/Dialect/Affine/Transforms/PassDetail.h --- a/mlir/lib/Dialect/Affine/Transforms/PassDetail.h +++ 
b/mlir/lib/Dialect/Affine/Transforms/PassDetail.h @@ -9,6 +9,7 @@ #ifndef DIALECT_AFFINE_TRANSFORMS_PASSDETAIL_H_ #define DIALECT_AFFINE_TRANSFORMS_PASSDETAIL_H_ +#include "mlir/Dialect/Affine/Passes.h" #include "mlir/Pass/Pass.h" namespace mlir { @@ -16,6 +17,10 @@ template void registerDialect(DialectRegistry ®istry); +namespace arith { +class ArithmeticDialect; +} // namespace arith + namespace linalg { class LinalgDialect; } // namespace linalg diff --git a/mlir/lib/Transforms/PipelineDataTransfer.cpp b/mlir/lib/Dialect/Affine/Transforms/PipelineDataTransfer.cpp rename from mlir/lib/Transforms/PipelineDataTransfer.cpp rename to mlir/lib/Dialect/Affine/Transforms/PipelineDataTransfer.cpp --- a/mlir/lib/Transforms/PipelineDataTransfer.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/PipelineDataTransfer.cpp @@ -11,17 +11,16 @@ //===----------------------------------------------------------------------===// #include "PassDetail.h" -#include "mlir/Transforms/Passes.h" - #include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h" #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h" #include "mlir/Dialect/Affine/Analysis/Utils.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/LoopUtils.h" +#include "mlir/Dialect/Affine/Utils.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/StandardOps/Utils/Utils.h" #include "mlir/IR/Builders.h" -#include "mlir/Transforms/LoopUtils.h" -#include "mlir/Transforms/Utils.h" +#include "mlir/Transforms/Passes.h" #include "llvm/ADT/DenseMap.h" #include "llvm/Support/Debug.h" diff --git a/mlir/lib/Dialect/Affine/Transforms/SimplifyAffineStructures.cpp b/mlir/lib/Dialect/Affine/Transforms/SimplifyAffineStructures.cpp --- a/mlir/lib/Dialect/Affine/Transforms/SimplifyAffineStructures.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/SimplifyAffineStructures.cpp @@ -14,9 +14,9 @@ #include "mlir/Dialect/Affine/Analysis/Utils.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Affine/Passes.h" +#include "mlir/Dialect/Affine/Utils.h" #include "mlir/IR/IntegerSet.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" -#include "mlir/Transforms/Utils.h" #define DEBUG_TYPE "simplify-affine-structure" diff --git a/mlir/lib/Dialect/Affine/Utils/CMakeLists.txt b/mlir/lib/Dialect/Affine/Utils/CMakeLists.txt --- a/mlir/lib/Dialect/Affine/Utils/CMakeLists.txt +++ b/mlir/lib/Dialect/Affine/Utils/CMakeLists.txt @@ -1,4 +1,6 @@ add_mlir_dialect_library(MLIRAffineUtils + LoopFusionUtils.cpp + LoopUtils.cpp Utils.cpp ADDITIONAL_HEADER_DIRS @@ -7,5 +9,6 @@ LINK_LIBS PUBLIC MLIRAffine MLIRAnalysis + MLIRMemRef MLIRTransformUtils ) diff --git a/mlir/lib/Transforms/Utils/LoopFusionUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopFusionUtils.cpp rename from mlir/lib/Transforms/Utils/LoopFusionUtils.cpp rename to mlir/lib/Dialect/Affine/Utils/LoopFusionUtils.cpp --- a/mlir/lib/Transforms/Utils/LoopFusionUtils.cpp +++ b/mlir/lib/Dialect/Affine/Utils/LoopFusionUtils.cpp @@ -10,21 +10,20 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Transforms/LoopFusionUtils.h" - +#include "mlir/Dialect/Affine/LoopFusionUtils.h" #include "mlir/Analysis/SliceAnalysis.h" #include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h" #include "mlir/Dialect/Affine/Analysis/AffineStructures.h" #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h" #include "mlir/Dialect/Affine/Analysis/Utils.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/LoopUtils.h" #include 
"mlir/IR/AffineExpr.h" #include "mlir/IR/AffineMap.h" #include "mlir/IR/BlockAndValueMapping.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/Operation.h" -#include "mlir/Transforms/LoopUtils.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/Debug.h" diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp rename from mlir/lib/Transforms/Utils/LoopUtils.cpp rename to mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp --- a/mlir/lib/Transforms/Utils/LoopUtils.cpp +++ b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp @@ -10,14 +10,14 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Transforms/LoopUtils.h" - +#include "mlir/Dialect/Affine/LoopUtils.h" #include "mlir/Analysis/SliceAnalysis.h" #include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h" #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h" #include "mlir/Dialect/Affine/Analysis/Utils.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Affine/IR/AffineValueMap.h" +#include "mlir/Dialect/Affine/Utils.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/SCF/SCF.h" #include "mlir/IR/BlockAndValueMapping.h" @@ -25,7 +25,6 @@ #include "mlir/Support/MathExtras.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "mlir/Transforms/RegionUtils.h" -#include "mlir/Transforms/Utils.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Support/Debug.h" @@ -108,42 +107,9 @@ lb.erase(); } -// Build the IR that performs ceil division of a positive value by a constant: -// ceildiv(a, B) = divis(a + (B-1), B) -// where divis is rounding-to-zero division. -static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend, - int64_t divisor) { - assert(divisor > 0 && "expected positive divisor"); - assert(dividend.getType().isIndex() && "expected index-typed value"); - - Value divisorMinusOneCst = - builder.create(loc, divisor - 1); - Value divisorCst = builder.create(loc, divisor); - Value sum = builder.create(loc, dividend, divisorMinusOneCst); - return builder.create(loc, sum, divisorCst); -} - -// Build the IR that performs ceil division of a positive value by another -// positive value: -// ceildiv(a, b) = divis(a + (b - 1), b) -// where divis is rounding-to-zero division. -static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend, - Value divisor) { - assert(dividend.getType().isIndex() && "expected index-typed value"); - - Value cstOne = builder.create(loc, 1); - Value divisorMinusOne = builder.create(loc, divisor, cstOne); - Value sum = builder.create(loc, dividend, divisorMinusOne); - return builder.create(loc, sum, divisor); -} - /// Helper to replace uses of loop carried values (iter_args) and loop -/// yield values while promoting single iteration affine.for and scf.for ops. -template -static void replaceIterArgsAndYieldResults(AffineOrSCFForOp forOp) { - static_assert( - llvm::is_one_of::value, - "only for affine.for and scf.for ops"); +/// yield values while promoting single iteration affine.for ops. +static void replaceIterArgsAndYieldResults(AffineForOp forOp) { // Replace uses of iter arguments with iter operands (initial values). 
auto iterOperands = forOp.getIterOperands(); auto iterArgs = forOp.getRegionIterArgs(); @@ -203,46 +169,6 @@ return success(); } -/// Promotes the loop body of a forOp to its containing block if the forOp -/// it can be determined that the loop has a single iteration. -LogicalResult mlir::promoteIfSingleIteration(scf::ForOp forOp) { - auto lbCstOp = forOp.getLowerBound().getDefiningOp(); - auto ubCstOp = forOp.getUpperBound().getDefiningOp(); - auto stepCstOp = forOp.getStep().getDefiningOp(); - if (!lbCstOp || !ubCstOp || !stepCstOp || lbCstOp.value() < 0 || - ubCstOp.value() < 0 || stepCstOp.value() < 0) - return failure(); - int64_t tripCount = - mlir::ceilDiv(ubCstOp.value() - lbCstOp.value(), stepCstOp.value()); - if (tripCount != 1) - return failure(); - auto iv = forOp.getInductionVar(); - iv.replaceAllUsesWith(lbCstOp); - - replaceIterArgsAndYieldResults(forOp); - - // Move the loop body operations, except for its terminator, to the loop's - // containing block. - auto *parentBlock = forOp->getBlock(); - forOp.getBody()->getTerminator()->erase(); - parentBlock->getOperations().splice(Block::iterator(forOp), - forOp.getBody()->getOperations()); - forOp.erase(); - return success(); -} - -/// Promotes all single iteration 'for' ops in `f`, i.e., moves -/// their body into the containing Block. -void mlir::promoteSingleIterationLoops(FuncOp f) { - // Gathers all innermost loops through a post order pruned walk. - f.walk([](Operation *op) { - if (auto forOp = dyn_cast(op)) - (void)promoteIfSingleIteration(forOp); - else if (auto forOp = dyn_cast(op)) - (void)promoteIfSingleIteration(forOp); - }); -} - /// Generates an affine.for op with the specified lower and upper bounds /// while generating the right IV remappings to realize shifts for operations in /// its body. The operations that go into the loop body are specified in @@ -1011,38 +937,22 @@ return success(); } -/// Collect perfectly nested loops starting from `rootForOps`. Loops are -/// perfectly nested if each loop is the first and only non-terminator operation -/// in the parent loop. Collect at most `maxLoops` loops and append them to -/// `forOps`. -template -static void getPerfectlyNestedLoopsImpl( - SmallVectorImpl &forOps, T rootForOp, - unsigned maxLoops = std::numeric_limits::max()) { - for (unsigned i = 0; i < maxLoops; ++i) { - forOps.push_back(rootForOp); - Block &body = rootForOp.getRegion().front(); - if (body.begin() != std::prev(body.end(), 2)) - return; - - rootForOp = dyn_cast(&body.front()); - if (!rootForOp) - return; - } -} - /// Get perfectly nested sequence of loops starting at root of loop nest /// (the first op being another AffineFor, and the second op - a terminator). /// A loop is perfectly nested iff: the first op in the loop's body is another /// AffineForOp, and the second op is a terminator). void mlir::getPerfectlyNestedLoops(SmallVectorImpl &nestedLoops, AffineForOp root) { - getPerfectlyNestedLoopsImpl(nestedLoops, root); -} + for (unsigned i = 0; i < std::numeric_limits::max(); ++i) { + nestedLoops.push_back(root); + Block &body = root.getRegion().front(); + if (body.begin() != std::prev(body.end(), 2)) + return; -void mlir::getPerfectlyNestedLoops(SmallVectorImpl &nestedLoops, - scf::ForOp root) { - getPerfectlyNestedLoopsImpl(nestedLoops, root); + root = dyn_cast(&body.front()); + if (!root) + return; + } } /// Identify valid and profitable bands of loops to tile. 
This is currently just @@ -1084,10 +994,10 @@ return loopUnrollByFactor(forOp, unrollFactor); } -/// Generates unrolled copies of AffineForOp or scf::ForOp 'loopBodyBlock', with -/// associated 'forOpIV' by 'unrollFactor', calling 'ivRemapFn' to remap -/// 'forOpIV' for each unrolled body. If specified, annotates the Ops in each -/// unrolled iteration using annotateFn. +/// Generates unrolled copies of AffineForOp 'loopBodyBlock', with associated +/// 'forOpIV' by 'unrollFactor', calling 'ivRemapFn' to remap 'forOpIV' for each +/// unrolled body. If specified, annotates the Ops in each unrolled iteration +/// using annotateFn. static void generateUnrolledLoop( Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor, function_ref ivRemapFn, @@ -1237,127 +1147,6 @@ return success(); } -/// Unrolls 'forOp' by 'unrollFactor', returns success if the loop is unrolled. -LogicalResult mlir::loopUnrollByFactor( - scf::ForOp forOp, uint64_t unrollFactor, - function_ref annotateFn) { - assert(unrollFactor > 0 && "expected positive unroll factor"); - - // Return if the loop body is empty. - if (llvm::hasSingleElement(forOp.getBody()->getOperations())) - return success(); - - // Compute tripCount = ceilDiv((upperBound - lowerBound), step) and populate - // 'upperBoundUnrolled' and 'stepUnrolled' for static and dynamic cases. - OpBuilder boundsBuilder(forOp); - auto loc = forOp.getLoc(); - auto step = forOp.getStep(); - Value upperBoundUnrolled; - Value stepUnrolled; - bool generateEpilogueLoop = true; - - auto lbCstOp = forOp.getLowerBound().getDefiningOp(); - auto ubCstOp = forOp.getUpperBound().getDefiningOp(); - auto stepCstOp = forOp.getStep().getDefiningOp(); - if (lbCstOp && ubCstOp && stepCstOp) { - // Constant loop bounds computation. - int64_t lbCst = lbCstOp.value(); - int64_t ubCst = ubCstOp.value(); - int64_t stepCst = stepCstOp.value(); - assert(lbCst >= 0 && ubCst >= 0 && stepCst >= 0 && - "expected positive loop bounds and step"); - int64_t tripCount = mlir::ceilDiv(ubCst - lbCst, stepCst); - - if (unrollFactor == 1) { - if (tripCount == 1 && failed(promoteIfSingleIteration(forOp))) - return failure(); - return success(); - } - - int64_t tripCountEvenMultiple = tripCount - (tripCount % unrollFactor); - int64_t upperBoundUnrolledCst = lbCst + tripCountEvenMultiple * stepCst; - assert(upperBoundUnrolledCst <= ubCst); - int64_t stepUnrolledCst = stepCst * unrollFactor; - - // Create constant for 'upperBoundUnrolled' and set epilogue loop flag. - generateEpilogueLoop = upperBoundUnrolledCst < ubCst; - if (generateEpilogueLoop) - upperBoundUnrolled = boundsBuilder.create( - loc, upperBoundUnrolledCst); - else - upperBoundUnrolled = ubCstOp; - - // Create constant for 'stepUnrolled'. - stepUnrolled = stepCst == stepUnrolledCst - ? step - : boundsBuilder.create( - loc, stepUnrolledCst); - } else { - // Dynamic loop bounds computation. - // TODO: Add dynamic asserts for negative lb/ub/step, or - // consider using ceilDiv from AffineApplyExpander. 
- auto lowerBound = forOp.getLowerBound(); - auto upperBound = forOp.getUpperBound(); - Value diff = - boundsBuilder.create(loc, upperBound, lowerBound); - Value tripCount = ceilDivPositive(boundsBuilder, loc, diff, step); - Value unrollFactorCst = - boundsBuilder.create(loc, unrollFactor); - Value tripCountRem = - boundsBuilder.create(loc, tripCount, unrollFactorCst); - // Compute tripCountEvenMultiple = tripCount - (tripCount % unrollFactor) - Value tripCountEvenMultiple = - boundsBuilder.create(loc, tripCount, tripCountRem); - // Compute upperBoundUnrolled = lowerBound + tripCountEvenMultiple * step - upperBoundUnrolled = boundsBuilder.create( - loc, lowerBound, - boundsBuilder.create(loc, tripCountEvenMultiple, step)); - // Scale 'step' by 'unrollFactor'. - stepUnrolled = - boundsBuilder.create(loc, step, unrollFactorCst); - } - - // Create epilogue clean up loop starting at 'upperBoundUnrolled'. - if (generateEpilogueLoop) { - OpBuilder epilogueBuilder(forOp->getContext()); - epilogueBuilder.setInsertionPoint(forOp->getBlock(), - std::next(Block::iterator(forOp))); - auto epilogueForOp = cast(epilogueBuilder.clone(*forOp)); - epilogueForOp.setLowerBound(upperBoundUnrolled); - - // Update uses of loop results. - auto results = forOp.getResults(); - auto epilogueResults = epilogueForOp.getResults(); - auto epilogueIterOperands = epilogueForOp.getIterOperands(); - - for (auto e : llvm::zip(results, epilogueResults, epilogueIterOperands)) { - std::get<0>(e).replaceAllUsesWith(std::get<1>(e)); - epilogueForOp->replaceUsesOfWith(std::get<2>(e), std::get<0>(e)); - } - (void)promoteIfSingleIteration(epilogueForOp); - } - - // Create unrolled loop. - forOp.setUpperBound(upperBoundUnrolled); - forOp.setStep(stepUnrolled); - - auto iterArgs = ValueRange(forOp.getRegionIterArgs()); - auto yieldedValues = forOp.getBody()->getTerminator()->getOperands(); - - generateUnrolledLoop( - forOp.getBody(), forOp.getInductionVar(), unrollFactor, - [&](unsigned i, Value iv, OpBuilder b) { - // iv' = iv + step * i; - auto stride = b.create( - loc, step, b.create(loc, i)); - return b.create(loc, iv, stride); - }, - annotateFn, iterArgs, yieldedValues); - // Promote the loop body up if this has turned into a single iteration loop. - (void)promoteIfSingleIteration(forOp); - return success(); -} - LogicalResult mlir::loopUnrollJamUpToFactor(AffineForOp forOp, uint64_t unrollJamFactor) { Optional mayBeConstantTripCount = getConstantTripCount(forOp); @@ -1888,61 +1677,25 @@ return innerLoops; } -static Loops stripmineSink(scf::ForOp forOp, Value factor, - ArrayRef targets) { - auto originalStep = forOp.getStep(); - auto iv = forOp.getInductionVar(); - - OpBuilder b(forOp); - forOp.setStep(b.create(forOp.getLoc(), originalStep, factor)); - - Loops innerLoops; - for (auto t : targets) { - // Save information for splicing ops out of t when done - auto begin = t.getBody()->begin(); - auto nOps = t.getBody()->getOperations().size(); - - // Insert newForOp before the terminator of `t`. - auto b = OpBuilder::atBlockTerminator((t.getBody())); - Value stepped = b.create(t.getLoc(), iv, forOp.getStep()); - Value less = b.create(t.getLoc(), arith::CmpIPredicate::slt, - forOp.getUpperBound(), stepped); - Value ub = - b.create(t.getLoc(), less, forOp.getUpperBound(), stepped); - - // Splice [begin, begin + nOps - 1) into `newForOp` and replace uses. 
- auto newForOp = b.create(t.getLoc(), iv, ub, originalStep); - newForOp.getBody()->getOperations().splice( - newForOp.getBody()->getOperations().begin(), - t.getBody()->getOperations(), begin, std::next(begin, nOps - 1)); - replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(), - newForOp.getRegion()); - - innerLoops.push_back(newForOp); - } - - return innerLoops; -} - // Stripmines a `forOp` by `factor` and sinks it under a single `target`. // Returns the new AffineForOps, nested immediately under `target`. -template -static ForType stripmineSink(ForType forOp, SizeType factor, ForType target) { +template +static AffineForOp stripmineSink(AffineForOp forOp, SizeType factor, + AffineForOp target) { // TODO: Use cheap structural assertions that targets are nested under // forOp and that targets are not nested under each other when DominanceInfo // exposes the capability. It seems overkill to construct a whole function // dominance tree at this point. - auto res = stripmineSink(forOp, factor, ArrayRef{target}); + auto res = stripmineSink(forOp, factor, ArrayRef(target)); assert(res.size() == 1 && "Expected 1 inner forOp"); return res[0]; } -template -static SmallVector, 8> -tileImpl(ArrayRef forOps, ArrayRef sizes, - ArrayRef targets) { - SmallVector, 8> res; - SmallVector currentTargets(targets.begin(), targets.end()); +SmallVector, 8> +mlir::tile(ArrayRef forOps, ArrayRef sizes, + ArrayRef targets) { + SmallVector, 8> res; + SmallVector currentTargets(targets.begin(), targets.end()); for (auto it : llvm::zip(forOps, sizes)) { auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets); res.push_back(step); @@ -1951,286 +1704,15 @@ return res; } -SmallVector, 8> -mlir::tile(ArrayRef forOps, ArrayRef sizes, - ArrayRef targets) { - return tileImpl(forOps, sizes, targets); -} - -SmallVector mlir::tile(ArrayRef forOps, - ArrayRef sizes, - ArrayRef targets) { - return tileImpl(forOps, sizes, targets); -} - -template -static SmallVector -tileImpl(ArrayRef forOps, ArrayRef sizes, ForType target) { - SmallVector res; - for (auto loops : tile(forOps, sizes, ArrayRef{target})) { - assert(loops.size() == 1); - res.push_back(loops[0]); - } - return res; -} - SmallVector mlir::tile(ArrayRef forOps, ArrayRef sizes, AffineForOp target) { - return tileImpl(forOps, sizes, target); -} - -Loops mlir::tile(ArrayRef forOps, ArrayRef sizes, - scf::ForOp target) { - return tileImpl(forOps, sizes, target); -} - -Loops mlir::tilePerfectlyNested(scf::ForOp rootForOp, ArrayRef sizes) { - // Collect perfectly nested loops. If more size values provided than nested - // loops available, truncate `sizes`. - SmallVector forOps; - forOps.reserve(sizes.size()); - getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size()); - if (forOps.size() < sizes.size()) - sizes = sizes.take_front(forOps.size()); - - return ::tile(forOps, sizes, forOps.back()); -} - -// Hoist the ops within `outer` that appear before `inner`. -// Such ops include the ops that have been introduced by parametric tiling. -// Ops that come from triangular loops (i.e. that belong to the program slice -// rooted at `outer`) and ops that have side effects cannot be hoisted. -// Return failure when any op fails to hoist. 
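// For example (a hedged sketch with assumed ops): in
//   scf.for %i = ... {                    // `outer`
//     %c = arith.constant 42 : index      // hoistable: no regions, no effects
//     %d = arith.muli %i, %i : index      // in the slice of %i: not hoistable
//     scf.for %j = ... { ... }            // `inner`
//   }
// only %c is moved above `outer`; encountering %d makes the result `failure`.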
-static LogicalResult hoistOpsBetween(scf::ForOp outer, scf::ForOp inner) { - SetVector forwardSlice; - getForwardSlice( - outer.getInductionVar(), &forwardSlice, - [&inner](Operation *op) { return op != inner.getOperation(); }); - LogicalResult status = success(); - SmallVector toHoist; - for (auto &op : outer.getBody()->without_terminator()) { - // Stop when encountering the inner loop. - if (&op == inner.getOperation()) - break; - // Skip over non-hoistable ops. - if (forwardSlice.count(&op) > 0) { - status = failure(); - continue; - } - // Skip intermediate scf::ForOp, these are not considered a failure. - if (isa(op)) - continue; - // Skip other ops with regions. - if (op.getNumRegions() > 0) { - status = failure(); - continue; - } - // Skip if op has side effects. - // TODO: loads to immutable memory regions are ok. - if (!MemoryEffectOpInterface::hasNoEffect(&op)) { - status = failure(); - continue; - } - toHoist.push_back(&op); - } - auto *outerForOp = outer.getOperation(); - for (auto *op : toHoist) - op->moveBefore(outerForOp); - return status; -} - -// Traverse the interTile and intraTile loops and try to hoist ops such that -// bands of perfectly nested loops are isolated. -// Return failure if either perfect interTile or perfect intraTile bands cannot -// be formed. -static LogicalResult tryIsolateBands(const TileLoops &tileLoops) { - LogicalResult status = success(); - const Loops &interTile = tileLoops.first; - const Loops &intraTile = tileLoops.second; - auto size = interTile.size(); - assert(size == intraTile.size()); - if (size <= 1) - return success(); - for (unsigned s = 1; s < size; ++s) - status = succeeded(status) ? hoistOpsBetween(intraTile[0], intraTile[s]) - : failure(); - for (unsigned s = 1; s < size; ++s) - status = succeeded(status) ? hoistOpsBetween(interTile[0], interTile[s]) - : failure(); - return status; -} - -TileLoops mlir::extractFixedOuterLoops(scf::ForOp rootForOp, - ArrayRef sizes) { - // Collect perfectly nested loops. If more size values provided than nested - // loops available, truncate `sizes`. - SmallVector forOps; - forOps.reserve(sizes.size()); - getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size()); - if (forOps.size() < sizes.size()) - sizes = sizes.take_front(forOps.size()); - - // Compute the tile sizes such that i-th outer loop executes size[i] - // iterations. Given that the loop current executes - // numIterations = ceildiv((upperBound - lowerBound), step) - // iterations, we need to tile with size ceildiv(numIterations, size[i]). - SmallVector tileSizes; - tileSizes.reserve(sizes.size()); - for (unsigned i = 0, e = sizes.size(); i < e; ++i) { - assert(sizes[i] > 0 && "expected strictly positive size for strip-mining"); - - auto forOp = forOps[i]; - OpBuilder builder(forOp); - auto loc = forOp.getLoc(); - Value diff = builder.create(loc, forOp.getUpperBound(), - forOp.getLowerBound()); - Value numIterations = ceilDivPositive(builder, loc, diff, forOp.getStep()); - Value iterationsPerBlock = - ceilDivPositive(builder, loc, numIterations, sizes[i]); - tileSizes.push_back(iterationsPerBlock); - } - - // Call parametric tiling with the given sizes. - auto intraTile = tile(forOps, tileSizes, forOps.back()); - TileLoops tileLoops = std::make_pair(forOps, intraTile); - - // TODO: for now we just ignore the result of band isolation. - // In the future, mapping decisions may be impacted by the ability to - // isolate perfectly nested bands. 
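// A worked instance of the sizing math above (assumed bounds): for a loop
// running from 0 to 100 with step 1 and sizes = {10},
//   numIterations      = ceildiv(100 - 0, 1) = 100
//   iterationsPerBlock = ceildiv(100, 10)    = 10
// so the outer (inter-tile) loop executes exactly 10 iterations, as requested.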
- (void)tryIsolateBands(tileLoops); - - return tileLoops; -} - -/// Return the new lower bound, upper bound, and step in that order. Insert any -/// additional bounds calculations before the given builder and any additional -/// conversion back to the original loop induction value inside the given Block. -static LoopParams normalizeLoop(OpBuilder &boundsBuilder, - OpBuilder &insideLoopBuilder, Location loc, - Value lowerBound, Value upperBound, Value step, - Value inductionVar) { - // Check if the loop is already known to have a constant zero lower bound or - // a constant one step. - bool isZeroBased = false; - if (auto ubCst = lowerBound.getDefiningOp()) - isZeroBased = ubCst.value() == 0; - - bool isStepOne = false; - if (auto stepCst = step.getDefiningOp()) - isStepOne = stepCst.value() == 1; - - // Compute the number of iterations the loop executes: ceildiv(ub - lb, step) - // assuming the step is strictly positive. Update the bounds and the step - // of the loop to go from 0 to the number of iterations, if necessary. - // TODO: introduce support for negative steps or emit dynamic asserts - // on step positivity, whatever gets implemented first. - if (isZeroBased && isStepOne) - return {/*lowerBound=*/lowerBound, /*upperBound=*/upperBound, - /*step=*/step}; - - Value diff = boundsBuilder.create(loc, upperBound, lowerBound); - Value newUpperBound = ceilDivPositive(boundsBuilder, loc, diff, step); - - Value newLowerBound = - isZeroBased ? lowerBound - : boundsBuilder.create(loc, 0); - Value newStep = - isStepOne ? step : boundsBuilder.create(loc, 1); - - // Insert code computing the value of the original loop induction variable - // from the "normalized" one. - Value scaled = - isStepOne - ? inductionVar - : insideLoopBuilder.create(loc, inductionVar, step); - Value shifted = - isZeroBased - ? scaled - : insideLoopBuilder.create(loc, scaled, lowerBound); - - SmallPtrSet preserve{scaled.getDefiningOp(), - shifted.getDefiningOp()}; - inductionVar.replaceAllUsesExcept(shifted, preserve); - return {/*lowerBound=*/newLowerBound, /*upperBound=*/newUpperBound, - /*step=*/newStep}; -} - -/// Transform a loop with a strictly positive step -/// for %i = %lb to %ub step %s -/// into a 0-based loop with step 1 -/// for %ii = 0 to ceildiv(%ub - %lb, %s) step 1 { -/// %i = %ii * %s + %lb -/// Insert the induction variable remapping in the body of `inner`, which is -/// expected to be either `loop` or another loop perfectly nested under `loop`. -/// Insert the definition of new bounds immediate before `outer`, which is -/// expected to be either `loop` or its parent in the loop nest. -static void normalizeLoop(scf::ForOp loop, scf::ForOp outer, scf::ForOp inner) { - OpBuilder builder(outer); - OpBuilder innerBuilder = OpBuilder::atBlockBegin(inner.getBody()); - auto loopPieces = normalizeLoop(builder, innerBuilder, loop.getLoc(), - loop.getLowerBound(), loop.getUpperBound(), - loop.getStep(), loop.getInductionVar()); - - loop.setLowerBound(loopPieces.lowerBound); - loop.setUpperBound(loopPieces.upperBound); - loop.setStep(loopPieces.step); -} - -void mlir::coalesceLoops(MutableArrayRef loops) { - if (loops.size() < 2) - return; - - scf::ForOp innermost = loops.back(); - scf::ForOp outermost = loops.front(); - - // 1. Make sure all loops iterate from 0 to upperBound with step 1. This - // allows the following code to assume upperBound is the number of iterations. - for (auto loop : loops) - normalizeLoop(loop, outermost, innermost); - - // 2. 
Emit code computing the upper bound of the coalesced loop as product - // of the number of iterations of all loops. - OpBuilder builder(outermost); - Location loc = outermost.getLoc(); - Value upperBound = outermost.getUpperBound(); - for (auto loop : loops.drop_front()) - upperBound = - builder.create(loc, upperBound, loop.getUpperBound()); - outermost.setUpperBound(upperBound); - - builder.setInsertionPointToStart(outermost.getBody()); - - // 3. Remap induction variables. For each original loop, the value of the - // induction variable can be obtained by dividing the induction variable of - // the linearized loop by the total number of iterations of the loops nested - // in it modulo the number of iterations in this loop (remove the values - // related to the outer loops): - // iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i. - // Compute these iteratively from the innermost loop by creating a "running - // quotient" of division by the range. - Value previous = outermost.getInductionVar(); - for (unsigned i = 0, e = loops.size(); i < e; ++i) { - unsigned idx = loops.size() - i - 1; - if (i != 0) - previous = builder.create(loc, previous, - loops[idx + 1].getUpperBound()); - - Value iv = (i == e - 1) ? previous - : builder.create( - loc, previous, loops[idx].getUpperBound()); - replaceAllUsesInRegionWith(loops[idx].getInductionVar(), iv, - loops.back().getRegion()); + SmallVector res; + for (auto loops : tile(forOps, sizes, ArrayRef(target))) { + assert(loops.size() == 1); + res.push_back(loops[0]); } - - // 4. Move the operations from the innermost just above the second-outermost - // loop, delete the extra terminator and the second-outermost loop. - scf::ForOp second = loops[1]; - innermost.getBody()->back().erase(); - outermost.getBody()->getOperations().splice( - Block::iterator(second.getOperation()), - innermost.getBody()->getOperations()); - second.erase(); + return res; } LogicalResult mlir::coalesceLoops(MutableArrayRef loops) { @@ -2347,89 +1829,6 @@ return success(); } -void mlir::collapseParallelLoops( - scf::ParallelOp loops, ArrayRef> combinedDimensions) { - OpBuilder outsideBuilder(loops); - Location loc = loops.getLoc(); - - // Presort combined dimensions. - auto sortedDimensions = llvm::to_vector<3>(combinedDimensions); - for (auto &dims : sortedDimensions) - std::sort(dims.begin(), dims.end()); - - // Normalize ParallelOp's iteration pattern. - SmallVector normalizedLowerBounds, normalizedSteps, - normalizedUpperBounds; - for (unsigned i = 0, e = loops.getNumLoops(); i < e; ++i) { - OpBuilder insideLoopBuilder = OpBuilder::atBlockBegin(loops.getBody()); - auto resultBounds = - normalizeLoop(outsideBuilder, insideLoopBuilder, loc, - loops.getLowerBound()[i], loops.getUpperBound()[i], - loops.getStep()[i], loops.getBody()->getArgument(i)); - - normalizedLowerBounds.push_back(resultBounds.lowerBound); - normalizedUpperBounds.push_back(resultBounds.upperBound); - normalizedSteps.push_back(resultBounds.step); - } - - // Combine iteration spaces. 
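// A small sketch of the combined space (hypothetical ranges): collapsing two
// normalized dimensions with ranges 4 and 8 yields one loop of range 4 * 8 =
// 32, and iteration %k of the collapsed loop recovers the original IVs as
//   iv1 = %k mod 8, iv0 = %k floordiv 8
// which is exactly the divmod chain materialized below.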
- SmallVector lowerBounds, upperBounds, steps; - auto cst0 = outsideBuilder.create(loc, 0); - auto cst1 = outsideBuilder.create(loc, 1); - for (unsigned i = 0, e = sortedDimensions.size(); i < e; ++i) { - Value newUpperBound = outsideBuilder.create(loc, 1); - for (auto idx : sortedDimensions[i]) { - newUpperBound = outsideBuilder.create( - loc, newUpperBound, normalizedUpperBounds[idx]); - } - lowerBounds.push_back(cst0); - steps.push_back(cst1); - upperBounds.push_back(newUpperBound); - } - - // Create new ParallelLoop with conversions to the original induction values. - // The loop below uses divisions to get the relevant range of values in the - // new induction value that represent each range of the original induction - // value. The remainders then determine based on that range, which iteration - // of the original induction value this represents. This is a normalized value - // that is un-normalized already by the previous logic. - auto newPloop = outsideBuilder.create( - loc, lowerBounds, upperBounds, steps, - [&](OpBuilder &insideBuilder, Location, ValueRange ploopIVs) { - for (unsigned i = 0, e = combinedDimensions.size(); i < e; ++i) { - Value previous = ploopIVs[i]; - unsigned numberCombinedDimensions = combinedDimensions[i].size(); - // Iterate over all except the last induction value. - for (unsigned j = numberCombinedDimensions - 1; j > 0; --j) { - unsigned idx = combinedDimensions[i][j]; - - // Determine the current induction value's current loop iteration - Value iv = insideBuilder.create( - loc, previous, normalizedUpperBounds[idx]); - replaceAllUsesInRegionWith(loops.getBody()->getArgument(idx), iv, - loops.getRegion()); - - // Remove the effect of the current induction value to prepare for - // the next value. - previous = insideBuilder.create( - loc, previous, normalizedUpperBounds[idx]); - } - - // The final induction value is just the remaining value. - unsigned idx = combinedDimensions[i][0]; - replaceAllUsesInRegionWith(loops.getBody()->getArgument(idx), - previous, loops.getRegion()); - } - }); - - // Replace the old loop with the new loop. - loops.getBody()->back().erase(); - newPloop.getBody()->getOperations().splice( - Block::iterator(newPloop.getBody()->back()), - loops.getBody()->getOperations()); - loops.erase(); -} - void mlir::mapLoopToProcessorIds(scf::ForOp forOp, ArrayRef processorId, ArrayRef numProcessors) { assert(processorId.size() == numProcessors.size()); diff --git a/mlir/lib/Dialect/Affine/Utils/Utils.cpp b/mlir/lib/Dialect/Affine/Utils/Utils.cpp --- a/mlir/lib/Dialect/Affine/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Affine/Utils/Utils.cpp @@ -16,12 +16,14 @@ #include "mlir/Dialect/Affine/Analysis/Utils.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Affine/IR/AffineValueMap.h" +#include "mlir/Dialect/Affine/LoopUtils.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/IR/BlockAndValueMapping.h" #include "mlir/IR/Dominance.h" #include "mlir/IR/IntegerSet.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" -#include "mlir/Transforms/LoopUtils.h" + +#define DEBUG_TYPE "affine-utils" using namespace mlir; @@ -856,3 +858,740 @@ defOp->erase(); } } + +// Perform the replacement in `op`. 
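// A hedged usage sketch (hypothetical names): remap a rank-1 access through
// d0 -> d0 + 1 while prepending one extra index, on a single dereferencing op:
//   AffineExpr d0 = builder.getAffineDimExpr(0);
//   (void)replaceAllMemRefUsesWith(oldMemRef, newMemRef, loadOp,
//                                  /*extraIndices=*/{tagIndex},
//                                  /*indexRemap=*/AffineMap::get(1, 0, d0 + 1),
//                                  /*extraOperands=*/{}, /*symbolOperands=*/{},
//                                  /*allowNonDereferencingOps=*/false);
// Here newMemRef must have rank 2 (one remap result plus one extra index).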
+LogicalResult mlir::replaceAllMemRefUsesWith(Value oldMemRef, Value newMemRef, + Operation *op, + ArrayRef extraIndices, + AffineMap indexRemap, + ArrayRef extraOperands, + ArrayRef symbolOperands, + bool allowNonDereferencingOps) { + unsigned newMemRefRank = newMemRef.getType().cast().getRank(); + (void)newMemRefRank; // unused in opt mode + unsigned oldMemRefRank = oldMemRef.getType().cast().getRank(); + (void)oldMemRefRank; // unused in opt mode + if (indexRemap) { + assert(indexRemap.getNumSymbols() == symbolOperands.size() && + "symbolic operand count mismatch"); + assert(indexRemap.getNumInputs() == + extraOperands.size() + oldMemRefRank + symbolOperands.size()); + assert(indexRemap.getNumResults() + extraIndices.size() == newMemRefRank); + } else { + assert(oldMemRefRank + extraIndices.size() == newMemRefRank); + } + + // Assert same elemental type. + assert(oldMemRef.getType().cast().getElementType() == + newMemRef.getType().cast().getElementType()); + + SmallVector usePositions; + for (const auto &opEntry : llvm::enumerate(op->getOperands())) { + if (opEntry.value() == oldMemRef) + usePositions.push_back(opEntry.index()); + } + + // If memref doesn't appear, nothing to do. + if (usePositions.empty()) + return success(); + + if (usePositions.size() > 1) { + // TODO: extend it for this case when needed (rare). + assert(false && "multiple dereferencing uses in a single op not supported"); + return failure(); + } + + unsigned memRefOperandPos = usePositions.front(); + + OpBuilder builder(op); + // The following checks if op is dereferencing memref and performs the access + // index rewrites. + auto affMapAccInterface = dyn_cast(op); + if (!affMapAccInterface) { + if (!allowNonDereferencingOps) { + // Failure: memref used in a non-dereferencing context (potentially + // escapes); no replacement in these cases unless allowNonDereferencingOps + // is set. + return failure(); + } + op->setOperand(memRefOperandPos, newMemRef); + return success(); + } + // Perform index rewrites for the dereferencing op and then replace the op + NamedAttribute oldMapAttrPair = + affMapAccInterface.getAffineMapAttrForMemRef(oldMemRef); + AffineMap oldMap = oldMapAttrPair.getValue().cast().getValue(); + unsigned oldMapNumInputs = oldMap.getNumInputs(); + SmallVector oldMapOperands( + op->operand_begin() + memRefOperandPos + 1, + op->operand_begin() + memRefOperandPos + 1 + oldMapNumInputs); + + // Apply 'oldMemRefOperands = oldMap(oldMapOperands)'. + SmallVector oldMemRefOperands; + SmallVector affineApplyOps; + oldMemRefOperands.reserve(oldMemRefRank); + if (oldMap != builder.getMultiDimIdentityMap(oldMap.getNumDims())) { + for (auto resultExpr : oldMap.getResults()) { + auto singleResMap = AffineMap::get(oldMap.getNumDims(), + oldMap.getNumSymbols(), resultExpr); + auto afOp = builder.create(op->getLoc(), singleResMap, + oldMapOperands); + oldMemRefOperands.push_back(afOp); + affineApplyOps.push_back(afOp); + } + } else { + oldMemRefOperands.assign(oldMapOperands.begin(), oldMapOperands.end()); + } + + // Construct new indices as a remap of the old ones if a remapping has been + // provided. The indices of a memref come right after it, i.e., + // at position memRefOperandPos + 1. 
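// For instance (hypothetical map): with oldMemRefRank == 2 and
//   indexRemap = (d0, d1) -> (d1, d0)
// an access %old[%i, %j] is rewritten to index the new memref as [%j, %i].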
+ SmallVector remapOperands; + remapOperands.reserve(extraOperands.size() + oldMemRefRank + + symbolOperands.size()); + remapOperands.append(extraOperands.begin(), extraOperands.end()); + remapOperands.append(oldMemRefOperands.begin(), oldMemRefOperands.end()); + remapOperands.append(symbolOperands.begin(), symbolOperands.end()); + + SmallVector remapOutputs; + remapOutputs.reserve(oldMemRefRank); + + if (indexRemap && + indexRemap != builder.getMultiDimIdentityMap(indexRemap.getNumDims())) { + // Remapped indices. + for (auto resultExpr : indexRemap.getResults()) { + auto singleResMap = AffineMap::get( + indexRemap.getNumDims(), indexRemap.getNumSymbols(), resultExpr); + auto afOp = builder.create(op->getLoc(), singleResMap, + remapOperands); + remapOutputs.push_back(afOp); + affineApplyOps.push_back(afOp); + } + } else { + // No remapping specified. + remapOutputs.assign(remapOperands.begin(), remapOperands.end()); + } + + SmallVector newMapOperands; + newMapOperands.reserve(newMemRefRank); + + // Prepend 'extraIndices' in 'newMapOperands'. + for (Value extraIndex : extraIndices) { + assert(extraIndex.getDefiningOp()->getNumResults() == 1 && + "single result op's expected to generate these indices"); + assert((isValidDim(extraIndex) || isValidSymbol(extraIndex)) && + "invalid memory op index"); + newMapOperands.push_back(extraIndex); + } + + // Append 'remapOutputs' to 'newMapOperands'. + newMapOperands.append(remapOutputs.begin(), remapOutputs.end()); + + // Create new fully composed AffineMap for new op to be created. + assert(newMapOperands.size() == newMemRefRank); + auto newMap = builder.getMultiDimIdentityMap(newMemRefRank); + // TODO: Avoid creating/deleting temporary AffineApplyOps here. + fullyComposeAffineMapAndOperands(&newMap, &newMapOperands); + newMap = simplifyAffineMap(newMap); + canonicalizeMapAndOperands(&newMap, &newMapOperands); + // Remove any affine.apply's that became dead as a result of composition. + for (Value value : affineApplyOps) + if (value.use_empty()) + value.getDefiningOp()->erase(); + + OperationState state(op->getLoc(), op->getName()); + // Construct the new operation using this memref. + state.operands.reserve(op->getNumOperands() + extraIndices.size()); + // Insert the non-memref operands. + state.operands.append(op->operand_begin(), + op->operand_begin() + memRefOperandPos); + // Insert the new memref value. + state.operands.push_back(newMemRef); + + // Insert the new memref map operands. + state.operands.append(newMapOperands.begin(), newMapOperands.end()); + + // Insert the remaining operands unmodified. + state.operands.append(op->operand_begin() + memRefOperandPos + 1 + + oldMapNumInputs, + op->operand_end()); + + // Result types don't change. Both memref's are of the same elemental type. + state.types.reserve(op->getNumResults()); + for (auto result : op->getResults()) + state.types.push_back(result.getType()); + + // Add attribute for 'newMap', other Attributes do not change. + auto newMapAttr = AffineMapAttr::get(newMap); + for (auto namedAttr : op->getAttrs()) { + if (namedAttr.getName() == oldMapAttrPair.getName()) + state.attributes.push_back({namedAttr.getName(), newMapAttr}); + else + state.attributes.push_back(namedAttr); + } + + // Create the new operation. 
+  auto *repOp = builder.createOperation(state);
+  op->replaceAllUsesWith(repOp);
+  op->erase();
+
+  return success();
+}
+
+LogicalResult mlir::replaceAllMemRefUsesWith(
+    Value oldMemRef, Value newMemRef, ArrayRef<Value> extraIndices,
+    AffineMap indexRemap, ArrayRef<Value> extraOperands,
+    ArrayRef<Value> symbolOperands, Operation *domOpFilter,
+    Operation *postDomOpFilter, bool allowNonDereferencingOps,
+    bool replaceInDeallocOp) {
+  unsigned newMemRefRank = newMemRef.getType().cast<MemRefType>().getRank();
+  (void)newMemRefRank; // unused in opt mode
+  unsigned oldMemRefRank = oldMemRef.getType().cast<MemRefType>().getRank();
+  (void)oldMemRefRank;
+  if (indexRemap) {
+    assert(indexRemap.getNumSymbols() == symbolOperands.size() &&
+           "symbol operand count mismatch");
+    assert(indexRemap.getNumInputs() ==
+           extraOperands.size() + oldMemRefRank + symbolOperands.size());
+    assert(indexRemap.getNumResults() + extraIndices.size() == newMemRefRank);
+  } else {
+    assert(oldMemRefRank + extraIndices.size() == newMemRefRank);
+  }
+
+  // Assert same elemental type.
+  assert(oldMemRef.getType().cast<MemRefType>().getElementType() ==
+         newMemRef.getType().cast<MemRefType>().getElementType());
+
+  std::unique_ptr<DominanceInfo> domInfo;
+  std::unique_ptr<PostDominanceInfo> postDomInfo;
+  if (domOpFilter)
+    domInfo =
+        std::make_unique<DominanceInfo>(domOpFilter->getParentOfType<FuncOp>());
+
+  if (postDomOpFilter)
+    postDomInfo = std::make_unique<PostDominanceInfo>(
+        postDomOpFilter->getParentOfType<FuncOp>());
+
+  // Walk all uses of old memref; collect ops to perform replacement. We use a
+  // DenseSet since an operation could potentially have multiple uses of a
+  // memref (although rare), and the replacement later is going to erase ops.
+  DenseSet<Operation *> opsToReplace;
+  for (auto *op : oldMemRef.getUsers()) {
+    // Skip this use if it's not dominated by domOpFilter.
+    if (domOpFilter && !domInfo->dominates(domOpFilter, op))
+      continue;
+
+    // Skip this use if it's not post-dominated by postDomOpFilter.
+    if (postDomOpFilter && !postDomInfo->postDominates(postDomOpFilter, op))
+      continue;
+
+    // Skip dealloc's - no replacement is necessary, and a memref replacement
+    // at other uses doesn't hurt these dealloc's.
+    if (isa<memref::DeallocOp>(op) && !replaceInDeallocOp)
+      continue;
+
+    // Check if the memref was used in a non-dereferencing context. It is fine
+    // for the memref to be used in a non-dereferencing way outside of the
+    // region where this replacement is happening.
+    if (!isa<AffineMapAccessInterface>(*op)) {
+      if (!allowNonDereferencingOps) {
+        LLVM_DEBUG(llvm::dbgs()
+                   << "Memref replacement failed: non-dereferencing memref op: \n"
+                   << *op << '\n');
+        return failure();
+      }
+      // Non-dereferencing ops with the MemRefsNormalizable trait are
+      // supported for replacement.
+      if (!op->hasTrait<OpTrait::MemRefsNormalizable>()) {
+        LLVM_DEBUG(llvm::dbgs() << "Memref replacement failed: use without a "
+                                   "memrefs normalizable trait: \n"
+                                << *op << '\n');
+        return failure();
+      }
+    }
+
+    // We'll first collect and then replace --- since replacement erases the op
+    // that has the use, and that op could be postDomFilter or domFilter itself!
+    opsToReplace.insert(op);
+  }
+
+  for (auto *op : opsToReplace) {
+    if (failed(replaceAllMemRefUsesWith(
+            oldMemRef, newMemRef, op, extraIndices, indexRemap, extraOperands,
+            symbolOperands, allowNonDereferencingOps)))
+      llvm_unreachable("memref replacement guaranteed to succeed here");
+  }
+
+  return success();
+}
+
+/// Given an operation, inserts one or more single result affine
+/// apply operations, results of which are exclusively used by this
+/// operation. The operands of these newly created affine apply ops are
+/// guaranteed to be loop iterators or terminal symbols of a function.
+///
+/// Before
+///
+/// affine.for %i = 0 to #map(%N)
+///   %idx = affine.apply (d0) -> (d0 mod 2) (%i)
+///   "send"(%idx, %A, ...)
+///   "compute"(%idx)
+///
+/// After
+///
+/// affine.for %i = 0 to #map(%N)
+///   %idx = affine.apply (d0) -> (d0 mod 2) (%i)
+///   "send"(%idx, %A, ...)
+///   %idx_ = affine.apply (d0) -> (d0 mod 2) (%i)
+///   "compute"(%idx_)
+///
+/// This allows applying different transformations on send and compute (e.g.
+/// different shifts/delays).
+///
+/// Returns nothing if none of opInst's operands were the result of an
+/// affine.apply (i.e., there was no affine computation slice to create), or if
+/// all the affine.apply ops supplying operands to this opInst had no uses
+/// other than this opInst; otherwise, the affine.apply operations created are
+/// returned in the output argument `sliceOps`.
+void mlir::createAffineComputationSlice(
+    Operation *opInst, SmallVectorImpl<AffineApplyOp> *sliceOps) {
+  // Collect all operands that are results of affine apply ops.
+  SmallVector<Value, 4> subOperands;
+  subOperands.reserve(opInst->getNumOperands());
+  for (auto operand : opInst->getOperands())
+    if (isa_and_nonnull<AffineApplyOp>(operand.getDefiningOp()))
+      subOperands.push_back(operand);
+
+  // Gather sequence of AffineApplyOps reachable from 'subOperands'.
+  SmallVector<Operation *, 4> affineApplyOps;
+  getReachableAffineApplyOps(subOperands, affineApplyOps);
+  // Skip transforming if there are no affine maps to compose.
+  if (affineApplyOps.empty())
+    return;
+
+  // Check if all uses of the affine apply ops lie only in this op, in
+  // which case there would be nothing to do.
+  bool localized = true;
+  for (auto *op : affineApplyOps) {
+    for (auto result : op->getResults()) {
+      for (auto *user : result.getUsers()) {
+        if (user != opInst) {
+          localized = false;
+          break;
+        }
+      }
+    }
+  }
+  if (localized)
+    return;
+
+  OpBuilder builder(opInst);
+  SmallVector<Value, 4> composedOpOperands(subOperands);
+  auto composedMap = builder.getMultiDimIdentityMap(composedOpOperands.size());
+  fullyComposeAffineMapAndOperands(&composedMap, &composedOpOperands);
+
+  // Create an affine.apply for each of the map results.
+  sliceOps->reserve(composedMap.getNumResults());
+  for (auto resultExpr : composedMap.getResults()) {
+    auto singleResMap = AffineMap::get(composedMap.getNumDims(),
+                                       composedMap.getNumSymbols(), resultExpr);
+    sliceOps->push_back(builder.create<AffineApplyOp>(
+        opInst->getLoc(), singleResMap, composedOpOperands));
+  }
+
+  // Construct the new operands that include the results from the composed
+  // affine apply op above instead of existing ones (subOperands). So, they
+  // differ from opInst's operands only for those operands in 'subOperands', for
+  // which they will be replaced by the corresponding one from 'sliceOps'.
+  SmallVector<Value, 4> newOperands(opInst->getOperands());
+  for (unsigned i = 0, e = newOperands.size(); i < e; i++) {
+    // Replace the subOperands from among the new operands.
+    unsigned j, f;
+    for (j = 0, f = subOperands.size(); j < f; j++) {
+      if (newOperands[i] == subOperands[j])
+        break;
+    }
+    if (j < subOperands.size()) {
+      newOperands[i] = (*sliceOps)[j];
+    }
+  }
+  for (unsigned idx = 0, e = newOperands.size(); idx < e; idx++) {
+    opInst->setOperand(idx, newOperands[idx]);
+  }
+}
+
+/// Enum to set patterns of affine expr in tiled-layout map.
+/// TileFloorDiv: <dim expr> div <tile size>
+/// TileMod: <dim expr> mod <tile size>
+/// TileNone: None of the above
+/// Example:
+/// #tiled_2d_128x256 = affine_map<(d0, d1)
+///                      -> (d0 div 128, d1 div 256, d0 mod 128, d1 mod 256)>
+/// "d0 div 128" and "d1 div 256" ==> TileFloorDiv
+/// "d0 mod 128" and "d1 mod 256" ==> TileMod
+enum TileExprPattern { TileFloorDiv, TileMod, TileNone };
+
+/// Check if `map` is a tiled layout. In a tiled layout, specific k dimensions
+/// that are floordiv'ed by respective tile sizes appear in a mod with the same
+/// tile sizes, and no other expression involves those k dimensions. This
+/// function stores a vector of tuples (`tileSizePos`) including the AffineExpr
+/// for each tile size and the positions of the corresponding `floordiv` and
+/// `mod`. If `map` is not a tiled layout, an empty vector is returned.
+static LogicalResult getTileSizePos(
+    AffineMap map,
+    SmallVectorImpl<std::tuple<AffineExpr, unsigned, unsigned>> &tileSizePos) {
+  // Create `floordivExprs` which is a vector of tuples including LHS and RHS of
+  // `floordiv` and its position in `map` output.
+  // Example: #tiled_2d_128x256 = affine_map<(d0, d1)
+  //                -> (d0 div 128, d1 div 256, d0 mod 128, d1 mod 256)>
+  // In this example, `floordivExprs` includes {d0, 128, 0} and {d1, 256, 1}.
+  SmallVector<std::tuple<AffineExpr, AffineExpr, unsigned>, 4> floordivExprs;
+  unsigned pos = 0;
+  for (AffineExpr expr : map.getResults()) {
+    if (expr.getKind() == AffineExprKind::FloorDiv) {
+      AffineBinaryOpExpr binaryExpr = expr.cast<AffineBinaryOpExpr>();
+      if (binaryExpr.getRHS().isa<AffineConstantExpr>())
+        floordivExprs.emplace_back(
+            std::make_tuple(binaryExpr.getLHS(), binaryExpr.getRHS(), pos));
+    }
+    pos++;
+  }
+  // Not a tiled layout if `floordivExprs` is empty.
+  if (floordivExprs.empty()) {
+    tileSizePos = SmallVector<std::tuple<AffineExpr, unsigned, unsigned>>{};
+    return success();
+  }
+
+  // Check if the LHS of each `floordiv` is used in the LHS of a `mod`. If not,
+  // `map` is not a tiled layout.
+  for (std::tuple<AffineExpr, AffineExpr, unsigned> fexpr : floordivExprs) {
+    AffineExpr floordivExprLHS = std::get<0>(fexpr);
+    AffineExpr floordivExprRHS = std::get<1>(fexpr);
+    unsigned floordivPos = std::get<2>(fexpr);
+
+    // Walk each affine expr of `map`'s output except `fexpr`, and check if the
+    // LHS and RHS of `fexpr` are used in the LHS and RHS of a `mod`. If the LHS
+    // of `fexpr` is used in any other expr, the map is not a tiled layout.
+    // Examples of non-tiled layouts:
+    //   affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2 floordiv 256)>
+    //   affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2 mod 128)>
+    //   affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2 mod 256, d2 mod
+    //   256)>
+    bool found = false;
+    pos = 0;
+    for (AffineExpr expr : map.getResults()) {
+      bool notTiled = false;
+      if (pos != floordivPos) {
+        expr.walk([&](AffineExpr e) {
+          if (e == floordivExprLHS) {
+            if (expr.getKind() == AffineExprKind::Mod) {
+              AffineBinaryOpExpr binaryExpr = expr.cast<AffineBinaryOpExpr>();
+              // If LHS and RHS of `mod` are the same as those of the floordiv.
+              if (floordivExprLHS == binaryExpr.getLHS() &&
+                  floordivExprRHS == binaryExpr.getRHS()) {
+                // Save the tile size (RHS of `mod`), and the positions of
+                // `floordiv` and `mod`, if no matching `mod` was found yet.
+                if (!found) {
+                  tileSizePos.emplace_back(
+                      std::make_tuple(binaryExpr.getRHS(), floordivPos, pos));
+                  found = true;
+                } else {
+                  // Non-tiled layout: multiple `mod`s with the same LHS.
+                  // eg. affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2
+                  // mod 256, d2 mod 256)>
+                  notTiled = true;
+                }
+              } else {
+                // Non-tiled layout: RHS of `mod` differs from the `floordiv`.
+                // eg. affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2
+                // mod 128)>
+                notTiled = true;
+              }
+            } else {
+              // Non-tiled layout: the LHS is the same, but not in a `mod`.
+              // eg. affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2
+              // floordiv 256)>
+              notTiled = true;
+            }
+          }
+        });
+      }
+      if (notTiled) {
+        tileSizePos = SmallVector<std::tuple<AffineExpr, unsigned, unsigned>>{};
+        return success();
+      }
+      pos++;
+    }
+  }
+  return success();
+}
+
+/// Check if the `dim` dimension of a memrefType with `layoutMap` becomes
+/// dynamic after normalization. Dimensions that include dynamic dimensions in
+/// the map output become dynamic dimensions. Return true if `dim` is a dynamic
+/// dimension.
+///
+/// Example:
+/// #map0 = affine_map<(d0, d1) -> (d0, d1 floordiv 32, d1 mod 32)>
+///
+/// If d1 is a dynamic dimension, the 2nd and 3rd dimensions of the map output
+/// are dynamic.
+/// memref<4x?xf32, #map0> ==> memref<4x?x?xf32>
+static bool
+isNormalizedMemRefDynamicDim(unsigned dim, AffineMap layoutMap,
+                             SmallVectorImpl<unsigned> &inMemrefTypeDynDims,
+                             MLIRContext *context) {
+  bool isDynamicDim = false;
+  AffineExpr expr = layoutMap.getResults()[dim];
+  // Check if the affine expr of this dimension includes a dynamic dimension of
+  // the input memrefType.
+  expr.walk([&inMemrefTypeDynDims, &isDynamicDim, &context](AffineExpr e) {
+    if (e.isa<AffineDimExpr>()) {
+      for (unsigned dm : inMemrefTypeDynDims) {
+        if (e == getAffineDimExpr(dm, context)) {
+          isDynamicDim = true;
+        }
+      }
+    }
+  });
+  return isDynamicDim;
+}
+
+/// Create affine expr to calculate dimension size for a tiled-layout map.
+static AffineExpr createDimSizeExprForTiledLayout(AffineExpr oldMapOutput,
+                                                  TileExprPattern pat) {
+  // Create map output for the patterns.
+  // "floordiv <tile size>" ==> "ceildiv <tile size>"
+  // "mod <tile size>" ==> "<tile size>"
+  AffineExpr newMapOutput;
+  AffineBinaryOpExpr binaryExpr = nullptr;
+  switch (pat) {
+  case TileExprPattern::TileMod:
+    binaryExpr = oldMapOutput.cast<AffineBinaryOpExpr>();
+    newMapOutput = binaryExpr.getRHS();
+    break;
+  case TileExprPattern::TileFloorDiv:
+    binaryExpr = oldMapOutput.cast<AffineBinaryOpExpr>();
+    newMapOutput = getAffineBinaryOpExpr(
+        AffineExprKind::CeilDiv, binaryExpr.getLHS(), binaryExpr.getRHS());
+    break;
+  default:
+    newMapOutput = oldMapOutput;
+  }
+  return newMapOutput;
+}
+
+/// Create new maps to calculate each dimension size of `newMemRefType`, and
+/// create `newDynamicSizes` from them by using AffineApplyOp.
+///
+/// Steps for normalizing dynamic memrefs for a tiled layout map.
+/// Example:
+/// #map0 = affine_map<(d0, d1) -> (d0, d1 floordiv 32, d1 mod 32)>
+/// %0 = dim %arg0, %c1 : memref<4x?xf32>
+/// %1 = alloc(%0) : memref<4x?xf32, #map0>
+///
+/// (Before this function)
+/// 1. Check if `map`(#map0) is a tiled layout using `getTileSizePos()`. Only a
+///    single layout map is supported.
+///
+/// 2. Create a normalized memrefType using `isNormalizedMemRefDynamicDim()`. It
+///    is memref<4x?x?xf32> in the above example.
+///
+/// (In this function)
+/// 3. Create new maps to calculate each dimension of the normalized memrefType
+///    using `createDimSizeExprForTiledLayout()`. In the tiled layout, the
+///    dimension size can be calculated by replacing "floordiv <tile size>" with
+///    "ceildiv <tile size>" and "mod <tile size>" with "<tile size>".
+///    - New maps in the above example:
+///      #map0 = affine_map<(d0, d1) -> (d0)>
+///      #map1 = affine_map<(d0, d1) -> (d1 ceildiv 32)>
+///      #map2 = affine_map<(d0, d1) -> (32)>
+///
+/// 4. Create AffineApplyOp to apply the new maps. The output of AffineApplyOp
+///    is used in dynamicSizes of the new AllocOp.
+///    %0 = dim %arg0, %c1 : memref<4x?xf32>
+///    %c4 = arith.constant 4 : index
+///    %1 = affine.apply #map1(%c4, %0)
+///    %2 = affine.apply #map2(%c4, %0)
+static void createNewDynamicSizes(MemRefType oldMemRefType,
+                                  MemRefType newMemRefType, AffineMap map,
+                                  memref::AllocOp *allocOp, OpBuilder b,
+                                  SmallVectorImpl<Value> &newDynamicSizes) {
+  // Create new inputs for the AffineApplyOps.
+  SmallVector<Value, 4> inAffineApply;
+  ArrayRef<int64_t> oldMemRefShape = oldMemRefType.getShape();
+  unsigned dynIdx = 0;
+  for (unsigned d = 0; d < oldMemRefType.getRank(); ++d) {
+    if (oldMemRefShape[d] < 0) {
+      // Use dynamicSizes of allocOp for the dynamic dimension.
+      inAffineApply.emplace_back(allocOp->dynamicSizes()[dynIdx]);
+      dynIdx++;
+    } else {
+      // Create a ConstantOp for the static dimension.
+      Attribute constantAttr =
+          b.getIntegerAttr(b.getIndexType(), oldMemRefShape[d]);
+      inAffineApply.emplace_back(
+          b.create<arith::ConstantOp>(allocOp->getLoc(), constantAttr));
+    }
+  }
+
+  // Create a new map to calculate each dimension size of the new memref for
+  // each original map output. Only for dynamic dimensions of `newMemRefType`.
+  unsigned newDimIdx = 0;
+  ArrayRef<int64_t> newMemRefShape = newMemRefType.getShape();
+  SmallVector<std::tuple<AffineExpr, unsigned, unsigned>> tileSizePos;
+  (void)getTileSizePos(map, tileSizePos);
+  for (AffineExpr expr : map.getResults()) {
+    if (newMemRefShape[newDimIdx] < 0) {
+      // Create new maps to calculate each dimension size of the new memref.
+      enum TileExprPattern pat = TileExprPattern::TileNone;
+      for (auto pos : tileSizePos) {
+        if (newDimIdx == std::get<1>(pos))
+          pat = TileExprPattern::TileFloorDiv;
+        else if (newDimIdx == std::get<2>(pos))
+          pat = TileExprPattern::TileMod;
+      }
+      AffineExpr newMapOutput = createDimSizeExprForTiledLayout(expr, pat);
+      AffineMap newMap =
+          AffineMap::get(map.getNumInputs(), map.getNumSymbols(), newMapOutput);
+      Value affineApp =
+          b.create<AffineApplyOp>(allocOp->getLoc(), newMap, inAffineApply);
+      newDynamicSizes.emplace_back(affineApp);
+    }
+    newDimIdx++;
+  }
+}
+
+// TODO: Currently works for static memrefs with a single layout map.
+LogicalResult mlir::normalizeMemRef(memref::AllocOp *allocOp) {
+  MemRefType memrefType = allocOp->getType();
+  OpBuilder b(*allocOp);
+
+  // Fetch a new memref type after normalizing the old memref to have an
+  // identity map layout.
+  MemRefType newMemRefType =
+      normalizeMemRefType(memrefType, b, allocOp->symbolOperands().size());
+  if (newMemRefType == memrefType)
+    // Either memrefType already had an identity map or the map couldn't be
+    // transformed to an identity map.
+    return failure();
+
+  Value oldMemRef = allocOp->getResult();
+
+  SmallVector<Value, 4> symbolOperands(allocOp->symbolOperands());
+  AffineMap layoutMap = memrefType.getLayout().getAffineMap();
+  memref::AllocOp newAlloc;
+  // Check if `layoutMap` is a tiled layout. Only a single layout map is
+  // supported for normalizing dynamic memrefs.
+  SmallVector<std::tuple<AffineExpr, unsigned, unsigned>> tileSizePos;
+  (void)getTileSizePos(layoutMap, tileSizePos);
+  if (newMemRefType.getNumDynamicDims() > 0 && !tileSizePos.empty()) {
+    MemRefType oldMemRefType = oldMemRef.getType().cast<MemRefType>();
+    SmallVector<Value, 4> newDynamicSizes;
+    createNewDynamicSizes(oldMemRefType, newMemRefType, layoutMap, allocOp, b,
+                          newDynamicSizes);
+    // Add the new dynamic sizes in the new AllocOp.
+    newAlloc =
+        b.create<memref::AllocOp>(allocOp->getLoc(), newMemRefType,
+                                  newDynamicSizes, allocOp->alignmentAttr());
+  } else {
+    newAlloc = b.create<memref::AllocOp>(allocOp->getLoc(), newMemRefType,
+                                         allocOp->alignmentAttr());
+  }
+  // Replace all uses of the old memref.
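// E.g. (hedged sketch), with the tiled layout from the comments above,
//   layoutMap = (d0, d1) -> (d0, d1 floordiv 32, d1 mod 32),
// an access %old[%i, %j] becomes %new[%i, %j floordiv 32, %j mod 32], since
// layoutMap itself is passed as the index remap below.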
+  if (failed(replaceAllMemRefUsesWith(oldMemRef, /*newMemRef=*/newAlloc,
+                                      /*extraIndices=*/{},
+                                      /*indexRemap=*/layoutMap,
+                                      /*extraOperands=*/{},
+                                      /*symbolOperands=*/symbolOperands,
+                                      /*domOpFilter=*/nullptr,
+                                      /*postDomOpFilter=*/nullptr,
+                                      /*allowNonDereferencingOps=*/true))) {
+    // If it failed (due to escapes for example), bail out.
+    newAlloc.erase();
+    return failure();
+  }
+  // Replace any uses of the original alloc op and erase it. All remaining uses
+  // have to be dealloc's; RAMUW above would've failed otherwise.
+  assert(llvm::all_of(oldMemRef.getUsers(), [](Operation *op) {
+    return isa<memref::DeallocOp>(op);
+  }));
+  oldMemRef.replaceAllUsesWith(newAlloc);
+  allocOp->erase();
+  return success();
+}
+
+MemRefType mlir::normalizeMemRefType(MemRefType memrefType, OpBuilder b,
+                                     unsigned numSymbolicOperands) {
+  unsigned rank = memrefType.getRank();
+  if (rank == 0)
+    return memrefType;
+
+  if (memrefType.getLayout().isIdentity()) {
+    // Either no map is associated with this memref or this memref has
+    // a trivial (identity) map.
+    return memrefType;
+  }
+  AffineMap layoutMap = memrefType.getLayout().getAffineMap();
+
+  // We don't do any checks for one-to-one'ness; we assume that it is
+  // one-to-one.
+
+  // Normalize only static memrefs and dynamic memrefs with a tiled-layout map
+  // for now.
+  // TODO: Normalize the other types of dynamic memrefs.
+  SmallVector<std::tuple<AffineExpr, unsigned, unsigned>> tileSizePos;
+  (void)getTileSizePos(layoutMap, tileSizePos);
+  if (memrefType.getNumDynamicDims() > 0 && tileSizePos.empty())
+    return memrefType;
+
+  // We have a single map that is not an identity map. Create a new memref
+  // with the right shape and an identity layout map.
+  ArrayRef<int64_t> shape = memrefType.getShape();
+  // FlatAffineConstraints may later on use symbolicOperands.
+  FlatAffineConstraints fac(rank, numSymbolicOperands);
+  SmallVector<unsigned, 4> memrefTypeDynDims;
+  for (unsigned d = 0; d < rank; ++d) {
+    // Use the constraint system only for static dimensions.
+    if (shape[d] > 0) {
+      fac.addBound(FlatAffineConstraints::LB, d, 0);
+      fac.addBound(FlatAffineConstraints::UB, d, shape[d] - 1);
+    } else {
+      memrefTypeDynDims.emplace_back(d);
+    }
+  }
+  // We compose this map with the original index (logical) space to derive
+  // the upper bounds for the new index space.
+  unsigned newRank = layoutMap.getNumResults();
+  if (failed(fac.composeMatchingMap(layoutMap)))
+    return memrefType;
+  // TODO: Handle semi-affine maps.
+  // Project out the old data dimensions.
+  fac.projectOut(newRank, fac.getNumIds() - newRank - fac.getNumLocalIds());
+  SmallVector<int64_t, 4> newShape(newRank);
+  for (unsigned d = 0; d < newRank; ++d) {
+    // Check whether each dimension of the normalized memrefType is dynamic.
+    bool isDynDim = isNormalizedMemRefDynamicDim(
+        d, layoutMap, memrefTypeDynDims, b.getContext());
+    if (isDynDim) {
+      newShape[d] = -1;
+    } else {
+      // The lower bound for the shape is always zero.
+      auto ubConst = fac.getConstantBound(FlatAffineConstraints::UB, d);
+      // For a static memref and an affine map with no symbols, this is
+      // always bounded.
+      assert(ubConst.hasValue() && "should always have an upper bound");
+      if (ubConst.getValue() < 0)
+        // This is due to an invalid map that maps to a negative space.
+        return memrefType;
+      // If a dimension of the new memrefType is dynamic, the value is -1.
+      newShape[d] = ubConst.getValue() + 1;
+    }
+  }
+
+  // Create the new memref type after trivializing the old layout map.
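// For example (static case, assumed type): with
//   memref<64x128xf32, (d0, d1) -> (d0 floordiv 32, d1, d0 mod 32)>
// the composed bounds give newRank = 3 with upper bounds 1, 127, and 31, so
// the builder below produces memref<2x128x32xf32> with an identity layout.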
+ MemRefType newMemRefType = + MemRefType::Builder(memrefType) + .setShape(newShape) + .setLayout(AffineMapAttr::get(b.getMultiDimIdentityMap(newRank))); + + return newMemRefType; +} diff --git a/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp b/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp --- a/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "mlir/Dialect/GPU/MemoryPromotion.h" +#include "mlir/Dialect/Affine/LoopUtils.h" #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h" #include "mlir/Dialect/GPU/GPUDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" @@ -19,7 +20,6 @@ #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/IR/ImplicitLocOpBuilder.h" #include "mlir/Pass/Pass.h" -#include "mlir/Transforms/LoopUtils.h" using namespace mlir; using namespace mlir::gpu; diff --git a/mlir/lib/Dialect/Linalg/Transforms/CodegenStrategy.cpp b/mlir/lib/Dialect/Linalg/Transforms/CodegenStrategy.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/CodegenStrategy.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/CodegenStrategy.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "mlir/Dialect/Linalg/Transforms/CodegenStrategy.h" - #include "mlir/Dialect/Linalg/Passes.h" #include "mlir/Dialect/Linalg/Transforms/Hoisting.h" #include "mlir/Dialect/SCF/Transforms.h" @@ -20,7 +19,6 @@ #include "mlir/Dialect/Vector/VectorTransforms.h" #include "mlir/Pass/PassManager.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" -#include "mlir/Transforms/LoopUtils.h" #include "mlir/Transforms/Passes.h" using namespace mlir; diff --git a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp @@ -12,7 +12,6 @@ #include "mlir/Dialect/Linalg/Transforms/HoistPadding.h" #include "mlir/Analysis/SliceAnalysis.h" -#include "mlir/Dialect/Affine/Utils.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/Linalg/Transforms/Transforms.h" #include "mlir/Dialect/SCF/SCF.h" @@ -24,7 +23,6 @@ #include "mlir/IR/AsmState.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/Dominance.h" -#include "mlir/Transforms/LoopUtils.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Debug.h" diff --git a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp @@ -15,7 +15,6 @@ #include "mlir/Analysis/SliceAnalysis.h" #include "mlir/Dialect/Affine/Analysis/AffineStructures.h" #include "mlir/Dialect/Affine/IR/AffineValueMap.h" -#include "mlir/Dialect/Affine/Utils.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/Linalg/Transforms/Transforms.h" #include "mlir/Dialect/SCF/SCF.h" @@ -27,7 +26,6 @@ #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/Dominance.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" -#include "mlir/Transforms/LoopUtils.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Debug.h" diff --git a/mlir/lib/Dialect/Linalg/Transforms/LinalgStrategyPasses.cpp b/mlir/lib/Dialect/Linalg/Transforms/LinalgStrategyPasses.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/LinalgStrategyPasses.cpp +++ 
b/mlir/lib/Dialect/Linalg/Transforms/LinalgStrategyPasses.cpp
@@ -16,6 +16,8 @@
 #include "PassDetail.h"
 #include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
+#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/Linalg/Passes.h"
 #include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
@@ -29,9 +31,7 @@
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-#include "mlir/Transforms/LoopUtils.h"
 #include "mlir/Transforms/Passes.h"
-#include "mlir/Transforms/Utils.h"
 
 using namespace mlir;
 using namespace mlir::vector;
@@ -348,7 +348,13 @@
       return signalPassFailure();
     }
 
-    promoteSingleIterationLoops(funcOp);
+    // Gathers all innermost loops through a post order pruned walk.
+    funcOp.walk([](Operation *op) {
+      if (auto forOp = dyn_cast<AffineForOp>(op))
+        (void)promoteIfSingleIteration(forOp);
+      else if (auto forOp = dyn_cast<scf::ForOp>(op))
+        (void)promoteIfSingleIteration(forOp);
+    });
 
     if (options.hoistRedundantVectorTransfers)
       hoistRedundantVectorTransfers(funcOp);
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
@@ -12,7 +12,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
-#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/Dialect/Linalg/Analysis/DependenceAnalysis.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
--- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
@@ -16,6 +16,7 @@
 #include "mlir/Dialect/Affine/Analysis/AffineStructures.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
@@ -31,7 +32,6 @@
 #include "mlir/IR/Matchers.h"
 #include "mlir/IR/OpImplementation.h"
 #include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/LoopUtils.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/Debug.h"
diff --git a/mlir/lib/Dialect/MemRef/Transforms/NormalizeMemRefs.cpp b/mlir/lib/Dialect/MemRef/Transforms/NormalizeMemRefs.cpp
--- a/mlir/lib/Dialect/MemRef/Transforms/NormalizeMemRefs.cpp
+++ b/mlir/lib/Dialect/MemRef/Transforms/NormalizeMemRefs.cpp
@@ -13,9 +13,9 @@
 #include "PassDetail.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/MemRef/Transforms/Passes.h"
-#include "mlir/Transforms/Utils.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/Support/Debug.h"
diff --git a/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt b/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt
--- a/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt
@@ -6,6 +6,7 @@
   LoopPipelining.cpp
   LoopRangeFolding.cpp
   LoopSpecialization.cpp
+  ParallelLoopCollapsing.cpp
   ParallelLoopFusion.cpp
   ParallelLoopTiling.cpp
   StructuralTypeConversions.cpp
diff --git a/mlir/lib/Transforms/ParallelLoopCollapsing.cpp b/mlir/lib/Dialect/SCF/Transforms/ParallelLoopCollapsing.cpp
rename from mlir/lib/Transforms/ParallelLoopCollapsing.cpp
rename to mlir/lib/Dialect/SCF/Transforms/ParallelLoopCollapsing.cpp
--- a/mlir/lib/Transforms/ParallelLoopCollapsing.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/ParallelLoopCollapsing.cpp
@@ -7,9 +7,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "PassDetail.h"
+#include "mlir/Dialect/SCF/Passes.h"
 #include "mlir/Dialect/SCF/SCF.h"
-#include "mlir/Transforms/LoopUtils.h"
-#include "mlir/Transforms/Passes.h"
+#include "mlir/Dialect/SCF/Utils.h"
 #include "mlir/Transforms/RegionUtils.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -20,7 +20,7 @@
 namespace {
 struct ParallelLoopCollapsing
-    : public ParallelLoopCollapsingBase<ParallelLoopCollapsing> {
+    : public SCFParallelLoopCollapsingBase<ParallelLoopCollapsing> {
   void runOnOperation() override {
     Operation *module = getOperation();
diff --git a/mlir/lib/Dialect/SCF/Transforms/Utils.cpp b/mlir/lib/Dialect/SCF/Transforms/Utils.cpp
--- a/mlir/lib/Dialect/SCF/Transforms/Utils.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/Utils.cpp
@@ -11,20 +11,31 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/SCF/Utils.h"
-
+#include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/Dialect/SCF/SCF.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/IR/BlockAndValueMapping.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/PatternMatch.h"
+#include "mlir/Support/MathExtras.h"
 #include "mlir/Transforms/RegionUtils.h"
-
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
 
 using namespace mlir;
 
+namespace {
+// This structure is used to pass and return sets of loop parameters without
+// confusing the order.
+struct LoopParams {
+  Value lowerBound;
+  Value upperBound;
+  Value step;
+};
+} // namespace
+
 scf::ForOp mlir::cloneWithNewYields(OpBuilder &b, scf::ForOp loop,
                                     ValueRange newIterOperands,
                                     ValueRange newYieldedValues,
@@ -230,3 +241,682 @@
   }
   return rootEnclosesPloops;
 }
+
+// Build the IR that performs ceil division of a positive value by a constant:
+//    ceildiv(a, B) = divis(a + (B - 1), B)
+// where divis is rounding-to-zero division.
+static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend,
+                             int64_t divisor) {
+  assert(divisor > 0 && "expected positive divisor");
+  assert(dividend.getType().isIndex() && "expected index-typed value");
+
+  Value divisorMinusOneCst =
+      builder.create<arith::ConstantIndexOp>(loc, divisor - 1);
+  Value divisorCst = builder.create<arith::ConstantIndexOp>(loc, divisor);
+  Value sum = builder.create<arith::AddIOp>(loc, dividend, divisorMinusOneCst);
+  return builder.create<arith::DivSIOp>(loc, sum, divisorCst);
+}
+
+// Build the IR that performs ceil division of a positive value by another
+// positive value:
+//    ceildiv(a, b) = divis(a + (b - 1), b)
+// where divis is rounding-to-zero division.
+static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend,
+                             Value divisor) {
+  assert(dividend.getType().isIndex() && "expected index-typed value");
+
+  Value cstOne = builder.create<arith::ConstantIndexOp>(loc, 1);
+  Value divisorMinusOne = builder.create<arith::SubIOp>(loc, divisor, cstOne);
+  Value sum = builder.create<arith::AddIOp>(loc, dividend, divisorMinusOne);
+  return builder.create<arith::DivSIOp>(loc, sum, divisor);
+}
+
+/// Helper to replace uses of loop carried values (iter_args) and loop
+/// yield values while promoting single iteration scf.for ops.
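+/// For example (a sketch, with hypothetical values):
+///   %r = scf.for ... iter_args(%acc = %init) -> (f32) {
+///     %sum = arith.addf %acc, %x : f32
+///     scf.yield %sum : f32
+///   }
+/// Uses of %acc inside the body are replaced by %init, and uses of %r below
+/// the loop are replaced by %sum.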
+static void replaceIterArgsAndYieldResults(scf::ForOp forOp) {
+  // Replace uses of iter arguments with iter operands (initial values).
+  auto iterOperands = forOp.getIterOperands();
+  auto iterArgs = forOp.getRegionIterArgs();
+  for (auto e : llvm::zip(iterOperands, iterArgs))
+    std::get<1>(e).replaceAllUsesWith(std::get<0>(e));
+
+  // Replace uses of loop results with the values yielded by the loop.
+  auto outerResults = forOp.getResults();
+  auto innerResults = forOp.getBody()->getTerminator()->getOperands();
+  for (auto e : llvm::zip(outerResults, innerResults))
+    std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
+}
+
+/// Promotes the loop body of a forOp to its containing block if it can be
+/// determined that the loop has a single iteration.
+LogicalResult mlir::promoteIfSingleIteration(scf::ForOp forOp) {
+  auto lbCstOp = forOp.getLowerBound().getDefiningOp<arith::ConstantIndexOp>();
+  auto ubCstOp = forOp.getUpperBound().getDefiningOp<arith::ConstantIndexOp>();
+  auto stepCstOp = forOp.getStep().getDefiningOp<arith::ConstantIndexOp>();
+  if (!lbCstOp || !ubCstOp || !stepCstOp || lbCstOp.value() < 0 ||
+      ubCstOp.value() < 0 || stepCstOp.value() < 0)
+    return failure();
+  int64_t tripCount =
+      mlir::ceilDiv(ubCstOp.value() - lbCstOp.value(), stepCstOp.value());
+  if (tripCount != 1)
+    return failure();
+  auto iv = forOp.getInductionVar();
+  iv.replaceAllUsesWith(lbCstOp);
+
+  replaceIterArgsAndYieldResults(forOp);
+
+  // Move the loop body operations, except for its terminator, to the loop's
+  // containing block.
+  auto *parentBlock = forOp->getBlock();
+  forOp.getBody()->getTerminator()->erase();
+  parentBlock->getOperations().splice(Block::iterator(forOp),
+                                      forOp.getBody()->getOperations());
+  forOp.erase();
+  return success();
+}
+
+/// Generates unrolled copies of the scf::ForOp body in 'loopBodyBlock', with
+/// associated induction variable 'forOpIV', by 'unrollFactor', calling
+/// 'ivRemapFn' to remap 'forOpIV' for each unrolled body. If specified,
+/// annotates the Ops in each unrolled iteration using annotateFn.
+static void generateUnrolledLoop(
+    Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,
+    function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
+    function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
+    ValueRange iterArgs, ValueRange yieldedValues) {
+  // Builder to insert unrolled bodies just before the terminator of the body
+  // of 'forOp'.
+  auto builder = OpBuilder::atBlockTerminator(loopBodyBlock);
+
+  if (!annotateFn)
+    annotateFn = [](unsigned, Operation *, OpBuilder) {};
+
+  // Keep a pointer to the last non-terminator operation in the original block
+  // so that we know what to clone (since we are doing this in-place).
+  Block::iterator srcBlockEnd = std::prev(loopBodyBlock->end(), 2);
+
+  // Unroll the contents of 'forOp' (append unrollFactor - 1 additional
+  // copies).
+  SmallVector<Value, 4> lastYielded(yieldedValues);
+
+  for (unsigned i = 1; i < unrollFactor; i++) {
+    BlockAndValueMapping operandMap;
+
+    // Prepare operand map.
+    operandMap.map(iterArgs, lastYielded);
+
+    // If the induction variable is used, create a remapping to the value for
+    // this unrolled instance.
+    if (!forOpIV.use_empty()) {
+      Value ivUnroll = ivRemapFn(i, forOpIV, builder);
+      operandMap.map(forOpIV, ivUnroll);
+    }
+
+    // Clone the original body of 'forOp'.
+    for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd);
+         it++) {
+      Operation *clonedOp = builder.clone(*it, operandMap);
+      annotateFn(i, clonedOp, builder);
+    }
+
+    // Update yielded values.
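+    // The values yielded by this copy become the operand map's images of the
+    // iter_args, so the next unrolled copy (or the final yield) consumes the
+    // results produced here, preserving the iter_args chain.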
+    for (unsigned i = 0, e = lastYielded.size(); i < e; i++)
+      lastYielded[i] = operandMap.lookup(yieldedValues[i]);
+  }
+
+  // Make sure we annotate the Ops in the original body. We do this last so
+  // that any annotations are not copied into the cloned Ops above.
+  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++)
+    annotateFn(0, &*it, builder);
+
+  // Update operands of the yield statement.
+  loopBodyBlock->getTerminator()->setOperands(lastYielded);
+}
+
+/// Unrolls 'forOp' by 'unrollFactor' and returns success if the loop is
+/// unrolled.
+LogicalResult mlir::loopUnrollByFactor(
+    scf::ForOp forOp, uint64_t unrollFactor,
+    function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn) {
+  assert(unrollFactor > 0 && "expected positive unroll factor");
+
+  // Return if the loop body is empty.
+  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
+    return success();
+
+  // Compute tripCount = ceilDiv((upperBound - lowerBound), step) and populate
+  // 'upperBoundUnrolled' and 'stepUnrolled' for static and dynamic cases.
+  OpBuilder boundsBuilder(forOp);
+  auto loc = forOp.getLoc();
+  auto step = forOp.getStep();
+  Value upperBoundUnrolled;
+  Value stepUnrolled;
+  bool generateEpilogueLoop = true;
+
+  auto lbCstOp = forOp.getLowerBound().getDefiningOp<arith::ConstantIndexOp>();
+  auto ubCstOp = forOp.getUpperBound().getDefiningOp<arith::ConstantIndexOp>();
+  auto stepCstOp = forOp.getStep().getDefiningOp<arith::ConstantIndexOp>();
+  if (lbCstOp && ubCstOp && stepCstOp) {
+    // Constant loop bounds computation.
+    int64_t lbCst = lbCstOp.value();
+    int64_t ubCst = ubCstOp.value();
+    int64_t stepCst = stepCstOp.value();
+    assert(lbCst >= 0 && ubCst >= 0 && stepCst >= 0 &&
+           "expected positive loop bounds and step");
+    int64_t tripCount = mlir::ceilDiv(ubCst - lbCst, stepCst);
+
+    if (unrollFactor == 1) {
+      if (tripCount == 1 && failed(promoteIfSingleIteration(forOp)))
+        return failure();
+      return success();
+    }
+
+    int64_t tripCountEvenMultiple = tripCount - (tripCount % unrollFactor);
+    int64_t upperBoundUnrolledCst = lbCst + tripCountEvenMultiple * stepCst;
+    assert(upperBoundUnrolledCst <= ubCst);
+    int64_t stepUnrolledCst = stepCst * unrollFactor;
+
+    // Create constant for 'upperBoundUnrolled' and set epilogue loop flag.
+    generateEpilogueLoop = upperBoundUnrolledCst < ubCst;
+    if (generateEpilogueLoop)
+      upperBoundUnrolled = boundsBuilder.create<arith::ConstantIndexOp>(
+          loc, upperBoundUnrolledCst);
+    else
+      upperBoundUnrolled = ubCstOp;
+
+    // Create constant for 'stepUnrolled'.
+    stepUnrolled = stepCst == stepUnrolledCst
+                       ? step
+                       : boundsBuilder.create<arith::ConstantIndexOp>(
+                             loc, stepUnrolledCst);
+  } else {
+    // Dynamic loop bounds computation.
+    // TODO: Add dynamic asserts for negative lb/ub/step, or
+    // consider using ceilDiv from AffineApplyExpander.
+    auto lowerBound = forOp.getLowerBound();
+    auto upperBound = forOp.getUpperBound();
+    Value diff =
+        boundsBuilder.create<arith::SubIOp>(loc, upperBound, lowerBound);
+    Value tripCount = ceilDivPositive(boundsBuilder, loc, diff, step);
+    Value unrollFactorCst =
+        boundsBuilder.create<arith::ConstantIndexOp>(loc, unrollFactor);
+    Value tripCountRem =
+        boundsBuilder.create<arith::RemSIOp>(loc, tripCount, unrollFactorCst);
+    // Compute tripCountEvenMultiple = tripCount - (tripCount % unrollFactor)
+    Value tripCountEvenMultiple =
+        boundsBuilder.create<arith::SubIOp>(loc, tripCount, tripCountRem);
+    // Compute upperBoundUnrolled = lowerBound + tripCountEvenMultiple * step
+    upperBoundUnrolled = boundsBuilder.create<arith::AddIOp>(
+        loc, lowerBound,
+        boundsBuilder.create<arith::MulIOp>(loc, tripCountEvenMultiple, step));
+    // Scale 'step' by 'unrollFactor'.
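+    // As a worked example (a sketch): lb = 0, ub = 10, step = 1 and
+    // unrollFactor = 4 give tripCount = 10, tripCountRem = 2,
+    // tripCountEvenMultiple = 8, upperBoundUnrolled = 8 and stepUnrolled = 4;
+    // the epilogue loop below then covers iterations 8 and 9.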
+    stepUnrolled =
+        boundsBuilder.create<arith::MulIOp>(loc, step, unrollFactorCst);
+  }
+
+  // Create epilogue clean-up loop starting at 'upperBoundUnrolled'.
+  if (generateEpilogueLoop) {
+    OpBuilder epilogueBuilder(forOp->getContext());
+    epilogueBuilder.setInsertionPoint(forOp->getBlock(),
+                                      std::next(Block::iterator(forOp)));
+    auto epilogueForOp = cast<scf::ForOp>(epilogueBuilder.clone(*forOp));
+    epilogueForOp.setLowerBound(upperBoundUnrolled);
+
+    // Update uses of loop results.
+    auto results = forOp.getResults();
+    auto epilogueResults = epilogueForOp.getResults();
+    auto epilogueIterOperands = epilogueForOp.getIterOperands();
+
+    for (auto e : llvm::zip(results, epilogueResults, epilogueIterOperands)) {
+      std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
+      epilogueForOp->replaceUsesOfWith(std::get<2>(e), std::get<0>(e));
+    }
+    (void)promoteIfSingleIteration(epilogueForOp);
+  }
+
+  // Create unrolled loop.
+  forOp.setUpperBound(upperBoundUnrolled);
+  forOp.setStep(stepUnrolled);
+
+  auto iterArgs = ValueRange(forOp.getRegionIterArgs());
+  auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();
+
+  generateUnrolledLoop(
+      forOp.getBody(), forOp.getInductionVar(), unrollFactor,
+      [&](unsigned i, Value iv, OpBuilder b) {
+        // iv' = iv + step * i;
+        auto stride = b.create<arith::MulIOp>(
+            loc, step, b.create<arith::ConstantIndexOp>(loc, i));
+        return b.create<arith::AddIOp>(loc, iv, stride);
+      },
+      annotateFn, iterArgs, yieldedValues);
+  // Promote the loop body up if this has turned into a single iteration loop.
+  (void)promoteIfSingleIteration(forOp);
+  return success();
+}
+
+/// Return the new lower bound, upper bound, and step in that order. Insert any
+/// additional bounds calculations at the insertion point of `boundsBuilder`,
+/// and any additional conversion back to the original loop induction value
+/// inside the given Block.
+static LoopParams normalizeLoop(OpBuilder &boundsBuilder,
+                                OpBuilder &insideLoopBuilder, Location loc,
+                                Value lowerBound, Value upperBound, Value step,
+                                Value inductionVar) {
+  // Check if the loop is already known to have a constant zero lower bound or
+  // a constant one step.
+  bool isZeroBased = false;
+  if (auto lbCst = lowerBound.getDefiningOp<arith::ConstantIndexOp>())
+    isZeroBased = lbCst.value() == 0;
+
+  bool isStepOne = false;
+  if (auto stepCst = step.getDefiningOp<arith::ConstantIndexOp>())
+    isStepOne = stepCst.value() == 1;
+
+  // Compute the number of iterations the loop executes: ceildiv(ub - lb, step)
+  // assuming the step is strictly positive. Update the bounds and the step
+  // of the loop to go from 0 to the number of iterations, if necessary.
+  // TODO: introduce support for negative steps or emit dynamic asserts
+  // on step positivity, whatever gets implemented first.
+  if (isZeroBased && isStepOne)
+    return {/*lowerBound=*/lowerBound, /*upperBound=*/upperBound,
+            /*step=*/step};
+
+  Value diff = boundsBuilder.create<arith::SubIOp>(loc, upperBound, lowerBound);
+  Value newUpperBound = ceilDivPositive(boundsBuilder, loc, diff, step);
+
+  Value newLowerBound =
+      isZeroBased ? lowerBound
+                  : boundsBuilder.create<arith::ConstantIndexOp>(loc, 0);
+  Value newStep =
+      isStepOne ? step : boundsBuilder.create<arith::ConstantIndexOp>(loc, 1);
+
+  // Insert code computing the value of the original loop induction variable
+  // from the "normalized" one.
+  Value scaled =
+      isStepOne
+          ? inductionVar
+          : insideLoopBuilder.create<arith::MulIOp>(loc, inductionVar, step);
+  Value shifted =
+      isZeroBased
+          ? scaled
+          : insideLoopBuilder.create<arith::AddIOp>(loc, scaled, lowerBound);
+
+  SmallPtrSet<Operation *, 2> preserve{scaled.getDefiningOp(),
+                                       shifted.getDefiningOp()};
+  inductionVar.replaceAllUsesExcept(shifted, preserve);
+  return {/*lowerBound=*/newLowerBound, /*upperBound=*/newUpperBound,
+          /*step=*/newStep};
+}
+
+/// Transform a loop with a strictly positive step
+///   for %i = %lb to %ub step %s
+/// into a 0-based loop with step 1
+///   for %ii = 0 to ceildiv(%ub - %lb, %s) step 1 {
+///     %i = %ii * %s + %lb
+/// Insert the induction variable remapping in the body of `inner`, which is
+/// expected to be either `loop` or another loop perfectly nested under `loop`.
+/// Insert the definition of new bounds immediately before `outer`, which is
+/// expected to be either `loop` or its parent in the loop nest.
+static void normalizeLoop(scf::ForOp loop, scf::ForOp outer, scf::ForOp inner) {
+  OpBuilder builder(outer);
+  OpBuilder innerBuilder = OpBuilder::atBlockBegin(inner.getBody());
+  auto loopPieces = normalizeLoop(builder, innerBuilder, loop.getLoc(),
+                                  loop.getLowerBound(), loop.getUpperBound(),
+                                  loop.getStep(), loop.getInductionVar());
+
+  loop.setLowerBound(loopPieces.lowerBound);
+  loop.setUpperBound(loopPieces.upperBound);
+  loop.setStep(loopPieces.step);
+}
+
+void mlir::coalesceLoops(MutableArrayRef<scf::ForOp> loops) {
+  if (loops.size() < 2)
+    return;
+
+  scf::ForOp innermost = loops.back();
+  scf::ForOp outermost = loops.front();
+
+  // 1. Make sure all loops iterate from 0 to upperBound with step 1. This
+  // allows the following code to assume upperBound is the number of
+  // iterations.
+  for (auto loop : loops)
+    normalizeLoop(loop, outermost, innermost);
+
+  // 2. Emit code computing the upper bound of the coalesced loop as product
+  // of the number of iterations of all loops.
+  OpBuilder builder(outermost);
+  Location loc = outermost.getLoc();
+  Value upperBound = outermost.getUpperBound();
+  for (auto loop : loops.drop_front())
+    upperBound =
+        builder.create<arith::MulIOp>(loc, upperBound, loop.getUpperBound());
+  outermost.setUpperBound(upperBound);
+
+  builder.setInsertionPointToStart(outermost.getBody());
+
+  // 3. Remap induction variables. For each original loop, the value of the
+  // induction variable can be obtained by dividing the induction variable of
+  // the linearized loop by the total number of iterations of the loops nested
+  // in it modulo the number of iterations in this loop (remove the values
+  // related to the outer loops):
+  //   iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
+  // Compute these iteratively from the innermost loop by creating a "running
+  // quotient" of division by the range.
+  Value previous = outermost.getInductionVar();
+  for (unsigned i = 0, e = loops.size(); i < e; ++i) {
+    unsigned idx = loops.size() - i - 1;
+    if (i != 0)
+      previous = builder.create<arith::DivSIOp>(loc, previous,
+                                                loops[idx + 1].getUpperBound());
+
+    Value iv = (i == e - 1) ? previous
+                            : builder.create<arith::RemSIOp>(
+                                  loc, previous, loops[idx].getUpperBound());
+    replaceAllUsesInRegionWith(loops[idx].getInductionVar(), iv,
+                               loops.back().getRegion());
+  }
+
+  // 4. Move the operations from the innermost just above the second-outermost
+  // loop, delete the extra terminator and the second-outermost loop.
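+  // Net effect (a sketch): a 2-d nest with normalized ranges N and M becomes
+  // a single loop of N * M iterations whose induction variable %iv is mapped
+  // back to the original induction variables as %iv floordiv M and %iv mod M.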
+  scf::ForOp second = loops[1];
+  innermost.getBody()->back().erase();
+  outermost.getBody()->getOperations().splice(
+      Block::iterator(second.getOperation()),
+      innermost.getBody()->getOperations());
+  second.erase();
+}
+
+void mlir::collapseParallelLoops(
+    scf::ParallelOp loops, ArrayRef<std::vector<unsigned>> combinedDimensions) {
+  OpBuilder outsideBuilder(loops);
+  Location loc = loops.getLoc();
+
+  // Presort combined dimensions.
+  auto sortedDimensions = llvm::to_vector<3>(combinedDimensions);
+  for (auto &dims : sortedDimensions)
+    std::sort(dims.begin(), dims.end());
+
+  // Normalize ParallelOp's iteration pattern.
+  SmallVector<Value, 3> normalizedLowerBounds, normalizedSteps,
+      normalizedUpperBounds;
+  for (unsigned i = 0, e = loops.getNumLoops(); i < e; ++i) {
+    OpBuilder insideLoopBuilder = OpBuilder::atBlockBegin(loops.getBody());
+    auto resultBounds =
+        normalizeLoop(outsideBuilder, insideLoopBuilder, loc,
+                      loops.getLowerBound()[i], loops.getUpperBound()[i],
+                      loops.getStep()[i], loops.getBody()->getArgument(i));
+
+    normalizedLowerBounds.push_back(resultBounds.lowerBound);
+    normalizedUpperBounds.push_back(resultBounds.upperBound);
+    normalizedSteps.push_back(resultBounds.step);
+  }
+
+  // Combine iteration spaces.
+  SmallVector<Value, 3> lowerBounds, upperBounds, steps;
+  auto cst0 = outsideBuilder.create<arith::ConstantIndexOp>(loc, 0);
+  auto cst1 = outsideBuilder.create<arith::ConstantIndexOp>(loc, 1);
+  for (unsigned i = 0, e = sortedDimensions.size(); i < e; ++i) {
+    Value newUpperBound = outsideBuilder.create<arith::ConstantIndexOp>(loc, 1);
+    for (auto idx : sortedDimensions[i]) {
+      newUpperBound = outsideBuilder.create<arith::MulIOp>(
+          loc, newUpperBound, normalizedUpperBounds[idx]);
+    }
+    lowerBounds.push_back(cst0);
+    steps.push_back(cst1);
+    upperBounds.push_back(newUpperBound);
+  }
+
+  // Create new ParallelLoop with conversions to the original induction values.
+  // The loop below uses divisions to recover, from each new induction value,
+  // the range of values that encodes each original induction value; the
+  // remainder within that range then identifies the iteration of the original
+  // induction value. The recovered value is still normalized and is
+  // un-normalized by the conversion code inserted when the loops were
+  // normalized above.
+  auto newPloop = outsideBuilder.create<scf::ParallelOp>(
+      loc, lowerBounds, upperBounds, steps,
+      [&](OpBuilder &insideBuilder, Location, ValueRange ploopIVs) {
+        for (unsigned i = 0, e = combinedDimensions.size(); i < e; ++i) {
+          Value previous = ploopIVs[i];
+          unsigned numberCombinedDimensions = combinedDimensions[i].size();
+          // Iterate over all except the last induction value.
+          for (unsigned j = numberCombinedDimensions - 1; j > 0; --j) {
+            unsigned idx = combinedDimensions[i][j];
+
+            // Determine the current induction value's iteration within its
+            // original loop's range.
+            Value iv = insideBuilder.create<arith::RemSIOp>(
+                loc, previous, normalizedUpperBounds[idx]);
+            replaceAllUsesInRegionWith(loops.getBody()->getArgument(idx), iv,
+                                       loops.getRegion());
+
+            // Remove the effect of the current induction value to prepare for
+            // the next value.
+            previous = insideBuilder.create<arith::DivSIOp>(
+                loc, previous, normalizedUpperBounds[idx]);
+          }
+
+          // The final induction value is just the remaining value.
+          unsigned idx = combinedDimensions[i][0];
+          replaceAllUsesInRegionWith(loops.getBody()->getArgument(idx),
+                                     previous, loops.getRegion());
+        }
+      });
+
+  // Replace the old loop with the new loop.
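+  // For example (a sketch): collapsing dimensions {0, 1} of a (4, 8) parallel
+  // loop yields a single dimension of 32 iterations, recovered in the body as
+  // %iv floordiv 8 and %iv mod 8 by the divisions emitted above.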
+  loops.getBody()->back().erase();
+  newPloop.getBody()->getOperations().splice(
+      Block::iterator(newPloop.getBody()->back()),
+      loops.getBody()->getOperations());
+  loops.erase();
+}
+
+// Hoist the ops within `outer` that appear before `inner`.
+// Such ops include the ops that have been introduced by parametric tiling.
+// Ops that come from triangular loops (i.e. that belong to the program slice
+// rooted at `outer`) and ops that have side effects cannot be hoisted.
+// Returns failure when any op fails to hoist.
+static LogicalResult hoistOpsBetween(scf::ForOp outer, scf::ForOp inner) {
+  SetVector<Operation *> forwardSlice;
+  getForwardSlice(
+      outer.getInductionVar(), &forwardSlice,
+      [&inner](Operation *op) { return op != inner.getOperation(); });
+  LogicalResult status = success();
+  SmallVector<Operation *, 8> toHoist;
+  for (auto &op : outer.getBody()->without_terminator()) {
+    // Stop when encountering the inner loop.
+    if (&op == inner.getOperation())
+      break;
+    // Skip over non-hoistable ops.
+    if (forwardSlice.count(&op) > 0) {
+      status = failure();
+      continue;
+    }
+    // Skip intermediate scf::ForOps; these are not considered a failure.
+    if (isa<scf::ForOp>(op))
+      continue;
+    // Skip other ops with regions.
+    if (op.getNumRegions() > 0) {
+      status = failure();
+      continue;
+    }
+    // Skip if op has side effects.
+    // TODO: loads to immutable memory regions are ok.
+    if (!MemoryEffectOpInterface::hasNoEffect(&op)) {
+      status = failure();
+      continue;
+    }
+    toHoist.push_back(&op);
+  }
+  auto *outerForOp = outer.getOperation();
+  for (auto *op : toHoist)
+    op->moveBefore(outerForOp);
+  return status;
+}
+
+// Traverse the interTile and intraTile loops and try to hoist ops such that
+// bands of perfectly nested loops are isolated.
+// Returns failure if either perfect interTile or perfect intraTile bands
+// cannot be formed.
+static LogicalResult tryIsolateBands(const TileLoops &tileLoops) {
+  LogicalResult status = success();
+  const Loops &interTile = tileLoops.first;
+  const Loops &intraTile = tileLoops.second;
+  auto size = interTile.size();
+  assert(size == intraTile.size());
+  if (size <= 1)
+    return success();
+  for (unsigned s = 1; s < size; ++s)
+    status = succeeded(status) ? hoistOpsBetween(intraTile[0], intraTile[s])
+                               : failure();
+  for (unsigned s = 1; s < size; ++s)
+    status = succeeded(status) ? hoistOpsBetween(interTile[0], interTile[s])
+                               : failure();
+  return status;
+}
+
+/// Collect perfectly nested loops starting from `rootForOp`. Loops are
+/// perfectly nested if each loop is the first and only non-terminator
+/// operation in the parent loop. Collect at most `maxLoops` loops and append
+/// them to `forOps`.
+template <typename T>
+static void getPerfectlyNestedLoopsImpl(
+    SmallVectorImpl<T> &forOps, T rootForOp,
+    unsigned maxLoops = std::numeric_limits<unsigned>::max()) {
+  for (unsigned i = 0; i < maxLoops; ++i) {
+    forOps.push_back(rootForOp);
+    Block &body = rootForOp.getRegion().front();
+    if (body.begin() != std::prev(body.end(), 2))
+      return;
+
+    rootForOp = dyn_cast<T>(&body.front());
+    if (!rootForOp)
+      return;
+  }
+}
+
+static Loops stripmineSink(scf::ForOp forOp, Value factor,
+                           ArrayRef<scf::ForOp> targets) {
+  auto originalStep = forOp.getStep();
+  auto iv = forOp.getInductionVar();
+
+  OpBuilder b(forOp);
+  forOp.setStep(b.create<arith::MulIOp>(forOp.getLoc(), originalStep, factor));
+
+  Loops innerLoops;
+  for (auto t : targets) {
+    // Save information for splicing ops out of t when done.
+    auto begin = t.getBody()->begin();
+    auto nOps = t.getBody()->getOperations().size();
+
+    // Insert newForOp before the terminator of `t`.
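+    // The new inner loop runs from the current outer induction value to
+    // min(outer upper bound, iv + scaled outer step) with the original step,
+    // i.e. it covers exactly one tile (a sketch of the intent of the ops
+    // below).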
+    auto b = OpBuilder::atBlockTerminator(t.getBody());
+    Value stepped = b.create<arith::AddIOp>(t.getLoc(), iv, forOp.getStep());
+    Value less = b.create<arith::CmpIOp>(t.getLoc(), arith::CmpIPredicate::slt,
+                                         forOp.getUpperBound(), stepped);
+    Value ub = b.create<SelectOp>(t.getLoc(), less, forOp.getUpperBound(),
+                                  stepped);
+
+    // Splice [begin, begin + nOps - 1) into `newForOp` and replace uses.
+    auto newForOp = b.create<scf::ForOp>(t.getLoc(), iv, ub, originalStep);
+    newForOp.getBody()->getOperations().splice(
+        newForOp.getBody()->getOperations().begin(),
+        t.getBody()->getOperations(), begin, std::next(begin, nOps - 1));
+    replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
+                               newForOp.getRegion());
+
+    innerLoops.push_back(newForOp);
+  }
+
+  return innerLoops;
+}
+
+// Stripmines a `forOp` by `factor` and sinks it under a single `target`.
+// Returns the new for operation, nested immediately under `target`.
+template <typename SizeType>
+static scf::ForOp stripmineSink(scf::ForOp forOp, SizeType factor,
+                                scf::ForOp target) {
+  // TODO: Use cheap structural assertions that targets are nested under
+  // forOp and that targets are not nested under each other when DominanceInfo
+  // exposes the capability. It seems overkill to construct a whole function
+  // dominance tree at this point.
+  auto res = stripmineSink(forOp, factor, ArrayRef<scf::ForOp>(target));
+  assert(res.size() == 1 && "Expected 1 inner forOp");
+  return res[0];
+}
+
+SmallVector<Loops, 8> mlir::tile(ArrayRef<scf::ForOp> forOps,
+                                 ArrayRef<Value> sizes,
+                                 ArrayRef<scf::ForOp> targets) {
+  SmallVector<SmallVector<scf::ForOp, 8>, 8> res;
+  SmallVector<scf::ForOp, 8> currentTargets(targets.begin(), targets.end());
+  for (auto it : llvm::zip(forOps, sizes)) {
+    auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
+    res.push_back(step);
+    currentTargets = step;
+  }
+  return res;
+}
+
+Loops mlir::tile(ArrayRef<scf::ForOp> forOps, ArrayRef<Value> sizes,
+                 scf::ForOp target) {
+  SmallVector<scf::ForOp, 8> res;
+  for (auto loops : tile(forOps, sizes, ArrayRef<scf::ForOp>(target))) {
+    assert(loops.size() == 1);
+    res.push_back(loops[0]);
+  }
+  return res;
+}
+
+Loops mlir::tilePerfectlyNested(scf::ForOp rootForOp, ArrayRef<Value> sizes) {
+  // Collect perfectly nested loops. If more size values provided than nested
+  // loops available, truncate `sizes`.
+  SmallVector<scf::ForOp, 4> forOps;
+  forOps.reserve(sizes.size());
+  getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size());
+  if (forOps.size() < sizes.size())
+    sizes = sizes.take_front(forOps.size());
+
+  return ::tile(forOps, sizes, forOps.back());
+}
+
+void mlir::getPerfectlyNestedLoops(SmallVectorImpl<scf::ForOp> &nestedLoops,
+                                   scf::ForOp root) {
+  getPerfectlyNestedLoopsImpl(nestedLoops, root);
+}
+
+TileLoops mlir::extractFixedOuterLoops(scf::ForOp rootForOp,
+                                       ArrayRef<int64_t> sizes) {
+  // Collect perfectly nested loops. If more size values provided than nested
+  // loops available, truncate `sizes`.
+  SmallVector<scf::ForOp, 4> forOps;
+  forOps.reserve(sizes.size());
+  getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size());
+  if (forOps.size() < sizes.size())
+    sizes = sizes.take_front(forOps.size());
+
+  // Compute the tile sizes such that the i-th outer loop executes size[i]
+  // iterations. Given that the loop currently executes
+  //   numIterations = ceildiv((upperBound - lowerBound), step)
+  // iterations, we need to tile with size ceildiv(numIterations, size[i]).
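+  // For example (a sketch): numIterations = 100 and sizes[i] = 4 give
+  // iterationsPerBlock = ceildiv(100, 4) = 25, so the i-th outer loop ends up
+  // executing exactly 4 iterations.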
+  SmallVector<Value, 4> tileSizes;
+  tileSizes.reserve(sizes.size());
+  for (unsigned i = 0, e = sizes.size(); i < e; ++i) {
+    assert(sizes[i] > 0 && "expected strictly positive size for strip-mining");
+
+    auto forOp = forOps[i];
+    OpBuilder builder(forOp);
+    auto loc = forOp.getLoc();
+    Value diff = builder.create<arith::SubIOp>(loc, forOp.getUpperBound(),
+                                               forOp.getLowerBound());
+    Value numIterations = ceilDivPositive(builder, loc, diff, forOp.getStep());
+    Value iterationsPerBlock =
+        ceilDivPositive(builder, loc, numIterations, sizes[i]);
+    tileSizes.push_back(iterationsPerBlock);
+  }
+
+  // Call parametric tiling with the given sizes.
+  auto intraTile = tile(forOps, tileSizes, forOps.back());
+  TileLoops tileLoops = std::make_pair(forOps, intraTile);
+
+  // TODO: for now we just ignore the result of band isolation.
+  // In the future, mapping decisions may be impacted by the ability to
+  // isolate perfectly nested bands.
+  (void)tryIsolateBands(tileLoops);
+
+  return tileLoops;
+}
diff --git a/mlir/lib/Dialect/Vector/VectorTransferSplitRewritePatterns.cpp b/mlir/lib/Dialect/Vector/VectorTransferSplitRewritePatterns.cpp
--- a/mlir/lib/Dialect/Vector/VectorTransferSplitRewritePatterns.cpp
+++ b/mlir/lib/Dialect/Vector/VectorTransferSplitRewritePatterns.cpp
@@ -14,7 +14,6 @@
 #include <type_traits>
 
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
-#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
diff --git a/mlir/lib/Dialect/Vector/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/VectorTransforms.cpp
--- a/mlir/lib/Dialect/Vector/VectorTransforms.cpp
+++ b/mlir/lib/Dialect/Vector/VectorTransforms.cpp
@@ -13,7 +13,6 @@
 #include <type_traits>
 
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
-#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
diff --git a/mlir/lib/Dialect/Vector/VectorUnrollDistribute.cpp b/mlir/lib/Dialect/Vector/VectorUnrollDistribute.cpp
--- a/mlir/lib/Dialect/Vector/VectorUnrollDistribute.cpp
+++ b/mlir/lib/Dialect/Vector/VectorUnrollDistribute.cpp
@@ -11,7 +11,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
-#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Vector/VectorTransforms.h"
 #include "mlir/IR/ImplicitLocOpBuilder.h"
 #include "mlir/Interfaces/VectorInterfaces.h"
diff --git a/mlir/lib/Interfaces/LoopLikeInterface.cpp b/mlir/lib/Interfaces/LoopLikeInterface.cpp
--- a/mlir/lib/Interfaces/LoopLikeInterface.cpp
+++ b/mlir/lib/Interfaces/LoopLikeInterface.cpp
@@ -7,12 +7,95 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Interfaces/LoopLikeInterface.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Support/Debug.h"
 
 using namespace mlir;
 
+#define DEBUG_TYPE "loop-like"
+
 //===----------------------------------------------------------------------===//
 // LoopLike Interfaces
 //===----------------------------------------------------------------------===//
 
 /// Include the definitions of the loop-like interfaces.
#include "mlir/Interfaces/LoopLikeInterface.cpp.inc" + +//===----------------------------------------------------------------------===// +// LoopLike Utilities +//===----------------------------------------------------------------------===// + +// Checks whether the given op can be hoisted by checking that +// - the op and any of its contained operations do not depend on SSA values +// defined inside of the loop (by means of calling definedOutside). +// - the op has no side-effects. If sideEffecting is Never, sideeffects of this +// op and its nested ops are ignored. +static bool canBeHoisted(Operation *op, + function_ref definedOutside) { + // Check that dependencies are defined outside of loop. + if (!llvm::all_of(op->getOperands(), definedOutside)) + return false; + // Check whether this op is side-effect free. If we already know that there + // can be no side-effects because the surrounding op has claimed so, we can + // (and have to) skip this step. + if (auto memInterface = dyn_cast(op)) { + if (!memInterface.hasNoEffect()) + return false; + // If the operation doesn't have side effects and it doesn't recursively + // have side effects, it can always be hoisted. + if (!op->hasTrait()) + return true; + + // Otherwise, if the operation doesn't provide the memory effect interface + // and it doesn't have recursive side effects we treat it conservatively as + // side-effecting. + } else if (!op->hasTrait()) { + return false; + } + + // Recurse into the regions for this op and check whether the contained ops + // can be hoisted. + for (auto ®ion : op->getRegions()) { + for (auto &block : region) { + for (auto &innerOp : block) + if (!canBeHoisted(&innerOp, definedOutside)) + return false; + } + } + return true; +} + +LogicalResult mlir::moveLoopInvariantCode(LoopLikeOpInterface looplike) { + auto &loopBody = looplike.getLoopBody(); + + // We use two collections here as we need to preserve the order for insertion + // and this is easiest. + SmallPtrSet willBeMovedSet; + SmallVector opsToMove; + + // Helper to check whether an operation is loop invariant wrt. SSA properties. + auto isDefinedOutsideOfBody = [&](Value value) { + auto *definingOp = value.getDefiningOp(); + return (definingOp && !!willBeMovedSet.count(definingOp)) || + looplike.isDefinedOutsideOfLoop(value); + }; + + // Do not use walk here, as we do not want to go into nested regions and hoist + // operations from there. These regions might have semantics unknown to this + // rewriting. If the nested regions are loops, they will have been processed. + for (auto &block : loopBody) { + for (auto &op : block.without_terminator()) { + if (canBeHoisted(&op, isDefinedOutsideOfBody)) { + opsToMove.push_back(&op); + willBeMovedSet.insert(&op); + } + } + } + + // For all instructions that we found to be invariant, move outside of the + // loop. 
+  LogicalResult result = looplike.moveOutOfLoop(opsToMove);
+  LLVM_DEBUG(looplike.print(llvm::dbgs() << "\n\nModified loop:\n"));
+  return result;
+}
diff --git a/mlir/lib/Transforms/CMakeLists.txt b/mlir/lib/Transforms/CMakeLists.txt
--- a/mlir/lib/Transforms/CMakeLists.txt
+++ b/mlir/lib/Transforms/CMakeLists.txt
@@ -6,12 +6,8 @@
   CSE.cpp
   Inliner.cpp
   LocationSnapshot.cpp
-  LoopCoalescing.cpp
-  LoopFusion.cpp
   LoopInvariantCodeMotion.cpp
   OpStats.cpp
-  ParallelLoopCollapsing.cpp
-  PipelineDataTransfer.cpp
   SCCP.cpp
   StripDebugInfo.cpp
   SymbolDCE.cpp
@@ -21,18 +17,13 @@
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Transforms
 
   DEPENDS
-  MLIRStandardOpsIncGen
   MLIRTransformsPassIncGen
 
   LINK_LIBS PUBLIC
-  MLIRAffine
   MLIRAnalysis
   MLIRCopyOpInterface
   MLIRLoopLikeInterface
-  MLIRMemRef
-  MLIRSCF
   MLIRPass
   MLIRSupport
   MLIRTransformUtils
-  MLIRVector
  )
diff --git a/mlir/lib/Transforms/CSE.cpp b/mlir/lib/Transforms/CSE.cpp
--- a/mlir/lib/Transforms/CSE.cpp
+++ b/mlir/lib/Transforms/CSE.cpp
@@ -15,7 +15,6 @@
 #include "mlir/IR/Dominance.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/Passes.h"
-#include "mlir/Transforms/Utils.h"
 #include "llvm/ADT/DenseMapInfo.h"
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/ScopedHashTable.h"
diff --git a/mlir/lib/Transforms/ControlFlowSink.cpp b/mlir/lib/Transforms/ControlFlowSink.cpp
--- a/mlir/lib/Transforms/ControlFlowSink.cpp
+++ b/mlir/lib/Transforms/ControlFlowSink.cpp
@@ -16,8 +16,8 @@
 #include "PassDetail.h"
 #include "mlir/IR/Dominance.h"
 #include "mlir/Interfaces/ControlFlowInterfaces.h"
+#include "mlir/Transforms/ControlFlowSinkUtils.h"
 #include "mlir/Transforms/Passes.h"
-#include "mlir/Transforms/Utils.h"
 
 using namespace mlir;
 
diff --git a/mlir/lib/Transforms/LoopInvariantCodeMotion.cpp b/mlir/lib/Transforms/LoopInvariantCodeMotion.cpp
--- a/mlir/lib/Transforms/LoopInvariantCodeMotion.cpp
+++ b/mlir/lib/Transforms/LoopInvariantCodeMotion.cpp
@@ -11,13 +11,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "PassDetail.h"
-#include "mlir/Transforms/Passes.h"
-
 #include "mlir/IR/Builders.h"
-#include "mlir/IR/BuiltinOps.h"
 #include "mlir/Interfaces/LoopLikeInterface.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
-#include "mlir/Transforms/LoopUtils.h"
+#include "mlir/Transforms/Passes.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -34,80 +31,6 @@
 };
 } // namespace
 
-// Checks whether the given op can be hoisted by checking that
-// - the op and any of its contained operations do not depend on SSA values
-//   defined inside of the loop (by means of calling definedOutside).
-// - the op has no side-effects. If sideEffecting is Never, sideeffects of this
-//   op and its nested ops are ignored.
-static bool canBeHoisted(Operation *op,
-                         function_ref<bool(Value)> definedOutside) {
-  // Check that dependencies are defined outside of loop.
-  if (!llvm::all_of(op->getOperands(), definedOutside))
-    return false;
-  // Check whether this op is side-effect free. If we already know that there
-  // can be no side-effects because the surrounding op has claimed so, we can
-  // (and have to) skip this step.
-  if (auto memInterface = dyn_cast<MemoryEffectOpInterface>(op)) {
-    if (!memInterface.hasNoEffect())
-      return false;
-    // If the operation doesn't have side effects and it doesn't recursively
-    // have side effects, it can always be hoisted.
-    if (!op->hasTrait<OpTrait::HasRecursiveSideEffects>())
-      return true;
-
-    // Otherwise, if the operation doesn't provide the memory effect interface
-    // and it doesn't have recursive side effects we treat it conservatively as
-    // side-effecting.
-  } else if (!op->hasTrait<OpTrait::HasRecursiveSideEffects>()) {
-    return false;
-  }
-
-  // Recurse into the regions for this op and check whether the contained ops
-  // can be hoisted.
-  for (auto &region : op->getRegions()) {
-    for (auto &block : region) {
-      for (auto &innerOp : block)
-        if (!canBeHoisted(&innerOp, definedOutside))
-          return false;
-    }
-  }
-  return true;
-}
-
-LogicalResult mlir::moveLoopInvariantCode(LoopLikeOpInterface looplike) {
-  auto &loopBody = looplike.getLoopBody();
-
-  // We use two collections here as we need to preserve the order for insertion
-  // and this is easiest.
-  SmallPtrSet<Operation *, 8> willBeMovedSet;
-  SmallVector<Operation *, 8> opsToMove;
-
-  // Helper to check whether an operation is loop invariant wrt. SSA properties.
-  auto isDefinedOutsideOfBody = [&](Value value) {
-    auto *definingOp = value.getDefiningOp();
-    return (definingOp && !!willBeMovedSet.count(definingOp)) ||
-           looplike.isDefinedOutsideOfLoop(value);
-  };
-
-  // Do not use walk here, as we do not want to go into nested regions and hoist
-  // operations from there. These regions might have semantics unknown to this
-  // rewriting. If the nested regions are loops, they will have been processed.
-  for (auto &block : loopBody) {
-    for (auto &op : block.without_terminator()) {
-      if (canBeHoisted(&op, isDefinedOutsideOfBody)) {
-        opsToMove.push_back(&op);
-        willBeMovedSet.insert(&op);
-      }
-    }
-  }
-
-  // For all instructions that we found to be invariant, move outside of the
-  // loop.
-  auto result = looplike.moveOutOfLoop(opsToMove);
-  LLVM_DEBUG(looplike.print(llvm::dbgs() << "\n\nModified loop:\n"));
-  return result;
-}
-
 void LoopInvariantCodeMotion::runOnOperation() {
   // Walk through all loops in a function in innermost-loop-first order.
This // way, we first LICM from the inner loop, and place the ops in diff --git a/mlir/lib/Transforms/PassDetail.h b/mlir/lib/Transforms/PassDetail.h --- a/mlir/lib/Transforms/PassDetail.h +++ b/mlir/lib/Transforms/PassDetail.h @@ -13,23 +13,8 @@ #include "mlir/Transforms/Passes.h" namespace mlir { -class AffineDialect; - -// Forward declaration from Dialect.h -template -void registerDialect(DialectRegistry ®istry); - -namespace arith { -class ArithmeticDialect; -} // namespace arith - -namespace memref { -class MemRefDialect; -} // namespace memref - #define GEN_PASS_CLASSES #include "mlir/Transforms/Passes.h.inc" - } // namespace mlir #endif // TRANSFORMS_PASSDETAIL_H_ diff --git a/mlir/lib/Transforms/Utils/CMakeLists.txt b/mlir/lib/Transforms/Utils/CMakeLists.txt --- a/mlir/lib/Transforms/Utils/CMakeLists.txt +++ b/mlir/lib/Transforms/Utils/CMakeLists.txt @@ -4,25 +4,12 @@ FoldUtils.cpp GreedyPatternRewriteDriver.cpp InliningUtils.cpp - LoopFusionUtils.cpp - LoopUtils.cpp RegionUtils.cpp - Utils.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Transforms - DEPENDS - MLIRStandardOpsIncGen - LINK_LIBS PUBLIC - MLIRAffine - MLIRArithmetic MLIRAnalysis - MLIRAffineAnalysis - MLIRMemRef - MLIRSCF - MLIRPass MLIRRewrite - MLIRStandard ) diff --git a/mlir/lib/Transforms/Utils/ControlFlowSinkUtils.cpp b/mlir/lib/Transforms/Utils/ControlFlowSinkUtils.cpp --- a/mlir/lib/Transforms/Utils/ControlFlowSinkUtils.cpp +++ b/mlir/lib/Transforms/Utils/ControlFlowSinkUtils.cpp @@ -18,10 +18,10 @@ // //===----------------------------------------------------------------------===// +#include "mlir/Transforms/ControlFlowSinkUtils.h" #include "mlir/IR/Dominance.h" #include "mlir/IR/Matchers.h" #include "mlir/Interfaces/ControlFlowInterfaces.h" -#include "mlir/Transforms/Utils.h" #include #define DEBUG_TYPE "cf-sink" diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -13,7 +13,6 @@ #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/FunctionInterfaces.h" #include "mlir/Rewrite/PatternApplicator.h" -#include "mlir/Transforms/Utils.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" diff --git a/mlir/lib/Transforms/Utils/Utils.cpp b/mlir/lib/Transforms/Utils/Utils.cpp deleted file mode 100644 --- a/mlir/lib/Transforms/Utils/Utils.cpp +++ /dev/null @@ -1,767 +0,0 @@ -//===- Utils.cpp ---- Misc utilities for code and data transformation -----===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements miscellaneous transformation routines for non-loop IR -// structures. 
-// -//===----------------------------------------------------------------------===// - -#include "mlir/Transforms/Utils.h" -#include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h" -#include "mlir/Dialect/Affine/Analysis/AffineStructures.h" -#include "mlir/Dialect/Affine/Analysis/Utils.h" -#include "mlir/Dialect/Affine/IR/AffineOps.h" -#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/BuiltinOps.h" -#include "mlir/IR/Dominance.h" -#include "mlir/Support/MathExtras.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/TypeSwitch.h" - -#define DEBUG_TYPE "transforms-utils" - -using namespace mlir; - -// Perform the replacement in `op`. -LogicalResult mlir::replaceAllMemRefUsesWith(Value oldMemRef, Value newMemRef, - Operation *op, - ArrayRef extraIndices, - AffineMap indexRemap, - ArrayRef extraOperands, - ArrayRef symbolOperands, - bool allowNonDereferencingOps) { - unsigned newMemRefRank = newMemRef.getType().cast().getRank(); - (void)newMemRefRank; // unused in opt mode - unsigned oldMemRefRank = oldMemRef.getType().cast().getRank(); - (void)oldMemRefRank; // unused in opt mode - if (indexRemap) { - assert(indexRemap.getNumSymbols() == symbolOperands.size() && - "symbolic operand count mismatch"); - assert(indexRemap.getNumInputs() == - extraOperands.size() + oldMemRefRank + symbolOperands.size()); - assert(indexRemap.getNumResults() + extraIndices.size() == newMemRefRank); - } else { - assert(oldMemRefRank + extraIndices.size() == newMemRefRank); - } - - // Assert same elemental type. - assert(oldMemRef.getType().cast().getElementType() == - newMemRef.getType().cast().getElementType()); - - SmallVector usePositions; - for (const auto &opEntry : llvm::enumerate(op->getOperands())) { - if (opEntry.value() == oldMemRef) - usePositions.push_back(opEntry.index()); - } - - // If memref doesn't appear, nothing to do. - if (usePositions.empty()) - return success(); - - if (usePositions.size() > 1) { - // TODO: extend it for this case when needed (rare). - assert(false && "multiple dereferencing uses in a single op not supported"); - return failure(); - } - - unsigned memRefOperandPos = usePositions.front(); - - OpBuilder builder(op); - // The following checks if op is dereferencing memref and performs the access - // index rewrites. - auto affMapAccInterface = dyn_cast(op); - if (!affMapAccInterface) { - if (!allowNonDereferencingOps) { - // Failure: memref used in a non-dereferencing context (potentially - // escapes); no replacement in these cases unless allowNonDereferencingOps - // is set. - return failure(); - } - op->setOperand(memRefOperandPos, newMemRef); - return success(); - } - // Perform index rewrites for the dereferencing op and then replace the op - NamedAttribute oldMapAttrPair = - affMapAccInterface.getAffineMapAttrForMemRef(oldMemRef); - AffineMap oldMap = oldMapAttrPair.getValue().cast().getValue(); - unsigned oldMapNumInputs = oldMap.getNumInputs(); - SmallVector oldMapOperands( - op->operand_begin() + memRefOperandPos + 1, - op->operand_begin() + memRefOperandPos + 1 + oldMapNumInputs); - - // Apply 'oldMemRefOperands = oldMap(oldMapOperands)'. 
- SmallVector oldMemRefOperands; - SmallVector affineApplyOps; - oldMemRefOperands.reserve(oldMemRefRank); - if (oldMap != builder.getMultiDimIdentityMap(oldMap.getNumDims())) { - for (auto resultExpr : oldMap.getResults()) { - auto singleResMap = AffineMap::get(oldMap.getNumDims(), - oldMap.getNumSymbols(), resultExpr); - auto afOp = builder.create(op->getLoc(), singleResMap, - oldMapOperands); - oldMemRefOperands.push_back(afOp); - affineApplyOps.push_back(afOp); - } - } else { - oldMemRefOperands.assign(oldMapOperands.begin(), oldMapOperands.end()); - } - - // Construct new indices as a remap of the old ones if a remapping has been - // provided. The indices of a memref come right after it, i.e., - // at position memRefOperandPos + 1. - SmallVector remapOperands; - remapOperands.reserve(extraOperands.size() + oldMemRefRank + - symbolOperands.size()); - remapOperands.append(extraOperands.begin(), extraOperands.end()); - remapOperands.append(oldMemRefOperands.begin(), oldMemRefOperands.end()); - remapOperands.append(symbolOperands.begin(), symbolOperands.end()); - - SmallVector remapOutputs; - remapOutputs.reserve(oldMemRefRank); - - if (indexRemap && - indexRemap != builder.getMultiDimIdentityMap(indexRemap.getNumDims())) { - // Remapped indices. - for (auto resultExpr : indexRemap.getResults()) { - auto singleResMap = AffineMap::get( - indexRemap.getNumDims(), indexRemap.getNumSymbols(), resultExpr); - auto afOp = builder.create(op->getLoc(), singleResMap, - remapOperands); - remapOutputs.push_back(afOp); - affineApplyOps.push_back(afOp); - } - } else { - // No remapping specified. - remapOutputs.assign(remapOperands.begin(), remapOperands.end()); - } - - SmallVector newMapOperands; - newMapOperands.reserve(newMemRefRank); - - // Prepend 'extraIndices' in 'newMapOperands'. - for (Value extraIndex : extraIndices) { - assert(extraIndex.getDefiningOp()->getNumResults() == 1 && - "single result op's expected to generate these indices"); - assert((isValidDim(extraIndex) || isValidSymbol(extraIndex)) && - "invalid memory op index"); - newMapOperands.push_back(extraIndex); - } - - // Append 'remapOutputs' to 'newMapOperands'. - newMapOperands.append(remapOutputs.begin(), remapOutputs.end()); - - // Create new fully composed AffineMap for new op to be created. - assert(newMapOperands.size() == newMemRefRank); - auto newMap = builder.getMultiDimIdentityMap(newMemRefRank); - // TODO: Avoid creating/deleting temporary AffineApplyOps here. - fullyComposeAffineMapAndOperands(&newMap, &newMapOperands); - newMap = simplifyAffineMap(newMap); - canonicalizeMapAndOperands(&newMap, &newMapOperands); - // Remove any affine.apply's that became dead as a result of composition. - for (Value value : affineApplyOps) - if (value.use_empty()) - value.getDefiningOp()->erase(); - - OperationState state(op->getLoc(), op->getName()); - // Construct the new operation using this memref. - state.operands.reserve(op->getNumOperands() + extraIndices.size()); - // Insert the non-memref operands. - state.operands.append(op->operand_begin(), - op->operand_begin() + memRefOperandPos); - // Insert the new memref value. - state.operands.push_back(newMemRef); - - // Insert the new memref map operands. - state.operands.append(newMapOperands.begin(), newMapOperands.end()); - - // Insert the remaining operands unmodified. - state.operands.append(op->operand_begin() + memRefOperandPos + 1 + - oldMapNumInputs, - op->operand_end()); - - // Result types don't change. Both memref's are of the same elemental type. 
- state.types.reserve(op->getNumResults()); - for (auto result : op->getResults()) - state.types.push_back(result.getType()); - - // Add attribute for 'newMap', other Attributes do not change. - auto newMapAttr = AffineMapAttr::get(newMap); - for (auto namedAttr : op->getAttrs()) { - if (namedAttr.getName() == oldMapAttrPair.getName()) - state.attributes.push_back({namedAttr.getName(), newMapAttr}); - else - state.attributes.push_back(namedAttr); - } - - // Create the new operation. - auto *repOp = builder.createOperation(state); - op->replaceAllUsesWith(repOp); - op->erase(); - - return success(); -} - -LogicalResult mlir::replaceAllMemRefUsesWith( - Value oldMemRef, Value newMemRef, ArrayRef extraIndices, - AffineMap indexRemap, ArrayRef extraOperands, - ArrayRef symbolOperands, Operation *domOpFilter, - Operation *postDomOpFilter, bool allowNonDereferencingOps, - bool replaceInDeallocOp) { - unsigned newMemRefRank = newMemRef.getType().cast().getRank(); - (void)newMemRefRank; // unused in opt mode - unsigned oldMemRefRank = oldMemRef.getType().cast().getRank(); - (void)oldMemRefRank; - if (indexRemap) { - assert(indexRemap.getNumSymbols() == symbolOperands.size() && - "symbol operand count mismatch"); - assert(indexRemap.getNumInputs() == - extraOperands.size() + oldMemRefRank + symbolOperands.size()); - assert(indexRemap.getNumResults() + extraIndices.size() == newMemRefRank); - } else { - assert(oldMemRefRank + extraIndices.size() == newMemRefRank); - } - - // Assert same elemental type. - assert(oldMemRef.getType().cast().getElementType() == - newMemRef.getType().cast().getElementType()); - - std::unique_ptr domInfo; - std::unique_ptr postDomInfo; - if (domOpFilter) - domInfo = - std::make_unique(domOpFilter->getParentOfType()); - - if (postDomOpFilter) - postDomInfo = std::make_unique( - postDomOpFilter->getParentOfType()); - - // Walk all uses of old memref; collect ops to perform replacement. We use a - // DenseSet since an operation could potentially have multiple uses of a - // memref (although rare), and the replacement later is going to erase ops. - DenseSet opsToReplace; - for (auto *op : oldMemRef.getUsers()) { - // Skip this use if it's not dominated by domOpFilter. - if (domOpFilter && !domInfo->dominates(domOpFilter, op)) - continue; - - // Skip this use if it's not post-dominated by postDomOpFilter. - if (postDomOpFilter && !postDomInfo->postDominates(postDomOpFilter, op)) - continue; - - // Skip dealloc's - no replacement is necessary, and a memref replacement - // at other uses doesn't hurt these dealloc's. - if (isa(op) && !replaceInDeallocOp) - continue; - - // Check if the memref was used in a non-dereferencing context. It is fine - // for the memref to be used in a non-dereferencing way outside of the - // region where this replacement is happening. - if (!isa(*op)) { - if (!allowNonDereferencingOps) { - LLVM_DEBUG(llvm::dbgs() - << "Memref replacement failed: non-deferencing memref op: \n" - << *op << '\n'); - return failure(); - } - // Non-dereferencing ops with the MemRefsNormalizable trait are - // supported for replacement. - if (!op->hasTrait()) { - LLVM_DEBUG(llvm::dbgs() << "Memref replacement failed: use without a " - "memrefs normalizable trait: \n" - << *op << '\n'); - return failure(); - } - } - - // We'll first collect and then replace --- since replacement erases the op - // that has the use, and that op could be postDomFilter or domFilter itself! 
-
-/// Given an operation, inserts one or more single-result affine apply
-/// operations, the results of which are exclusively used by this operation.
-/// The operands of these newly created affine apply ops are guaranteed to be
-/// loop iterators or terminal symbols of a function.
-///
-/// Before
-///
-/// affine.for %i = 0 to #map(%N)
-///   %idx = affine.apply (d0) -> (d0 mod 2) (%i)
-///   "send"(%idx, %A, ...)
-///   "compute"(%idx)
-///
-/// After
-///
-/// affine.for %i = 0 to #map(%N)
-///   %idx = affine.apply (d0) -> (d0 mod 2) (%i)
-///   "send"(%idx, %A, ...)
-///   %idx_ = affine.apply (d0) -> (d0 mod 2) (%i)
-///   "compute"(%idx_)
-///
-/// This allows applying different transformations on send and compute (e.g.,
-/// different shifts/delays).
-///
-/// Leaves `sliceOps` empty either if none of opInst's operands were the result
-/// of an affine.apply (i.e., there was no affine computation slice to create),
-/// or if all the affine.apply ops supplying operands to opInst had no uses
-/// besides opInst; otherwise, returns the list of affine.apply operations
-/// created in the output argument `sliceOps`.
-void mlir::createAffineComputationSlice(
-    Operation *opInst, SmallVectorImpl<AffineApplyOp> *sliceOps) {
-  // Collect all operands that are results of affine apply ops.
-  SmallVector<Value, 4> subOperands;
-  subOperands.reserve(opInst->getNumOperands());
-  for (auto operand : opInst->getOperands())
-    if (isa_and_nonnull<AffineApplyOp>(operand.getDefiningOp()))
-      subOperands.push_back(operand);
-
-  // Gather the sequence of AffineApplyOps reachable from 'subOperands'.
-  SmallVector<Operation *, 4> affineApplyOps;
-  getReachableAffineApplyOps(subOperands, affineApplyOps);
-  // Skip transforming if there are no affine maps to compose.
-  if (affineApplyOps.empty())
-    return;
-
-  // Check if all uses of the affine apply ops lie only in this op, in which
-  // case there would be nothing to do.
-  bool localized = true;
-  for (auto *op : affineApplyOps) {
-    for (auto result : op->getResults()) {
-      for (auto *user : result.getUsers()) {
-        if (user != opInst) {
-          localized = false;
-          break;
-        }
-      }
-    }
-  }
-  if (localized)
-    return;
-
-  OpBuilder builder(opInst);
-  SmallVector<Value, 4> composedOpOperands(subOperands);
-  auto composedMap = builder.getMultiDimIdentityMap(composedOpOperands.size());
-  fullyComposeAffineMapAndOperands(&composedMap, &composedOpOperands);
-
-  // Create an affine.apply for each of the map results.
-  sliceOps->reserve(composedMap.getNumResults());
-  for (auto resultExpr : composedMap.getResults()) {
-    auto singleResMap = AffineMap::get(
-        composedMap.getNumDims(), composedMap.getNumSymbols(), resultExpr);
-    sliceOps->push_back(builder.create<AffineApplyOp>(
-        opInst->getLoc(), singleResMap, composedOpOperands));
-  }
-
-  // Construct the new operands, which include the results from the composed
-  // affine apply ops above instead of the existing ones (subOperands). They
-  // differ from opInst's operands only for those operands in 'subOperands',
-  // which are replaced by the corresponding one from 'sliceOps'.
-  SmallVector<Value, 4> newOperands(opInst->getOperands());
-  for (unsigned i = 0, e = newOperands.size(); i < e; i++) {
-    // Replace the subOperands from among the new operands.
-    unsigned j, f;
-    for (j = 0, f = subOperands.size(); j < f; j++) {
-      if (newOperands[i] == subOperands[j])
-        break;
-    }
-    if (j < subOperands.size()) {
-      newOperands[i] = (*sliceOps)[j];
-    }
-  }
-  for (unsigned idx = 0, e = newOperands.size(); idx < e; idx++) {
-    opInst->setOperand(idx, newOperands[idx]);
-  }
-}
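[Reviewer note, not part of the patch] A hedged usage sketch for the slicing utility above; `opInst` stands for any operation whose index operands are produced by affine.apply ops:

  SmallVector<AffineApplyOp, 4> sliceOps;
  createAffineComputationSlice(opInst, &sliceOps);
  if (!sliceOps.empty()) {
    // `opInst` now reads its indices from private single-result affine.apply
    // ops, so it can be shifted or delayed independently of other users of
    // the original affine.apply results.
  }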
-
-/// Enum to classify patterns of affine exprs in a tiled-layout map.
-/// TileFloorDiv: <dim expr> div <tile size>
-/// TileMod: <dim expr> mod <tile size>
-/// TileNone: None of the above
-/// Example:
-/// #tiled_2d_128x256 = affine_map<(d0, d1)
-///                     -> (d0 div 128, d1 div 256, d0 mod 128, d1 mod 256)>
-/// "d0 div 128" and "d1 div 256" ==> TileFloorDiv
-/// "d0 mod 128" and "d1 mod 256" ==> TileMod
-enum TileExprPattern { TileFloorDiv, TileMod, TileNone };
-
-/// Check if `map` is a tiled layout. In a tiled layout, specific k dimensions
-/// that are floordiv'ed by respective tile sizes appear in a mod with the same
-/// tile sizes, and no other expression involves those k dimensions. This
-/// function stores a vector of tuples (`tileSizePos`), each containing the
-/// AffineExpr for a tile size and the positions of the corresponding
-/// `floordiv` and `mod`. If `map` is not a tiled layout, an empty vector is
-/// returned.
-static LogicalResult getTileSizePos(
-    AffineMap map,
-    SmallVectorImpl<std::tuple<AffineExpr, unsigned, unsigned>> &tileSizePos) {
-  // Create `floordivExprs`, a vector of tuples holding the LHS and RHS of each
-  // `floordiv` and its position in the `map` output.
-  // Example: #tiled_2d_128x256 = affine_map<(d0, d1)
-  //   -> (d0 div 128, d1 div 256, d0 mod 128, d1 mod 256)>
-  // In this example, `floordivExprs` includes {d0, 128, 0} and {d1, 256, 1}.
-  SmallVector<std::tuple<AffineExpr, AffineExpr, unsigned>, 4> floordivExprs;
-  unsigned pos = 0;
-  for (AffineExpr expr : map.getResults()) {
-    if (expr.getKind() == AffineExprKind::FloorDiv) {
-      AffineBinaryOpExpr binaryExpr = expr.cast<AffineBinaryOpExpr>();
-      if (binaryExpr.getRHS().isa<AffineConstantExpr>())
-        floordivExprs.emplace_back(
-            std::make_tuple(binaryExpr.getLHS(), binaryExpr.getRHS(), pos));
-    }
-    pos++;
-  }
-  // Not a tiled layout if `floordivExprs` is empty.
-  if (floordivExprs.empty()) {
-    tileSizePos = SmallVector<std::tuple<AffineExpr, unsigned, unsigned>>{};
-    return success();
-  }
-
-  // Check if the LHS of each `floordiv` is used in the LHS of a `mod`. If not,
-  // `map` is not a tiled layout.
-  for (std::tuple<AffineExpr, AffineExpr, unsigned> fexpr : floordivExprs) {
-    AffineExpr floordivExprLHS = std::get<0>(fexpr);
-    AffineExpr floordivExprRHS = std::get<1>(fexpr);
-    unsigned floordivPos = std::get<2>(fexpr);
-
-    // Walk each affine expr of the `map` output except `fexpr`, and check if
-    // the LHS and RHS of `fexpr` are used in the LHS and RHS of a `mod`. If
-    // the LHS of `fexpr` is used in any other expr, the map is not a tiled
-    // layout. Examples of non-tiled layouts:
-    //   affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2 floordiv 256)>
-    //   affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2 mod 128)>
-    //   affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2 mod 256,
-    //                               d2 mod 256)>
-    bool found = false;
-    pos = 0;
-    for (AffineExpr expr : map.getResults()) {
-      bool notTiled = false;
-      if (pos != floordivPos) {
-        expr.walk([&](AffineExpr e) {
-          if (e == floordivExprLHS) {
-            if (expr.getKind() == AffineExprKind::Mod) {
-              AffineBinaryOpExpr binaryExpr = expr.cast<AffineBinaryOpExpr>();
-              // Check if the LHS and RHS of the `mod` are the same as those of
-              // the `floordiv`.
-              if (floordivExprLHS == binaryExpr.getLHS() &&
-                  floordivExprRHS == binaryExpr.getRHS()) {
-                // Save the tile size (RHS of `mod`) and the positions of the
-                // `floordiv` and `mod` if no matching `mod` was found yet.
-                if (!found) {
-                  tileSizePos.emplace_back(
-                      std::make_tuple(binaryExpr.getRHS(), floordivPos, pos));
-                  found = true;
-                } else {
-                  // Non-tiled layout: multiple `mod`s with the same LHS.
-                  // e.g. affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256,
-                  //                                  d2 mod 256, d2 mod 256)>
-                  notTiled = true;
-                }
-              } else {
-                // Non-tiled layout: the RHS of the `mod` differs from that of
-                // the `floordiv`.
-                // e.g. affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256,
-                //                                  d2 mod 128)>
-                notTiled = true;
-              }
-            } else {
-              // Non-tiled layout: the LHS is the same, but not in a `mod`.
-              // e.g. affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256,
-              //                                  d2 floordiv 256)>
-              notTiled = true;
-            }
-          }
-        });
-      }
-      if (notTiled) {
-        tileSizePos = SmallVector<std::tuple<AffineExpr, unsigned, unsigned>>{};
-        return success();
-      }
-      pos++;
-    }
-  }
-  return success();
-}
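[Reviewer note, not part of the patch] Reading the contract above on the running 128x256 example; positions index into the map's results, and `tiledMap` is an assumed AffineMap built from the layout shown:

  // #tiled = affine_map<(d0, d1) -> (d0 floordiv 128, d1 floordiv 256,
  //                                  d0 mod 128, d1 mod 256)>
  SmallVector<std::tuple<AffineExpr, unsigned, unsigned>> tileSizePos;
  (void)getTileSizePos(tiledMap, tileSizePos);
  // Expect: {(128, /*floordiv at*/0, /*mod at*/2), (256, 1, 3)}.
  // A non-tiled layout leaves `tileSizePos` empty instead.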
-
-/// Check if the `dim` dimension of a memrefType with `layoutMap` becomes
-/// dynamic after normalization. Dimensions that include dynamic dimensions in
-/// the map output will become dynamic dimensions. Return true if `dim` is a
-/// dynamic dimension.
-///
-/// Example:
-/// #map0 = affine_map<(d0, d1) -> (d0, d1 floordiv 32, d1 mod 32)>
-///
-/// If d1 is a dynamic dimension, the 2nd and 3rd dimensions of the map output
-/// are dynamic.
-/// memref<4x?xf32, #map0>  ==>  memref<4x?x?xf32>
-static bool
-isNormalizedMemRefDynamicDim(unsigned dim, AffineMap layoutMap,
-                             SmallVectorImpl<unsigned> &inMemrefTypeDynDims,
-                             MLIRContext *context) {
-  bool isDynamicDim = false;
-  AffineExpr expr = layoutMap.getResults()[dim];
-  // Check if the affine expr of this dimension includes a dynamic dimension of
-  // the input memrefType.
-  expr.walk([&inMemrefTypeDynDims, &isDynamicDim, &context](AffineExpr e) {
-    if (e.isa<AffineDimExpr>()) {
-      for (unsigned dm : inMemrefTypeDynDims) {
-        if (e == getAffineDimExpr(dm, context)) {
-          isDynamicDim = true;
-        }
-      }
-    }
-  });
-  return isDynamicDim;
-}
-
-/// Create an affine expr to calculate the dimension size for a tiled-layout
-/// map.
-static AffineExpr createDimSizeExprForTiledLayout(AffineExpr oldMapOutput,
-                                                  TileExprPattern pat) {
-  // Create the map output for the patterns.
-  // "floordiv <tile size>" ==> "ceildiv <tile size>"
-  // "mod <tile size>" ==> "<tile size>"
-  AffineExpr newMapOutput;
-  AffineBinaryOpExpr binaryExpr = nullptr;
-  switch (pat) {
-  case TileExprPattern::TileMod:
-    binaryExpr = oldMapOutput.cast<AffineBinaryOpExpr>();
-    newMapOutput = binaryExpr.getRHS();
-    break;
-  case TileExprPattern::TileFloorDiv:
-    binaryExpr = oldMapOutput.cast<AffineBinaryOpExpr>();
-    newMapOutput = getAffineBinaryOpExpr(
-        AffineExprKind::CeilDiv, binaryExpr.getLHS(), binaryExpr.getRHS());
-    break;
-  default:
-    newMapOutput = oldMapOutput;
-  }
-  return newMapOutput;
-}
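[Reviewer note, not part of the patch] A small worked instance of the rewrite rules above, with `ctx` an assumed MLIRContext*; both helpers come straight from this file:

  AffineExpr d1 = getAffineDimExpr(1, ctx);
  // TileFloorDiv: "d1 floordiv 32" becomes the tile count "d1 ceildiv 32".
  AffineExpr tiles =
      createDimSizeExprForTiledLayout(d1.floorDiv(32), TileFloorDiv);
  // TileMod: "d1 mod 32" becomes the constant tile size "32".
  AffineExpr size = createDimSizeExprForTiledLayout(d1 % 32, TileMod);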
-
-/// Create new maps to calculate each dimension size of `newMemRefType`, and
-/// create `newDynamicSizes` from them by using AffineApplyOp.
-///
-/// Steps for normalizing dynamic memrefs for a tiled layout map.
-/// Example:
-///   #map0 = affine_map<(d0, d1) -> (d0, d1 floordiv 32, d1 mod 32)>
-///   %0 = dim %arg0, %c1 : memref<4x?xf32>
-///   %1 = alloc(%0) : memref<4x?xf32, #map0>
-///
-/// (Before this function)
-/// 1. Check if `map`(#map0) is a tiled layout using `getTileSizePos()`. Only a
-///    single layout map is supported.
-///
-/// 2. Create a normalized memrefType using `isNormalizedMemRefDynamicDim()`.
-///    It is memref<4x?x?xf32> in the above example.
-///
-/// (In this function)
-/// 3. Create new maps to calculate each dimension of the normalized memrefType
-///    using `createDimSizeExprForTiledLayout()`. In the tiled layout, the
-///    dimension size can be calculated by replacing "floordiv <tile size>"
-///    with "ceildiv <tile size>" and "mod <tile size>" with "<tile size>".
-///    - New maps in the above example:
-///      #map0 = affine_map<(d0, d1) -> (d0)>
-///      #map1 = affine_map<(d0, d1) -> (d1 ceildiv 32)>
-///      #map2 = affine_map<(d0, d1) -> (32)>
-///
-/// 4. Create an AffineApplyOp to apply each new map. The outputs of the
-///    AffineApplyOps are used in the dynamicSizes of the new AllocOp.
-///      %0 = dim %arg0, %c1 : memref<4x?xf32>
-///      %c4 = arith.constant 4 : index
-///      %1 = affine.apply #map1(%c4, %0)
-///      %2 = affine.apply #map2(%c4, %0)
-static void createNewDynamicSizes(MemRefType oldMemRefType,
-                                  MemRefType newMemRefType, AffineMap map,
-                                  memref::AllocOp *allocOp, OpBuilder b,
-                                  SmallVectorImpl<Value> &newDynamicSizes) {
-  // Create new inputs for the AffineApplyOps.
-  SmallVector<Value, 4> inAffineApply;
-  ArrayRef<int64_t> oldMemRefShape = oldMemRefType.getShape();
-  unsigned dynIdx = 0;
-  for (unsigned d = 0; d < oldMemRefType.getRank(); ++d) {
-    if (oldMemRefShape[d] < 0) {
-      // Use the dynamicSizes of allocOp for the dynamic dimension.
-      inAffineApply.emplace_back(allocOp->dynamicSizes()[dynIdx]);
-      dynIdx++;
-    } else {
-      // Create a ConstantOp for the static dimension.
-      Attribute constantAttr =
-          b.getIntegerAttr(b.getIndexType(), oldMemRefShape[d]);
-      inAffineApply.emplace_back(
-          b.create<arith::ConstantOp>(allocOp->getLoc(), constantAttr));
-    }
-  }
-
-  // Create a new map to calculate each dimension size of the new memref for
-  // each original map output, but only for the dynamic dimensions of
-  // `newMemRefType`.
-  unsigned newDimIdx = 0;
-  ArrayRef<int64_t> newMemRefShape = newMemRefType.getShape();
-  SmallVector<std::tuple<AffineExpr, unsigned, unsigned>> tileSizePos;
-  (void)getTileSizePos(map, tileSizePos);
-  for (AffineExpr expr : map.getResults()) {
-    if (newMemRefShape[newDimIdx] < 0) {
-      // Create new maps to calculate each dimension size of the new memref.
-      enum TileExprPattern pat = TileExprPattern::TileNone;
-      for (auto pos : tileSizePos) {
-        if (newDimIdx == std::get<1>(pos))
-          pat = TileExprPattern::TileFloorDiv;
-        else if (newDimIdx == std::get<2>(pos))
-          pat = TileExprPattern::TileMod;
-      }
-      AffineExpr newMapOutput = createDimSizeExprForTiledLayout(expr, pat);
-      AffineMap newMap =
-          AffineMap::get(map.getNumInputs(), map.getNumSymbols(), newMapOutput);
-      Value affineApp =
-          b.create<AffineApplyOp>(allocOp->getLoc(), newMap, inAffineApply);
-      newDynamicSizes.emplace_back(affineApp);
-    }
-    newDimIdx++;
-  }
-}
-
-// TODO: Currently works for static memrefs with a single layout map.
-LogicalResult mlir::normalizeMemRef(memref::AllocOp *allocOp) {
-  MemRefType memrefType = allocOp->getType();
-  OpBuilder b(*allocOp);
-
-  // Fetch a new memref type after normalizing the old memref to have an
-  // identity map layout.
-  MemRefType newMemRefType =
-      normalizeMemRefType(memrefType, b, allocOp->symbolOperands().size());
-  if (newMemRefType == memrefType)
-    // Either memrefType already had an identity map or the map couldn't be
-    // transformed to an identity map.
-    return failure();
-
-  Value oldMemRef = allocOp->getResult();
-
-  SmallVector<Value, 4> symbolOperands(allocOp->symbolOperands());
-  AffineMap layoutMap = memrefType.getLayout().getAffineMap();
-  memref::AllocOp newAlloc;
-  // Check if `layoutMap` is a tiled layout. Only a single layout map is
-  // supported for normalizing dynamic memrefs.
-  SmallVector<std::tuple<AffineExpr, unsigned, unsigned>> tileSizePos;
-  (void)getTileSizePos(layoutMap, tileSizePos);
-  if (newMemRefType.getNumDynamicDims() > 0 && !tileSizePos.empty()) {
-    MemRefType oldMemRefType = oldMemRef.getType().cast<MemRefType>();
-    SmallVector<Value, 4> newDynamicSizes;
-    createNewDynamicSizes(oldMemRefType, newMemRefType, layoutMap, allocOp, b,
-                          newDynamicSizes);
-    // Add the new dynamic sizes in the new AllocOp.
-    newAlloc =
-        b.create<memref::AllocOp>(allocOp->getLoc(), newMemRefType,
-                                  newDynamicSizes, allocOp->alignmentAttr());
-  } else {
-    newAlloc = b.create<memref::AllocOp>(allocOp->getLoc(), newMemRefType,
-                                         allocOp->alignmentAttr());
-  }
-  // Replace all uses of the old memref.
-  if (failed(replaceAllMemRefUsesWith(oldMemRef, /*newMemRef=*/newAlloc,
-                                      /*extraIndices=*/{},
-                                      /*indexRemap=*/layoutMap,
-                                      /*extraOperands=*/{},
-                                      /*symbolOperands=*/symbolOperands,
-                                      /*domOpFilter=*/nullptr,
-                                      /*postDomOpFilter=*/nullptr,
-                                      /*allowNonDereferencingOps=*/true))) {
-    // If it failed (due to escapes for example), bail out.
-    newAlloc.erase();
-    return failure();
-  }
-  // Replace any uses of the original alloc op and erase it. All remaining uses
-  // have to be deallocs; replaceAllMemRefUsesWith above would've failed
-  // otherwise.
-  assert(llvm::all_of(oldMemRef.getUsers(), [](Operation *op) {
-    return isa<memref::DeallocOp>(op);
-  }));
-  oldMemRef.replaceAllUsesWith(newAlloc);
-  allocOp->erase();
-  return success();
-}
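[Reviewer note, not part of the patch] The end-to-end effect of `normalizeMemRef` on the running example, sketched as IR; it mirrors the steps documented at `createNewDynamicSizes`, and `alloc` is an assumed handle to the alloc op:

  memref::AllocOp alloc = /* an alloc with the #map0 layout, assumed */;
  if (succeeded(normalizeMemRef(&alloc))) {
    // Before: %1 = memref.alloc(%0) : memref<4x?xf32, #map0>
    // After:  %s1 = affine.apply affine_map<(d0, d1) -> (d1 ceildiv 32)>(%c4, %0)
    //         %s2 = affine.apply affine_map<(d0, d1) -> (32)>(%c4, %0)
    //         %1 = memref.alloc(%s1, %s2) : memref<4x?x?xf32>
  }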
-
-MemRefType mlir::normalizeMemRefType(MemRefType memrefType, OpBuilder b,
-                                     unsigned numSymbolicOperands) {
-  unsigned rank = memrefType.getRank();
-  if (rank == 0)
-    return memrefType;
-
-  if (memrefType.getLayout().isIdentity()) {
-    // Either no map is associated with this memref or this memref has a
-    // trivial (identity) map.
-    return memrefType;
-  }
-  AffineMap layoutMap = memrefType.getLayout().getAffineMap();
-
-  // We don't do any checks for one-to-one-ness; we assume that the map is
-  // one-to-one.
-
-  // Normalize only static memrefs and dynamic memrefs with a tiled-layout map
-  // for now.
-  // TODO: Normalize the other types of dynamic memrefs.
-  SmallVector<std::tuple<AffineExpr, unsigned, unsigned>> tileSizePos;
-  (void)getTileSizePos(layoutMap, tileSizePos);
-  if (memrefType.getNumDynamicDims() > 0 && tileSizePos.empty())
-    return memrefType;
-
-  // We have a single map that is not an identity map. Create a new memref
-  // with the right shape and an identity layout map.
-  ArrayRef<int64_t> shape = memrefType.getShape();
-  // FlatAffineConstraints may later on use symbolicOperands.
-  FlatAffineConstraints fac(rank, numSymbolicOperands);
-  SmallVector<unsigned, 4> memrefTypeDynDims;
-  for (unsigned d = 0; d < rank; ++d) {
-    // Use the constraint system only for static dimensions.
-    if (shape[d] > 0) {
-      fac.addBound(FlatAffineConstraints::LB, d, 0);
-      fac.addBound(FlatAffineConstraints::UB, d, shape[d] - 1);
-    } else {
-      memrefTypeDynDims.emplace_back(d);
-    }
-  }
-  // We compose this map with the original index (logical) space to derive
-  // the upper bounds for the new index space.
-  unsigned newRank = layoutMap.getNumResults();
-  if (failed(fac.composeMatchingMap(layoutMap)))
-    return memrefType;
-  // TODO: Handle semi-affine maps.
-  // Project out the old data dimensions.
-  fac.projectOut(newRank, fac.getNumIds() - newRank - fac.getNumLocalIds());
-  SmallVector<int64_t, 4> newShape(newRank);
-  for (unsigned d = 0; d < newRank; ++d) {
-    // Check if each dimension of the normalized memrefType is dynamic.
-    bool isDynDim = isNormalizedMemRefDynamicDim(
-        d, layoutMap, memrefTypeDynDims, b.getContext());
-    if (isDynDim) {
-      // If a dimension of the new memrefType is dynamic, its size is -1.
-      newShape[d] = -1;
-    } else {
-      // The lower bound for the shape is always zero.
-      auto ubConst = fac.getConstantBound(FlatAffineConstraints::UB, d);
-      // For a static memref and an affine map with no symbols, this is
-      // always bounded.
-      assert(ubConst.hasValue() && "should always have an upper bound");
-      if (ubConst.getValue() < 0)
-        // This is due to an invalid map that maps to a negative space.
-        return memrefType;
-      newShape[d] = ubConst.getValue() + 1;
-    }
-  }
-
-  // Create the new memref type after trivializing the old layout map.
-  MemRefType newMemRefType =
-      MemRefType::Builder(memrefType)
-          .setShape(newShape)
-          .setLayout(AffineMapAttr::get(b.getMultiDimIdentityMap(newRank)));
-
-  return newMemRefType;
-}
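[Reviewer note, not part of the patch] For fully static shapes the bounds logic above reduces to simple interval arithmetic; an illustrative check, where `in` and `b` are assumed values:

  // #tiled = affine_map<(d0, d1) -> (d0, d1 floordiv 32, d1 mod 32)>
  // d1 in [0, 127] => d1 floordiv 32 in [0, 3] and d1 mod 32 in [0, 31].
  MemRefType in = /* memref<64x128xf32, #tiled>, assumed */;
  MemRefType out = normalizeMemRefType(in, b, /*numSymbolicOperands=*/0);
  // out == memref<64x4x32xf32> with the identity layout.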
diff --git a/mlir/test/lib/Dialect/Affine/CMakeLists.txt b/mlir/test/lib/Dialect/Affine/CMakeLists.txt
--- a/mlir/test/lib/Dialect/Affine/CMakeLists.txt
+++ b/mlir/test/lib/Dialect/Affine/CMakeLists.txt
@@ -3,6 +3,8 @@
   TestAffineDataCopy.cpp
   TestAffineLoopUnswitching.cpp
   TestAffineLoopParametricTiling.cpp
+  TestLoopFusion.cpp
+  TestLoopMapping.cpp
   TestLoopPermutation.cpp
   TestVectorizationUtils.cpp
diff --git a/mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp b/mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp
--- a/mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp
+++ b/mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp
@@ -13,10 +13,10 @@
 #include "mlir/Dialect/Affine/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-#include "mlir/Transforms/LoopUtils.h"
 #include "mlir/Transforms/Passes.h"

 #define PASS_NAME "test-affine-data-copy"
diff --git a/mlir/test/lib/Dialect/Affine/TestAffineLoopParametricTiling.cpp b/mlir/test/lib/Dialect/Affine/TestAffineLoopParametricTiling.cpp
--- a/mlir/test/lib/Dialect/Affine/TestAffineLoopParametricTiling.cpp
+++ b/mlir/test/lib/Dialect/Affine/TestAffineLoopParametricTiling.cpp
@@ -12,8 +12,8 @@
 //===----------------------------------------------------------------------===//

 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/Affine/Passes.h"
-#include "mlir/Transforms/LoopUtils.h"

 using namespace mlir;
diff --git a/mlir/test/lib/Transforms/TestLoopFusion.cpp b/mlir/test/lib/Dialect/Affine/TestLoopFusion.cpp
rename from mlir/test/lib/Transforms/TestLoopFusion.cpp
rename to mlir/test/lib/Dialect/Affine/TestLoopFusion.cpp
--- a/mlir/test/lib/Transforms/TestLoopFusion.cpp
+++ b/mlir/test/lib/Dialect/Affine/TestLoopFusion.cpp
@@ -12,11 +12,10 @@
 #include "mlir/Dialect/Affine/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/LoopFusionUtils.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/LoopFusionUtils.h"
-#include "mlir/Transforms/LoopUtils.h"
-#include "mlir/Transforms/Passes.h"

 #define DEBUG_TYPE "test-loop-fusion"
diff --git a/mlir/test/lib/Transforms/TestLoopMapping.cpp b/mlir/test/lib/Dialect/Affine/TestLoopMapping.cpp
rename from mlir/test/lib/Transforms/TestLoopMapping.cpp
rename to mlir/test/lib/Dialect/Affine/TestLoopMapping.cpp
--- a/mlir/test/lib/Transforms/TestLoopMapping.cpp
+++ b/mlir/test/lib/Dialect/Affine/TestLoopMapping.cpp
@@ -12,11 +12,10 @@
 //===----------------------------------------------------------------------===//

 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/SCF/SCF.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/LoopUtils.h"
-#include "mlir/Transforms/Passes.h"

 #include "llvm/ADT/SetVector.h"
diff --git a/mlir/test/lib/Dialect/Affine/TestLoopPermutation.cpp b/mlir/test/lib/Dialect/Affine/TestLoopPermutation.cpp
--- a/mlir/test/lib/Dialect/Affine/TestLoopPermutation.cpp
+++ b/mlir/test/lib/Dialect/Affine/TestLoopPermutation.cpp
@@ -12,9 +12,8 @@
 #include "mlir/Dialect/Affine/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/LoopUtils.h"
-#include "mlir/Transforms/Passes.h"

 #define PASS_NAME "test-loop-permutation"
diff --git a/mlir/test/lib/Dialect/Affine/TestVectorizationUtils.cpp b/mlir/test/lib/Dialect/Affine/TestVectorizationUtils.cpp
--- a/mlir/test/lib/Dialect/Affine/TestVectorizationUtils.cpp
+++ b/mlir/test/lib/Dialect/Affine/TestVectorizationUtils.cpp
@@ -14,6 +14,7 @@
 #include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
 #include "mlir/Dialect/Affine/Analysis/NestedMatcher.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Vector/VectorOps.h"
 #include "mlir/Dialect/Vector/VectorUtils.h"
@@ -21,7 +22,6 @@
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Diagnostics.h"
 #include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/LoopUtils.h"
 #include "mlir/Transforms/Passes.h"
 #include "llvm/ADT/STLExtras.h"
diff --git a/mlir/test/lib/Dialect/SCF/CMakeLists.txt b/mlir/test/lib/Dialect/SCF/CMakeLists.txt
--- a/mlir/test/lib/Dialect/SCF/CMakeLists.txt
+++ b/mlir/test/lib/Dialect/SCF/CMakeLists.txt
@@ -1,5 +1,7 @@
 # Exclude tests from libMLIR.so
 add_mlir_library(MLIRSCFTestPasses
+  TestLoopParametricTiling.cpp
+  TestLoopUnrolling.cpp
   TestSCFUtils.cpp

   EXCLUDE_FROM_LIBMLIR
diff --git a/mlir/test/lib/Transforms/TestLoopParametricTiling.cpp b/mlir/test/lib/Dialect/SCF/TestLoopParametricTiling.cpp
rename from mlir/test/lib/Transforms/TestLoopParametricTiling.cpp
rename to mlir/test/lib/Dialect/SCF/TestLoopParametricTiling.cpp
--- a/mlir/test/lib/Transforms/TestLoopParametricTiling.cpp
+++ b/mlir/test/lib/Dialect/SCF/TestLoopParametricTiling.cpp
@@ -11,10 +11,9 @@
 //===----------------------------------------------------------------------===//

 #include "mlir/Dialect/SCF/SCF.h"
+#include "mlir/Dialect/SCF/Utils.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/LoopUtils.h"
-#include "mlir/Transforms/Passes.h"

 using namespace mlir;

@@ -31,8 +30,7 @@
   }
   StringRef getDescription() const final {
     return "test application of parametric tiling to the outer loops so that "
-           "the "
-           "ranges of outer loops become static";
+           "the ranges of outer loops become static";
   }
   SimpleParametricLoopTilingPass() = default;
   SimpleParametricLoopTilingPass(const SimpleParametricLoopTilingPass &) {}
diff --git a/mlir/test/lib/Transforms/TestLoopUnrolling.cpp b/mlir/test/lib/Dialect/SCF/TestLoopUnrolling.cpp
rename from mlir/test/lib/Transforms/TestLoopUnrolling.cpp
rename to mlir/test/lib/Dialect/SCF/TestLoopUnrolling.cpp
--- a/mlir/test/lib/Transforms/TestLoopUnrolling.cpp
+++ b/mlir/test/lib/Dialect/SCF/TestLoopUnrolling.cpp
@@ -12,11 +12,10 @@
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/Dialect/SCF/SCF.h"
+#include "mlir/Dialect/SCF/Utils.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/LoopUtils.h"
-#include "mlir/Transforms/Passes.h"

 using namespace mlir;
diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt
--- a/mlir/test/lib/Transforms/CMakeLists.txt
+++ b/mlir/test/lib/Transforms/CMakeLists.txt
@@ -2,10 +2,6 @@
 add_mlir_library(MLIRTestTransforms
   TestConstantFold.cpp
   TestInlining.cpp
-  TestLoopFusion.cpp
-  TestLoopMapping.cpp
-  TestLoopParametricTiling.cpp
-  TestLoopUnrolling.cpp

   EXCLUDE_FROM_LIBMLIR
diff --git a/mlir/test/lib/Transforms/TestConstantFold.cpp b/mlir/test/lib/Transforms/TestConstantFold.cpp
--- a/mlir/test/lib/Transforms/TestConstantFold.cpp
+++ b/mlir/test/lib/Transforms/TestConstantFold.cpp
@@ -9,7 +9,6 @@
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/FoldUtils.h"
 #include "mlir/Transforms/Passes.h"
-#include "mlir/Transforms/Utils.h"

 using namespace mlir;
diff --git a/mlir/unittests/Transforms/CMakeLists.txt b/mlir/unittests/Transforms/CMakeLists.txt
--- a/mlir/unittests/Transforms/CMakeLists.txt
+++ b/mlir/unittests/Transforms/CMakeLists.txt
@@ -4,4 +4,5 @@
 )
 target_link_libraries(MLIRTransformsTests
   PRIVATE
+  MLIRParser
   MLIRTransforms)