diff --git a/mlir/include/mlir/Dialect/Affine/Utils.h b/mlir/include/mlir/Dialect/Affine/Utils.h --- a/mlir/include/mlir/Dialect/Affine/Utils.h +++ b/mlir/include/mlir/Dialect/Affine/Utils.h @@ -14,6 +14,8 @@ #define MLIR_DIALECT_AFFINE_UTILS_H #include "mlir/Support/LLVM.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" namespace mlir { @@ -34,6 +36,17 @@ /// significant code expansion in some cases. LogicalResult hoistAffineIfOp(AffineIfOp ifOp, bool *folded = nullptr); +/// Holds parameters to perform n-D vectorization on a loop nest. +struct VectorizationStrategy { + // Vectorization factors to apply to each target vector dimension. + // Each factor will be applied to a different loop. + SmallVector vectorSizes; + // Maps each AffineForOp vectorization candidate with its vector dimension. + // The candidate will be vectorize using the vectorization factor in + // 'vectorSizes' for that dimension. + DenseMap loopToVectorDim; +}; + /// Vectorizes affine loops in 'loops' using the n-D vectorization factors in /// 'vectorSizes'. By default, each vectorization factor is applied /// inner-to-outer to the loops of each loop nest. 'fastestVaryingPattern' can @@ -43,6 +56,11 @@ llvm::DenseSet> &loops, ArrayRef vectorSizes, ArrayRef fastestVaryingPattern); +/// Vectorizes affine loops from a single loop nest using the n-D vectorization +/// factors 'strategy'. Nest's loops must be provided in DFS pre-order. +LogicalResult vectorizeAffineLoopNest(ArrayRef loops, + const VectorizationStrategy &strategy); + } // namespace mlir #endif // MLIR_DIALECT_AFFINE_UTILS_H diff --git a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp --- a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp @@ -583,17 +583,6 @@ vectorSizes = virtualVectorSize; } -/////// TODO: Hoist to a VectorizationStrategy.cpp when appropriate. -///////// -namespace { - -struct VectorizationStrategy { - SmallVector vectorSizes; - DenseMap loopToVectorDim; -}; - -} // end anonymous namespace - static void vectorizeLoopIfProfitable(Operation *loop, unsigned depthInPattern, unsigned patternDepth, VectorizationStrategy *strategy) { @@ -856,44 +845,40 @@ }; } -/// Apply vectorization of `loop` according to `state`. This is only triggered -/// if all vectorizations in `childrenMatches` have already succeeded -/// recursively in DFS post-order. -static LogicalResult -vectorizeLoopsAndLoadsRecursively(NestedMatch oneMatch, - VectorizationState *state) { - auto *loopInst = oneMatch.getMatchedOperation(); - auto loop = cast(loopInst); - auto childrenMatches = oneMatch.getMatchedChildren(); - - // 1. DFS postorder recursion, if any of my children fails, I fail too. - for (auto m : childrenMatches) { - if (failed(vectorizeLoopsAndLoadsRecursively(m, state))) { - return failure(); +/// Apply vectorization of `loop` according to `state`. `loops` must be +/// provided in a DFS pre-order and are processed in post-order to ensure that +/// all the children loops have already been vectorized before vectorizing the +/// parent loop. +static LogicalResult vectorizeLoopsAndLoads(ArrayRef loops, + VectorizationState *state) { + // Vectorize loops in post-order. If any children fails, the parent will fail + // too. + for (auto loop : llvm::reverse(loops)) { + // 1. This loop may have been omitted from vectorization for various reasons + // (e.g. due to the performance model or pattern depth > vector size). + auto it = state->strategy->loopToVectorDim.find(loop.getOperation()); + if (it == state->strategy->loopToVectorDim.end()) { + continue; } - } - // 2. This loop may have been omitted from vectorization for various reasons - // (e.g. due to the performance model or pattern depth > vector size). - auto it = state->strategy->loopToVectorDim.find(loopInst); - if (it == state->strategy->loopToVectorDim.end()) { - return success(); + // 2. Actual post-order transformation. + auto vectorDim = it->second; + assert(vectorDim < state->strategy->vectorSizes.size() && + "vector dim overflow"); + // a. get actual vector size + auto vectorSize = state->strategy->vectorSizes[vectorDim]; + // b. loop transformation for early vectorization is still subject to + // exploratory tradeoffs (see top of the file). Apply coarsening, i.e.: + // | ub -> ub + // | step -> step * vectorSize + LLVM_DEBUG(dbgs() << "\n[early-vect] vectorizeForOp by " << vectorSize + << " : \n" + << loop); + if (failed(vectorizeAffineForOp(loop, loop.getStep() * vectorSize, state))) + return failure(); } - // 3. Actual post-order transformation. - auto vectorDim = it->second; - assert(vectorDim < state->strategy->vectorSizes.size() && - "vector dim overflow"); - // a. get actual vector size - auto vectorSize = state->strategy->vectorSizes[vectorDim]; - // b. loop transformation for early vectorization is still subject to - // exploratory tradeoffs (see top of the file). Apply coarsening, i.e.: - // | ub -> ub - // | step -> step * vectorSize - LLVM_DEBUG(dbgs() << "\n[early-vect] vectorizeForOp by " << vectorSize - << " : "); - LLVM_DEBUG(loopInst->print(dbgs())); - return vectorizeAffineForOp(loop, loop.getStep() * vectorSize, state); + return success(); } /// Tries to transform a scalar constant into a vector splat of that constant. @@ -1104,16 +1089,24 @@ return success(); } -/// Vectorization is a recursive procedure where anything below can fail. -/// The root match thus needs to maintain a clone for handling failure. -/// Each root may succeed independently but will otherwise clean after itself if -/// anything below it fails. -static LogicalResult vectorizeRootMatch(NestedMatch m, - VectorizationStrategy *strategy) { - auto loop = cast(m.getMatchedOperation()); - OperationFolder folder(loop.getContext()); +/// Returns in 'loops' a DFS pre-order of AffineForOp's in 'match'. +static void getMatchedAffineLoops(NestedMatch match, + SmallVectorImpl &loops) { + loops.push_back(cast(match.getMatchedOperation())); + for (auto childMatch : match.getMatchedChildren()) { + getMatchedAffineLoops(childMatch, loops); + } +} + +/// Internal implementation to vectorize affine loops from a single loop nest +/// using the n-D vectorization factors in 'strategy'. Nest's loops must be +/// provided in DFS pre-order. +static LogicalResult vectorizeLoopNest(ArrayRef loops, + const VectorizationStrategy &strategy) { + AffineForOp rootLoop = loops.front(); + OperationFolder folder(rootLoop.getContext()); VectorizationState state; - state.strategy = strategy; + state.strategy = &strategy; state.folder = &folder; // Since patterns are recursive, they can very well intersect. @@ -1123,7 +1116,7 @@ // vectorizable. If a pattern is not vectorizable anymore, we just skip it. // TODO: implement a non-greedy profitability analysis that keeps only // non-intersecting patterns. - if (!isVectorizableLoopBody(loop, vectorTransferPattern())) { + if (!isVectorizableLoopBody(rootLoop, vectorTransferPattern())) { LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ loop is not vectorizable"); return failure(); } @@ -1131,7 +1124,7 @@ /// Sets up error handling for this root loop. This is how the root match /// maintains a clone for handling failure and restores the proper state via /// RAII. - auto *loopInst = loop.getOperation(); + auto *loopInst = rootLoop.getOperation(); OpBuilder builder(loopInst); auto clonedLoop = cast(builder.clone(*loopInst)); struct Guard { @@ -1146,17 +1139,17 @@ } AffineForOp loop; AffineForOp clonedLoop; - } guard{loop, clonedLoop}; + } guard{rootLoop, clonedLoop}; ////////////////////////////////////////////////////////////////////////////// // Start vectorizing. // From now on, any error triggers the scope guard above. ////////////////////////////////////////////////////////////////////////////// - // 1. Vectorize all the loops matched by the pattern, recursively. + // 1. Vectorize all the loop candidates, in post-order. // This also vectorizes the roots (AffineLoadOp) as well as registers the // terminals (AffineStoreOp) for post-processing vectorization (we need to // wait for all use-def chains into them to be vectorized first). - if (failed(vectorizeLoopsAndLoadsRecursively(m, &state))) { + if (failed(vectorizeLoopsAndLoads(loops, &state))) { LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ failed root vectorizeLoop"); return guard.failure(); } @@ -1188,38 +1181,25 @@ return guard.success(); } -/// Applies vectorization to the current Function by searching over a bunch of -/// predetermined patterns. -void Vectorize::runOnFunction() { - FuncOp f = getFunction(); - if (!fastestVaryingPattern.empty() && - fastestVaryingPattern.size() != vectorSizes.size()) { - f.emitRemark("Fastest varying pattern specified with different size than " - "the vector size."); - return signalPassFailure(); - } - - DenseSet parallelLoops; - f.walk([¶llelLoops](AffineForOp loop) { - if (isLoopParallel(loop)) - parallelLoops.insert(loop); - }); - - vectorizeAffineLoops(f, parallelLoops, vectorSizes, fastestVaryingPattern); +/// Vectorization is a recursive procedure where anything below can fail. +/// The root match thus needs to maintain a clone for handling failure. +/// Each root may succeed independently but will otherwise clean after itself if +/// anything below it fails. +static LogicalResult vectorizeRootMatch(NestedMatch m, + const VectorizationStrategy &strategy) { + SmallVector loopsToVectorize; + getMatchedAffineLoops(m, loopsToVectorize); + return vectorizeLoopNest(loopsToVectorize, strategy); } -namespace mlir { - -/// Vectorizes affine loops in 'loops' using the n-D vectorization factors in -/// 'vectorSizes'. By default, each vectorization factor is applied -/// inner-to-outer to the loops of each loop nest. 'fastestVaryingPattern' can -/// be optionally used to provide a different loop vectorization order. -void vectorizeAffineLoops(Operation *parentOp, DenseSet &loops, - ArrayRef vectorSizes, - ArrayRef fastestVaryingPattern) { - // Thread-safe RAII local context, BumpPtrAllocator freed on exit. - NestedPatternContext mlContext; - +/// Internal implementation to vectorize affine loops in 'loops' using the n-D +/// vectorization factors in 'vectorSizes'. By default, each vectorization +/// factor is applied inner-to-outer to the loops of each loop nest. +/// 'fastestVaryingPattern' can be optionally used to provide a different loop +/// vectorization order. +static void vectorizeLoops(Operation *parentOp, DenseSet &loops, + ArrayRef vectorSizes, + ArrayRef fastestVaryingPattern) { for (auto &pat : makePatterns(loops, vectorSizes.size(), fastestVaryingPattern)) { LLVM_DEBUG(dbgs() << "\n******************************************"); @@ -1245,7 +1225,7 @@ &strategy); // TODO: if pattern does not apply, report it; alter the // cost/benefit. - vectorizeRootMatch(m, &strategy); + vectorizeRootMatch(m, strategy); // TODO: some diagnostics if failure to vectorize occurs. } } @@ -1260,4 +1240,77 @@ return std::make_unique(); } +/// Applies vectorization to the current Function by searching over a bunch of +/// predetermined patterns. +void Vectorize::runOnFunction() { + FuncOp f = getFunction(); + if (!fastestVaryingPattern.empty() && + fastestVaryingPattern.size() != vectorSizes.size()) { + f.emitRemark("Fastest varying pattern specified with different size than " + "the vector size."); + return signalPassFailure(); + } + + DenseSet parallelLoops; + f.walk([¶llelLoops](AffineForOp loop) { + if (isLoopParallel(loop)) + parallelLoops.insert(loop); + }); + + // Thread-safe RAII local context, BumpPtrAllocator freed on exit. + NestedPatternContext mlContext; + vectorizeLoops(f, parallelLoops, vectorSizes, fastestVaryingPattern); +} + +/// Verify pre-order of affine loops in 'loops'. +static void verifyLoopNestPreOrder(ArrayRef loops) { + DenseSet visited; + visited.insert(loops.front()); + + for (auto loop : loops) { + if (visited.count(loop)) + // Skip root loop. + continue; + + // Parent loop must have been already visited if they are in pre-order. + auto parentLoop = loop.getParentOfType(); + assert(!parentLoop && "Unexpected root loop"); + assert(!visited.count(parentLoop) && "Loops are not in pre-order."); + } +} + +namespace mlir { + +/// External utility to vectorize affine loops in 'loops' using the n-D +/// vectorization factors in 'vectorSizes'. By default, each vectorization +/// factor is applied inner-to-outer to the loops of each loop nest. +/// 'fastestVaryingPattern' can be optionally used to provide a different loop +/// vectorization order. +void vectorizeAffineLoops(Operation *parentOp, DenseSet &loops, + ArrayRef vectorSizes, + ArrayRef fastestVaryingPattern) { + // Thread-safe RAII local context, BumpPtrAllocator freed on exit. + NestedPatternContext mlContext; + vectorizeLoops(parentOp, loops, vectorSizes, fastestVaryingPattern); +} + +/// External utility to vectorize affine loops from a single loop nest using the +/// n-D vectorization factors encoded in 'strategy'. Nest's loops must be +/// provided in DFS pre-order. +LogicalResult vectorizeAffineLoopNest(ArrayRef loops, + const VectorizationStrategy &strategy) { + // Thread-safe RAII local context, BumpPtrAllocator freed on exit. + NestedPatternContext mlContext; + verifyLoopNestPreOrder(loops); + return vectorizeLoopNest(loops, strategy); +} + +std::unique_ptr> +createSuperVectorizePass(ArrayRef virtualVectorSize) { + return std::make_unique(virtualVectorSize); +} +std::unique_ptr> createSuperVectorizePass() { + return std::make_unique(); +} + } // namespace mlir diff --git a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir --- a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir @@ -1,7 +1,8 @@ // RUN: mlir-opt %s -affine-super-vectorize="virtual-vector-size=128 test-fastest-varying=0" | FileCheck %s // Permutation maps used in vectorization. -// CHECK: #[[$map_proj_d0d1_0:map[0-9]+]] = affine_map<(d0, d1) -> (0)> +// CHECK-DAG: #[[$map_proj_d0d1_0:map[0-9]+]] = affine_map<(d0, d1) -> (0)> +// CHECK-DAG: #[[$map_id1:map[0-9]+]] = affine_map<(d0) -> (d0)> #map0 = affine_map<(d0) -> (d0)> #mapadd1 = affine_map<(d0) -> (d0 + 1)> @@ -26,8 +27,8 @@ %P = dim %B, %c2 : memref // CHECK: for {{.*}} step 128 -// CHECK-NEXT: %{{.*}} = affine.apply #map0(%[[C0]]) -// CHECK-NEXT: %{{.*}} = affine.apply #map0(%[[C0]]) +// CHECK-NEXT: %{{.*}} = affine.apply #[[$map_id1]](%[[C0]]) +// CHECK-NEXT: %{{.*}} = affine.apply #[[$map_id1]](%[[C0]]) // CHECK-NEXT: %{{.*}} = constant 0.0{{.*}}: f32 // CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[$map_proj_d0d1_0]]} : memref, vector<128xf32> affine.for %i0 = 0 to %M { // vectorized due to scalar -> vector @@ -331,8 +332,8 @@ // CHECK: affine.for %{{.*}}{{[0-9]*}} = 0 to %{{[0-9]*}} { // CHECK: for [[IV18:%[a-zA-Z0-9]+]] = 0 to [[ARG_M]] step 128 -// CHECK: %{{.*}} = affine.apply #map0(%{{.*}}) -// CHECK: %{{.*}} = affine.apply #map0(%{{.*}}) +// CHECK: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}}) +// CHECK: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}}) // CHECK: %{{.*}} = constant 0.0{{.*}}: f32 // CHECK: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[$map_proj_d0d1_0]]} : memref, vector<128xf32> affine.for %i17 = 0 to %M { // not vectorized, the 1-D pattern that matched %{{.*}} in DFS post-order prevents vectorizing %{{.*}} @@ -360,8 +361,8 @@ // CHECK: affine.for %{{.*}}{{[0-9]*}} = 0 to %{{[0-9]*}} { // CHECK: for [[IV18:%[a-zA-Z0-9]+]] = 0 to [[ARG_M]] step 128 -// CHECK: %{{.*}} = affine.apply #map0(%{{.*}}) -// CHECK-NEXT: %{{.*}} = affine.apply #map0(%{{.*}}) +// CHECK: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}}) +// CHECK-NEXT: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}}) // CHECK-NEXT: %{{.*}} = constant 0.0{{.*}}: f32 // CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[$map_proj_d0d1_0]]} : memref, vector<128xf32> affine.for %i17 = 0 to %M { // not vectorized, the 1-D pattern that matched %i18 in DFS post-order prevents vectorizing %{{.*}} diff --git a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir --- a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir @@ -124,7 +124,7 @@ } // VECT: affine.for %[[I2:.*]] = #[[$map_id1]](%[[C0]]) to #[[$map_id1]](%[[M]]) step 4 { // VECT-NEXT: affine.for %[[I3:.*]] = #[[$map_id1]](%[[C0]]) to #[[$map_id1]](%[[N]]) step 8 { - // VECT-NEXT: affine.for %[[I4:.*]] = #map5(%[[C0]]) to #[[$map_id1]](%[[K]]) { + // VECT-NEXT: affine.for %[[I4:.*]] = #{{map[0-9]+}}(%[[C0]]) to #[[$map_id1]](%[[K]]) { // VECT: %[[A:.*]] = vector.transfer_read %{{.*}}[%[[I4]], %[[I3]]], %{{.*}} {permutation_map = #[[$map_proj_d0d1_zerod1]]} : memref, vector<4x8xf32> // VECT: %[[B:.*]] = vector.transfer_read %{{.*}}[%[[I2]], %[[I4]]], %{{.*}} {permutation_map = #[[$map_proj_d0d1_d0zero]]} : memref, vector<4x8xf32> // VECT-NEXT: %[[C:.*]] = mulf %[[B]], %[[A]] : vector<4x8xf32>