diff --git a/mlir/include/mlir/Dialect/Affine/Utils.h b/mlir/include/mlir/Dialect/Affine/Utils.h --- a/mlir/include/mlir/Dialect/Affine/Utils.h +++ b/mlir/include/mlir/Dialect/Affine/Utils.h @@ -14,6 +14,8 @@ #define MLIR_DIALECT_AFFINE_UTILS_H #include "mlir/Support/LLVM.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" namespace mlir { @@ -34,6 +36,47 @@ /// significant code expansion in some cases. LogicalResult hoistAffineIfOp(AffineIfOp ifOp, bool *folded = nullptr); +/// Holds parameters to perform n-D vectorization on a single loop nest. +/// For example, for the following loop nest: +/// +/// func @vec2d(%in: memref<64x128x512xf32>, %out: memref<64x128x512xf32>) { +/// affine.for %i0 = 0 to 64 { +/// affine.for %i1 = 0 to 128 { +/// affine.for %i2 = 0 to 512 { +/// %ld = affine.load %in[%i0, %i1, %i2] : memref<64x128x512xf32> +/// affine.store %ld, %out[%i0, %i1, %i2] : memref<64x128x512xf32> +/// } +/// } +/// } +/// return +/// } +/// +/// and VectorizationStrategy = 'vectorSizes = {8, 4}', 'loopToVectorDim = +/// {{i1->0}, {i2->1}}', SuperVectorizer will generate: +/// +/// func @vec2d(%arg0: memref<64x128x512xf32>, %arg1: memref<64x128x512xf32>) { +/// affine.for %arg2 = 0 to 64 { +/// affine.for %arg3 = 0 to 128 step 8 { +/// affine.for %arg4 = 0 to 512 step 4 { +/// %cst = constant 0.000000e+00 : f32 +/// %0 = vector.transfer_read %arg0[%arg2, %arg3, %arg4], %cst : ... +/// vector.transfer_write %0, %arg1[%arg2, %arg3, %arg4] : ... +/// } +/// } +/// } +/// return +/// } +// TODO: Hoist to a VectorizationStrategy.cpp when appropriate. +struct VectorizationStrategy { + // Vectorization factors to apply to each target vector dimension. + // Each factor will be applied to a different loop. + SmallVector vectorSizes; + // Maps each AffineForOp vectorization candidate with its vector dimension. + // The candidate will be vectorized using the vectorization factor in + // 'vectorSizes' for that dimension. 
+ DenseMap loopToVectorDim; +}; + /// Vectorizes affine loops in 'loops' using the n-D vectorization factors in /// 'vectorSizes'. By default, each vectorization factor is applied /// inner-to-outer to the loops of each loop nest. 'fastestVaryingPattern' can @@ -43,6 +86,45 @@ llvm::DenseSet> &loops, ArrayRef vectorSizes, ArrayRef fastestVaryingPattern); +/// External utility to vectorize affine loops from a single loop nest using an +/// n-D vectorization strategy (see doc in VectorizationStrategy definition). +/// Loops are provided in a 2D vector container. The first dimension represents +/// the nesting level relative to the loops to be vectorized. The second +/// dimension contains the loops. This means that: +/// a) every loop in 'loops[i]' must have a parent loop in 'loops[i-1]', +/// b) a loop in 'loops[i]' may or may not have a child loop in 'loops[i+1]'. +/// +/// For example, for the following loop nest: +/// +/// func @vec2d(%in0: memref<64x128x512xf32>, %in1: memref<64x128x128xf32>, +/// %out0: memref<64x128x512xf32>, +/// %out1: memref<64x128x128xf32>) { +/// affine.for %i0 = 0 to 64 { +/// affine.for %i1 = 0 to 128 { +/// affine.for %i2 = 0 to 512 { +/// %ld = affine.load %in0[%i0, %i1, %i2] : memref<64x128x512xf32> +/// affine.store %ld, %out0[%i0, %i1, %i2] : memref<64x128x512xf32> +/// } +/// affine.for %i3 = 0 to 128 { +/// %ld = affine.load %in1[%i0, %i1, %i3] : memref<64x128x128xf32> +/// affine.store %ld, %out1[%i0, %i1, %i3] : memref<64x128x128xf32> +/// } +/// } +/// } +/// return +/// } +/// +/// loops = {{%i0}, {%i2, %i3}}, to vectorize the outermost and the two +/// innermost loops; +/// loops = {{%i1}, {%i2, %i3}}, to vectorize the middle and the two innermost +/// loops; +/// loops = {{%i2}}, to vectorize only the first innermost loop; +/// loops = {{%i3}}, to vectorize only the second innermost loop; +/// loops = {{%i1}}, to vectorize only the middle loop. 
+LogicalResult +vectorizeAffineLoopNest(std::vector> &loops, + const VectorizationStrategy &strategy); + /// Normalize a affine.parallel op so that lower bounds are 0 and steps are 1. /// As currently implemented, this transformation cannot fail and will return /// early if the op is already in a normalized form. diff --git a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp --- a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp @@ -254,8 +254,8 @@ /// interference); /// 3. Then, for each pattern in order: /// a. applying iterative rewriting of the loop and the load operations in -/// DFS postorder. Rewriting is implemented by coarsening the loops and -/// turning load operations into opaque vector.transfer_read ops; +/// inner-to-outer order. Rewriting is implemented by coarsening the loops +/// and turning load operations into opaque vector.transfer_read ops; /// b. keeping track of the load operations encountered as "roots" and the /// store operations as "terminals"; /// c. traversing the use-def chains starting from the roots and iteratively @@ -584,17 +584,6 @@ vectorSizes = virtualVectorSize; } -/////// TODO: Hoist to a VectorizationStrategy.cpp when appropriate. -///////// -namespace { - -struct VectorizationStrategy { - SmallVector vectorSizes; - DenseMap loopToVectorDim; -}; - -} // end anonymous namespace - static void vectorizeLoopIfProfitable(Operation *loop, unsigned depthInPattern, unsigned patternDepth, VectorizationStrategy *strategy) { @@ -857,44 +846,44 @@ }; } -/// Apply vectorization of `loop` according to `state`. This is only triggered -/// if all vectorizations in `childrenMatches` have already succeeded -/// recursively in DFS post-order. +/// Apply vectorization of `loops` according to `state`.
`loops` are processed in +/// inner-to-outer order to ensure that all the child loops have already been +/// vectorized before vectorizing the parent loop. static LogicalResult -vectorizeLoopsAndLoadsRecursively(NestedMatch oneMatch, - VectorizationState *state) { - auto *loopInst = oneMatch.getMatchedOperation(); - auto loop = cast(loopInst); - auto childrenMatches = oneMatch.getMatchedChildren(); - - // 1. DFS postorder recursion, if any of my children fails, I fail too. - for (auto m : childrenMatches) { - if (failed(vectorizeLoopsAndLoadsRecursively(m, state))) { - return failure(); - } - } +vectorizeLoopsAndLoads(std::vector> &loops, + VectorizationState *state) { + // Vectorize loops in inner-to-outer order. If any child fails, the parent + // will fail too. + for (auto &loopsInLevel : llvm::reverse(loops)) { + for (AffineForOp loop : loopsInLevel) { + // 1. This loop may have been omitted from vectorization for various + // reasons (e.g. due to the performance model or pattern depth > vector + // size). + auto it = state->strategy->loopToVectorDim.find(loop.getOperation()); + if (it == state->strategy->loopToVectorDim.end()) + continue; - // 2. This loop may have been omitted from vectorization for various reasons - // (e.g. due to the performance model or pattern depth > vector size). - auto it = state->strategy->loopToVectorDim.find(loopInst); - if (it == state->strategy->loopToVectorDim.end()) { - return success(); + // 2. Actual inner-to-outer transformation. + auto vectorDim = it->second; + assert(vectorDim < state->strategy->vectorSizes.size() && + "vector dim overflow"); + // a. get actual vector size + auto vectorSize = state->strategy->vectorSizes[vectorDim]; + // b. loop transformation for early vectorization is still subject to + // exploratory tradeoffs (see top of the file).
Apply coarsening, + // i.e.: + // | ub -> ub + // | step -> step * vectorSize + LLVM_DEBUG(dbgs() << "\n[early-vect] vectorizeForOp by " << vectorSize + << " : \n" + << loop); + if (failed( + vectorizeAffineForOp(loop, loop.getStep() * vectorSize, state))) + return failure(); + } // end for. } - // 3. Actual post-order transformation. - auto vectorDim = it->second; - assert(vectorDim < state->strategy->vectorSizes.size() && - "vector dim overflow"); - // a. get actual vector size - auto vectorSize = state->strategy->vectorSizes[vectorDim]; - // b. loop transformation for early vectorization is still subject to - // exploratory tradeoffs (see top of the file). Apply coarsening, i.e.: - // | ub -> ub - // | step -> step * vectorSize - LLVM_DEBUG(dbgs() << "\n[early-vect] vectorizeForOp by " << vectorSize - << " : "); - LLVM_DEBUG(loopInst->print(dbgs())); - return vectorizeAffineForOp(loop, loop.getStep() * vectorSize, state); + return success(); } /// Tries to transform a scalar constant into a vector splat of that constant. @@ -1145,16 +1134,46 @@ return success(); } -/// Vectorization is a recursive procedure where anything below can fail. -/// The root match thus needs to maintain a clone for handling failure. -/// Each root may succeed independently but will otherwise clean after itself if -/// anything below it fails. -static LogicalResult vectorizeRootMatch(NestedMatch m, - VectorizationStrategy *strategy) { - auto loop = cast(m.getMatchedOperation()); - OperationFolder folder(loop.getContext()); +/// Recursive implementation to convert all the nested loops in 'match' to a 2D +/// vector container that preserves the relative nesting level of each loop with +/// respect to the others in 'match'. 'currentLevel' is the nesting level that +/// will be assigned to the loop in the current 'match'. 
+static void +getMatchedAffineLoopsRec(NestedMatch match, unsigned currentLevel, + std::vector> &loops) { + // Add a new empty level to the output if it doesn't exist already. + assert(currentLevel <= loops.size() && "Unexpected currentLevel"); + if (currentLevel == loops.size()) + loops.push_back(SmallVector()); + + // Add current match and recursively visit its children. + loops[currentLevel].push_back(cast(match.getMatchedOperation())); + for (auto childMatch : match.getMatchedChildren()) { + getMatchedAffineLoopsRec(childMatch, currentLevel + 1, loops); + } +} + +/// Converts all the nested loops in 'match' to a 2D vector container that +/// preserves the relative nesting level of each loop with respect to the others +/// in 'match'. This means that every loop in 'loops[i]' will have a parent loop +/// in 'loops[i-1]'. A loop in 'loops[i]' may or may not have a child loop in +/// 'loops[i+1]'. +static void +getMatchedAffineLoops(NestedMatch match, + std::vector> &loops) { + getMatchedAffineLoopsRec(match, /*currentLevel=*/0, loops); +} + +/// Internal implementation to vectorize affine loops from a single loop nest +/// using an n-D vectorization strategy. +static LogicalResult +vectorizeLoopNest(std::vector> &loops, + const VectorizationStrategy &strategy) { + assert(loops[0].size() == 1 && "Expected single root loop"); + AffineForOp rootLoop = loops[0][0]; + OperationFolder folder(rootLoop.getContext()); VectorizationState state; - state.strategy = strategy; + state.strategy = &strategy; state.folder = &folder; // Since patterns are recursive, they can very well intersect. @@ -1164,7 +1183,7 @@ // vectorizable. If a pattern is not vectorizable anymore, we just skip it. // TODO: implement a non-greedy profitability analysis that keeps only // non-intersecting patterns.
- if (!isVectorizableLoopBody(loop, vectorTransferPattern())) { + if (!isVectorizableLoopBody(rootLoop, vectorTransferPattern())) { LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ loop is not vectorizable"); return failure(); } @@ -1172,7 +1191,7 @@ /// Sets up error handling for this root loop. This is how the root match /// maintains a clone for handling failure and restores the proper state via /// RAII. - auto *loopInst = loop.getOperation(); + auto *loopInst = rootLoop.getOperation(); OpBuilder builder(loopInst); auto clonedLoop = cast(builder.clone(*loopInst)); struct Guard { @@ -1187,17 +1206,17 @@ } AffineForOp loop; AffineForOp clonedLoop; - } guard{loop, clonedLoop}; + } guard{rootLoop, clonedLoop}; ////////////////////////////////////////////////////////////////////////////// // Start vectorizing. // From now on, any error triggers the scope guard above. ////////////////////////////////////////////////////////////////////////////// - // 1. Vectorize all the loops matched by the pattern, recursively. + // 1. Vectorize all the loop candidates, in inner-to-outer order. // This also vectorizes the roots (AffineLoadOp) as well as registers the // terminals (AffineStoreOp) for post-processing vectorization (we need to // wait for all use-def chains into them to be vectorized first). - if (failed(vectorizeLoopsAndLoadsRecursively(m, &state))) { + if (failed(vectorizeLoopsAndLoads(loops, &state))) { LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ failed root vectorizeLoop"); return guard.failure(); } @@ -1229,38 +1248,25 @@ return guard.success(); } -/// Applies vectorization to the current Function by searching over a bunch of -/// predetermined patterns. 
-void Vectorize::runOnFunction() { - FuncOp f = getFunction(); - if (!fastestVaryingPattern.empty() && - fastestVaryingPattern.size() != vectorSizes.size()) { - f.emitRemark("Fastest varying pattern specified with different size than " - "the vector size."); - return signalPassFailure(); - } - - DenseSet parallelLoops; - f.walk([&parallelLoops](AffineForOp loop) { - if (isLoopParallel(loop)) - parallelLoops.insert(loop); - }); - - vectorizeAffineLoops(f, parallelLoops, vectorSizes, fastestVaryingPattern); +/// Vectorization is a recursive procedure where anything below can fail. The +/// root match thus needs to maintain a clone for handling failure. Each root +/// may succeed independently but will otherwise clean after itself if anything +/// below it fails. +static LogicalResult vectorizeRootMatch(NestedMatch m, + const VectorizationStrategy &strategy) { + std::vector> loopsToVectorize; + getMatchedAffineLoops(m, loopsToVectorize); + return vectorizeLoopNest(loopsToVectorize, strategy); } -namespace mlir { - -/// Vectorizes affine loops in 'loops' using the n-D vectorization factors in -/// 'vectorSizes'. By default, each vectorization factor is applied -/// inner-to-outer to the loops of each loop nest. 'fastestVaryingPattern' can -/// be optionally used to provide a different loop vectorization order. -void vectorizeAffineLoops(Operation *parentOp, DenseSet &loops, - ArrayRef vectorSizes, - ArrayRef fastestVaryingPattern) { - // Thread-safe RAII local context, BumpPtrAllocator freed on exit. - NestedPatternContext mlContext; - +/// Internal implementation to vectorize affine loops in 'loops' using the n-D +/// vectorization factors in 'vectorSizes'. By default, each vectorization +/// factor is applied inner-to-outer to the loops of each loop nest. +/// 'fastestVaryingPattern' can be optionally used to provide a different loop +/// vectorization order.
+static void vectorizeLoops(Operation *parentOp, DenseSet &loops, + ArrayRef vectorSizes, + ArrayRef fastestVaryingPattern) { for (auto &pat : makePatterns(loops, vectorSizes.size(), fastestVaryingPattern)) { LLVM_DEBUG(dbgs() << "\n******************************************"); @@ -1286,7 +1292,7 @@ &strategy); // TODO: if pattern does not apply, report it; alter the // cost/benefit. - vectorizeRootMatch(m, &strategy); + vectorizeRootMatch(m, strategy); // TODO: some diagnostics if failure to vectorize occurs. } } @@ -1301,4 +1307,127 @@ return std::make_unique(); } +/// Applies vectorization to the current function by searching over a bunch of +/// predetermined patterns. +void Vectorize::runOnFunction() { + FuncOp f = getFunction(); + if (!fastestVaryingPattern.empty() && + fastestVaryingPattern.size() != vectorSizes.size()) { + f.emitRemark("Fastest varying pattern specified with different size than " + "the vector size."); + return signalPassFailure(); + } + + DenseSet parallelLoops; + f.walk([&parallelLoops](AffineForOp loop) { + if (isLoopParallel(loop)) + parallelLoops.insert(loop); + }); + + // Thread-safe RAII local context, BumpPtrAllocator freed on exit. + NestedPatternContext mlContext; + vectorizeLoops(f, parallelLoops, vectorSizes, fastestVaryingPattern); +} + +/// Verify that affine loops in 'loops' meet the nesting criteria expected by +/// SuperVectorizer: +/// * There must be at least one loop. +/// * There must be a single root loop (nesting level 0). +/// * Each loop at a given nesting level must be nested in a loop from a +/// previous nesting level. +static void +verifyLoopNesting(const std::vector> &loops) { + assert(!loops.empty() && "Expected at least one loop"); + assert(loops[0].size() == 1 && "Expected only one root loop"); + + // Traverse loops outer-to-inner to check some invariants.
+ for (int i = 1, end = loops.size(); i < end; ++i) { + for (AffineForOp loop : loops[i]) { + // Check that each loop at this level is nested in one of the loops from + // the previous level. + bool parentFound = false; + for (AffineForOp maybeParent : loops[i - 1]) { + if (maybeParent.getOperation()->isProperAncestor(loop)) { + parentFound = true; + break; + } + } + assert(parentFound && "Child loop not nested in any parent loop"); + + // Check that each loop at this level is not nested in another loop from + // this level. + for (AffineForOp sibling : loops[i]) + assert(!sibling.getOperation()->isProperAncestor(loop) && + "Loops at the same level are nested"); + } + } +} + +namespace mlir { + +/// External utility to vectorize affine loops in 'loops' using the n-D +/// vectorization factors in 'vectorSizes'. By default, each vectorization +/// factor is applied inner-to-outer to the loops of each loop nest. +/// 'fastestVaryingPattern' can be optionally used to provide a different loop +/// vectorization order. +void vectorizeAffineLoops(Operation *parentOp, DenseSet &loops, + ArrayRef vectorSizes, + ArrayRef fastestVaryingPattern) { + // Thread-safe RAII local context, BumpPtrAllocator freed on exit. + NestedPatternContext mlContext; + vectorizeLoops(parentOp, loops, vectorSizes, fastestVaryingPattern); +} + +/// External utility to vectorize affine loops from a single loop nest using an +/// n-D vectorization strategy (see doc in VectorizationStrategy definition). +/// Loops are provided in a 2D vector container. The first dimension represents +/// the nesting level relative to the loops to be vectorized. The second +/// dimension contains the loops. This means that: +/// a) every loop in 'loops[i]' must have a parent loop in 'loops[i-1]', +/// b) a loop in 'loops[i]' may or may not have a child loop in 'loops[i+1]'. 
+/// +/// For example, for the following loop nest: +/// +/// func @vec2d(%in0: memref<64x128x512xf32>, %in1: memref<64x128x128xf32>, +/// %out0: memref<64x128x512xf32>, +/// %out1: memref<64x128x128xf32>) { +/// affine.for %i0 = 0 to 64 { +/// affine.for %i1 = 0 to 128 { +/// affine.for %i2 = 0 to 512 { +/// %ld = affine.load %in0[%i0, %i1, %i2] : memref<64x128x512xf32> +/// affine.store %ld, %out0[%i0, %i1, %i2] : memref<64x128x512xf32> +/// } +/// affine.for %i3 = 0 to 128 { +/// %ld = affine.load %in1[%i0, %i1, %i3] : memref<64x128x128xf32> +/// affine.store %ld, %out1[%i0, %i1, %i3] : memref<64x128x128xf32> +/// } +/// } +/// } +/// return +/// } +/// +/// loops = {{%i0}, {%i2, %i3}}, to vectorize the outermost and the two +/// innermost loops; +/// loops = {{%i1}, {%i2, %i3}}, to vectorize the middle and the two innermost +/// loops; +/// loops = {{%i2}}, to vectorize only the first innermost loop; +/// loops = {{%i3}}, to vectorize only the second innermost loop; +/// loops = {{%i1}}, to vectorize only the middle loop. +LogicalResult +vectorizeAffineLoopNest(std::vector> &loops, + const VectorizationStrategy &strategy) { + // Thread-safe RAII local context, BumpPtrAllocator freed on exit. + NestedPatternContext mlContext; + verifyLoopNesting(loops); + return vectorizeLoopNest(loops, strategy); +} + +std::unique_ptr> +createSuperVectorizePass(ArrayRef virtualVectorSize) { + return std::make_unique(virtualVectorSize); +} +std::unique_ptr> createSuperVectorizePass() { + return std::make_unique(); +} + } // namespace mlir diff --git a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir --- a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir @@ -1,7 +1,8 @@ // RUN: mlir-opt %s -affine-super-vectorize="virtual-vector-size=128 test-fastest-varying=0" | FileCheck %s // Permutation maps used in vectorization. 
-// CHECK: #[[$map_proj_d0d1_0:map[0-9]+]] = affine_map<(d0, d1) -> (0)> +// CHECK-DAG: #[[$map_proj_d0d1_0:map[0-9]+]] = affine_map<(d0, d1) -> (0)> +// CHECK-DAG: #[[$map_id1:map[0-9]+]] = affine_map<(d0) -> (d0)> #map0 = affine_map<(d0) -> (d0)> #mapadd1 = affine_map<(d0) -> (d0 + 1)> @@ -26,8 +27,8 @@ %P = dim %B, %c2 : memref // CHECK: for {{.*}} step 128 -// CHECK-NEXT: %{{.*}} = affine.apply #map0(%[[C0]]) -// CHECK-NEXT: %{{.*}} = affine.apply #map0(%[[C0]]) +// CHECK-NEXT: %{{.*}} = affine.apply #[[$map_id1]](%[[C0]]) +// CHECK-NEXT: %{{.*}} = affine.apply #[[$map_id1]](%[[C0]]) // CHECK-NEXT: %{{.*}} = constant 0.0{{.*}}: f32 // CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[$map_proj_d0d1_0]]} : memref, vector<128xf32> affine.for %i0 = 0 to %M { // vectorized due to scalar -> vector @@ -331,8 +332,8 @@ // CHECK: affine.for %{{.*}}{{[0-9]*}} = 0 to %{{[0-9]*}} { // CHECK: for [[IV18:%[a-zA-Z0-9]+]] = 0 to [[ARG_M]] step 128 -// CHECK: %{{.*}} = affine.apply #map0(%{{.*}}) -// CHECK: %{{.*}} = affine.apply #map0(%{{.*}}) +// CHECK: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}}) +// CHECK: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}}) // CHECK: %{{.*}} = constant 0.0{{.*}}: f32 // CHECK: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[$map_proj_d0d1_0]]} : memref, vector<128xf32> affine.for %i17 = 0 to %M { // not vectorized, the 1-D pattern that matched %{{.*}} in DFS post-order prevents vectorizing %{{.*}} @@ -360,8 +361,8 @@ // CHECK: affine.for %{{.*}}{{[0-9]*}} = 0 to %{{[0-9]*}} { // CHECK: for [[IV18:%[a-zA-Z0-9]+]] = 0 to [[ARG_M]] step 128 -// CHECK: %{{.*}} = affine.apply #map0(%{{.*}}) -// CHECK-NEXT: %{{.*}} = affine.apply #map0(%{{.*}}) +// CHECK: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}}) +// CHECK-NEXT: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}}) // CHECK-NEXT: %{{.*}} = constant 0.0{{.*}}: f32 // CHECK-NEXT: {{.*}} = vector.transfer_read 
%{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[$map_proj_d0d1_0]]} : memref, vector<128xf32> affine.for %i17 = 0 to %M { // not vectorized, the 1-D pattern that matched %i18 in DFS post-order prevents vectorizing %{{.*}} diff --git a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir --- a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir @@ -124,7 +124,7 @@ } // VECT: affine.for %[[I2:.*]] = #[[$map_id1]](%[[C0]]) to #[[$map_id1]](%[[M]]) step 4 { // VECT-NEXT: affine.for %[[I3:.*]] = #[[$map_id1]](%[[C0]]) to #[[$map_id1]](%[[N]]) step 8 { - // VECT-NEXT: affine.for %[[I4:.*]] = #map5(%[[C0]]) to #[[$map_id1]](%[[K]]) { + // VECT-NEXT: affine.for %[[I4:.*]] = #[[$map_id1]](%[[C0]]) to #[[$map_id1]](%[[K]]) { // VECT: %[[A:.*]] = vector.transfer_read %{{.*}}[%[[I4]], %[[I3]]], %{{.*}} {permutation_map = #[[$map_proj_d0d1_zerod1]]} : memref, vector<4x8xf32> // VECT: %[[B:.*]] = vector.transfer_read %{{.*}}[%[[I2]], %[[I4]]], %{{.*}} {permutation_map = #[[$map_proj_d0d1_d0zero]]} : memref, vector<4x8xf32> // VECT-NEXT: %[[C:.*]] = mulf %[[B]], %[[A]] : vector<4x8xf32>