diff --git a/mlir/include/mlir/Dialect/Affine/Utils.h b/mlir/include/mlir/Dialect/Affine/Utils.h
--- a/mlir/include/mlir/Dialect/Affine/Utils.h
+++ b/mlir/include/mlir/Dialect/Affine/Utils.h
@@ -14,6 +14,8 @@
 #define MLIR_DIALECT_AFFINE_UTILS_H
 
 #include "mlir/Support/LLVM.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
 
 namespace mlir {
 
@@ -34,6 +36,17 @@
 /// significant code expansion in some cases.
 LogicalResult hoistAffineIfOp(AffineIfOp ifOp, bool *folded = nullptr);
 
+/// Holds parameters to perform n-D vectorization on a loop nest.
+struct VectorizationStrategy {
+  // Vectorization factors to apply to each target vector dimension.
+  // Each factor will be applied to a different loop.
+  SmallVector<int64_t, 8> vectorSizes;
+  // Maps each AffineForOp vectorization candidate with its vector dimension.
+  // The candidate will be vectorize using the vectorization factor in
+  // 'vectorSizes' for that dimension.
+  DenseMap<Operation *, unsigned> loopToVectorDim;
+};
+
 /// Vectorizes affine loops in 'loops' using the n-D vectorization factors in
 /// 'vectorSizes'. By default, each vectorization factor is applied
 /// inner-to-outer to the loops of each loop nest. 'fastestVaryingPattern' can
@@ -43,6 +56,11 @@
     llvm::DenseSet<Operation *, DenseMapInfo<Operation *>> &loops,
     ArrayRef<int64_t> vectorSizes, ArrayRef<int64_t> fastestVaryingPattern);
 
+/// Vectorizes affine loops from a single loop nest using the n-D vectorization
+/// factors 'strategy'. Nest's loops must be provided in DFS pre-order.
+LogicalResult vectorizeAffineLoopNest(ArrayRef<AffineForOp> loops,
+                                      const VectorizationStrategy &strategy);
+
 } // namespace mlir
 
 #endif // MLIR_DIALECT_AFFINE_UTILS_H
diff --git a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
--- a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
@@ -583,17 +583,6 @@
   vectorSizes = virtualVectorSize;
 }
 
-/////// TODO: Hoist to a VectorizationStrategy.cpp when appropriate.
-/////////
-namespace {
-
-struct VectorizationStrategy {
-  SmallVector<int64_t, 8> vectorSizes;
-  DenseMap<Operation *, unsigned> loopToVectorDim;
-};
-
-} // end anonymous namespace
-
 static void vectorizeLoopIfProfitable(Operation *loop, unsigned depthInPattern,
                                       unsigned patternDepth,
                                       VectorizationStrategy *strategy) {
@@ -856,44 +845,40 @@
   };
 }
 
-/// Apply vectorization of `loop` according to `state`. This is only triggered
-/// if all vectorizations in `childrenMatches` have already succeeded
-/// recursively in DFS post-order.
-static LogicalResult
-vectorizeLoopsAndLoadsRecursively(NestedMatch oneMatch,
-                                  VectorizationState *state) {
-  auto *loopInst = oneMatch.getMatchedOperation();
-  auto loop = cast<AffineForOp>(loopInst);
-  auto childrenMatches = oneMatch.getMatchedChildren();
-
-  // 1. DFS postorder recursion, if any of my children fails, I fail too.
-  for (auto m : childrenMatches) {
-    if (failed(vectorizeLoopsAndLoadsRecursively(m, state))) {
-      return failure();
+/// Apply vectorization of `loop` according to `state`. `loops` must be
+/// provided in a DFS pre-order and are processed in post-order to ensure that
+/// all the children loops have already been vectorized before vectorizing the
+/// parent loop.
+static LogicalResult vectorizeLoopsAndLoads(ArrayRef<AffineForOp> loops,
+                                            VectorizationState *state) {
+  // Vectorize loops in post-order. If any children fails, the parent will fail
+  // too.
+  for (auto loop : llvm::reverse(loops)) {
+    // 1. This loop may have been omitted from vectorization for various reasons
+    // (e.g. due to the performance model or pattern depth > vector size).
+    auto it = state->strategy->loopToVectorDim.find(loop.getOperation());
+    if (it == state->strategy->loopToVectorDim.end()) {
+      continue;
     }
-  }
 
-  // 2. This loop may have been omitted from vectorization for various reasons
-  // (e.g. due to the performance model or pattern depth > vector size).
-  auto it = state->strategy->loopToVectorDim.find(loopInst);
-  if (it == state->strategy->loopToVectorDim.end()) {
-    return success();
+    // 2. Actual post-order transformation.
+    auto vectorDim = it->second;
+    assert(vectorDim < state->strategy->vectorSizes.size() &&
+           "vector dim overflow");
+    //   a. get actual vector size
+    auto vectorSize = state->strategy->vectorSizes[vectorDim];
+    //   b. loop transformation for early vectorization is still subject to
+    //     exploratory tradeoffs (see top of the file). Apply coarsening, i.e.:
+    //        | ub -> ub
+    //        | step -> step * vectorSize
+    LLVM_DEBUG(dbgs() << "\n[early-vect] vectorizeForOp by " << vectorSize
+                      << " : \n"
+                      << loop);
+    if (failed(vectorizeAffineForOp(loop, loop.getStep() * vectorSize, state)))
+      return failure();
   }
 
-  // 3. Actual post-order transformation.
-  auto vectorDim = it->second;
-  assert(vectorDim < state->strategy->vectorSizes.size() &&
-         "vector dim overflow");
-  //   a. get actual vector size
-  auto vectorSize = state->strategy->vectorSizes[vectorDim];
-  //   b. loop transformation for early vectorization is still subject to
-  //     exploratory tradeoffs (see top of the file). Apply coarsening, i.e.:
-  //        | ub -> ub
-  //        | step -> step * vectorSize
-  LLVM_DEBUG(dbgs() << "\n[early-vect] vectorizeForOp by " << vectorSize
-                    << " : ");
-  LLVM_DEBUG(loopInst->print(dbgs()));
-  return vectorizeAffineForOp(loop, loop.getStep() * vectorSize, state);
+  return success();
 }
 
 /// Tries to transform a scalar constant into a vector splat of that constant.
@@ -1104,16 +1089,24 @@
   return success();
 }
 
-/// Vectorization is a recursive procedure where anything below can fail.
-/// The root match thus needs to maintain a clone for handling failure.
-/// Each root may succeed independently but will otherwise clean after itself if
-/// anything below it fails.
-static LogicalResult vectorizeRootMatch(NestedMatch m,
-                                        VectorizationStrategy *strategy) {
-  auto loop = cast<AffineForOp>(m.getMatchedOperation());
-  OperationFolder folder(loop.getContext());
+/// Returns in 'loops' a DFS pre-order of AffineForOp's in 'match'.
+static void getMatchedAffineLoops(NestedMatch match,
+                                  SmallVectorImpl<AffineForOp> &loops) {
+  loops.push_back(cast<AffineForOp>(match.getMatchedOperation()));
+  for (auto childMatch : match.getMatchedChildren()) {
+    getMatchedAffineLoops(childMatch, loops);
+  }
+}
+
+/// Internal implementation to vectorize affine loops from a single loop nest
+/// using the n-D vectorization factors in 'strategy'. Nest's loops must be
+/// provided in DFS pre-order.
+static LogicalResult vectorizeLoopNest(ArrayRef<AffineForOp> loops,
+                                       const VectorizationStrategy &strategy) {
+  AffineForOp rootLoop = loops.front();
+  OperationFolder folder(rootLoop.getContext());
   VectorizationState state;
-  state.strategy = strategy;
+  state.strategy = &strategy;
   state.folder = &folder;
 
   // Since patterns are recursive, they can very well intersect.
@@ -1123,7 +1116,7 @@
   // vectorizable. If a pattern is not vectorizable anymore, we just skip it.
   // TODO: implement a non-greedy profitability analysis that keeps only
   // non-intersecting patterns.
-  if (!isVectorizableLoopBody(loop, vectorTransferPattern())) {
+  if (!isVectorizableLoopBody(rootLoop, vectorTransferPattern())) {
     LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ loop is not vectorizable");
     return failure();
   }
@@ -1131,7 +1124,7 @@
   /// Sets up error handling for this root loop. This is how the root match
   /// maintains a clone for handling failure and restores the proper state via
   /// RAII.
-  auto *loopInst = loop.getOperation();
+  auto *loopInst = rootLoop.getOperation();
   OpBuilder builder(loopInst);
   auto clonedLoop = cast<AffineForOp>(builder.clone(*loopInst));
   struct Guard {
@@ -1146,17 +1139,17 @@
     }
     AffineForOp loop;
     AffineForOp clonedLoop;
-  } guard{loop, clonedLoop};
+  } guard{rootLoop, clonedLoop};
 
   //////////////////////////////////////////////////////////////////////////////
   // Start vectorizing.
   // From now on, any error triggers the scope guard above.
   //////////////////////////////////////////////////////////////////////////////
-  // 1. Vectorize all the loops matched by the pattern, recursively.
+  // 1. Vectorize all the loop candidates, in post-order.
   // This also vectorizes the roots (AffineLoadOp) as well as registers the
   // terminals (AffineStoreOp) for post-processing vectorization (we need to
   // wait for all use-def chains into them to be vectorized first).
-  if (failed(vectorizeLoopsAndLoadsRecursively(m, &state))) {
+  if (failed(vectorizeLoopsAndLoads(loops, &state))) {
     LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ failed root vectorizeLoop");
     return guard.failure();
   }
@@ -1188,38 +1181,25 @@
   return guard.success();
 }
 
-/// Applies vectorization to the current Function by searching over a bunch of
-/// predetermined patterns.
-void Vectorize::runOnFunction() {
-  FuncOp f = getFunction();
-  if (!fastestVaryingPattern.empty() &&
-      fastestVaryingPattern.size() != vectorSizes.size()) {
-    f.emitRemark("Fastest varying pattern specified with different size than "
-                 "the vector size.");
-    return signalPassFailure();
-  }
-
-  DenseSet<Operation *> parallelLoops;
-  f.walk([&parallelLoops](AffineForOp loop) {
-    if (isLoopParallel(loop))
-      parallelLoops.insert(loop);
-  });
-
-  vectorizeAffineLoops(f, parallelLoops, vectorSizes, fastestVaryingPattern);
+/// Vectorization is a recursive procedure where anything below can fail.
+/// The root match thus needs to maintain a clone for handling failure.
+/// Each root may succeed independently but will otherwise clean after itself if
+/// anything below it fails.
+static LogicalResult vectorizeRootMatch(NestedMatch m,
+                                        const VectorizationStrategy &strategy) {
+  SmallVector<AffineForOp, 4> loopsToVectorize;
+  getMatchedAffineLoops(m, loopsToVectorize);
+  return vectorizeLoopNest(loopsToVectorize, strategy);
 }
 
-namespace mlir {
-
-/// Vectorizes affine loops in 'loops' using the n-D vectorization factors in
-/// 'vectorSizes'. By default, each vectorization factor is applied
-/// inner-to-outer to the loops of each loop nest. 'fastestVaryingPattern' can
-/// be optionally used to provide a different loop vectorization order.
-void vectorizeAffineLoops(Operation *parentOp, DenseSet<Operation *> &loops,
-                          ArrayRef<int64_t> vectorSizes,
-                          ArrayRef<int64_t> fastestVaryingPattern) {
-  // Thread-safe RAII local context, BumpPtrAllocator freed on exit.
-  NestedPatternContext mlContext;
-
+/// Internal implementation to vectorize affine loops in 'loops' using the n-D
+/// vectorization factors in 'vectorSizes'. By default, each vectorization
+/// factor is applied inner-to-outer to the loops of each loop nest.
+/// 'fastestVaryingPattern' can be optionally used to provide a different loop
+/// vectorization order.
+static void vectorizeLoops(Operation *parentOp, DenseSet<Operation *> &loops,
+                           ArrayRef<int64_t> vectorSizes,
+                           ArrayRef<int64_t> fastestVaryingPattern) {
   for (auto &pat :
        makePatterns(loops, vectorSizes.size(), fastestVaryingPattern)) {
     LLVM_DEBUG(dbgs() << "\n******************************************");
@@ -1245,7 +1225,7 @@
                                 &strategy);
       // TODO: if pattern does not apply, report it; alter the
       // cost/benefit.
-      vectorizeRootMatch(m, &strategy);
+      vectorizeRootMatch(m, strategy);
       // TODO: some diagnostics if failure to vectorize occurs.
     }
   }
@@ -1260,4 +1240,77 @@
   return std::make_unique<Vectorize>();
 }
 
+/// Applies vectorization to the current Function by searching over a bunch of
+/// predetermined patterns.
+void Vectorize::runOnFunction() {
+  FuncOp f = getFunction();
+  if (!fastestVaryingPattern.empty() &&
+      fastestVaryingPattern.size() != vectorSizes.size()) {
+    f.emitRemark("Fastest varying pattern specified with different size than "
+                 "the vector size.");
+    return signalPassFailure();
+  }
+
+  DenseSet<Operation *> parallelLoops;
+  f.walk([&parallelLoops](AffineForOp loop) {
+    if (isLoopParallel(loop))
+      parallelLoops.insert(loop);
+  });
+
+  // Thread-safe RAII local context, BumpPtrAllocator freed on exit.
+  NestedPatternContext mlContext;
+  vectorizeLoops(f, parallelLoops, vectorSizes, fastestVaryingPattern);
+}
+
+/// Verify pre-order of affine loops in 'loops'.
+static void verifyLoopNestPreOrder(ArrayRef<AffineForOp> loops) {
+  DenseSet<Operation *> visited;
+  visited.insert(loops.front());
+
+  for (auto loop : loops) {
+    if (visited.count(loop))
+      // Skip root loop.
+      continue;
+
+    // Parent loop must have been already visited if they are in pre-order.
+    auto parentLoop = loop.getParentOfType<AffineForOp>();
+    assert(!parentLoop && "Unexpected root loop");
+    assert(!visited.count(parentLoop) && "Loops are not in pre-order.");
+  }
+}
+
+namespace mlir {
+
+/// External utility to vectorize affine loops in 'loops' using the n-D
+/// vectorization factors in 'vectorSizes'. By default, each vectorization
+/// factor is applied inner-to-outer to the loops of each loop nest.
+/// 'fastestVaryingPattern' can be optionally used to provide a different loop
+/// vectorization order.
+void vectorizeAffineLoops(Operation *parentOp, DenseSet<Operation *> &loops,
+                          ArrayRef<int64_t> vectorSizes,
+                          ArrayRef<int64_t> fastestVaryingPattern) {
+  // Thread-safe RAII local context, BumpPtrAllocator freed on exit.
+  NestedPatternContext mlContext;
+  vectorizeLoops(parentOp, loops, vectorSizes, fastestVaryingPattern);
+}
+
+/// External utility to vectorize affine loops from a single loop nest using the
+/// n-D vectorization factors encoded in 'strategy'. Nest's loops must be
+/// provided in DFS pre-order.
+LogicalResult vectorizeAffineLoopNest(ArrayRef<AffineForOp> loops,
+                                      const VectorizationStrategy &strategy) {
+  // Thread-safe RAII local context, BumpPtrAllocator freed on exit.
+  NestedPatternContext mlContext;
+  verifyLoopNestPreOrder(loops);
+  return vectorizeLoopNest(loops, strategy);
+}
+
+std::unique_ptr<OperationPass<FuncOp>>
+createSuperVectorizePass(ArrayRef<int64_t> virtualVectorSize) {
+  return std::make_unique<Vectorize>(virtualVectorSize);
+}
+std::unique_ptr<OperationPass<FuncOp>> createSuperVectorizePass() {
+  return std::make_unique<Vectorize>();
+}
+
 } // namespace mlir
diff --git a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir
--- a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir
+++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir
@@ -1,7 +1,8 @@
 // RUN: mlir-opt %s -affine-super-vectorize="virtual-vector-size=128 test-fastest-varying=0" | FileCheck %s
 
 // Permutation maps used in vectorization.
-// CHECK: #[[$map_proj_d0d1_0:map[0-9]+]] = affine_map<(d0, d1) -> (0)>
+// CHECK-DAG: #[[$map_proj_d0d1_0:map[0-9]+]] = affine_map<(d0, d1) -> (0)>
+// CHECK-DAG: #[[$map_id1:map[0-9]+]] = affine_map<(d0) -> (d0)>
 
 #map0 = affine_map<(d0) -> (d0)>
 #mapadd1 = affine_map<(d0) -> (d0 + 1)>
@@ -26,8 +27,8 @@
    %P = dim %B, %c2 : memref<?x?x?xf32>
 
 // CHECK: for {{.*}} step 128
-// CHECK-NEXT: %{{.*}} = affine.apply #map0(%[[C0]])
-// CHECK-NEXT: %{{.*}} = affine.apply #map0(%[[C0]])
+// CHECK-NEXT: %{{.*}} = affine.apply #[[$map_id1]](%[[C0]])
+// CHECK-NEXT: %{{.*}} = affine.apply #[[$map_id1]](%[[C0]])
 // CHECK-NEXT: %{{.*}} = constant 0.0{{.*}}: f32
 // CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[$map_proj_d0d1_0]]} : memref<?x?xf32>, vector<128xf32>
    affine.for %i0 = 0 to %M { // vectorized due to scalar -> vector
@@ -331,8 +332,8 @@
 
 // CHECK: affine.for %{{.*}}{{[0-9]*}} = 0 to %{{[0-9]*}} {
 // CHECK:   for [[IV18:%[a-zA-Z0-9]+]] = 0 to [[ARG_M]] step 128
-// CHECK:     %{{.*}} = affine.apply #map0(%{{.*}})
-// CHECK:     %{{.*}} = affine.apply #map0(%{{.*}})
+// CHECK:     %{{.*}} = affine.apply #[[$map_id1]](%{{.*}})
+// CHECK:     %{{.*}} = affine.apply #[[$map_id1]](%{{.*}})
 // CHECK:     %{{.*}} = constant 0.0{{.*}}: f32
 // CHECK:     {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[$map_proj_d0d1_0]]} : memref<?x?xf32>, vector<128xf32>
    affine.for %i17 = 0 to %M { // not vectorized, the 1-D pattern that matched %{{.*}} in DFS post-order prevents vectorizing %{{.*}}
@@ -360,8 +361,8 @@
 
 // CHECK: affine.for %{{.*}}{{[0-9]*}} = 0 to %{{[0-9]*}} {
 // CHECK:   for [[IV18:%[a-zA-Z0-9]+]] = 0 to [[ARG_M]] step 128
-// CHECK:      %{{.*}} = affine.apply #map0(%{{.*}})
-// CHECK-NEXT: %{{.*}} = affine.apply #map0(%{{.*}})
+// CHECK:      %{{.*}} = affine.apply #[[$map_id1]](%{{.*}})
+// CHECK-NEXT: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}})
 // CHECK-NEXT: %{{.*}} = constant 0.0{{.*}}: f32
 // CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[$map_proj_d0d1_0]]} : memref<?x?xf32>, vector<128xf32>
    affine.for %i17 = 0 to %M { // not vectorized, the 1-D pattern that matched %i18 in DFS post-order prevents vectorizing %{{.*}}
diff --git a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir
--- a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir
+++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir
@@ -124,7 +124,7 @@
   }
   //      VECT:  affine.for %[[I2:.*]] = #[[$map_id1]](%[[C0]]) to #[[$map_id1]](%[[M]]) step 4 {
   // VECT-NEXT:    affine.for %[[I3:.*]] = #[[$map_id1]](%[[C0]]) to #[[$map_id1]](%[[N]]) step 8 {
-  // VECT-NEXT:      affine.for %[[I4:.*]] = #map5(%[[C0]]) to #[[$map_id1]](%[[K]]) {
+  // VECT-NEXT:      affine.for %[[I4:.*]] = #{{map[0-9]+}}(%[[C0]]) to #[[$map_id1]](%[[K]]) {
   //      VECT:        %[[A:.*]] = vector.transfer_read %{{.*}}[%[[I4]], %[[I3]]], %{{.*}} {permutation_map = #[[$map_proj_d0d1_zerod1]]} : memref<?x?xf32>, vector<4x8xf32>
   //      VECT:        %[[B:.*]] = vector.transfer_read %{{.*}}[%[[I2]], %[[I4]]], %{{.*}} {permutation_map = #[[$map_proj_d0d1_d0zero]]} : memref<?x?xf32>, vector<4x8xf32>
   // VECT-NEXT:        %[[C:.*]] = mulf %[[B]], %[[A]] : vector<4x8xf32>