diff --git a/mlir/include/mlir/Dialect/Affine/Passes.h b/mlir/include/mlir/Dialect/Affine/Passes.h
--- a/mlir/include/mlir/Dialect/Affine/Passes.h
+++ b/mlir/include/mlir/Dialect/Affine/Passes.h
@@ -36,6 +36,10 @@
 std::unique_ptr<OperationPass<FuncOp>>
 createAffineLoopInvariantCodeMotionPass();
 
+/// Creates a pass that converts every affine.for detected as parallel into a
+/// 1-d affine.parallel op.
+std::unique_ptr<OperationPass<FuncOp>> createAffineParallelizePass();
+
 /// Performs packing (or explicit copying) of accessed memref regions into
 /// buffers in the specified faster memory space through either pointwise copies
 /// or DMA operations.
diff --git a/mlir/include/mlir/Dialect/Affine/Passes.td b/mlir/include/mlir/Dialect/Affine/Passes.td
--- a/mlir/include/mlir/Dialect/Affine/Passes.td
+++ b/mlir/include/mlir/Dialect/Affine/Passes.td
@@ -112,6 +112,11 @@
   ];
 }
 
+def AffineParallelize : FunctionPass<"affine-parallelize"> {
+  let summary = "Convert parallel affine.for ops into 1-D affine.parallel ops";
+  let constructor = "mlir::createAffineParallelizePass()";
+}
+
 def SimplifyAffineStructures : FunctionPass<"simplify-affine-structures"> {
   let summary = "Simplify affine expressions in maps/sets and normalize "
                 "memrefs";
diff --git a/mlir/include/mlir/Dialect/Affine/Utils.h b/mlir/include/mlir/Dialect/Affine/Utils.h
--- a/mlir/include/mlir/Dialect/Affine/Utils.h
+++ b/mlir/include/mlir/Dialect/Affine/Utils.h
@@ -15,9 +15,16 @@
 
 namespace mlir {
 
+class AffineForOp;
 class AffineIfOp;
+class AffineParallelOp;
 struct LogicalResult;
 
+/// Replaces a parallel affine.for op with a 1-d affine.parallel op.
+/// mlir::isLoopParallel can be used to detect parallel affine.for ops.
+/// No cost model is currently used to drive this parallelization.
+void affineParallelize(AffineForOp forOp);
+
 /// Hoists out affine.if/else to as high as possible, i.e., past all invariant
 /// affine.fors/parallel's. Returns success if any hoisting happened; folded` is
 /// set to true if the op was folded or erased. This hoisting could lead to
diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp
new file mode 100644
--- /dev/null
+++ b/mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp
@@ -0,0 +1,50 @@
+//===- AffineParallelize.cpp - Affine parallelization pass ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a parallelizer for affine loop nests that is able to
+// perform inner or outer loop parallelization.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PassDetail.h"
+#include "mlir/Analysis/AffineStructures.h"
+#include "mlir/Analysis/LoopAnalysis.h"
+#include "mlir/Analysis/Utils.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
+#include "mlir/Dialect/Affine/Passes.h"
+#include "mlir/Dialect/Affine/Passes.h.inc"
+#include "mlir/Dialect/Affine/Utils.h"
+#include "mlir/Transforms/LoopUtils.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "affine-parallel"
+
+using namespace mlir;
+
+namespace {
+/// Rewrites every affine.for found parallel into a 1-D affine.parallel op.
+struct AffineParallelize : AffineParallelizeBase<AffineParallelize> {
+  void runOnFunction() override;
+};
+} // end anonymous namespace
+
+/// Collects the loops isLoopParallel accepts, then converts each of them.
+void AffineParallelize::runOnFunction() {
+  SmallVector<AffineForOp, 8> candidates;
+  getFunction().walk([&candidates](AffineForOp forOp) {
+    if (isLoopParallel(forOp))
+      candidates.push_back(forOp);
+  });
+  for (AffineForOp forOp : candidates)
+    affineParallelize(forOp);
+}
+
+std::unique_ptr<OperationPass<FuncOp>> mlir::createAffineParallelizePass() {
+  return std::make_unique<AffineParallelize>();
+}
diff --git a/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt
--- a/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_mlir_dialect_library(MLIRAffineTransforms
   AffineDataCopyGeneration.cpp
   AffineLoopInvariantCodeMotion.cpp
+  AffineParallelize.cpp
   LoopTiling.cpp
   LoopUnroll.cpp
   LoopUnrollAndJam.cpp
@@ -17,6 +18,7 @@
 
   LINK_LIBS PUBLIC
   MLIRAffineOps
+  MLIRAffineUtils
   MLIREDSC
   MLIRIR
   MLIRPass
diff --git a/mlir/lib/Dialect/Affine/Utils/Utils.cpp b/mlir/lib/Dialect/Affine/Utils/Utils.cpp
--- a/mlir/lib/Dialect/Affine/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/Utils.cpp
@@ -129,6 +129,20 @@
   return hoistedIfOp;
 }
 
+/// Replaces `forOp` with a 1-d affine.parallel op that has the same bounds and
+/// step; the loop body is moved (not cloned) into the new op.
+void mlir::affineParallelize(AffineForOp forOp) {
+  int64_t step = forOp.getStep(); // Forwarded so a non-unit stride survives.
+  OpBuilder outsideBuilder(forOp);
+  AffineParallelOp newPloop = outsideBuilder.create<AffineParallelOp>(
+      forOp.getLoc(), forOp.getLowerBoundMap(), forOp.getLowerBoundOperands(),
+      forOp.getUpperBoundMap(), forOp.getUpperBoundOperands(),
+      llvm::makeArrayRef(step));
+  // Steal the body of the old affine.for op and erase it.
+  newPloop.region().takeBody(forOp.region());
+  forOp.erase();
+}
+
 // Returns success if any hoisting happened.
 LogicalResult mlir::hoistAffineIfOp(AffineIfOp ifOp, bool *folded) {
   // Apply canonicalization patterns and folding - this is necessary for the
diff --git a/mlir/test/Dialect/Affine/parallelism-detection.mlir b/mlir/test/Dialect/Affine/parallelism-detection.mlir
deleted file mode 100644
--- a/mlir/test/Dialect/Affine/parallelism-detection.mlir
+++ /dev/null
@@ -1,47 +0,0 @@
-// RUN: mlir-opt -allow-unregistered-dialect %s -test-detect-parallel -split-input-file -verify-diagnostics | FileCheck %s
-
-// CHECK-LABEL: func @loop_nest_3d_outer_two_parallel
-func @loop_nest_3d_outer_two_parallel(%N : index) {
-  %0 = alloc() : memref<1024 x 1024 x vector<64xf32>>
-  %1 = alloc() : memref<1024 x 1024 x vector<64xf32>>
-  %2 = alloc() : memref<1024 x 1024 x vector<64xf32>>
-  affine.for %i = 0 to %N {
-    // expected-remark@-1 {{parallel loop}}
-    affine.for %j = 0 to %N {
-      // expected-remark@-1 {{parallel loop}}
-      affine.for %k = 0 to %N {
-        // expected-remark@-1 {{sequential loop}}
-        %5 = affine.load %0[%i, %k] : memref<1024x1024xvector<64xf32>>
-        %6 = affine.load %1[%k, %j] : memref<1024x1024xvector<64xf32>>
-        %7 = affine.load %2[%i, %j] : memref<1024x1024xvector<64xf32>>
-        %8 = mulf %5, %6 : vector<64xf32>
-        %9 = addf %7, %8 : vector<64xf32>
-        affine.store %9, %2[%i, %j] : memref<1024x1024xvector<64xf32>>
-      }
-    }
-  }
-  return
-}
-
-// -----
-
-// CHECK-LABEL: unknown_op_conservative
-func @unknown_op_conservative() {
-  affine.for %i = 0 to 10 {
-    // expected-remark@-1 {{sequential loop}}
-    "unknown"() : () -> ()
-  }
-  return
-}
-
-// -----
-
-// CHECK-LABEL: non_affine_load
-func @non_affine_load() {
-  %0 = alloc() : memref<100 x f32>
-  affine.for %i = 0 to 100 {
-    // expected-remark@-1 {{sequential loop}}
-    load %0[%i] : memref<100 x f32>
-  }
-  return
-}
diff --git a/mlir/test/Dialect/Affine/parallelize.mlir b/mlir/test/Dialect/Affine/parallelize.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Dialect/Affine/parallelize.mlir
@@ -0,0 +1,118 @@
+// RUN: mlir-opt %s -allow-unregistered-dialect -affine-parallelize | FileCheck %s
+
+// For multiple nested for-loops.
+// CHECK-DAG: [[MAP5:#map[0-9]+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0 + d1, d2 * 2 + d3, d4 * 2 + d5, d6 + d7)>
+// CHECK-LABEL:    func @reduce_window_max() {
+func @reduce_window_max() {
+  %cst = constant 0.000000e+00 : f32
+  %0 = alloc() : memref<1x8x8x64xf32>
+  %1 = alloc() : memref<1x18x18x64xf32>
+  affine.for %arg0 = 0 to 1 {
+    affine.for %arg1 = 0 to 8 {
+      affine.for %arg2 = 0 to 8 {
+        affine.for %arg3 = 0 to 64 {
+          affine.store %cst, %0[%arg0, %arg1, %arg2, %arg3] : memref<1x8x8x64xf32>
+        }
+      }
+    }
+  }
+  affine.for %arg0 = 0 to 1 {
+    affine.for %arg1 = 0 to 8 {
+      affine.for %arg2 = 0 to 8 {
+        affine.for %arg3 = 0 to 64 {
+          affine.for %arg4 = 0 to 1 {
+            affine.for %arg5 = 0 to 3 {
+              affine.for %arg6 = 0 to 3 {
+                affine.for %arg7 = 0 to 1 {
+                  %2 = affine.load %0[%arg0, %arg1, %arg2, %arg3] : memref<1x8x8x64xf32>
+                  %3 = affine.load %1[%arg0 + %arg4, %arg1 * 2 + %arg5, %arg2 * 2 + %arg6, %arg3 + %arg7] : memref<1x18x18x64xf32>
+                  %4 = cmpf "ogt", %2, %3 : f32
+                  %5 = select %4, %2, %3 : f32
+                  affine.store %5, %0[%arg0, %arg1, %arg2, %arg3] : memref<1x8x8x64xf32>
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  return
+}
+
+// CHECK:        %[[cst:.*]] = constant 0.000000e+00 : f32
+// CHECK:        %[[v0:.*]] = alloc() : memref<1x8x8x64xf32>
+// CHECK:        %[[v1:.*]] = alloc() : memref<1x18x18x64xf32>
+// CHECK:        affine.parallel (%[[arg0:.*]]) = (0) to (1) {
+// CHECK:          affine.parallel (%[[arg1:.*]]) = (0) to (8) {
+// CHECK:            affine.parallel (%[[arg2:.*]]) = (0) to (8) {
+// CHECK:              affine.parallel (%[[arg3:.*]]) = (0) to (64) {
+// CHECK:                affine.store %[[cst]], %[[v0]][%[[arg0]], %[[arg1]], %[[arg2]], %[[arg3]]] : memref<1x8x8x64xf32>
+// CHECK:              }
+// CHECK:            }
+// CHECK:          }
+// CHECK:        }
+// CHECK:        affine.parallel (%[[a0:.*]]) = (0) to (1) {
+// CHECK:          affine.parallel (%[[a1:.*]]) = (0) to (8) {
+// CHECK:            affine.parallel (%[[a2:.*]]) = (0) to (8) {
+// CHECK:              affine.parallel (%[[a3:.*]]) = (0) to (64) {
+// CHECK:                affine.parallel (%[[a4:.*]]) = (0) to (1) {
+// CHECK:                  affine.for %[[a5:.*]] = 0 to 3 {
+// CHECK:                    affine.for %[[a6:.*]] = 0 to 3 {
+// CHECK:                      affine.parallel (%[[a7:.*]]) = (0) to (1) {
+// CHECK:                        %[[lhs:.*]] = affine.load %[[v0]][%[[a0]], %[[a1]], %[[a2]], %[[a3]]] : memref<1x8x8x64xf32>
+// CHECK:                        %[[rhs:.*]] = affine.load %[[v1]][%[[a0]] + %[[a4]], %[[a1]] * 2 + %[[a5]], %[[a2]] * 2 + %[[a6]], %[[a3]] + %[[a7]]] : memref<1x18x18x64xf32>
+// CHECK:                        %[[res:.*]] = cmpf "ogt", %[[lhs]], %[[rhs]] : f32
+// CHECK:                        %[[sel:.*]] = select %[[res]], %[[lhs]], %[[rhs]] : f32
+// CHECK:                        affine.store %[[sel]], %[[v0]][%[[a0]], %[[a1]], %[[a2]], %[[a3]]] : memref<1x8x8x64xf32>
+// CHECK:                      }
+// CHECK:                    }
+// CHECK:                  }
+// CHECK:                }
+// CHECK:              }
+// CHECK:            }
+// CHECK:          }
+// CHECK:        }
+// CHECK:      }
+
+func @loop_nest_3d_outer_two_parallel(%N : index) {
+  %0 = alloc() : memref<1024 x 1024 x vector<64xf32>>
+  %1 = alloc() : memref<1024 x 1024 x vector<64xf32>>
+  %2 = alloc() : memref<1024 x 1024 x vector<64xf32>>
+  affine.for %i = 0 to %N {
+    affine.for %j = 0 to %N {
+      %7 = affine.load %2[%i, %j] : memref<1024x1024xvector<64xf32>>
+      affine.for %k = 0 to %N {
+        %5 = affine.load %0[%i, %k] : memref<1024x1024xvector<64xf32>>
+        %6 = affine.load %1[%k, %j] : memref<1024x1024xvector<64xf32>>
+        %8 = mulf %5, %6 : vector<64xf32>
+        %9 = addf %7, %8 : vector<64xf32>
+        affine.store %9, %2[%i, %j] : memref<1024x1024xvector<64xf32>>
+      }
+    }
+  }
+  return
+}
+
+// CHECK:      affine.parallel (%[[arg1:.*]]) = (0) to (symbol(%arg0)) {
+// CHECK-NEXT:        affine.parallel (%[[arg2:.*]]) = (0) to (symbol(%arg0)) {
+// CHECK:          affine.for %[[arg3:.*]] = 0 to %arg0 {
+
+// CHECK-LABEL: unknown_op_conservative
+func @unknown_op_conservative() {
+  affine.for %i = 0 to 10 {
+// CHECK:  affine.for %[[arg1:.*]] = 0 to 10 {
+    "unknown"() : () -> ()
+  }
+  return
+}
+
+// CHECK-LABEL: non_affine_load
+func @non_affine_load() {
+  %0 = alloc() : memref<100 x f32>
+  affine.for %i = 0 to 100 {
+// CHECK:  affine.for %{{.*}} = 0 to 100 {
+    load %0[%i] : memref<100 x f32>
+  }
+  return
+}
diff --git a/mlir/test/lib/Dialect/Affine/CMakeLists.txt b/mlir/test/lib/Dialect/Affine/CMakeLists.txt
--- a/mlir/test/lib/Dialect/Affine/CMakeLists.txt
+++ b/mlir/test/lib/Dialect/Affine/CMakeLists.txt
@@ -3,7 +3,6 @@
   TestAffineDataCopy.cpp
   TestAffineLoopUnswitching.cpp
   TestLoopPermutation.cpp
-  TestParallelismDetection.cpp
   TestVectorizationUtils.cpp
 
   EXCLUDE_FROM_LIBMLIR
diff --git a/mlir/test/lib/Dialect/Affine/TestParallelismDetection.cpp b/mlir/test/lib/Dialect/Affine/TestParallelismDetection.cpp
deleted file mode 100644
--- a/mlir/test/lib/Dialect/Affine/TestParallelismDetection.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-//===- ParallelismDetection.cpp - Parallelism Detection pass ------------*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a pass to detect parallel affine 'affine.for' ops.
-//
-//===----------------------------------------------------------------------===//
-
-#include "mlir/Analysis/Utils.h"
-#include "mlir/Dialect/Affine/IR/AffineOps.h"
-#include "mlir/IR/Builders.h"
-#include "mlir/Pass/Pass.h"
-
-using namespace mlir;
-
-namespace {
-
-struct TestParallelismDetection
-    : public PassWrapper<TestParallelismDetection, FunctionPass> {
-  void runOnFunction() override;
-};
-
-} // end anonymous namespace
-
-// Walks the function and emits a note for all 'affine.for' ops detected as
-// parallel.
-void TestParallelismDetection::runOnFunction() {
-  FuncOp f = getFunction();
-  OpBuilder b(f.getBody());
-  f.walk([&](AffineForOp forOp) {
-    if (isLoopParallel(forOp))
-      forOp.emitRemark("parallel loop");
-    else
-      forOp.emitRemark("sequential loop");
-  });
-}
-
-namespace mlir {
-void registerTestParallelismDetection() {
-  PassRegistration<TestParallelismDetection> pass(
-      "test-detect-parallel", "Test parallelism detection ");
-}
-} // namespace mlir
diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp
--- a/mlir/tools/mlir-opt/mlir-opt.cpp
+++ b/mlir/tools/mlir-opt/mlir-opt.cpp
@@ -62,7 +62,6 @@
 void registerTestMemRefDependenceCheck();
 void registerTestMemRefStrideCalculation();
 void registerTestOpaqueLoc();
-void registerTestParallelismDetection();
 void registerTestPreparationPassWithAllowedMemrefResults();
 void registerTestGpuParallelLoopMappingPass();
 void registerTestSCFUtilsPass();
@@ -137,7 +136,6 @@
   registerTestMemRefDependenceCheck();
   registerTestMemRefStrideCalculation();
   registerTestOpaqueLoc();
-  registerTestParallelismDetection();
   registerTestPreparationPassWithAllowedMemrefResults();
   registerTestGpuParallelLoopMappingPass();
   registerTestSCFUtilsPass();