diff --git a/mlir/include/mlir/Transforms/LoopUtils.h b/mlir/include/mlir/Transforms/LoopUtils.h
--- a/mlir/include/mlir/Transforms/LoopUtils.h
+++ b/mlir/include/mlir/Transforms/LoopUtils.h
@@ -28,6 +28,7 @@
 
 namespace loop {
 class ForOp;
+class ParallelOp;
 } // end namespace loop
 
 /// Unrolls this for operation completely if the trip count is known to be
@@ -225,6 +226,9 @@
 /// independent of any loop induction variable involved in the nest.
 void coalesceLoops(MutableArrayRef<loop::ForOp> loops);
 
+void coalescePLoops(loop::ParallelOp loops,
+                    std::vector<std::vector<unsigned>> combinedDimensions);
+
 /// Maps `forOp` for execution on a parallel grid of virtual `processorIds` of
 /// size given by `numProcessors`. This is achieved by embedding the SSA values
 /// corresponding to `processorIds` and `numProcessors` into the bounds and step
diff --git a/mlir/include/mlir/Transforms/Passes.h b/mlir/include/mlir/Transforms/Passes.h
--- a/mlir/include/mlir/Transforms/Passes.h
+++ b/mlir/include/mlir/Transforms/Passes.h
@@ -91,6 +91,11 @@
 /// bounds into a single loop.
 std::unique_ptr<OpPassBase<FuncOp>> createLoopCoalescingPass();
 
+/// Creates a pass that transforms a single ParallelLoop over N induction
+/// variables into another ParallelLoop over less than N induction variables.
+std::unique_ptr<OpPassBase<FuncOp>> createParallelLoopCoalescingPass();
+
+
 /// Performs packing (or explicit copying) of accessed memref regions into
 /// buffers in the specified faster memory space through either pointwise copies
 /// or DMA operations.
diff --git a/mlir/lib/Transforms/CMakeLists.txt b/mlir/lib/Transforms/CMakeLists.txt
--- a/mlir/lib/Transforms/CMakeLists.txt
+++ b/mlir/lib/Transforms/CMakeLists.txt
@@ -16,6 +16,7 @@
   LoopUnroll.cpp
   MemRefDataFlowOpt.cpp
   OpStats.cpp
+  ParallelLoopCoalescing.cpp
   PipelineDataTransfer.cpp
   SimplifyAffineStructures.cpp
   StripDebugInfo.cpp
diff --git a/mlir/lib/Transforms/ParallelLoopCoalescing.cpp b/mlir/lib/Transforms/ParallelLoopCoalescing.cpp
new file mode 100644
--- /dev/null
+++ b/mlir/lib/Transforms/ParallelLoopCoalescing.cpp
@@ -0,0 +1,49 @@
+//===- ParallelLoopCoalescing.cpp - Pass coalescing parallel loop indices -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/LoopOps/LoopOps.h"
+#include "mlir/Dialect/StandardOps/IR/Ops.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/LoopUtils.h"
+#include "mlir/Transforms/Passes.h"
+#include "mlir/Transforms/RegionUtils.h"
+#include "llvm/Support/Debug.h"
+
+#define PASS_NAME "parallel-loop-coalescing"
+#define DEBUG_TYPE PASS_NAME
+
+using namespace mlir;
+
+namespace {
+class ParallelLoopCoalescingPass
+    : public FunctionPass<ParallelLoopCoalescingPass> {
+public:
+  void runOnFunction() override {
+    FuncOp func = getFunction();
+
+    // The common case for GPU dialect will be simplifying the ParallelOp to 3
+    // arguments, so we do that here to simplify things.
+    func.walk([&](loop::ParallelOp op) {
+      std::vector<std::vector<unsigned>> combinedLoops(3);
+      for (unsigned i = 0; i < op.getNumInductionVars(); i++) {
+        combinedLoops[i % 3].push_back(i);
+      }
+      coalescePLoops(op, combinedLoops);
+      func.dump();
+    });
+  }
+};
+
+} // namespace
+
+std::unique_ptr<OpPassBase<FuncOp>> mlir::createParallelLoopCoalescingPass() {
+  return std::make_unique<ParallelLoopCoalescingPass>();
+}
+
+static PassRegistration<ParallelLoopCoalescingPass>
+    reg(PASS_NAME, "coalesce parallel loops to use less induction variables.");
diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp
--- a/mlir/lib/Transforms/Utils/LoopUtils.cpp
+++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp
@@ -973,69 +973,83 @@
   }
 }
 
-// Transform a loop with a strictly positive step
-//   for %i = %lb to %ub step %s
-// into a 0-based loop with step 1
-//   for %ii = 0 to ceildiv(%ub - %lb, %s) step 1 {
-//     %i = %ii * %s + %lb
-// Insert the induction variable remapping in the body of `inner`, which is
-// expected to be either `loop` or another loop perfectly nested under `loop`.
-// Insert the definition of new bounds immediate before `outer`, which is
-// expected to be either `loop` or its parent in the loop nest.
-static void normalizeLoop(loop::ForOp loop, loop::ForOp outer,
-                          loop::ForOp inner) {
-  OpBuilder builder(outer);
-  Location loc = loop.getLoc();
-
+static std::tuple<Value, Value, Value>
+normalizeLoop(OpBuilder &builder, Block *body, Value lowerBound, Value step,
+              Value upperBound, Value inductionVar) {
+  Value newLowerBound, newUpperBound, newStep;
+  auto loc = builder.getUnknownLoc();
   // Check if the loop is already known to have a constant zero lower bound or
   // a constant one step.
   bool isZeroBased = false;
   if (auto ubCst =
-          dyn_cast_or_null<ConstantIndexOp>(loop.lowerBound().getDefiningOp()))
+          dyn_cast_or_null<ConstantIndexOp>(lowerBound.getDefiningOp()))
     isZeroBased = ubCst.getValue() == 0;
 
   bool isStepOne = false;
-  if (auto stepCst =
-          dyn_cast_or_null<ConstantIndexOp>(loop.step().getDefiningOp()))
+  if (auto stepCst = dyn_cast_or_null<ConstantIndexOp>(step.getDefiningOp()))
     isStepOne = stepCst.getValue() == 1;
 
-  if (isZeroBased && isStepOne)
-    return;
 
   // Compute the number of iterations the loop executes: ceildiv(ub - lb, step)
   // assuming the step is strictly positive.  Update the bounds and the step
   // of the loop to go from 0 to the number of iterations, if necessary.
   // TODO(zinenko): introduce support for negative steps or emit dynamic asserts
   // on step positivity, whatever gets implemented first.
-  Value diff =
-      builder.create<SubIOp>(loc, loop.upperBound(), loop.lowerBound());
-  Value numIterations = ceilDivPositive(builder, loc, diff, loop.step());
-  loop.setUpperBound(numIterations);
-
-  Value lb = loop.lowerBound();
-  if (!isZeroBased) {
-    Value cst0 = builder.create<ConstantIndexOp>(loc, 0);
-    loop.setLowerBound(cst0);
+
+  if (isZeroBased && isStepOne) {
+    return {lowerBound, step, upperBound};
   }
 
-  Value step = loop.step();
-  if (!isStepOne) {
-    Value cst1 = builder.create<ConstantIndexOp>(loc, 1);
-    loop.setStep(cst1);
+  Value diff = builder.create<SubIOp>(loc, upperBound, lowerBound);
+  newUpperBound = ceilDivPositive(builder, loc, diff, step);
+
+  if (isZeroBased) {
+    newLowerBound = lowerBound;
+  } else {
+    newLowerBound = builder.create<ConstantIndexOp>(loc, 0);
+  }
+
+  if (isStepOne) {
+    newStep = step;
+  } else {
+    newStep = builder.create<ConstantIndexOp>(loc, 1);
   }
 
   // Insert code computing the value of the original loop induction variable
   // from the "normalized" one.
-  builder.setInsertionPointToStart(inner.getBody());
-  Value scaled =
-      isStepOne ? loop.getInductionVar()
-                : builder.create<MulIOp>(loc, loop.getInductionVar(), step);
-  Value shifted =
-      isZeroBased ? scaled : builder.create<AddIOp>(loc, scaled, lb);
+  OpBuilder insideBuilder(body, body->begin());
+  Value scaled = isStepOne
+                     ? inductionVar
+                     : insideBuilder.create<MulIOp>(loc, inductionVar, step);
+  Value shifted = isZeroBased
+                      ? scaled
+                      : insideBuilder.create<AddIOp>(loc, scaled, lowerBound);
 
   SmallPtrSet<Operation *, 2> preserve{scaled.getDefiningOp(),
                                        shifted.getDefiningOp()};
-  replaceAllUsesExcept(loop.getInductionVar(), shifted, preserve);
+  replaceAllUsesExcept(inductionVar, shifted, preserve);
+  return {newLowerBound, newStep, newUpperBound};
+}
+
+// Transform a loop with a strictly positive step
+//   for %i = %lb to %ub step %s
+// into a 0-based loop with step 1
+//   for %ii = 0 to ceildiv(%ub - %lb, %s) step 1 {
+//     %i = %ii * %s + %lb
+// Insert the induction variable remapping in the body of `inner`, which is
+// expected to be either `loop` or another loop perfectly nested under `loop`.
+// Insert the definition of new bounds immediate before `outer`, which is
+// expected to be either `loop` or its parent in the loop nest.
+static void normalizeLoop(loop::ForOp loop, loop::ForOp outer,
+                          loop::ForOp inner) {
+  OpBuilder builder(outer);
+  auto loopPieces =
+      normalizeLoop(builder, inner.getBody(), loop.lowerBound(), loop.step(),
+                    loop.upperBound(), loop.getInductionVar());
+
+  loop.setLowerBound(std::get<0>(loopPieces));
+  loop.setStep(std::get<1>(loopPieces));
+  loop.setUpperBound(std::get<2>(loopPieces));
 }
 
 void mlir::coalesceLoops(MutableArrayRef<loop::ForOp> loops) {
@@ -1093,6 +1107,118 @@
   second.erase();
 }
 
+void mlir::coalescePLoops(
+    loop::ParallelOp loops,
+    std::vector<std::vector<unsigned>> combinedDimensions) {
+  OpBuilder outsideBuilder(loops);
+
+  SmallVector<Value, 3> normalizedLowerBounds;
+  SmallVector<Value, 3> normalizedSteps;
+  SmallVector<Value, 3> normalizedUpperBounds;
+  // Normalize ParallelOp's iteration pattern.
+  for (unsigned i = 0; i < loops.getNumLoops(); i++) {
+    auto resultBounds = normalizeLoop(
+        outsideBuilder, loops.getBody(), loops.lowerBound()[i], loops.step()[i],
+        loops.upperBound()[i], loops.getBody()->getArgument(i));
+
+    normalizedLowerBounds.push_back(std::get<0>(resultBounds));
+    normalizedSteps.push_back(std::get<1>(resultBounds));
+    normalizedUpperBounds.push_back(std::get<2>(resultBounds));
+  }
+
+  // Combine iteration spaces
+  SmallVector<Value, 3> lowerBounds;
+  SmallVector<Value, 3> steps;
+  SmallVector<Value, 3> upperBounds;
+  auto cst0 = outsideBuilder.create<ConstantIndexOp>(loops.getLoc(), 0);
+  ;
+  auto cst1 = outsideBuilder.create<ConstantIndexOp>(loops.getLoc(), 1);
+  ;
+  for (unsigned i = 0; i < combinedDimensions.size(); i++) {
+    Value newUpperBound =
+        outsideBuilder.create<ConstantIndexOp>(loops.getLoc(), 1);
+    for (auto idx : combinedDimensions[i]) {
+      newUpperBound = outsideBuilder.create<MulIOp>(
+          loops.getLoc(), newUpperBound, normalizedUpperBounds[idx]);
+    }
+    lowerBounds.push_back(cst0);
+    steps.push_back(cst1);
+    upperBounds.push_back(newUpperBound);
+  }
+  auto newPloop = outsideBuilder.create<loop::ParallelOp>(
+      loops.getLoc(), lowerBounds, upperBounds, steps);
+  OpBuilder insideBuilder(newPloop.getBody(), newPloop.getBody()->begin());
+  // TODO: iterate the other way and mod to the get the original values and
+  // then replace all uses of the original induction vars with the new moded
+  // values of the one induction variable
+  for (unsigned i = 0; i < combinedDimensions.size(); i++) {
+    Value previous = newPloop.getBody()->getArgument(i);
+    for (unsigned idx = 0, e = combinedDimensions[i].size(); idx < e; ++idx) {
+      unsigned ivar_idx = combinedDimensions[i][idx];
+      if (idx != 0)
+        previous = insideBuilder.create<SignedDivIOp>(
+            loops.getLoc(), previous, loops.upperBound()[ivar_idx]);
+
+      Value iv = (idx == e - 1) ? previous
+                                : insideBuilder.create<SignedRemIOp>(
+                                      loops.getLoc(), previous,
+                                      loops.upperBound()[ivar_idx]);
+      replaceAllUsesInRegionWith(loops.getBody()->getArgument(ivar_idx), iv,
+                                 loops.region());
+    }
+  }
+
+  loops.getBody()->back().erase();
+  newPloop.getBody()->getOperations().splice(
+      Block::iterator(newPloop.getBody()->back()),
+      loops.getBody()->getOperations());
+  loops.erase();
+
+  /*
+    // 2. Emit code computing the upper bound of the coalesced loop as product
+    // of the number of iterations of all loops.
+    OpBuilder builder(outermost);
+    Location loc = outermost.getLoc();
+    Value upperBound = outermost.upperBound();
+    for (auto loop : loops.drop_front())
+      upperBound = builder.create<MulIOp>(loc, upperBound, loop.upperBound());
+    outermost.setUpperBound(upperBound);
+
+    builder.setInsertionPointToStart(outermost.getBody());
+
+    // 3. Remap induction variables.  For each original loop, the value of the
+    // induction variable can be obtained by dividing the induction variable of
+    // the linearized loop by the total number of iterations of the loops nested
+    // in it modulo the number of iterations in this loop (remove the values
+    // related to the outer loops):
+    //   iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
+    // Compute these iteratively from the innermost loop by creating a "running
+    // quotient" of division by the range.
+    Value previous = outermost.getInductionVar();
+    for (unsigned i = 0, e = loops.size(); i < e; ++i) {
+      unsigned idx = loops.size() - i - 1;
+      if (i != 0)
+        previous = builder.create<SignedDivIOp>(loc, previous,
+                                                loops[idx + 1].upperBound());
+
+      Value iv = (i == e - 1) ? previous
+                              : builder.create<SignedRemIOp>(
+                                    loc, previous, loops[idx].upperBound());
+      replaceAllUsesInRegionWith(loops[idx].getInductionVar(), iv,
+                                 loops.back().region());
+    }
+
+    // 4. Move the operations from the innermost just above the second-outermost
+    // loop, delete the extra terminator and the second-outermost loop.
+    loop::ForOp second = loops[1];
+    innermost.getBody()->back().erase();
+    outermost.getBody()->getOperations().splice(
+        Block::iterator(second.getOperation()),
+        innermost.getBody()->getOperations());
+    second.erase();
+  */
+}
+
 void mlir::mapLoopToProcessorIds(loop::ForOp forOp, ArrayRef<Value> processorId,
                                  ArrayRef<Value> numProcessors) {
   assert(processorId.size() == numProcessors.size());
diff --git a/mlir/test/Transforms/parallel-loop-coalescing.mlir b/mlir/test/Transforms/parallel-loop-coalescing.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Transforms/parallel-loop-coalescing.mlir
@@ -0,0 +1,25 @@
+// RUN: mlir-opt -parallel-loop-coalescing %s | FileCheck %s
+
+// CHECK-LABEL: @parallel_many_dims
+func @parallel_many_dims() {
+  %c0 = constant 0 : index
+  %c1 = constant 1 : index
+  %c2 = constant 2 : index
+  %c3 = constant 3 : index
+  %c4 = constant 4 : index
+  %c5 = constant 5 : index
+  %c6 = constant 6 : index
+  %c7 = constant 7 : index
+  %c8 = constant 8 : index
+  %c9 = constant 9 : index
+  %c10 = constant 10 : index
+  %c11 = constant 11 : index
+  %c12 = constant 12 : index
+  %c13 = constant 13 : index
+  %c14 = constant 14 : index
+  loop.parallel (%i0, %i1, %i2, %i3, %i4) = (%c0, %c3, %c6, %c9, %c12) to (%c2, %c5, %c8, %c11, %c14)
+                                          step (%c1, %c4, %c7, %c10, %c13) {
+    %result = "magic.op"(%i0, %i1, %i2, %i3, %i4): (index, index, index, index, index) -> index
+  }
+  return
+}