diff --git a/mlir/include/mlir/InitAllPasses.h b/mlir/include/mlir/InitAllPasses.h
--- a/mlir/include/mlir/InitAllPasses.h
+++ b/mlir/include/mlir/InitAllPasses.h
@@ -111,6 +111,7 @@
   createParallelLoopFusionPass();
   createParallelLoopSpecializationPass();
   createParallelLoopTilingPass();
+  createParallelLoopCoalescingPass();
 
   // QuantOps
   quant::createConvertSimulatedQuantPass();
diff --git a/mlir/include/mlir/Transforms/LoopUtils.h b/mlir/include/mlir/Transforms/LoopUtils.h
--- a/mlir/include/mlir/Transforms/LoopUtils.h
+++ b/mlir/include/mlir/Transforms/LoopUtils.h
@@ -28,6 +28,7 @@
 
 namespace loop {
 class ForOp;
+class ParallelOp;
 } // end namespace loop
 
 /// Unrolls this for operation completely if the trip count is known to be
@@ -225,6 +226,9 @@
 /// independent of any loop induction variable involved in the nest.
 void coalesceLoops(MutableArrayRef<loop::ForOp> loops);
 
+void coalescePLoops(loop::ParallelOp loops,
+                    std::vector<std::vector<unsigned>> combinedDimensions);
+
 /// Maps `forOp` for execution on a parallel grid of virtual `processorIds` of
 /// size given by `numProcessors`. This is achieved by embedding the SSA values
 /// corresponding to `processorIds` and `numProcessors` into the bounds and step
diff --git a/mlir/include/mlir/Transforms/Passes.h b/mlir/include/mlir/Transforms/Passes.h
--- a/mlir/include/mlir/Transforms/Passes.h
+++ b/mlir/include/mlir/Transforms/Passes.h
@@ -91,6 +91,10 @@
 /// bounds into a single loop.
 std::unique_ptr<OpPassBase<FuncOp>> createLoopCoalescingPass();
 
+/// Creates a pass that transforms a single ParallelLoop over N induction
+/// variables into another ParallelLoop over less than N induction variables.
+std::unique_ptr<OpPassBase<FuncOp>> createParallelLoopCoalescingPass();
+
 /// Performs packing (or explicit copying) of accessed memref regions into
 /// buffers in the specified faster memory space through either pointwise copies
 /// or DMA operations.
diff --git a/mlir/lib/Transforms/CMakeLists.txt b/mlir/lib/Transforms/CMakeLists.txt
--- a/mlir/lib/Transforms/CMakeLists.txt
+++ b/mlir/lib/Transforms/CMakeLists.txt
@@ -16,6 +16,7 @@
   LoopUnroll.cpp
   MemRefDataFlowOpt.cpp
   OpStats.cpp
+  ParallelLoopCoalescing.cpp
   PipelineDataTransfer.cpp
   SimplifyAffineStructures.cpp
   StripDebugInfo.cpp
diff --git a/mlir/lib/Transforms/ParallelLoopCoalescing.cpp b/mlir/lib/Transforms/ParallelLoopCoalescing.cpp
new file mode 100644
--- /dev/null
+++ b/mlir/lib/Transforms/ParallelLoopCoalescing.cpp
@@ -0,0 +1,68 @@
+//===- ParallelLoopCoalescing.cpp - Pass coalescing parallel loop indices -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/LoopOps/LoopOps.h"
+#include "mlir/Dialect/StandardOps/IR/Ops.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/LoopUtils.h"
+#include "mlir/Transforms/Passes.h"
+#include "mlir/Transforms/RegionUtils.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+#define PASS_NAME "parallel-loop-coalescing"
+#define DEBUG_TYPE PASS_NAME
+
+using namespace mlir;
+
+static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options");
+
+static llvm::cl::list<unsigned> clCoalescedIndices0(
+    "coalesced-indices-0",
+    llvm::cl::desc("Which loop indices to combine 0th loop index"),
+    llvm::cl::MiscFlags::CommaSeparated, llvm::cl::cat(clOptionsCategory));
+
+static llvm::cl::list<unsigned> clCoalescedIndices1(
+    "coalesced-indices-1",
+    llvm::cl::desc(
+        "Which loop indices to combine into the position 1 loop index"),
+    llvm::cl::MiscFlags::CommaSeparated, llvm::cl::cat(clOptionsCategory));
+
+static llvm::cl::list<unsigned> clCoalescedIndices2(
+    "coalesced-indices-2",
+    llvm::cl::desc(
+        "Which loop indices to combine into the position 2 loop index"),
+    llvm::cl::MiscFlags::CommaSeparated, llvm::cl::cat(clOptionsCategory));
+
+namespace {
+class ParallelLoopCoalescingPass
+    : public FunctionPass<ParallelLoopCoalescingPass> {
+public:
+  void runOnFunction() override {
+    FuncOp func = getFunction();
+
+    // The common case for GPU dialect will be simplifying the ParallelOp to 3
+    // arguments, so we do that here to simplify things.
+    func.walk([&](loop::ParallelOp op) {
+      std::vector<std::vector<unsigned>> combinedLoops(3);
+      combinedLoops[0] = clCoalescedIndices0;
+      combinedLoops[1] = clCoalescedIndices1;
+      combinedLoops[2] = clCoalescedIndices2;
+      coalescePLoops(op, combinedLoops);
+    });
+  }
+};
+
+} // namespace
+
+std::unique_ptr<OpPassBase<FuncOp>> mlir::createParallelLoopCoalescingPass() {
+  return std::make_unique<ParallelLoopCoalescingPass>();
+}
+
+static PassRegistration<ParallelLoopCoalescingPass>
+    reg(PASS_NAME, "coalesce parallel loops to use less induction variables.");
diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp
--- a/mlir/lib/Transforms/Utils/LoopUtils.cpp
+++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp
@@ -973,69 +973,85 @@
   }
 }
 
-// Transform a loop with a strictly positive step
-//   for %i = %lb to %ub step %s
-// into a 0-based loop with step 1
-//   for %ii = 0 to ceildiv(%ub - %lb, %s) step 1 {
-//     %i = %ii * %s + %lb
-// Insert the induction variable remapping in the body of `inner`, which is
-// expected to be either `loop` or another loop perfectly nested under `loop`.
-// Insert the definition of new bounds immediate before `outer`, which is
-// expected to be either `loop` or its parent in the loop nest.
-static void normalizeLoop(loop::ForOp loop, loop::ForOp outer,
-                          loop::ForOp inner) {
-  OpBuilder builder(outer);
-  Location loc = loop.getLoc();
-
+// Return the new lower bound, upper bound, and step in that order. Insert any
+// additional bounds calculations before the given builder and any additional
+// conversion back to the original loop induction value inside the given Block.
+static std::tuple<Value, Value, Value>
+normalizeLoop(OpBuilder &boundsBuilder, OpBuilder &insideLoopBuilder,
+              Location loc, Value lowerBound, Value upperBound, Value step,
+              Value inductionVar) {
+  Value newLowerBound, newUpperBound, newStep;
   // Check if the loop is already known to have a constant zero lower bound or
   // a constant one step.
   bool isZeroBased = false;
   if (auto ubCst =
-          dyn_cast_or_null<ConstantIndexOp>(loop.lowerBound().getDefiningOp()))
+          dyn_cast_or_null<ConstantIndexOp>(lowerBound.getDefiningOp()))
     isZeroBased = ubCst.getValue() == 0;
 
   bool isStepOne = false;
-  if (auto stepCst =
-          dyn_cast_or_null<ConstantIndexOp>(loop.step().getDefiningOp()))
+  if (auto stepCst = dyn_cast_or_null<ConstantIndexOp>(step.getDefiningOp()))
     isStepOne = stepCst.getValue() == 1;
 
-  if (isZeroBased && isStepOne)
-    return;
 
   // Compute the number of iterations the loop executes: ceildiv(ub - lb, step)
   // assuming the step is strictly positive.  Update the bounds and the step
   // of the loop to go from 0 to the number of iterations, if necessary.
   // TODO(zinenko): introduce support for negative steps or emit dynamic asserts
   // on step positivity, whatever gets implemented first.
-  Value diff =
-      builder.create<SubIOp>(loc, loop.upperBound(), loop.lowerBound());
-  Value numIterations = ceilDivPositive(builder, loc, diff, loop.step());
-  loop.setUpperBound(numIterations);
-
-  Value lb = loop.lowerBound();
-  if (!isZeroBased) {
-    Value cst0 = builder.create<ConstantIndexOp>(loc, 0);
-    loop.setLowerBound(cst0);
+  if (isZeroBased && isStepOne) {
+    return {lowerBound, upperBound, step};
   }
 
-  Value step = loop.step();
-  if (!isStepOne) {
-    Value cst1 = builder.create<ConstantIndexOp>(loc, 1);
-    loop.setStep(cst1);
+  Value diff = boundsBuilder.create<SubIOp>(loc, upperBound, lowerBound);
+  newUpperBound = ceilDivPositive(boundsBuilder, loc, diff, step);
+
+  if (isZeroBased) {
+    newLowerBound = lowerBound;
+  } else {
+    newLowerBound = boundsBuilder.create<ConstantIndexOp>(loc, 0);
+  }
+
+  if (isStepOne) {
+    newStep = step;
+  } else {
+    newStep = boundsBuilder.create<ConstantIndexOp>(loc, 1);
   }
 
   // Insert code computing the value of the original loop induction variable
   // from the "normalized" one.
-  builder.setInsertionPointToStart(inner.getBody());
   Value scaled =
-      isStepOne ? loop.getInductionVar()
-                : builder.create<MulIOp>(loc, loop.getInductionVar(), step);
+      isStepOne ? inductionVar
+                : insideLoopBuilder.create<MulIOp>(loc, inductionVar, step);
   Value shifted =
-      isZeroBased ? scaled : builder.create<AddIOp>(loc, scaled, lb);
+      isZeroBased ? scaled
+                  : insideLoopBuilder.create<AddIOp>(loc, scaled, lowerBound);
 
   SmallPtrSet<Operation *, 2> preserve{scaled.getDefiningOp(),
                                        shifted.getDefiningOp()};
-  replaceAllUsesExcept(loop.getInductionVar(), shifted, preserve);
+  replaceAllUsesExcept(inductionVar, shifted, preserve);
+  return {newLowerBound, newUpperBound, newStep};
+}
+
+// Transform a loop with a strictly positive step
+//   for %i = %lb to %ub step %s
+// into a 0-based loop with step 1
+//   for %ii = 0 to ceildiv(%ub - %lb, %s) step 1 {
+//     %i = %ii * %s + %lb
+// Insert the induction variable remapping in the body of `inner`, which is
+// expected to be either `loop` or another loop perfectly nested under `loop`.
+// Insert the definition of new bounds immediate before `outer`, which is
+// expected to be either `loop` or its parent in the loop nest.
+static void normalizeLoop(loop::ForOp loop, loop::ForOp outer,
+                          loop::ForOp inner) {
+  OpBuilder builder(outer);
+  OpBuilder innerBuilder(inner.getBody(), inner.getBody()->begin());
+  auto loopPieces =
+      normalizeLoop(builder, innerBuilder, loop.getLoc(), loop.lowerBound(),
+                    loop.upperBound(), loop.step(), loop.getInductionVar());
+
+  loop.setLowerBound(std::get<0>(loopPieces));
+  loop.setStep(std::get<2>(loopPieces));
+  loop.setUpperBound(std::get<1>(loopPieces));
 }
 
 void mlir::coalesceLoops(MutableArrayRef<loop::ForOp> loops) {
@@ -1093,6 +1109,74 @@
   second.erase();
 }
 
+void mlir::coalescePLoops(
+    loop::ParallelOp loops,
+    std::vector<std::vector<unsigned>> combinedDimensions) {
+  OpBuilder outsideBuilder(loops);
+  Location loc = loops.getLoc();
+
+  // Normalize ParallelOp's iteration pattern.
+  SmallVector<Value, 3> normalizedLowerBounds;
+  SmallVector<Value, 3> normalizedSteps;
+  SmallVector<Value, 3> normalizedUpperBounds;
+  for (unsigned i = 0; i < loops.getNumLoops(); i++) {
+    OpBuilder insideLoopBuilder(loops.getBody(), loops.getBody()->begin());
+    auto resultBounds =
+        normalizeLoop(outsideBuilder, insideLoopBuilder, loc,
+                      loops.lowerBound()[i], loops.upperBound()[i],
+                      loops.step()[i], loops.getBody()->getArgument(i));
+
+    normalizedLowerBounds.push_back(std::get<0>(resultBounds));
+    normalizedUpperBounds.push_back(std::get<1>(resultBounds));
+    normalizedSteps.push_back(std::get<2>(resultBounds));
+  }
+
+  // Combine iteration spaces
+  SmallVector<Value, 3> lowerBounds;
+  SmallVector<Value, 3> steps;
+  SmallVector<Value, 3> upperBounds;
+  auto cst0 = outsideBuilder.create<ConstantIndexOp>(loc, 0);
+  auto cst1 = outsideBuilder.create<ConstantIndexOp>(loc, 1);
+  for (unsigned i = 0; i < combinedDimensions.size(); i++) {
+    Value newUpperBound = outsideBuilder.create<ConstantIndexOp>(loc, 1);
+    for (auto idx : combinedDimensions[i]) {
+      newUpperBound = outsideBuilder.create<MulIOp>(loc, newUpperBound,
+                                                    normalizedUpperBounds[idx]);
+    }
+    lowerBounds.push_back(cst0);
+    steps.push_back(cst1);
+    upperBounds.push_back(newUpperBound);
+  }
+
+  // Create new ParallelLoop with conversions to the original induction values.
+  auto newPloop = outsideBuilder.create<loop::ParallelOp>(loc, lowerBounds,
+                                                          upperBounds, steps);
+  OpBuilder insideBuilder(newPloop.getBody(), newPloop.getBody()->begin());
+  for (unsigned i = 0; i < combinedDimensions.size(); i++) {
+    Value previous = newPloop.getBody()->getArgument(i);
+    for (unsigned idx = 0, e = combinedDimensions[i].size(); idx < e; ++idx) {
+      unsigned ivar_idx = combinedDimensions[i][idx];
+      if (idx != 0)
+        previous = insideBuilder.create<SignedDivIOp>(
+            loc, previous, loops.upperBound()[ivar_idx]);
+
+      Value iv = (idx == e - 1)
+                     ? previous
+                     : insideBuilder.create<SignedRemIOp>(
+                           loc, previous, loops.upperBound()[ivar_idx]);
+      replaceAllUsesInRegionWith(loops.getBody()->getArgument(ivar_idx), iv,
+                                 loops.region());
+    }
+  }
+
+  // Replace the old loop with the new loop.
+  loops.getBody()->back().erase();
+  newPloop.getBody()->getOperations().splice(
+      Block::iterator(newPloop.getBody()->back()),
+      loops.getBody()->getOperations());
+  loops.erase();
+}
+
 void mlir::mapLoopToProcessorIds(loop::ForOp forOp, ArrayRef<Value> processorId,
                                  ArrayRef<Value> numProcessors) {
   assert(processorId.size() == numProcessors.size());
diff --git a/mlir/test/Transforms/parallel-loop-coalescing.mlir b/mlir/test/Transforms/parallel-loop-coalescing.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Transforms/parallel-loop-coalescing.mlir
@@ -0,0 +1,96 @@
+// RUN: mlir-opt -parallel-loop-coalescing --coalesced-indices-0=0,3 --coalesced-indices-1=1,4 --coalesced-indices-2=2 %s | FileCheck %s
+
+// CHECK-LABEL:   func @parallel_many_dims() {
+func @parallel_many_dims() {
+  // CHECK:           [[VAL_0:%.*]] = constant 0 : index
+  // CHECK:           [[VAL_1:%.*]] = constant 1 : index
+  // CHECK:           [[VAL_2:%.*]] = constant 2 : index
+  // CHECK:           [[VAL_3:%.*]] = constant 3 : index
+  // CHECK:           [[VAL_4:%.*]] = constant 4 : index
+  // CHECK:           [[VAL_5:%.*]] = constant 5 : index
+  // CHECK:           [[VAL_6:%.*]] = constant 6 : index
+  // CHECK:           [[VAL_7:%.*]] = constant 7 : index
+  // CHECK:           [[VAL_8:%.*]] = constant 8 : index
+  // CHECK:           [[VAL_9:%.*]] = constant 9 : index
+  // CHECK:           [[VAL_10:%.*]] = constant 10 : index
+  // CHECK:           [[VAL_11:%.*]] = constant 11 : index
+  // CHECK:           [[VAL_12:%.*]] = constant 12 : index
+  // CHECK:           [[VAL_13:%.*]] = constant 13 : index
+  // CHECK:           [[VAL_14:%.*]] = constant 14 : index
+  // CHECK:           [[VAL_15:%.*]] = subi [[VAL_5]], [[VAL_3]] : index
+  // CHECK:           [[VAL_16:%.*]] = constant 1 : index
+  // CHECK:           [[VAL_17:%.*]] = subi [[VAL_4]], [[VAL_16]] : index
+  // CHECK:           [[VAL_18:%.*]] = addi [[VAL_15]], [[VAL_17]] : index
+  // CHECK:           [[VAL_19:%.*]] = divi_signed [[VAL_18]], [[VAL_4]] : index
+  // CHECK:           [[VAL_20:%.*]] = constant 0 : index
+  // CHECK:           [[VAL_21:%.*]] = constant 1 : index
+  // CHECK:           [[VAL_22:%.*]] = subi [[VAL_8]], [[VAL_6]] : index
+  // CHECK:           [[VAL_23:%.*]] = constant 1 : index
+  // CHECK:           [[VAL_24:%.*]] = subi [[VAL_7]], [[VAL_23]] : index
+  // CHECK:           [[VAL_25:%.*]] = addi [[VAL_22]], [[VAL_24]] : index
+  // CHECK:           [[VAL_26:%.*]] = divi_signed [[VAL_25]], [[VAL_7]] : index
+  // CHECK:           [[VAL_27:%.*]] = constant 0 : index
+  // CHECK:           [[VAL_28:%.*]] = constant 1 : index
+  // CHECK:           [[VAL_29:%.*]] = subi [[VAL_11]], [[VAL_9]] : index
+  // CHECK:           [[VAL_30:%.*]] = constant 1 : index
+  // CHECK:           [[VAL_31:%.*]] = subi [[VAL_10]], [[VAL_30]] : index
+  // CHECK:           [[VAL_32:%.*]] = addi [[VAL_29]], [[VAL_31]] : index
+  // CHECK:           [[VAL_33:%.*]] = divi_signed [[VAL_32]], [[VAL_10]] : index
+  // CHECK:           [[VAL_34:%.*]] = constant 0 : index
+  // CHECK:           [[VAL_35:%.*]] = constant 1 : index
+  // CHECK:           [[VAL_36:%.*]] = subi [[VAL_14]], [[VAL_12]] : index
+  // CHECK:           [[VAL_37:%.*]] = constant 1 : index
+  // CHECK:           [[VAL_38:%.*]] = subi [[VAL_13]], [[VAL_37]] : index
+  // CHECK:           [[VAL_39:%.*]] = addi [[VAL_36]], [[VAL_38]] : index
+  // CHECK:           [[VAL_40:%.*]] = divi_signed [[VAL_39]], [[VAL_13]] : index
+  // CHECK:           [[VAL_41:%.*]] = constant 0 : index
+  // CHECK:           [[VAL_42:%.*]] = constant 1 : index
+  // CHECK:           [[VAL_43:%.*]] = constant 0 : index
+  // CHECK:           [[VAL_44:%.*]] = constant 1 : index
+  // CHECK:           [[VAL_45:%.*]] = constant 1 : index
+  // CHECK:           [[VAL_46:%.*]] = muli [[VAL_45]], [[VAL_2]] : index
+  // CHECK:           [[VAL_47:%.*]] = muli [[VAL_46]], [[VAL_33]] : index
+  // CHECK:           [[VAL_48:%.*]] = constant 1 : index
+  // CHECK:           [[VAL_49:%.*]] = muli [[VAL_48]], [[VAL_19]] : index
+  // CHECK:           [[VAL_50:%.*]] = muli [[VAL_49]], [[VAL_40]] : index
+  // CHECK:           [[VAL_51:%.*]] = constant 1 : index
+  // CHECK:           [[VAL_52:%.*]] = muli [[VAL_51]], [[VAL_26]] : index
+  %c0 = constant 0 : index
+  %c1 = constant 1 : index
+  %c2 = constant 2 : index
+  %c3 = constant 3 : index
+  %c4 = constant 4 : index
+  %c5 = constant 5 : index
+  %c6 = constant 6 : index
+  %c7 = constant 7 : index
+  %c8 = constant 8 : index
+  %c9 = constant 9 : index
+  %c10 = constant 10 : index
+  %c11 = constant 11 : index
+  %c12 = constant 12 : index
+  %c13 = constant 13 : index
+  %c14 = constant 14 : index
+// CHECK:           loop.parallel ([[VAL_53:%.*]], [[VAL_54:%.*]], [[VAL_55:%.*]]) = ([[VAL_43]], [[VAL_43]], [[VAL_43]]) to ([[VAL_47]], [[VAL_50]], [[VAL_52]]) step ([[VAL_44]], [[VAL_44]], [[VAL_44]]) {
+
+  loop.parallel (%i0, %i1, %i2, %i3, %i4) = (%c0, %c3, %c6, %c9, %c12) to (%c2, %c5, %c8, %c11, %c14)
+                                          step (%c1, %c4, %c7, %c10, %c13) {
+  // CHECK:             [[VAL_56:%.*]] = remi_signed [[VAL_53]], [[VAL_2]] : index
+  // CHECK:             [[VAL_57:%.*]] = divi_signed [[VAL_53]], [[VAL_11]] : index
+  // CHECK:             [[VAL_58:%.*]] = remi_signed [[VAL_54]], [[VAL_5]] : index
+  // CHECK:             [[VAL_59:%.*]] = divi_signed [[VAL_54]], [[VAL_14]] : index
+  // CHECK:             [[VAL_60:%.*]] = muli [[VAL_59]], [[VAL_13]] : index
+  // CHECK:             [[VAL_61:%.*]] = addi [[VAL_60]], [[VAL_12]] : index
+  // CHECK:             [[VAL_62:%.*]] = muli [[VAL_57]], [[VAL_10]] : index
+  // CHECK:             [[VAL_63:%.*]] = addi [[VAL_62]], [[VAL_9]] : index
+  // CHECK:             [[VAL_64:%.*]] = muli [[VAL_55]], [[VAL_7]] : index
+  // CHECK:             [[VAL_65:%.*]] = addi [[VAL_64]], [[VAL_6]] : index
+  // CHECK:             [[VAL_66:%.*]] = muli [[VAL_58]], [[VAL_4]] : index
+  // CHECK:             [[VAL_67:%.*]] = addi [[VAL_66]], [[VAL_3]] : index
+  // CHECK:             [[VAL_68:%.*]] = "magic.op"([[VAL_56]], [[VAL_67]], [[VAL_65]], [[VAL_63]], [[VAL_61]]) : (index, index, index, index, index) -> index
+
+    %result = "magic.op"(%i0, %i1, %i2, %i3, %i4): (index, index, index, index, index) -> index
+  // CHECK:             loop.yield
+  }
+  // CHECK:           return
+  return
+}