diff --git a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
--- a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
+++ b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
@@ -621,11 +621,6 @@
     /// Get the number of dimensions.
     unsigned getNumDims();
 
-    operand_range getLowerBoundsOperands();
-    operand_range getUpperBoundsOperands();
-
-    AffineValueMap getLowerBoundsValueMap();
-    AffineValueMap getUpperBoundsValueMap();
     AffineValueMap getRangesValueMap();
 
     /// Get ranges as constants, may fail in dynamic case.
@@ -636,6 +631,18 @@
     MutableArrayRef<BlockArgument> getIVs() {
       return getBody()->getArguments();
     }
+
+    operand_range getLowerBoundsOperands();
+    AffineValueMap getLowerBoundsValueMap();
+    void setLowerBounds(ValueRange operands, AffineMap map);
+    void setLowerBoundsMap(AffineMap map);
+
+    operand_range getUpperBoundsOperands();
+    AffineValueMap getUpperBoundsValueMap();
+    void setUpperBounds(ValueRange operands, AffineMap map);
+    void setUpperBoundsMap(AffineMap map);
+
+    SmallVector<int64_t, 8> getSteps();
     void setSteps(ArrayRef<int64_t> newSteps);
 
     static StringRef getReductionsAttrName() { return "reductions"; }
@@ -643,6 +650,8 @@
     static StringRef getUpperBoundsMapAttrName() { return "upperBoundsMap"; }
     static StringRef getStepsAttrName() { return "steps"; }
   }];
+
+  let hasFolder = 1;
 }
 
 def AffinePrefetchOp : Affine_Op<"prefetch"> {
diff --git a/mlir/include/mlir/Dialect/Affine/IR/AffineValueMap.h b/mlir/include/mlir/Dialect/Affine/IR/AffineValueMap.h
--- a/mlir/include/mlir/Dialect/Affine/IR/AffineValueMap.h
+++ b/mlir/include/mlir/Dialect/Affine/IR/AffineValueMap.h
@@ -74,6 +74,10 @@
   ArrayRef<Value> getOperands() const;
   AffineMap getAffineMap() const;
 
+  /// Attempts to canonicalize the map and operands. Return success if the map
+  /// and/or operands have been modified.
+  LogicalResult canonicalize();
+
 private:
   // A mutable affine map.
   MutableAffineMap map;
diff --git a/mlir/include/mlir/Dialect/Affine/Passes.h b/mlir/include/mlir/Dialect/Affine/Passes.h
--- a/mlir/include/mlir/Dialect/Affine/Passes.h
+++ b/mlir/include/mlir/Dialect/Affine/Passes.h
@@ -35,6 +35,9 @@
 /// ops.
 std::unique_ptr<OperationPass<FuncOp>> createAffineParallelizePass();
 
+/// Normalize affine.parallel ops so that lower bounds are 0 and steps are 1.
+std::unique_ptr<OperationPass<FuncOp>> createAffineParallelNormalizePass();
+
 /// Performs packing (or explicit copying) of accessed memref regions into
 /// buffers in the specified faster memory space through either pointwise copies
 /// or DMA operations.
diff --git a/mlir/include/mlir/Dialect/Affine/Passes.td b/mlir/include/mlir/Dialect/Affine/Passes.td
--- a/mlir/include/mlir/Dialect/Affine/Passes.td
+++ b/mlir/include/mlir/Dialect/Affine/Passes.td
@@ -118,6 +118,12 @@
   let constructor = "mlir::createAffineParallelizePass()";
 }
 
+def AffineParallelNormalize : FunctionPass<"affine-parallel-normalize"> {
+  let summary = "Normalize affine.parallel ops so that lower bounds are 0 and "
+                "steps are 1";
+  let constructor = "mlir::createAffineParallelNormalizePass()";
+}
+
 def SimplifyAffineStructures : FunctionPass<"simplify-affine-structures"> {
   let summary = "Simplify affine expressions in maps/sets and normalize "
                 "memrefs";
diff --git a/mlir/include/mlir/Dialect/Affine/Utils.h b/mlir/include/mlir/Dialect/Affine/Utils.h
--- a/mlir/include/mlir/Dialect/Affine/Utils.h
+++ b/mlir/include/mlir/Dialect/Affine/Utils.h
@@ -43,6 +43,11 @@
     llvm::DenseSet<Operation *, DenseMapInfo<Operation *>> &loops,
     ArrayRef<int64_t> vectorSizes, ArrayRef<int64_t> fastestVaryingPattern);
 
+/// Normalize a affine.parallel op so that lower bounds are 0 and steps are 1.
+/// As currently implemented, this transformation cannot fail and will return
+/// early if the op is already in a normalized form.
+void normalizeAffineParallel(AffineParallelOp op);
+
 } // namespace mlir
 
 #endif // MLIR_DIALECT_AFFINE_UTILS_H
diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
--- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
+++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
@@ -2505,9 +2505,58 @@
   return OpBuilder(getBody(), std::prev(getBody()->end()));
 }
 
+void AffineParallelOp::setLowerBounds(ValueRange lbOperands, AffineMap map) {
+  assert(lbOperands.size() == map.getNumInputs() &&
+         "operands to map must match number of inputs");
+  assert(map.getNumResults() >= 1 && "bounds map has at least one result");
+
+  auto ubOperands = getUpperBoundsOperands();
+
+  SmallVector<Value, 4> newOperands(lbOperands);
+  newOperands.append(ubOperands.begin(), ubOperands.end());
+  getOperation()->setOperands(newOperands);
+
+  lowerBoundsMapAttr(AffineMapAttr::get(map));
+}
+
+void AffineParallelOp::setUpperBounds(ValueRange ubOperands, AffineMap map) {
+  assert(ubOperands.size() == map.getNumInputs() &&
+         "operands to map must match number of inputs");
+  assert(map.getNumResults() >= 1 && "bounds map has at least one result");
+
+  SmallVector<Value, 4> newOperands(getLowerBoundsOperands());
+  newOperands.append(ubOperands.begin(), ubOperands.end());
+  getOperation()->setOperands(newOperands);
+
+  upperBoundsMapAttr(AffineMapAttr::get(map));
+}
+
+void AffineParallelOp::setLowerBoundsMap(AffineMap map) {
+  AffineMap lbMap = lowerBoundsMap();
+  assert(lbMap.getNumDims() == map.getNumDims() &&
+         lbMap.getNumSymbols() == map.getNumSymbols());
+  (void)lbMap;
+  lowerBoundsMapAttr(AffineMapAttr::get(map));
+}
+
+void AffineParallelOp::setUpperBoundsMap(AffineMap map) {
+  AffineMap ubMap = upperBoundsMap();
+  assert(ubMap.getNumDims() == map.getNumDims() &&
+         ubMap.getNumSymbols() == map.getNumSymbols());
+  (void)ubMap;
+  upperBoundsMapAttr(AffineMapAttr::get(map));
+}
+
+SmallVector<int64_t, 8> AffineParallelOp::getSteps() {
+  SmallVector<int64_t, 8> result;
+  for (Attribute attr : steps()) {
+    result.push_back(attr.cast<IntegerAttr>().getInt());
+  }
+  return result;
+}
+
 void AffineParallelOp::setSteps(ArrayRef<int64_t> newSteps) {
-  assert(newSteps.size() == getNumDims() && "steps & num dims mismatch");
-  setAttr(getStepsAttrName(), getBodyBuilder().getI64ArrayAttr(newSteps));
+  stepsAttr(getBodyBuilder().getI64ArrayAttr(newSteps));
 }
 
 static LogicalResult verify(AffineParallelOp op) {
@@ -2541,6 +2590,41 @@
   return success();
 }
 
+LogicalResult AffineValueMap::canonicalize() {
+  SmallVector<Value, 4> newOperands{operands};
+  auto newMap = getAffineMap();
+  composeAffineMapAndOperands(&newMap, &newOperands);
+  if (newMap == getAffineMap() && newOperands == operands)
+    return failure();
+  reset(newMap, newOperands);
+  return success();
+}
+
+/// Canonicalize the bounds of the given loop.
+static LogicalResult canonicalizeLoopBounds(AffineParallelOp op) {
+  AffineValueMap lb = op.getLowerBoundsValueMap();
+  bool lbCanonicalized = succeeded(lb.canonicalize());
+
+  AffineValueMap ub = op.getUpperBoundsValueMap();
+  bool ubCanonicalized = succeeded(ub.canonicalize());
+
+  // Any canonicalization change always leads to updated map(s).
+  if (!lbCanonicalized && !ubCanonicalized)
+    return failure();
+
+  if (lbCanonicalized)
+    op.setLowerBounds(lb.getOperands(), lb.getAffineMap());
+  if (ubCanonicalized)
+    op.setUpperBounds(ub.getOperands(), ub.getAffineMap());
+
+  return success();
+}
+
+LogicalResult AffineParallelOp::fold(ArrayRef<Attribute> operands,
+                                     SmallVectorImpl<OpFoldResult> &results) {
+  return canonicalizeLoopBounds(*this);
+}
+
 static void print(OpAsmPrinter &p, AffineParallelOp op) {
   p << op.getOperationName() << " (" << op.getBody()->getArguments() << ") = (";
   p.printAffineMapOfSSAIds(op.lowerBoundsMapAttr(),
@@ -2549,13 +2633,8 @@
   p.printAffineMapOfSSAIds(op.upperBoundsMapAttr(),
                            op.getUpperBoundsOperands());
   p << ')';
-  SmallVector<int64_t, 4> steps;
-  bool elideSteps = true;
-  for (auto attr : op.steps()) {
-    auto step = attr.cast<IntegerAttr>().getInt();
-    elideSteps &= (step == 1);
-    steps.push_back(step);
-  }
+  SmallVector<int64_t, 8> steps = op.getSteps();
+  bool elideSteps = llvm::all_of(steps, [](int64_t step) { return step == 1; });
   if (!elideSteps) {
     p << " step (";
     llvm::interleaveComma(steps, p);
@@ -2641,7 +2720,7 @@
   }
 
   // Parse optional clause of the form: `reduce ("addf", "maxf")`, where the
-  // quoted strings a member of the enum AtomicRMWKind.
+  // quoted strings are a member of the enum AtomicRMWKind.
   SmallVector<Attribute, 4> reductions;
   if (succeeded(parser.parseOptionalKeyword("reduce"))) {
     if (parser.parseLParen())
diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineParallelNormalize.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineParallelNormalize.cpp
new file mode 100644
--- /dev/null
+++ b/mlir/lib/Dialect/Affine/Transforms/AffineParallelNormalize.cpp
@@ -0,0 +1,96 @@
+//===- AffineParallelNormalize.cpp - AffineParallelNormalize Pass ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a normalizer for affine parallel loops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PassDetail.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
+#include "mlir/Dialect/Affine/Passes.h"
+#include "mlir/IR/PatternMatch.h"
+
+using namespace mlir;
+
+void normalizeAffineParallel(AffineParallelOp op) {
+  AffineMap lbMap = op.lowerBoundsMap();
+  SmallVector<int64_t, 8> steps = op.getSteps();
+  // No need to do any work if the parallel op is already normalized.
+  bool isAlreadyNormalized =
+      llvm::all_of(llvm::zip(steps, lbMap.getResults()), [](auto tuple) {
+        int64_t step = std::get<0>(tuple);
+        auto lbExpr =
+            std::get<1>(tuple).template dyn_cast<AffineConstantExpr>();
+        return lbExpr && lbExpr.getValue() == 0 && step == 1;
+      });
+  if (isAlreadyNormalized)
+    return;
+
+  AffineValueMap ranges = op.getRangesValueMap();
+  auto builder = OpBuilder::atBlockBegin(op.getBody());
+  auto zeroExpr = builder.getAffineConstantExpr(0);
+  SmallVector<AffineExpr, 8> lbExprs;
+  SmallVector<AffineExpr, 8> ubExprs;
+  for (unsigned i = 0, e = steps.size(); i < e; ++i) {
+    int64_t step = steps[i];
+
+    // Adjust the lower bound to be 0.
+    lbExprs.push_back(zeroExpr);
+
+    // Adjust the upper bound expression: 'range / step'.
+    AffineExpr ubExpr = ranges.getResult(i).ceilDiv(step);
+    ubExprs.push_back(ubExpr);
+
+    // Adjust the corresponding IV: 'lb + i * step'.
+    BlockArgument iv = op.getBody()->getArgument(i);
+    AffineExpr lbExpr = lbMap.getResult(i);
+    unsigned nDims = lbMap.getNumDims();
+    auto expr = lbExpr + builder.getAffineDimExpr(nDims) * step;
+    auto map = AffineMap::get(/*dimCount=*/nDims + 1,
+                              /*symbolCount=*/lbMap.getNumSymbols(), expr);
+
+    // Use an 'affine.apply' op that will be simplified later in subsequent
+    // canonicalizations.
+    OperandRange lbOperands = op.getLowerBoundsOperands();
+    OperandRange dimOperands = lbOperands.take_front(nDims);
+    OperandRange symbolOperands = lbOperands.drop_front(nDims);
+    SmallVector<Value, 8> applyOperands{dimOperands};
+    applyOperands.push_back(iv);
+    applyOperands.append(symbolOperands.begin(), symbolOperands.end());
+    auto apply = builder.create<AffineApplyOp>(op.getLoc(), map, applyOperands);
+    iv.replaceAllUsesExcept(apply, SmallPtrSet<Operation *, 1>{apply});
+  }
+
+  SmallVector<int64_t, 8> newSteps(op.getNumDims(), 1);
+  op.setSteps(newSteps);
+  auto newLowerMap = AffineMap::get(
+      /*dimCount=*/0, /*symbolCount=*/0, lbExprs, op.getContext());
+  op.setLowerBounds({}, newLowerMap);
+  auto newUpperMap = AffineMap::get(ranges.getNumDims(), ranges.getNumSymbols(),
+                                    ubExprs, op.getContext());
+  op.setUpperBounds(ranges.getOperands(), newUpperMap);
+}
+
+namespace {
+
+/// Normalize affine.parallel ops so that lower bounds are 0 and steps are 1.
+/// As currently implemented, this pass cannot fail, but it might skip over ops
+/// that are already in a normalized form.
+struct AffineParallelNormalizePass
+    : public AffineParallelNormalizeBase<AffineParallelNormalizePass> {
+
+  void runOnFunction() override { getFunction().walk(normalizeAffineParallel); }
+};
+
+} // namespace
+
+std::unique_ptr<OperationPass<FuncOp>>
+mlir::createAffineParallelNormalizePass() {
+  return std::make_unique<AffineParallelNormalizePass>();
+}
diff --git a/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt
--- a/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt
@@ -2,6 +2,7 @@
   AffineDataCopyGeneration.cpp
   AffineLoopInvariantCodeMotion.cpp
   AffineParallelize.cpp
+  AffineParallelNormalize.cpp
   LoopTiling.cpp
   LoopUnroll.cpp
   LoopUnrollAndJam.cpp
diff --git a/mlir/test/Dialect/Affine/affine-parallel-normalize.mlir b/mlir/test/Dialect/Affine/affine-parallel-normalize.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Dialect/Affine/affine-parallel-normalize.mlir
@@ -0,0 +1,25 @@
+// RUN: mlir-opt %s -affine-parallel-normalize -split-input-file | FileCheck %s
+
+// Normalize steps to 1 and lower bounds to 0.
+
+// CHECK-DAG: [[$MAP0:#map[0-9]+]] = affine_map<(d0) -> (d0 * 3)>
+// CHECK-DAG: [[$MAP1:#map[0-9]+]] = affine_map<(d0) -> (d0 * 2 + 1)>
+// CHECK-DAG: [[$MAP2:#map[0-9]+]] = affine_map<(d0, d1) -> (d0 + d1)>
+
+// CHECK-LABEL: func @normalize_parallel()
+func @normalize_parallel() {
+  %cst = constant 1.0 : f32
+  %0 = alloc() : memref<2x4xf32>
+  // CHECK: affine.parallel (%[[i0:.*]], %[[j0:.*]]) = (0, 0) to (4, 2)
+  affine.parallel (%i, %j) = (0, 1) to (10, 5) step (3, 2) {
+    // CHECK: %[[i1:.*]] = affine.apply [[$MAP0]](%[[i0]])
+    // CHECK: %[[j1:.*]] = affine.apply [[$MAP1]](%[[j0]])
+    // CHECK: affine.parallel (%[[k0:.*]]) = (0) to (%[[j1]] - %[[i1]])
+    affine.parallel (%k) = (%i) to (%j) {
+      // CHECK: %[[k1:.*]] = affine.apply [[$MAP2]](%[[i1]], %[[k0]])
+      // CHECK: affine.store %{{.*}}, %{{.*}}[%[[i1]], %[[k1]]] : memref<2x4xf32>
+      affine.store %cst, %0[%i, %k] : memref<2x4xf32>
+    }
+  }
+  return
+}
diff --git a/mlir/test/Dialect/Affine/canonicalize.mlir b/mlir/test/Dialect/Affine/canonicalize.mlir
--- a/mlir/test/Dialect/Affine/canonicalize.mlir
+++ b/mlir/test/Dialect/Affine/canonicalize.mlir
@@ -604,3 +604,26 @@
   }
   return
 }
+
+// -----
+
+// Ensure affine.parallel bounds expressions are canonicalized.
+
+#map3 = affine_map<(d0) -> (d0 * 5)>
+
+// CHECK-LABEL: func @affine_parallel_const_bounds
+func @affine_parallel_const_bounds() {
+  %cst = constant 1.0 : f32
+  %c0 = constant 0 : index
+  %c4 = constant 4 : index
+  %0 = alloc() : memref<4xf32>
+  // CHECK: affine.parallel (%{{.*}}) = (0) to (4)
+  affine.parallel (%i) = (%c0) to (%c0 + %c4) {
+    %1 = affine.apply #map3(%i)
+    // CHECK: affine.parallel (%{{.*}}) = (0) to (%{{.*}} * 5)
+    affine.parallel (%j) = (%c0) to (%1) {
+      affine.store %cst, %0[%j] : memref<4xf32>
+    }
+  }
+  return
+}