Index: mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
===================================================================
--- mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
+++ mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
@@ -621,11 +621,6 @@
     /// Get the number of dimensions.
     unsigned getNumDims();
 
-    operand_range getLowerBoundsOperands();
-    operand_range getUpperBoundsOperands();
-
-    AffineValueMap getLowerBoundsValueMap();
-    AffineValueMap getUpperBoundsValueMap();
     AffineValueMap getRangesValueMap();
 
     /// Get ranges as constants, may fail in dynamic case.
@@ -636,6 +631,17 @@
     MutableArrayRef<BlockArgument> getIVs() {
       return getBody()->getArguments();
     }
+
+    operand_range getLowerBoundsOperands();
+    AffineValueMap getLowerBoundsValueMap();
+    void setLowerBounds(ValueRange operands, AffineMap map);
+    void setLowerBoundsMap(AffineMap map);
+
+    operand_range getUpperBoundsOperands();
+    AffineValueMap getUpperBoundsValueMap();
+    void setUpperBounds(ValueRange operands, AffineMap map);
+    void setUpperBoundsMap(AffineMap map);
+
     void setSteps(ArrayRef<int64_t> newSteps);
 
     static StringRef getReductionsAttrName() { return "reductions"; }
@@ -643,6 +649,9 @@
     static StringRef getUpperBoundsMapAttrName() { return "upperBoundsMap"; }
     static StringRef getStepsAttrName() { return "steps"; }
   }];
+
+  let hasCanonicalizer = 1;
+  let hasFolder = 1;
 }
 
 def AffinePrefetchOp : Affine_Op<"prefetch"> {
Index: mlir/include/mlir/Dialect/Affine/IR/AffineValueMap.h
===================================================================
--- mlir/include/mlir/Dialect/Affine/IR/AffineValueMap.h
+++ mlir/include/mlir/Dialect/Affine/IR/AffineValueMap.h
@@ -74,6 +74,9 @@
   ArrayRef<Value> getOperands() const;
   AffineMap getAffineMap() const;
 
+  /// Return success if the map and/or operands have been modified.
+  LogicalResult canonicalize();
+
 private:
   // A mutable affine map.
   MutableAffineMap map;
Index: mlir/include/mlir/Dialect/Affine/Passes.h
===================================================================
--- mlir/include/mlir/Dialect/Affine/Passes.h
+++ mlir/include/mlir/Dialect/Affine/Passes.h
@@ -40,6 +40,10 @@
 /// ops.
 std::unique_ptr<OperationPass<FuncOp>> createAffineParallelizePass();
 
+/// Simplify affine.parallel ops so that they have a step size of 1 and a lower
+/// bound of 0.
+std::unique_ptr<OperationPass<FuncOp>> createAffineParallelSimplifyPass();
+
 /// Performs packing (or explicit copying) of accessed memref regions into
 /// buffers in the specified faster memory space through either pointwise copies
 /// or DMA operations.
Index: mlir/include/mlir/Dialect/Affine/Passes.td
===================================================================
--- mlir/include/mlir/Dialect/Affine/Passes.td
+++ mlir/include/mlir/Dialect/Affine/Passes.td
@@ -117,6 +117,11 @@
   let constructor = "mlir::createAffineParallelizePass()";
 }
 
+def AffineParallelSimplify : FunctionPass<"affine-parallel-simplify"> {
+  let summary = "Simplify affine.parallel ops";
+  let constructor = "mlir::createAffineParallelSimplifyPass()";
+}
+
 def SimplifyAffineStructures : FunctionPass<"simplify-affine-structures"> {
   let summary = "Simplify affine expressions in maps/sets and normalize "
                 "memrefs";
Index: mlir/include/mlir/IR/PatternMatch.h
===================================================================
--- mlir/include/mlir/IR/PatternMatch.h
+++ mlir/include/mlir/IR/PatternMatch.h
@@ -313,6 +313,16 @@
     replaceOpWithResultsOfAnotherOp(op, newOp.getOperation());
   }
 
+  /// Replace all the uses of the block argument `from` with value `to`.
+  virtual void replaceUsesOfBlockArgument(BlockArgument from, Value to) {
+    llvm_unreachable("replaceUsesOfBlockArgument is unsupported by this pattern rewriter");
+  }
+
+  /// Replace any uses of 'from' with 'to' within this operation.
+  virtual void replaceUsesOfWith(Value from, Value to) {
+    llvm_unreachable("replaceUsesOfWith is unsupported by this pattern rewriter");
+  }
+
   /// This method erases an operation that is known to have no uses.
   virtual void eraseOp(Operation *op);
 
Index: mlir/include/mlir/Transforms/DialectConversion.h
===================================================================
--- mlir/include/mlir/Transforms/DialectConversion.h
+++ mlir/include/mlir/Transforms/DialectConversion.h
@@ -455,7 +455,7 @@
       TypeConverter::SignatureConversion *entryConversion = nullptr);
 
   /// Replace all the uses of the block argument `from` with value `to`.
-  void replaceUsesOfBlockArgument(BlockArgument from, Value to);
+  void replaceUsesOfBlockArgument(BlockArgument from, Value to) override;
 
   /// Return the converted value that replaces 'key'. Return 'key' if there is
   /// no such a converted value.
Index: mlir/lib/Dialect/Affine/IR/AffineOps.cpp
===================================================================
--- mlir/lib/Dialect/Affine/IR/AffineOps.cpp
+++ mlir/lib/Dialect/Affine/IR/AffineOps.cpp
@@ -17,6 +17,8 @@
 #include "mlir/Transforms/InliningUtils.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/StringSet.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/Debug.h"
 
@@ -2506,9 +2508,50 @@
   return OpBuilder(getBody(), std::prev(getBody()->end()));
 }
 
+void AffineParallelOp::setLowerBounds(ValueRange lbOperands, AffineMap map) {
+  assert(lbOperands.size() == map.getNumInputs() &&
+         "operands to map must match number of inputs");
+  assert(map.getNumResults() >= 1 && "bounds map has at least one result");
+
+  auto ubOperands = getUpperBoundsOperands();
+
+  SmallVector<Value, 4> newOperands(lbOperands);
+  newOperands.append(ubOperands.begin(), ubOperands.end());
+  getOperation()->setOperands(newOperands);
+
+  lowerBoundsMapAttr(AffineMapAttr::get(map));
+}
+
+void AffineParallelOp::setUpperBounds(ValueRange ubOperands, AffineMap map) {
+  assert(ubOperands.size() == map.getNumInputs() &&
+         "operands to map must match number of inputs");
+  assert(map.getNumResults() >= 1 && "bounds map has at least one result");
+
+  SmallVector<Value, 4> newOperands(getLowerBoundsOperands());
+  newOperands.append(ubOperands.begin(), ubOperands.end());
+  getOperation()->setOperands(newOperands);
+
+  upperBoundsMapAttr(AffineMapAttr::get(map));
+}
+
+void AffineParallelOp::setLowerBoundsMap(AffineMap map) {
+  AffineMap lbMap = lowerBoundsMap();
+  assert(lbMap.getNumDims() == map.getNumDims() &&
+         lbMap.getNumSymbols() == map.getNumSymbols());
+  (void)lbMap;
+  lowerBoundsMapAttr(AffineMapAttr::get(map));
+}
+
+void AffineParallelOp::setUpperBoundsMap(AffineMap map) {
+  AffineMap ubMap = upperBoundsMap();
+  assert(ubMap.getNumDims() == map.getNumDims() &&
+         ubMap.getNumSymbols() == map.getNumSymbols());
+  (void)ubMap;
+  upperBoundsMapAttr(AffineMapAttr::get(map));
+}
+
 void AffineParallelOp::setSteps(ArrayRef<int64_t> newSteps) {
-  assert(newSteps.size() == getNumDims() && "steps & num dims mismatch");
-  setAttr(getStepsAttrName(), getBodyBuilder().getI64ArrayAttr(newSteps));
+  stepsAttr(getBodyBuilder().getI64ArrayAttr(newSteps));
 }
 
 static LogicalResult verify(AffineParallelOp op) {
@@ -2542,6 +2585,142 @@
   return success();
 }
 
+namespace {
+/// This pattern removes affine.parallel ops with no induction variables.
+struct AffineParallelRank0LoopRemover
+    : public OpRewritePattern<AffineParallelOp> {
+  using OpRewritePattern<AffineParallelOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(AffineParallelOp op,
+                                PatternRewriter &rewriter) const override {
+    // Check that there are no induction variables
+    if (op.getNumDims())
+      return failure();
+
+    // Only remove ops that don't have any custom attributes (i.e. those not
+    // defined by the op itself).
+    StringSet<> opAttrs{AffineParallelOp::getReductionsAttrName(),
+                        AffineParallelOp::getLowerBoundsMapAttrName(),
+                        AffineParallelOp::getUpperBoundsMapAttrName(),
+                        AffineParallelOp::getStepsAttrName()};
+    for (NamedAttribute attr : op.getAttrs()) {
+      if (!opAttrs.count(attr.first.strref()))
+        return failure();
+    }
+
+    // Remove the affine.parallel wrapper, retain the body in the same location
+    auto &parentOps = rewriter.getInsertionBlock()->getOperations();
+    auto &parallelBodyOps = op.region().front().getOperations();
+    auto yield = mlir::cast<AffineYieldOp>(std::prev(parallelBodyOps.end()));
+    for (auto it : zip(op.getResults(), yield.operands()))
+      rewriter.replaceUsesOfWith(std::get<0>(it), std::get<1>(it));
+    parentOps.splice(mlir::Block::iterator(op), parallelBodyOps,
+                     parallelBodyOps.begin(), std::prev(parallelBodyOps.end()));
+    rewriter.eraseOp(op);
+    return success();
+  }
+};
+
+/// This pattern removes indexes that go over an empty range.
+struct AffineParallelTripCount1IndexRemover
+    : public OpRewritePattern<AffineParallelOp> {
+  using OpRewritePattern<AffineParallelOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(AffineParallelOp op,
+                                PatternRewriter &rewriter) const override {
+    AffineValueMap ranges = op.getRangesValueMap();
+    Block *body = op.getBody();
+    SmallVector<AffineExpr, 6> newLowerBounds;
+    SmallVector<AffineExpr, 6> newUpperBounds;
+    SmallVector<int64_t, 6> newSteps;
+    SmallVector<BlockArgument, 6> argsToRemove;
+    for (unsigned i = 0, e = body->getNumArguments(); i < e; i++) {
+      // Is the range a constant value matching the step size?
+      auto constExpr = ranges.getResult(i).dyn_cast<AffineConstantExpr>();
+      int64_t step = op.steps()[i].template cast<IntegerAttr>().getInt();
+      if (constExpr && constExpr.getValue() == step) {
+        // Mark argument for removal and replacement with 0.
+        argsToRemove.push_back(body->getArgument(i));
+      } else {
+        // Keep argument
+        newLowerBounds.push_back(op.lowerBoundsMap().getResult(i));
+        newUpperBounds.push_back(op.upperBoundsMap().getResult(i));
+        newSteps.push_back(step);
+      }
+    }
+
+    // If no arguments need removal, return failure to match.
+    if (argsToRemove.empty())
+      return failure();
+
+    rewriter.updateRootInPlace(op, [&] {
+      for (BlockArgument arg : argsToRemove) {
+        unsigned argNumber = arg.getArgNumber();
+        auto lowerBoundValue = rewriter.create<AffineApplyOp>(
+            op.getLoc(), op.lowerBoundsMap().getSubMap({argNumber}),
+            op.getLowerBoundsOperands());
+        rewriter.replaceUsesOfBlockArgument(arg, lowerBoundValue);
+        body->eraseArgument(argNumber);
+      }
+
+      auto newLower = AffineMap::get(op.lowerBoundsMap().getNumDims(),
+                                     op.lowerBoundsMap().getNumSymbols(),
+                                     newLowerBounds, op.getContext());
+      auto newUpper = AffineMap::get(op.upperBoundsMap().getNumDims(),
+                                     op.upperBoundsMap().getNumSymbols(),
+                                     newUpperBounds, op.getContext());
+      op.lowerBoundsMapAttr(AffineMapAttr::get(newLower));
+      op.upperBoundsMapAttr(AffineMapAttr::get(newUpper));
+      op.stepsAttr(rewriter.getI64ArrayAttr(newSteps));
+    });
+
+    return success();
+  }
+};
+
+} // end anonymous namespace
+
+LogicalResult AffineValueMap::canonicalize() {
+  SmallVector<Value, 4> newOperands{operands};
+  auto newMap = getAffineMap();
+  composeAffineMapAndOperands(&newMap, &newOperands);
+  if (newMap == getAffineMap() && newOperands == operands)
+    return failure();
+  reset(newMap, newOperands);
+  return success();
+}
+
+/// Canonicalize the bounds of the given loop.
+static LogicalResult canonicalizeLoopBounds(AffineParallelOp op) {
+  AffineValueMap lb = op.getLowerBoundsValueMap();
+  bool lbCanonicalized = succeeded(lb.canonicalize());
+
+  AffineValueMap ub = op.getUpperBoundsValueMap();
+  bool ubCanonicalized = succeeded(ub.canonicalize());
+
+  // Any canonicalization change always leads to updated map(s).
+  if (!lbCanonicalized && !ubCanonicalized)
+    return failure();
+
+  if (lbCanonicalized)
+    op.setLowerBounds(lb.getOperands(), lb.getAffineMap());
+  if (ubCanonicalized)
+    op.setUpperBounds(ub.getOperands(), ub.getAffineMap());
+
+  return success();
+}
+
+void AffineParallelOp::getCanonicalizationPatterns(
+    OwningRewritePatternList &results, MLIRContext *context) {
+  results.insert<AffineParallelRank0LoopRemover,
+                 AffineParallelTripCount1IndexRemover>(context);
+}
+
+LogicalResult AffineParallelOp::fold(ArrayRef<Attribute> operands,
+                                     SmallVectorImpl<OpFoldResult> &results) {
+  return canonicalizeLoopBounds(*this);
+}
+
 static void print(OpAsmPrinter &p, AffineParallelOp op) {
   p << op.getOperationName() << " (" << op.getBody()->getArguments() << ") = (";
   p.printAffineMapOfSSAIds(op.lowerBoundsMapAttr(),
@@ -2642,7 +2821,7 @@
   }
 
   // Parse optional clause of the form: `reduce ("addf", "maxf")`, where the
-  // quoted strings a member of the enum AtomicRMWKind.  
+  // quoted strings are a member of the enum AtomicRMWKind.
   SmallVector<Attribute, 4> reductions;
   if (succeeded(parser.parseOptionalKeyword("reduce"))) {
     if (parser.parseLParen())
Index: mlir/lib/Dialect/Affine/Transforms/AffineParallelSimplify.cpp
===================================================================
--- /dev/null
+++ mlir/lib/Dialect/Affine/Transforms/AffineParallelSimplify.cpp
@@ -0,0 +1,109 @@
+//===- AffineParallelSimplify.cpp - AffineParallelSimplify Pass -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a simplifier for affine parallel loops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PassDetail.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
+#include "mlir/Dialect/Affine/Passes.h"
+#include "mlir/IR/PatternMatch.h"
+
+using namespace mlir;
+
+namespace {
+
+/// Simplify affine.parallel ops so that they have a step size of 1 and a lower
+/// bound of 0.
+struct AffineParallelSimplifyPass
+    : public AffineParallelSimplifyBase<AffineParallelSimplifyPass> {
+
+  void runOnFunction() override {
+    getFunction().walk([&](AffineParallelOp op) { runOnAffineParallelOp(op); });
+
+    // Run canonicailzation as a follow up.
+    OwningRewritePatternList patterns;
+    auto *context = &getContext();
+    for (auto *op : context->getRegisteredOperations())
+      op->getCanonicalizationPatterns(patterns, context);
+
+    Operation *op = getOperation();
+    applyPatternsAndFoldGreedily(op->getRegions(), patterns);
+  }
+
+  void runOnAffineParallelOp(AffineParallelOp op) {
+    ArrayAttr stepsAttrs = op.steps();
+    AffineMap lbMap = op.lowerBoundsMap();
+
+    SmallVector<int, 8> steps;
+    bool isWorkPending = false;
+    for (unsigned i = 0, e = stepsAttrs.size(); i < e; ++i) {
+      auto step = stepsAttrs[i].cast<IntegerAttr>().getInt();
+      steps.push_back(step);
+      auto lbExpr = lbMap.getResult(i).dyn_cast<AffineConstantExpr>();
+      isWorkPending |= (!lbExpr || lbExpr.getValue() || step != 1);
+    }
+
+    // No need to do any work if the parallel op is already simplified.
+    if (!isWorkPending)
+      return;
+
+    AffineValueMap ranges = op.getRangesValueMap();
+    auto builder = OpBuilder::atBlockBegin(op.getBody());
+    auto zeroExpr = builder.getAffineConstantExpr(0);
+    SmallVector<AffineExpr, 8> lbExprs;
+    SmallVector<AffineExpr, 8> ubExprs;
+    for (unsigned i = 0, e = steps.size(); i < e; ++i) {
+      int step = steps[i];
+
+      // Adjust the lower bound to be 0.
+      lbExprs.push_back(zeroExpr);
+
+      // Adjust the upper bound expression: 'range / step'
+      AffineExpr ubExpr = ranges.getResult(i).floorDiv(step);
+      ubExprs.push_back(ubExpr);
+
+      // Adjust the corresponding IV: 'lb + i * step'
+      BlockArgument iv = op.getBody()->getArgument(i);
+      AffineExpr lbExpr = lbMap.getResult(i);
+      unsigned nDims = lbMap.getNumDims();
+      auto expr = lbExpr + builder.getAffineDimExpr(nDims) * step;
+      auto map = AffineMap::get(/*dimCount=*/nDims + 1,
+                                /*symbolCount=*/lbMap.getNumSymbols(), expr);
+
+      // Use an 'affine.apply' op that will be simplified later in subsequent
+      // canonicalizations.
+      OperandRange lbOperands = op.getLowerBoundsOperands();
+      OperandRange dimOperands = lbOperands.take_front(nDims);
+      OperandRange symbolOperands = lbOperands.drop_front(nDims);
+      SmallVector<Value, 8> applyOperands{dimOperands};
+      applyOperands.push_back(iv);
+      applyOperands.append(symbolOperands.begin(), symbolOperands.end());
+      auto apply =
+          builder.create<AffineApplyOp>(op.getLoc(), map, applyOperands);
+      iv.replaceAllUsesExcept(apply, SmallPtrSet<Operation *, 1>{apply});
+    }
+
+    SmallVector<int64_t, 8> newSteps(op.getNumDims(), 1);
+    op.stepsAttr(builder.getI64ArrayAttr(newSteps));
+    auto newLowerMap = AffineMap::get(
+        /*dimCount=*/0, /*symbolCount=*/0, lbExprs, &getContext());
+    op.setLowerBounds({}, newLowerMap);
+    auto newUpperMap = AffineMap::get(
+        ranges.getNumDims(), ranges.getNumSymbols(), ubExprs, &getContext());
+    op.setUpperBounds(ranges.getOperands(), newUpperMap);
+  }
+};
+} // namespace
+
+std::unique_ptr<OperationPass<FuncOp>>
+mlir::createAffineParallelSimplifyPass() {
+  return std::make_unique<AffineParallelSimplifyPass>();
+}
Index: mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt
===================================================================
--- mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt
+++ mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt
@@ -2,6 +2,7 @@
   AffineDataCopyGeneration.cpp
   AffineLoopInvariantCodeMotion.cpp
   AffineParallelize.cpp
+  AffineParallelSimplify.cpp
   LoopTiling.cpp
   LoopUnroll.cpp
   LoopUnrollAndJam.cpp
Index: mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp
===================================================================
--- mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp
+++ mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp
@@ -76,6 +76,18 @@
     }
   }
 
+  void replaceUsesOfBlockArgument(BlockArgument from, Value to) override {
+    from.replaceAllUsesWith(to);
+    for (auto *user : to.getUsers())
+      addToWorklist(user);
+  }
+
+  void replaceUsesOfWith(Value from, Value to) override {
+    from.replaceAllUsesWith(to);
+    for (auto *user : to.getUsers())
+      addToWorklist(user);
+  }
+
   // These are hooks implemented for PatternRewriter.
 protected:
   // Implement the hook for inserting operations, and make sure that newly
Index: mlir/test/Dialect/Affine/affine-fold.mlir
===================================================================
--- /dev/null
+++ mlir/test/Dialect/Affine/affine-fold.mlir
@@ -0,0 +1,73 @@
+// RUN: mlir-opt -canonicalize -split-input-file %s | FileCheck %s
+
+// Ensure that affine.parallel canonicalization and folding are working
+// properly.
+
+// Remove enclosing affine.parallel when no iterations occur.
+
+// CHECK: func @affine_parallel_rank0
+func @affine_parallel_rank0(%out: memref<f32>) {
+  // CHECK-NEXT: constant
+  %cst = constant 0.0 : f32
+  // CHECK-NEXT: affine.store
+  affine.parallel () = () to () {
+    affine.parallel () = () to () {
+      affine.store %cst, %out[] : memref<f32>
+    }
+  }
+  return
+}
+
+// -----
+
+// Remove enclosing affine.parallel when no iterations occur.
+
+// CHECK-LABEL: func @affine_parallel_range1
+func @affine_parallel_range1() {
+  // CHECK-NEXT: constant
+  %cst = constant 1.0 : f32
+  // CHECK-NEXT: alloc
+  %0 = alloc() : memref<2x4xf32>
+  // CHECK-NEXT: affine.store
+  affine.parallel (%i, %j) = (0, 1) to (2, 2) step (2, 1) {
+    affine.store %cst, %0[%i, %j] : memref<2x4xf32>
+  }
+  // CHECK-NEXT: return
+  return
+}
+
+// -----
+
+// Remove induction variables when the trip count is 1.
+
+// CHECK-LABEL: func @affine_parallel_partial_range1
+func @affine_parallel_partial_range1() {
+  // CHECK-NEXT: constant
+  %cst = constant 1.0 : f32
+  // CHECK-NEXT: alloc
+  %0 = alloc() : memref<2x4xf32>
+  // CHECK-NEXT: affine.parallel (%{{.*}}) = (0) to (10)
+  affine.parallel (%i, %j) = (0, 1) to (10, 2) {
+    // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}, 1]
+    affine.store %cst, %0[%i, %j] : memref<2x4xf32>
+  }
+  // CHECK: return
+  return
+}
+
+// -----
+
+// Ensure bounds expressions are canonicalized.
+
+// CHECK-LABEL: func @affine_parallel_const_bounds
+func @affine_parallel_const_bounds() {
+  %cst = constant 1.0 : f32
+  %c0 = constant 0 : index
+  %c4 = constant 4 : index
+  %0 = alloc() : memref<4xf32>
+  // CHECK: affine.parallel (%{{.*}}) = (0) to (4)
+  affine.parallel (%i) = (%c0) to (%c0 + %c4) {
+    affine.store %cst, %0[%i] : memref<4xf32>
+  }
+  return
+}
Index: mlir/test/Dialect/Affine/affine-parallel-simplify.mlir
===================================================================
--- /dev/null
+++ mlir/test/Dialect/Affine/affine-parallel-simplify.mlir
@@ -0,0 +1,43 @@
+// RUN: mlir-opt %s -affine-parallel-simplify -split-input-file | FileCheck %s
+
+// Set step to 1 and lower bound to 0.
+
+// CHECK-LABEL: func @simplify_parallel
+func @simplify_parallel() {
+  %cst = constant 1.0 : f32
+  %0 = alloc() : memref<2x4xf32>
+  // CHECK: affine.parallel (%[[i:.*]], %[[j:.*]]) = (0, 0) to (3, 2)
+  affine.parallel (%i, %j) = (0, 1) to (10, 5) step (3, 2) {
+    // CHECK: affine.parallel (%[[k:.*]]) = (0) to (%[[j]] * 2 - %[[i]] * 3 + 1)
+    affine.parallel (%k) = (%i) to (%j) {
+      // CHECK: affine.store %{{.*}}, %{{.*}}[%[[i]] * 3, %[[i]] * 3 + %[[k]]] : memref<2x4xf32>
+      affine.store %cst, %0[%i, %k] : memref<2x4xf32>
+    }
+  }
+  return
+}
+
+// -----
+
+// Ensure bounds expressions are canonicalized.
+
+#map0 = affine_map<(d0) -> (d0 * 5)>
+#map1 = affine_map<(d0) -> (d0 * 10)>
+
+// CHECK-LABEL: func @affine_parallel_fold_bounds
+func @affine_parallel_fold_bounds() {
+  %cst = constant 1.0 : f32
+  %0 = alloc() : memref<100x100xf32>
+  // CHECK: affine.parallel (%[[i0:.*]], %[[j0:.*]]) =
+  affine.parallel (%i0, %j0) = (0, 0) to (100, 10) {
+    %2 = affine.apply #map0(%i0)
+    %3 = affine.apply #map1(%j0)
+    // CHECK-NOT: affine.apply
+    // CHECK: affine.parallel (%[[i1:.*]], %[[j1:.*]]) = (0, 0) to (5, 10)
+    affine.parallel (%i1, %j1) = (%2, %3) to (%2 + 5, %3 + 10) {
+      // CHECK: affine.store %{{.*}}, %{{.*}}[%[[i0]] * 5 + %[[i1]], %[[j0]] * 10 + %[[j1]]]
+      affine.store %cst, %0[%i1, %j1] : memref<100x100xf32>
+    }
+  }
+  return
+}