diff --git a/mlir/include/mlir/Analysis/AffineStructures.h b/mlir/include/mlir/Analysis/AffineStructures.h --- a/mlir/include/mlir/Analysis/AffineStructures.h +++ b/mlir/include/mlir/Analysis/AffineStructures.h @@ -481,13 +481,13 @@ /// one; None otherwise. Optional getConstantUpperBound(unsigned pos) const; - /// Gets the lower and upper bound of the pos^th identifier treating - /// [0, offset) U [offset + num, symStartPos) as dimensions and - /// [symStartPos, getNumDimAndSymbolIds) as symbols. The returned - /// multi-dimensional maps in the pair represent the max and min of - /// potentially multiple affine expressions. The upper bound is exclusive. - /// 'localExprs' holds pre-computed AffineExpr's for all local identifiers in - /// the system. + /// Gets the lower and upper bound of the `offset` + `pos`th identifier + /// treating [0, offset) U [offset + num, symStartPos) as dimensions and + /// [symStartPos, getNumDimAndSymbolIds) as symbols, and `pos` lies in + /// [0, num). The multi-dimensional maps in the returned pair represent the + /// max and min of potentially multiple affine expressions. The upper bound is + /// exclusive. `localExprs` holds pre-computed AffineExpr's for all local + /// identifiers in the system. std::pair getLowerAndUpperBound(unsigned pos, unsigned offset, unsigned num, unsigned symStartPos, ArrayRef localExprs, diff --git a/mlir/include/mlir/Analysis/Utils.h b/mlir/include/mlir/Analysis/Utils.h --- a/mlir/include/mlir/Analysis/Utils.h +++ b/mlir/include/mlir/Analysis/Utils.h @@ -220,12 +220,19 @@ /// i.e., the returned bounding constant holds for *any given* value of the /// symbol identifiers. The 'shape' vector is set to the corresponding /// dimension-wise bounds major to minor. We use int64_t instead of uint64_t - /// since index types can be at most int64_t. + /// since index types can be at most int64_t. `lbs` are set to the lower + /// bounds for each of the rank dimensions, and lbDivisors contains the + /// corresponding denominators for floorDivs. Optional getConstantBoundingSizeAndShape( SmallVectorImpl *shape = nullptr, std::vector> *lbs = nullptr, SmallVectorImpl *lbDivisors = nullptr) const; + /// Gets the lower and upper bound map for the dimensional identifier at + /// `pos`. + void getLowerAndUpperBound(unsigned pos, AffineMap &lbMap, + AffineMap &ubMap) const; + /// A wrapper around FlatAffineConstraints::getConstantBoundOnDimSize(). 'pos' /// corresponds to the position of the memref shape's dimension (major to /// minor) which matches 1:1 with the dimensional identifier positions in diff --git a/mlir/lib/Analysis/AffineStructures.cpp b/mlir/lib/Analysis/AffineStructures.cpp --- a/mlir/lib/Analysis/AffineStructures.cpp +++ b/mlir/lib/Analysis/AffineStructures.cpp @@ -1395,8 +1395,9 @@ assert(getNumLocalIds() == localExprs.size() && "incorrect local exprs count"); - SmallVector lbIndices, ubIndices; - getLowerAndUpperBoundIndices(pos + offset, &lbIndices, &ubIndices); + SmallVector lbIndices, ubIndices, eqIndices; + getLowerAndUpperBoundIndices(pos + offset, &lbIndices, &ubIndices, &eqIndices, + offset, num); /// Add to 'b' from 'a' in set [0, offset) U [offset + num, symbStartPos). 
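The updated documentation above makes the returned upper bound exclusive, and the implementation that follows divides each bound row by the absolute coefficient of the bounded identifier: ceildiv for lower bounds, floordiv for upper bounds, using the identity the patch itself notes ("expr ceildiv divisor is (expr + divisor - 1) floordiv divisor"). A minimal standalone sketch of that integer arithmetic, in plain C++ rather than the MLIR AffineExpr API, with helper names made up here:

#include <cassert>
#include <cstdint>
#include <iostream>

// Floor division that also handles negative numerators (C++ '/' truncates
// toward zero, which is not the same thing).
static int64_t floorDiv(int64_t a, int64_t b) {
  assert(b > 0 && "divisor expected to be positive");
  return (a >= 0) ? a / b : -((-a + b - 1) / b);
}

// ceil(a/b) expressed via floor division, mirroring the
// "(expr + divisor - 1) floordiv divisor" rewrite in the patch.
static int64_t ceilDiv(int64_t a, int64_t b) {
  assert(b > 0 && "divisor expected to be positive");
  return floorDiv(a + b - 1, b);
}

int main() {
  // Constraints on identifier i with coefficient 32:
  //   32*i >= 5    ->  i >= ceilDiv(5, 32)           (inclusive lower bound)
  //   32*i <= 100  ->  i <  floorDiv(100, 32) + 1    (exclusive upper bound)
  std::cout << ceilDiv(5, 32) << "\n";         // 1
  std::cout << floorDiv(100, 32) + 1 << "\n";  // 4
  std::cout << floorDiv(-5, 32) << "\n";       // -1 (truncating '/' would give 0)
  return 0;
}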
auto addCoeffs = [&](ArrayRef a, SmallVectorImpl &b) { @@ -1408,10 +1409,10 @@ }; SmallVector lb, ub; - SmallVector exprs; + SmallVector lbExprs; unsigned dimCount = symStartPos - num; unsigned symCount = getNumDimAndSymbolIds() - symStartPos; - exprs.reserve(lbIndices.size()); + lbExprs.reserve(lbIndices.size() + eqIndices.size()); // Lower bound expressions. for (auto idx : lbIndices) { auto ineq = getInequality(idx); @@ -1422,13 +1423,14 @@ std::transform(lb.begin(), lb.end(), lb.begin(), std::negate()); auto expr = getAffineExprFromFlatForm(lb, dimCount, symCount, localExprs, context); - exprs.push_back(expr); + // expr ceildiv divisor is (expr + divisor - 1) floordiv divisor + int64_t divisor = std::abs(ineq[pos + offset]); + expr = (expr + divisor - 1).floorDiv(divisor); + lbExprs.push_back(expr); } - auto lbMap = - exprs.empty() ? AffineMap() : AffineMap::get(dimCount, symCount, exprs); - exprs.clear(); - exprs.reserve(ubIndices.size()); + SmallVector ubExprs; + ubExprs.reserve(ubIndices.size() + eqIndices.size()); // Upper bound expressions. for (auto idx : ubIndices) { auto ineq = getInequality(idx); @@ -1436,11 +1438,37 @@ addCoeffs(ineq, ub); auto expr = getAffineExprFromFlatForm(ub, dimCount, symCount, localExprs, context); + expr = expr.floorDiv(std::abs(ineq[pos + offset])); + // Upper bound is exclusive. + ubExprs.push_back(expr + 1); + } + + // Equalities. It's both a lower and a upper bound. + SmallVector b; + for (auto idx : eqIndices) { + auto eq = getEquality(idx); + addCoeffs(eq, b); + if (eq[pos + offset] > 0) + std::transform(b.begin(), b.end(), b.begin(), std::negate()); + + // Extract the upper bound (in terms of other coeff's + const). + auto expr = + getAffineExprFromFlatForm(b, dimCount, symCount, localExprs, context); + expr = expr.floorDiv(std::abs(eq[pos + offset])); // Upper bound is exclusive. - exprs.push_back(expr + 1); + ubExprs.push_back(expr + 1); + // Lower bound. + expr = + getAffineExprFromFlatForm(b, dimCount, symCount, localExprs, context); + expr = expr.ceilDiv(std::abs(eq[pos + offset])); + lbExprs.push_back(expr); } - auto ubMap = - exprs.empty() ? AffineMap() : AffineMap::get(dimCount, symCount, exprs); + + auto lbMap = lbExprs.empty() ? AffineMap() + : AffineMap::get(dimCount, symCount, lbExprs); + + auto ubMap = ubExprs.empty() ? AffineMap() + : AffineMap::get(dimCount, symCount, ubExprs); return {lbMap, ubMap}; } @@ -1583,7 +1611,7 @@ tmpClone->removeRedundantInequalities(); } std::tie(lbMap, ubMap) = tmpClone->getLowerAndUpperBound( - pos, offset, num, getNumDimIds(), {}, context); + pos, offset, num, getNumDimIds(), /*localExprs=*/{}, context); } // If the above fails, we'll just use the constant lower bound and the @@ -2627,6 +2655,30 @@ } } // namespace +// Returns constraints that are common to both A & B. +static void getCommonConstraints(const FlatAffineConstraints &A, + const FlatAffineConstraints &B, + FlatAffineConstraints &C) { + C.reset(A.getNumDimIds(), A.getNumSymbolIds(), A.getNumLocalIds()); + // A naive O(n^2) check should be enough here given the input sizes. 
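The O(n^2) scan above simply keeps every constraint row of A that also appears verbatim in B. A small self-contained sketch of the same idea over plain coefficient vectors (hypothetical helper, not the FlatAffineConstraints API; the row encoding here is just [coefficient, constant] for coeff*i + constant >= 0):

#include <cstdint>
#include <iostream>
#include <vector>

using Row = std::vector<int64_t>;

// Returns the rows present in both systems. A naive pairwise comparison is
// fine for the small constraint counts involved.
static std::vector<Row> commonRows(const std::vector<Row> &a,
                                   const std::vector<Row> &b) {
  std::vector<Row> common;
  for (const Row &ra : a)
    for (const Row &rb : b)
      if (ra == rb) {  // exact coefficient-wise match
        common.push_back(ra);
        break;
      }
  return common;
}

int main() {
  std::vector<Row> a = {{1, 0}, {-1, 7}};   // i >= 0, i <= 7
  std::vector<Row> b = {{1, 0}, {-1, 15}};  // i >= 0, i <= 15
  std::cout << commonRows(a, b).size() << "\n";  // 1 (only "i >= 0" is shared)
  return 0;
}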
+ for (unsigned r = 0, e = A.getNumInequalities(); r < e; ++r) { + for (unsigned s = 0, f = B.getNumInequalities(); s < f; ++s) { + if (A.getInequality(r) == B.getInequality(s)) { + C.addInequality(A.getInequality(r)); + break; + } + } + } + for (unsigned r = 0, e = A.getNumEqualities(); r < e; ++r) { + for (unsigned s = 0, f = B.getNumEqualities(); s < f; ++s) { + if (A.getEquality(r) == B.getEquality(s)) { + C.addEquality(A.getEquality(r)); + break; + } + } + } +} + // Computes the bounding box with respect to 'other' by finding the min of the // lower bounds and the max of the upper bounds along each of the dimensions. LogicalResult @@ -2639,13 +2691,19 @@ assert(otherCst.getNumLocalIds() == 0 && "local ids not supported here"); assert(getNumLocalIds() == 0 && "local ids not supported yet here"); + // Align `other` to this. Optional otherCopy; if (!areIdsAligned(*this, otherCst)) { otherCopy.emplace(FlatAffineConstraints(otherCst)); mergeAndAlignIds(/*offset=*/numDims, this, &otherCopy.getValue()); } - const auto &other = otherCopy ? *otherCopy : otherCst; + const auto &otherAligned = otherCopy ? *otherCopy : otherCst; + + // Get the constraints common to both systems; these will be added as is to + // the union. + FlatAffineConstraints commonCst; + getCommonConstraints(*this, otherAligned, commonCst); std::vector> boundingLbs; std::vector> boundingUbs; @@ -2668,7 +2726,7 @@ // TODO(bondhugula): handle union if a dimension is unbounded. return failure(); - auto otherExtent = other.getConstantBoundOnDimSize( + auto otherExtent = otherAligned.getConstantBoundOnDimSize( d, &otherLb, &otherLbFloorDivisor, &otherUb); if (!otherExtent.hasValue() || lbFloorDivisor != otherLbFloorDivisor) // TODO(bondhugula): symbolic extents when necessary. @@ -2690,7 +2748,7 @@ } else { // Uncomparable - check for constant lower/upper bounds. auto constLb = getConstantLowerBound(d); - auto constOtherLb = other.getConstantLowerBound(d); + auto constOtherLb = otherAligned.getConstantLowerBound(d); if (!constLb.hasValue() || !constOtherLb.hasValue()) return failure(); std::fill(minLb.begin(), minLb.end(), 0); @@ -2706,7 +2764,7 @@ } else { // Uncomparable - check for constant lower/upper bounds. auto constUb = getConstantUpperBound(d); - auto constOtherUb = other.getConstantUpperBound(d); + auto constOtherUb = otherAligned.getConstantUpperBound(d); if (!constUb.hasValue() || !constOtherUb.hasValue()) return failure(); std::fill(maxUb.begin(), maxUb.end(), 0); @@ -2736,6 +2794,11 @@ addInequality(boundingLbs[d]); addInequality(boundingUbs[d]); } + + // Add the constraints that were common to both systems. + append(commonCst); + removeTrivialRedundancy(); + // TODO(mlir-team): copy over pure symbolic constraints from this and 'other' // over to the union (since the above are just the union along dimensions); we // shouldn't be discarding any other constraints on the symbols. diff --git a/mlir/lib/Analysis/Utils.cpp b/mlir/lib/Analysis/Utils.cpp --- a/mlir/lib/Analysis/Utils.cpp +++ b/mlir/lib/Analysis/Utils.cpp @@ -64,7 +64,6 @@ assert(cst->containsId(value) && "value expected to be present"); if (isValidSymbol(value)) { // Check if the symbol is a constant. 
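For intuition on why unionBoundingBox now appends the constraints common to both systems: the box itself only keeps the min of lower bounds and the max of upper bounds per dimension, so any relationship between identifiers that both regions share would otherwise be discarded. A toy 1-step numeric sketch with assumed constant bounds, purely illustrative:

#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  // Two regions over (i, j); assume both systems also carry the row i + j <= 100.
  // Region A: i in [0, 60], j in [0, 40]; region B: i in [0, 40], j in [0, 60].
  int64_t aLbI = 0, aUbI = 60, aLbJ = 0, aUbJ = 40;
  int64_t bLbI = 0, bUbI = 40, bLbJ = 0, bUbJ = 60;

  // Bounding box of the union: min of lower bounds, max of upper bounds.
  int64_t uLbI = std::min(aLbI, bLbI), uUbI = std::max(aUbI, bUbI);
  int64_t uLbJ = std::min(aLbJ, bLbJ), uUbJ = std::max(aUbJ, bUbJ);
  std::cout << "box: i in [" << uLbI << ", " << uUbI << "], j in [" << uLbJ
            << ", " << uUbJ << "]\n"; // i in [0, 60], j in [0, 60]

  // The box alone admits (60, 60), a point neither region contains; appending
  // the constraint common to both systems rules it out again.
  std::cout << "corner satisfies i + j <= 100: "
            << ((uUbI + uUbJ <= 100) ? "yes" : "no") << "\n"; // no
  return 0;
}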
- if (auto cOp = dyn_cast_or_null(value.getDefiningOp())) cst->setIdToConstant(value, cOp.getValue()); } else if (auto loop = getForInductionVarOwner(value)) { @@ -103,6 +102,20 @@ assert(rank == cst.getNumDimIds() && "inconsistent memref region"); + // Use a copy of the region constraints that has upper/lower bounds for each + // memref dimension with static size added to guard against potential + // over-approximation from projection or union bounding box. We may not add + // this on the region itself since they might just be redundant constraints + // that will need non-trivials means to eliminate. + FlatAffineConstraints cstWithShapeBounds(cst); + for (unsigned r = 0; r < rank; r++) { + cstWithShapeBounds.addConstantLowerBound(r, 0); + int64_t dimSize = memRefType.getDimSize(r); + if (ShapedType::isDynamic(dimSize)) + continue; + cstWithShapeBounds.addConstantUpperBound(r, dimSize - 1); + } + // Find a constant upper bound on the extent of this memref region along each // dimension. int64_t numElements = 1; @@ -110,7 +123,8 @@ int64_t lbDivisor; for (unsigned d = 0; d < rank; d++) { SmallVector lb; - Optional diff = cst.getConstantBoundOnDimSize(d, &lb, &lbDivisor); + Optional diff = + cstWithShapeBounds.getConstantBoundOnDimSize(d, &lb, &lbDivisor); if (diff.hasValue()) { diffConstant = diff.getValue(); assert(lbDivisor > 0); @@ -122,7 +136,7 @@ return None; diffConstant = dimSize; // Lower bound becomes 0. - lb.resize(cst.getNumSymbolIds() + 1, 0); + lb.resize(cstWithShapeBounds.getNumSymbolIds() + 1, 0); lbDivisor = 1; } numElements *= diffConstant; @@ -138,6 +152,25 @@ return numElements; } +void MemRefRegion::getLowerAndUpperBound(unsigned pos, AffineMap &lbMap, + AffineMap &ubMap) const { + assert(pos < cst.getNumDimIds() && "invalid position"); + auto memRefType = memref.getType().cast(); + unsigned rank = memRefType.getRank(); + + assert(rank == cst.getNumDimIds() && "inconsistent memref region"); + + auto boundPairs = cst.getLowerAndUpperBound( + pos, /*offset=*/0, /*num=*/rank, cst.getNumDimAndSymbolIds(), + /*localExprs=*/{}, memRefType.getContext()); + lbMap = boundPairs.first; + ubMap = boundPairs.second; + assert(lbMap && "lower bound for a region must exist"); + assert(ubMap && "upper bound for a region must exist"); + assert(lbMap.getNumInputs() == cst.getNumDimAndSymbolIds() - rank); + assert(ubMap.getNumInputs() == cst.getNumDimAndSymbolIds() - rank); +} + LogicalResult MemRefRegion::unionBoundingBox(const MemRefRegion &other) { assert(memref == other.memref); return cst.unionBoundingBox(*other.getConstraints()); @@ -304,6 +337,7 @@ cst.addConstantUpperBound(r, dimSize - 1); } } + cst.removeTrivialRedundancy(); LLVM_DEBUG(llvm::dbgs() << "Memory region:\n"); LLVM_DEBUG(cst.dump()); diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineDataCopyGeneration.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineDataCopyGeneration.cpp --- a/mlir/lib/Dialect/Affine/Transforms/AffineDataCopyGeneration.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/AffineDataCopyGeneration.cpp @@ -21,8 +21,9 @@ #include "mlir/Analysis/Utils.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" -#include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/Dialect/Affine/Passes.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/IR/PatternMatch.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/LoopUtils.h" #include "llvm/ADT/MapVector.h" @@ -264,7 +265,19 @@ runOnBlock(&block, copyNests); // Promote any single iteration loops in the copy nests. 
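Back in the Utils.cpp hunk above, a copy of the region constraints is intersected with 0 <= d_r < dimSize(r) before constant sizes are computed, so over-approximation from projection or a union bounding box cannot inflate the footprint beyond the memref itself. A minimal numeric sketch of that clamping with assumed values, not the FlatAffineConstraints API:

#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  // Suppose bound analysis over-approximated a region of a memref<256xf32>
  // along dimension 0 as [200, 300), e.g. after a union bounding box.
  int64_t lb = 200, ubExclusive = 300;
  const int64_t dimSize = 256;  // static memref dimension size

  // Intersect with the shape bounds 0 <= d < dimSize, as the patch does on a
  // copy of the region constraints.
  int64_t clampedLb = std::max<int64_t>(lb, 0);
  int64_t clampedUb = std::min(ubExclusive, dimSize);

  std::cout << "extent without clamping: " << (ubExclusive - lb) << "\n";      // 100
  std::cout << "extent with clamping:    " << (clampedUb - clampedLb) << "\n"; // 56
  return 0;
}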
- for (auto nest : copyNests) { + for (auto nest : copyNests) nest->walk([](AffineForOp forOp) { promoteIfSingleIteration(forOp); }); + + // Promoting single iteration loops could lead to simplification of + // load's/store's. We will run canonicalization patterns on load/stores. + // TODO: this whole function load/store canonicalization should be replaced by + // canonicalization that is limited to only the load/store ops + // introduced/touched by this pass (those inside 'copyNests'). This would be + // possible once the necessary support is available in the pattern rewriter. + if (!copyNests.empty()) { + OwningRewritePatternList patterns; + AffineLoadOp::getCanonicalizationPatterns(patterns, &getContext()); + AffineStoreOp::getCanonicalizationPatterns(patterns, &getContext()); + applyPatternsGreedily(f, std::move(patterns)); } } diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp --- a/mlir/lib/Transforms/Utils/LoopUtils.cpp +++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp @@ -1458,58 +1458,79 @@ } /// Generates a point-wise copy from/to `memref' to/from `fastMemRef' and -/// returns the outermost AffineForOp of the copy loop nest. `memIndicesStart' -/// holds the lower coordinates of the region in the original memref to copy -/// in/out. If `copyOut' is true, generates a copy-out; otherwise a copy-in. -static AffineForOp generatePointWiseCopy(Location loc, Value memref, - Value fastMemRef, - AffineMap memAffineMap, - ArrayRef memIndicesStart, - ArrayRef fastBufferShape, - bool isCopyOut, OpBuilder b) { - assert(!memIndicesStart.empty() && "only 1-d or more memrefs"); - - // The copy-in nest is generated as follows as an example for a 2-d region: - // for x = ... - // for y = ... - // fast_buf[x][y] = buf[mem_x + x][mem_y + y] - - SmallVector fastBufIndices, memIndices; +/// returns the outermost AffineForOp of the copy loop nest. `lbMaps` and +/// `ubMaps` along with `lbOperands` and `ubOperands` hold the lower and upper +/// bound information for the copy loop nest. `fastBufOffsets` contain the +/// expressions to be subtracted out from the respective copy loop iterators in +/// order to index the fast buffer. If `copyOut' is true, generates a copy-out; +/// otherwise a copy-in. Builder `b` should be set to the point the copy nest is +/// inserted. +// +/// The copy-in nest is generated as follows as an example for a 2-d region: +/// for x = ... +/// for y = ... 
+/// fast_buf[x - offset_x][y - offset_y] = memref[x][y] +/// +static AffineForOp +generatePointWiseCopy(Location loc, Value memref, Value fastMemRef, + ArrayRef lbMaps, ArrayRef lbOperands, + ArrayRef ubMaps, ArrayRef ubOperands, + ArrayRef fastBufOffsets, bool isCopyOut, + OpBuilder b) { + assert(llvm::all_of(lbMaps, [&](AffineMap lbMap) { + return lbMap.getNumInputs() == lbOperands.size(); + })); + assert(llvm::all_of(ubMaps, [&](AffineMap ubMap) { + return ubMap.getNumInputs() == ubOperands.size(); + })); + + unsigned rank = memref.getType().cast().getRank(); + assert(lbMaps.size() == rank && "wrong number of lb maps"); + assert(ubMaps.size() == rank && "wrong number of ub maps"); + + SmallVector memIndices; + SmallVector fastBufExprs; + SmallVector fastBufMapOperands; AffineForOp copyNestRoot; - for (unsigned d = 0, e = fastBufferShape.size(); d < e; ++d) { - auto forOp = b.create(loc, 0, fastBufferShape[d]); + for (unsigned d = 0; d < rank; ++d) { + auto forOp = createCanonicalizedAffineForOp(b, loc, lbOperands, lbMaps[d], + ubOperands, ubMaps[d]); if (d == 0) copyNestRoot = forOp; + b = forOp.getBodyBuilder(); - fastBufIndices.push_back(forOp.getInductionVar()); - - Value memBase = - (memAffineMap == b.getMultiDimIdentityMap(memAffineMap.getNumDims())) - ? memIndicesStart[d] - : b.create( - loc, - AffineMap::get(memAffineMap.getNumDims(), - memAffineMap.getNumSymbols(), - memAffineMap.getResult(d)), - memIndicesStart); - - // Construct the subscript for the slow memref being copied. - auto memIndex = b.create( - loc, - AffineMap::get(2, 0, b.getAffineDimExpr(0) + b.getAffineDimExpr(1)), - ValueRange({memBase, forOp.getInductionVar()})); - memIndices.push_back(memIndex); + + auto fastBufOffsetMap = + AffineMap::get(lbOperands.size(), 0, {fastBufOffsets[d]}); + auto offset = b.create(loc, fastBufOffsetMap, lbOperands); + + // Construct the subscript for the fast memref being copied into/from: + // x - offset_x. + fastBufExprs.push_back(b.getAffineDimExpr(2 * d + 1) - + b.getAffineDimExpr(2 * d)); + fastBufMapOperands.push_back(offset); + fastBufMapOperands.push_back(forOp.getInductionVar()); + + // Subscript for the slow memref being copied. + memIndices.push_back(forOp.getInductionVar()); } + auto fastBufMap = AffineMap::get(2 * rank, /*symbolCount=*/0, fastBufExprs); + fullyComposeAffineMapAndOperands(&fastBufMap, &fastBufMapOperands); + fastBufMap = simplifyAffineMap(fastBufMap); + canonicalizeMapAndOperands(&fastBufMap, &fastBufMapOperands); + if (!isCopyOut) { // Copy in. auto load = b.create(loc, memref, memIndices); - b.create(loc, load, fastMemRef, fastBufIndices); + b.create(loc, load, fastMemRef, fastBufMap, + fastBufMapOperands); return copyNestRoot; } // Copy out. - auto load = b.create(loc, fastMemRef, fastBufIndices); + auto load = + b.create(loc, fastMemRef, fastBufMap, fastBufMapOperands); b.create(loc, load, memref, memIndices); return copyNestRoot; } @@ -1600,6 +1621,10 @@ return success(); } + SmallVector lbMaps(rank), ubMaps(rank); + for (unsigned i = 0; i < rank; ++i) + region.getLowerAndUpperBound(i, lbMaps[i], ubMaps[i]); + const FlatAffineConstraints *cst = region.getConstraints(); // 'regionSymbols' hold values that this memory region is symbolic/parametric // on; these typically include loop IVs surrounding the level at which the @@ -1613,15 +1638,14 @@ // along the corresponding dimension. // Index start offsets for faster memory buffer relative to the original. 
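The rewritten generatePointWiseCopy iterates directly over the original memref's index space, [lb, ub) per dimension, and indexes the fast buffer at the induction variable minus the region's lower-bound offset, exactly as the new doc comment's fast_buf[x - offset_x][y - offset_y] = memref[x][y] example shows. A scalar C++ sketch of that structure for a 2-D region, with made-up bounds and offsets:

#include <cstdint>
#include <vector>

int main() {
  // Original "memref" and a fast buffer sized to the copied region.
  const int64_t M = 8, N = 8;
  std::vector<float> slowBuf(M * N, 1.0f);

  // Region to copy: rows [2, 6), cols [3, 7)  ->  4x4 fast buffer.
  const int64_t lb0 = 2, ub0 = 6, lb1 = 3, ub1 = 7;
  std::vector<float> fastBuf((ub0 - lb0) * (ub1 - lb1));

  // Copy-in nest: iterate over the original index space and subtract the
  // per-dimension offsets when addressing the fast buffer, i.e.
  // fast_buf[x - offset_x][y - offset_y] = memref[x][y].
  for (int64_t x = lb0; x < ub0; ++x)
    for (int64_t y = lb1; y < ub1; ++y)
      fastBuf[(x - lb0) * (ub1 - lb1) + (y - lb1)] = slowBuf[x * N + y];
  return 0;
}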
- SmallVector offsets; - offsets.reserve(rank); + SmallVector fastBufOffsets; + fastBufOffsets.reserve(rank); for (unsigned d = 0; d < rank; d++) { assert(lbs[d].size() == cst->getNumCols() - rank && "incorrect bound size"); AffineExpr offset = top.getAffineConstantExpr(0); - for (unsigned j = 0, e = cst->getNumCols() - rank - 1; j < e; j++) { + for (unsigned j = 0, e = cst->getNumCols() - rank - 1; j < e; j++) offset = offset + lbs[d][j] * top.getAffineDimExpr(j); - } assert(lbDivisors[d] > 0); offset = (offset + lbs[d][cst->getNumCols() - 1 - rank]).floorDiv(lbDivisors[d]); @@ -1648,7 +1672,7 @@ // Record the offsets since they are needed to remap the memory accesses of // the original memref further below. - offsets.push_back(offset); + fastBufOffsets.push_back(offset); } // The faster memory space buffer. @@ -1716,9 +1740,11 @@ if (!copyOptions.generateDma) { // Point-wise copy generation. - auto copyNest = generatePointWiseCopy(loc, memref, fastMemRef, memAffineMap, - memIndices, fastBufferShape, - /*isCopyOut=*/region.isWrite(), b); + auto copyNest = + generatePointWiseCopy(loc, memref, fastMemRef, lbMaps, + /*lbOperands=*/regionSymbols, ubMaps, + /*ubOperands=*/regionSymbols, fastBufOffsets, + /*isCopyOut=*/region.isWrite(), b); // Record this so that we can skip it from yet another copy. copyNests.insert(copyNest); @@ -1790,7 +1816,7 @@ // which the memref region is parametric); then those corresponding to // the memref's original indices follow. auto dimExpr = b.getAffineDimExpr(regionSymbols.size() + i); - remapExprs.push_back(dimExpr - offsets[i]); + remapExprs.push_back(dimExpr - fastBufOffsets[i]); } auto indexRemap = AffineMap::get(regionSymbols.size() + rank, 0, remapExprs); @@ -1925,7 +1951,8 @@ // Compute the MemRefRegion accessed. auto region = std::make_unique(opInst->getLoc()); - if (failed(region->compute(opInst, copyDepth))) { + if (failed(region->compute(opInst, copyDepth, /*sliceState=*/nullptr, + /*addMemRefDimBounds=*/false))) { LLVM_DEBUG(llvm::dbgs() << "Error obtaining memory region: semi-affine maps?\n"); LLVM_DEBUG(llvm::dbgs() << "over-approximating to the entire memref\n"); @@ -2051,7 +2078,7 @@ if (totalCopyBuffersSizeInBytes > copyOptions.fastMemCapacityBytes) { StringRef str = "Total size of all copy buffers' for this block " "exceeds fast memory capacity\n"; - block->getParentOp()->emitError(str); + block->getParentOp()->emitWarning(str); } return totalCopyBuffersSizeInBytes; diff --git a/mlir/test/Dialect/Affine/affine-data-copy.mlir b/mlir/test/Dialect/Affine/affine-data-copy.mlir --- a/mlir/test/Dialect/Affine/affine-data-copy.mlir +++ b/mlir/test/Dialect/Affine/affine-data-copy.mlir @@ -17,12 +17,9 @@ #id = affine_map<(d0) -> (d0)> #ub = affine_map<(d0) -> (d0 + 128)> -// Map used to index the original memref while copying. -// CHECK-DAG: [[MEM_IDX_MAP:map[0-9]+]] = affine_map<(d0, d1) -> (d0 + d1)> // Map used to index the buffer while computing. // CHECK-DAG: [[MAP_IDENTITY:map[0-9]+]] = affine_map<(d0) -> (d0)> // CHECK-DAG: [[MAP_PLUS_128:map[0-9]+]] = affine_map<(d0) -> (d0 + 128)> -// CHECK-DAG: [[BUF_IDX_MAP:map[0-9]+]] = affine_map<(d0, d1, d2, d3) -> (-d0 + d2, -d1 + d3)> // CHECK-LABEL: func @matmul // FILTER-LABEL: func @matmul @@ -50,41 +47,34 @@ // Buffers of size 128x128 get created here for all three matrices. 
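The fastBufOffsets computed above are the per-dimension lower bounds of the region (after the floorDiv by the bound's divisor), and the original accesses are remapped to dimExpr - fastBufOffsets[i]. For the 128x128 tiled matmul test that follows, a quick standalone check (scalar sketch, assumed tile geometry) that the remapped indices always fit in the 128-wide buffer:

#include <cassert>
#include <cstdint>
#include <iostream>

int main() {
  // Along a tiled dimension at tile index `it`, the region is
  // [128*it, 128*it + 128); its lower bound is the fast-buffer offset.
  for (int64_t it = 0; it < 4096 / 128; ++it) {
    int64_t offset = 128 * it;  // region lower bound = fast buffer offset
    for (int64_t i = offset; i < offset + 128; ++i)
      assert(i - offset >= 0 && i - offset < 128);  // remapped buffer index
  }
  std::cout << "all remapped indices fit in a 128-wide fast buffer\n";
  return 0;
}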
-// CHECK: affine.for %{{.*}} = 0 to 4096 step 128 { -// CHECK: affine.for %{{.*}} = 0 to 4096 step 128 { +// CHECK: affine.for %[[I:.*]] = 0 to 4096 step 128 { +// CHECK: affine.for %[[J:.*]] = 0 to 4096 step 128 { // CHECK: [[BUFC:%[0-9]+]] = alloc() : memref<128x128xf32> - // The result matrix's copy gets hoisted out. // Result matrix copy-in. -// CHECK: affine.for %{{.*}} = 0 to 128 { -// CHECK: affine.apply #[[MEM_IDX_MAP]](%{{.*}}, %{{.*}}) -// CHECK: affine.for %{{.*}} = 0 to 128 { -// CHECK: affine.apply #[[MEM_IDX_MAP]](%{{.*}}, %{{.*}}) +// CHECK: affine.for %[[II:.*]] = #[[MAP_IDENTITY]](%{{.*}}) to #[[MAP_PLUS_128]](%{{.*}}) { +// CHECK: affine.for %[[JJ:.*]] = #[[MAP_IDENTITY]](%{{.*}}) to #[[MAP_PLUS_128]](%{{.*}}) { // CHECK: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<4096x4096xf32> -// CHECK: affine.store %{{.*}}, [[BUFC]][%{{.*}}, %{{.*}}] : memref<128x128xf32> +// CHECK: affine.store %{{.*}}, [[BUFC]][-%[[I]] + %[[II]], -%[[J]] + %[[JJ]]] : memref<128x128xf32> // CHECK: } // CHECK: } // LHS matrix copy-in. -// CHECK: affine.for %{{.*}} = 0 to 4096 step 128 { +// CHECK: affine.for %[[K:.*]] = 0 to 4096 step 128 { // CHECK: [[BUFA:%[0-9]+]] = alloc() : memref<128x128xf32> -// CHECK: affine.for %{{.*}} = 0 to 128 { -// CHECK: affine.apply #[[MEM_IDX_MAP]](%{{.*}}, %{{.*}}) -// CHECK: affine.for %{{.*}} = 0 to 128 { -// CHECK: affine.apply #[[MEM_IDX_MAP]](%{{.*}}, %{{.*}}) +// CHECK: affine.for %[[II:.*]] = #[[MAP_IDENTITY]](%{{.*}}) to #[[MAP_PLUS_128]](%{{.*}}) { +// CHECK: affine.for %[[KK:.*]] = #[[MAP_IDENTITY]](%{{.*}}) to #[[MAP_PLUS_128]](%{{.*}}) { // CHECK: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<4096x4096xf32> -// CHECK: affine.store %{{.*}}, [[BUFA]][%{{.*}}, %{{.*}}] : memref<128x128xf32> +// CHECK: affine.store %{{.*}}, [[BUFA]][-%[[I]] + %[[II]], -%[[K]] + %[[KK]]] : memref<128x128xf32> // CHECK: } // CHECK: } // RHS matrix copy-in. // CHECK: [[BUFB:%[0-9]+]] = alloc() : memref<128x128xf32> -// CHECK: affine.for %{{.*}} = 0 to 128 { -// CHECK: affine.apply #[[MEM_IDX_MAP]](%{{.*}}, %{{.*}}) -// CHECK: affine.for %{{.*}} = 0 to 128 { -// CHECK: affine.apply #[[MEM_IDX_MAP]](%{{.*}}, %{{.*}}) +// CHECK: affine.for %[[KK:.*]] = #[[MAP_IDENTITY]](%{{.*}}) to #[[MAP_PLUS_128]](%{{.*}}) { +// CHECK: affine.for %[[JJ:.*]] = #[[MAP_IDENTITY]](%{{.*}}) to #[[MAP_PLUS_128]](%{{.*}}) { // CHECK: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<4096x4096xf32> -// CHECK: affine.store %{{.*}}, [[BUFB]][%{{.*}}, %{{.*}}] : memref<128x128xf32> +// CHECK: affine.store %{{.*}}, [[BUFB]][-%[[K]] + %[[KK]], -%[[J]] + %[[JJ]]] : memref<128x128xf32> // CHECK: } // CHECK: } @@ -104,16 +94,12 @@ // CHECK: dealloc [[BUFB]] : memref<128x128xf32> // CHECK: dealloc [[BUFA]] : memref<128x128xf32> // CHECK: } -// CHECK: affine.apply #map{{.*}}(%{{.*}}, %{{.*}}) -// CHECK: affine.apply #map{{.*}}(%{{.*}}, %{{.*}}) // Result matrix copy out. 
-// CHECK: affine.for %{{.*}} = 0 to 128 { -// CHECK: affine.apply #[[MEM_IDX_MAP]](%{{.*}}, %{{.*}}) -// CHECK: affine.for %{{.*}} = 0 to 128 { -// CHECK: affine.apply #[[MEM_IDX_MAP]](%{{.*}}, %{{.*}}) -// CHECK: [[BUFA]] = affine.load [[BUFC]][%{{.*}}, %{{.*}}] : memref<128x128xf32> -// CHECK: store [[BUFA]], %{{.*}}[%{{.*}}, %{{.*}}] : memref<4096x4096xf32> +// CHECK: affine.for %{{.*}} = #[[MAP_IDENTITY]](%{{.*}}) to #[[MAP_PLUS_128]](%{{.*}}) { +// CHECK: affine.for %{{.*}} = #[[MAP_IDENTITY]](%{{.*}}) to #[[MAP_PLUS_128]](%{{.*}}) { +// CHECK: affine.load [[BUFC]][-%{{.*}} + %{{.*}}, -%{{.*}} + %{{.*}}] : memref<128x128xf32> +// CHECK: store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<4096x4096xf32> // CHECK: } // CHECK: } // CHECK: dealloc [[BUFC]] : memref<128x128xf32> @@ -125,15 +111,15 @@ // FILTER: affine.for %{{.*}} = 0 to 4096 step 128 { // FILTER: alloc() : memref<128x4096xf32> // FILTER-NOT: alloc() -// FILTER: affine.for %{{.*}} = 0 to 128 { +// FILTER: affine.for // FILTER: affine.for %{{.*}} = 0 to 4096 { // FILTER: affine.for %{{.*}} = 0 to 4096 step 128 { // FILTER-NEXT: affine.for %{{.*}} = 0 to 4096 step 128 { // FILTER-NEXT: affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) { // FILTER-NEXT: affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) { // FILTER-NEXT: affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) { -// FILTER: dealloc %1 : memref<128x4096xf32> -// FILTER-NOT: dealloc %1 : memref<128x4096xf32> +// FILTER: dealloc %{{.*}} : memref<128x4096xf32> +// FILTER-NOT: dealloc %{{.*}} : memref<128x4096xf32> // ----- @@ -141,10 +127,10 @@ // This test case will lead to single element buffers. These are eventually // expected to be turned into registers via alloca and mem2reg. // -// CHECK-SMALL-LABEL: func @foo -// FILTER-LABEL: func @foo -// MEMREF_REGION-LABEL: func @foo -func @foo(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: memref<1024x1024xf32>) -> memref<1024x1024xf32> { +// CHECK-SMALL-LABEL: func @single_elt_buffers +// FILTER-LABEL: func @single_elt_buffers +// MEMREF_REGION-LABEL: func @single_elt_buffers +func @single_elt_buffers(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: memref<1024x1024xf32>) -> memref<1024x1024xf32> { affine.for %i = 0 to 1024 { affine.for %j = 0 to 1024 { affine.for %k = 0 to 1024 { @@ -159,32 +145,20 @@ } // CHECK-SMALL: affine.for %arg{{.*}} = 0 to 1024 { // CHECK-SMALL: affine.for %arg{{.*}} = 0 to 1024 { -// CHECK-SMALL: affine.apply #map{{.*}}(%arg{{.*}}, %arg{{.*}}) -// CHECK-SMALL: affine.apply #map{{.*}}(%arg{{.*}}, %arg{{.*}}) // CHECK-SMALL: alloc() : memref<1x1xf32> -// CHECK-SMALL: affine.apply #map{{.*}}(%arg{{.*}}, %c0{{.*}}) -// CHECK-SMALL: affine.apply #map{{.*}}(%arg{{.*}}, %c0{{.*}}) // CHECK-SMALL: affine.load %arg{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32> -// CHECK-SMALL: affine.store %{{.*}}, %{{.*}}[%c0{{.*}}, %c0{{.*}}] : memref<1x1xf32> +// CHECK-SMALL: affine.store %{{.*}}, %{{.*}}[0, 0] : memref<1x1xf32> // CHECK-SMALL: affine.for %arg{{.*}} = 0 to 1024 { -// CHECK-SMALL: affine.apply #map{{.*}}(%arg{{.*}}, %arg{{.*}}) -// CHECK-SMALL: affine.apply #map{{.*}}(%arg{{.*}}, %arg{{.*}}) // CHECK-SMALL: alloc() : memref<1x1xf32> -// CHECK-SMALL: affine.apply #map{{.*}}(%arg{{.*}}, %c0{{.*}}) -// CHECK-SMALL: affine.apply #map{{.*}}(%arg{{.*}}, %c0{{.*}}) // CHECK-SMALL: affine.load %arg{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32> -// CHECK-SMALL: affine.store %{{.*}}, %{{.*}}[%c0{{.*}}, %c0{{.*}}] : memref<1x1xf32> 
+// CHECK-SMALL: affine.store %{{.*}}, %{{.*}}[0, 0] : memref<1x1xf32> // CHECK-SMALL: affine.load %{{.*}}[0, 0] : memref<1x1xf32> // CHECK-SMALL: affine.load %{{.*}}[0, 0] : memref<1x1xf32> // CHECK-SMALL: addf %{{.*}}, %{{.*}} : f32 // CHECK-SMALL: affine.store %{{.*}}, %{{.*}}[0, 0] : memref<1x1xf32> // CHECK-SMALL: dealloc %{{.*}} : memref<1x1xf32> // CHECK-SMALL: } -// CHECK-SMALL: affine.apply #map{{.*}}(%arg{{.*}}, %arg{{.*}}) -// CHECK-SMALL: affine.apply #map{{.*}}(%arg{{.*}}, %arg{{.*}}) -// CHECK-SMALL: affine.apply #map{{.*}}(%arg{{.*}}, %c0{{.*}}) -// CHECK-SMALL: affine.apply #map{{.*}}(%arg{{.*}}, %c0{{.*}}) -// CHECK-SMALL: affine.load %{{.*}}[%c0{{.*}}, %c0{{.*}}] : memref<1x1xf32> +// CHECK-SMALL: affine.load %{{.*}}[0, 0] : memref<1x1xf32> // CHECK-SMALL: affine.store %{{.*}}, %arg{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32> // CHECK-SMALL: dealloc %{{.*}} : memref<1x1xf32> // CHECK-SMALL: } @@ -210,8 +184,89 @@ // MEMREF_REGION-NOT: alloc() // MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 { // MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 { -// MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 { +// MEMREF_REGION: } +// MEMREF_REGION: } +// MEMREF_REGION-NEXT: affine.for %{{.*}} = 0 to 1024 { // MEMREF_REGION-NEXT: affine.for %{{.*}} = 0 to 1024 { // MEMREF_REGION-NEXT: affine.for %{{.*}} = 0 to 1024 { // MEMREF_REGION: dealloc %{{.*}} : memref<1024x1024xf32> // MEMREF_REGION-NOT: dealloc +// MEMREF_REGION-NEXT: return + +// ----- + +// This pattern typically appears with tiling with tile sizes that don't divide +// the loop trip counts. + +#map_ub = affine_map<(d0) -> (4096, d0 + 100)> + +// CHECK-DAG: [[MAP_IDENTITY:map[0-9]+]] = affine_map<(d0) -> (d0)> +// CHECK-DAG: [[MAP_MIN_UB1:map[0-9]+]] = affine_map<(d0) -> (d0 + 100, 4096)> +// CHECK-DAG: [[MAP_MIN_UB2:map[0-9]+]] = affine_map<(d0) -> (4096, d0 + 100)> + +// CHECK-LABEL: func @min_upper_bound +func @min_upper_bound(%A: memref<4096xf32>) -> memref<4096xf32> { + affine.for %i = 0 to 4096 step 100 { + affine.for %ii = affine_map<(d0) -> (d0)>(%i) to min #map_ub(%i) { + %5 = affine.load %A[%ii] : memref<4096xf32> + %6 = mulf %5, %5 : f32 + affine.store %6, %A[%ii] : memref<4096xf32> + } + } + return %A : memref<4096xf32> +} +// CHECK: affine.for %[[IV1:.*]] = 0 to 4096 step 100 +// CHECK-NEXT: %[[BUF:.*]] = alloc() : memref<100xf32> +// CHECK-NEXT: affine.for %[[IV2:.*]] = #[[MAP_IDENTITY]](%[[IV1]]) to min #[[MAP_MIN_UB1]](%[[IV1]]) { +// CHECK-NEXT: affine.load %{{.*}}[%[[IV2]]] : memref<4096xf32> +// CHECK-NEXT: affine.store %{{.*}}, %[[BUF]][-%[[IV1]] + %[[IV2]]] : memref<100xf32> +// CHECK-NEXT: } +// CHECK-NEXT: affine.for %[[IV2:.*]] = #[[MAP_IDENTITY]](%[[IV1]]) to min #[[MAP_MIN_UB2]](%[[IV1]]) { +// CHECK-NEXT: affine.load %[[BUF]][-%[[IV1]] + %[[IV2]]] : memref<100xf32> +// CHECK-NEXT: mulf +// CHECK-NEXT: affine.store %{{.*}}, %[[BUF]][-%[[IV1]] + %[[IV2]]] : memref<100xf32> +// CHECK-NEXT: } +// CHECK-NEXT: affine.for %[[IV2:.*]] = #[[MAP_IDENTITY]](%[[IV1]]) to min #[[MAP_MIN_UB1]](%[[IV1]]) { +// CHECK-NEXT: affine.load %[[BUF]][-%[[IV1]] + %[[IV2]]] : memref<100xf32> +// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%[[IV2]]] : memref<4096xf32> +// CHECK-NEXT: } +// CHECK-NEXT: dealloc %[[BUF]] : memref<100xf32> +// CHECK-NEXT: } + +// ----- + +// Lower bound is a max; upper bound is a min. This pattern typically appears +// with multi-level tiling when the tile sizes used don't divide loop trip +// counts. 
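The min_upper_bound test above relies on the fact that for a tile starting at i (step 100) the intra-tile range [i, min(4096, i + 100)) never holds more than 100 elements, so a memref<100xf32> fast buffer indexed at ii - i suffices. A quick standalone check of that arithmetic (plain C++, just verifying the extents assumed by the test):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iostream>

int main() {
  int64_t maxExtent = 0;
  for (int64_t i = 0; i < 4096; i += 100) {
    int64_t ub = std::min<int64_t>(4096, i + 100);  // min upper bound, as in #map_ub
    maxExtent = std::max(maxExtent, ub - i);
    // Every access A[ii] with i <= ii < ub maps to fast-buffer index ii - i,
    // which stays within [0, 100).
    assert(ub - i <= 100);
  }
  std::cout << "max per-tile extent: " << maxExtent << "\n";  // 100
  return 0;
}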
+ +#lb = affine_map<(d0, d1) -> (d0 * 512, d1 * 6)> +#ub = affine_map<(d0, d1) -> (d0 * 512 + 512, d1 * 6 + 6)> + +// CHECK-DAG: #[[LB:.*]] = affine_map<()[s0, s1] -> (s0 * 512, s1 * 6)> +// CHECK-DAG: #[[UB:.*]] = affine_map<()[s0, s1] -> (s0 * 512 + 512, s1 * 6 + 6)> + +// CHECK-LABEL: max_lower_bound(%{{.*}}: memref<2048x516xf64>, +// CHECK-SAME: [[i:arg[0-9]+]] +// CHECK-SAME: [[j:arg[0-9]+]] +func @max_lower_bound(%M: memref<2048x516xf64>, %i : index, %j : index) { + affine.for %ii = 0 to 2048 { + affine.for %jj = max #lb(%i, %j) to min #ub(%i, %j) { + affine.load %M[%ii, %jj] : memref<2048x516xf64> + } + } + return +} + +// CHECK: %[[BUF=.*]] = alloc() : memref<2048x6xf64> +// CHECK-NEXT: affine.for %[[ii:.*]] = 0 to 2048 { +// CHECK-NEXT: affine.for %[[jj:.*]] = max #[[LB]]()[%[[i]], %[[j]]] to min #[[UB]]()[%[[i]], %[[j]]] { +// CHECK-NEXT: affine.load %{{.*}}[%[[ii]], %[[jj]]] : memref<2048x516xf64> +// CHECK-NEXT: affine.store %{{.*}}, %[[BUF]][%[[ii]], %[[jj]] - symbol(%[[j]]) * 6] : memref<2048x6xf64> +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: affine.for %[[ii_:.*]] = 0 to 2048 { +// CHECK-NEXT: affine.for %[[jj_:.*]] = max #[[LB]]()[%{{.*}}, %{{.*}}] to min #[[UB]]()[%{{.*}}, %{{.*}}] { +// CHECK-NEXT: affine.load %[[BUF]][%[[ii_]], %[[jj_]] - symbol(%[[j]]) * 6] : memref<2048x6xf64> +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: dealloc %[[BUF]] : memref<2048x6xf64> diff --git a/mlir/test/Dialect/Affine/dma-generate.mlir b/mlir/test/Dialect/Affine/dma-generate.mlir --- a/mlir/test/Dialect/Affine/dma-generate.mlir +++ b/mlir/test/Dialect/Affine/dma-generate.mlir @@ -13,7 +13,6 @@ // ----- // Index of the buffer for the second DMA is remapped. -// CHECK-DAG: [[MAP_PLUS_256:#map[0-9]+]] = affine_map<(d0) -> (d0 + 256)> // CHECK-DAG: [[MAP0:#map[0-9]+]] = affine_map<(d0) -> (d0)> // CHECK-LABEL: func @loop_nest_1d() { @@ -36,14 +35,13 @@ // Second DMA transfer. // CHECK: affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}} : memref<512xf32>, memref<256xf32, 2>, memref<1xi32> // CHECK-NEXT: affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32> - // CHECK: affine.for %{{.*}} = 0 to 256 { + // CHECK: affine.for %[[IV:.*]] = 0 to 256 { // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<256xf32, 2> - // CHECK: affine.apply [[MAP_PLUS_256]](%{{.*}}) - // Buffer for '%{{.*}}' in faster memref space is smaller size: 256xf32 - // Affine map for 'affine.load %{{.*}}' is composed: %{{.*}} + 256 - 256 = %{{.*}}. - // CHECK-NEXT: %{{.*}} = affine.load %{{.*}}[%{{.*}}] : memref<256xf32, 2> + // Buffer for '%{{.*}}' in faster memref space is of smaller size: 256xf32 + // Affine map for load on B is composed and becomes identity. + // CHECK: affine.load %{{.*}}[%[[IV]]] : memref<256xf32, 2> // Already in faster memory space. 
- // CHECK: affine.load %{{.*}}[%{{.*}}] : memref<256xf32, 2> + // CHECK: affine.load %{{.*}}[%[[IV]]] : memref<256xf32, 2> // CHECK-NEXT: } // CHECK-NEXT: dealloc %{{.*}} : memref<1xi32> // CHECK-NEXT: dealloc %{{.*}} : memref<256xf32, 2> @@ -83,19 +81,16 @@ // CHECK-NEXT: affine.for %{{.*}} = 0 to 32 { // CHECK-NEXT: affine.for %{{.*}} = 0 to 32 { // CHECK-NEXT: affine.for %{{.*}} = 0 to 16 { -// CHECK-NEXT: affine.apply #map{{[0-9]+}}(%{{.*}}, %{{.*}}) -// CHECK-NEXT: %{{.*}} = affine.load [[BUFB]][%{{.*}} * 16 + %{{.*}}, %{{.*}}] : memref<512x32xf32, 2> +// CHECK: affine.load [[BUFB]][%{{.*}} * 16 + %{{.*}}, %{{.*}}] : memref<512x32xf32, 2> // CHECK-NEXT: "foo"(%{{.*}}) : (f32) -> () // CHECK-NEXT: } // CHECK-NEXT: affine.for %{{.*}} = 0 to 16 { -// CHECK-NEXT: affine.apply #map{{[0-9]+}}(%{{.*}}, %{{.*}}) -// CHECK-NEXT: affine.load [[BUFA]][%{{.*}} * 16 + %{{.*}}, %{{.*}}] : memref<512x32xf32, 2> +// CHECK: affine.load [[BUFA]][%{{.*}} * 16 + %{{.*}}, %{{.*}}] : memref<512x32xf32, 2> // CHECK-NEXT: "bar"(%{{.*}}) : (f32) -> () // CHECK-NEXT: } // CHECK-NEXT: affine.for %{{.*}} = 0 to 16 { // CHECK-NEXT: "abc_compute"() : () -> f32 -// CHECK-NEXT: affine.apply #map{{[0-9]+}}(%{{.*}}, %{{.*}}) -// CHECK-NEXT: affine.load [[BUFC]][%{{.*}} * 16 + %{{.*}}, %{{.*}}] : memref<512x32xf32, 2> +// CHECK: affine.load [[BUFC]][%{{.*}} * 16 + %{{.*}}, %{{.*}}] : memref<512x32xf32, 2> // CHECK-NEXT: "addf32"(%{{.*}}, %{{.*}}) : (f32, f32) -> f32 // CHECK-NEXT: affine.store %{{.*}}, [[BUFC]][%{{.*}} * 16 + %{{.*}}, %{{.*}}] : memref<512x32xf32, 2> // CHECK-NEXT: } @@ -155,8 +150,7 @@ // CHECK-LABEL: func @loop_nest_modulo() { // CHECK: alloc() : memref<256x8xf32> // CHECK-NEXT: affine.for %{{.*}} = 0 to 32 step 4 { -// CHECK-NEXT: affine.apply #map{{[0-9]+}}(%{{.*}}) -// CHECK-NEXT: alloc() : memref<1x2xf32, 2> +// CHECK: alloc() : memref<1x2xf32, 2> // CHECK-NEXT: alloc() : memref<1xi32> // Composition of the affine map for '%{{.*}}' causes '%{{.*}}' to be added as a symbol. 
// CHECK-NEXT: affine.dma_start %{{.*}}[%{{.*}}, 0], %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}], %{{.*}} : memref<256x8xf32>, memref<1x2xf32, 2>, memref<1xi32> @@ -231,8 +225,6 @@ // ----- -// CHECK-DAG: [[MAP_SYM_SHIFT:#map[0-9]+]] = affine_map<(d0, d1)[s0, s1] -> (d1 + s0 + s1)> - // CHECK-LABEL: func @dma_with_symbolic_accesses func @dma_with_symbolic_accesses(%A : memref<100x100xf32>, %M : index) { %N = constant 9 : index @@ -247,10 +239,9 @@ // CHECK-NEXT: alloc() : memref<1xi32> // CHECK-NEXT: affine.dma_start %{{.*}}[0, symbol(%{{.*}}) + 9], %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}], %{{.*}} // CHECK-NEXT: affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} -// CHECK-NEXT: affine.for %{{.*}} = 0 to 100 { -// CHECK-NEXT: affine.for %{{.*}} = 0 to 100 { -// CHECK-NEXT: affine.apply [[MAP_SYM_SHIFT]](%{{.*}}, %{{.*}})[%{{.*}}, %{{.*}}] -// CHECK-NEXT: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<100x100xf32, 2> +// CHECK-NEXT: affine.for %[[IV0:.*]] = 0 to 100 { +// CHECK-NEXT: affine.for %[[IV1:.*]] = 0 to 100 { +// CHECK: affine.load %{{.*}}[%[[IV0]], %[[IV1]]] : memref<100x100xf32, 2> // CHECK-NEXT: } // CHECK-NEXT: } // CHECK: return @@ -317,11 +308,6 @@ // ----- -// CHECK-DAG: [[MAP_PLUS_64:#map[0-9]+]] = affine_map<(d0) -> (d0 + 64)> -// CHECK-DAG: [[MAP_PLUS_128:#map[0-9]+]] = affine_map<(d0) -> (d0 + 128)> -// CHECK-DAG: [[MAP_PLUS_2:#map[0-9]+]] = affine_map<(d0) -> (d0 + 2)> -// CHECK-DAG: [[MAP_PLUS_192:#map[0-9]+]] = affine_map<(d0) -> (d0 + 192)> - // The first load accesses ([2,258), [128,384)) // The second load accesses ([64,320), [2,258)) // The first store writes to ([2,258), [192,448)) @@ -359,15 +345,9 @@ // CHECK-NEXT: alloc() : memref<1xi32> // CHECK-NEXT: affine.for %{{.*}} = 0 to 256 { // CHECK-NEXT: affine.for %{{.*}} = 0 to 256 { -// CHECK-NEXT: affine.apply [[MAP_PLUS_64]](%{{.*}}) -// CHECK-NEXT: affine.apply [[MAP_PLUS_128]](%{{.*}}) -// CHECK-NEXT: affine.apply [[MAP_PLUS_2]](%{{.*}}) -// CHECK-NEXT: affine.apply [[MAP_PLUS_2]](%{{.*}}) -// CHECK-NEXT: affine.load %{{.*}}[%{{.*}}, %{{.*}} + 126] : memref<382x446xf32, 2> +// CHECK: affine.load %{{.*}}[%{{.*}}, %{{.*}} + 126] : memref<382x446xf32, 2> // CHECK-NEXT: affine.load %{{.*}}[%{{.*}} + 62, %{{.*}}] : memref<382x446xf32, 2> -// CHECK-NEXT: affine.apply [[MAP_PLUS_128]](%{{.*}}) -// CHECK-NEXT: affine.apply [[MAP_PLUS_192]](%{{.*}}) -// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}} + 190] : memref<382x446xf32, 2> +// CHECK: affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}} + 190] : memref<382x446xf32, 2> // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}} + 126, %{{.*}}] : memref<382x446xf32, 2> // CHECK-NEXT: } // CHECK-NEXT: } @@ -468,9 +448,8 @@ // CHECK: [[BUF:%[0-9]+]] = alloc() : memref<1027xf32, 2> // CHECK-NEXT: [[MEM:%[0-9]+]] = alloc() : memref<1xi32> // CHECK-NEXT: affine.for %{{.*}} = 0 to 1024 { -// CHECK-NEXT: affine.for %{{.*}} = {{#map[0-9]+}}(%{{.*}}) to {{#map[0-9]+}}(%{{.*}}) { -// CHECK-NEXT: constant 0.000000e+00 : f32 -// CHECK-NEXT: affine.store %{{.*}}, [[BUF]][%{{.*}}] : memref<1027xf32, 2> +// CHECK-NEXT: affine.for %[[I2:.*]] = {{#map[0-9]+}}(%{{.*}}) to {{#map[0-9]+}}(%{{.*}}) { +// CHECK: affine.store %{{.*}}, [[BUF]][%[[I2]]] : memref<1027xf32, 2> // CHECK-NEXT: } // CHECK-NEXT: } // CHECK-NEXT: affine.dma_start [[BUF]][%{{.*}}], %{{.*}}[%{{.*}}], [[MEM]][%{{.*}}], %{{.*}} : memref<1027xf32, 2>, memref<1027xf32>, memref<1xi32> @@ -478,9 +457,6 @@ // ----- -// CHECK-DAG: [[MAP_READ_OFFSET:#map[0-9]+]] = affine_map<(d0) -> (d0 + 100)> -// CHECK-DAG: 
[[MAP_WRITE_OFFSET:#map[0-9]+]] = affine_map<(d0) -> (d0 + 25)> - func @test_read_write_region_union() { %0 = alloc() : memref<256xf32> affine.for %i0 = 0 to 10 { @@ -503,9 +479,7 @@ // CHECK-NEXT: affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32> // CHECK-NEXT: alloc() : memref<1xi32> // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 { -// CHECK-NEXT: affine.apply [[MAP_READ_OFFSET]](%{{.*}}) -// CHECK-NEXT: affine.apply [[MAP_WRITE_OFFSET]](%{{.*}}) -// CHECK-NEXT: affine.load %{{.*}}[%{{.*}} + 75] : memref<85xf32, 2> +// CHECK: affine.load %{{.*}}[%{{.*}} + 75] : memref<85xf32, 2> // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<85xf32, 2> // CHECK-NEXT: } // CHECK-NEXT: affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}} : memref<85xf32, 2>, memref<256xf32>, memref<1xi32> diff --git a/mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp b/mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp --- a/mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp +++ b/mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp @@ -13,6 +13,7 @@ #include "mlir/Analysis/Utils.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/IR/PatternMatch.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/LoopUtils.h" #include "mlir/Transforms/Passes.h" @@ -76,8 +77,8 @@ /*fastMemorySpace=*/0, /*tagMemorySpace=*/0, /*fastMemCapacityBytes=*/32 * 1024 * 1024UL}; + DenseSet copyNests; if (clMemRefFilter) { - DenseSet copyNests; affineDataCopyGenerate(loopNest, copyOptions, load.getMemRef(), copyNests); } else if (clTestGenerateCopyForMemRegion) { CopyGenerateResult result; @@ -85,6 +86,17 @@ region.compute(load, /*loopDepth=*/0); generateCopyForMemRegion(region, loopNest, copyOptions, result); } + + // Promote any single iteration loops in the copy nests. + for (auto nest : copyNests) + nest->walk([](AffineForOp forOp) { promoteIfSingleIteration(forOp); }); + + // Promoting single iteration loops could lead to simplification + // of load's/store's. We will run the canonicalization patterns again. + OwningRewritePatternList patterns; + AffineLoadOp::getCanonicalizationPatterns(patterns, &getContext()); + AffineStoreOp::getCanonicalizationPatterns(patterns, &getContext()); + applyPatternsGreedily(getFunction(), std::move(patterns)); } namespace mlir {