diff --git a/mlir/include/mlir/Transforms/LoopFusionUtils.h b/mlir/include/mlir/Dialect/Affine/LoopFusionUtils.h
rename from mlir/include/mlir/Transforms/LoopFusionUtils.h
rename to mlir/include/mlir/Dialect/Affine/LoopFusionUtils.h
--- a/mlir/include/mlir/Transforms/LoopFusionUtils.h
+++ b/mlir/include/mlir/Dialect/Affine/LoopFusionUtils.h
@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MLIR_TRANSFORMS_LOOPFUSIONUTILS_H
-#define MLIR_TRANSFORMS_LOOPFUSIONUTILS_H
+#ifndef MLIR_DIALECT_AFFINE_LOOPFUSIONUTILS_H
+#define MLIR_DIALECT_AFFINE_LOOPFUSIONUTILS_H
 
 #include "mlir/IR/Value.h"
 #include "mlir/Support/LLVM.h"
@@ -167,4 +167,4 @@
                                    DenseSet<Value> &producerConsumerMemrefs);
 } // namespace mlir
 
-#endif // MLIR_TRANSFORMS_LOOPFUSIONUTILS_H
+#endif // MLIR_DIALECT_AFFINE_LOOPFUSIONUTILS_H
diff --git a/mlir/include/mlir/Transforms/LoopUtils.h b/mlir/include/mlir/Dialect/Affine/LoopUtils.h
rename from mlir/include/mlir/Transforms/LoopUtils.h
rename to mlir/include/mlir/Dialect/Affine/LoopUtils.h
--- a/mlir/include/mlir/Transforms/LoopUtils.h
+++ b/mlir/include/mlir/Dialect/Affine/LoopUtils.h
@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MLIR_TRANSFORMS_LOOPUTILS_H
-#define MLIR_TRANSFORMS_LOOPUTILS_H
+#ifndef MLIR_DIALECT_AFFINE_LOOPUTILS_H
+#define MLIR_DIALECT_AFFINE_LOOPUTILS_H
 
 #include "mlir/IR/Block.h"
 #include "mlir/Support/LLVM.h"
@@ -45,9 +45,6 @@
 LogicalResult loopUnrollByFactor(
     AffineForOp forOp, uint64_t unrollFactor,
     function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn = nullptr);
-LogicalResult loopUnrollByFactor(
-    scf::ForOp forOp, uint64_t unrollFactor,
-    function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn = nullptr);
 
 /// Unrolls this loop by the specified unroll factor or its trip count,
 /// whichever is lower.
@@ -63,8 +60,6 @@
 /// AffineForOp, and the second op is a terminator).
 void getPerfectlyNestedLoops(SmallVectorImpl<AffineForOp> &nestedLoops,
                              AffineForOp root);
-void getPerfectlyNestedLoops(SmallVectorImpl<scf::ForOp> &nestedLoops,
-                             scf::ForOp root);
 
 /// Unrolls and jams this loop by the specified factor. `forOp` can be a loop
 /// with iteration arguments performing supported reductions and its inner loops
@@ -78,10 +73,9 @@
 LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp,
                                       uint64_t unrollJamFactor);
 
-/// Promotes the loop body of a AffineForOp/scf::ForOp to its containing block
-/// if the loop was known to have a single iteration.
+/// Promotes the loop body of an AffineForOp to its containing block if the
+/// loop was known to have a single iteration.
 LogicalResult promoteIfSingleIteration(AffineForOp forOp);
-LogicalResult promoteIfSingleIteration(scf::ForOp forOp);
 
 /// Promotes all single iteration AffineForOp's in the Function, i.e., moves
 /// their body into the containing Block.
@@ -146,13 +140,9 @@
 /// occurrence in `forOps`, under each of the `targets`.
 /// Returns the new AffineForOps, one per each of (`forOps`, `targets`) pair,
 /// nested immediately under each of `targets`.
-using Loops = SmallVector<scf::ForOp, 8>;
-using TileLoops = std::pair<Loops, Loops>;
 SmallVector<SmallVector<AffineForOp, 8>, 8> tile(ArrayRef<AffineForOp> forOps,
                                                  ArrayRef<uint64_t> sizes,
                                                  ArrayRef<AffineForOp> targets);
-SmallVector<Loops, 8> tile(ArrayRef<scf::ForOp> forOps, ArrayRef<Value> sizes,
-                           ArrayRef<scf::ForOp> targets);
 
 /// Performs tiling (with interchange) by strip-mining the `forOps` by `sizes`
 /// and sinking them, in their order of occurrence in `forOps`, under `target`.
@@ -160,15 +150,6 @@
 /// `target`.
 SmallVector<AffineForOp, 8> tile(ArrayRef<AffineForOp> forOps,
                                  ArrayRef<uint64_t> sizes, AffineForOp target);
-Loops tile(ArrayRef<scf::ForOp> forOps, ArrayRef<Value> sizes,
-           scf::ForOp target);
-
-/// Tile a nest of scf::ForOp loops rooted at `rootForOp` with the given
-/// (parametric) sizes. Sizes are expected to be strictly positive values at
-/// runtime. If more sizes than loops are provided, discard the trailing values
-/// in sizes. Assumes the loop nest is permutable.
-/// Returns the newly created intra-tile loops.
-Loops tilePerfectlyNested(scf::ForOp rootForOp, ArrayRef<Value> sizes);
 
 /// Explicit copy / DMA generation options for mlir::affineDataCopyGenerate.
 struct AffineCopyOptions {
@@ -236,16 +217,6 @@
                                 const AffineCopyOptions &copyOptions,
                                 CopyGenerateResult &result);
 
-/// Tile a nest of standard for loops rooted at `rootForOp` by finding such
-/// parametric tile sizes that the outer loops have a fixed number of iterations
-/// as defined in `sizes`.
-TileLoops extractFixedOuterLoops(scf::ForOp rootFOrOp, ArrayRef<int64_t> sizes);
-
-/// Replace a perfect nest of "for" loops with a single linearized loop. Assumes
-/// `loops` contains a list of perfectly nested loops with bounds and steps
-/// independent of any loop induction variable involved in the nest.
-void coalesceLoops(MutableArrayRef<scf::ForOp> loops);
-
 /// Replace a perfect nest of "for" loops with a single linearized loop. Assumes
 /// `loops` contains a list of perfectly nested loops outermost to innermost
 /// that are normalized (step one and lower bound of zero) and with bounds and
@@ -254,12 +225,6 @@
 /// be representable using affine.for.
 LogicalResult coalesceLoops(MutableArrayRef<AffineForOp> loops);
 
-/// Take the ParallelLoop and for each set of dimension indices, combine them
-/// into a single dimension. combinedDimensions must contain each index into
-/// loops exactly once.
-void collapseParallelLoops(scf::ParallelOp loops,
-                           ArrayRef<std::vector<unsigned>> combinedDimensions);
-
 /// Maps `forOp` for execution on a parallel grid of virtual `processorIds` of
 /// size given by `numProcessors`. This is achieved by embedding the SSA values
 /// corresponding to `processorIds` and `numProcessors` into the bounds and step
@@ -321,9 +286,6 @@
 separateFullTiles(MutableArrayRef<AffineForOp> nest,
                   SmallVectorImpl<AffineForOp> *fullTileNest = nullptr);
 
-/// Move loop invariant code out of `looplike`.
-LogicalResult moveLoopInvariantCode(LoopLikeOpInterface looplike);
-
 } // namespace mlir
 
-#endif // MLIR_TRANSFORMS_LOOPUTILS_H
+#endif // MLIR_DIALECT_AFFINE_LOOPUTILS_H
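The affine overloads that stay behind in this header keep their pre-move signatures, so callers only need an include-path update. As a minimal sketch (not part of the patch; the function name and unroll factor are illustrative), the relocated utilities compose like this:

```cpp
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/IR/BuiltinOps.h"

// Unroll the innermost loop of each top-level affine.for nest by 4, tagging
// every unrolled copy with the iteration it came from.
static void unrollInnermostLoops(mlir::FuncOp funcOp) {
  for (auto root : funcOp.getBody().getOps<mlir::AffineForOp>()) {
    llvm::SmallVector<mlir::AffineForOp, 4> nest;
    mlir::getPerfectlyNestedLoops(nest, root);
    (void)mlir::loopUnrollByFactor(
        nest.back(), /*unrollFactor=*/4,
        [](unsigned iteration, mlir::Operation *op, mlir::OpBuilder b) {
          op->setAttr("unrolled_iteration", b.getUI32IntegerAttr(iteration));
        });
  }
}
```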
diff --git a/mlir/include/mlir/Dialect/Affine/Passes.h b/mlir/include/mlir/Dialect/Affine/Passes.h
--- a/mlir/include/mlir/Dialect/Affine/Passes.h
+++ b/mlir/include/mlir/Dialect/Affine/Passes.h
@@ -21,6 +21,10 @@
 
 class AffineForOp;
 
+/// Fusion mode to attempt. The default mode `Greedy` does both
+/// producer-consumer and sibling fusion.
+enum FusionMode { Greedy, ProducerConsumer, Sibling };
+
 /// Creates a simplification pass for affine structures (maps and sets). In
 /// addition, this pass also normalizes memrefs to have the trivial (identity)
 /// layout map.
@@ -53,6 +57,19 @@
 /// dead allocs.
 std::unique_ptr<OperationPass<FuncOp>> createAffineScalarReplacementPass();
 
+/// Creates a pass that transforms perfectly nested loops with independent
+/// bounds into a single loop.
+std::unique_ptr<OperationPass<FuncOp>> createLoopCoalescingPass();
+
+/// Creates a loop fusion pass which fuses loops according to the type of
+/// fusion specified in `fusionMode`. Buffers of size less than or equal to
+/// `localBufSizeThreshold` are promoted to memory space `fastMemorySpace`.
+std::unique_ptr<OperationPass<FuncOp>>
+createLoopFusionPass(unsigned fastMemorySpace = 0,
+                     uint64_t localBufSizeThreshold = 0,
+                     bool maximalFusion = false,
+                     enum FusionMode fusionMode = FusionMode::Greedy);
+
 /// Creates a pass to perform tiling on loop nests.
 std::unique_ptr<OperationPass<FuncOp>>
 createLoopTilingPass(uint64_t cacheSizeBytes);
@@ -76,6 +93,10 @@
 std::unique_ptr<OperationPass<FuncOp>>
 createLoopUnrollAndJamPass(int unrollJamFactor = -1);
 
+/// Creates a pass to pipeline explicit movement of data across levels of the
+/// memory hierarchy.
+std::unique_ptr<OperationPass<FuncOp>> createPipelineDataTransferPass();
+
 /// Creates a pass to vectorize loops, operations and data types using a
 /// target-independent, n-D super-vector abstraction.
 std::unique_ptr<OperationPass<FuncOp>>
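Since the create* entry points keep their signatures across the move, re-registering them is mechanical. A hedged sketch of a pipeline built from the new header (pass ordering and option values are illustrative only):

```cpp
#include "mlir/Dialect/Affine/Passes.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/Pass/PassManager.h"

static void buildAffinePipeline(mlir::PassManager &pm) {
  // Producer-consumer fusion only; promote buffers of up to 1 KiB to memory
  // space 1 (both values are illustrative).
  pm.addNestedPass<mlir::FuncOp>(mlir::createLoopFusionPass(
      /*fastMemorySpace=*/1, /*localBufSizeThreshold=*/1024,
      /*maximalFusion=*/false, mlir::FusionMode::ProducerConsumer));
  pm.addNestedPass<mlir::FuncOp>(mlir::createPipelineDataTransferPass());
  pm.addNestedPass<mlir::FuncOp>(mlir::createLoopCoalescingPass());
}
```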
diff --git a/mlir/include/mlir/Dialect/Affine/Passes.td b/mlir/include/mlir/Dialect/Affine/Passes.td
--- a/mlir/include/mlir/Dialect/Affine/Passes.td
+++ b/mlir/include/mlir/Dialect/Affine/Passes.td
@@ -43,6 +43,138 @@
   ];
 }
 
+def AffineLoopFusion : Pass<"affine-loop-fusion", "FuncOp"> {
+  let summary = "Fuse affine loop nests";
+  let description = [{
+    This pass performs fusion of loop nests using a slicing-based approach. It
+    combines two fusion strategies: producer-consumer fusion and sibling fusion.
+    Producer-consumer fusion is aimed at fusing pairs of loops where the first
+    one writes to a memref that the second reads. Sibling fusion targets pairs
+    of loops that share no dependences between them but that load from the same
+    memref. The fused loop nests, when possible, are rewritten to access
+    significantly smaller local buffers instead of the original memref's, and
+    the latter are often either completely optimized away or contracted. This
+    transformation leads to enhanced locality and lower memory footprint through
+    the elimination or contraction of temporaries/intermediate memref's. These
+    benefits are sometimes achieved at the expense of redundant computation
+    through a cost model that evaluates available choices such as the depth at
+    which a source slice should be materialized in the destination slice.
+
+    Example 1: Producer-consumer fusion.
+    Input:
+    ```mlir
+    func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
+      %0 = memref.alloc() : memref<10xf32>
+      %1 = memref.alloc() : memref<10xf32>
+      %cst = arith.constant 0.000000e+00 : f32
+      affine.for %arg2 = 0 to 10 {
+        affine.store %cst, %0[%arg2] : memref<10xf32>
+        affine.store %cst, %1[%arg2] : memref<10xf32>
+      }
+      affine.for %arg2 = 0 to 10 {
+        %2 = affine.load %0[%arg2] : memref<10xf32>
+        %3 = arith.addf %2, %2 : f32
+        affine.store %3, %arg0[%arg2] : memref<10xf32>
+      }
+      affine.for %arg2 = 0 to 10 {
+        %2 = affine.load %1[%arg2] : memref<10xf32>
+        %3 = arith.mulf %2, %2 : f32
+        affine.store %3, %arg1[%arg2] : memref<10xf32>
+      }
+      return
+    }
+    ```
+    Output:
+    ```mlir
+    func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
+      %0 = memref.alloc() : memref<1xf32>
+      %1 = memref.alloc() : memref<1xf32>
+      %cst = arith.constant 0.000000e+00 : f32
+      affine.for %arg2 = 0 to 10 {
+        affine.store %cst, %0[0] : memref<1xf32>
+        affine.store %cst, %1[0] : memref<1xf32>
+        %2 = affine.load %1[0] : memref<1xf32>
+        %3 = arith.mulf %2, %2 : f32
+        affine.store %3, %arg1[%arg2] : memref<10xf32>
+        %4 = affine.load %0[0] : memref<1xf32>
+        %5 = arith.addf %4, %4 : f32
+        affine.store %5, %arg0[%arg2] : memref<10xf32>
+      }
+      return
+    }
+    ```
+
+    Example 2: Sibling fusion.
+    Input:
+    ```mlir
+    func @sibling_fusion(%arg0: memref<10x10xf32>, %arg1: memref<10x10xf32>,
+                         %arg2: memref<10x10xf32>, %arg3: memref<10x10xf32>,
+                         %arg4: memref<10x10xf32>) {
+      affine.for %arg5 = 0 to 3 {
+        affine.for %arg6 = 0 to 3 {
+          %0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
+          %1 = affine.load %arg1[%arg5, %arg6] : memref<10x10xf32>
+          %2 = arith.mulf %0, %1 : f32
+          affine.store %2, %arg3[%arg5, %arg6] : memref<10x10xf32>
+        }
+      }
+      affine.for %arg5 = 0 to 3 {
+        affine.for %arg6 = 0 to 3 {
+          %0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
+          %1 = affine.load %arg2[%arg5, %arg6] : memref<10x10xf32>
+          %2 = arith.addf %0, %1 : f32
+          affine.store %2, %arg4[%arg5, %arg6] : memref<10x10xf32>
+        }
+      }
+      return
+    }
+    ```
+    Output:
+    ```mlir
+    func @sibling_fusion(%arg0: memref<10x10xf32>, %arg1: memref<10x10xf32>,
+                         %arg2: memref<10x10xf32>, %arg3: memref<10x10xf32>,
+                         %arg4: memref<10x10xf32>) {
+      affine.for %arg5 = 0 to 3 {
+        affine.for %arg6 = 0 to 3 {
+          %0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
+          %1 = affine.load %arg1[%arg5, %arg6] : memref<10x10xf32>
+          %2 = arith.mulf %0, %1 : f32
+          affine.store %2, %arg3[%arg5, %arg6] : memref<10x10xf32>
+          %3 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
+          %4 = affine.load %arg2[%arg5, %arg6] : memref<10x10xf32>
+          %5 = arith.addf %3, %4 : f32
+          affine.store %5, %arg4[%arg5, %arg6] : memref<10x10xf32>
+        }
+      }
+      return
+    }
+    ```
+  }];
+  let constructor = "mlir::createLoopFusionPass()";
+  let options = [
+    Option<"computeToleranceThreshold", "fusion-compute-tolerance", "double",
+           /*default=*/"0.30f", "Fractional increase in additional computation "
+                                "tolerated while fusing">,
+    Option<"fastMemorySpace", "fusion-fast-mem-space", "unsigned",
+           /*default=*/"0",
+           "Faster memory space number to promote fusion buffers to">,
+    Option<"localBufSizeThreshold", "fusion-local-buf-threshold", "uint64_t",
+           /*default=*/"0", "Threshold size (KiB) for promoting local buffers "
+                            "to fast memory space">,
+    Option<"maximalFusion", "fusion-maximal", "bool", /*default=*/"false",
+           "Enables maximal loop fusion">,
+    Option<"affineFusionMode", "mode", "enum FusionMode",
+           "mlir::FusionMode::Greedy", "fusion mode to attempt",
+           "llvm::cl::values(clEnumValN(mlir::FusionMode::Greedy,"
+           " \"greedy\", \"Perform greedy (both producer-consumer and sibling) fusion\"), "
+           "clEnumValN( mlir::FusionMode::ProducerConsumer, "
+           "\"producer\", \"Perform only producer-consumer fusion\"), "
+           "clEnumValN( mlir::FusionMode::Sibling, "
+           "\"sibling\", \"Perform only sibling fusion\"))">,
+  ];
+  let dependentDialects = ["memref::MemRefDialect"];
+}
+
 def AffineLoopInvariantCodeMotion
     : Pass<"affine-loop-invariant-code-motion", "FuncOp"> {
   let summary = "Hoist loop invariant instructions outside of affine loops";
@@ -94,6 +226,75 @@
   ];
 }
 
+def AffinePipelineDataTransfer
+    : Pass<"affine-pipeline-data-transfer", "FuncOp"> {
+  let summary = "Pipeline non-blocking data transfers between explicitly "
+                "managed levels of the memory hierarchy";
+  let description = [{
+    This pass performs a transformation to overlap non-blocking DMA operations
+    in a loop with computations through double buffering. This is achieved by
+    advancing dma_start operations with respect to other operations.
+
+    Input
+
+    ```mlir
+    func @pipelinedatatransfer() {
+      %0 = memref.alloc() : memref<256xf32>
+      %1 = memref.alloc() : memref<32xf32, 1>
+      %2 = memref.alloc() : memref<1xf32>
+      %c0 = arith.constant 0 : index
+      %c128 = arith.constant 128 : index
+      affine.for %i0 = 0 to 8 {
+        affine.dma_start %0[%i0], %1[%i0], %2[%c0], %c128 : memref<256xf32>, memref<32xf32, 1>, memref<1xf32>
+        affine.dma_wait %2[%c0], %c128 : memref<1xf32>
+        %3 = affine.load %1[%i0] : memref<32xf32, 1>
+        %4 = "compute"(%3) : (f32) -> f32
+        affine.store %4, %1[%i0] : memref<32xf32, 1>
+      }
+      return
+    }
+    ```
+
+    Output
+
+    ```mlir
+    module {
+      func @pipelinedatatransfer() {
+        %c8 = arith.constant 8 : index
+        %c0 = arith.constant 0 : index
+        %0 = memref.alloc() : memref<256xf32>
+        %c0_0 = arith.constant 0 : index
+        %c128 = arith.constant 128 : index
+        %1 = memref.alloc() : memref<2x32xf32, 1>
+        %2 = memref.alloc() : memref<2x1xf32>
+        affine.dma_start %0[%c0], %1[%c0 mod 2, %c0], %2[%c0 mod 2, symbol(%c0_0)], %c128 : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
+        affine.for %arg0 = 1 to 8 {
+          affine.dma_start %0[%arg0], %1[%arg0 mod 2, %arg0], %2[%arg0 mod 2, symbol(%c0_0)], %c128 : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
+          %8 = affine.apply #map3(%arg0)
+          %9 = affine.apply #map4(%8)
+          %10 = affine.apply #map4(%8)
+          affine.dma_wait %2[%8 mod 2, symbol(%c0_0)], %c128 : memref<2x1xf32>
+          %11 = affine.load %1[%8 mod 2, %8] : memref<2x32xf32, 1>
+          %12 = "compute"(%11) : (f32) -> f32
+          affine.store %12, %1[%8 mod 2, %8] : memref<2x32xf32, 1>
+        }
+        %3 = affine.apply #map3(%c8)
+        %4 = affine.apply #map4(%3)
+        %5 = affine.apply #map4(%3)
+        affine.dma_wait %2[%3 mod 2, symbol(%c0_0)], %c128 : memref<2x1xf32>
+        %6 = affine.load %1[%3 mod 2, %3] : memref<2x32xf32, 1>
+        %7 = "compute"(%6) : (f32) -> f32
+        affine.store %7, %1[%3 mod 2, %3] : memref<2x32xf32, 1>
+        memref.dealloc %2 : memref<2x1xf32>
+        memref.dealloc %1 : memref<2x32xf32, 1>
+        return
+      }
+    }
+    ```
+  }];
+  let constructor = "mlir::createPipelineDataTransferPass()";
+}
+
 def AffineScalarReplacement : Pass<"affine-scalrep", "FuncOp"> {
   let summary = "Replace affine memref acceses by scalars by forwarding stores "
                 "to loads and eliminating redundant loads";
@@ -184,6 +385,13 @@
   let constructor = "mlir::createAffineLoopNormalizePass()";
 }
 
+def LoopCoalescing : Pass<"loop-coalescing", "FuncOp"> {
+  let summary = "Coalesce nested loops with independent bounds into a single "
+                "loop";
+  let constructor = "mlir::createLoopCoalescingPass()";
+  let dependentDialects = ["arith::ArithmeticDialect"];
+}
+
 def SimplifyAffineStructures : Pass<"simplify-affine-structures", "FuncOp"> {
   let summary = "Simplify affine expressions in maps/sets and normalize "
                 "memrefs";
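The option names declared above keep their textual spellings, so the relocated passes stay reachable from a textual pipeline as well. A sketch, assuming the `builtin.func` anchor that FuncOp passes used at this point in the codebase:

```cpp
#include "mlir/Pass/PassManager.h"
#include "mlir/Pass/PassRegistry.h"

static mlir::LogicalResult addFusionFromText(mlir::OpPassManager &pm) {
  // `mode` and `fusion-maximal` are the option spellings declared in the
  // AffineLoopFusion definition above.
  return mlir::parsePassPipeline(
      "builtin.func(affine-loop-fusion{mode=producer fusion-maximal=true})",
      pm);
}
```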
diff --git a/mlir/include/mlir/Dialect/Affine/Utils.h b/mlir/include/mlir/Dialect/Affine/Utils.h
--- a/mlir/include/mlir/Dialect/Affine/Utils.h
+++ b/mlir/include/mlir/Dialect/Affine/Utils.h
@@ -24,6 +24,10 @@
 class Operation;
 class PostDominanceInfo;
 
+namespace memref {
+class AllocOp;
+} // namespace memref
+
 struct LogicalResult;
 
 using ReductionLoopMap = DenseMap<Operation *, SmallVector<LoopReduction, 2>>;
@@ -168,6 +172,121 @@
 AffineExpr substWithMin(AffineExpr e, AffineExpr dim, AffineExpr min,
                         AffineExpr max, bool positivePath = true);
 
+/// Replaces all "dereferencing" uses of `oldMemRef` with `newMemRef` while
+/// optionally remapping the old memref's indices using the supplied affine
+/// map, `indexRemap`. The new memref could be of a different shape or rank.
+/// `extraIndices` provides any additional access indices to be added to the
+/// start.
+///
+/// `indexRemap` remaps indices of the old memref access to a new set of indices
+/// that are used to index the memref. Additional input operands to indexRemap
+/// can be optionally provided in `extraOperands`, and they occupy the start
+/// of its input list. `indexRemap`'s dimensional inputs are expected to
+/// correspond to the memref's indices, and its symbolic inputs if any should be
+/// provided in `symbolOperands`.
+///
+/// `domOpFilter`, if non-null, restricts the replacement to only those
+/// operations that are dominated by the former; similarly, `postDomOpFilter`
+/// restricts replacement to only those operations that are postdominated by it.
+///
+/// 'allowNonDereferencingOps', if set, allows replacement of non-dereferencing
+/// uses of a memref without any requirement for access index rewrites as long
+/// as the user operation has the MemRefsNormalizable trait. The default value
+/// of this flag is false.
+///
+/// 'replaceInDeallocOp', if set, lets DeallocOp, a non-dereferencing user,
+/// also be a candidate for replacement. The default value of this flag is
+/// false.
+///
+/// Returns true on success and false if the replacement is not possible,
+/// whenever a memref is used as an operand in a non-dereferencing context and
+/// 'allowNonDereferencingOps' is false, except for dealloc's on the memref
+/// which are left untouched. See comments at function definition for an
+/// example.
+//
+// Ex: to replace load %A[%i, %j] with load %Abuf[%t mod 2, %ii - %i, %j]:
+// The SSA value corresponding to '%t mod 2' should be in 'extraIndices', and
+// index remap will perform (%i, %j) -> (%ii - %i, %j), i.e., indexRemap = (d0,
+// d1, d2) -> (d0 - d1, d2), and %ii will be the extra operand. Without any
+// extra operands, note that 'indexRemap' would just be applied to existing
+// indices (%i, %j).
+// TODO: allow extraIndices to be added at any position.
+LogicalResult replaceAllMemRefUsesWith(
+    Value oldMemRef, Value newMemRef, ArrayRef<Value> extraIndices = {},
+    AffineMap indexRemap = AffineMap(), ArrayRef<Value> extraOperands = {},
+    ArrayRef<Value> symbolOperands = {}, Operation *domOpFilter = nullptr,
+    Operation *postDomOpFilter = nullptr, bool allowNonDereferencingOps = false,
+    bool replaceInDeallocOp = false);
+
+/// Performs the same replacement as the other version above but only for the
+/// dereferencing uses of `oldMemRef` in `op`, except in cases where
+/// 'allowNonDereferencingOps' is set to true where we replace the
+/// non-dereferencing uses as well.
+LogicalResult replaceAllMemRefUsesWith(Value oldMemRef, Value newMemRef,
+                                       Operation *op,
+                                       ArrayRef<Value> extraIndices = {},
+                                       AffineMap indexRemap = AffineMap(),
+                                       ArrayRef<Value> extraOperands = {},
+                                       ArrayRef<Value> symbolOperands = {},
+                                       bool allowNonDereferencingOps = false);
+
+/// Rewrites the memref defined by this alloc op to have an identity layout map
+/// and updates all its indexing uses. Returns failure if any of its uses
+/// escape (while leaving the IR in a valid state).
+LogicalResult normalizeMemRef(memref::AllocOp *op);
+
+/// Uses the old memref type map layout and computes the new memref type to
+/// have a new shape and a layout map, where the old layout map has been
+/// normalized to an identity layout map. It returns the old memref in case no
+/// normalization was needed or a failure occurs while transforming the old
+/// map layout to an identity layout map.
+MemRefType normalizeMemRefType(MemRefType memrefType, OpBuilder builder,
+                               unsigned numSymbolicOperands);
+
+/// Creates and inserts into 'builder' a new AffineApplyOp, with the number of
+/// its results equal to the number of operands, as a composition
+/// of all other AffineApplyOps reachable from input parameter 'operands'. If
+/// different operands were drawing results from multiple affine apply ops,
+/// these will also be collected into a single (multi-result) affine apply op.
+/// The final results of the composed AffineApplyOp are returned in output
+/// parameter 'results'. Returns the affine apply op created.
+Operation *createComposedAffineApplyOp(OpBuilder &builder, Location loc,
+                                       ArrayRef<Value> operands,
+                                       ArrayRef<AffineApplyOp> affineApplyOps,
+                                       SmallVectorImpl<Value> *results);
+
+/// Given an operation, inserts one or more single result affine apply
+/// operations, results of which are exclusively used by this operation.
+/// The operands of these newly created affine apply ops are
+/// guaranteed to be loop iterators or terminal symbols of a function.
+///
+/// Before
+///
+/// affine.for %i = 0 to #map(%N)
+///   %idx = affine.apply (d0) -> (d0 mod 2) (%i)
+///   send %A[%idx], ...
+///   %v = "compute"(%idx, ...)
+///
+/// After
+///
+/// affine.for %i = 0 to #map(%N)
+///   %idx = affine.apply (d0) -> (d0 mod 2) (%i)
+///   send %A[%idx], ...
+///   %idx_ = affine.apply (d0) -> (d0 mod 2) (%i)
+///   %v = "compute"(%idx_, ...)
+///
+/// This allows the application of different transformations on send and
+/// compute (e.g. different shifts/delays).
+///
+/// Fills `sliceOps` with the list of affine.apply operations.
+/// In the following cases, `sliceOps` remains empty:
+///   1. If none of opInst's operands were the result of an affine.apply
+///      (i.e., there was no affine computation slice to create).
+///   2. If all the affine.apply op's supplying operands to this opInst did not
+///      have any uses other than those in this opInst.
+void createAffineComputationSlice(Operation *opInst,
+                                  SmallVectorImpl<Operation *> *sliceOps);
+
 } // namespace mlir
 
 #endif // MLIR_DIALECT_AFFINE_UTILS_H
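The double-buffering example in the comment above translates directly into a call. A sketch, where `oldMemRef`, `newMemRef`, `tModTwo` (the SSA value for `%t mod 2`) and `ii` are assumed to be produced elsewhere:

```cpp
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"

static mlir::LogicalResult doubleBufferAccesses(mlir::MLIRContext *ctx,
                                                mlir::Value oldMemRef,
                                                mlir::Value newMemRef,
                                                mlir::Value tModTwo,
                                                mlir::Value ii) {
  // indexRemap = (d0, d1, d2) -> (d0 - d1, d2): d0 is the extra operand %ii;
  // d1 and d2 are the original access indices (%i, %j).
  mlir::AffineExpr d0 = mlir::getAffineDimExpr(0, ctx);
  mlir::AffineExpr d1 = mlir::getAffineDimExpr(1, ctx);
  mlir::AffineExpr d2 = mlir::getAffineDimExpr(2, ctx);
  mlir::AffineMap indexRemap = mlir::AffineMap::get(3, 0, {d0 - d1, d2}, ctx);
  // %t mod 2 becomes the leading index of every rewritten access.
  return mlir::replaceAllMemRefUsesWith(oldMemRef, newMemRef,
                                        /*extraIndices=*/{tModTwo}, indexRemap,
                                        /*extraOperands=*/{ii});
}
```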
diff --git a/mlir/include/mlir/Dialect/SCF/Passes.h b/mlir/include/mlir/Dialect/SCF/Passes.h
--- a/mlir/include/mlir/Dialect/SCF/Passes.h
+++ b/mlir/include/mlir/Dialect/SCF/Passes.h
@@ -32,6 +32,10 @@
 /// inside of scf.for loops with known lower and upper bounds.
 std::unique_ptr<Pass> createSCFForLoopCanonicalizationPass();
 
+/// Creates a pass that transforms a single ParallelLoop over N induction
+/// variables into another ParallelLoop over less than N induction variables.
+std::unique_ptr<Pass> createParallelLoopCollapsingPass();
+
 /// Creates a loop fusion pass which fuses parallel loops.
 std::unique_ptr<Pass> createParallelLoopFusionPass();
 
diff --git a/mlir/include/mlir/Dialect/SCF/Passes.td b/mlir/include/mlir/Dialect/SCF/Passes.td
--- a/mlir/include/mlir/Dialect/SCF/Passes.td
+++ b/mlir/include/mlir/Dialect/SCF/Passes.td
@@ -52,6 +52,22 @@
   let constructor = "mlir::createParallelLoopFusionPass()";
 }
 
+def SCFParallelLoopCollapsing : Pass<"parallel-loop-collapsing"> {
+  let summary = "Collapse parallel loops to use less induction variables";
+  let constructor = "mlir::createParallelLoopCollapsingPass()";
+  let options = [
+    ListOption<"clCollapsedIndices0", "collapsed-indices-0", "unsigned",
+               "Which loop indices to combine into the position 0 loop index",
+               "llvm::cl::MiscFlags::CommaSeparated">,
+    ListOption<"clCollapsedIndices1", "collapsed-indices-1", "unsigned",
+               "Which loop indices to combine into the position 1 loop index",
+               "llvm::cl::MiscFlags::CommaSeparated">,
+    ListOption<"clCollapsedIndices2", "collapsed-indices-2", "unsigned",
+               "Which loop indices to combine into the position 2 loop index",
+               "llvm::cl::MiscFlags::CommaSeparated">,
+  ];
+}
+
 def SCFParallelLoopSpecialization
     : Pass<"parallel-loop-specialization", "FuncOp"> {
   let summary = "Specialize parallel loops for vectorization";
diff --git a/mlir/include/mlir/Dialect/SCF/Utils.h b/mlir/include/mlir/Dialect/SCF/Utils.h
--- a/mlir/include/mlir/Dialect/SCF/Utils.h
+++ b/mlir/include/mlir/Dialect/SCF/Utils.h
@@ -98,5 +98,65 @@
                  SmallVectorImpl<Value> &symbols,
                  llvm::function_ref<bool(Operation *)> loopFilter = nullptr);
 
+/// Replace a perfect nest of "for" loops with a single linearized loop. Assumes
+/// `loops` contains a list of perfectly nested loops with bounds and steps
+/// independent of any loop induction variable involved in the nest.
+void coalesceLoops(MutableArrayRef<scf::ForOp> loops);
+
+/// Take the ParallelLoop and for each set of dimension indices, combine them
+/// into a single dimension. combinedDimensions must contain each index into
+/// loops exactly once.
+void collapseParallelLoops(scf::ParallelOp loops,
+                           ArrayRef<std::vector<unsigned>> combinedDimensions);
+
+/// Promotes the loop body of a scf::ForOp to its containing block if the loop
+/// was known to have a single iteration.
+LogicalResult promoteIfSingleIteration(scf::ForOp forOp);
+
+/// Unrolls this for operation by the specified unroll factor. Returns failure
+/// if the loop cannot be unrolled either due to restrictions or due to invalid
+/// unroll factors. Requires positive loop bounds and step. If specified,
+/// annotates the Ops in each unrolled iteration by applying `annotateFn`.
+LogicalResult loopUnrollByFactor(
+    scf::ForOp forOp, uint64_t unrollFactor,
+    function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn = nullptr);
+
+/// Tile a nest of standard for loops rooted at `rootForOp` by finding such
+/// parametric tile sizes that the outer loops have a fixed number of iterations
+/// as defined in `sizes`.
+using Loops = SmallVector<scf::ForOp, 8>;
+using TileLoops = std::pair<Loops, Loops>;
+TileLoops extractFixedOuterLoops(scf::ForOp rootFOrOp, ArrayRef<int64_t> sizes);
+
+/// Performs tiling of imperfectly nested loops (with interchange) by
+/// strip-mining the `forOps` by `sizes` and sinking them, in their order of
+/// occurrence in `forOps`, under each of the `targets`.
+/// Returns the new loops, one per each of (`forOps`, `targets`) pair,
+/// nested immediately under each of `targets`.
+SmallVector<Loops, 8> tile(ArrayRef<scf::ForOp> forOps, ArrayRef<Value> sizes,
+                           ArrayRef<scf::ForOp> targets);
+
+/// Performs tiling (with interchange) by strip-mining the `forOps` by `sizes`
+/// and sinking them, in their order of occurrence in `forOps`, under `target`.
+/// Returns the new loops, one per `forOps`, nested immediately under
+/// `target`.
+Loops tile(ArrayRef<scf::ForOp> forOps, ArrayRef<Value> sizes,
+           scf::ForOp target);
+
+/// Tile a nest of scf::ForOp loops rooted at `rootForOp` with the given
+/// (parametric) sizes. Sizes are expected to be strictly positive values at
+/// runtime. If more sizes than loops are provided, discard the trailing values
+/// in sizes. Assumes the loop nest is permutable.
+/// Returns the newly created intra-tile loops.
+Loops tilePerfectlyNested(scf::ForOp rootForOp, ArrayRef<Value> sizes);
+
+/// Get the perfectly nested sequence of loops starting at the root of the
+/// loop nest. A loop is perfectly nested iff the first op in the loop's body
+/// is another scf::ForOp and the second op is a terminator.
+void getPerfectlyNestedLoops(SmallVectorImpl<scf::ForOp> &nestedLoops,
+                             scf::ForOp root);
+
 } // namespace mlir
+
 #endif // MLIR_DIALECT_SCF_UTILS_H_
diff --git a/mlir/include/mlir/Interfaces/LoopLikeInterface.h b/mlir/include/mlir/Interfaces/LoopLikeInterface.h
--- a/mlir/include/mlir/Interfaces/LoopLikeInterface.h
+++ b/mlir/include/mlir/Interfaces/LoopLikeInterface.h
@@ -15,7 +15,20 @@
 
 #include "mlir/IR/OpDefinition.h"
 
+//===----------------------------------------------------------------------===//
+// LoopLike Interfaces
+//===----------------------------------------------------------------------===//
+
 /// Include the generated interface declarations.
 #include "mlir/Interfaces/LoopLikeInterface.h.inc"
 
+//===----------------------------------------------------------------------===//
+// LoopLike Utilities
+//===----------------------------------------------------------------------===//
+
+namespace mlir {
+/// Move loop invariant code out of a `looplike` operation.
+LogicalResult moveLoopInvariantCode(LoopLikeOpInterface looplike);
+} // namespace mlir
+
 #endif // MLIR_INTERFACES_LOOPLIKEINTERFACE_H_
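The scf:: overloads gathered in SCF/Utils.h compose the same way as their affine counterparts. A minimal sketch (illustrative, not part of the patch) that flattens a perfect scf.for nest using two of the utilities above:

```cpp
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/SCF/Utils.h"

// Collect the perfect nest rooted at `root` and rewrite it in place into a
// single linearized scf.for.
static void flattenPerfectNest(mlir::scf::ForOp root) {
  llvm::SmallVector<mlir::scf::ForOp, 4> nest;
  mlir::getPerfectlyNestedLoops(nest, root);
  mlir::coalesceLoops(nest);
}
```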
diff --git a/mlir/include/mlir/Transforms/ControlFlowSinkUtils.h b/mlir/include/mlir/Transforms/ControlFlowSinkUtils.h
new file
--- /dev/null
+++ b/mlir/include/mlir/Transforms/ControlFlowSinkUtils.h
@@ -0,0 +1,70 @@
+//===- ControlFlowSinkUtils.h - ControlFlow Sink Utils ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_TRANSFORMS_CONTROLFLOWSINKUTILS_H
+#define MLIR_TRANSFORMS_CONTROLFLOWSINKUTILS_H
+
+#include "mlir/Support/LLVM.h"
+
+namespace mlir {
+
+class DominanceInfo;
+class Operation;
+class Region;
+class RegionBranchOpInterface;
+
+/// Given a list of regions, perform control flow sinking on them. For each
+/// region, control-flow sinking moves operations that dominate the region but
+/// whose only users are in the region into the regions so that they aren't
+/// executed on paths where their results are not needed.
+///
+/// TODO: For the moment, this is a *simple* control-flow sink, i.e., no
+/// duplicating of ops. It should be made to accept a cost model to determine
+/// whether duplicating a particular op is profitable.
+///
+/// Example:
+///
+/// ```mlir
+/// %0 = arith.addi %arg0, %arg1
+/// scf.if %cond {
+///   scf.yield %0
+/// } else {
+///   scf.yield %arg2
+/// }
+/// ```
+///
+/// After control-flow sink:
+///
+/// ```mlir
+/// scf.if %cond {
+///   %0 = arith.addi %arg0, %arg1
+///   scf.yield %0
+/// } else {
+///   scf.yield %arg2
+/// }
+/// ```
+///
+/// Users must supply a callback `shouldMoveIntoRegion` that determines whether
+/// the given operation that only has users in the given region should be
+/// moved into that region.
+///
+/// Returns the number of operations sunk.
+size_t
+controlFlowSink(ArrayRef<Region *> regions, DominanceInfo &domInfo,
+                function_ref<bool(Operation *, Region *)> shouldMoveIntoRegion);
+
+/// Populates `regions` with regions of the provided region branch op that are
+/// executed at most once and that are reachable given the current operands of
+/// the op. These regions can be passed to `controlFlowSink` to perform sinking
+/// on the regions of the operation.
+void getSinglyExecutedRegionsToSink(RegionBranchOpInterface branch,
+                                    SmallVectorImpl<Region *> &regions);
+
+} // namespace mlir
+
+#endif // MLIR_TRANSFORMS_CONTROLFLOWSINKUTILS_H
diff --git a/mlir/include/mlir/Transforms/Passes.h b/mlir/include/mlir/Transforms/Passes.h
--- a/mlir/include/mlir/Transforms/Passes.h
+++ b/mlir/include/mlir/Transforms/Passes.h
@@ -22,13 +22,8 @@
 
 namespace mlir {
 
-class AffineForOp;
 class GreedyRewriteConfig;
 
-/// Fusion mode to attempt. The default mode `Greedy` does both
-/// producer-consumer and sibling fusion.
-enum FusionMode { Greedy, ProducerConsumer, Sibling };
-
 //===----------------------------------------------------------------------===//
 // Passes
 //===----------------------------------------------------------------------===//
@@ -56,31 +51,10 @@
 /// Creates a pass to perform common sub expression elimination.
 std::unique_ptr<Pass> createCSEPass();
 
-/// Creates a loop fusion pass which fuses loops according to type of fusion
-/// specified in `fusionMode`. Buffers of size less than or equal to
-/// `localBufSizeThreshold` are promoted to memory space `fastMemorySpace`.
-std::unique_ptr<OperationPass<FuncOp>>
-createLoopFusionPass(unsigned fastMemorySpace = 0,
-                     uint64_t localBufSizeThreshold = 0,
-                     bool maximalFusion = false,
-                     enum FusionMode fusionMode = FusionMode::Greedy);
-
 /// Creates a loop invariant code motion pass that hoists loop invariant
 /// instructions out of the loop.
 std::unique_ptr<Pass> createLoopInvariantCodeMotionPass();
 
-/// Creates a pass to pipeline explicit movement of data across levels of the
-/// memory hierarchy.
-std::unique_ptr<OperationPass<FuncOp>> createPipelineDataTransferPass();
-
-/// Creates a pass that transforms perfectly nested loops with independent
-/// bounds into a single loop.
-std::unique_ptr<OperationPass<FuncOp>> createLoopCoalescingPass();
-
-/// Creates a pass that transforms a single ParallelLoop over N induction
-/// variables into another ParallelLoop over less than N induction variables.
-std::unique_ptr<Pass> createParallelLoopCollapsingPass();
-
 /// Creates a pass to strip debug information from a function.
 std::unique_ptr<Pass> createStripDebugInfoPass();
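A sketch of how a client might drive the two new sinking utilities together (the always-true predicate is illustrative; a real caller would consult side effects before moving an op):

```cpp
#include "mlir/IR/Dominance.h"
#include "mlir/Interfaces/ControlFlowInterfaces.h"
#include "mlir/Transforms/ControlFlowSinkUtils.h"

static size_t sinkIntoBranch(mlir::RegionBranchOpInterface branch,
                             mlir::DominanceInfo &domInfo) {
  // Only regions that execute at most once are safe targets for sinking.
  llvm::SmallVector<mlir::Region *, 4> regions;
  mlir::getSinglyExecutedRegionsToSink(branch, regions);
  return mlir::controlFlowSink(
      regions, domInfo,
      [](mlir::Operation *op, mlir::Region *region) { return true; });
}
```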
diff --git a/mlir/include/mlir/Transforms/Passes.td b/mlir/include/mlir/Transforms/Passes.td
--- a/mlir/include/mlir/Transforms/Passes.td
+++ b/mlir/include/mlir/Transforms/Passes.td
@@ -16,207 +16,6 @@
 include "mlir/Pass/PassBase.td"
 include "mlir/Rewrite/PassUtil.td"
 
-def AffineLoopFusion : Pass<"affine-loop-fusion", "FuncOp"> {
-  let summary = "Fuse affine loop nests";
-  let description = [{
-    This pass performs fusion of loop nests using a slicing-based approach. It
-    combines two fusion strategies: producer-consumer fusion and sibling fusion.
-    Producer-consumer fusion is aimed at fusing pairs of loops where the first
-    one writes to a memref that the second reads. Sibling fusion targets pairs
-    of loops that share no dependences between them but that load from the same
-    memref. The fused loop nests, when possible, are rewritten to access
-    significantly smaller local buffers instead of the original memref's, and
-    the latter are often either completely optimized away or contracted. This
-    transformation leads to enhanced locality and lower memory footprint through
-    the elimination or contraction of temporaries/intermediate memref's. These
-    benefits are sometimes achieved at the expense of redundant computation
-    through a cost model that evaluates available choices such as the depth at
-    which a source slice should be materialized in the designation slice.
-
-    Example 1: Producer-consumer fusion.
-    Input:
-    ```mlir
-    func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
-      %0 = memref.alloc() : memref<10xf32>
-      %1 = memref.alloc() : memref<10xf32>
-      %cst = arith.constant 0.000000e+00 : f32
-      affine.for %arg2 = 0 to 10 {
-        affine.store %cst, %0[%arg2] : memref<10xf32>
-        affine.store %cst, %1[%arg2] : memref<10xf32>
-      }
-      affine.for %arg2 = 0 to 10 {
-        %2 = affine.load %0[%arg2] : memref<10xf32>
-        %3 = arith.addf %2, %2 : f32
-        affine.store %3, %arg0[%arg2] : memref<10xf32>
-      }
-      affine.for %arg2 = 0 to 10 {
-        %2 = affine.load %1[%arg2] : memref<10xf32>
-        %3 = arith.mulf %2, %2 : f32
-        affine.store %3, %arg1[%arg2] : memref<10xf32>
-      }
-      return
-    }
-    ```
-    Output:
-    ```mlir
-    func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
-      %0 = memref.alloc() : memref<1xf32>
-      %1 = memref.alloc() : memref<1xf32>
-      %cst = arith.constant 0.000000e+00 : f32
-      affine.for %arg2 = 0 to 10 {
-        affine.store %cst, %0[0] : memref<1xf32>
-        affine.store %cst, %1[0] : memref<1xf32>
-        %2 = affine.load %1[0] : memref<1xf32>
-        %3 = arith.mulf %2, %2 : f32
-        affine.store %3, %arg1[%arg2] : memref<10xf32>
-        %4 = affine.load %0[0] : memref<1xf32>
-        %5 = arith.addf %4, %4 : f32
-        affine.store %5, %arg0[%arg2] : memref<10xf32>
-      }
-      return
-    }
-    ```
-
-    Example 2: Sibling fusion.
-    Input:
-    ```mlir
-    func @sibling_fusion(%arg0: memref<10x10xf32>, %arg1: memref<10x10xf32>,
-                         %arg2: memref<10x10xf32>, %arg3: memref<10x10xf32>,
-                         %arg4: memref<10x10xf32>) {
-      affine.for %arg5 = 0 to 3 {
-        affine.for %arg6 = 0 to 3 {
-          %0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
-          %1 = affine.load %arg1[%arg5, %arg6] : memref<10x10xf32>
-          %2 = arith.mulf %0, %1 : f32
-          affine.store %2, %arg3[%arg5, %arg6] : memref<10x10xf32>
-        }
-      }
-      affine.for %arg5 = 0 to 3 {
-        affine.for %arg6 = 0 to 3 {
-          %0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
-          %1 = affine.load %arg2[%arg5, %arg6] : memref<10x10xf32>
-          %2 = arith.addf %0, %1 : f32
-          affine.store %2, %arg4[%arg5, %arg6] : memref<10x10xf32>
-        }
-      }
-      return
-    }
-    ```
-    Output:
-    ```mlir
-    func @sibling_fusion(%arg0: memref<10x10xf32>, %arg1: memref<10x10xf32>,
-                         %arg2: memref<10x10xf32>, %arg3: memref<10x10xf32>,
-                         %arg4: memref<10x10xf32>) {
-      affine.for %arg5 = 0 to 3 {
-        affine.for %arg6 = 0 to 3 {
-          %0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
-          %1 = affine.load %arg1[%arg5, %arg6] : memref<10x10xf32>
-          %2 = arith.mulf %0, %1 : f32
-          affine.store %2, %arg3[%arg5, %arg6] : memref<10x10xf32>
-          %3 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
-          %4 = affine.load %arg2[%arg5, %arg6] : memref<10x10xf32>
-          %5 = arith.addf %3, %4 : f32
-          affine.store %5, %arg4[%arg5, %arg6] : memref<10x10xf32>
-        }
-      }
-      return
-    }
-    ```
-  }];
-  let constructor = "mlir::createLoopFusionPass()";
-  let options = [
-    Option<"computeToleranceThreshold", "fusion-compute-tolerance", "double",
-           /*default=*/"0.30f", "Fractional increase in additional computation "
-                                "tolerated while fusing">,
-    Option<"fastMemorySpace", "fusion-fast-mem-space", "unsigned",
-           /*default=*/"0",
-           "Faster memory space number to promote fusion buffers to">,
-    Option<"localBufSizeThreshold", "fusion-local-buf-threshold", "uint64_t",
-           /*default=*/"0", "Threshold size (KiB) for promoting local buffers "
-                            "to fast memory space">,
-    Option<"maximalFusion", "fusion-maximal", "bool", /*default=*/"false",
-           "Enables maximal loop fusion">,
-    Option<"affineFusionMode", "mode", "enum FusionMode",
-           "mlir::FusionMode::Greedy", "fusion mode to attempt",
-           "llvm::cl::values(clEnumValN(mlir::FusionMode::Greedy,"
-           " \"greedy\", \"Perform greedy (both producer-consumer and sibling) fusion\"), "
-           "clEnumValN( mlir::FusionMode::ProducerConsumer, "
-           "\"producer\", \"Perform only producer-consumer fusion\"), "
-           "clEnumValN( mlir::FusionMode::Sibling, "
-           "\"sibling\", \"Perform only sibling fusion\"))">,
-  ];
-  let dependentDialects = ["memref::MemRefDialect"];
-}
-
-def AffinePipelineDataTransfer
-    : Pass<"affine-pipeline-data-transfer", "FuncOp"> {
-  let summary = "Pipeline non-blocking data transfers between explicitly "
-                "managed levels of the memory hierarchy";
-  let description = [{
-    This pass performs a transformation to overlap non-blocking DMA operations
-    in a loop with computations through double buffering. This is achieved by
-    advancing dma_start operations with respect to other operations.
-
-    Input
-
-    ```mlir
-    func @pipelinedatatransfer() {
-      %0 = memref.alloc() : memref<256xf32>
-      %1 = memref.alloc() : memref<32xf32, 1>
-      %2 = memref.alloc() : memref<1xf32>
-      %c0 = arith.constant 0 : index
-      %c128 = arith.constant 128 : index
-      affine.for %i0 = 0 to 8 {
-        affine.dma_start %0[%i0], %1[%i0], %2[%c0], %c128 : memref<256xf32>, memref<32xf32, 1>, memref<1xf32>
-        affine.dma_wait %2[%c0], %c128 : memref<1xf32>
-        %3 = affine.load %1[%i0] : memref<32xf32, 1>
-        %4 = "compute"(%3) : (f32) -> f32
-        affine.store %4, %1[%i0] : memref<32xf32, 1>
-      }
-      return
-    }
-    ```
-
-    Output
-
-    ```mlir
-    module {
-      func @pipelinedatatransfer() {
-        %c8 = arith.constant 8 : index
-        %c0 = arith.constant 0 : index
-        %0 = memref.alloc() : memref<256xf32>
-        %c0_0 = arith.constant 0 : index
-        %c128 = arith.constant 128 : index
-        %1 = memref.alloc() : memref<2x32xf32, 1>
-        %2 = memref.alloc() : memref<2x1xf32>
-        affine.dma_start %0[%c0], %1[%c0 mod 2, %c0], %2[%c0 mod 2, symbol(%c0_0)], %c128 : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
-        affine.for %arg0 = 1 to 8 {
-          affine.dma_start %0[%arg0], %1[%arg0 mod 2, %arg0], %2[%arg0 mod 2, symbol(%c0_0)], %c128 : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
-          %8 = affine.apply #map3(%arg0)
-          %9 = affine.apply #map4(%8)
-          %10 = affine.apply #map4(%8)
-          affine.dma_wait %2[%8 mod 2, symbol(%c0_0)], %c128 : memref<2x1xf32>
-          %11 = affine.load %1[%8 mod 2, %8] : memref<2x32xf32, 1>
-          %12 = "compute"(%11) : (f32) -> f32
-          affine.store %12, %1[%8 mod 2, %8] : memref<2x32xf32, 1>
-        }
-        %3 = affine.apply #map3(%c8)
-        %4 = affine.apply #map4(%3)
-        %5 = affine.apply #map4(%3)
-        affine.dma_wait %2[%3 mod 2, symbol(%c0_0)], %c128 : memref<2x1xf32>
-        %6 = affine.load %1[%3 mod 2, %3] : memref<2x32xf32, 1>
-        %7 = "compute"(%6) : (f32) -> f32
-        affine.store %7, %1[%3 mod 2, %3] : memref<2x32xf32, 1>
-        memref.dealloc %2 : memref<2x1xf32>
-        memref.dealloc %1 : memref<2x32xf32, 1>
-        return
-      }
-    }
-    ```
-  }];
-  let constructor = "mlir::createPipelineDataTransferPass()";
-}
-
 def Canonicalizer : Pass<"canonicalize"> {
   let summary = "Canonicalize operations";
   let description = [{
@@ -339,34 +138,11 @@
   ];
 }
 
-def LoopCoalescing : Pass<"loop-coalescing", "FuncOp"> {
-  let summary = "Coalesce nested loops with independent bounds into a single "
-                "loop";
-  let constructor = "mlir::createLoopCoalescingPass()";
-  let dependentDialects = ["arith::ArithmeticDialect"];
-}
-
 def LoopInvariantCodeMotion : Pass<"loop-invariant-code-motion"> {
   let summary = "Hoist loop invariant instructions outside of the loop";
   let constructor = "mlir::createLoopInvariantCodeMotionPass()";
 }
 
-def ParallelLoopCollapsing : Pass<"parallel-loop-collapsing"> {
-  let summary = "Collapse parallel loops to use less induction variables";
-  let constructor = "mlir::createParallelLoopCollapsingPass()";
-  let options = [
-    ListOption<"clCollapsedIndices0", "collapsed-indices-0", "unsigned",
-               "Which loop indices to combine 0th loop index",
-               "llvm::cl::MiscFlags::CommaSeparated">,
-    ListOption<"clCollapsedIndices1", "collapsed-indices-1", "unsigned",
-               "Which loop indices to combine into the position 1 loop index",
-               "llvm::cl::MiscFlags::CommaSeparated">,
-    ListOption<"clCollapsedIndices2", "collapsed-indices-2", "unsigned",
-               "Which loop indices to combine into the position 2 loop index",
-               "llvm::cl::MiscFlags::CommaSeparated">,
-  ];
-}
-
 def PrintOpStats : Pass<"print-op-stats"> {
   let summary = "Print statistics of operations";
   let constructor = "mlir::createPrintOpStatsPass()";
diff --git a/mlir/include/mlir/Transforms/Utils.h b/mlir/include/mlir/Transforms/Utils.h
deleted file mode 100644
--- a/mlir/include/mlir/Transforms/Utils.h
+++ /dev/null
@@ -1,200 +0,0 @@
-//===- Utils.h - General transformation utilities ---------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This header file defines prototypes for various transformation utilities for
-// memref's and non-loop IR structures. These are not passes by themselves but
-// are used either by passes, optimization sequences, or in turn by other
-// transformation utilities.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MLIR_TRANSFORMS_UTILS_H
-#define MLIR_TRANSFORMS_UTILS_H
-
-#include "mlir/Dialect/StandardOps/IR/Ops.h"
-#include "mlir/IR/AffineMap.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-
-namespace mlir {
-
-class AffineApplyOp;
-class AffineForOp;
-class DominanceInfo;
-class Location;
-class OpBuilder;
-
-namespace memref {
-class AllocOp;
-} // namespace memref
-
-/// Replaces all "dereferencing" uses of `oldMemRef` with `newMemRef` while
-/// optionally remapping the old memref's indices using the supplied affine map,
-/// `indexRemap`. The new memref could be of a different shape or rank.
-/// `extraIndices` provides any additional access indices to be added to the
-/// start.
-///
-/// `indexRemap` remaps indices of the old memref access to a new set of indices
-/// that are used to index the memref. Additional input operands to indexRemap
-/// can be optionally provided in `extraOperands`, and they occupy the start
-/// of its input list. `indexRemap`'s dimensional inputs are expected to
-/// correspond to memref's indices, and its symbolic inputs if any should be
-/// provided in `symbolOperands`.
-///
-/// `domOpFilter`, if non-null, restricts the replacement to only those
-/// operations that are dominated by the former; similarly, `postDomOpFilter`
-/// restricts replacement to only those operations that are postdominated by it.
-///
-/// 'allowNonDereferencingOps', if set, allows replacement of non-dereferencing
-/// uses of a memref without any requirement for access index rewrites as long
-/// as the user operation has the MemRefsNormalizable trait. The default value
-/// of this flag is false.
-///
-/// 'replaceInDeallocOp', if set, lets DeallocOp, a non-dereferencing user, to
-/// also be a candidate for replacement. The default value of this flag is
-/// false.
-///
-/// Returns true on success and false if the replacement is not possible,
-/// whenever a memref is used as an operand in a non-dereferencing context and
-/// 'allowNonDereferencingOps' is false, except for dealloc's on the memref
-/// which are left untouched. See comments at function definition for an
-/// example.
-//
-// Ex: to replace load %A[%i, %j] with load %Abuf[%t mod 2, %ii - %i, %j]:
-// The SSA value corresponding to '%t mod 2' should be in 'extraIndices', and
-// index remap will perform (%i, %j) -> (%ii - %i, %j), i.e., indexRemap = (d0,
-// d1, d2) -> (d0 - d1, d2), and %ii will be the extra operand. Without any
-// extra operands, note that 'indexRemap' would just be applied to existing
-// indices (%i, %j).
-// TODO: allow extraIndices to be added at any position.
-LogicalResult replaceAllMemRefUsesWith(
-    Value oldMemRef, Value newMemRef, ArrayRef<Value> extraIndices = {},
-    AffineMap indexRemap = AffineMap(), ArrayRef<Value> extraOperands = {},
-    ArrayRef<Value> symbolOperands = {}, Operation *domOpFilter = nullptr,
-    Operation *postDomOpFilter = nullptr, bool allowNonDereferencingOps = false,
-    bool replaceInDeallocOp = false);
-
-/// Performs the same replacement as the other version above but only for the
-/// dereferencing uses of `oldMemRef` in `op`, except in cases where
-/// 'allowNonDereferencingOps' is set to true where we replace the
-/// non-dereferencing uses as well.
-LogicalResult replaceAllMemRefUsesWith(Value oldMemRef, Value newMemRef,
-                                       Operation *op,
-                                       ArrayRef<Value> extraIndices = {},
-                                       AffineMap indexRemap = AffineMap(),
-                                       ArrayRef<Value> extraOperands = {},
-                                       ArrayRef<Value> symbolOperands = {},
-                                       bool allowNonDereferencingOps = false);
-
-/// Rewrites the memref defined by this alloc op to have an identity layout map
-/// and updates all its indexing uses. Returns failure if any of its uses
-/// escape (while leaving the IR in a valid state).
-LogicalResult normalizeMemRef(memref::AllocOp *op);
-
-/// Uses the old memref type map layout and computes the new memref type to have
-/// a new shape and a layout map, where the old layout map has been normalized
-/// to an identity layout map. It returns the old memref in case no
-/// normalization was needed or a failure occurs while transforming the old map
-/// layout to an identity layout map.
-MemRefType normalizeMemRefType(MemRefType memrefType, OpBuilder builder,
-                               unsigned numSymbolicOperands);
-
-/// Creates and inserts into 'builder' a new AffineApplyOp, with the number of
-/// its results equal to the number of operands, as a composition
-/// of all other AffineApplyOps reachable from input parameter 'operands'. If
-/// different operands were drawing results from multiple affine apply ops,
-/// these will also be collected into a single (multi-result) affine apply op.
-/// The final results of the composed AffineApplyOp are returned in output
-/// parameter 'results'. Returns the affine apply op created.
-Operation *createComposedAffineApplyOp(OpBuilder &builder, Location loc,
-                                       ArrayRef<Value> operands,
-                                       ArrayRef<AffineApplyOp> affineApplyOps,
-                                       SmallVectorImpl<Value> *results);
-
-/// Given an operation, inserts one or more single result affine apply
-/// operations, results of which are exclusively used by this operation.
-/// The operands of these newly created affine apply ops are
-/// guaranteed to be loop iterators or terminal symbols of a function.
-///
-/// Before
-///
-/// affine.for %i = 0 to #map(%N)
-///   %idx = affine.apply (d0) -> (d0 mod 2) (%i)
-///   send %A[%idx], ...
-///   %v = "compute"(%idx, ...)
-///
-/// After
-///
-/// affine.for %i = 0 to #map(%N)
-///   %idx = affine.apply (d0) -> (d0 mod 2) (%i)
-///   send %A[%idx], ...
-///   %idx_ = affine.apply (d0) -> (d0 mod 2) (%i)
-///   %v = "compute"(%idx_, ...)
-
-/// This allows the application of different transformations on send and
-/// compute (for eg. different shifts/delays)
-///
-/// Fills `sliceOps` with the list of affine.apply operations.
-/// In the following cases, `sliceOps` remains empty:
-///   1. If none of opInst's operands were the result of an affine.apply
-///      (i.e., there was no affine computation slice to create).
-///   2. If all the affine.apply op's supplying operands to this opInst did not
-///      have any uses other than those in this opInst.
-void createAffineComputationSlice(Operation *opInst,
-                                  SmallVectorImpl<Operation *> *sliceOps);
-
-/// Given a list of regions, perform control flow sinking on them. For each
-/// region, control-flow sinking moves operations that dominate the region but
-/// whose only users are in the region into the regions so that they aren't
-/// executed on paths where their results are not needed.
-///
-/// TODO: For the moment, this is a *simple* control-flow sink, i.e., no
-/// duplicating of ops. It should be made to accept a cost model to determine
-/// whether duplicating a particular op is profitable.
-///
-/// Example:
-///
-/// ```mlir
-/// %0 = arith.addi %arg0, %arg1
-/// scf.if %cond {
-///   scf.yield %0
-/// } else {
-///   scf.yield %arg2
-/// }
-/// ```
-///
-/// After control-flow sink:
-///
-/// ```mlir
-/// scf.if %cond {
-///   %0 = arith.addi %arg0, %arg1
-///   scf.yield %0
-/// } else {
-///   scf.yield %arg2
-/// }
-/// ```
-///
-/// Users must supply a callback `shouldMoveIntoRegion` that determines whether
-/// the given operation that only has users in the given operation should be
-/// moved into that region.
-///
-/// Returns the number of operations sunk.
-size_t
-controlFlowSink(ArrayRef<Region *> regions, DominanceInfo &domInfo,
-                function_ref<bool(Operation *, Region *)> shouldMoveIntoRegion);
-
-/// Populates `regions` with regions of the provided region branch op that are
-/// executed at most once at that are reachable given the current operands of
-/// the op. These regions can be passed to `controlFlowSink` to perform sinking
-/// on the regions of the operation.
-void getSinglyExecutedRegionsToSink(RegionBranchOpInterface branch,
-                                    SmallVectorImpl<Region *> &regions);
-
-} // namespace mlir
-
-#endif // MLIR_TRANSFORMS_UTILS_H
diff --git a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
--- a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
+++ b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
@@ -27,7 +27,6 @@
 #include "mlir/IR/Builders.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/DialectConversion.h"
-#include "mlir/Transforms/LoopUtils.h"
 #include "mlir/Transforms/Passes.h"
 #include "mlir/Transforms/RegionUtils.h"
 #include "llvm/ADT/Sequence.h"
diff --git a/mlir/lib/Conversion/SCFToStandard/SCFToStandard.cpp b/mlir/lib/Conversion/SCFToStandard/SCFToStandard.cpp
--- a/mlir/lib/Conversion/SCFToStandard/SCFToStandard.cpp
+++ b/mlir/lib/Conversion/SCFToStandard/SCFToStandard.cpp
@@ -23,7 +23,6 @@
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/Passes.h"
-#include "mlir/Transforms/Utils.h"
 
 using namespace mlir;
 using namespace mlir::scf;
diff --git a/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp b/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp
--- a/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp
+++ b/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp
@@ -33,7 +33,6 @@
 #include "mlir/Support/MathExtras.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/Passes.h"
-#include "mlir/Transforms/Utils.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/IRBuilder.h"
diff --git a/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp b/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp
--- a/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp
+++ b/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp
@@ -16,7 +16,6 @@
 #include "../PassDetail.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
-#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/SCF/SCF.h"
diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineDataCopyGeneration.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineDataCopyGeneration.cpp
--- a/mlir/lib/Dialect/Affine/Transforms/AffineDataCopyGeneration.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/AffineDataCopyGeneration.cpp
@@ -22,12 +22,12 @@
 #include "PassDetail.h"
 #include "mlir/Dialect/Affine/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/Affine/Passes.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-#include "mlir/Transforms/LoopUtils.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineLoopInvariantCodeMotion.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineLoopInvariantCodeMotion.cpp
--- a/mlir/lib/Dialect/Affine/Transforms/AffineLoopInvariantCodeMotion.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/AffineLoopInvariantCodeMotion.cpp
@@ -17,13 +17,13 @@
 #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
 #include "mlir/Dialect/Affine/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/Affine/Passes.h"
+#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/IR/AffineExpr.h"
 #include "mlir/IR/AffineMap.h"
 #include "mlir/IR/Builders.h"
-#include "mlir/Transforms/LoopUtils.h"
-#include "mlir/Transforms/Utils.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/SmallPtrSet.h"
diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp
--- a/mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp
@@ -18,10 +18,10 @@
 #include "mlir/Dialect/Affine/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/Affine/Passes.h"
 #include "mlir/Dialect/Affine/Passes.h.inc"
 #include "mlir/Dialect/Affine/Utils.h"
-#include "mlir/Transforms/LoopUtils.h"
 #include "llvm/Support/Debug.h"
 
 #include <deque>
diff --git a/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt
--- a/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt
@@ -4,9 +4,12 @@
   AffineLoopNormalize.cpp
   AffineParallelize.cpp
   AffineScalarReplacement.cpp
+  LoopCoalescing.cpp
+  LoopFusion.cpp
   LoopTiling.cpp
   LoopUnroll.cpp
   LoopUnrollAndJam.cpp
+  PipelineDataTransfer.cpp
   SuperVectorize.cpp
   SimplifyAffineStructures.cpp
diff --git a/mlir/lib/Transforms/LoopCoalescing.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopCoalescing.cpp
rename from mlir/lib/Transforms/LoopCoalescing.cpp
rename to mlir/lib/Dialect/Affine/Transforms/LoopCoalescing.cpp
--- a/mlir/lib/Transforms/LoopCoalescing.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/LoopCoalescing.cpp
@@ -8,9 +8,10 @@
 #include "PassDetail.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/Dialect/SCF/SCF.h"
-#include "mlir/Transforms/LoopUtils.h"
"mlir/Transforms/LoopUtils.h" +#include "mlir/Dialect/SCF/Utils.h" #include "mlir/Transforms/Passes.h" #include "mlir/Transforms/RegionUtils.h" #include "llvm/Support/Debug.h" diff --git a/mlir/lib/Transforms/LoopFusion.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp rename from mlir/lib/Transforms/LoopFusion.cpp rename to mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp --- a/mlir/lib/Transforms/LoopFusion.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp @@ -16,14 +16,14 @@ #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h" #include "mlir/Dialect/Affine/Analysis/Utils.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/LoopFusionUtils.h" +#include "mlir/Dialect/Affine/LoopUtils.h" +#include "mlir/Dialect/Affine/Utils.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/IR/AffineExpr.h" #include "mlir/IR/AffineMap.h" #include "mlir/IR/Builders.h" -#include "mlir/Transforms/LoopFusionUtils.h" -#include "mlir/Transforms/LoopUtils.h" #include "mlir/Transforms/Passes.h" -#include "mlir/Transforms/Utils.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SetVector.h" diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp --- a/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp @@ -17,11 +17,11 @@ #include "mlir/Dialect/Affine/Analysis/Utils.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Affine/IR/AffineValueMap.h" +#include "mlir/Dialect/Affine/LoopUtils.h" #include "mlir/Dialect/Affine/Passes.h" +#include "mlir/Dialect/Affine/Utils.h" #include "mlir/IR/BlockAndValueMapping.h" #include "mlir/IR/Builders.h" -#include "mlir/Transforms/LoopUtils.h" -#include "mlir/Transforms/Utils.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" using namespace mlir; diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp --- a/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp @@ -12,11 +12,11 @@ #include "PassDetail.h" #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/LoopUtils.h" #include "mlir/Dialect/Affine/Passes.h" #include "mlir/IR/AffineExpr.h" #include "mlir/IR/AffineMap.h" #include "mlir/IR/Builders.h" -#include "mlir/Transforms/LoopUtils.h" #include "llvm/ADT/DenseMap.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopUnrollAndJam.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopUnrollAndJam.cpp --- a/mlir/lib/Dialect/Affine/Transforms/LoopUnrollAndJam.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/LoopUnrollAndJam.cpp @@ -37,12 +37,12 @@ #include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h" #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/LoopUtils.h" #include "mlir/Dialect/Affine/Passes.h" #include "mlir/IR/AffineExpr.h" #include "mlir/IR/AffineMap.h" #include "mlir/IR/BlockAndValueMapping.h" #include "mlir/IR/Builders.h" -#include "mlir/Transforms/LoopUtils.h" #include "llvm/ADT/DenseMap.h" #include "llvm/Support/CommandLine.h" diff --git a/mlir/lib/Dialect/Affine/Transforms/PassDetail.h b/mlir/lib/Dialect/Affine/Transforms/PassDetail.h --- a/mlir/lib/Dialect/Affine/Transforms/PassDetail.h +++ 
b/mlir/lib/Dialect/Affine/Transforms/PassDetail.h @@ -9,6 +9,7 @@ #ifndef DIALECT_AFFINE_TRANSFORMS_PASSDETAIL_H_ #define DIALECT_AFFINE_TRANSFORMS_PASSDETAIL_H_ +#include "mlir/Dialect/Affine/Passes.h" #include "mlir/Pass/Pass.h" namespace mlir { @@ -16,6 +17,10 @@ template void registerDialect(DialectRegistry ®istry); +namespace arith { +class ArithmeticDialect; +} // namespace arith + namespace linalg { class LinalgDialect; } // namespace linalg diff --git a/mlir/lib/Transforms/PipelineDataTransfer.cpp b/mlir/lib/Dialect/Affine/Transforms/PipelineDataTransfer.cpp rename from mlir/lib/Transforms/PipelineDataTransfer.cpp rename to mlir/lib/Dialect/Affine/Transforms/PipelineDataTransfer.cpp --- a/mlir/lib/Transforms/PipelineDataTransfer.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/PipelineDataTransfer.cpp @@ -11,17 +11,16 @@ //===----------------------------------------------------------------------===// #include "PassDetail.h" -#include "mlir/Transforms/Passes.h" - #include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h" #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h" #include "mlir/Dialect/Affine/Analysis/Utils.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/LoopUtils.h" +#include "mlir/Dialect/Affine/Utils.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/StandardOps/Utils/Utils.h" #include "mlir/IR/Builders.h" -#include "mlir/Transforms/LoopUtils.h" -#include "mlir/Transforms/Utils.h" +#include "mlir/Transforms/Passes.h" #include "llvm/ADT/DenseMap.h" #include "llvm/Support/Debug.h" diff --git a/mlir/lib/Dialect/Affine/Transforms/SimplifyAffineStructures.cpp b/mlir/lib/Dialect/Affine/Transforms/SimplifyAffineStructures.cpp --- a/mlir/lib/Dialect/Affine/Transforms/SimplifyAffineStructures.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/SimplifyAffineStructures.cpp @@ -14,9 +14,9 @@ #include "mlir/Dialect/Affine/Analysis/Utils.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Affine/Passes.h" +#include "mlir/Dialect/Affine/Utils.h" #include "mlir/IR/IntegerSet.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" -#include "mlir/Transforms/Utils.h" #define DEBUG_TYPE "simplify-affine-structure" diff --git a/mlir/lib/Dialect/Affine/Utils/CMakeLists.txt b/mlir/lib/Dialect/Affine/Utils/CMakeLists.txt --- a/mlir/lib/Dialect/Affine/Utils/CMakeLists.txt +++ b/mlir/lib/Dialect/Affine/Utils/CMakeLists.txt @@ -1,4 +1,6 @@ add_mlir_dialect_library(MLIRAffineUtils + LoopFusionUtils.cpp + LoopUtils.cpp Utils.cpp ADDITIONAL_HEADER_DIRS @@ -7,5 +9,6 @@ LINK_LIBS PUBLIC MLIRAffine MLIRAnalysis + MLIRMemRef MLIRTransformUtils ) diff --git a/mlir/lib/Transforms/Utils/LoopFusionUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopFusionUtils.cpp rename from mlir/lib/Transforms/Utils/LoopFusionUtils.cpp rename to mlir/lib/Dialect/Affine/Utils/LoopFusionUtils.cpp --- a/mlir/lib/Transforms/Utils/LoopFusionUtils.cpp +++ b/mlir/lib/Dialect/Affine/Utils/LoopFusionUtils.cpp @@ -10,21 +10,20 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Transforms/LoopFusionUtils.h" - +#include "mlir/Dialect/Affine/LoopFusionUtils.h" #include "mlir/Analysis/SliceAnalysis.h" #include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h" #include "mlir/Dialect/Affine/Analysis/AffineStructures.h" #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h" #include "mlir/Dialect/Affine/Analysis/Utils.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/LoopUtils.h" #include 
"mlir/IR/AffineExpr.h" #include "mlir/IR/AffineMap.h" #include "mlir/IR/BlockAndValueMapping.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/Operation.h" -#include "mlir/Transforms/LoopUtils.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/Debug.h" diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp rename from mlir/lib/Transforms/Utils/LoopUtils.cpp rename to mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp --- a/mlir/lib/Transforms/Utils/LoopUtils.cpp +++ b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp @@ -10,14 +10,14 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Transforms/LoopUtils.h" - +#include "mlir/Dialect/Affine/LoopUtils.h" #include "mlir/Analysis/SliceAnalysis.h" #include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h" #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h" #include "mlir/Dialect/Affine/Analysis/Utils.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Affine/IR/AffineValueMap.h" +#include "mlir/Dialect/Affine/Utils.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/SCF/SCF.h" #include "mlir/IR/BlockAndValueMapping.h" @@ -25,7 +25,6 @@ #include "mlir/Support/MathExtras.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "mlir/Transforms/RegionUtils.h" -#include "mlir/Transforms/Utils.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Support/Debug.h" @@ -108,42 +107,9 @@ lb.erase(); } -// Build the IR that performs ceil division of a positive value by a constant: -// ceildiv(a, B) = divis(a + (B-1), B) -// where divis is rounding-to-zero division. -static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend, - int64_t divisor) { - assert(divisor > 0 && "expected positive divisor"); - assert(dividend.getType().isIndex() && "expected index-typed value"); - - Value divisorMinusOneCst = - builder.create(loc, divisor - 1); - Value divisorCst = builder.create(loc, divisor); - Value sum = builder.create(loc, dividend, divisorMinusOneCst); - return builder.create(loc, sum, divisorCst); -} - -// Build the IR that performs ceil division of a positive value by another -// positive value: -// ceildiv(a, b) = divis(a + (b - 1), b) -// where divis is rounding-to-zero division. -static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend, - Value divisor) { - assert(dividend.getType().isIndex() && "expected index-typed value"); - - Value cstOne = builder.create(loc, 1); - Value divisorMinusOne = builder.create(loc, divisor, cstOne); - Value sum = builder.create(loc, dividend, divisorMinusOne); - return builder.create(loc, sum, divisor); -} - /// Helper to replace uses of loop carried values (iter_args) and loop -/// yield values while promoting single iteration affine.for and scf.for ops. -template -static void replaceIterArgsAndYieldResults(AffineOrSCFForOp forOp) { - static_assert( - llvm::is_one_of::value, - "only for affine.for and scf.for ops"); +/// yield values while promoting single iteration affine.for ops. +static void replaceIterArgsAndYieldResults(AffineForOp forOp) { // Replace uses of iter arguments with iter operands (initial values). 
auto iterOperands = forOp.getIterOperands(); auto iterArgs = forOp.getRegionIterArgs(); @@ -203,46 +169,6 @@ return success(); } -/// Promotes the loop body of a forOp to its containing block if the forOp -/// it can be determined that the loop has a single iteration. -LogicalResult mlir::promoteIfSingleIteration(scf::ForOp forOp) { - auto lbCstOp = forOp.getLowerBound().getDefiningOp(); - auto ubCstOp = forOp.getUpperBound().getDefiningOp(); - auto stepCstOp = forOp.getStep().getDefiningOp(); - if (!lbCstOp || !ubCstOp || !stepCstOp || lbCstOp.value() < 0 || - ubCstOp.value() < 0 || stepCstOp.value() < 0) - return failure(); - int64_t tripCount = - mlir::ceilDiv(ubCstOp.value() - lbCstOp.value(), stepCstOp.value()); - if (tripCount != 1) - return failure(); - auto iv = forOp.getInductionVar(); - iv.replaceAllUsesWith(lbCstOp); - - replaceIterArgsAndYieldResults(forOp); - - // Move the loop body operations, except for its terminator, to the loop's - // containing block. - auto *parentBlock = forOp->getBlock(); - forOp.getBody()->getTerminator()->erase(); - parentBlock->getOperations().splice(Block::iterator(forOp), - forOp.getBody()->getOperations()); - forOp.erase(); - return success(); -} - -/// Promotes all single iteration 'for' ops in `f`, i.e., moves -/// their body into the containing Block. -void mlir::promoteSingleIterationLoops(FuncOp f) { - // Gathers all innermost loops through a post order pruned walk. - f.walk([](Operation *op) { - if (auto forOp = dyn_cast(op)) - (void)promoteIfSingleIteration(forOp); - else if (auto forOp = dyn_cast(op)) - (void)promoteIfSingleIteration(forOp); - }); -} - /// Generates an affine.for op with the specified lower and upper bounds /// while generating the right IV remappings to realize shifts for operations in /// its body. The operations that go into the loop body are specified in @@ -1011,38 +937,22 @@ return success(); } -/// Collect perfectly nested loops starting from `rootForOps`. Loops are -/// perfectly nested if each loop is the first and only non-terminator operation -/// in the parent loop. Collect at most `maxLoops` loops and append them to -/// `forOps`. -template -static void getPerfectlyNestedLoopsImpl( - SmallVectorImpl &forOps, T rootForOp, - unsigned maxLoops = std::numeric_limits::max()) { - for (unsigned i = 0; i < maxLoops; ++i) { - forOps.push_back(rootForOp); - Block &body = rootForOp.getRegion().front(); - if (body.begin() != std::prev(body.end(), 2)) - return; - - rootForOp = dyn_cast(&body.front()); - if (!rootForOp) - return; - } -} - /// Get perfectly nested sequence of loops starting at root of loop nest /// (the first op being another AffineFor, and the second op - a terminator). /// A loop is perfectly nested iff: the first op in the loop's body is another /// AffineForOp, and the second op is a terminator). void mlir::getPerfectlyNestedLoops(SmallVectorImpl &nestedLoops, AffineForOp root) { - getPerfectlyNestedLoopsImpl(nestedLoops, root); -} + for (unsigned i = 0; i < std::numeric_limits::max(); ++i) { + nestedLoops.push_back(root); + Block &body = root.getRegion().front(); + if (body.begin() != std::prev(body.end(), 2)) + return; -void mlir::getPerfectlyNestedLoops(SmallVectorImpl &nestedLoops, - scf::ForOp root) { - getPerfectlyNestedLoopsImpl(nestedLoops, root); + root = dyn_cast(&body.front()); + if (!root) + return; + } } /// Identify valid and profitable bands of loops to tile. 
This is currently just @@ -1084,10 +994,10 @@ return loopUnrollByFactor(forOp, unrollFactor); } -/// Generates unrolled copies of AffineForOp or scf::ForOp 'loopBodyBlock', with -/// associated 'forOpIV' by 'unrollFactor', calling 'ivRemapFn' to remap -/// 'forOpIV' for each unrolled body. If specified, annotates the Ops in each -/// unrolled iteration using annotateFn. +/// Generates unrolled copies of AffineForOp 'loopBodyBlock', with associated +/// 'forOpIV' by 'unrollFactor', calling 'ivRemapFn' to remap 'forOpIV' for each +/// unrolled body. If specified, annotates the Ops in each unrolled iteration +/// using annotateFn. static void generateUnrolledLoop( Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor, function_ref ivRemapFn, @@ -1237,127 +1147,6 @@ return success(); } -/// Unrolls 'forOp' by 'unrollFactor', returns success if the loop is unrolled. -LogicalResult mlir::loopUnrollByFactor( - scf::ForOp forOp, uint64_t unrollFactor, - function_ref annotateFn) { - assert(unrollFactor > 0 && "expected positive unroll factor"); - - // Return if the loop body is empty. - if (llvm::hasSingleElement(forOp.getBody()->getOperations())) - return success(); - - // Compute tripCount = ceilDiv((upperBound - lowerBound), step) and populate - // 'upperBoundUnrolled' and 'stepUnrolled' for static and dynamic cases. - OpBuilder boundsBuilder(forOp); - auto loc = forOp.getLoc(); - auto step = forOp.getStep(); - Value upperBoundUnrolled; - Value stepUnrolled; - bool generateEpilogueLoop = true; - - auto lbCstOp = forOp.getLowerBound().getDefiningOp(); - auto ubCstOp = forOp.getUpperBound().getDefiningOp(); - auto stepCstOp = forOp.getStep().getDefiningOp(); - if (lbCstOp && ubCstOp && stepCstOp) { - // Constant loop bounds computation. - int64_t lbCst = lbCstOp.value(); - int64_t ubCst = ubCstOp.value(); - int64_t stepCst = stepCstOp.value(); - assert(lbCst >= 0 && ubCst >= 0 && stepCst >= 0 && - "expected positive loop bounds and step"); - int64_t tripCount = mlir::ceilDiv(ubCst - lbCst, stepCst); - - if (unrollFactor == 1) { - if (tripCount == 1 && failed(promoteIfSingleIteration(forOp))) - return failure(); - return success(); - } - - int64_t tripCountEvenMultiple = tripCount - (tripCount % unrollFactor); - int64_t upperBoundUnrolledCst = lbCst + tripCountEvenMultiple * stepCst; - assert(upperBoundUnrolledCst <= ubCst); - int64_t stepUnrolledCst = stepCst * unrollFactor; - - // Create constant for 'upperBoundUnrolled' and set epilogue loop flag. - generateEpilogueLoop = upperBoundUnrolledCst < ubCst; - if (generateEpilogueLoop) - upperBoundUnrolled = boundsBuilder.create( - loc, upperBoundUnrolledCst); - else - upperBoundUnrolled = ubCstOp; - - // Create constant for 'stepUnrolled'. - stepUnrolled = stepCst == stepUnrolledCst - ? step - : boundsBuilder.create( - loc, stepUnrolledCst); - } else { - // Dynamic loop bounds computation. - // TODO: Add dynamic asserts for negative lb/ub/step, or - // consider using ceilDiv from AffineApplyExpander. 
- auto lowerBound = forOp.getLowerBound(); - auto upperBound = forOp.getUpperBound(); - Value diff = - boundsBuilder.create(loc, upperBound, lowerBound); - Value tripCount = ceilDivPositive(boundsBuilder, loc, diff, step); - Value unrollFactorCst = - boundsBuilder.create(loc, unrollFactor); - Value tripCountRem = - boundsBuilder.create(loc, tripCount, unrollFactorCst); - // Compute tripCountEvenMultiple = tripCount - (tripCount % unrollFactor) - Value tripCountEvenMultiple = - boundsBuilder.create(loc, tripCount, tripCountRem); - // Compute upperBoundUnrolled = lowerBound + tripCountEvenMultiple * step - upperBoundUnrolled = boundsBuilder.create( - loc, lowerBound, - boundsBuilder.create(loc, tripCountEvenMultiple, step)); - // Scale 'step' by 'unrollFactor'. - stepUnrolled = - boundsBuilder.create(loc, step, unrollFactorCst); - } - - // Create epilogue clean up loop starting at 'upperBoundUnrolled'. - if (generateEpilogueLoop) { - OpBuilder epilogueBuilder(forOp->getContext()); - epilogueBuilder.setInsertionPoint(forOp->getBlock(), - std::next(Block::iterator(forOp))); - auto epilogueForOp = cast(epilogueBuilder.clone(*forOp)); - epilogueForOp.setLowerBound(upperBoundUnrolled); - - // Update uses of loop results. - auto results = forOp.getResults(); - auto epilogueResults = epilogueForOp.getResults(); - auto epilogueIterOperands = epilogueForOp.getIterOperands(); - - for (auto e : llvm::zip(results, epilogueResults, epilogueIterOperands)) { - std::get<0>(e).replaceAllUsesWith(std::get<1>(e)); - epilogueForOp->replaceUsesOfWith(std::get<2>(e), std::get<0>(e)); - } - (void)promoteIfSingleIteration(epilogueForOp); - } - - // Create unrolled loop. - forOp.setUpperBound(upperBoundUnrolled); - forOp.setStep(stepUnrolled); - - auto iterArgs = ValueRange(forOp.getRegionIterArgs()); - auto yieldedValues = forOp.getBody()->getTerminator()->getOperands(); - - generateUnrolledLoop( - forOp.getBody(), forOp.getInductionVar(), unrollFactor, - [&](unsigned i, Value iv, OpBuilder b) { - // iv' = iv + step * i; - auto stride = b.create( - loc, step, b.create(loc, i)); - return b.create(loc, iv, stride); - }, - annotateFn, iterArgs, yieldedValues); - // Promote the loop body up if this has turned into a single iteration loop. - (void)promoteIfSingleIteration(forOp); - return success(); -} - LogicalResult mlir::loopUnrollJamUpToFactor(AffineForOp forOp, uint64_t unrollJamFactor) { Optional mayBeConstantTripCount = getConstantTripCount(forOp); @@ -1888,61 +1677,25 @@ return innerLoops; } -static Loops stripmineSink(scf::ForOp forOp, Value factor, - ArrayRef targets) { - auto originalStep = forOp.getStep(); - auto iv = forOp.getInductionVar(); - - OpBuilder b(forOp); - forOp.setStep(b.create(forOp.getLoc(), originalStep, factor)); - - Loops innerLoops; - for (auto t : targets) { - // Save information for splicing ops out of t when done - auto begin = t.getBody()->begin(); - auto nOps = t.getBody()->getOperations().size(); - - // Insert newForOp before the terminator of `t`. - auto b = OpBuilder::atBlockTerminator((t.getBody())); - Value stepped = b.create(t.getLoc(), iv, forOp.getStep()); - Value less = b.create(t.getLoc(), arith::CmpIPredicate::slt, - forOp.getUpperBound(), stepped); - Value ub = - b.create(t.getLoc(), less, forOp.getUpperBound(), stepped); - - // Splice [begin, begin + nOps - 1) into `newForOp` and replace uses. 
- auto newForOp = b.create(t.getLoc(), iv, ub, originalStep); - newForOp.getBody()->getOperations().splice( - newForOp.getBody()->getOperations().begin(), - t.getBody()->getOperations(), begin, std::next(begin, nOps - 1)); - replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(), - newForOp.getRegion()); - - innerLoops.push_back(newForOp); - } - - return innerLoops; -} - // Stripmines a `forOp` by `factor` and sinks it under a single `target`. // Returns the new AffineForOps, nested immediately under `target`. -template -static ForType stripmineSink(ForType forOp, SizeType factor, ForType target) { +template +static AffineForOp stripmineSink(AffineForOp forOp, SizeType factor, + AffineForOp target) { // TODO: Use cheap structural assertions that targets are nested under // forOp and that targets are not nested under each other when DominanceInfo // exposes the capability. It seems overkill to construct a whole function // dominance tree at this point. - auto res = stripmineSink(forOp, factor, ArrayRef{target}); + auto res = stripmineSink(forOp, factor, ArrayRef(target)); assert(res.size() == 1 && "Expected 1 inner forOp"); return res[0]; } -template -static SmallVector, 8> -tileImpl(ArrayRef forOps, ArrayRef sizes, - ArrayRef targets) { - SmallVector, 8> res; - SmallVector currentTargets(targets.begin(), targets.end()); +SmallVector, 8> +mlir::tile(ArrayRef forOps, ArrayRef sizes, + ArrayRef targets) { + SmallVector, 8> res; + SmallVector currentTargets(targets.begin(), targets.end()); for (auto it : llvm::zip(forOps, sizes)) { auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets); res.push_back(step); @@ -1951,286 +1704,15 @@ return res; } -SmallVector, 8> -mlir::tile(ArrayRef forOps, ArrayRef sizes, - ArrayRef targets) { - return tileImpl(forOps, sizes, targets); -} - -SmallVector mlir::tile(ArrayRef forOps, - ArrayRef sizes, - ArrayRef targets) { - return tileImpl(forOps, sizes, targets); -} - -template -static SmallVector -tileImpl(ArrayRef forOps, ArrayRef sizes, ForType target) { - SmallVector res; - for (auto loops : tile(forOps, sizes, ArrayRef{target})) { - assert(loops.size() == 1); - res.push_back(loops[0]); - } - return res; -} - SmallVector mlir::tile(ArrayRef forOps, ArrayRef sizes, AffineForOp target) { - return tileImpl(forOps, sizes, target); -} - -Loops mlir::tile(ArrayRef forOps, ArrayRef sizes, - scf::ForOp target) { - return tileImpl(forOps, sizes, target); -} - -Loops mlir::tilePerfectlyNested(scf::ForOp rootForOp, ArrayRef sizes) { - // Collect perfectly nested loops. If more size values provided than nested - // loops available, truncate `sizes`. - SmallVector forOps; - forOps.reserve(sizes.size()); - getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size()); - if (forOps.size() < sizes.size()) - sizes = sizes.take_front(forOps.size()); - - return ::tile(forOps, sizes, forOps.back()); -} - -// Hoist the ops within `outer` that appear before `inner`. -// Such ops include the ops that have been introduced by parametric tiling. -// Ops that come from triangular loops (i.e. that belong to the program slice -// rooted at `outer`) and ops that have side effects cannot be hoisted. -// Return failure when any op fails to hoist. 
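// For example (a hedged sketch with assumed ops): in
//   scf.for %i = ... {                    // `outer`
//     %c = arith.constant 42 : index      // hoistable: no regions, no effects
//     %d = arith.muli %i, %i : index      // in the slice of %i: not hoistable
//     scf.for %j = ... { ... }            // `inner`
//   }
// only %c is moved above `outer`; encountering %d makes the result `failure`.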
-static LogicalResult hoistOpsBetween(scf::ForOp outer, scf::ForOp inner) { - SetVector forwardSlice; - getForwardSlice( - outer.getInductionVar(), &forwardSlice, - [&inner](Operation *op) { return op != inner.getOperation(); }); - LogicalResult status = success(); - SmallVector toHoist; - for (auto &op : outer.getBody()->without_terminator()) { - // Stop when encountering the inner loop. - if (&op == inner.getOperation()) - break; - // Skip over non-hoistable ops. - if (forwardSlice.count(&op) > 0) { - status = failure(); - continue; - } - // Skip intermediate scf::ForOp, these are not considered a failure. - if (isa(op)) - continue; - // Skip other ops with regions. - if (op.getNumRegions() > 0) { - status = failure(); - continue; - } - // Skip if op has side effects. - // TODO: loads to immutable memory regions are ok. - if (!MemoryEffectOpInterface::hasNoEffect(&op)) { - status = failure(); - continue; - } - toHoist.push_back(&op); - } - auto *outerForOp = outer.getOperation(); - for (auto *op : toHoist) - op->moveBefore(outerForOp); - return status; -} - -// Traverse the interTile and intraTile loops and try to hoist ops such that -// bands of perfectly nested loops are isolated. -// Return failure if either perfect interTile or perfect intraTile bands cannot -// be formed. -static LogicalResult tryIsolateBands(const TileLoops &tileLoops) { - LogicalResult status = success(); - const Loops &interTile = tileLoops.first; - const Loops &intraTile = tileLoops.second; - auto size = interTile.size(); - assert(size == intraTile.size()); - if (size <= 1) - return success(); - for (unsigned s = 1; s < size; ++s) - status = succeeded(status) ? hoistOpsBetween(intraTile[0], intraTile[s]) - : failure(); - for (unsigned s = 1; s < size; ++s) - status = succeeded(status) ? hoistOpsBetween(interTile[0], interTile[s]) - : failure(); - return status; -} - -TileLoops mlir::extractFixedOuterLoops(scf::ForOp rootForOp, - ArrayRef sizes) { - // Collect perfectly nested loops. If more size values provided than nested - // loops available, truncate `sizes`. - SmallVector forOps; - forOps.reserve(sizes.size()); - getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size()); - if (forOps.size() < sizes.size()) - sizes = sizes.take_front(forOps.size()); - - // Compute the tile sizes such that i-th outer loop executes size[i] - // iterations. Given that the loop current executes - // numIterations = ceildiv((upperBound - lowerBound), step) - // iterations, we need to tile with size ceildiv(numIterations, size[i]). - SmallVector tileSizes; - tileSizes.reserve(sizes.size()); - for (unsigned i = 0, e = sizes.size(); i < e; ++i) { - assert(sizes[i] > 0 && "expected strictly positive size for strip-mining"); - - auto forOp = forOps[i]; - OpBuilder builder(forOp); - auto loc = forOp.getLoc(); - Value diff = builder.create(loc, forOp.getUpperBound(), - forOp.getLowerBound()); - Value numIterations = ceilDivPositive(builder, loc, diff, forOp.getStep()); - Value iterationsPerBlock = - ceilDivPositive(builder, loc, numIterations, sizes[i]); - tileSizes.push_back(iterationsPerBlock); - } - - // Call parametric tiling with the given sizes. - auto intraTile = tile(forOps, tileSizes, forOps.back()); - TileLoops tileLoops = std::make_pair(forOps, intraTile); - - // TODO: for now we just ignore the result of band isolation. - // In the future, mapping decisions may be impacted by the ability to - // isolate perfectly nested bands. 
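// A worked instance of the sizing math above (assumed bounds): for a loop
// running from 0 to 100 with step 1 and sizes = {10},
//   numIterations      = ceildiv(100 - 0, 1) = 100
//   iterationsPerBlock = ceildiv(100, 10)    = 10
// so the outer (inter-tile) loop executes exactly 10 iterations, as requested.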
- (void)tryIsolateBands(tileLoops); - - return tileLoops; -} - -/// Return the new lower bound, upper bound, and step in that order. Insert any -/// additional bounds calculations before the given builder and any additional -/// conversion back to the original loop induction value inside the given Block. -static LoopParams normalizeLoop(OpBuilder &boundsBuilder, - OpBuilder &insideLoopBuilder, Location loc, - Value lowerBound, Value upperBound, Value step, - Value inductionVar) { - // Check if the loop is already known to have a constant zero lower bound or - // a constant one step. - bool isZeroBased = false; - if (auto ubCst = lowerBound.getDefiningOp()) - isZeroBased = ubCst.value() == 0; - - bool isStepOne = false; - if (auto stepCst = step.getDefiningOp()) - isStepOne = stepCst.value() == 1; - - // Compute the number of iterations the loop executes: ceildiv(ub - lb, step) - // assuming the step is strictly positive. Update the bounds and the step - // of the loop to go from 0 to the number of iterations, if necessary. - // TODO: introduce support for negative steps or emit dynamic asserts - // on step positivity, whatever gets implemented first. - if (isZeroBased && isStepOne) - return {/*lowerBound=*/lowerBound, /*upperBound=*/upperBound, - /*step=*/step}; - - Value diff = boundsBuilder.create(loc, upperBound, lowerBound); - Value newUpperBound = ceilDivPositive(boundsBuilder, loc, diff, step); - - Value newLowerBound = - isZeroBased ? lowerBound - : boundsBuilder.create(loc, 0); - Value newStep = - isStepOne ? step : boundsBuilder.create(loc, 1); - - // Insert code computing the value of the original loop induction variable - // from the "normalized" one. - Value scaled = - isStepOne - ? inductionVar - : insideLoopBuilder.create(loc, inductionVar, step); - Value shifted = - isZeroBased - ? scaled - : insideLoopBuilder.create(loc, scaled, lowerBound); - - SmallPtrSet preserve{scaled.getDefiningOp(), - shifted.getDefiningOp()}; - inductionVar.replaceAllUsesExcept(shifted, preserve); - return {/*lowerBound=*/newLowerBound, /*upperBound=*/newUpperBound, - /*step=*/newStep}; -} - -/// Transform a loop with a strictly positive step -/// for %i = %lb to %ub step %s -/// into a 0-based loop with step 1 -/// for %ii = 0 to ceildiv(%ub - %lb, %s) step 1 { -/// %i = %ii * %s + %lb -/// Insert the induction variable remapping in the body of `inner`, which is -/// expected to be either `loop` or another loop perfectly nested under `loop`. -/// Insert the definition of new bounds immediate before `outer`, which is -/// expected to be either `loop` or its parent in the loop nest. -static void normalizeLoop(scf::ForOp loop, scf::ForOp outer, scf::ForOp inner) { - OpBuilder builder(outer); - OpBuilder innerBuilder = OpBuilder::atBlockBegin(inner.getBody()); - auto loopPieces = normalizeLoop(builder, innerBuilder, loop.getLoc(), - loop.getLowerBound(), loop.getUpperBound(), - loop.getStep(), loop.getInductionVar()); - - loop.setLowerBound(loopPieces.lowerBound); - loop.setUpperBound(loopPieces.upperBound); - loop.setStep(loopPieces.step); -} - -void mlir::coalesceLoops(MutableArrayRef loops) { - if (loops.size() < 2) - return; - - scf::ForOp innermost = loops.back(); - scf::ForOp outermost = loops.front(); - - // 1. Make sure all loops iterate from 0 to upperBound with step 1. This - // allows the following code to assume upperBound is the number of iterations. - for (auto loop : loops) - normalizeLoop(loop, outermost, innermost); - - // 2. 
Emit code computing the upper bound of the coalesced loop as product - // of the number of iterations of all loops. - OpBuilder builder(outermost); - Location loc = outermost.getLoc(); - Value upperBound = outermost.getUpperBound(); - for (auto loop : loops.drop_front()) - upperBound = - builder.create(loc, upperBound, loop.getUpperBound()); - outermost.setUpperBound(upperBound); - - builder.setInsertionPointToStart(outermost.getBody()); - - // 3. Remap induction variables. For each original loop, the value of the - // induction variable can be obtained by dividing the induction variable of - // the linearized loop by the total number of iterations of the loops nested - // in it modulo the number of iterations in this loop (remove the values - // related to the outer loops): - // iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i. - // Compute these iteratively from the innermost loop by creating a "running - // quotient" of division by the range. - Value previous = outermost.getInductionVar(); - for (unsigned i = 0, e = loops.size(); i < e; ++i) { - unsigned idx = loops.size() - i - 1; - if (i != 0) - previous = builder.create(loc, previous, - loops[idx + 1].getUpperBound()); - - Value iv = (i == e - 1) ? previous - : builder.create( - loc, previous, loops[idx].getUpperBound()); - replaceAllUsesInRegionWith(loops[idx].getInductionVar(), iv, - loops.back().getRegion()); + SmallVector res; + for (auto loops : tile(forOps, sizes, ArrayRef(target))) { + assert(loops.size() == 1); + res.push_back(loops[0]); } - - // 4. Move the operations from the innermost just above the second-outermost - // loop, delete the extra terminator and the second-outermost loop. - scf::ForOp second = loops[1]; - innermost.getBody()->back().erase(); - outermost.getBody()->getOperations().splice( - Block::iterator(second.getOperation()), - innermost.getBody()->getOperations()); - second.erase(); + return res; } LogicalResult mlir::coalesceLoops(MutableArrayRef loops) { @@ -2347,89 +1829,6 @@ return success(); } -void mlir::collapseParallelLoops( - scf::ParallelOp loops, ArrayRef> combinedDimensions) { - OpBuilder outsideBuilder(loops); - Location loc = loops.getLoc(); - - // Presort combined dimensions. - auto sortedDimensions = llvm::to_vector<3>(combinedDimensions); - for (auto &dims : sortedDimensions) - std::sort(dims.begin(), dims.end()); - - // Normalize ParallelOp's iteration pattern. - SmallVector normalizedLowerBounds, normalizedSteps, - normalizedUpperBounds; - for (unsigned i = 0, e = loops.getNumLoops(); i < e; ++i) { - OpBuilder insideLoopBuilder = OpBuilder::atBlockBegin(loops.getBody()); - auto resultBounds = - normalizeLoop(outsideBuilder, insideLoopBuilder, loc, - loops.getLowerBound()[i], loops.getUpperBound()[i], - loops.getStep()[i], loops.getBody()->getArgument(i)); - - normalizedLowerBounds.push_back(resultBounds.lowerBound); - normalizedUpperBounds.push_back(resultBounds.upperBound); - normalizedSteps.push_back(resultBounds.step); - } - - // Combine iteration spaces. 
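// A small sketch of the combined space (hypothetical ranges): collapsing two
// normalized dimensions with ranges 4 and 8 yields one loop of range 4 * 8 =
// 32, and iteration %k of the collapsed loop recovers the original IVs as
//   iv1 = %k mod 8, iv0 = %k floordiv 8
// which is exactly the divmod chain materialized below.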
- SmallVector lowerBounds, upperBounds, steps; - auto cst0 = outsideBuilder.create(loc, 0); - auto cst1 = outsideBuilder.create(loc, 1); - for (unsigned i = 0, e = sortedDimensions.size(); i < e; ++i) { - Value newUpperBound = outsideBuilder.create(loc, 1); - for (auto idx : sortedDimensions[i]) { - newUpperBound = outsideBuilder.create( - loc, newUpperBound, normalizedUpperBounds[idx]); - } - lowerBounds.push_back(cst0); - steps.push_back(cst1); - upperBounds.push_back(newUpperBound); - } - - // Create new ParallelLoop with conversions to the original induction values. - // The loop below uses divisions to get the relevant range of values in the - // new induction value that represent each range of the original induction - // value. The remainders then determine based on that range, which iteration - // of the original induction value this represents. This is a normalized value - // that is un-normalized already by the previous logic. - auto newPloop = outsideBuilder.create( - loc, lowerBounds, upperBounds, steps, - [&](OpBuilder &insideBuilder, Location, ValueRange ploopIVs) { - for (unsigned i = 0, e = combinedDimensions.size(); i < e; ++i) { - Value previous = ploopIVs[i]; - unsigned numberCombinedDimensions = combinedDimensions[i].size(); - // Iterate over all except the last induction value. - for (unsigned j = numberCombinedDimensions - 1; j > 0; --j) { - unsigned idx = combinedDimensions[i][j]; - - // Determine the current induction value's current loop iteration - Value iv = insideBuilder.create( - loc, previous, normalizedUpperBounds[idx]); - replaceAllUsesInRegionWith(loops.getBody()->getArgument(idx), iv, - loops.getRegion()); - - // Remove the effect of the current induction value to prepare for - // the next value. - previous = insideBuilder.create( - loc, previous, normalizedUpperBounds[idx]); - } - - // The final induction value is just the remaining value. - unsigned idx = combinedDimensions[i][0]; - replaceAllUsesInRegionWith(loops.getBody()->getArgument(idx), - previous, loops.getRegion()); - } - }); - - // Replace the old loop with the new loop. - loops.getBody()->back().erase(); - newPloop.getBody()->getOperations().splice( - Block::iterator(newPloop.getBody()->back()), - loops.getBody()->getOperations()); - loops.erase(); -} - void mlir::mapLoopToProcessorIds(scf::ForOp forOp, ArrayRef processorId, ArrayRef numProcessors) { assert(processorId.size() == numProcessors.size()); diff --git a/mlir/lib/Dialect/Affine/Utils/Utils.cpp b/mlir/lib/Dialect/Affine/Utils/Utils.cpp --- a/mlir/lib/Dialect/Affine/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Affine/Utils/Utils.cpp @@ -16,12 +16,14 @@ #include "mlir/Dialect/Affine/Analysis/Utils.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Affine/IR/AffineValueMap.h" +#include "mlir/Dialect/Affine/LoopUtils.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/IR/BlockAndValueMapping.h" #include "mlir/IR/Dominance.h" #include "mlir/IR/IntegerSet.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" -#include "mlir/Transforms/LoopUtils.h" + +#define DEBUG_TYPE "affine-utils" using namespace mlir; @@ -856,3 +858,740 @@ defOp->erase(); } } + +// Perform the replacement in `op`. 
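// A hedged usage sketch (hypothetical names): remap a rank-1 access through
// d0 -> d0 + 1 while prepending one extra index, on a single dereferencing op:
//   AffineExpr d0 = builder.getAffineDimExpr(0);
//   (void)replaceAllMemRefUsesWith(oldMemRef, newMemRef, loadOp,
//                                  /*extraIndices=*/{tagIndex},
//                                  /*indexRemap=*/AffineMap::get(1, 0, d0 + 1),
//                                  /*extraOperands=*/{}, /*symbolOperands=*/{},
//                                  /*allowNonDereferencingOps=*/false);
// Here newMemRef must have rank 2 (one remap result plus one extra index).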
+LogicalResult mlir::replaceAllMemRefUsesWith(Value oldMemRef, Value newMemRef, + Operation *op, + ArrayRef extraIndices, + AffineMap indexRemap, + ArrayRef extraOperands, + ArrayRef symbolOperands, + bool allowNonDereferencingOps) { + unsigned newMemRefRank = newMemRef.getType().cast().getRank(); + (void)newMemRefRank; // unused in opt mode + unsigned oldMemRefRank = oldMemRef.getType().cast().getRank(); + (void)oldMemRefRank; // unused in opt mode + if (indexRemap) { + assert(indexRemap.getNumSymbols() == symbolOperands.size() && + "symbolic operand count mismatch"); + assert(indexRemap.getNumInputs() == + extraOperands.size() + oldMemRefRank + symbolOperands.size()); + assert(indexRemap.getNumResults() + extraIndices.size() == newMemRefRank); + } else { + assert(oldMemRefRank + extraIndices.size() == newMemRefRank); + } + + // Assert same elemental type. + assert(oldMemRef.getType().cast().getElementType() == + newMemRef.getType().cast().getElementType()); + + SmallVector usePositions; + for (const auto &opEntry : llvm::enumerate(op->getOperands())) { + if (opEntry.value() == oldMemRef) + usePositions.push_back(opEntry.index()); + } + + // If memref doesn't appear, nothing to do. + if (usePositions.empty()) + return success(); + + if (usePositions.size() > 1) { + // TODO: extend it for this case when needed (rare). + assert(false && "multiple dereferencing uses in a single op not supported"); + return failure(); + } + + unsigned memRefOperandPos = usePositions.front(); + + OpBuilder builder(op); + // The following checks if op is dereferencing memref and performs the access + // index rewrites. + auto affMapAccInterface = dyn_cast(op); + if (!affMapAccInterface) { + if (!allowNonDereferencingOps) { + // Failure: memref used in a non-dereferencing context (potentially + // escapes); no replacement in these cases unless allowNonDereferencingOps + // is set. + return failure(); + } + op->setOperand(memRefOperandPos, newMemRef); + return success(); + } + // Perform index rewrites for the dereferencing op and then replace the op + NamedAttribute oldMapAttrPair = + affMapAccInterface.getAffineMapAttrForMemRef(oldMemRef); + AffineMap oldMap = oldMapAttrPair.getValue().cast().getValue(); + unsigned oldMapNumInputs = oldMap.getNumInputs(); + SmallVector oldMapOperands( + op->operand_begin() + memRefOperandPos + 1, + op->operand_begin() + memRefOperandPos + 1 + oldMapNumInputs); + + // Apply 'oldMemRefOperands = oldMap(oldMapOperands)'. + SmallVector oldMemRefOperands; + SmallVector affineApplyOps; + oldMemRefOperands.reserve(oldMemRefRank); + if (oldMap != builder.getMultiDimIdentityMap(oldMap.getNumDims())) { + for (auto resultExpr : oldMap.getResults()) { + auto singleResMap = AffineMap::get(oldMap.getNumDims(), + oldMap.getNumSymbols(), resultExpr); + auto afOp = builder.create(op->getLoc(), singleResMap, + oldMapOperands); + oldMemRefOperands.push_back(afOp); + affineApplyOps.push_back(afOp); + } + } else { + oldMemRefOperands.assign(oldMapOperands.begin(), oldMapOperands.end()); + } + + // Construct new indices as a remap of the old ones if a remapping has been + // provided. The indices of a memref come right after it, i.e., + // at position memRefOperandPos + 1. 
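// For instance (hypothetical map): with oldMemRefRank == 2 and
//   indexRemap = (d0, d1) -> (d1, d0)
// an access %old[%i, %j] is rewritten to index the new memref as [%j, %i].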
+ SmallVector remapOperands; + remapOperands.reserve(extraOperands.size() + oldMemRefRank + + symbolOperands.size()); + remapOperands.append(extraOperands.begin(), extraOperands.end()); + remapOperands.append(oldMemRefOperands.begin(), oldMemRefOperands.end()); + remapOperands.append(symbolOperands.begin(), symbolOperands.end()); + + SmallVector remapOutputs; + remapOutputs.reserve(oldMemRefRank); + + if (indexRemap && + indexRemap != builder.getMultiDimIdentityMap(indexRemap.getNumDims())) { + // Remapped indices. + for (auto resultExpr : indexRemap.getResults()) { + auto singleResMap = AffineMap::get( + indexRemap.getNumDims(), indexRemap.getNumSymbols(), resultExpr); + auto afOp = builder.create(op->getLoc(), singleResMap, + remapOperands); + remapOutputs.push_back(afOp); + affineApplyOps.push_back(afOp); + } + } else { + // No remapping specified. + remapOutputs.assign(remapOperands.begin(), remapOperands.end()); + } + + SmallVector newMapOperands; + newMapOperands.reserve(newMemRefRank); + + // Prepend 'extraIndices' in 'newMapOperands'. + for (Value extraIndex : extraIndices) { + assert(extraIndex.getDefiningOp()->getNumResults() == 1 && + "single result op's expected to generate these indices"); + assert((isValidDim(extraIndex) || isValidSymbol(extraIndex)) && + "invalid memory op index"); + newMapOperands.push_back(extraIndex); + } + + // Append 'remapOutputs' to 'newMapOperands'. + newMapOperands.append(remapOutputs.begin(), remapOutputs.end()); + + // Create new fully composed AffineMap for new op to be created. + assert(newMapOperands.size() == newMemRefRank); + auto newMap = builder.getMultiDimIdentityMap(newMemRefRank); + // TODO: Avoid creating/deleting temporary AffineApplyOps here. + fullyComposeAffineMapAndOperands(&newMap, &newMapOperands); + newMap = simplifyAffineMap(newMap); + canonicalizeMapAndOperands(&newMap, &newMapOperands); + // Remove any affine.apply's that became dead as a result of composition. + for (Value value : affineApplyOps) + if (value.use_empty()) + value.getDefiningOp()->erase(); + + OperationState state(op->getLoc(), op->getName()); + // Construct the new operation using this memref. + state.operands.reserve(op->getNumOperands() + extraIndices.size()); + // Insert the non-memref operands. + state.operands.append(op->operand_begin(), + op->operand_begin() + memRefOperandPos); + // Insert the new memref value. + state.operands.push_back(newMemRef); + + // Insert the new memref map operands. + state.operands.append(newMapOperands.begin(), newMapOperands.end()); + + // Insert the remaining operands unmodified. + state.operands.append(op->operand_begin() + memRefOperandPos + 1 + + oldMapNumInputs, + op->operand_end()); + + // Result types don't change. Both memref's are of the same elemental type. + state.types.reserve(op->getNumResults()); + for (auto result : op->getResults()) + state.types.push_back(result.getType()); + + // Add attribute for 'newMap', other Attributes do not change. + auto newMapAttr = AffineMapAttr::get(newMap); + for (auto namedAttr : op->getAttrs()) { + if (namedAttr.getName() == oldMapAttrPair.getName()) + state.attributes.push_back({namedAttr.getName(), newMapAttr}); + else + state.attributes.push_back(namedAttr); + } + + // Create the new operation. 
+  auto *repOp = builder.createOperation(state);
+  op->replaceAllUsesWith(repOp);
+  op->erase();
+
+  return success();
+}
+
+LogicalResult mlir::replaceAllMemRefUsesWith(
+    Value oldMemRef, Value newMemRef, ArrayRef<Value> extraIndices,
+    AffineMap indexRemap, ArrayRef<Value> extraOperands,
+    ArrayRef<Value> symbolOperands, Operation *domOpFilter,
+    Operation *postDomOpFilter, bool allowNonDereferencingOps,
+    bool replaceInDeallocOp) {
+  unsigned newMemRefRank = newMemRef.getType().cast<MemRefType>().getRank();
+  (void)newMemRefRank; // unused in opt mode
+  unsigned oldMemRefRank = oldMemRef.getType().cast<MemRefType>().getRank();
+  (void)oldMemRefRank;
+  if (indexRemap) {
+    assert(indexRemap.getNumSymbols() == symbolOperands.size() &&
+           "symbol operand count mismatch");
+    assert(indexRemap.getNumInputs() ==
+           extraOperands.size() + oldMemRefRank + symbolOperands.size());
+    assert(indexRemap.getNumResults() + extraIndices.size() == newMemRefRank);
+  } else {
+    assert(oldMemRefRank + extraIndices.size() == newMemRefRank);
+  }
+
+  // Assert same elemental type.
+  assert(oldMemRef.getType().cast<MemRefType>().getElementType() ==
+         newMemRef.getType().cast<MemRefType>().getElementType());
+
+  std::unique_ptr<DominanceInfo> domInfo;
+  std::unique_ptr<PostDominanceInfo> postDomInfo;
+  if (domOpFilter)
+    domInfo =
+        std::make_unique<DominanceInfo>(domOpFilter->getParentOfType<FuncOp>());
+
+  if (postDomOpFilter)
+    postDomInfo = std::make_unique<PostDominanceInfo>(
+        postDomOpFilter->getParentOfType<FuncOp>());
+
+  // Walk all uses of old memref; collect ops to perform replacement. We use a
+  // DenseSet since an operation could potentially have multiple uses of a
+  // memref (although rare), and the replacement later is going to erase ops.
+  DenseSet<Operation *> opsToReplace;
+  for (auto *op : oldMemRef.getUsers()) {
+    // Skip this use if it's not dominated by domOpFilter.
+    if (domOpFilter && !domInfo->dominates(domOpFilter, op))
+      continue;
+
+    // Skip this use if it's not post-dominated by postDomOpFilter.
+    if (postDomOpFilter && !postDomInfo->postDominates(postDomOpFilter, op))
+      continue;
+
+    // Skip dealloc's - no replacement is necessary, and a memref replacement
+    // at other uses doesn't hurt these dealloc's.
+    if (isa<memref::DeallocOp>(op) && !replaceInDeallocOp)
+      continue;
+
+    // Check if the memref was used in a non-dereferencing context. It is fine
+    // for the memref to be used in a non-dereferencing way outside of the
+    // region where this replacement is happening.
+    if (!isa<AffineMapAccessInterface>(*op)) {
+      if (!allowNonDereferencingOps) {
+        LLVM_DEBUG(llvm::dbgs()
+                   << "Memref replacement failed: non-dereferencing memref op: \n"
+                   << *op << '\n');
+        return failure();
+      }
+      // Non-dereferencing ops with the MemRefsNormalizable trait are
+      // supported for replacement.
+      if (!op->hasTrait<OpTrait::MemRefsNormalizable>()) {
+        LLVM_DEBUG(llvm::dbgs() << "Memref replacement failed: use without a "
+                                   "memrefs normalizable trait: \n"
+                                << *op << '\n');
+        return failure();
+      }
+    }
+
+    // We'll first collect and then replace --- since replacement erases the op
+    // that has the use, and that op could be postDomFilter or domFilter itself!
+    opsToReplace.insert(op);
+  }
+
+  for (auto *op : opsToReplace) {
+    if (failed(replaceAllMemRefUsesWith(
+            oldMemRef, newMemRef, op, extraIndices, indexRemap, extraOperands,
+            symbolOperands, allowNonDereferencingOps)))
+      llvm_unreachable("memref replacement guaranteed to succeed here");
+  }
+
+  return success();
+}
+
+/// Given an operation, inserts one or more single result affine
+/// apply operations, results of which are exclusively used by this
+/// operation. The operands of these newly created affine apply ops are
+/// guaranteed to be loop iterators or terminal symbols of a function.
+///
+/// Before
+///
+/// affine.for %i = 0 to #map(%N)
+///   %idx = affine.apply (d0) -> (d0 mod 2) (%i)
+///   "send"(%idx, %A, ...)
+///   "compute"(%idx)
+///
+/// After
+///
+/// affine.for %i = 0 to #map(%N)
+///   %idx = affine.apply (d0) -> (d0 mod 2) (%i)
+///   "send"(%idx, %A, ...)
+///   %idx_ = affine.apply (d0) -> (d0 mod 2) (%i)
+///   "compute"(%idx_)
+///
+/// This allows applying different transformations on send and compute (e.g.
+/// different shifts/delays).
+///
+/// Returns nothing if none of opInst's operands were the result of an
+/// affine.apply (i.e., there was no affine computation slice to create), or if
+/// all the affine.apply ops supplying operands to this opInst had no uses
+/// other than this opInst; otherwise, the affine.apply operations created are
+/// returned in the output argument `sliceOps`.
+void mlir::createAffineComputationSlice(
+    Operation *opInst, SmallVectorImpl<AffineApplyOp> *sliceOps) {
+  // Collect all operands that are results of affine apply ops.
+  SmallVector<Value, 4> subOperands;
+  subOperands.reserve(opInst->getNumOperands());
+  for (auto operand : opInst->getOperands())
+    if (isa_and_nonnull<AffineApplyOp>(operand.getDefiningOp()))
+      subOperands.push_back(operand);
+
+  // Gather sequence of AffineApplyOps reachable from 'subOperands'.
+  SmallVector<Operation *, 4> affineApplyOps;
+  getReachableAffineApplyOps(subOperands, affineApplyOps);
+  // Skip transforming if there are no affine maps to compose.
+  if (affineApplyOps.empty())
+    return;
+
+  // Check if all uses of the affine apply ops lie only in this op, in
+  // which case there would be nothing to do.
+  bool localized = true;
+  for (auto *op : affineApplyOps) {
+    for (auto result : op->getResults()) {
+      for (auto *user : result.getUsers()) {
+        if (user != opInst) {
+          localized = false;
+          break;
+        }
+      }
+    }
+  }
+  if (localized)
+    return;
+
+  OpBuilder builder(opInst);
+  SmallVector<Value, 4> composedOpOperands(subOperands);
+  auto composedMap = builder.getMultiDimIdentityMap(composedOpOperands.size());
+  fullyComposeAffineMapAndOperands(&composedMap, &composedOpOperands);
+
+  // Create an affine.apply for each of the map results.
+  sliceOps->reserve(composedMap.getNumResults());
+  for (auto resultExpr : composedMap.getResults()) {
+    auto singleResMap = AffineMap::get(composedMap.getNumDims(),
+                                       composedMap.getNumSymbols(), resultExpr);
+    sliceOps->push_back(builder.create<AffineApplyOp>(
+        opInst->getLoc(), singleResMap, composedOpOperands));
+  }
+
+  // Construct the new operands that include the results from the composed
+  // affine apply op above instead of existing ones (subOperands). So, they
+  // differ from opInst's operands only for those operands in 'subOperands', for
+  // which they will be replaced by the corresponding one from 'sliceOps'.
+  SmallVector<Value, 4> newOperands(opInst->getOperands());
+  for (unsigned i = 0, e = newOperands.size(); i < e; i++) {
+    // Replace the subOperands from among the new operands.
+    unsigned j, f;
+    for (j = 0, f = subOperands.size(); j < f; j++) {
+      if (newOperands[i] == subOperands[j])
+        break;
+    }
+    if (j < subOperands.size()) {
+      newOperands[i] = (*sliceOps)[j];
+    }
+  }
+  for (unsigned idx = 0, e = newOperands.size(); idx < e; idx++) {
+    opInst->setOperand(idx, newOperands[idx]);
+  }
+}
+
+/// Enum to set patterns of affine expr in tiled-layout map.
+/// TileFloorDiv: <dim expr> div <tile size>
+/// TileMod: <dim expr> mod <tile size>
+/// TileNone: None of the above
+/// Example:
+/// #tiled_2d_128x256 = affine_map<(d0, d1)
+///                      -> (d0 div 128, d1 div 256, d0 mod 128, d1 mod 256)>
+/// "d0 div 128" and "d1 div 256" ==> TileFloorDiv
+/// "d0 mod 128" and "d1 mod 256" ==> TileMod
+enum TileExprPattern { TileFloorDiv, TileMod, TileNone };
+
+/// Check if `map` is a tiled layout. In a tiled layout, specific k dimensions
+/// that are floordiv'ed by respective tile sizes appear in a mod with the same
+/// tile sizes, and no other expression involves those k dimensions. This
+/// function stores a vector of tuples (`tileSizePos`) including the AffineExpr
+/// for each tile size and the positions of the corresponding `floordiv` and
+/// `mod`. If `map` is not a tiled layout, an empty vector is returned.
+static LogicalResult getTileSizePos(
+    AffineMap map,
+    SmallVectorImpl<std::tuple<AffineExpr, unsigned, unsigned>> &tileSizePos) {
+  // Create `floordivExprs` which is a vector of tuples including LHS and RHS of
+  // `floordiv` and its position in `map` output.
+  // Example: #tiled_2d_128x256 = affine_map<(d0, d1)
+  //                -> (d0 div 128, d1 div 256, d0 mod 128, d1 mod 256)>
+  // In this example, `floordivExprs` includes {d0, 128, 0} and {d1, 256, 1}.
+  SmallVector<std::tuple<AffineExpr, AffineExpr, unsigned>, 4> floordivExprs;
+  unsigned pos = 0;
+  for (AffineExpr expr : map.getResults()) {
+    if (expr.getKind() == AffineExprKind::FloorDiv) {
+      AffineBinaryOpExpr binaryExpr = expr.cast<AffineBinaryOpExpr>();
+      if (binaryExpr.getRHS().isa<AffineConstantExpr>())
+        floordivExprs.emplace_back(
+            std::make_tuple(binaryExpr.getLHS(), binaryExpr.getRHS(), pos));
+    }
+    pos++;
+  }
+  // Not a tiled layout if `floordivExprs` is empty.
+  if (floordivExprs.empty()) {
+    tileSizePos = SmallVector<std::tuple<AffineExpr, unsigned, unsigned>>{};
+    return success();
+  }
+
+  // Check if the LHS of each `floordiv` is used in the LHS of a `mod`. If not,
+  // `map` is not a tiled layout.
+  for (std::tuple<AffineExpr, AffineExpr, unsigned> fexpr : floordivExprs) {
+    AffineExpr floordivExprLHS = std::get<0>(fexpr);
+    AffineExpr floordivExprRHS = std::get<1>(fexpr);
+    unsigned floordivPos = std::get<2>(fexpr);
+
+    // Walk each affine expr of `map`'s output except `fexpr`, and check if the
+    // LHS and RHS of `fexpr` are used in the LHS and RHS of a `mod`. If the LHS
+    // of `fexpr` is used in any other expr, the map is not a tiled layout.
+    // Examples of non-tiled layouts:
+    //   affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2 floordiv 256)>
+    //   affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2 mod 128)>
+    //   affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2 mod 256, d2 mod
+    //   256)>
+    bool found = false;
+    pos = 0;
+    for (AffineExpr expr : map.getResults()) {
+      bool notTiled = false;
+      if (pos != floordivPos) {
+        expr.walk([&](AffineExpr e) {
+          if (e == floordivExprLHS) {
+            if (expr.getKind() == AffineExprKind::Mod) {
+              AffineBinaryOpExpr binaryExpr = expr.cast<AffineBinaryOpExpr>();
+              // If LHS and RHS of `mod` are the same as those of the floordiv.
+              if (floordivExprLHS == binaryExpr.getLHS() &&
+                  floordivExprRHS == binaryExpr.getRHS()) {
+                // Save the tile size (RHS of `mod`), and the positions of
+                // `floordiv` and `mod`, if no matching `mod` was found yet.
+                if (!found) {
+                  tileSizePos.emplace_back(
+                      std::make_tuple(binaryExpr.getRHS(), floordivPos, pos));
+                  found = true;
+                } else {
+                  // Non-tiled layout: multiple `mod`s with the same LHS.
+                  // eg. affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2
+                  // mod 256, d2 mod 256)>
+                  notTiled = true;
+                }
+              } else {
+                // Non-tiled layout: RHS of `mod` differs from the `floordiv`.
+                // eg. affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2
+                // mod 128)>
+                notTiled = true;
+              }
+            } else {
+              // Non-tiled layout: the LHS is the same, but not in a `mod`.
+              // eg. affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2
+              // floordiv 256)>
+              notTiled = true;
+            }
+          }
+        });
+      }
+      if (notTiled) {
+        tileSizePos = SmallVector<std::tuple<AffineExpr, unsigned, unsigned>>{};
+        return success();
+      }
+      pos++;
+    }
+  }
+  return success();
+}
+
+/// Check if the `dim` dimension of a memrefType with `layoutMap` becomes
+/// dynamic after normalization. Dimensions that include dynamic dimensions in
+/// the map output become dynamic dimensions. Return true if `dim` is a dynamic
+/// dimension.
+///
+/// Example:
+/// #map0 = affine_map<(d0, d1) -> (d0, d1 floordiv 32, d1 mod 32)>
+///
+/// If d1 is a dynamic dimension, the 2nd and 3rd dimensions of the map output
+/// are dynamic.
+/// memref<4x?xf32, #map0> ==> memref<4x?x?xf32>
+static bool
+isNormalizedMemRefDynamicDim(unsigned dim, AffineMap layoutMap,
+                             SmallVectorImpl<unsigned> &inMemrefTypeDynDims,
+                             MLIRContext *context) {
+  bool isDynamicDim = false;
+  AffineExpr expr = layoutMap.getResults()[dim];
+  // Check if the affine expr of this dimension includes a dynamic dimension of
+  // the input memrefType.
+  expr.walk([&inMemrefTypeDynDims, &isDynamicDim, &context](AffineExpr e) {
+    if (e.isa<AffineDimExpr>()) {
+      for (unsigned dm : inMemrefTypeDynDims) {
+        if (e == getAffineDimExpr(dm, context)) {
+          isDynamicDim = true;
+        }
+      }
+    }
+  });
+  return isDynamicDim;
+}
+
+/// Create affine expr to calculate dimension size for a tiled-layout map.
+static AffineExpr createDimSizeExprForTiledLayout(AffineExpr oldMapOutput,
+                                                  TileExprPattern pat) {
+  // Create map output for the patterns.
+  // "floordiv <tile size>" ==> "ceildiv <tile size>"
+  // "mod <tile size>" ==> "<tile size>"
+  AffineExpr newMapOutput;
+  AffineBinaryOpExpr binaryExpr = nullptr;
+  switch (pat) {
+  case TileExprPattern::TileMod:
+    binaryExpr = oldMapOutput.cast<AffineBinaryOpExpr>();
+    newMapOutput = binaryExpr.getRHS();
+    break;
+  case TileExprPattern::TileFloorDiv:
+    binaryExpr = oldMapOutput.cast<AffineBinaryOpExpr>();
+    newMapOutput = getAffineBinaryOpExpr(
+        AffineExprKind::CeilDiv, binaryExpr.getLHS(), binaryExpr.getRHS());
+    break;
+  default:
+    newMapOutput = oldMapOutput;
+  }
+  return newMapOutput;
+}
+
+/// Create new maps to calculate each dimension size of `newMemRefType`, and
+/// create `newDynamicSizes` from them by using AffineApplyOp.
+///
+/// Steps for normalizing dynamic memrefs for a tiled layout map.
+/// Example:
+/// #map0 = affine_map<(d0, d1) -> (d0, d1 floordiv 32, d1 mod 32)>
+/// %0 = dim %arg0, %c1 : memref<4x?xf32>
+/// %1 = alloc(%0) : memref<4x?xf32, #map0>
+///
+/// (Before this function)
+/// 1. Check if `map`(#map0) is a tiled layout using `getTileSizePos()`. Only a
+///    single layout map is supported.
+///
+/// 2. Create a normalized memrefType using `isNormalizedMemRefDynamicDim()`. It
+///    is memref<4x?x?xf32> in the above example.
+///
+/// (In this function)
+/// 3. Create new maps to calculate each dimension of the normalized memrefType
+///    using `createDimSizeExprForTiledLayout()`. In the tiled layout, the
+///    dimension size can be calculated by replacing "floordiv <tile size>" with
+///    "ceildiv <tile size>" and "mod <tile size>" with "<tile size>".
+///    - New maps in the above example:
+///      #map0 = affine_map<(d0, d1) -> (d0)>
+///      #map1 = affine_map<(d0, d1) -> (d1 ceildiv 32)>
+///      #map2 = affine_map<(d0, d1) -> (32)>
+///
+/// 4. Create AffineApplyOp to apply the new maps. The output of AffineApplyOp
+///    is used in dynamicSizes of the new AllocOp.
+///    %0 = dim %arg0, %c1 : memref<4x?xf32>
+///    %c4 = arith.constant 4 : index
+///    %1 = affine.apply #map1(%c4, %0)
+///    %2 = affine.apply #map2(%c4, %0)
+static void createNewDynamicSizes(MemRefType oldMemRefType,
+                                  MemRefType newMemRefType, AffineMap map,
+                                  memref::AllocOp *allocOp, OpBuilder b,
+                                  SmallVectorImpl<Value> &newDynamicSizes) {
+  // Create new inputs for the AffineApplyOps.
+  SmallVector<Value, 4> inAffineApply;
+  ArrayRef<int64_t> oldMemRefShape = oldMemRefType.getShape();
+  unsigned dynIdx = 0;
+  for (unsigned d = 0; d < oldMemRefType.getRank(); ++d) {
+    if (oldMemRefShape[d] < 0) {
+      // Use dynamicSizes of allocOp for the dynamic dimension.
+      inAffineApply.emplace_back(allocOp->dynamicSizes()[dynIdx]);
+      dynIdx++;
+    } else {
+      // Create a ConstantOp for the static dimension.
+      Attribute constantAttr =
+          b.getIntegerAttr(b.getIndexType(), oldMemRefShape[d]);
+      inAffineApply.emplace_back(
+          b.create<arith::ConstantOp>(allocOp->getLoc(), constantAttr));
+    }
+  }
+
+  // Create a new map to calculate each dimension size of the new memref for
+  // each original map output. Only for dynamic dimensions of `newMemRefType`.
+  unsigned newDimIdx = 0;
+  ArrayRef<int64_t> newMemRefShape = newMemRefType.getShape();
+  SmallVector<std::tuple<AffineExpr, unsigned, unsigned>> tileSizePos;
+  (void)getTileSizePos(map, tileSizePos);
+  for (AffineExpr expr : map.getResults()) {
+    if (newMemRefShape[newDimIdx] < 0) {
+      // Create new maps to calculate each dimension size of the new memref.
+      enum TileExprPattern pat = TileExprPattern::TileNone;
+      for (auto pos : tileSizePos) {
+        if (newDimIdx == std::get<1>(pos))
+          pat = TileExprPattern::TileFloorDiv;
+        else if (newDimIdx == std::get<2>(pos))
+          pat = TileExprPattern::TileMod;
+      }
+      AffineExpr newMapOutput = createDimSizeExprForTiledLayout(expr, pat);
+      AffineMap newMap =
+          AffineMap::get(map.getNumInputs(), map.getNumSymbols(), newMapOutput);
+      Value affineApp =
+          b.create<AffineApplyOp>(allocOp->getLoc(), newMap, inAffineApply);
+      newDynamicSizes.emplace_back(affineApp);
+    }
+    newDimIdx++;
+  }
+}
+
+// TODO: Currently works for static memrefs with a single layout map.
+LogicalResult mlir::normalizeMemRef(memref::AllocOp *allocOp) {
+  MemRefType memrefType = allocOp->getType();
+  OpBuilder b(*allocOp);
+
+  // Fetch a new memref type after normalizing the old memref to have an
+  // identity map layout.
+  MemRefType newMemRefType =
+      normalizeMemRefType(memrefType, b, allocOp->symbolOperands().size());
+  if (newMemRefType == memrefType)
+    // Either memrefType already had an identity map or the map couldn't be
+    // transformed to an identity map.
+    return failure();
+
+  Value oldMemRef = allocOp->getResult();
+
+  SmallVector<Value, 4> symbolOperands(allocOp->symbolOperands());
+  AffineMap layoutMap = memrefType.getLayout().getAffineMap();
+  memref::AllocOp newAlloc;
+  // Check if `layoutMap` is a tiled layout. Only a single layout map is
+  // supported for normalizing dynamic memrefs.
+  SmallVector<std::tuple<AffineExpr, unsigned, unsigned>> tileSizePos;
+  (void)getTileSizePos(layoutMap, tileSizePos);
+  if (newMemRefType.getNumDynamicDims() > 0 && !tileSizePos.empty()) {
+    MemRefType oldMemRefType = oldMemRef.getType().cast<MemRefType>();
+    SmallVector<Value, 4> newDynamicSizes;
+    createNewDynamicSizes(oldMemRefType, newMemRefType, layoutMap, allocOp, b,
+                          newDynamicSizes);
+    // Add the new dynamic sizes in the new AllocOp.
+    newAlloc =
+        b.create<memref::AllocOp>(allocOp->getLoc(), newMemRefType,
+                                  newDynamicSizes, allocOp->alignmentAttr());
+  } else {
+    newAlloc = b.create<memref::AllocOp>(allocOp->getLoc(), newMemRefType,
+                                         allocOp->alignmentAttr());
+  }
+  // Replace all uses of the old memref.
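// E.g. (hedged sketch), with the tiled layout from the comments above,
//   layoutMap = (d0, d1) -> (d0, d1 floordiv 32, d1 mod 32),
// an access %old[%i, %j] becomes %new[%i, %j floordiv 32, %j mod 32], since
// layoutMap itself is passed as the index remap below.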
+  if (failed(replaceAllMemRefUsesWith(oldMemRef, /*newMemRef=*/newAlloc,
+                                      /*extraIndices=*/{},
+                                      /*indexRemap=*/layoutMap,
+                                      /*extraOperands=*/{},
+                                      /*symbolOperands=*/symbolOperands,
+                                      /*domOpFilter=*/nullptr,
+                                      /*postDomOpFilter=*/nullptr,
+                                      /*allowNonDereferencingOps=*/true))) {
+    // If it failed (due to escapes for example), bail out.
+    newAlloc.erase();
+    return failure();
+  }
+  // Replace any uses of the original alloc op and erase it. All remaining uses
+  // have to be dealloc's; RAMUW above would've failed otherwise.
+  assert(llvm::all_of(oldMemRef.getUsers(), [](Operation *op) {
+    return isa<memref::DeallocOp>(op);
+  }));
+  oldMemRef.replaceAllUsesWith(newAlloc);
+  allocOp->erase();
+  return success();
+}
+
+MemRefType mlir::normalizeMemRefType(MemRefType memrefType, OpBuilder b,
+                                     unsigned numSymbolicOperands) {
+  unsigned rank = memrefType.getRank();
+  if (rank == 0)
+    return memrefType;
+
+  if (memrefType.getLayout().isIdentity()) {
+    // Either no map is associated with this memref or this memref has
+    // a trivial (identity) map.
+    return memrefType;
+  }
+  AffineMap layoutMap = memrefType.getLayout().getAffineMap();
+
+  // We don't do any checks for one-to-one'ness; we assume that it is
+  // one-to-one.
+
+  // Normalize only static memrefs and dynamic memrefs with a tiled-layout map
+  // for now.
+  // TODO: Normalize the other types of dynamic memrefs.
+  SmallVector<std::tuple<AffineExpr, unsigned, unsigned>> tileSizePos;
+  (void)getTileSizePos(layoutMap, tileSizePos);
+  if (memrefType.getNumDynamicDims() > 0 && tileSizePos.empty())
+    return memrefType;
+
+  // We have a single map that is not an identity map. Create a new memref
+  // with the right shape and an identity layout map.
+  ArrayRef<int64_t> shape = memrefType.getShape();
+  // FlatAffineConstraints may later on use symbolicOperands.
+  FlatAffineConstraints fac(rank, numSymbolicOperands);
+  SmallVector<unsigned, 4> memrefTypeDynDims;
+  for (unsigned d = 0; d < rank; ++d) {
+    // Use the constraint system only for static dimensions.
+    if (shape[d] > 0) {
+      fac.addBound(FlatAffineConstraints::LB, d, 0);
+      fac.addBound(FlatAffineConstraints::UB, d, shape[d] - 1);
+    } else {
+      memrefTypeDynDims.emplace_back(d);
+    }
+  }
+  // We compose this map with the original index (logical) space to derive
+  // the upper bounds for the new index space.
+  unsigned newRank = layoutMap.getNumResults();
+  if (failed(fac.composeMatchingMap(layoutMap)))
+    return memrefType;
+  // TODO: Handle semi-affine maps.
+  // Project out the old data dimensions.
+  fac.projectOut(newRank, fac.getNumIds() - newRank - fac.getNumLocalIds());
+  SmallVector<int64_t, 4> newShape(newRank);
+  for (unsigned d = 0; d < newRank; ++d) {
+    // Check whether each dimension of the normalized memrefType is dynamic.
+    bool isDynDim = isNormalizedMemRefDynamicDim(
+        d, layoutMap, memrefTypeDynDims, b.getContext());
+    if (isDynDim) {
+      newShape[d] = -1;
+    } else {
+      // The lower bound for the shape is always zero.
+      auto ubConst = fac.getConstantBound(FlatAffineConstraints::UB, d);
+      // For a static memref and an affine map with no symbols, this is
+      // always bounded.
+      assert(ubConst.hasValue() && "should always have an upper bound");
+      if (ubConst.getValue() < 0)
+        // This is due to an invalid map that maps to a negative space.
+        return memrefType;
+      // If a dimension of the new memrefType is dynamic, the value is -1.
+      newShape[d] = ubConst.getValue() + 1;
+    }
+  }
+
+  // Create the new memref type after trivializing the old layout map.
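// For example (static case, assumed type): with
//   memref<64x128xf32, (d0, d1) -> (d0 floordiv 32, d1, d0 mod 32)>
// the composed bounds give newRank = 3 with upper bounds 1, 127, and 31, so
// the builder below produces memref<2x128x32xf32> with an identity layout.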
+ MemRefType newMemRefType = + MemRefType::Builder(memrefType) + .setShape(newShape) + .setLayout(AffineMapAttr::get(b.getMultiDimIdentityMap(newRank))); + + return newMemRefType; +} diff --git a/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp b/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp --- a/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "mlir/Dialect/GPU/MemoryPromotion.h" +#include "mlir/Dialect/Affine/LoopUtils.h" #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h" #include "mlir/Dialect/GPU/GPUDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" @@ -19,7 +20,6 @@ #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/IR/ImplicitLocOpBuilder.h" #include "mlir/Pass/Pass.h" -#include "mlir/Transforms/LoopUtils.h" using namespace mlir; using namespace mlir::gpu; diff --git a/mlir/lib/Dialect/Linalg/Transforms/CodegenStrategy.cpp b/mlir/lib/Dialect/Linalg/Transforms/CodegenStrategy.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/CodegenStrategy.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/CodegenStrategy.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "mlir/Dialect/Linalg/Transforms/CodegenStrategy.h" - #include "mlir/Dialect/Linalg/Passes.h" #include "mlir/Dialect/Linalg/Transforms/Hoisting.h" #include "mlir/Dialect/SCF/Transforms.h" @@ -20,7 +19,6 @@ #include "mlir/Dialect/Vector/VectorTransforms.h" #include "mlir/Pass/PassManager.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" -#include "mlir/Transforms/LoopUtils.h" #include "mlir/Transforms/Passes.h" using namespace mlir; diff --git a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp @@ -12,7 +12,6 @@ #include "mlir/Dialect/Linalg/Transforms/HoistPadding.h" #include "mlir/Analysis/SliceAnalysis.h" -#include "mlir/Dialect/Affine/Utils.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/Linalg/Transforms/Transforms.h" #include "mlir/Dialect/SCF/SCF.h" @@ -24,7 +23,6 @@ #include "mlir/IR/AsmState.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/Dominance.h" -#include "mlir/Transforms/LoopUtils.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Debug.h" diff --git a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp @@ -15,7 +15,6 @@ #include "mlir/Analysis/SliceAnalysis.h" #include "mlir/Dialect/Affine/Analysis/AffineStructures.h" #include "mlir/Dialect/Affine/IR/AffineValueMap.h" -#include "mlir/Dialect/Affine/Utils.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/Linalg/Transforms/Transforms.h" #include "mlir/Dialect/SCF/SCF.h" @@ -27,7 +26,6 @@ #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/Dominance.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" -#include "mlir/Transforms/LoopUtils.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Debug.h" diff --git a/mlir/lib/Dialect/Linalg/Transforms/LinalgStrategyPasses.cpp b/mlir/lib/Dialect/Linalg/Transforms/LinalgStrategyPasses.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/LinalgStrategyPasses.cpp +++ 
b/mlir/lib/Dialect/Linalg/Transforms/LinalgStrategyPasses.cpp
@@ -16,6 +16,8 @@
 #include "PassDetail.h"
 #include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
+#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/Linalg/Passes.h"
 #include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
@@ -29,9 +31,7 @@
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-#include "mlir/Transforms/LoopUtils.h"
 #include "mlir/Transforms/Passes.h"
-#include "mlir/Transforms/Utils.h"
 
 using namespace mlir;
 using namespace mlir::vector;
@@ -348,7 +348,13 @@
       return signalPassFailure();
     }
 
-    promoteSingleIterationLoops(funcOp);
+    // Gathers all innermost loops through a post order pruned walk.
+    funcOp.walk([](Operation *op) {
+      if (auto forOp = dyn_cast<AffineForOp>(op))
+        (void)promoteIfSingleIteration(forOp);
+      else if (auto forOp = dyn_cast<scf::ForOp>(op))
+        (void)promoteIfSingleIteration(forOp);
+    });
 
     if (options.hoistRedundantVectorTransfers)
       hoistRedundantVectorTransfers(funcOp);
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
@@ -12,7 +12,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
-#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/Dialect/Linalg/Analysis/DependenceAnalysis.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
--- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
@@ -16,6 +16,7 @@
 #include "mlir/Dialect/Affine/Analysis/AffineStructures.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
@@ -31,7 +32,6 @@
 #include "mlir/IR/Matchers.h"
 #include "mlir/IR/OpImplementation.h"
 #include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/LoopUtils.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/Debug.h"
diff --git a/mlir/lib/Dialect/MemRef/Transforms/NormalizeMemRefs.cpp b/mlir/lib/Dialect/MemRef/Transforms/NormalizeMemRefs.cpp
--- a/mlir/lib/Dialect/MemRef/Transforms/NormalizeMemRefs.cpp
+++ b/mlir/lib/Dialect/MemRef/Transforms/NormalizeMemRefs.cpp
@@ -13,9 +13,9 @@
 #include "PassDetail.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/MemRef/Transforms/Passes.h"
-#include "mlir/Transforms/Utils.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/Support/Debug.h"
diff --git a/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt b/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt
--- a/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt
@@ -6,6 +6,7 @@
   LoopPipelining.cpp
   LoopRangeFolding.cpp
   LoopSpecialization.cpp
+  ParallelLoopCollapsing.cpp
   ParallelLoopFusion.cpp
   ParallelLoopTiling.cpp
   StructuralTypeConversions.cpp
diff --git a/mlir/lib/Transforms/ParallelLoopCollapsing.cpp b/mlir/lib/Dialect/SCF/Transforms/ParallelLoopCollapsing.cpp
rename from mlir/lib/Transforms/ParallelLoopCollapsing.cpp
rename to mlir/lib/Dialect/SCF/Transforms/ParallelLoopCollapsing.cpp
--- a/mlir/lib/Transforms/ParallelLoopCollapsing.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/ParallelLoopCollapsing.cpp
@@ -7,9 +7,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "PassDetail.h"
+#include "mlir/Dialect/SCF/Passes.h"
 #include "mlir/Dialect/SCF/SCF.h"
-#include "mlir/Transforms/LoopUtils.h"
-#include "mlir/Transforms/Passes.h"
+#include "mlir/Dialect/SCF/Utils.h"
 #include "mlir/Transforms/RegionUtils.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -20,7 +20,7 @@
 namespace {
 struct ParallelLoopCollapsing
-    : public ParallelLoopCollapsingBase<ParallelLoopCollapsing> {
+    : public SCFParallelLoopCollapsingBase<ParallelLoopCollapsing> {
   void runOnOperation() override {
     Operation *module = getOperation();
diff --git a/mlir/lib/Dialect/SCF/Transforms/Utils.cpp b/mlir/lib/Dialect/SCF/Transforms/Utils.cpp
--- a/mlir/lib/Dialect/SCF/Transforms/Utils.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/Utils.cpp
@@ -11,20 +11,31 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/SCF/Utils.h"
-
+#include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/Dialect/SCF/SCF.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/IR/BlockAndValueMapping.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/PatternMatch.h"
+#include "mlir/Support/MathExtras.h"
 #include "mlir/Transforms/RegionUtils.h"
-
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
 
 using namespace mlir;
 
+namespace {
+// This structure is used to pass and return sets of loop parameters without
+// confusing the order.
+struct LoopParams {
+  Value lowerBound;
+  Value upperBound;
+  Value step;
+};
+} // namespace
+
 scf::ForOp mlir::cloneWithNewYields(OpBuilder &b, scf::ForOp loop,
                                     ValueRange newIterOperands,
                                     ValueRange newYieldedValues,
@@ -230,3 +241,682 @@
   }
   return rootEnclosesPloops;
 }
+
+// Build the IR that performs ceil division of a positive value by a constant:
+//    ceildiv(a, B) = divis(a + (B - 1), B)
+// where divis is rounding-to-zero division.
+static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend,
+                             int64_t divisor) {
+  assert(divisor > 0 && "expected positive divisor");
+  assert(dividend.getType().isIndex() && "expected index-typed value");
+
+  Value divisorMinusOneCst =
+      builder.create<arith::ConstantIndexOp>(loc, divisor - 1);
+  Value divisorCst = builder.create<arith::ConstantIndexOp>(loc, divisor);
+  Value sum = builder.create<arith::AddIOp>(loc, dividend, divisorMinusOneCst);
+  return builder.create<arith::DivSIOp>(loc, sum, divisorCst);
+}
+
+// Build the IR that performs ceil division of a positive value by another
+// positive value:
+//    ceildiv(a, b) = divis(a + (b - 1), b)
+// where divis is rounding-to-zero division.
+static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend,
+                             Value divisor) {
+  assert(dividend.getType().isIndex() && "expected index-typed value");
+
+  Value cstOne = builder.create<arith::ConstantIndexOp>(loc, 1);
+  Value divisorMinusOne = builder.create<arith::SubIOp>(loc, divisor, cstOne);
+  Value sum = builder.create<arith::AddIOp>(loc, dividend, divisorMinusOne);
+  return builder.create<arith::DivSIOp>(loc, sum, divisor);
+}
+
+/// Helper to replace uses of loop carried values (iter_args) and loop
+/// yield values while promoting single iteration scf.for ops.
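+/// For example (a sketch, with hypothetical values):
+///   %r = scf.for ... iter_args(%acc = %init) -> (f32) {
+///     %sum = arith.addf %acc, %x : f32
+///     scf.yield %sum : f32
+///   }
+/// Uses of %acc inside the body are replaced by %init, and uses of %r below
+/// the loop are replaced by %sum.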
+static void replaceIterArgsAndYieldResults(scf::ForOp forOp) {
+  // Replace uses of iter arguments with iter operands (initial values).
+  auto iterOperands = forOp.getIterOperands();
+  auto iterArgs = forOp.getRegionIterArgs();
+  for (auto e : llvm::zip(iterOperands, iterArgs))
+    std::get<1>(e).replaceAllUsesWith(std::get<0>(e));
+
+  // Replace uses of loop results with the values yielded by the loop.
+  auto outerResults = forOp.getResults();
+  auto innerResults = forOp.getBody()->getTerminator()->getOperands();
+  for (auto e : llvm::zip(outerResults, innerResults))
+    std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
+}
+
+/// Promotes the loop body of a forOp to its containing block if it can be
+/// determined that the loop has a single iteration.
+LogicalResult mlir::promoteIfSingleIteration(scf::ForOp forOp) {
+  auto lbCstOp = forOp.getLowerBound().getDefiningOp<arith::ConstantIndexOp>();
+  auto ubCstOp = forOp.getUpperBound().getDefiningOp<arith::ConstantIndexOp>();
+  auto stepCstOp = forOp.getStep().getDefiningOp<arith::ConstantIndexOp>();
+  if (!lbCstOp || !ubCstOp || !stepCstOp || lbCstOp.value() < 0 ||
+      ubCstOp.value() < 0 || stepCstOp.value() < 0)
+    return failure();
+  int64_t tripCount =
+      mlir::ceilDiv(ubCstOp.value() - lbCstOp.value(), stepCstOp.value());
+  if (tripCount != 1)
+    return failure();
+  auto iv = forOp.getInductionVar();
+  iv.replaceAllUsesWith(lbCstOp);
+
+  replaceIterArgsAndYieldResults(forOp);
+
+  // Move the loop body operations, except for its terminator, to the loop's
+  // containing block.
+  auto *parentBlock = forOp->getBlock();
+  forOp.getBody()->getTerminator()->erase();
+  parentBlock->getOperations().splice(Block::iterator(forOp),
+                                      forOp.getBody()->getOperations());
+  forOp.erase();
+  return success();
+}
+
+/// Generates unrolled copies of the scf::ForOp body in 'loopBodyBlock', with
+/// associated induction variable 'forOpIV', by 'unrollFactor', calling
+/// 'ivRemapFn' to remap 'forOpIV' for each unrolled body. If specified,
+/// annotates the Ops in each unrolled iteration using annotateFn.
+static void generateUnrolledLoop(
+    Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,
+    function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
+    function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
+    ValueRange iterArgs, ValueRange yieldedValues) {
+  // Builder to insert unrolled bodies just before the terminator of the body
+  // of 'forOp'.
+  auto builder = OpBuilder::atBlockTerminator(loopBodyBlock);
+
+  if (!annotateFn)
+    annotateFn = [](unsigned, Operation *, OpBuilder) {};
+
+  // Keep a pointer to the last non-terminator operation in the original block
+  // so that we know what to clone (since we are doing this in-place).
+  Block::iterator srcBlockEnd = std::prev(loopBodyBlock->end(), 2);
+
+  // Unroll the contents of 'forOp' (append unrollFactor - 1 additional
+  // copies).
+  SmallVector<Value, 4> lastYielded(yieldedValues);
+
+  for (unsigned i = 1; i < unrollFactor; i++) {
+    BlockAndValueMapping operandMap;
+
+    // Prepare operand map.
+    operandMap.map(iterArgs, lastYielded);
+
+    // If the induction variable is used, create a remapping to the value for
+    // this unrolled instance.
+    if (!forOpIV.use_empty()) {
+      Value ivUnroll = ivRemapFn(i, forOpIV, builder);
+      operandMap.map(forOpIV, ivUnroll);
+    }
+
+    // Clone the original body of 'forOp'.
+    for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd);
+         it++) {
+      Operation *clonedOp = builder.clone(*it, operandMap);
+      annotateFn(i, clonedOp, builder);
+    }
+
+    // Update yielded values.
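+    // The values yielded by this copy become the operand map's images of the
+    // iter_args, so the next unrolled copy (or the final yield) consumes the
+    // results produced here, preserving the iter_args chain.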
+    for (unsigned i = 0, e = lastYielded.size(); i < e; i++)
+      lastYielded[i] = operandMap.lookup(yieldedValues[i]);
+  }
+
+  // Make sure we annotate the Ops in the original body. We do this last so
+  // that any annotations are not copied into the cloned Ops above.
+  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++)
+    annotateFn(0, &*it, builder);
+
+  // Update operands of the yield statement.
+  loopBodyBlock->getTerminator()->setOperands(lastYielded);
+}
+
+/// Unrolls 'forOp' by 'unrollFactor' and returns success if the loop is
+/// unrolled.
+LogicalResult mlir::loopUnrollByFactor(
+    scf::ForOp forOp, uint64_t unrollFactor,
+    function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn) {
+  assert(unrollFactor > 0 && "expected positive unroll factor");
+
+  // Return if the loop body is empty.
+  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
+    return success();
+
+  // Compute tripCount = ceilDiv((upperBound - lowerBound), step) and populate
+  // 'upperBoundUnrolled' and 'stepUnrolled' for static and dynamic cases.
+  OpBuilder boundsBuilder(forOp);
+  auto loc = forOp.getLoc();
+  auto step = forOp.getStep();
+  Value upperBoundUnrolled;
+  Value stepUnrolled;
+  bool generateEpilogueLoop = true;
+
+  auto lbCstOp = forOp.getLowerBound().getDefiningOp<arith::ConstantIndexOp>();
+  auto ubCstOp = forOp.getUpperBound().getDefiningOp<arith::ConstantIndexOp>();
+  auto stepCstOp = forOp.getStep().getDefiningOp<arith::ConstantIndexOp>();
+  if (lbCstOp && ubCstOp && stepCstOp) {
+    // Constant loop bounds computation.
+    int64_t lbCst = lbCstOp.value();
+    int64_t ubCst = ubCstOp.value();
+    int64_t stepCst = stepCstOp.value();
+    assert(lbCst >= 0 && ubCst >= 0 && stepCst >= 0 &&
+           "expected positive loop bounds and step");
+    int64_t tripCount = mlir::ceilDiv(ubCst - lbCst, stepCst);
+
+    if (unrollFactor == 1) {
+      if (tripCount == 1 && failed(promoteIfSingleIteration(forOp)))
+        return failure();
+      return success();
+    }
+
+    int64_t tripCountEvenMultiple = tripCount - (tripCount % unrollFactor);
+    int64_t upperBoundUnrolledCst = lbCst + tripCountEvenMultiple * stepCst;
+    assert(upperBoundUnrolledCst <= ubCst);
+    int64_t stepUnrolledCst = stepCst * unrollFactor;
+
+    // Create constant for 'upperBoundUnrolled' and set epilogue loop flag.
+    generateEpilogueLoop = upperBoundUnrolledCst < ubCst;
+    if (generateEpilogueLoop)
+      upperBoundUnrolled = boundsBuilder.create<arith::ConstantIndexOp>(
+          loc, upperBoundUnrolledCst);
+    else
+      upperBoundUnrolled = ubCstOp;
+
+    // Create constant for 'stepUnrolled'.
+    stepUnrolled = stepCst == stepUnrolledCst
+                       ? step
+                       : boundsBuilder.create<arith::ConstantIndexOp>(
+                             loc, stepUnrolledCst);
+  } else {
+    // Dynamic loop bounds computation.
+    // TODO: Add dynamic asserts for negative lb/ub/step, or
+    // consider using ceilDiv from AffineApplyExpander.
+    auto lowerBound = forOp.getLowerBound();
+    auto upperBound = forOp.getUpperBound();
+    Value diff =
+        boundsBuilder.create<arith::SubIOp>(loc, upperBound, lowerBound);
+    Value tripCount = ceilDivPositive(boundsBuilder, loc, diff, step);
+    Value unrollFactorCst =
+        boundsBuilder.create<arith::ConstantIndexOp>(loc, unrollFactor);
+    Value tripCountRem =
+        boundsBuilder.create<arith::RemSIOp>(loc, tripCount, unrollFactorCst);
+    // Compute tripCountEvenMultiple = tripCount - (tripCount % unrollFactor)
+    Value tripCountEvenMultiple =
+        boundsBuilder.create<arith::SubIOp>(loc, tripCount, tripCountRem);
+    // Compute upperBoundUnrolled = lowerBound + tripCountEvenMultiple * step
+    upperBoundUnrolled = boundsBuilder.create<arith::AddIOp>(
+        loc, lowerBound,
+        boundsBuilder.create<arith::MulIOp>(loc, tripCountEvenMultiple, step));
+    // Scale 'step' by 'unrollFactor'.
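+    // As a worked example (a sketch): lb = 0, ub = 10, step = 1 and
+    // unrollFactor = 4 give tripCount = 10, tripCountRem = 2,
+    // tripCountEvenMultiple = 8, upperBoundUnrolled = 8 and stepUnrolled = 4;
+    // the epilogue loop below then covers iterations 8 and 9.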
+    stepUnrolled =
+        boundsBuilder.create<arith::MulIOp>(loc, step, unrollFactorCst);
+  }
+
+  // Create epilogue clean-up loop starting at 'upperBoundUnrolled'.
+  if (generateEpilogueLoop) {
+    OpBuilder epilogueBuilder(forOp->getContext());
+    epilogueBuilder.setInsertionPoint(forOp->getBlock(),
+                                      std::next(Block::iterator(forOp)));
+    auto epilogueForOp = cast<scf::ForOp>(epilogueBuilder.clone(*forOp));
+    epilogueForOp.setLowerBound(upperBoundUnrolled);
+
+    // Update uses of loop results.
+    auto results = forOp.getResults();
+    auto epilogueResults = epilogueForOp.getResults();
+    auto epilogueIterOperands = epilogueForOp.getIterOperands();
+
+    for (auto e : llvm::zip(results, epilogueResults, epilogueIterOperands)) {
+      std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
+      epilogueForOp->replaceUsesOfWith(std::get<2>(e), std::get<0>(e));
+    }
+    (void)promoteIfSingleIteration(epilogueForOp);
+  }
+
+  // Create unrolled loop.
+  forOp.setUpperBound(upperBoundUnrolled);
+  forOp.setStep(stepUnrolled);
+
+  auto iterArgs = ValueRange(forOp.getRegionIterArgs());
+  auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();
+
+  generateUnrolledLoop(
+      forOp.getBody(), forOp.getInductionVar(), unrollFactor,
+      [&](unsigned i, Value iv, OpBuilder b) {
+        // iv' = iv + step * i;
+        auto stride = b.create<arith::MulIOp>(
+            loc, step, b.create<arith::ConstantIndexOp>(loc, i));
+        return b.create<arith::AddIOp>(loc, iv, stride);
+      },
+      annotateFn, iterArgs, yieldedValues);
+  // Promote the loop body up if this has turned into a single iteration loop.
+  (void)promoteIfSingleIteration(forOp);
+  return success();
+}
+
+/// Return the new lower bound, upper bound, and step in that order. Insert any
+/// additional bounds calculations at the insertion point of `boundsBuilder`,
+/// and any additional conversion back to the original loop induction value
+/// inside the given Block.
+static LoopParams normalizeLoop(OpBuilder &boundsBuilder,
+                                OpBuilder &insideLoopBuilder, Location loc,
+                                Value lowerBound, Value upperBound, Value step,
+                                Value inductionVar) {
+  // Check if the loop is already known to have a constant zero lower bound or
+  // a constant one step.
+  bool isZeroBased = false;
+  if (auto lbCst = lowerBound.getDefiningOp<arith::ConstantIndexOp>())
+    isZeroBased = lbCst.value() == 0;
+
+  bool isStepOne = false;
+  if (auto stepCst = step.getDefiningOp<arith::ConstantIndexOp>())
+    isStepOne = stepCst.value() == 1;
+
+  // Compute the number of iterations the loop executes: ceildiv(ub - lb, step)
+  // assuming the step is strictly positive. Update the bounds and the step
+  // of the loop to go from 0 to the number of iterations, if necessary.
+  // TODO: introduce support for negative steps or emit dynamic asserts
+  // on step positivity, whatever gets implemented first.
+  if (isZeroBased && isStepOne)
+    return {/*lowerBound=*/lowerBound, /*upperBound=*/upperBound,
+            /*step=*/step};
+
+  Value diff = boundsBuilder.create<arith::SubIOp>(loc, upperBound, lowerBound);
+  Value newUpperBound = ceilDivPositive(boundsBuilder, loc, diff, step);
+
+  Value newLowerBound =
+      isZeroBased ? lowerBound
+                  : boundsBuilder.create<arith::ConstantIndexOp>(loc, 0);
+  Value newStep =
+      isStepOne ? step : boundsBuilder.create<arith::ConstantIndexOp>(loc, 1);
+
+  // Insert code computing the value of the original loop induction variable
+  // from the "normalized" one.
+  Value scaled =
+      isStepOne
+          ? inductionVar
+          : insideLoopBuilder.create<arith::MulIOp>(loc, inductionVar, step);
+  Value shifted =
+      isZeroBased
+          ? scaled
+          : insideLoopBuilder.create<arith::AddIOp>(loc, scaled, lowerBound);
+
+  SmallPtrSet<Operation *, 2> preserve{scaled.getDefiningOp(),
+                                       shifted.getDefiningOp()};
+  inductionVar.replaceAllUsesExcept(shifted, preserve);
+  return {/*lowerBound=*/newLowerBound, /*upperBound=*/newUpperBound,
+          /*step=*/newStep};
+}
+
+/// Transform a loop with a strictly positive step
+///   for %i = %lb to %ub step %s
+/// into a 0-based loop with step 1
+///   for %ii = 0 to ceildiv(%ub - %lb, %s) step 1 {
+///     %i = %ii * %s + %lb
+/// Insert the induction variable remapping in the body of `inner`, which is
+/// expected to be either `loop` or another loop perfectly nested under `loop`.
+/// Insert the definition of new bounds immediately before `outer`, which is
+/// expected to be either `loop` or its parent in the loop nest.
+static void normalizeLoop(scf::ForOp loop, scf::ForOp outer, scf::ForOp inner) {
+  OpBuilder builder(outer);
+  OpBuilder innerBuilder = OpBuilder::atBlockBegin(inner.getBody());
+  auto loopPieces = normalizeLoop(builder, innerBuilder, loop.getLoc(),
+                                  loop.getLowerBound(), loop.getUpperBound(),
+                                  loop.getStep(), loop.getInductionVar());
+
+  loop.setLowerBound(loopPieces.lowerBound);
+  loop.setUpperBound(loopPieces.upperBound);
+  loop.setStep(loopPieces.step);
+}
+
+void mlir::coalesceLoops(MutableArrayRef<scf::ForOp> loops) {
+  if (loops.size() < 2)
+    return;
+
+  scf::ForOp innermost = loops.back();
+  scf::ForOp outermost = loops.front();
+
+  // 1. Make sure all loops iterate from 0 to upperBound with step 1. This
+  // allows the following code to assume upperBound is the number of
+  // iterations.
+  for (auto loop : loops)
+    normalizeLoop(loop, outermost, innermost);
+
+  // 2. Emit code computing the upper bound of the coalesced loop as product
+  // of the number of iterations of all loops.
+  OpBuilder builder(outermost);
+  Location loc = outermost.getLoc();
+  Value upperBound = outermost.getUpperBound();
+  for (auto loop : loops.drop_front())
+    upperBound =
+        builder.create<arith::MulIOp>(loc, upperBound, loop.getUpperBound());
+  outermost.setUpperBound(upperBound);
+
+  builder.setInsertionPointToStart(outermost.getBody());
+
+  // 3. Remap induction variables. For each original loop, the value of the
+  // induction variable can be obtained by dividing the induction variable of
+  // the linearized loop by the total number of iterations of the loops nested
+  // in it modulo the number of iterations in this loop (remove the values
+  // related to the outer loops):
+  //   iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
+  // Compute these iteratively from the innermost loop by creating a "running
+  // quotient" of division by the range.
+  Value previous = outermost.getInductionVar();
+  for (unsigned i = 0, e = loops.size(); i < e; ++i) {
+    unsigned idx = loops.size() - i - 1;
+    if (i != 0)
+      previous = builder.create<arith::DivSIOp>(loc, previous,
+                                                loops[idx + 1].getUpperBound());
+
+    Value iv = (i == e - 1) ? previous
+                            : builder.create<arith::RemSIOp>(
+                                  loc, previous, loops[idx].getUpperBound());
+    replaceAllUsesInRegionWith(loops[idx].getInductionVar(), iv,
+                               loops.back().getRegion());
+  }
+
+  // 4. Move the operations from the innermost just above the second-outermost
+  // loop, delete the extra terminator and the second-outermost loop.
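+  // Net effect (a sketch): a 2-d nest with normalized ranges N and M becomes
+  // a single loop of N * M iterations whose induction variable %iv is mapped
+  // back to the original induction variables as %iv floordiv M and %iv mod M.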
+  scf::ForOp second = loops[1];
+  innermost.getBody()->back().erase();
+  outermost.getBody()->getOperations().splice(
+      Block::iterator(second.getOperation()),
+      innermost.getBody()->getOperations());
+  second.erase();
+}
+
+void mlir::collapseParallelLoops(
+    scf::ParallelOp loops, ArrayRef<std::vector<unsigned>> combinedDimensions) {
+  OpBuilder outsideBuilder(loops);
+  Location loc = loops.getLoc();
+
+  // Presort combined dimensions.
+  auto sortedDimensions = llvm::to_vector<3>(combinedDimensions);
+  for (auto &dims : sortedDimensions)
+    std::sort(dims.begin(), dims.end());
+
+  // Normalize ParallelOp's iteration pattern.
+  SmallVector<Value, 3> normalizedLowerBounds, normalizedSteps,
+      normalizedUpperBounds;
+  for (unsigned i = 0, e = loops.getNumLoops(); i < e; ++i) {
+    OpBuilder insideLoopBuilder = OpBuilder::atBlockBegin(loops.getBody());
+    auto resultBounds =
+        normalizeLoop(outsideBuilder, insideLoopBuilder, loc,
+                      loops.getLowerBound()[i], loops.getUpperBound()[i],
+                      loops.getStep()[i], loops.getBody()->getArgument(i));
+
+    normalizedLowerBounds.push_back(resultBounds.lowerBound);
+    normalizedUpperBounds.push_back(resultBounds.upperBound);
+    normalizedSteps.push_back(resultBounds.step);
+  }
+
+  // Combine iteration spaces.
+  SmallVector<Value, 3> lowerBounds, upperBounds, steps;
+  auto cst0 = outsideBuilder.create<arith::ConstantIndexOp>(loc, 0);
+  auto cst1 = outsideBuilder.create<arith::ConstantIndexOp>(loc, 1);
+  for (unsigned i = 0, e = sortedDimensions.size(); i < e; ++i) {
+    Value newUpperBound = outsideBuilder.create<arith::ConstantIndexOp>(loc, 1);
+    for (auto idx : sortedDimensions[i]) {
+      newUpperBound = outsideBuilder.create<arith::MulIOp>(
+          loc, newUpperBound, normalizedUpperBounds[idx]);
+    }
+    lowerBounds.push_back(cst0);
+    steps.push_back(cst1);
+    upperBounds.push_back(newUpperBound);
+  }
+
+  // Create new ParallelLoop with conversions to the original induction values.
+  // The loop below uses divisions to recover, from each new induction value,
+  // the range of values that encodes each original induction value; the
+  // remainder within that range then identifies the iteration of the original
+  // induction value. The recovered value is still normalized and is
+  // un-normalized by the conversion code inserted when the loops were
+  // normalized above.
+  auto newPloop = outsideBuilder.create<scf::ParallelOp>(
+      loc, lowerBounds, upperBounds, steps,
+      [&](OpBuilder &insideBuilder, Location, ValueRange ploopIVs) {
+        for (unsigned i = 0, e = combinedDimensions.size(); i < e; ++i) {
+          Value previous = ploopIVs[i];
+          unsigned numberCombinedDimensions = combinedDimensions[i].size();
+          // Iterate over all except the last induction value.
+          for (unsigned j = numberCombinedDimensions - 1; j > 0; --j) {
+            unsigned idx = combinedDimensions[i][j];
+
+            // Determine the current induction value's iteration within its
+            // original loop's range.
+            Value iv = insideBuilder.create<arith::RemSIOp>(
+                loc, previous, normalizedUpperBounds[idx]);
+            replaceAllUsesInRegionWith(loops.getBody()->getArgument(idx), iv,
+                                       loops.getRegion());
+
+            // Remove the effect of the current induction value to prepare for
+            // the next value.
+            previous = insideBuilder.create<arith::DivSIOp>(
+                loc, previous, normalizedUpperBounds[idx]);
+          }
+
+          // The final induction value is just the remaining value.
+          unsigned idx = combinedDimensions[i][0];
+          replaceAllUsesInRegionWith(loops.getBody()->getArgument(idx),
+                                     previous, loops.getRegion());
+        }
+      });
+
+  // Replace the old loop with the new loop.
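+  // For example (a sketch): collapsing dimensions {0, 1} of a (4, 8) parallel
+  // loop yields a single dimension of 32 iterations, recovered in the body as
+  // %iv floordiv 8 and %iv mod 8 by the divisions emitted above.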
+  loops.getBody()->back().erase();
+  newPloop.getBody()->getOperations().splice(
+      Block::iterator(newPloop.getBody()->back()),
+      loops.getBody()->getOperations());
+  loops.erase();
+}
+
+// Hoist the ops within `outer` that appear before `inner`.
+// Such ops include the ops that have been introduced by parametric tiling.
+// Ops that come from triangular loops (i.e. that belong to the program slice
+// rooted at `outer`) and ops that have side effects cannot be hoisted.
+// Returns failure when any op fails to hoist.
+static LogicalResult hoistOpsBetween(scf::ForOp outer, scf::ForOp inner) {
+  SetVector<Operation *> forwardSlice;
+  getForwardSlice(
+      outer.getInductionVar(), &forwardSlice,
+      [&inner](Operation *op) { return op != inner.getOperation(); });
+  LogicalResult status = success();
+  SmallVector<Operation *, 8> toHoist;
+  for (auto &op : outer.getBody()->without_terminator()) {
+    // Stop when encountering the inner loop.
+    if (&op == inner.getOperation())
+      break;
+    // Skip over non-hoistable ops.
+    if (forwardSlice.count(&op) > 0) {
+      status = failure();
+      continue;
+    }
+    // Skip intermediate scf::ForOps; these are not considered a failure.
+    if (isa<scf::ForOp>(op))
+      continue;
+    // Skip other ops with regions.
+    if (op.getNumRegions() > 0) {
+      status = failure();
+      continue;
+    }
+    // Skip if op has side effects.
+    // TODO: loads to immutable memory regions are ok.
+    if (!MemoryEffectOpInterface::hasNoEffect(&op)) {
+      status = failure();
+      continue;
+    }
+    toHoist.push_back(&op);
+  }
+  auto *outerForOp = outer.getOperation();
+  for (auto *op : toHoist)
+    op->moveBefore(outerForOp);
+  return status;
+}
+
+// Traverse the interTile and intraTile loops and try to hoist ops such that
+// bands of perfectly nested loops are isolated.
+// Returns failure if either perfect interTile or perfect intraTile bands
+// cannot be formed.
+static LogicalResult tryIsolateBands(const TileLoops &tileLoops) {
+  LogicalResult status = success();
+  const Loops &interTile = tileLoops.first;
+  const Loops &intraTile = tileLoops.second;
+  auto size = interTile.size();
+  assert(size == intraTile.size());
+  if (size <= 1)
+    return success();
+  for (unsigned s = 1; s < size; ++s)
+    status = succeeded(status) ? hoistOpsBetween(intraTile[0], intraTile[s])
+                               : failure();
+  for (unsigned s = 1; s < size; ++s)
+    status = succeeded(status) ? hoistOpsBetween(interTile[0], interTile[s])
+                               : failure();
+  return status;
+}
+
+/// Collect perfectly nested loops starting from `rootForOp`. Loops are
+/// perfectly nested if each loop is the first and only non-terminator
+/// operation in the parent loop. Collect at most `maxLoops` loops and append
+/// them to `forOps`.
+template <typename T>
+static void getPerfectlyNestedLoopsImpl(
+    SmallVectorImpl<T> &forOps, T rootForOp,
+    unsigned maxLoops = std::numeric_limits<unsigned>::max()) {
+  for (unsigned i = 0; i < maxLoops; ++i) {
+    forOps.push_back(rootForOp);
+    Block &body = rootForOp.getRegion().front();
+    if (body.begin() != std::prev(body.end(), 2))
+      return;
+
+    rootForOp = dyn_cast<T>(&body.front());
+    if (!rootForOp)
+      return;
+  }
+}
+
+static Loops stripmineSink(scf::ForOp forOp, Value factor,
+                           ArrayRef<scf::ForOp> targets) {
+  auto originalStep = forOp.getStep();
+  auto iv = forOp.getInductionVar();
+
+  OpBuilder b(forOp);
+  forOp.setStep(b.create<arith::MulIOp>(forOp.getLoc(), originalStep, factor));
+
+  Loops innerLoops;
+  for (auto t : targets) {
+    // Save information for splicing ops out of t when done.
+    auto begin = t.getBody()->begin();
+    auto nOps = t.getBody()->getOperations().size();
+
+    // Insert newForOp before the terminator of `t`.
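+    // The new inner loop runs from the current outer induction value to
+    // min(outer upper bound, iv + scaled outer step) with the original step,
+    // i.e. it covers exactly one tile (a sketch of the intent of the ops
+    // below).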
+    auto b = OpBuilder::atBlockTerminator(t.getBody());
+    Value stepped = b.create<arith::AddIOp>(t.getLoc(), iv, forOp.getStep());
+    Value less = b.create<arith::CmpIOp>(t.getLoc(), arith::CmpIPredicate::slt,
+                                         forOp.getUpperBound(), stepped);
+    Value ub = b.create<SelectOp>(t.getLoc(), less, forOp.getUpperBound(),
+                                  stepped);
+
+    // Splice [begin, begin + nOps - 1) into `newForOp` and replace uses.
+    auto newForOp = b.create<scf::ForOp>(t.getLoc(), iv, ub, originalStep);
+    newForOp.getBody()->getOperations().splice(
+        newForOp.getBody()->getOperations().begin(),
+        t.getBody()->getOperations(), begin, std::next(begin, nOps - 1));
+    replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
+                               newForOp.getRegion());
+
+    innerLoops.push_back(newForOp);
+  }
+
+  return innerLoops;
+}
+
+// Stripmines a `forOp` by `factor` and sinks it under a single `target`.
+// Returns the new for operation, nested immediately under `target`.
+template <typename SizeType>
+static scf::ForOp stripmineSink(scf::ForOp forOp, SizeType factor,
+                                scf::ForOp target) {
+  // TODO: Use cheap structural assertions that targets are nested under
+  // forOp and that targets are not nested under each other when DominanceInfo
+  // exposes the capability. It seems overkill to construct a whole function
+  // dominance tree at this point.
+  auto res = stripmineSink(forOp, factor, ArrayRef<scf::ForOp>(target));
+  assert(res.size() == 1 && "Expected 1 inner forOp");
+  return res[0];
+}
+
+SmallVector<Loops, 8> mlir::tile(ArrayRef<scf::ForOp> forOps,
+                                 ArrayRef<Value> sizes,
+                                 ArrayRef<scf::ForOp> targets) {
+  SmallVector<SmallVector<scf::ForOp, 8>, 8> res;
+  SmallVector<scf::ForOp, 8> currentTargets(targets.begin(), targets.end());
+  for (auto it : llvm::zip(forOps, sizes)) {
+    auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
+    res.push_back(step);
+    currentTargets = step;
+  }
+  return res;
+}
+
+Loops mlir::tile(ArrayRef<scf::ForOp> forOps, ArrayRef<Value> sizes,
+                 scf::ForOp target) {
+  SmallVector<scf::ForOp, 8> res;
+  for (auto loops : tile(forOps, sizes, ArrayRef<scf::ForOp>(target))) {
+    assert(loops.size() == 1);
+    res.push_back(loops[0]);
+  }
+  return res;
+}
+
+Loops mlir::tilePerfectlyNested(scf::ForOp rootForOp, ArrayRef<Value> sizes) {
+  // Collect perfectly nested loops. If more size values provided than nested
+  // loops available, truncate `sizes`.
+  SmallVector<scf::ForOp, 4> forOps;
+  forOps.reserve(sizes.size());
+  getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size());
+  if (forOps.size() < sizes.size())
+    sizes = sizes.take_front(forOps.size());
+
+  return ::tile(forOps, sizes, forOps.back());
+}
+
+void mlir::getPerfectlyNestedLoops(SmallVectorImpl<scf::ForOp> &nestedLoops,
+                                   scf::ForOp root) {
+  getPerfectlyNestedLoopsImpl(nestedLoops, root);
+}
+
+TileLoops mlir::extractFixedOuterLoops(scf::ForOp rootForOp,
+                                       ArrayRef<int64_t> sizes) {
+  // Collect perfectly nested loops. If more size values provided than nested
+  // loops available, truncate `sizes`.
+  SmallVector<scf::ForOp, 4> forOps;
+  forOps.reserve(sizes.size());
+  getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size());
+  if (forOps.size() < sizes.size())
+    sizes = sizes.take_front(forOps.size());
+
+  // Compute the tile sizes such that the i-th outer loop executes size[i]
+  // iterations. Given that the loop currently executes
+  //   numIterations = ceildiv((upperBound - lowerBound), step)
+  // iterations, we need to tile with size ceildiv(numIterations, size[i]).
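+  // For example (a sketch): numIterations = 100 and sizes[i] = 4 give
+  // iterationsPerBlock = ceildiv(100, 4) = 25, so the i-th outer loop ends up
+  // executing exactly 4 iterations.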
+  SmallVector<Value, 4> tileSizes;
+  tileSizes.reserve(sizes.size());
+  for (unsigned i = 0, e = sizes.size(); i < e; ++i) {
+    assert(sizes[i] > 0 && "expected strictly positive size for strip-mining");
+
+    auto forOp = forOps[i];
+    OpBuilder builder(forOp);
+    auto loc = forOp.getLoc();
+    Value diff = builder.create<arith::SubIOp>(loc, forOp.getUpperBound(),
+                                               forOp.getLowerBound());
+    Value numIterations = ceilDivPositive(builder, loc, diff, forOp.getStep());
+    Value iterationsPerBlock =
+        ceilDivPositive(builder, loc, numIterations, sizes[i]);
+    tileSizes.push_back(iterationsPerBlock);
+  }
+
+  // Call parametric tiling with the given sizes.
+  auto intraTile = tile(forOps, tileSizes, forOps.back());
+  TileLoops tileLoops = std::make_pair(forOps, intraTile);
+
+  // TODO: for now we just ignore the result of band isolation.
+  // In the future, mapping decisions may be impacted by the ability to
+  // isolate perfectly nested bands.
+  (void)tryIsolateBands(tileLoops);
+
+  return tileLoops;
+}
diff --git a/mlir/lib/Dialect/Vector/VectorTransferSplitRewritePatterns.cpp b/mlir/lib/Dialect/Vector/VectorTransferSplitRewritePatterns.cpp
--- a/mlir/lib/Dialect/Vector/VectorTransferSplitRewritePatterns.cpp
+++ b/mlir/lib/Dialect/Vector/VectorTransferSplitRewritePatterns.cpp
@@ -14,7 +14,6 @@
 #include <type_traits>
 
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
-#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
diff --git a/mlir/lib/Dialect/Vector/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/VectorTransforms.cpp
--- a/mlir/lib/Dialect/Vector/VectorTransforms.cpp
+++ b/mlir/lib/Dialect/Vector/VectorTransforms.cpp
@@ -13,7 +13,6 @@
 #include <type_traits>
 
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
-#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
diff --git a/mlir/lib/Dialect/Vector/VectorUnrollDistribute.cpp b/mlir/lib/Dialect/Vector/VectorUnrollDistribute.cpp
--- a/mlir/lib/Dialect/Vector/VectorUnrollDistribute.cpp
+++ b/mlir/lib/Dialect/Vector/VectorUnrollDistribute.cpp
@@ -11,7 +11,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
-#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Vector/VectorTransforms.h"
 #include "mlir/IR/ImplicitLocOpBuilder.h"
 #include "mlir/Interfaces/VectorInterfaces.h"
diff --git a/mlir/lib/Interfaces/LoopLikeInterface.cpp b/mlir/lib/Interfaces/LoopLikeInterface.cpp
--- a/mlir/lib/Interfaces/LoopLikeInterface.cpp
+++ b/mlir/lib/Interfaces/LoopLikeInterface.cpp
@@ -7,12 +7,95 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Interfaces/LoopLikeInterface.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Support/Debug.h"
 
 using namespace mlir;
 
+#define DEBUG_TYPE "loop-like"
+
 //===----------------------------------------------------------------------===//
 // LoopLike Interfaces
 //===----------------------------------------------------------------------===//
 
 /// Include the definitions of the loop-like interfaces.
#include "mlir/Interfaces/LoopLikeInterface.cpp.inc" + +//===----------------------------------------------------------------------===// +// LoopLike Utilities +//===----------------------------------------------------------------------===// + +// Checks whether the given op can be hoisted by checking that +// - the op and any of its contained operations do not depend on SSA values +// defined inside of the loop (by means of calling definedOutside). +// - the op has no side-effects. If sideEffecting is Never, sideeffects of this +// op and its nested ops are ignored. +static bool canBeHoisted(Operation *op, + function_ref definedOutside) { + // Check that dependencies are defined outside of loop. + if (!llvm::all_of(op->getOperands(), definedOutside)) + return false; + // Check whether this op is side-effect free. If we already know that there + // can be no side-effects because the surrounding op has claimed so, we can + // (and have to) skip this step. + if (auto memInterface = dyn_cast(op)) { + if (!memInterface.hasNoEffect()) + return false; + // If the operation doesn't have side effects and it doesn't recursively + // have side effects, it can always be hoisted. + if (!op->hasTrait()) + return true; + + // Otherwise, if the operation doesn't provide the memory effect interface + // and it doesn't have recursive side effects we treat it conservatively as + // side-effecting. + } else if (!op->hasTrait()) { + return false; + } + + // Recurse into the regions for this op and check whether the contained ops + // can be hoisted. + for (auto ®ion : op->getRegions()) { + for (auto &block : region) { + for (auto &innerOp : block) + if (!canBeHoisted(&innerOp, definedOutside)) + return false; + } + } + return true; +} + +LogicalResult mlir::moveLoopInvariantCode(LoopLikeOpInterface looplike) { + auto &loopBody = looplike.getLoopBody(); + + // We use two collections here as we need to preserve the order for insertion + // and this is easiest. + SmallPtrSet willBeMovedSet; + SmallVector opsToMove; + + // Helper to check whether an operation is loop invariant wrt. SSA properties. + auto isDefinedOutsideOfBody = [&](Value value) { + auto *definingOp = value.getDefiningOp(); + return (definingOp && !!willBeMovedSet.count(definingOp)) || + looplike.isDefinedOutsideOfLoop(value); + }; + + // Do not use walk here, as we do not want to go into nested regions and hoist + // operations from there. These regions might have semantics unknown to this + // rewriting. If the nested regions are loops, they will have been processed. + for (auto &block : loopBody) { + for (auto &op : block.without_terminator()) { + if (canBeHoisted(&op, isDefinedOutsideOfBody)) { + opsToMove.push_back(&op); + willBeMovedSet.insert(&op); + } + } + } + + // For all instructions that we found to be invariant, move outside of the + // loop. 
+  LogicalResult result = looplike.moveOutOfLoop(opsToMove);
+  LLVM_DEBUG(looplike.print(llvm::dbgs() << "\n\nModified loop:\n"));
+  return result;
+}
diff --git a/mlir/lib/Transforms/CMakeLists.txt b/mlir/lib/Transforms/CMakeLists.txt
--- a/mlir/lib/Transforms/CMakeLists.txt
+++ b/mlir/lib/Transforms/CMakeLists.txt
@@ -6,12 +6,8 @@
   CSE.cpp
   Inliner.cpp
   LocationSnapshot.cpp
-  LoopCoalescing.cpp
-  LoopFusion.cpp
   LoopInvariantCodeMotion.cpp
   OpStats.cpp
-  ParallelLoopCollapsing.cpp
-  PipelineDataTransfer.cpp
   SCCP.cpp
   StripDebugInfo.cpp
   SymbolDCE.cpp
@@ -21,18 +17,13 @@
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Transforms
 
   DEPENDS
-  MLIRStandardOpsIncGen
   MLIRTransformsPassIncGen
 
   LINK_LIBS PUBLIC
-  MLIRAffine
   MLIRAnalysis
   MLIRCopyOpInterface
   MLIRLoopLikeInterface
-  MLIRMemRef
-  MLIRSCF
   MLIRPass
   MLIRSupport
   MLIRTransformUtils
-  MLIRVector
  )
diff --git a/mlir/lib/Transforms/CSE.cpp b/mlir/lib/Transforms/CSE.cpp
--- a/mlir/lib/Transforms/CSE.cpp
+++ b/mlir/lib/Transforms/CSE.cpp
@@ -15,7 +15,6 @@
 #include "mlir/IR/Dominance.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/Passes.h"
-#include "mlir/Transforms/Utils.h"
 #include "llvm/ADT/DenseMapInfo.h"
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/ScopedHashTable.h"
diff --git a/mlir/lib/Transforms/ControlFlowSink.cpp b/mlir/lib/Transforms/ControlFlowSink.cpp
--- a/mlir/lib/Transforms/ControlFlowSink.cpp
+++ b/mlir/lib/Transforms/ControlFlowSink.cpp
@@ -16,8 +16,8 @@
 #include "PassDetail.h"
 #include "mlir/IR/Dominance.h"
 #include "mlir/Interfaces/ControlFlowInterfaces.h"
+#include "mlir/Transforms/ControlFlowSinkUtils.h"
 #include "mlir/Transforms/Passes.h"
-#include "mlir/Transforms/Utils.h"
 
 using namespace mlir;
 
diff --git a/mlir/lib/Transforms/LoopInvariantCodeMotion.cpp b/mlir/lib/Transforms/LoopInvariantCodeMotion.cpp
--- a/mlir/lib/Transforms/LoopInvariantCodeMotion.cpp
+++ b/mlir/lib/Transforms/LoopInvariantCodeMotion.cpp
@@ -11,13 +11,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "PassDetail.h"
-#include "mlir/Transforms/Passes.h"
-
 #include "mlir/IR/Builders.h"
-#include "mlir/IR/BuiltinOps.h"
 #include "mlir/Interfaces/LoopLikeInterface.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
-#include "mlir/Transforms/LoopUtils.h"
+#include "mlir/Transforms/Passes.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -34,80 +31,6 @@
 };
 } // namespace
 
-// Checks whether the given op can be hoisted by checking that
-// - the op and any of its contained operations do not depend on SSA values
-//   defined inside of the loop (by means of calling definedOutside).
-// - the op has no side-effects. If sideEffecting is Never, sideeffects of this
-//   op and its nested ops are ignored.
-static bool canBeHoisted(Operation *op,
-                         function_ref<bool(Value)> definedOutside) {
-  // Check that dependencies are defined outside of loop.
-  if (!llvm::all_of(op->getOperands(), definedOutside))
-    return false;
-  // Check whether this op is side-effect free. If we already know that there
-  // can be no side-effects because the surrounding op has claimed so, we can
-  // (and have to) skip this step.
-  if (auto memInterface = dyn_cast<MemoryEffectOpInterface>(op)) {
-    if (!memInterface.hasNoEffect())
-      return false;
-    // If the operation doesn't have side effects and it doesn't recursively
-    // have side effects, it can always be hoisted.
-    if (!op->hasTrait<OpTrait::HasRecursiveSideEffects>())
-      return true;
-
-    // Otherwise, if the operation doesn't provide the memory effect interface
-    // and it doesn't have recursive side effects we treat it conservatively as
-    // side-effecting.
-  } else if (!op->hasTrait<OpTrait::HasRecursiveSideEffects>()) {
-    return false;
-  }
-
-  // Recurse into the regions for this op and check whether the contained ops
-  // can be hoisted.
-  for (auto &region : op->getRegions()) {
-    for (auto &block : region) {
-      for (auto &innerOp : block)
-        if (!canBeHoisted(&innerOp, definedOutside))
-          return false;
-    }
-  }
-  return true;
-}
-
-LogicalResult mlir::moveLoopInvariantCode(LoopLikeOpInterface looplike) {
-  auto &loopBody = looplike.getLoopBody();
-
-  // We use two collections here as we need to preserve the order for insertion
-  // and this is easiest.
-  SmallPtrSet<Operation *, 8> willBeMovedSet;
-  SmallVector<Operation *, 8> opsToMove;
-
-  // Helper to check whether an operation is loop invariant wrt. SSA properties.
-  auto isDefinedOutsideOfBody = [&](Value value) {
-    auto *definingOp = value.getDefiningOp();
-    return (definingOp && !!willBeMovedSet.count(definingOp)) ||
-           looplike.isDefinedOutsideOfLoop(value);
-  };
-
-  // Do not use walk here, as we do not want to go into nested regions and hoist
-  // operations from there. These regions might have semantics unknown to this
-  // rewriting. If the nested regions are loops, they will have been processed.
-  for (auto &block : loopBody) {
-    for (auto &op : block.without_terminator()) {
-      if (canBeHoisted(&op, isDefinedOutsideOfBody)) {
-        opsToMove.push_back(&op);
-        willBeMovedSet.insert(&op);
-      }
-    }
-  }
-
-  // For all instructions that we found to be invariant, move outside of the
-  // loop.
-  auto result = looplike.moveOutOfLoop(opsToMove);
-  LLVM_DEBUG(looplike.print(llvm::dbgs() << "\n\nModified loop:\n"));
-  return result;
-}
-
 void LoopInvariantCodeMotion::runOnOperation() {
   // Walk through all loops in a function in innermost-loop-first order.
This // way, we first LICM from the inner loop, and place the ops in diff --git a/mlir/lib/Transforms/PassDetail.h b/mlir/lib/Transforms/PassDetail.h --- a/mlir/lib/Transforms/PassDetail.h +++ b/mlir/lib/Transforms/PassDetail.h @@ -13,23 +13,8 @@ #include "mlir/Transforms/Passes.h" namespace mlir { -class AffineDialect; - -// Forward declaration from Dialect.h -template -void registerDialect(DialectRegistry ®istry); - -namespace arith { -class ArithmeticDialect; -} // namespace arith - -namespace memref { -class MemRefDialect; -} // namespace memref - #define GEN_PASS_CLASSES #include "mlir/Transforms/Passes.h.inc" - } // namespace mlir #endif // TRANSFORMS_PASSDETAIL_H_ diff --git a/mlir/lib/Transforms/Utils/CMakeLists.txt b/mlir/lib/Transforms/Utils/CMakeLists.txt --- a/mlir/lib/Transforms/Utils/CMakeLists.txt +++ b/mlir/lib/Transforms/Utils/CMakeLists.txt @@ -4,25 +4,12 @@ FoldUtils.cpp GreedyPatternRewriteDriver.cpp InliningUtils.cpp - LoopFusionUtils.cpp - LoopUtils.cpp RegionUtils.cpp - Utils.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Transforms - DEPENDS - MLIRStandardOpsIncGen - LINK_LIBS PUBLIC - MLIRAffine - MLIRArithmetic MLIRAnalysis - MLIRAffineAnalysis - MLIRMemRef - MLIRSCF - MLIRPass MLIRRewrite - MLIRStandard ) diff --git a/mlir/lib/Transforms/Utils/ControlFlowSinkUtils.cpp b/mlir/lib/Transforms/Utils/ControlFlowSinkUtils.cpp --- a/mlir/lib/Transforms/Utils/ControlFlowSinkUtils.cpp +++ b/mlir/lib/Transforms/Utils/ControlFlowSinkUtils.cpp @@ -18,10 +18,10 @@ // //===----------------------------------------------------------------------===// +#include "mlir/Transforms/ControlFlowSinkUtils.h" #include "mlir/IR/Dominance.h" #include "mlir/IR/Matchers.h" #include "mlir/Interfaces/ControlFlowInterfaces.h" -#include "mlir/Transforms/Utils.h" #include #define DEBUG_TYPE "cf-sink" diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -13,7 +13,6 @@ #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/FunctionInterfaces.h" #include "mlir/Rewrite/PatternApplicator.h" -#include "mlir/Transforms/Utils.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" diff --git a/mlir/lib/Transforms/Utils/Utils.cpp b/mlir/lib/Transforms/Utils/Utils.cpp deleted file mode 100644 --- a/mlir/lib/Transforms/Utils/Utils.cpp +++ /dev/null @@ -1,767 +0,0 @@ -//===- Utils.cpp ---- Misc utilities for code and data transformation -----===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements miscellaneous transformation routines for non-loop IR -// structures. 
-// -//===----------------------------------------------------------------------===// - -#include "mlir/Transforms/Utils.h" -#include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h" -#include "mlir/Dialect/Affine/Analysis/AffineStructures.h" -#include "mlir/Dialect/Affine/Analysis/Utils.h" -#include "mlir/Dialect/Affine/IR/AffineOps.h" -#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/BuiltinOps.h" -#include "mlir/IR/Dominance.h" -#include "mlir/Support/MathExtras.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/TypeSwitch.h" - -#define DEBUG_TYPE "transforms-utils" - -using namespace mlir; - -// Perform the replacement in `op`. -LogicalResult mlir::replaceAllMemRefUsesWith(Value oldMemRef, Value newMemRef, - Operation *op, - ArrayRef extraIndices, - AffineMap indexRemap, - ArrayRef extraOperands, - ArrayRef symbolOperands, - bool allowNonDereferencingOps) { - unsigned newMemRefRank = newMemRef.getType().cast().getRank(); - (void)newMemRefRank; // unused in opt mode - unsigned oldMemRefRank = oldMemRef.getType().cast().getRank(); - (void)oldMemRefRank; // unused in opt mode - if (indexRemap) { - assert(indexRemap.getNumSymbols() == symbolOperands.size() && - "symbolic operand count mismatch"); - assert(indexRemap.getNumInputs() == - extraOperands.size() + oldMemRefRank + symbolOperands.size()); - assert(indexRemap.getNumResults() + extraIndices.size() == newMemRefRank); - } else { - assert(oldMemRefRank + extraIndices.size() == newMemRefRank); - } - - // Assert same elemental type. - assert(oldMemRef.getType().cast().getElementType() == - newMemRef.getType().cast().getElementType()); - - SmallVector usePositions; - for (const auto &opEntry : llvm::enumerate(op->getOperands())) { - if (opEntry.value() == oldMemRef) - usePositions.push_back(opEntry.index()); - } - - // If memref doesn't appear, nothing to do. - if (usePositions.empty()) - return success(); - - if (usePositions.size() > 1) { - // TODO: extend it for this case when needed (rare). - assert(false && "multiple dereferencing uses in a single op not supported"); - return failure(); - } - - unsigned memRefOperandPos = usePositions.front(); - - OpBuilder builder(op); - // The following checks if op is dereferencing memref and performs the access - // index rewrites. - auto affMapAccInterface = dyn_cast(op); - if (!affMapAccInterface) { - if (!allowNonDereferencingOps) { - // Failure: memref used in a non-dereferencing context (potentially - // escapes); no replacement in these cases unless allowNonDereferencingOps - // is set. - return failure(); - } - op->setOperand(memRefOperandPos, newMemRef); - return success(); - } - // Perform index rewrites for the dereferencing op and then replace the op - NamedAttribute oldMapAttrPair = - affMapAccInterface.getAffineMapAttrForMemRef(oldMemRef); - AffineMap oldMap = oldMapAttrPair.getValue().cast().getValue(); - unsigned oldMapNumInputs = oldMap.getNumInputs(); - SmallVector oldMapOperands( - op->operand_begin() + memRefOperandPos + 1, - op->operand_begin() + memRefOperandPos + 1 + oldMapNumInputs); - - // Apply 'oldMemRefOperands = oldMap(oldMapOperands)'. 
- SmallVector oldMemRefOperands; - SmallVector affineApplyOps; - oldMemRefOperands.reserve(oldMemRefRank); - if (oldMap != builder.getMultiDimIdentityMap(oldMap.getNumDims())) { - for (auto resultExpr : oldMap.getResults()) { - auto singleResMap = AffineMap::get(oldMap.getNumDims(), - oldMap.getNumSymbols(), resultExpr); - auto afOp = builder.create(op->getLoc(), singleResMap, - oldMapOperands); - oldMemRefOperands.push_back(afOp); - affineApplyOps.push_back(afOp); - } - } else { - oldMemRefOperands.assign(oldMapOperands.begin(), oldMapOperands.end()); - } - - // Construct new indices as a remap of the old ones if a remapping has been - // provided. The indices of a memref come right after it, i.e., - // at position memRefOperandPos + 1. - SmallVector remapOperands; - remapOperands.reserve(extraOperands.size() + oldMemRefRank + - symbolOperands.size()); - remapOperands.append(extraOperands.begin(), extraOperands.end()); - remapOperands.append(oldMemRefOperands.begin(), oldMemRefOperands.end()); - remapOperands.append(symbolOperands.begin(), symbolOperands.end()); - - SmallVector remapOutputs; - remapOutputs.reserve(oldMemRefRank); - - if (indexRemap && - indexRemap != builder.getMultiDimIdentityMap(indexRemap.getNumDims())) { - // Remapped indices. - for (auto resultExpr : indexRemap.getResults()) { - auto singleResMap = AffineMap::get( - indexRemap.getNumDims(), indexRemap.getNumSymbols(), resultExpr); - auto afOp = builder.create(op->getLoc(), singleResMap, - remapOperands); - remapOutputs.push_back(afOp); - affineApplyOps.push_back(afOp); - } - } else { - // No remapping specified. - remapOutputs.assign(remapOperands.begin(), remapOperands.end()); - } - - SmallVector newMapOperands; - newMapOperands.reserve(newMemRefRank); - - // Prepend 'extraIndices' in 'newMapOperands'. - for (Value extraIndex : extraIndices) { - assert(extraIndex.getDefiningOp()->getNumResults() == 1 && - "single result op's expected to generate these indices"); - assert((isValidDim(extraIndex) || isValidSymbol(extraIndex)) && - "invalid memory op index"); - newMapOperands.push_back(extraIndex); - } - - // Append 'remapOutputs' to 'newMapOperands'. - newMapOperands.append(remapOutputs.begin(), remapOutputs.end()); - - // Create new fully composed AffineMap for new op to be created. - assert(newMapOperands.size() == newMemRefRank); - auto newMap = builder.getMultiDimIdentityMap(newMemRefRank); - // TODO: Avoid creating/deleting temporary AffineApplyOps here. - fullyComposeAffineMapAndOperands(&newMap, &newMapOperands); - newMap = simplifyAffineMap(newMap); - canonicalizeMapAndOperands(&newMap, &newMapOperands); - // Remove any affine.apply's that became dead as a result of composition. - for (Value value : affineApplyOps) - if (value.use_empty()) - value.getDefiningOp()->erase(); - - OperationState state(op->getLoc(), op->getName()); - // Construct the new operation using this memref. - state.operands.reserve(op->getNumOperands() + extraIndices.size()); - // Insert the non-memref operands. - state.operands.append(op->operand_begin(), - op->operand_begin() + memRefOperandPos); - // Insert the new memref value. - state.operands.push_back(newMemRef); - - // Insert the new memref map operands. - state.operands.append(newMapOperands.begin(), newMapOperands.end()); - - // Insert the remaining operands unmodified. - state.operands.append(op->operand_begin() + memRefOperandPos + 1 + - oldMapNumInputs, - op->operand_end()); - - // Result types don't change. Both memref's are of the same elemental type. 
- state.types.reserve(op->getNumResults()); - for (auto result : op->getResults()) - state.types.push_back(result.getType()); - - // Add attribute for 'newMap', other Attributes do not change. - auto newMapAttr = AffineMapAttr::get(newMap); - for (auto namedAttr : op->getAttrs()) { - if (namedAttr.getName() == oldMapAttrPair.getName()) - state.attributes.push_back({namedAttr.getName(), newMapAttr}); - else - state.attributes.push_back(namedAttr); - } - - // Create the new operation. - auto *repOp = builder.createOperation(state); - op->replaceAllUsesWith(repOp); - op->erase(); - - return success(); -} - -LogicalResult mlir::replaceAllMemRefUsesWith( - Value oldMemRef, Value newMemRef, ArrayRef extraIndices, - AffineMap indexRemap, ArrayRef extraOperands, - ArrayRef symbolOperands, Operation *domOpFilter, - Operation *postDomOpFilter, bool allowNonDereferencingOps, - bool replaceInDeallocOp) { - unsigned newMemRefRank = newMemRef.getType().cast().getRank(); - (void)newMemRefRank; // unused in opt mode - unsigned oldMemRefRank = oldMemRef.getType().cast().getRank(); - (void)oldMemRefRank; - if (indexRemap) { - assert(indexRemap.getNumSymbols() == symbolOperands.size() && - "symbol operand count mismatch"); - assert(indexRemap.getNumInputs() == - extraOperands.size() + oldMemRefRank + symbolOperands.size()); - assert(indexRemap.getNumResults() + extraIndices.size() == newMemRefRank); - } else { - assert(oldMemRefRank + extraIndices.size() == newMemRefRank); - } - - // Assert same elemental type. - assert(oldMemRef.getType().cast().getElementType() == - newMemRef.getType().cast().getElementType()); - - std::unique_ptr domInfo; - std::unique_ptr postDomInfo; - if (domOpFilter) - domInfo = - std::make_unique(domOpFilter->getParentOfType()); - - if (postDomOpFilter) - postDomInfo = std::make_unique( - postDomOpFilter->getParentOfType()); - - // Walk all uses of old memref; collect ops to perform replacement. We use a - // DenseSet since an operation could potentially have multiple uses of a - // memref (although rare), and the replacement later is going to erase ops. - DenseSet opsToReplace; - for (auto *op : oldMemRef.getUsers()) { - // Skip this use if it's not dominated by domOpFilter. - if (domOpFilter && !domInfo->dominates(domOpFilter, op)) - continue; - - // Skip this use if it's not post-dominated by postDomOpFilter. - if (postDomOpFilter && !postDomInfo->postDominates(postDomOpFilter, op)) - continue; - - // Skip dealloc's - no replacement is necessary, and a memref replacement - // at other uses doesn't hurt these dealloc's. - if (isa(op) && !replaceInDeallocOp) - continue; - - // Check if the memref was used in a non-dereferencing context. It is fine - // for the memref to be used in a non-dereferencing way outside of the - // region where this replacement is happening. - if (!isa(*op)) { - if (!allowNonDereferencingOps) { - LLVM_DEBUG(llvm::dbgs() - << "Memref replacement failed: non-deferencing memref op: \n" - << *op << '\n'); - return failure(); - } - // Non-dereferencing ops with the MemRefsNormalizable trait are - // supported for replacement. - if (!op->hasTrait()) { - LLVM_DEBUG(llvm::dbgs() << "Memref replacement failed: use without a " - "memrefs normalizable trait: \n" - << *op << '\n'); - return failure(); - } - } - - // We'll first collect and then replace --- since replacement erases the op - // that has the use, and that op could be postDomFilter or domFilter itself! 
-
-/// Given an operation, inserts one or more single-result affine apply
-/// operations, the results of which are exclusively used by this operation.
-/// The operands of these newly created affine apply ops are guaranteed to be
-/// loop iterators or terminal symbols of a function.
-///
-/// Before
-///
-/// affine.for %i = 0 to #map(%N)
-///   %idx = affine.apply (d0) -> (d0 mod 2) (%i)
-///   "send"(%idx, %A, ...)
-///   "compute"(%idx)
-///
-/// After
-///
-/// affine.for %i = 0 to #map(%N)
-///   %idx = affine.apply (d0) -> (d0 mod 2) (%i)
-///   "send"(%idx, %A, ...)
-///   %idx_ = affine.apply (d0) -> (d0 mod 2) (%i)
-///   "compute"(%idx_)
-///
-/// This allows applying different transformations on send and compute (e.g.,
-/// different shifts/delays).
-///
-/// Leaves `sliceOps` empty either if none of opInst's operands were the result
-/// of an affine.apply (i.e., there was no affine computation slice to create),
-/// or if all the affine.apply ops supplying operands to opInst had no uses
-/// besides opInst; otherwise, returns the list of affine.apply operations
-/// created in the output argument `sliceOps`.
-void mlir::createAffineComputationSlice(
-    Operation *opInst, SmallVectorImpl<AffineApplyOp> *sliceOps) {
-  // Collect all operands that are results of affine apply ops.
-  SmallVector<Value, 4> subOperands;
-  subOperands.reserve(opInst->getNumOperands());
-  for (auto operand : opInst->getOperands())
-    if (isa_and_nonnull<AffineApplyOp>(operand.getDefiningOp()))
-      subOperands.push_back(operand);
-
-  // Gather the sequence of AffineApplyOps reachable from 'subOperands'.
-  SmallVector<Operation *, 4> affineApplyOps;
-  getReachableAffineApplyOps(subOperands, affineApplyOps);
-  // Skip transforming if there are no affine maps to compose.
-  if (affineApplyOps.empty())
-    return;
-
-  // Check if all uses of the affine apply ops lie only in this op, in which
-  // case there would be nothing to do.
-  bool localized = true;
-  for (auto *op : affineApplyOps) {
-    for (auto result : op->getResults()) {
-      for (auto *user : result.getUsers()) {
-        if (user != opInst) {
-          localized = false;
-          break;
-        }
-      }
-    }
-  }
-  if (localized)
-    return;
-
-  OpBuilder builder(opInst);
-  SmallVector<Value, 4> composedOpOperands(subOperands);
-  auto composedMap = builder.getMultiDimIdentityMap(composedOpOperands.size());
-  fullyComposeAffineMapAndOperands(&composedMap, &composedOpOperands);
-
-  // Create an affine.apply for each of the map results.
-  sliceOps->reserve(composedMap.getNumResults());
-  for (auto resultExpr : composedMap.getResults()) {
-    auto singleResMap = AffineMap::get(
-        composedMap.getNumDims(), composedMap.getNumSymbols(), resultExpr);
-    sliceOps->push_back(builder.create<AffineApplyOp>(
-        opInst->getLoc(), singleResMap, composedOpOperands));
-  }
-
-  // Construct the new operands, which include the results from the composed
-  // affine apply ops above instead of the existing ones (subOperands). They
-  // differ from opInst's operands only for those operands in 'subOperands',
-  // which are replaced by the corresponding one from 'sliceOps'.
-  SmallVector<Value, 4> newOperands(opInst->getOperands());
-  for (unsigned i = 0, e = newOperands.size(); i < e; i++) {
-    // Replace the subOperands from among the new operands.
-    unsigned j, f;
-    for (j = 0, f = subOperands.size(); j < f; j++) {
-      if (newOperands[i] == subOperands[j])
-        break;
-    }
-    if (j < subOperands.size()) {
-      newOperands[i] = (*sliceOps)[j];
-    }
-  }
-  for (unsigned idx = 0, e = newOperands.size(); idx < e; idx++) {
-    opInst->setOperand(idx, newOperands[idx]);
-  }
-}
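[Reviewer note, not part of the patch] A hedged usage sketch for the slicing utility above; `opInst` stands for any operation whose index operands are produced by affine.apply ops:

  SmallVector<AffineApplyOp, 4> sliceOps;
  createAffineComputationSlice(opInst, &sliceOps);
  if (!sliceOps.empty()) {
    // `opInst` now reads its indices from private single-result affine.apply
    // ops, so it can be shifted or delayed independently of other users of
    // the original affine.apply results.
  }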
-
-/// Enum to classify patterns of affine exprs in a tiled-layout map.
-/// TileFloorDiv: <dim expr> div <tile size>
-/// TileMod: <dim expr> mod <tile size>
-/// TileNone: None of the above
-/// Example:
-/// #tiled_2d_128x256 = affine_map<(d0, d1)
-///                     -> (d0 div 128, d1 div 256, d0 mod 128, d1 mod 256)>
-/// "d0 div 128" and "d1 div 256" ==> TileFloorDiv
-/// "d0 mod 128" and "d1 mod 256" ==> TileMod
-enum TileExprPattern { TileFloorDiv, TileMod, TileNone };
-
-/// Check if `map` is a tiled layout. In a tiled layout, specific k dimensions
-/// that are floordiv'ed by respective tile sizes appear in a mod with the same
-/// tile sizes, and no other expression involves those k dimensions. This
-/// function stores a vector of tuples (`tileSizePos`), each containing the
-/// AffineExpr for a tile size and the positions of the corresponding
-/// `floordiv` and `mod`. If `map` is not a tiled layout, an empty vector is
-/// returned.
-static LogicalResult getTileSizePos(
-    AffineMap map,
-    SmallVectorImpl<std::tuple<AffineExpr, unsigned, unsigned>> &tileSizePos) {
-  // Create `floordivExprs`, a vector of tuples holding the LHS and RHS of each
-  // `floordiv` and its position in the `map` output.
-  // Example: #tiled_2d_128x256 = affine_map<(d0, d1)
-  //   -> (d0 div 128, d1 div 256, d0 mod 128, d1 mod 256)>
-  // In this example, `floordivExprs` includes {d0, 128, 0} and {d1, 256, 1}.
-  SmallVector<std::tuple<AffineExpr, AffineExpr, unsigned>, 4> floordivExprs;
-  unsigned pos = 0;
-  for (AffineExpr expr : map.getResults()) {
-    if (expr.getKind() == AffineExprKind::FloorDiv) {
-      AffineBinaryOpExpr binaryExpr = expr.cast<AffineBinaryOpExpr>();
-      if (binaryExpr.getRHS().isa<AffineConstantExpr>())
-        floordivExprs.emplace_back(
-            std::make_tuple(binaryExpr.getLHS(), binaryExpr.getRHS(), pos));
-    }
-    pos++;
-  }
-  // Not a tiled layout if `floordivExprs` is empty.
-  if (floordivExprs.empty()) {
-    tileSizePos = SmallVector<std::tuple<AffineExpr, unsigned, unsigned>>{};
-    return success();
-  }
-
-  // Check if the LHS of each `floordiv` is used in the LHS of a `mod`. If not,
-  // `map` is not a tiled layout.
-  for (std::tuple<AffineExpr, AffineExpr, unsigned> fexpr : floordivExprs) {
-    AffineExpr floordivExprLHS = std::get<0>(fexpr);
-    AffineExpr floordivExprRHS = std::get<1>(fexpr);
-    unsigned floordivPos = std::get<2>(fexpr);
-
-    // Walk each affine expr of the `map` output except `fexpr`, and check if
-    // the LHS and RHS of `fexpr` are used in the LHS and RHS of a `mod`. If
-    // the LHS of `fexpr` is used in any other expr, the map is not a tiled
-    // layout. Examples of non-tiled layouts:
-    //   affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2 floordiv 256)>
-    //   affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2 mod 128)>
-    //   affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2 mod 256,
-    //                               d2 mod 256)>
-    bool found = false;
-    pos = 0;
-    for (AffineExpr expr : map.getResults()) {
-      bool notTiled = false;
-      if (pos != floordivPos) {
-        expr.walk([&](AffineExpr e) {
-          if (e == floordivExprLHS) {
-            if (expr.getKind() == AffineExprKind::Mod) {
-              AffineBinaryOpExpr binaryExpr = expr.cast<AffineBinaryOpExpr>();
-              // Check if the LHS and RHS of the `mod` are the same as those of
-              // the `floordiv`.
-              if (floordivExprLHS == binaryExpr.getLHS() &&
-                  floordivExprRHS == binaryExpr.getRHS()) {
-                // Save the tile size (RHS of `mod`) and the positions of the
-                // `floordiv` and `mod` if no matching `mod` was found yet.
-                if (!found) {
-                  tileSizePos.emplace_back(
-                      std::make_tuple(binaryExpr.getRHS(), floordivPos, pos));
-                  found = true;
-                } else {
-                  // Non-tiled layout: multiple `mod`s with the same LHS.
-                  // e.g. affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256,
-                  //                                  d2 mod 256, d2 mod 256)>
-                  notTiled = true;
-                }
-              } else {
-                // Non-tiled layout: the RHS of the `mod` differs from that of
-                // the `floordiv`.
-                // e.g. affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256,
-                //                                  d2 mod 128)>
-                notTiled = true;
-              }
-            } else {
-              // Non-tiled layout: the LHS is the same, but not in a `mod`.
-              // e.g. affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256,
-              //                                  d2 floordiv 256)>
-              notTiled = true;
-            }
-          }
-        });
-      }
-      if (notTiled) {
-        tileSizePos = SmallVector<std::tuple<AffineExpr, unsigned, unsigned>>{};
-        return success();
-      }
-      pos++;
-    }
-  }
-  return success();
-}
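[Reviewer note, not part of the patch] Reading the contract above on the running 128x256 example; positions index into the map's results, and `tiledMap` is an assumed AffineMap built from the layout shown:

  // #tiled = affine_map<(d0, d1) -> (d0 floordiv 128, d1 floordiv 256,
  //                                  d0 mod 128, d1 mod 256)>
  SmallVector<std::tuple<AffineExpr, unsigned, unsigned>> tileSizePos;
  (void)getTileSizePos(tiledMap, tileSizePos);
  // Expect: {(128, /*floordiv at*/0, /*mod at*/2), (256, 1, 3)}.
  // A non-tiled layout leaves `tileSizePos` empty instead.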
-
-/// Check if the `dim` dimension of a memrefType with `layoutMap` becomes
-/// dynamic after normalization. Dimensions that include dynamic dimensions in
-/// the map output will become dynamic dimensions. Return true if `dim` is a
-/// dynamic dimension.
-///
-/// Example:
-/// #map0 = affine_map<(d0, d1) -> (d0, d1 floordiv 32, d1 mod 32)>
-///
-/// If d1 is a dynamic dimension, the 2nd and 3rd dimensions of the map output
-/// are dynamic.
-/// memref<4x?xf32, #map0>  ==>  memref<4x?x?xf32>
-static bool
-isNormalizedMemRefDynamicDim(unsigned dim, AffineMap layoutMap,
-                             SmallVectorImpl<unsigned> &inMemrefTypeDynDims,
-                             MLIRContext *context) {
-  bool isDynamicDim = false;
-  AffineExpr expr = layoutMap.getResults()[dim];
-  // Check if the affine expr of this dimension includes a dynamic dimension of
-  // the input memrefType.
-  expr.walk([&inMemrefTypeDynDims, &isDynamicDim, &context](AffineExpr e) {
-    if (e.isa<AffineDimExpr>()) {
-      for (unsigned dm : inMemrefTypeDynDims) {
-        if (e == getAffineDimExpr(dm, context)) {
-          isDynamicDim = true;
-        }
-      }
-    }
-  });
-  return isDynamicDim;
-}
-
-/// Create an affine expr to calculate the dimension size for a tiled-layout
-/// map.
-static AffineExpr createDimSizeExprForTiledLayout(AffineExpr oldMapOutput,
-                                                  TileExprPattern pat) {
-  // Create the map output for the patterns.
-  // "floordiv <tile size>" ==> "ceildiv <tile size>"
-  // "mod <tile size>" ==> "<tile size>"
-  AffineExpr newMapOutput;
-  AffineBinaryOpExpr binaryExpr = nullptr;
-  switch (pat) {
-  case TileExprPattern::TileMod:
-    binaryExpr = oldMapOutput.cast<AffineBinaryOpExpr>();
-    newMapOutput = binaryExpr.getRHS();
-    break;
-  case TileExprPattern::TileFloorDiv:
-    binaryExpr = oldMapOutput.cast<AffineBinaryOpExpr>();
-    newMapOutput = getAffineBinaryOpExpr(
-        AffineExprKind::CeilDiv, binaryExpr.getLHS(), binaryExpr.getRHS());
-    break;
-  default:
-    newMapOutput = oldMapOutput;
-  }
-  return newMapOutput;
-}
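[Reviewer note, not part of the patch] A small worked instance of the rewrite rules above, with `ctx` an assumed MLIRContext*; both helpers come straight from this file:

  AffineExpr d1 = getAffineDimExpr(1, ctx);
  // TileFloorDiv: "d1 floordiv 32" becomes the tile count "d1 ceildiv 32".
  AffineExpr tiles =
      createDimSizeExprForTiledLayout(d1.floorDiv(32), TileFloorDiv);
  // TileMod: "d1 mod 32" becomes the constant tile size "32".
  AffineExpr size = createDimSizeExprForTiledLayout(d1 % 32, TileMod);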
-
-/// Create new maps to calculate each dimension size of `newMemRefType`, and
-/// create `newDynamicSizes` from them by using AffineApplyOp.
-///
-/// Steps for normalizing dynamic memrefs for a tiled layout map.
-/// Example:
-///   #map0 = affine_map<(d0, d1) -> (d0, d1 floordiv 32, d1 mod 32)>
-///   %0 = dim %arg0, %c1 : memref<4x?xf32>
-///   %1 = alloc(%0) : memref<4x?xf32, #map0>
-///
-/// (Before this function)
-/// 1. Check if `map`(#map0) is a tiled layout using `getTileSizePos()`. Only a
-///    single layout map is supported.
-///
-/// 2. Create a normalized memrefType using `isNormalizedMemRefDynamicDim()`.
-///    It is memref<4x?x?xf32> in the above example.
-///
-/// (In this function)
-/// 3. Create new maps to calculate each dimension of the normalized memrefType
-///    using `createDimSizeExprForTiledLayout()`. In the tiled layout, the
-///    dimension size can be calculated by replacing "floordiv <tile size>"
-///    with "ceildiv <tile size>" and "mod <tile size>" with "<tile size>".
-///    - New maps in the above example:
-///      #map0 = affine_map<(d0, d1) -> (d0)>
-///      #map1 = affine_map<(d0, d1) -> (d1 ceildiv 32)>
-///      #map2 = affine_map<(d0, d1) -> (32)>
-///
-/// 4. Create an AffineApplyOp to apply each new map. The outputs of the
-///    AffineApplyOps are used in the dynamicSizes of the new AllocOp.
-///      %0 = dim %arg0, %c1 : memref<4x?xf32>
-///      %c4 = arith.constant 4 : index
-///      %1 = affine.apply #map1(%c4, %0)
-///      %2 = affine.apply #map2(%c4, %0)
-static void createNewDynamicSizes(MemRefType oldMemRefType,
-                                  MemRefType newMemRefType, AffineMap map,
-                                  memref::AllocOp *allocOp, OpBuilder b,
-                                  SmallVectorImpl<Value> &newDynamicSizes) {
-  // Create new inputs for the AffineApplyOps.
-  SmallVector<Value, 4> inAffineApply;
-  ArrayRef<int64_t> oldMemRefShape = oldMemRefType.getShape();
-  unsigned dynIdx = 0;
-  for (unsigned d = 0; d < oldMemRefType.getRank(); ++d) {
-    if (oldMemRefShape[d] < 0) {
-      // Use the dynamicSizes of allocOp for the dynamic dimension.
-      inAffineApply.emplace_back(allocOp->dynamicSizes()[dynIdx]);
-      dynIdx++;
-    } else {
-      // Create a ConstantOp for the static dimension.
-      Attribute constantAttr =
-          b.getIntegerAttr(b.getIndexType(), oldMemRefShape[d]);
-      inAffineApply.emplace_back(
-          b.create<arith::ConstantOp>(allocOp->getLoc(), constantAttr));
-    }
-  }
-
-  // Create a new map to calculate each dimension size of the new memref for
-  // each original map output, but only for the dynamic dimensions of
-  // `newMemRefType`.
-  unsigned newDimIdx = 0;
-  ArrayRef<int64_t> newMemRefShape = newMemRefType.getShape();
-  SmallVector<std::tuple<AffineExpr, unsigned, unsigned>> tileSizePos;
-  (void)getTileSizePos(map, tileSizePos);
-  for (AffineExpr expr : map.getResults()) {
-    if (newMemRefShape[newDimIdx] < 0) {
-      // Create new maps to calculate each dimension size of the new memref.
-      enum TileExprPattern pat = TileExprPattern::TileNone;
-      for (auto pos : tileSizePos) {
-        if (newDimIdx == std::get<1>(pos))
-          pat = TileExprPattern::TileFloorDiv;
-        else if (newDimIdx == std::get<2>(pos))
-          pat = TileExprPattern::TileMod;
-      }
-      AffineExpr newMapOutput = createDimSizeExprForTiledLayout(expr, pat);
-      AffineMap newMap =
-          AffineMap::get(map.getNumInputs(), map.getNumSymbols(), newMapOutput);
-      Value affineApp =
-          b.create<AffineApplyOp>(allocOp->getLoc(), newMap, inAffineApply);
-      newDynamicSizes.emplace_back(affineApp);
-    }
-    newDimIdx++;
-  }
-}
-
-// TODO: Currently works for static memrefs with a single layout map.
-LogicalResult mlir::normalizeMemRef(memref::AllocOp *allocOp) {
-  MemRefType memrefType = allocOp->getType();
-  OpBuilder b(*allocOp);
-
-  // Fetch a new memref type after normalizing the old memref to have an
-  // identity map layout.
-  MemRefType newMemRefType =
-      normalizeMemRefType(memrefType, b, allocOp->symbolOperands().size());
-  if (newMemRefType == memrefType)
-    // Either memrefType already had an identity map or the map couldn't be
-    // transformed to an identity map.
-    return failure();
-
-  Value oldMemRef = allocOp->getResult();
-
-  SmallVector<Value, 4> symbolOperands(allocOp->symbolOperands());
-  AffineMap layoutMap = memrefType.getLayout().getAffineMap();
-  memref::AllocOp newAlloc;
-  // Check if `layoutMap` is a tiled layout. Only a single layout map is
-  // supported for normalizing dynamic memrefs.
-  SmallVector<std::tuple<AffineExpr, unsigned, unsigned>> tileSizePos;
-  (void)getTileSizePos(layoutMap, tileSizePos);
-  if (newMemRefType.getNumDynamicDims() > 0 && !tileSizePos.empty()) {
-    MemRefType oldMemRefType = oldMemRef.getType().cast<MemRefType>();
-    SmallVector<Value, 4> newDynamicSizes;
-    createNewDynamicSizes(oldMemRefType, newMemRefType, layoutMap, allocOp, b,
-                          newDynamicSizes);
-    // Add the new dynamic sizes in the new AllocOp.
-    newAlloc =
-        b.create<memref::AllocOp>(allocOp->getLoc(), newMemRefType,
-                                  newDynamicSizes, allocOp->alignmentAttr());
-  } else {
-    newAlloc = b.create<memref::AllocOp>(allocOp->getLoc(), newMemRefType,
-                                         allocOp->alignmentAttr());
-  }
-  // Replace all uses of the old memref.
-  if (failed(replaceAllMemRefUsesWith(oldMemRef, /*newMemRef=*/newAlloc,
-                                      /*extraIndices=*/{},
-                                      /*indexRemap=*/layoutMap,
-                                      /*extraOperands=*/{},
-                                      /*symbolOperands=*/symbolOperands,
-                                      /*domOpFilter=*/nullptr,
-                                      /*postDomOpFilter=*/nullptr,
-                                      /*allowNonDereferencingOps=*/true))) {
-    // If it failed (due to escapes for example), bail out.
-    newAlloc.erase();
-    return failure();
-  }
-  // Replace any uses of the original alloc op and erase it. All remaining uses
-  // have to be deallocs; replaceAllMemRefUsesWith above would've failed
-  // otherwise.
-  assert(llvm::all_of(oldMemRef.getUsers(), [](Operation *op) {
-    return isa<memref::DeallocOp>(op);
-  }));
-  oldMemRef.replaceAllUsesWith(newAlloc);
-  allocOp->erase();
-  return success();
-}
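[Reviewer note, not part of the patch] The end-to-end effect of `normalizeMemRef` on the running example, sketched as IR; it mirrors the steps documented at `createNewDynamicSizes`, and `alloc` is an assumed handle to the alloc op:

  memref::AllocOp alloc = /* an alloc with the #map0 layout, assumed */;
  if (succeeded(normalizeMemRef(&alloc))) {
    // Before: %1 = memref.alloc(%0) : memref<4x?xf32, #map0>
    // After:  %s1 = affine.apply affine_map<(d0, d1) -> (d1 ceildiv 32)>(%c4, %0)
    //         %s2 = affine.apply affine_map<(d0, d1) -> (32)>(%c4, %0)
    //         %1 = memref.alloc(%s1, %s2) : memref<4x?x?xf32>
  }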
-
-MemRefType mlir::normalizeMemRefType(MemRefType memrefType, OpBuilder b,
-                                     unsigned numSymbolicOperands) {
-  unsigned rank = memrefType.getRank();
-  if (rank == 0)
-    return memrefType;
-
-  if (memrefType.getLayout().isIdentity()) {
-    // Either no map is associated with this memref or this memref has a
-    // trivial (identity) map.
-    return memrefType;
-  }
-  AffineMap layoutMap = memrefType.getLayout().getAffineMap();
-
-  // We don't do any checks for one-to-one-ness; we assume that the map is
-  // one-to-one.
-
-  // Normalize only static memrefs and dynamic memrefs with a tiled-layout map
-  // for now.
-  // TODO: Normalize the other types of dynamic memrefs.
-  SmallVector<std::tuple<AffineExpr, unsigned, unsigned>> tileSizePos;
-  (void)getTileSizePos(layoutMap, tileSizePos);
-  if (memrefType.getNumDynamicDims() > 0 && tileSizePos.empty())
-    return memrefType;
-
-  // We have a single map that is not an identity map. Create a new memref
-  // with the right shape and an identity layout map.
-  ArrayRef<int64_t> shape = memrefType.getShape();
-  // FlatAffineConstraints may later on use symbolicOperands.
-  FlatAffineConstraints fac(rank, numSymbolicOperands);
-  SmallVector<unsigned, 4> memrefTypeDynDims;
-  for (unsigned d = 0; d < rank; ++d) {
-    // Use the constraint system only for static dimensions.
-    if (shape[d] > 0) {
-      fac.addBound(FlatAffineConstraints::LB, d, 0);
-      fac.addBound(FlatAffineConstraints::UB, d, shape[d] - 1);
-    } else {
-      memrefTypeDynDims.emplace_back(d);
-    }
-  }
-  // We compose this map with the original index (logical) space to derive
-  // the upper bounds for the new index space.
-  unsigned newRank = layoutMap.getNumResults();
-  if (failed(fac.composeMatchingMap(layoutMap)))
-    return memrefType;
-  // TODO: Handle semi-affine maps.
-  // Project out the old data dimensions.
-  fac.projectOut(newRank, fac.getNumIds() - newRank - fac.getNumLocalIds());
-  SmallVector<int64_t, 4> newShape(newRank);
-  for (unsigned d = 0; d < newRank; ++d) {
-    // Check if each dimension of the normalized memrefType is dynamic.
-    bool isDynDim = isNormalizedMemRefDynamicDim(
-        d, layoutMap, memrefTypeDynDims, b.getContext());
-    if (isDynDim) {
-      // If a dimension of the new memrefType is dynamic, its size is -1.
-      newShape[d] = -1;
-    } else {
-      // The lower bound for the shape is always zero.
-      auto ubConst = fac.getConstantBound(FlatAffineConstraints::UB, d);
-      // For a static memref and an affine map with no symbols, this is
-      // always bounded.
-      assert(ubConst.hasValue() && "should always have an upper bound");
-      if (ubConst.getValue() < 0)
-        // This is due to an invalid map that maps to a negative space.
-        return memrefType;
-      newShape[d] = ubConst.getValue() + 1;
-    }
-  }
-
-  // Create the new memref type after trivializing the old layout map.
-  MemRefType newMemRefType =
-      MemRefType::Builder(memrefType)
-          .setShape(newShape)
-          .setLayout(AffineMapAttr::get(b.getMultiDimIdentityMap(newRank)));
-
-  return newMemRefType;
-}
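[Reviewer note, not part of the patch] For fully static shapes the bounds logic above reduces to simple interval arithmetic; an illustrative check, where `in` and `b` are assumed values:

  // #tiled = affine_map<(d0, d1) -> (d0, d1 floordiv 32, d1 mod 32)>
  // d1 in [0, 127] => d1 floordiv 32 in [0, 3] and d1 mod 32 in [0, 31].
  MemRefType in = /* memref<64x128xf32, #tiled>, assumed */;
  MemRefType out = normalizeMemRefType(in, b, /*numSymbolicOperands=*/0);
  // out == memref<64x4x32xf32> with the identity layout.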
diff --git a/mlir/test/lib/Dialect/Affine/CMakeLists.txt b/mlir/test/lib/Dialect/Affine/CMakeLists.txt
--- a/mlir/test/lib/Dialect/Affine/CMakeLists.txt
+++ b/mlir/test/lib/Dialect/Affine/CMakeLists.txt
@@ -3,6 +3,8 @@
   TestAffineDataCopy.cpp
   TestAffineLoopUnswitching.cpp
   TestAffineLoopParametricTiling.cpp
+  TestLoopFusion.cpp
+  TestLoopMapping.cpp
   TestLoopPermutation.cpp
   TestVectorizationUtils.cpp
diff --git a/mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp b/mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp
--- a/mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp
+++ b/mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp
@@ -13,10 +13,10 @@
 #include "mlir/Dialect/Affine/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-#include "mlir/Transforms/LoopUtils.h"
 #include "mlir/Transforms/Passes.h"

 #define PASS_NAME "test-affine-data-copy"
diff --git a/mlir/test/lib/Dialect/Affine/TestAffineLoopParametricTiling.cpp b/mlir/test/lib/Dialect/Affine/TestAffineLoopParametricTiling.cpp
--- a/mlir/test/lib/Dialect/Affine/TestAffineLoopParametricTiling.cpp
+++ b/mlir/test/lib/Dialect/Affine/TestAffineLoopParametricTiling.cpp
@@ -12,8 +12,8 @@
 //===----------------------------------------------------------------------===//

 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/Affine/Passes.h"
-#include "mlir/Transforms/LoopUtils.h"

 using namespace mlir;
diff --git a/mlir/test/lib/Transforms/TestLoopFusion.cpp b/mlir/test/lib/Dialect/Affine/TestLoopFusion.cpp
rename from mlir/test/lib/Transforms/TestLoopFusion.cpp
rename to mlir/test/lib/Dialect/Affine/TestLoopFusion.cpp
--- a/mlir/test/lib/Transforms/TestLoopFusion.cpp
+++ b/mlir/test/lib/Dialect/Affine/TestLoopFusion.cpp
@@ -12,11 +12,10 @@
 #include "mlir/Dialect/Affine/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/LoopFusionUtils.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/LoopFusionUtils.h"
-#include "mlir/Transforms/LoopUtils.h"
-#include "mlir/Transforms/Passes.h"

 #define DEBUG_TYPE "test-loop-fusion"
diff --git a/mlir/test/lib/Transforms/TestLoopMapping.cpp b/mlir/test/lib/Dialect/Affine/TestLoopMapping.cpp
rename from mlir/test/lib/Transforms/TestLoopMapping.cpp
rename to mlir/test/lib/Dialect/Affine/TestLoopMapping.cpp
--- a/mlir/test/lib/Transforms/TestLoopMapping.cpp
+++ b/mlir/test/lib/Dialect/Affine/TestLoopMapping.cpp
@@ -12,11 +12,10 @@
 //===----------------------------------------------------------------------===//

 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/SCF/SCF.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/LoopUtils.h"
-#include "mlir/Transforms/Passes.h"

 #include "llvm/ADT/SetVector.h"
diff --git a/mlir/test/lib/Dialect/Affine/TestLoopPermutation.cpp b/mlir/test/lib/Dialect/Affine/TestLoopPermutation.cpp
--- a/mlir/test/lib/Dialect/Affine/TestLoopPermutation.cpp
+++ b/mlir/test/lib/Dialect/Affine/TestLoopPermutation.cpp
@@ -12,9 +12,8 @@
 #include "mlir/Dialect/Affine/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/LoopUtils.h"
-#include "mlir/Transforms/Passes.h"

 #define PASS_NAME "test-loop-permutation"
diff --git a/mlir/test/lib/Dialect/Affine/TestVectorizationUtils.cpp b/mlir/test/lib/Dialect/Affine/TestVectorizationUtils.cpp
--- a/mlir/test/lib/Dialect/Affine/TestVectorizationUtils.cpp
+++ b/mlir/test/lib/Dialect/Affine/TestVectorizationUtils.cpp
@@ -14,6 +14,7 @@
 #include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
 #include "mlir/Dialect/Affine/Analysis/NestedMatcher.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Vector/VectorOps.h"
 #include "mlir/Dialect/Vector/VectorUtils.h"
@@ -21,7 +22,6 @@
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Diagnostics.h"
 #include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/LoopUtils.h"
 #include "mlir/Transforms/Passes.h"
 #include "llvm/ADT/STLExtras.h"
diff --git a/mlir/test/lib/Dialect/SCF/CMakeLists.txt b/mlir/test/lib/Dialect/SCF/CMakeLists.txt
--- a/mlir/test/lib/Dialect/SCF/CMakeLists.txt
+++ b/mlir/test/lib/Dialect/SCF/CMakeLists.txt
@@ -1,5 +1,7 @@
 # Exclude tests from libMLIR.so
 add_mlir_library(MLIRSCFTestPasses
+  TestLoopParametricTiling.cpp
+  TestLoopUnrolling.cpp
   TestSCFUtils.cpp

   EXCLUDE_FROM_LIBMLIR
diff --git a/mlir/test/lib/Transforms/TestLoopParametricTiling.cpp b/mlir/test/lib/Dialect/SCF/TestLoopParametricTiling.cpp
rename from mlir/test/lib/Transforms/TestLoopParametricTiling.cpp
rename to mlir/test/lib/Dialect/SCF/TestLoopParametricTiling.cpp
--- a/mlir/test/lib/Transforms/TestLoopParametricTiling.cpp
+++ b/mlir/test/lib/Dialect/SCF/TestLoopParametricTiling.cpp
@@ -11,10 +11,9 @@
 //===----------------------------------------------------------------------===//

 #include "mlir/Dialect/SCF/SCF.h"
+#include "mlir/Dialect/SCF/Utils.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/LoopUtils.h"
-#include "mlir/Transforms/Passes.h"

 using namespace mlir;

@@ -31,8 +30,7 @@
   }
   StringRef getDescription() const final {
     return "test application of parametric tiling to the outer loops so that "
-           "the "
-           "ranges of outer loops become static";
+           "the ranges of outer loops become static";
   }
   SimpleParametricLoopTilingPass() = default;
   SimpleParametricLoopTilingPass(const SimpleParametricLoopTilingPass &) {}
diff --git a/mlir/test/lib/Transforms/TestLoopUnrolling.cpp b/mlir/test/lib/Dialect/SCF/TestLoopUnrolling.cpp
rename from mlir/test/lib/Transforms/TestLoopUnrolling.cpp
rename to mlir/test/lib/Dialect/SCF/TestLoopUnrolling.cpp
--- a/mlir/test/lib/Transforms/TestLoopUnrolling.cpp
+++ b/mlir/test/lib/Dialect/SCF/TestLoopUnrolling.cpp
@@ -12,11 +12,10 @@
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/Dialect/SCF/SCF.h"
+#include "mlir/Dialect/SCF/Utils.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/LoopUtils.h"
-#include "mlir/Transforms/Passes.h"

 using namespace mlir;
diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt
--- a/mlir/test/lib/Transforms/CMakeLists.txt
+++ b/mlir/test/lib/Transforms/CMakeLists.txt
@@ -2,10 +2,6 @@
 add_mlir_library(MLIRTestTransforms
   TestConstantFold.cpp
   TestInlining.cpp
-  TestLoopFusion.cpp
-  TestLoopMapping.cpp
-  TestLoopParametricTiling.cpp
-  TestLoopUnrolling.cpp

   EXCLUDE_FROM_LIBMLIR
diff --git a/mlir/test/lib/Transforms/TestConstantFold.cpp b/mlir/test/lib/Transforms/TestConstantFold.cpp
--- a/mlir/test/lib/Transforms/TestConstantFold.cpp
+++ b/mlir/test/lib/Transforms/TestConstantFold.cpp
@@ -9,7 +9,6 @@
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/FoldUtils.h"
 #include "mlir/Transforms/Passes.h"
-#include "mlir/Transforms/Utils.h"

 using namespace mlir;
diff --git a/mlir/unittests/Transforms/CMakeLists.txt b/mlir/unittests/Transforms/CMakeLists.txt
--- a/mlir/unittests/Transforms/CMakeLists.txt
+++ b/mlir/unittests/Transforms/CMakeLists.txt
@@ -4,4 +4,5 @@
 )
 target_link_libraries(MLIRTransformsTests
   PRIVATE
+  MLIRParser
   MLIRTransforms)