diff --git a/mlir/include/mlir/Transforms/LoopFusionUtils.h b/mlir/include/mlir/Dialect/Affine/LoopFusionUtils.h
rename from mlir/include/mlir/Transforms/LoopFusionUtils.h
rename to mlir/include/mlir/Dialect/Affine/LoopFusionUtils.h
--- a/mlir/include/mlir/Transforms/LoopFusionUtils.h
+++ b/mlir/include/mlir/Dialect/Affine/LoopFusionUtils.h
@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
-#ifndef MLIR_TRANSFORMS_LOOPFUSIONUTILS_H
-#define MLIR_TRANSFORMS_LOOPFUSIONUTILS_H
+#ifndef MLIR_DIALECT_AFFINE_LOOPFUSIONUTILS_H
+#define MLIR_DIALECT_AFFINE_LOOPFUSIONUTILS_H
 #include "mlir/IR/Value.h"
 #include "mlir/Support/LLVM.h"
@@ -167,4 +167,4 @@
                                  DenseSet<Value> &producerConsumerMemrefs);
 } // namespace mlir
-#endif // MLIR_TRANSFORMS_LOOPFUSIONUTILS_H
+#endif // MLIR_DIALECT_AFFINE_LOOPFUSIONUTILS_H
diff --git a/mlir/include/mlir/Transforms/LoopUtils.h b/mlir/include/mlir/Dialect/Affine/LoopUtils.h
rename from mlir/include/mlir/Transforms/LoopUtils.h
rename to mlir/include/mlir/Dialect/Affine/LoopUtils.h
--- a/mlir/include/mlir/Transforms/LoopUtils.h
+++ b/mlir/include/mlir/Dialect/Affine/LoopUtils.h
@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
-#ifndef MLIR_TRANSFORMS_LOOPUTILS_H
-#define MLIR_TRANSFORMS_LOOPUTILS_H
+#ifndef MLIR_DIALECT_AFFINE_LOOPUTILS_H
+#define MLIR_DIALECT_AFFINE_LOOPUTILS_H
 #include "mlir/IR/Block.h"
 #include "mlir/Support/LLVM.h"
@@ -321,9 +321,6 @@
 separateFullTiles(MutableArrayRef<AffineForOp> nest,
                   SmallVectorImpl<AffineForOp> *fullTileNest = nullptr);
-/// Move loop invariant code out of `looplike`.
-LogicalResult moveLoopInvariantCode(LoopLikeOpInterface looplike);
-
 } // namespace mlir
-#endif // MLIR_TRANSFORMS_LOOPUTILS_H
+#endif // MLIR_DIALECT_AFFINE_LOOPUTILS_H
diff --git a/mlir/include/mlir/Dialect/Affine/Passes.h b/mlir/include/mlir/Dialect/Affine/Passes.h
--- a/mlir/include/mlir/Dialect/Affine/Passes.h
+++ b/mlir/include/mlir/Dialect/Affine/Passes.h
@@ -21,6 +21,10 @@
 class AffineForOp;
+/// Fusion mode to attempt. The default mode `Greedy` does both
+/// producer-consumer and sibling fusion.
+enum FusionMode { Greedy, ProducerConsumer, Sibling };
+
 /// Creates a simplification pass for affine structures (maps and sets). In
 /// addition, this pass also normalizes memrefs to have the trivial (identity)
 /// layout map.
@@ -53,6 +57,19 @@
 /// dead allocs.
 std::unique_ptr<OperationPass<FuncOp>> createAffineScalarReplacementPass();
+/// Creates a pass that transforms perfectly nested loops with independent
+/// bounds into a single loop.
+std::unique_ptr<OperationPass<FuncOp>> createLoopCoalescingPass();
+
+/// Creates a loop fusion pass which fuses loops according to the type of
+/// fusion specified in `fusionMode`. Buffers of size less than or equal to
+/// `localBufSizeThreshold` are promoted to memory space `fastMemorySpace`.
+std::unique_ptr<OperationPass<FuncOp>>
+createLoopFusionPass(unsigned fastMemorySpace = 0,
+                     uint64_t localBufSizeThreshold = 0,
+                     bool maximalFusion = false,
+                     enum FusionMode fusionMode = FusionMode::Greedy);
+
 /// Creates a pass to perform tiling on loop nests.
 std::unique_ptr<OperationPass<FuncOp>>
 createLoopTilingPass(uint64_t cacheSizeBytes);
@@ -76,6 +93,10 @@
 std::unique_ptr<OperationPass<FuncOp>>
 createLoopUnrollAndJamPass(int unrollJamFactor = -1);
+/// Creates a pass to pipeline explicit movement of data across levels of the
+/// memory hierarchy.
+std::unique_ptr<OperationPass<FuncOp>> createPipelineDataTransferPass();
+
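With the declarations relocated, client code now constructs these passes from the Affine-dialect header. A minimal sketch of wiring them into a pass manager (the pipeline composition and the `mlir::FuncOp` nesting are illustrative, not part of this patch):

```c++
// Illustrative only: nest the relocated FuncOp passes under a module-level
// pass manager, using the new header location introduced by this patch.
#include "mlir/Dialect/Affine/Passes.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/Pass/PassManager.h"

void buildAffineLoopPipeline(mlir::PassManager &pm) {
  // Greedy fusion: both producer-consumer and sibling fusion.
  pm.addNestedPass<mlir::FuncOp>(mlir::createLoopFusionPass(
      /*fastMemorySpace=*/0, /*localBufSizeThreshold=*/0,
      /*maximalFusion=*/false, mlir::FusionMode::Greedy));
  // Overlap non-blocking DMA transfers with compute via double buffering.
  pm.addNestedPass<mlir::FuncOp>(mlir::createPipelineDataTransferPass());
  // Collapse perfect nests with independent bounds into a single loop.
  pm.addNestedPass<mlir::FuncOp>(mlir::createLoopCoalescingPass());
}
```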
 /// Creates a pass to vectorize loops, operations and data types using a
 /// target-independent, n-D super-vector abstraction.
 std::unique_ptr<OperationPass<FuncOp>>
diff --git a/mlir/include/mlir/Dialect/Affine/Passes.td b/mlir/include/mlir/Dialect/Affine/Passes.td
--- a/mlir/include/mlir/Dialect/Affine/Passes.td
+++ b/mlir/include/mlir/Dialect/Affine/Passes.td
@@ -43,6 +43,138 @@
   ];
 }
+def AffineLoopFusion : Pass<"affine-loop-fusion", "FuncOp"> {
+  let summary = "Fuse affine loop nests";
+  let description = [{
+    This pass performs fusion of loop nests using a slicing-based approach. It
+    combines two fusion strategies: producer-consumer fusion and sibling
+    fusion. Producer-consumer fusion is aimed at fusing pairs of loops where
+    the first one writes to a memref that the second reads. Sibling fusion
+    targets pairs of loops that share no dependences between them but that
+    load from the same memref. The fused loop nests, when possible, are
+    rewritten to access significantly smaller local buffers instead of the
+    original memrefs, and the latter are often either completely optimized
+    away or contracted. This transformation leads to enhanced locality and
+    lower memory footprint through the elimination or contraction of
+    temporaries/intermediate memrefs. These benefits are sometimes achieved at
+    the expense of redundant computation, with a cost model evaluating the
+    available choices, such as the depth at which a source slice should be
+    materialized in the destination slice.
+
+    Example 1: Producer-consumer fusion.
+    Input:
+    ```mlir
+    func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
+      %0 = memref.alloc() : memref<10xf32>
+      %1 = memref.alloc() : memref<10xf32>
+      %cst = arith.constant 0.000000e+00 : f32
+      affine.for %arg2 = 0 to 10 {
+        affine.store %cst, %0[%arg2] : memref<10xf32>
+        affine.store %cst, %1[%arg2] : memref<10xf32>
+      }
+      affine.for %arg2 = 0 to 10 {
+        %2 = affine.load %0[%arg2] : memref<10xf32>
+        %3 = arith.addf %2, %2 : f32
+        affine.store %3, %arg0[%arg2] : memref<10xf32>
+      }
+      affine.for %arg2 = 0 to 10 {
+        %2 = affine.load %1[%arg2] : memref<10xf32>
+        %3 = arith.mulf %2, %2 : f32
+        affine.store %3, %arg1[%arg2] : memref<10xf32>
+      }
+      return
+    }
+    ```
+    Output:
+    ```mlir
+    func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
+      %0 = memref.alloc() : memref<1xf32>
+      %1 = memref.alloc() : memref<1xf32>
+      %cst = arith.constant 0.000000e+00 : f32
+      affine.for %arg2 = 0 to 10 {
+        affine.store %cst, %0[0] : memref<1xf32>
+        affine.store %cst, %1[0] : memref<1xf32>
+        %2 = affine.load %1[0] : memref<1xf32>
+        %3 = arith.mulf %2, %2 : f32
+        affine.store %3, %arg1[%arg2] : memref<10xf32>
+        %4 = affine.load %0[0] : memref<1xf32>
+        %5 = arith.addf %4, %4 : f32
+        affine.store %5, %arg0[%arg2] : memref<10xf32>
+      }
+      return
+    }
+    ```
+
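An aside on selecting a strategy: besides the greedy default, the constructor above (and the `mode` pass option declared further below) can restrict fusion to one strategy. A hedged sketch, reusing the pass manager from the earlier snippet:

```c++
// Illustrative only: producer-consumer fusion alone. This is roughly what
// the `mode=producer` pass option selects from the command line.
pm.addNestedPass<mlir::FuncOp>(mlir::createLoopFusionPass(
    /*fastMemorySpace=*/0, /*localBufSizeThreshold=*/0,
    /*maximalFusion=*/false, mlir::FusionMode::ProducerConsumer));
```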
+    Example 2: Sibling fusion.
+    Input:
+    ```mlir
+    func @sibling_fusion(%arg0: memref<10x10xf32>, %arg1: memref<10x10xf32>,
+                         %arg2: memref<10x10xf32>, %arg3: memref<10x10xf32>,
+                         %arg4: memref<10x10xf32>) {
+      affine.for %arg5 = 0 to 3 {
+        affine.for %arg6 = 0 to 3 {
+          %0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
+          %1 = affine.load %arg1[%arg5, %arg6] : memref<10x10xf32>
+          %2 = arith.mulf %0, %1 : f32
+          affine.store %2, %arg3[%arg5, %arg6] : memref<10x10xf32>
+        }
+      }
+      affine.for %arg5 = 0 to 3 {
+        affine.for %arg6 = 0 to 3 {
+          %0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
+          %1 = affine.load %arg2[%arg5, %arg6] : memref<10x10xf32>
+          %2 = arith.addf %0, %1 : f32
+          affine.store %2, %arg4[%arg5, %arg6] : memref<10x10xf32>
+        }
+      }
+      return
+    }
+    ```
+    Output:
+    ```mlir
+    func @sibling_fusion(%arg0: memref<10x10xf32>, %arg1: memref<10x10xf32>,
+                         %arg2: memref<10x10xf32>, %arg3: memref<10x10xf32>,
+                         %arg4: memref<10x10xf32>) {
+      affine.for %arg5 = 0 to 3 {
+        affine.for %arg6 = 0 to 3 {
+          %0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
+          %1 = affine.load %arg1[%arg5, %arg6] : memref<10x10xf32>
+          %2 = arith.mulf %0, %1 : f32
+          affine.store %2, %arg3[%arg5, %arg6] : memref<10x10xf32>
+          %3 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
+          %4 = affine.load %arg2[%arg5, %arg6] : memref<10x10xf32>
+          %5 = arith.addf %3, %4 : f32
+          affine.store %5, %arg4[%arg5, %arg6] : memref<10x10xf32>
+        }
+      }
+      return
+    }
+    ```
+  }];
+  let constructor = "mlir::createLoopFusionPass()";
+  let options = [
+    Option<"computeToleranceThreshold", "fusion-compute-tolerance", "double",
+           /*default=*/"0.30f", "Fractional increase in additional computation "
+           "tolerated while fusing">,
+    Option<"fastMemorySpace", "fusion-fast-mem-space", "unsigned",
+           /*default=*/"0",
+           "Faster memory space number to promote fusion buffers to">,
+    Option<"localBufSizeThreshold", "fusion-local-buf-threshold", "uint64_t",
+           /*default=*/"0", "Threshold size (KiB) for promoting local buffers "
+           "to fast memory space">,
+    Option<"maximalFusion", "fusion-maximal", "bool", /*default=*/"false",
+           "Enables maximal loop fusion">,
+    Option<"affineFusionMode", "mode", "enum FusionMode",
+           "mlir::FusionMode::Greedy", "fusion mode to attempt",
+           "llvm::cl::values(clEnumValN(mlir::FusionMode::Greedy,"
+           " \"greedy\", \"Perform greedy (both producer-consumer and sibling) fusion\"), "
+           "clEnumValN(mlir::FusionMode::ProducerConsumer, "
+           "\"producer\", \"Perform only producer-consumer fusion\"), "
+           "clEnumValN(mlir::FusionMode::Sibling, "
+           "\"sibling\", \"Perform only sibling fusion\"))">,
+  ];
+  let dependentDialects = ["memref::MemRefDialect"];
+}
+
 def AffineLoopInvariantCodeMotion
     : Pass<"affine-loop-invariant-code-motion", "FuncOp"> {
   let summary = "Hoist loop invariant instructions outside of affine loops";
@@ -94,6 +226,75 @@
   ];
 }
+def AffinePipelineDataTransfer
+    : Pass<"affine-pipeline-data-transfer", "FuncOp"> {
+  let summary = "Pipeline non-blocking data transfers between explicitly "
+                "managed levels of the memory hierarchy";
+  let description = [{
+    This pass performs a transformation to overlap non-blocking DMA operations
+    in a loop with computations through double buffering. This is achieved by
+    advancing dma_start operations with respect to other operations.
+
+    Input
+
+    ```mlir
+    func @pipelinedatatransfer() {
+      %0 = memref.alloc() : memref<256xf32>
+      %1 = memref.alloc() : memref<32xf32, 1>
+      %2 = memref.alloc() : memref<1xf32>
+      %c0 = arith.constant 0 : index
+      %c128 = arith.constant 128 : index
+      affine.for %i0 = 0 to 8 {
+        affine.dma_start %0[%i0], %1[%i0], %2[%c0], %c128 : memref<256xf32>, memref<32xf32, 1>, memref<1xf32>
+        affine.dma_wait %2[%c0], %c128 : memref<1xf32>
+        %3 = affine.load %1[%i0] : memref<32xf32, 1>
+        %4 = "compute"(%3) : (f32) -> f32
+        affine.store %4, %1[%i0] : memref<32xf32, 1>
+      }
+      return
+    }
+    ```
+
+    Output
+
+    ```mlir
+    module {
+      func @pipelinedatatransfer() {
+        %c8 = arith.constant 8 : index
+        %c0 = arith.constant 0 : index
+        %0 = memref.alloc() : memref<256xf32>
+        %c0_0 = arith.constant 0 : index
+        %c128 = arith.constant 128 : index
+        %1 = memref.alloc() : memref<2x32xf32, 1>
+        %2 = memref.alloc() : memref<2x1xf32>
+        affine.dma_start %0[%c0], %1[%c0 mod 2, %c0], %2[%c0 mod 2, symbol(%c0_0)], %c128 : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
+        affine.for %arg0 = 1 to 8 {
+          affine.dma_start %0[%arg0], %1[%arg0 mod 2, %arg0], %2[%arg0 mod 2, symbol(%c0_0)], %c128 : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
+          %8 = affine.apply #map3(%arg0)
+          %9 = affine.apply #map4(%8)
+          %10 = affine.apply #map4(%8)
+          affine.dma_wait %2[%8 mod 2, symbol(%c0_0)], %c128 : memref<2x1xf32>
+          %11 = affine.load %1[%8 mod 2, %8] : memref<2x32xf32, 1>
+          %12 = "compute"(%11) : (f32) -> f32
+          affine.store %12, %1[%8 mod 2, %8] : memref<2x32xf32, 1>
+        }
+        %3 = affine.apply #map3(%c8)
+        %4 = affine.apply #map4(%3)
+        %5 = affine.apply #map4(%3)
+        affine.dma_wait %2[%3 mod 2, symbol(%c0_0)], %c128 : memref<2x1xf32>
+        %6 = affine.load %1[%3 mod 2, %3] : memref<2x32xf32, 1>
+        %7 = "compute"(%6) : (f32) -> f32
+        affine.store %7, %1[%3 mod 2, %3] : memref<2x32xf32, 1>
+        memref.dealloc %2 : memref<2x1xf32>
+        memref.dealloc %1 : memref<2x32xf32, 1>
+        return
+      }
+    }
+    ```
+  }];
+  let constructor = "mlir::createPipelineDataTransferPass()";
+}
+
 def AffineScalarReplacement : Pass<"affine-scalrep", "FuncOp"> {
   let summary = "Replace affine memref accesses by scalars by forwarding stores "
                 "to loads and eliminating redundant loads";
@@ -184,6 +385,13 @@
   let constructor = "mlir::createAffineLoopNormalizePass()";
 }
+def LoopCoalescing : Pass<"loop-coalescing", "FuncOp"> {
+  let summary = "Coalesce nested loops with independent bounds into a single "
+                "loop";
+  let constructor = "mlir::createLoopCoalescingPass()";
+  let dependentDialects = ["arith::ArithmeticDialect"];
+}
+
 def SimplifyAffineStructures : Pass<"simplify-affine-structures", "FuncOp"> {
   let summary = "Simplify affine expressions in maps/sets and normalize "
                 "memrefs";
diff --git a/mlir/include/mlir/Dialect/Affine/Utils.h b/mlir/include/mlir/Dialect/Affine/Utils.h
--- a/mlir/include/mlir/Dialect/Affine/Utils.h
+++ b/mlir/include/mlir/Dialect/Affine/Utils.h
@@ -24,6 +24,10 @@
 class Operation;
 class PostDominanceInfo;
+namespace memref {
+class AllocOp;
+} // namespace memref
+
 struct LogicalResult;
 using ReductionLoopMap = DenseMap<Value, SmallVector<AffineForOp, 2>>;
@@ -168,6 +172,121 @@
 AffineExpr substWithMin(AffineExpr e, AffineExpr dim, AffineExpr min,
                         AffineExpr max, bool positivePath = true);
+/// Replaces all "dereferencing" uses of `oldMemRef` with `newMemRef` while
+/// optionally remapping the old memref's indices using the supplied affine
+/// map, `indexRemap`. The new memref could be of a different shape or rank.
+/// `extraIndices` provides any additional access indices to be added to the
+/// start.
+///
+/// `indexRemap` remaps indices of the old memref access to a new set of
+/// indices that are used to index the memref. Additional input operands to
+/// indexRemap can be optionally provided in `extraOperands`, and they occupy
+/// the start of its input list. `indexRemap`'s dimensional inputs are
+/// expected to correspond to the memref's indices, and its symbolic inputs,
+/// if any, should be provided in `symbolOperands`.
+///
+/// `domOpFilter`, if non-null, restricts the replacement to only those
+/// operations that are dominated by the former; similarly, `postDomOpFilter`
+/// restricts replacement to only those operations that are postdominated by
+/// it.
+///
+/// 'allowNonDereferencingOps', if set, allows replacement of
+/// non-dereferencing uses of a memref without any requirement for access
+/// index rewrites as long as the user operation has the MemRefsNormalizable
+/// trait. The default value of this flag is false.
+///
+/// 'replaceInDeallocOp', if set, allows DeallocOp, a non-dereferencing user,
+/// to also be a candidate for replacement. The default value of this flag is
+/// false.
+///
+/// Returns failure if the replacement is not possible, i.e., whenever a
+/// memref is used as an operand in a non-dereferencing context and
+/// 'allowNonDereferencingOps' is false; dealloc's on the memref are left
+/// untouched. See comments at function definition for an example.
+//
+// Ex: to replace load %A[%i, %j] with load %Abuf[%t mod 2, %ii - %i, %j]:
+// The SSA value corresponding to '%t mod 2' should be in 'extraIndices', and
+// the index remap will perform (%i, %j) -> (%ii - %i, %j), i.e., indexRemap =
+// (d0, d1, d2) -> (d0 - d1, d2), and %ii will be the extra operand. Without
+// any extra operands, note that 'indexRemap' would just be applied to the
+// existing indices (%i, %j).
+// TODO: allow extraIndices to be added at any position.
+LogicalResult replaceAllMemRefUsesWith(
+    Value oldMemRef, Value newMemRef, ArrayRef<Value> extraIndices = {},
+    AffineMap indexRemap = AffineMap(), ArrayRef<Value> extraOperands = {},
+    ArrayRef<Value> symbolOperands = {}, Operation *domOpFilter = nullptr,
+    Operation *postDomOpFilter = nullptr,
+    bool allowNonDereferencingOps = false, bool replaceInDeallocOp = false);
+
+/// Performs the same replacement as the other version above but only for the
+/// dereferencing uses of `oldMemRef` in `op`, except in cases where
+/// 'allowNonDereferencingOps' is set to true, in which case we replace the
+/// non-dereferencing uses as well.
+LogicalResult replaceAllMemRefUsesWith(Value oldMemRef, Value newMemRef,
+                                       Operation *op,
+                                       ArrayRef<Value> extraIndices = {},
+                                       AffineMap indexRemap = AffineMap(),
+                                       ArrayRef<Value> extraOperands = {},
+                                       ArrayRef<Value> symbolOperands = {},
+                                       bool allowNonDereferencingOps = false);
+
+/// Rewrites the memref defined by this alloc op to have an identity layout
+/// map and updates all its indexing uses. Returns failure if any of its uses
+/// escape (while leaving the IR in a valid state).
+LogicalResult normalizeMemRef(memref::AllocOp *op);
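The doxygen example above translates into roughly the following call; this is a hypothetical sketch in which `b` (an OpBuilder), `oldMemRef`, `newMemRef`, `tMod2` (the SSA value for `%t mod 2`), and `ii` are assumed to already be in scope:

```c++
// Sketch: rewrite load %A[%i, %j] into load %Abuf[%t mod 2, %ii - %i, %j].
mlir::AffineExpr d0 = b.getAffineDimExpr(0), d1 = b.getAffineDimExpr(1),
                 d2 = b.getAffineDimExpr(2);
// (d0, d1, d2) -> (d0 - d1, d2): d0 is the extra operand %ii; d1 and d2 are
// the old access indices (%i, %j).
mlir::AffineMap indexRemap = mlir::AffineMap::get(
    /*dimCount=*/3, /*symbolCount=*/0, {d0 - d1, d2}, b.getContext());
if (failed(mlir::replaceAllMemRefUsesWith(oldMemRef, newMemRef,
                                          /*extraIndices=*/{tMod2}, indexRemap,
                                          /*extraOperands=*/{ii})))
  return mlir::failure();
```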
+
+/// Uses the old memref type map layout and computes the new memref type to
+/// have a new shape and a layout map, where the old layout map has been
+/// normalized to an identity layout map. It returns the old memref type in
+/// case no normalization was needed or a failure occurs while transforming
+/// the old map layout to an identity layout map.
+MemRefType normalizeMemRefType(MemRefType memrefType, OpBuilder builder,
+                               unsigned numSymbolicOperands);
+
+/// Creates and inserts into 'builder' a new AffineApplyOp, with the number of
+/// its results equal to the number of operands, as a composition
+/// of all other AffineApplyOps reachable from input parameter 'operands'. If
+/// different operands were drawing results from multiple affine apply ops,
+/// these will also be collected into a single (multi-result) affine apply op.
+/// The final results of the composed AffineApplyOp are returned in output
+/// parameter 'results'. Returns the affine apply op created.
+Operation *createComposedAffineApplyOp(OpBuilder &builder, Location loc,
+                                       ArrayRef<Value> operands,
+                                       ArrayRef<Operation *> affineApplyOps,
+                                       SmallVectorImpl<Value> *results);
+
+/// Given an operation, inserts one or more single-result affine apply
+/// operations, results of which are exclusively used by this operation.
+/// The operands of these newly created affine apply ops are
+/// guaranteed to be loop iterators or terminal symbols of a function.
+///
+/// Before
+///
+/// affine.for %i = 0 to #map(%N)
+///   %idx = affine.apply (d0) -> (d0 mod 2) (%i)
+///   send %A[%idx], ...
+///   %v = "compute"(%idx, ...)
+///
+/// After
+///
+/// affine.for %i = 0 to #map(%N)
+///   %idx = affine.apply (d0) -> (d0 mod 2) (%i)
+///   send %A[%idx], ...
+///   %idx_ = affine.apply (d0) -> (d0 mod 2) (%i)
+///   %v = "compute"(%idx_, ...)
+///
+/// This allows the application of different transformations on send and
+/// compute (e.g., different shifts/delays).
+///
+/// Fills `sliceOps` with the list of affine.apply operations.
+/// In the following cases, `sliceOps` remains empty:
+///   1. If none of opInst's operands were the result of an affine.apply
+///      (i.e., there was no affine computation slice to create).
+///   2. If all the affine.apply ops supplying operands to this opInst did not
+///      have any uses other than those in this opInst.
+void createAffineComputationSlice(Operation *opInst,
+                                  SmallVectorImpl<AffineApplyOp> *sliceOps);
+
 } // namespace mlir
 #endif // MLIR_DIALECT_AFFINE_UTILS_H
diff --git a/mlir/include/mlir/Dialect/SCF/Passes.h b/mlir/include/mlir/Dialect/SCF/Passes.h
--- a/mlir/include/mlir/Dialect/SCF/Passes.h
+++ b/mlir/include/mlir/Dialect/SCF/Passes.h
@@ -32,6 +32,10 @@
 /// inside of scf.for loops with known lower and upper bounds.
 std::unique_ptr<Pass> createSCFForLoopCanonicalizationPass();
+/// Creates a pass that transforms a single ParallelLoop over N induction
+/// variables into another ParallelLoop over fewer than N induction variables.
+std::unique_ptr<Pass> createParallelLoopCollapsingPass();
+
 /// Creates a loop fusion pass which fuses parallel loops.
 std::unique_ptr<Pass> createParallelLoopFusionPass();
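The collapsing pass is driven by the `collapsed-indices-*` list options declared in the TableGen def below, so it is most naturally configured through a textual pipeline. A hedged sketch (the nesting name `builtin.func` and the exact option grouping depend on the MLIR revision in use):

```c++
// Sketch: configuring parallel-loop-collapsing via a pipeline string.
#include "mlir/Pass/PassManager.h"
#include "mlir/Pass/PassRegistry.h"

mlir::LogicalResult runCollapsing(mlir::MLIRContext &context) {
  mlir::PassManager pm(&context);
  // Combine loop indices 0 and 1 into the position-0 induction variable,
  // and index 2 into the position-1 induction variable.
  return mlir::parsePassPipeline(
      "builtin.func(parallel-loop-collapsing{collapsed-indices-0=0,1 "
      "collapsed-indices-1=2})",
      pm);
}
```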
diff --git a/mlir/include/mlir/Dialect/SCF/Passes.td b/mlir/include/mlir/Dialect/SCF/Passes.td
--- a/mlir/include/mlir/Dialect/SCF/Passes.td
+++ b/mlir/include/mlir/Dialect/SCF/Passes.td
@@ -52,6 +52,22 @@
   let constructor = "mlir::createParallelLoopFusionPass()";
 }
+def SCFParallelLoopCollapsing : Pass<"parallel-loop-collapsing"> {
+  let summary = "Collapse parallel loops to use fewer induction variables";
+  let constructor = "mlir::createParallelLoopCollapsingPass()";
+  let options = [
+    ListOption<"clCollapsedIndices0", "collapsed-indices-0", "unsigned",
+               "Which loop indices to combine into the position 0 loop index",
+               "llvm::cl::MiscFlags::CommaSeparated">,
+    ListOption<"clCollapsedIndices1", "collapsed-indices-1", "unsigned",
+               "Which loop indices to combine into the position 1 loop index",
+               "llvm::cl::MiscFlags::CommaSeparated">,
+    ListOption<"clCollapsedIndices2", "collapsed-indices-2", "unsigned",
+               "Which loop indices to combine into the position 2 loop index",
+               "llvm::cl::MiscFlags::CommaSeparated">,
+  ];
+}
+
 def SCFParallelLoopSpecialization
     : Pass<"parallel-loop-specialization", "FuncOp"> {
   let summary = "Specialize parallel loops for vectorization";
diff --git a/mlir/include/mlir/Interfaces/LoopLikeInterface.h b/mlir/include/mlir/Interfaces/LoopLikeInterface.h
--- a/mlir/include/mlir/Interfaces/LoopLikeInterface.h
+++ b/mlir/include/mlir/Interfaces/LoopLikeInterface.h
@@ -15,7 +15,20 @@
 #include "mlir/IR/OpDefinition.h"
+//===----------------------------------------------------------------------===//
+// LoopLike Interfaces
+//===----------------------------------------------------------------------===//
+
 /// Include the generated interface declarations.
 #include "mlir/Interfaces/LoopLikeInterface.h.inc"
+
+//===----------------------------------------------------------------------===//
+// LoopLike Utilities
+//===----------------------------------------------------------------------===//
+
+namespace mlir {
+/// Move loop invariant code out of a `looplike` operation.
+LogicalResult moveLoopInvariantCode(LoopLikeOpInterface looplike);
+} // namespace mlir
+
 #endif // MLIR_INTERFACES_LOOPLIKEINTERFACE_H_
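Since `moveLoopInvariantCode` now lives alongside the interface rather than in the affine loop utilities, any op implementing `LoopLikeOpInterface` can reuse it without pulling in dialect-specific headers. A minimal sketch of the intended usage (the wrapper function is hypothetical):

```c++
// Sketch: hoisting invariants out of any loop-like operation.
#include "mlir/Interfaces/LoopLikeInterface.h"

mlir::LogicalResult hoistInvariants(mlir::Operation *op) {
  // The interface-based entry point works for affine.for, scf.for, etc.
  if (auto loopLike = llvm::dyn_cast<mlir::LoopLikeOpInterface>(op))
    return mlir::moveLoopInvariantCode(loopLike);
  return mlir::failure();
}
```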
diff --git a/mlir/include/mlir/Transforms/Passes.h b/mlir/include/mlir/Transforms/Passes.h
--- a/mlir/include/mlir/Transforms/Passes.h
+++ b/mlir/include/mlir/Transforms/Passes.h
@@ -22,13 +22,8 @@
 namespace mlir {
-class AffineForOp;
 class GreedyRewriteConfig;
-/// Fusion mode to attempt. The default mode `Greedy` does both
-/// producer-consumer and sibling fusion.
-enum FusionMode { Greedy, ProducerConsumer, Sibling };
-
 //===----------------------------------------------------------------------===//
 // Passes
 //===----------------------------------------------------------------------===//
@@ -53,31 +48,10 @@
 /// Creates a pass to perform common sub expression elimination.
 std::unique_ptr<Pass> createCSEPass();
-/// Creates a loop fusion pass which fuses loops according to type of fusion
-/// specified in `fusionMode`. Buffers of size less than or equal to
-/// `localBufSizeThreshold` are promoted to memory space `fastMemorySpace`.
-std::unique_ptr<OperationPass<FuncOp>>
-createLoopFusionPass(unsigned fastMemorySpace = 0,
-                     uint64_t localBufSizeThreshold = 0,
-                     bool maximalFusion = false,
-                     enum FusionMode fusionMode = FusionMode::Greedy);
-
 /// Creates a loop invariant code motion pass that hoists loop invariant
 /// instructions out of the loop.
 std::unique_ptr<Pass> createLoopInvariantCodeMotionPass();
-/// Creates a pass to pipeline explicit movement of data across levels of the
-/// memory hierarchy.
-std::unique_ptr<OperationPass<FuncOp>> createPipelineDataTransferPass();
-
-/// Creates a pass that transforms perfectly nested loops with independent
-/// bounds into a single loop.
-std::unique_ptr<OperationPass<FuncOp>> createLoopCoalescingPass();
-
-/// Creates a pass that transforms a single ParallelLoop over N induction
-/// variables into another ParallelLoop over less than N induction variables.
-std::unique_ptr<Pass> createParallelLoopCollapsingPass();
-
 /// Creates a pass to strip debug information from a function.
 std::unique_ptr<Pass> createStripDebugInfoPass();
diff --git a/mlir/include/mlir/Transforms/Passes.td b/mlir/include/mlir/Transforms/Passes.td
--- a/mlir/include/mlir/Transforms/Passes.td
+++ b/mlir/include/mlir/Transforms/Passes.td
@@ -16,207 +16,6 @@
 include "mlir/Pass/PassBase.td"
 include "mlir/Rewrite/PassUtil.td"
-def AffineLoopFusion : Pass<"affine-loop-fusion", "FuncOp"> {
-  let summary = "Fuse affine loop nests";
-  let description = [{
-    This pass performs fusion of loop nests using a slicing-based approach. It
-    combines two fusion strategies: producer-consumer fusion and sibling fusion.
-    Producer-consumer fusion is aimed at fusing pairs of loops where the first
-    one writes to a memref that the second reads. Sibling fusion targets pairs
-    of loops that share no dependences between them but that load from the same
-    memref. The fused loop nests, when possible, are rewritten to access
-    significantly smaller local buffers instead of the original memref's, and
-    the latter are often either completely optimized away or contracted. This
-    transformation leads to enhanced locality and lower memory footprint through
-    the elimination or contraction of temporaries/intermediate memref's. These
-    benefits are sometimes achieved at the expense of redundant computation
-    through a cost model that evaluates available choices such as the depth at
-    which a source slice should be materialized in the designation slice.
-
-    Example 1: Producer-consumer fusion.
-    Input:
-    ```mlir
-    func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
-      %0 = memref.alloc() : memref<10xf32>
-      %1 = memref.alloc() : memref<10xf32>
-      %cst = arith.constant 0.000000e+00 : f32
-      affine.for %arg2 = 0 to 10 {
-        affine.store %cst, %0[%arg2] : memref<10xf32>
-        affine.store %cst, %1[%arg2] : memref<10xf32>
-      }
-      affine.for %arg2 = 0 to 10 {
-        %2 = affine.load %0[%arg2] : memref<10xf32>
-        %3 = arith.addf %2, %2 : f32
-        affine.store %3, %arg0[%arg2] : memref<10xf32>
-      }
-      affine.for %arg2 = 0 to 10 {
-        %2 = affine.load %1[%arg2] : memref<10xf32>
-        %3 = arith.mulf %2, %2 : f32
-        affine.store %3, %arg1[%arg2] : memref<10xf32>
-      }
-      return
-    }
-    ```
-    Output:
-    ```mlir
-    func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
-      %0 = memref.alloc() : memref<1xf32>
-      %1 = memref.alloc() : memref<1xf32>
-      %cst = arith.constant 0.000000e+00 : f32
-      affine.for %arg2 = 0 to 10 {
-        affine.store %cst, %0[0] : memref<1xf32>
-        affine.store %cst, %1[0] : memref<1xf32>
-        %2 = affine.load %1[0] : memref<1xf32>
-        %3 = arith.mulf %2, %2 : f32
-        affine.store %3, %arg1[%arg2] : memref<10xf32>
-        %4 = affine.load %0[0] : memref<1xf32>
-        %5 = arith.addf %4, %4 : f32
-        affine.store %5, %arg0[%arg2] : memref<10xf32>
-      }
-      return
-    }
-    ```
-
-    Example 2: Sibling fusion.
-    Input:
-    ```mlir
-    func @sibling_fusion(%arg0: memref<10x10xf32>, %arg1: memref<10x10xf32>,
-                         %arg2: memref<10x10xf32>, %arg3: memref<10x10xf32>,
-                         %arg4: memref<10x10xf32>) {
-      affine.for %arg5 = 0 to 3 {
-        affine.for %arg6 = 0 to 3 {
-          %0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
-          %1 = affine.load %arg1[%arg5, %arg6] : memref<10x10xf32>
-          %2 = arith.mulf %0, %1 : f32
-          affine.store %2, %arg3[%arg5, %arg6] : memref<10x10xf32>
-        }
-      }
-      affine.for %arg5 = 0 to 3 {
-        affine.for %arg6 = 0 to 3 {
-          %0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
-          %1 = affine.load %arg2[%arg5, %arg6] : memref<10x10xf32>
-          %2 = arith.addf %0, %1 : f32
-          affine.store %2, %arg4[%arg5, %arg6] : memref<10x10xf32>
-        }
-      }
-      return
-    }
-    ```
-    Output:
-    ```mlir
-    func @sibling_fusion(%arg0: memref<10x10xf32>, %arg1: memref<10x10xf32>,
-                         %arg2: memref<10x10xf32>, %arg3: memref<10x10xf32>,
-                         %arg4: memref<10x10xf32>) {
-      affine.for %arg5 = 0 to 3 {
-        affine.for %arg6 = 0 to 3 {
-          %0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
-          %1 = affine.load %arg1[%arg5, %arg6] : memref<10x10xf32>
-          %2 = arith.mulf %0, %1 : f32
-          affine.store %2, %arg3[%arg5, %arg6] : memref<10x10xf32>
-          %3 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
-          %4 = affine.load %arg2[%arg5, %arg6] : memref<10x10xf32>
-          %5 = arith.addf %3, %4 : f32
-          affine.store %5, %arg4[%arg5, %arg6] : memref<10x10xf32>
-        }
-      }
-      return
-    }
-    ```
-  }];
-  let constructor = "mlir::createLoopFusionPass()";
-  let options = [
-    Option<"computeToleranceThreshold", "fusion-compute-tolerance", "double",
-           /*default=*/"0.30f", "Fractional increase in additional computation "
-           "tolerated while fusing">,
-    Option<"fastMemorySpace", "fusion-fast-mem-space", "unsigned",
-           /*default=*/"0",
-           "Faster memory space number to promote fusion buffers to">,
-    Option<"localBufSizeThreshold", "fusion-local-buf-threshold", "uint64_t",
-           /*default=*/"0", "Threshold size (KiB) for promoting local buffers "
-           "to fast memory space">,
-    Option<"maximalFusion", "fusion-maximal", "bool", /*default=*/"false",
-           "Enables maximal loop fusion">,
-    Option<"affineFusionMode", "mode", "enum FusionMode",
-           "mlir::FusionMode::Greedy", "fusion mode to attempt",
-           "llvm::cl::values(clEnumValN(mlir::FusionMode::Greedy,"
-           " \"greedy\", \"Perform greedy (both producer-consumer and sibling) fusion\"), "
-           "clEnumValN( mlir::FusionMode::ProducerConsumer, "
-           "\"producer\", \"Perform only producer-consumer fusion\"), "
-           "clEnumValN( mlir::FusionMode::Sibling, "
-           "\"sibling\", \"Perform only sibling fusion\"))">,
-  ];
-  let dependentDialects = ["memref::MemRefDialect"];
-}
-
-def AffinePipelineDataTransfer
-    : Pass<"affine-pipeline-data-transfer", "FuncOp"> {
-  let summary = "Pipeline non-blocking data transfers between explicitly "
-                "managed levels of the memory hierarchy";
-  let description = [{
-    This pass performs a transformation to overlap non-blocking DMA operations
-    in a loop with computations through double buffering. This is achieved by
-    advancing dma_start operations with respect to other operations.
-
-    Input
-
-    ```mlir
-    func @pipelinedatatransfer() {
-      %0 = memref.alloc() : memref<256xf32>
-      %1 = memref.alloc() : memref<32xf32, 1>
-      %2 = memref.alloc() : memref<1xf32>
-      %c0 = arith.constant 0 : index
-      %c128 = arith.constant 128 : index
-      affine.for %i0 = 0 to 8 {
-        affine.dma_start %0[%i0], %1[%i0], %2[%c0], %c128 : memref<256xf32>, memref<32xf32, 1>, memref<1xf32>
-        affine.dma_wait %2[%c0], %c128 : memref<1xf32>
-        %3 = affine.load %1[%i0] : memref<32xf32, 1>
-        %4 = "compute"(%3) : (f32) -> f32
-        affine.store %4, %1[%i0] : memref<32xf32, 1>
-      }
-      return
-    }
-    ```
-
-    Output
-
-    ```mlir
-    module {
-      func @pipelinedatatransfer() {
-        %c8 = arith.constant 8 : index
-        %c0 = arith.constant 0 : index
-        %0 = memref.alloc() : memref<256xf32>
-        %c0_0 = arith.constant 0 : index
-        %c128 = arith.constant 128 : index
-        %1 = memref.alloc() : memref<2x32xf32, 1>
-        %2 = memref.alloc() : memref<2x1xf32>
-        affine.dma_start %0[%c0], %1[%c0 mod 2, %c0], %2[%c0 mod 2, symbol(%c0_0)], %c128 : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
-        affine.for %arg0 = 1 to 8 {
-          affine.dma_start %0[%arg0], %1[%arg0 mod 2, %arg0], %2[%arg0 mod 2, symbol(%c0_0)], %c128 : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
-          %8 = affine.apply #map3(%arg0)
-          %9 = affine.apply #map4(%8)
-          %10 = affine.apply #map4(%8)
-          affine.dma_wait %2[%8 mod 2, symbol(%c0_0)], %c128 : memref<2x1xf32>
-          %11 = affine.load %1[%8 mod 2, %8] : memref<2x32xf32, 1>
-          %12 = "compute"(%11) : (f32) -> f32
-          affine.store %12, %1[%8 mod 2, %8] : memref<2x32xf32, 1>
-        }
-        %3 = affine.apply #map3(%c8)
-        %4 = affine.apply #map4(%3)
-        %5 = affine.apply #map4(%3)
-        affine.dma_wait %2[%3 mod 2, symbol(%c0_0)], %c128 : memref<2x1xf32>
-        %6 = affine.load %1[%3 mod 2, %3] : memref<2x32xf32, 1>
-        %7 = "compute"(%6) : (f32) -> f32
-        affine.store %7, %1[%3 mod 2, %3] : memref<2x32xf32, 1>
-        memref.dealloc %2 : memref<2x1xf32>
-        memref.dealloc %1 : memref<2x32xf32, 1>
-        return
-      }
-    }
-    ```
-  }];
-  let constructor = "mlir::createPipelineDataTransferPass()";
-}
-
 def Canonicalizer : Pass<"canonicalize"> {
   let summary = "Canonicalize operations";
   let description = [{
@@ -315,34 +114,11 @@
   ];
 }
-def LoopCoalescing : Pass<"loop-coalescing", "FuncOp"> {
-  let summary = "Coalesce nested loops with independent bounds into a single "
-                "loop";
-  let constructor = "mlir::createLoopCoalescingPass()";
-  let dependentDialects = ["arith::ArithmeticDialect"];
-}
-
 def LoopInvariantCodeMotion : Pass<"loop-invariant-code-motion"> {
   let summary = "Hoist loop invariant instructions outside of the loop";
   let constructor = "mlir::createLoopInvariantCodeMotionPass()";
 }
-def ParallelLoopCollapsing : Pass<"parallel-loop-collapsing"> {
-  let summary = "Collapse parallel loops to use less induction variables";
-  let constructor = "mlir::createParallelLoopCollapsingPass()";
-  let options = [
-    ListOption<"clCollapsedIndices0", "collapsed-indices-0", "unsigned",
-               "Which loop indices to combine 0th loop index",
-               "llvm::cl::MiscFlags::CommaSeparated">,
-    ListOption<"clCollapsedIndices1", "collapsed-indices-1", "unsigned",
-               "Which loop indices to combine into the position 1 loop index",
-               "llvm::cl::MiscFlags::CommaSeparated">,
-    ListOption<"clCollapsedIndices2", "collapsed-indices-2", "unsigned",
-               "Which loop indices to combine into the position 2 loop index",
-               "llvm::cl::MiscFlags::CommaSeparated">,
-  ];
-}
-
 def PrintOpStats : Pass<"print-op-stats"> {
   let summary = "Print statistics of operations";
   let constructor = "mlir::createPrintOpStatsPass()";
diff --git a/mlir/include/mlir/Transforms/Utils.h b/mlir/include/mlir/Transforms/Utils.h
deleted file mode 100644
--- a/mlir/include/mlir/Transforms/Utils.h
+++ /dev/null
@@ -1,152 +0,0 @@
-//===- Utils.h - General transformation utilities ---------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This header file defines prototypes for various transformation utilities for
-// memref's and non-loop IR structures. These are not passes by themselves but
-// are used either by passes, optimization sequences, or in turn by other
-// transformation utilities.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MLIR_TRANSFORMS_UTILS_H
-#define MLIR_TRANSFORMS_UTILS_H
-
-#include "mlir/Dialect/StandardOps/IR/Ops.h"
-#include "mlir/IR/AffineMap.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-
-namespace mlir {
-
-class AffineApplyOp;
-class AffineForOp;
-class Location;
-class OpBuilder;
-
-namespace memref {
-class AllocOp;
-} // namespace memref
-
-/// Replaces all "dereferencing" uses of `oldMemRef` with `newMemRef` while
-/// optionally remapping the old memref's indices using the supplied affine map,
-/// `indexRemap`. The new memref could be of a different shape or rank.
-/// `extraIndices` provides any additional access indices to be added to the
-/// start.
-///
-/// `indexRemap` remaps indices of the old memref access to a new set of indices
-/// that are used to index the memref. Additional input operands to indexRemap
-/// can be optionally provided in `extraOperands`, and they occupy the start
-/// of its input list. `indexRemap`'s dimensional inputs are expected to
-/// correspond to memref's indices, and its symbolic inputs if any should be
-/// provided in `symbolOperands`.
-///
-/// `domOpFilter`, if non-null, restricts the replacement to only those
-/// operations that are dominated by the former; similarly, `postDomOpFilter`
-/// restricts replacement to only those operations that are postdominated by it.
-///
-/// 'allowNonDereferencingOps', if set, allows replacement of non-dereferencing
-/// uses of a memref without any requirement for access index rewrites as long
-/// as the user operation has the MemRefsNormalizable trait. The default value
-/// of this flag is false.
-///
-/// 'replaceInDeallocOp', if set, lets DeallocOp, a non-dereferencing user, to
-/// also be a candidate for replacement. The default value of this flag is
-/// false.
-///
-/// Returns true on success and false if the replacement is not possible,
-/// whenever a memref is used as an operand in a non-dereferencing context and
-/// 'allowNonDereferencingOps' is false, except for dealloc's on the memref
-/// which are left untouched. See comments at function definition for an
-/// example.
-//
-// Ex: to replace load %A[%i, %j] with load %Abuf[%t mod 2, %ii - %i, %j]:
-// The SSA value corresponding to '%t mod 2' should be in 'extraIndices', and
-// index remap will perform (%i, %j) -> (%ii - %i, %j), i.e., indexRemap = (d0,
-// d1, d2) -> (d0 - d1, d2), and %ii will be the extra operand. Without any
-// extra operands, note that 'indexRemap' would just be applied to existing
-// indices (%i, %j).
-// TODO: allow extraIndices to be added at any position.
-LogicalResult replaceAllMemRefUsesWith(
-    Value oldMemRef, Value newMemRef, ArrayRef<Value> extraIndices = {},
-    AffineMap indexRemap = AffineMap(), ArrayRef<Value> extraOperands = {},
-    ArrayRef<Value> symbolOperands = {}, Operation *domOpFilter = nullptr,
-    Operation *postDomOpFilter = nullptr, bool allowNonDereferencingOps = false,
-    bool replaceInDeallocOp = false);
-
-/// Performs the same replacement as the other version above but only for the
-/// dereferencing uses of `oldMemRef` in `op`, except in cases where
-/// 'allowNonDereferencingOps' is set to true where we replace the
-/// non-dereferencing uses as well.
-LogicalResult replaceAllMemRefUsesWith(Value oldMemRef, Value newMemRef,
-                                       Operation *op,
-                                       ArrayRef<Value> extraIndices = {},
-                                       AffineMap indexRemap = AffineMap(),
-                                       ArrayRef<Value> extraOperands = {},
-                                       ArrayRef<Value> symbolOperands = {},
-                                       bool allowNonDereferencingOps = false);
-
-/// Rewrites the memref defined by this alloc op to have an identity layout map
-/// and updates all its indexing uses. Returns failure if any of its uses
-/// escape (while leaving the IR in a valid state).
-LogicalResult normalizeMemRef(memref::AllocOp *op);
-
-/// Uses the old memref type map layout and computes the new memref type to have
-/// a new shape and a layout map, where the old layout map has been normalized
-/// to an identity layout map. It returns the old memref in case no
-/// normalization was needed or a failure occurs while transforming the old map
-/// layout to an identity layout map.
-MemRefType normalizeMemRefType(MemRefType memrefType, OpBuilder builder,
-                               unsigned numSymbolicOperands);
-
-/// Creates and inserts into 'builder' a new AffineApplyOp, with the number of
-/// its results equal to the number of operands, as a composition
-/// of all other AffineApplyOps reachable from input parameter 'operands'. If
-/// different operands were drawing results from multiple affine apply ops,
-/// these will also be collected into a single (multi-result) affine apply op.
-/// The final results of the composed AffineApplyOp are returned in output
-/// parameter 'results'. Returns the affine apply op created.
-Operation *createComposedAffineApplyOp(OpBuilder &builder, Location loc,
-                                       ArrayRef<Value> operands,
-                                       ArrayRef<Operation *> affineApplyOps,
-                                       SmallVectorImpl<Value> *results);
-
-/// Given an operation, inserts one or more single result affine apply
-/// operations, results of which are exclusively used by this operation.
-/// The operands of these newly created affine apply ops are
-/// guaranteed to be loop iterators or terminal symbols of a function.
-///
-/// Before
-///
-/// affine.for %i = 0 to #map(%N)
-///   %idx = affine.apply (d0) -> (d0 mod 2) (%i)
-///   send %A[%idx], ...
-///   %v = "compute"(%idx, ...)
-///
-/// After
-///
-/// affine.for %i = 0 to #map(%N)
-///   %idx = affine.apply (d0) -> (d0 mod 2) (%i)
-///   send %A[%idx], ...
-///   %idx_ = affine.apply (d0) -> (d0 mod 2) (%i)
-///   %v = "compute"(%idx_, ...)
-
-/// This allows the application of different transformations on send and
-/// compute (for eg. different shifts/delays)
-///
-/// Fills `sliceOps` with the list of affine.apply operations.
-/// In the following cases, `sliceOps` remains empty:
-///   1. If none of opInst's operands were the result of an affine.apply
-///      (i.e., there was no affine computation slice to create).
-///   2. If all the affine.apply op's supplying operands to this opInst did not
-///      have any uses other than those in this opInst.
-void createAffineComputationSlice(Operation *opInst,
-                                  SmallVectorImpl<AffineApplyOp> *sliceOps);
-
-} // namespace mlir
-
-#endif // MLIR_TRANSFORMS_UTILS_H
diff --git a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
--- a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
+++ b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
@@ -16,6 +16,7 @@
 #include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/Dialect/GPU/GPUDialect.h"
 #include "mlir/Dialect/GPU/ParallelLoopMapper.h"
@@ -27,7 +28,6 @@
 #include "mlir/IR/Builders.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/DialectConversion.h"
-#include "mlir/Transforms/LoopUtils.h"
 #include "mlir/Transforms/Passes.h"
 #include "mlir/Transforms/RegionUtils.h"
 #include "llvm/ADT/Sequence.h"
diff --git a/mlir/lib/Conversion/SCFToStandard/SCFToStandard.cpp b/mlir/lib/Conversion/SCFToStandard/SCFToStandard.cpp
--- a/mlir/lib/Conversion/SCFToStandard/SCFToStandard.cpp
+++ b/mlir/lib/Conversion/SCFToStandard/SCFToStandard.cpp
@@ -13,6 +13,7 @@
 #include "mlir/Conversion/SCFToStandard/SCFToStandard.h"
 #include "../PassDetail.h"
+#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/Dialect/SCF/SCF.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
@@ -23,7 +24,6 @@
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/Passes.h"
-#include "mlir/Transforms/Utils.h"
 using namespace mlir;
 using namespace mlir::scf;
diff --git a/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp b/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp
--- a/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp
+++ b/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp
@@ -19,6 +19,7 @@
 #include "mlir/Conversion/LLVMCommon/VectorPattern.h"
 #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
 #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"
+#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/LLVMIR/FunctionCallUtils.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
@@ -33,7 +34,6 @@
 #include "mlir/Support/MathExtras.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/Passes.h"
-#include "mlir/Transforms/Utils.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/IRBuilder.h"
diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineDataCopyGeneration.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineDataCopyGeneration.cpp
--- a/mlir/lib/Dialect/Affine/Transforms/AffineDataCopyGeneration.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/AffineDataCopyGeneration.cpp
@@ -22,12 +22,12 @@
 #include "PassDetail.h"
 #include "mlir/Dialect/Affine/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/Affine/Passes.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-#include "mlir/Transforms/LoopUtils.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineLoopInvariantCodeMotion.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineLoopInvariantCodeMotion.cpp
--- a/mlir/lib/Dialect/Affine/Transforms/AffineLoopInvariantCodeMotion.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/AffineLoopInvariantCodeMotion.cpp
@@ -17,13 +17,13 @@
 #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
 #include "mlir/Dialect/Affine/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/Affine/Passes.h"
+#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/IR/AffineExpr.h"
 #include "mlir/IR/AffineMap.h"
 #include "mlir/IR/Builders.h"
-#include "mlir/Transforms/LoopUtils.h"
-#include "mlir/Transforms/Utils.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/SmallPtrSet.h"
diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp
--- a/mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp
@@ -18,10 +18,10 @@
 #include "mlir/Dialect/Affine/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/Affine/Passes.h"
 #include "mlir/Dialect/Affine/Passes.h.inc"
 #include "mlir/Dialect/Affine/Utils.h"
-#include "mlir/Transforms/LoopUtils.h"
 #include "llvm/Support/Debug.h"
 #include <deque>
diff --git a/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt
--- a/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt
@@ -4,9 +4,12 @@
   AffineLoopNormalize.cpp
   AffineParallelize.cpp
   AffineScalarReplacement.cpp
+  LoopCoalescing.cpp
+  LoopFusion.cpp
   LoopTiling.cpp
   LoopUnroll.cpp
   LoopUnrollAndJam.cpp
+  PipelineDataTransfer.cpp
   SuperVectorize.cpp
   SimplifyAffineStructures.cpp
diff --git a/mlir/lib/Transforms/LoopCoalescing.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopCoalescing.cpp
rename from mlir/lib/Transforms/LoopCoalescing.cpp
rename to mlir/lib/Dialect/Affine/Transforms/LoopCoalescing.cpp
--- a/mlir/lib/Transforms/LoopCoalescing.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/LoopCoalescing.cpp
@@ -8,9 +8,9 @@
 #include "PassDetail.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/Dialect/SCF/SCF.h"
-#include "mlir/Transforms/LoopUtils.h"
 #include "mlir/Transforms/Passes.h"
 #include "mlir/Transforms/RegionUtils.h"
 #include "llvm/Support/Debug.h"
diff --git a/mlir/lib/Transforms/LoopFusion.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp
rename from mlir/lib/Transforms/LoopFusion.cpp
rename to mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp
--- a/mlir/lib/Transforms/LoopFusion.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp
@@ -16,14 +16,14 @@
 #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
 #include "mlir/Dialect/Affine/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/LoopFusionUtils.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
+#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/IR/AffineExpr.h"
 #include "mlir/IR/AffineMap.h"
 #include "mlir/IR/Builders.h"
-#include "mlir/Transforms/LoopFusionUtils.h"
-#include "mlir/Transforms/LoopUtils.h"
 #include "mlir/Transforms/Passes.h"
-#include "mlir/Transforms/Utils.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/SetVector.h"
diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp
--- a/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp
@@ -17,11 +17,11 @@
 #include "mlir/Dialect/Affine/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/Affine/Passes.h"
+#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/IR/BlockAndValueMapping.h"
 #include "mlir/IR/Builders.h"
-#include "mlir/Transforms/LoopUtils.h"
-#include "mlir/Transforms/Utils.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 using namespace mlir;
diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp
--- a/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp
@@ -12,11 +12,11 @@
 #include "PassDetail.h"
 #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/Affine/Passes.h"
 #include "mlir/IR/AffineExpr.h"
 #include "mlir/IR/AffineMap.h"
 #include "mlir/IR/Builders.h"
-#include "mlir/Transforms/LoopUtils.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopUnrollAndJam.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopUnrollAndJam.cpp
--- a/mlir/lib/Dialect/Affine/Transforms/LoopUnrollAndJam.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/LoopUnrollAndJam.cpp
@@ -37,12 +37,12 @@
 #include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
 #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/Affine/Passes.h"
 #include "mlir/IR/AffineExpr.h"
 #include "mlir/IR/AffineMap.h"
 #include "mlir/IR/BlockAndValueMapping.h"
 #include "mlir/IR/Builders.h"
-#include "mlir/Transforms/LoopUtils.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/Support/CommandLine.h"
diff --git a/mlir/lib/Dialect/Affine/Transforms/PassDetail.h b/mlir/lib/Dialect/Affine/Transforms/PassDetail.h
--- a/mlir/lib/Dialect/Affine/Transforms/PassDetail.h
+++ b/mlir/lib/Dialect/Affine/Transforms/PassDetail.h
@@ -9,6 +9,7 @@
 #ifndef DIALECT_AFFINE_TRANSFORMS_PASSDETAIL_H_
 #define DIALECT_AFFINE_TRANSFORMS_PASSDETAIL_H_
+#include "mlir/Dialect/Affine/Passes.h"
 #include "mlir/Pass/Pass.h"
 namespace mlir {
 template <typename ConcreteDialect>
 void registerDialect(DialectRegistry &registry);
@@ -16,6 +17,10 @@
+namespace arith {
+class ArithmeticDialect;
+} // namespace arith
+
 namespace linalg {
 class LinalgDialect;
 } // namespace linalg
diff --git a/mlir/lib/Transforms/PipelineDataTransfer.cpp b/mlir/lib/Dialect/Affine/Transforms/PipelineDataTransfer.cpp
rename from mlir/lib/Transforms/PipelineDataTransfer.cpp
rename to mlir/lib/Dialect/Affine/Transforms/PipelineDataTransfer.cpp
--- a/mlir/lib/Transforms/PipelineDataTransfer.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/PipelineDataTransfer.cpp
@@ -11,17 +11,16 @@
 //===----------------------------------------------------------------------===//
 #include "PassDetail.h"
-#include "mlir/Transforms/Passes.h"
-
 #include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
 #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
 #include "mlir/Dialect/Affine/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
+#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/StandardOps/Utils/Utils.h"
 #include "mlir/IR/Builders.h"
-#include "mlir/Transforms/LoopUtils.h"
-#include "mlir/Transforms/Utils.h"
+#include "mlir/Transforms/Passes.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/Support/Debug.h"
diff --git a/mlir/lib/Dialect/Affine/Transforms/SimplifyAffineStructures.cpp b/mlir/lib/Dialect/Affine/Transforms/SimplifyAffineStructures.cpp
--- a/mlir/lib/Dialect/Affine/Transforms/SimplifyAffineStructures.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/SimplifyAffineStructures.cpp
@@ -14,9 +14,9 @@
 #include "mlir/Dialect/Affine/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/Passes.h"
+#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/IR/IntegerSet.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-#include "mlir/Transforms/Utils.h"
 #define DEBUG_TYPE "simplify-affine-structure"
diff --git a/mlir/lib/Dialect/Affine/Utils/CMakeLists.txt b/mlir/lib/Dialect/Affine/Utils/CMakeLists.txt
--- a/mlir/lib/Dialect/Affine/Utils/CMakeLists.txt
+++ b/mlir/lib/Dialect/Affine/Utils/CMakeLists.txt
@@ -1,4 +1,6 @@
 add_mlir_dialect_library(MLIRAffineUtils
+  LoopFusionUtils.cpp
+  LoopUtils.cpp
   Utils.cpp
   ADDITIONAL_HEADER_DIRS
@@ -7,5 +9,6 @@
   LINK_LIBS PUBLIC
   MLIRAffine
   MLIRAnalysis
+  MLIRMemRef
   MLIRTransformUtils
 )
diff --git a/mlir/lib/Transforms/Utils/LoopFusionUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopFusionUtils.cpp
rename from mlir/lib/Transforms/Utils/LoopFusionUtils.cpp
rename to mlir/lib/Dialect/Affine/Utils/LoopFusionUtils.cpp
--- a/mlir/lib/Transforms/Utils/LoopFusionUtils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/LoopFusionUtils.cpp
@@ -10,21 +10,20 @@
 //
 //===----------------------------------------------------------------------===//
-#include "mlir/Transforms/LoopFusionUtils.h"
-
+#include "mlir/Dialect/Affine/LoopFusionUtils.h"
 #include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
 #include "mlir/Dialect/Affine/Analysis/AffineStructures.h"
 #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
 #include "mlir/Dialect/Affine/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/IR/AffineExpr.h"
 #include "mlir/IR/AffineMap.h"
 #include "mlir/IR/BlockAndValueMapping.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/Operation.h"
-#include "mlir/Transforms/LoopUtils.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Debug.h"
diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
rename from mlir/lib/Transforms/Utils/LoopUtils.cpp
rename to mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
--- a/mlir/lib/Transforms/Utils/LoopUtils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
@@ -10,14 +10,14 @@
 //
 //===----------------------------------------------------------------------===//
-#include "mlir/Transforms/LoopUtils.h"
-
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
 #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
 #include "mlir/Dialect/Affine/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
+#include "mlir/Dialect/Affine/Utils.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/SCF/SCF.h" #include "mlir/IR/BlockAndValueMapping.h" @@ -25,7 +25,6 @@ #include "mlir/Support/MathExtras.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "mlir/Transforms/RegionUtils.h" -#include "mlir/Transforms/Utils.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Support/Debug.h" diff --git a/mlir/lib/Dialect/Affine/Utils/Utils.cpp b/mlir/lib/Dialect/Affine/Utils/Utils.cpp --- a/mlir/lib/Dialect/Affine/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Affine/Utils/Utils.cpp @@ -16,12 +16,14 @@ #include "mlir/Dialect/Affine/Analysis/Utils.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Affine/IR/AffineValueMap.h" +#include "mlir/Dialect/Affine/LoopUtils.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/IR/BlockAndValueMapping.h" #include "mlir/IR/Dominance.h" #include "mlir/IR/IntegerSet.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" -#include "mlir/Transforms/LoopUtils.h" + +#define DEBUG_TYPE "affine-utils" using namespace mlir; @@ -856,3 +858,740 @@ defOp->erase(); } } + +// Perform the replacement in `op`. +LogicalResult mlir::replaceAllMemRefUsesWith(Value oldMemRef, Value newMemRef, + Operation *op, + ArrayRef extraIndices, + AffineMap indexRemap, + ArrayRef extraOperands, + ArrayRef symbolOperands, + bool allowNonDereferencingOps) { + unsigned newMemRefRank = newMemRef.getType().cast().getRank(); + (void)newMemRefRank; // unused in opt mode + unsigned oldMemRefRank = oldMemRef.getType().cast().getRank(); + (void)oldMemRefRank; // unused in opt mode + if (indexRemap) { + assert(indexRemap.getNumSymbols() == symbolOperands.size() && + "symbolic operand count mismatch"); + assert(indexRemap.getNumInputs() == + extraOperands.size() + oldMemRefRank + symbolOperands.size()); + assert(indexRemap.getNumResults() + extraIndices.size() == newMemRefRank); + } else { + assert(oldMemRefRank + extraIndices.size() == newMemRefRank); + } + + // Assert same elemental type. + assert(oldMemRef.getType().cast().getElementType() == + newMemRef.getType().cast().getElementType()); + + SmallVector usePositions; + for (const auto &opEntry : llvm::enumerate(op->getOperands())) { + if (opEntry.value() == oldMemRef) + usePositions.push_back(opEntry.index()); + } + + // If memref doesn't appear, nothing to do. + if (usePositions.empty()) + return success(); + + if (usePositions.size() > 1) { + // TODO: extend it for this case when needed (rare). + assert(false && "multiple dereferencing uses in a single op not supported"); + return failure(); + } + + unsigned memRefOperandPos = usePositions.front(); + + OpBuilder builder(op); + // The following checks if op is dereferencing memref and performs the access + // index rewrites. + auto affMapAccInterface = dyn_cast(op); + if (!affMapAccInterface) { + if (!allowNonDereferencingOps) { + // Failure: memref used in a non-dereferencing context (potentially + // escapes); no replacement in these cases unless allowNonDereferencingOps + // is set. 
+      return failure();
+    }
+    op->setOperand(memRefOperandPos, newMemRef);
+    return success();
+  }
+  // Perform index rewrites for the dereferencing op and then replace the op.
+  NamedAttribute oldMapAttrPair =
+      affMapAccInterface.getAffineMapAttrForMemRef(oldMemRef);
+  AffineMap oldMap = oldMapAttrPair.getValue().cast<AffineMapAttr>().getValue();
+  unsigned oldMapNumInputs = oldMap.getNumInputs();
+  SmallVector<Value, 4> oldMapOperands(
+      op->operand_begin() + memRefOperandPos + 1,
+      op->operand_begin() + memRefOperandPos + 1 + oldMapNumInputs);
+
+  // Apply 'oldMemRefOperands = oldMap(oldMapOperands)'.
+  SmallVector<Value, 4> oldMemRefOperands;
+  SmallVector<Value, 4> affineApplyOps;
+  oldMemRefOperands.reserve(oldMemRefRank);
+  if (oldMap != builder.getMultiDimIdentityMap(oldMap.getNumDims())) {
+    for (auto resultExpr : oldMap.getResults()) {
+      auto singleResMap = AffineMap::get(oldMap.getNumDims(),
+                                         oldMap.getNumSymbols(), resultExpr);
+      auto afOp = builder.create<AffineApplyOp>(op->getLoc(), singleResMap,
+                                                oldMapOperands);
+      oldMemRefOperands.push_back(afOp);
+      affineApplyOps.push_back(afOp);
+    }
+  } else {
+    oldMemRefOperands.assign(oldMapOperands.begin(), oldMapOperands.end());
+  }
+
+  // Construct new indices as a remap of the old ones if a remapping has been
+  // provided. The indices of a memref come right after it, i.e.,
+  // at position memRefOperandPos + 1.
+  SmallVector<Value, 4> remapOperands;
+  remapOperands.reserve(extraOperands.size() + oldMemRefRank +
+                        symbolOperands.size());
+  remapOperands.append(extraOperands.begin(), extraOperands.end());
+  remapOperands.append(oldMemRefOperands.begin(), oldMemRefOperands.end());
+  remapOperands.append(symbolOperands.begin(), symbolOperands.end());
+
+  SmallVector<Value, 4> remapOutputs;
+  remapOutputs.reserve(oldMemRefRank);
+
+  if (indexRemap &&
+      indexRemap != builder.getMultiDimIdentityMap(indexRemap.getNumDims())) {
+    // Remapped indices.
+    for (auto resultExpr : indexRemap.getResults()) {
+      auto singleResMap = AffineMap::get(
+          indexRemap.getNumDims(), indexRemap.getNumSymbols(), resultExpr);
+      auto afOp = builder.create<AffineApplyOp>(op->getLoc(), singleResMap,
+                                                remapOperands);
+      remapOutputs.push_back(afOp);
+      affineApplyOps.push_back(afOp);
+    }
+  } else {
+    // No remapping specified.
+    remapOutputs.assign(remapOperands.begin(), remapOperands.end());
+  }
+
+  SmallVector<Value, 4> newMapOperands;
+  newMapOperands.reserve(newMemRefRank);
+
+  // Prepend 'extraIndices' in 'newMapOperands'.
+  for (Value extraIndex : extraIndices) {
+    assert(extraIndex.getDefiningOp()->getNumResults() == 1 &&
+           "single result ops expected to generate these indices");
+    assert((isValidDim(extraIndex) || isValidSymbol(extraIndex)) &&
+           "invalid memory op index");
+    newMapOperands.push_back(extraIndex);
+  }
+
+  // Append 'remapOutputs' to 'newMapOperands'.
+  newMapOperands.append(remapOutputs.begin(), remapOutputs.end());
+
+  // Create new fully composed AffineMap for new op to be created.
+  assert(newMapOperands.size() == newMemRefRank);
+  auto newMap = builder.getMultiDimIdentityMap(newMemRefRank);
+  // TODO: Avoid creating/deleting temporary AffineApplyOps here.
+  fullyComposeAffineMapAndOperands(&newMap, &newMapOperands);
+  newMap = simplifyAffineMap(newMap);
+  canonicalizeMapAndOperands(&newMap, &newMapOperands);
+  // Remove any affine.apply's that became dead as a result of composition.
+  for (Value value : affineApplyOps)
+    if (value.use_empty())
+      value.getDefiningOp()->erase();
+
+  OperationState state(op->getLoc(), op->getName());
+  // Construct the new operation using this memref.
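+  // For illustration (editorial example; the remap below is hypothetical):
+  // with indexRemap = (d0) -> (d0 floordiv 4, d0 mod 4) and no extra
+  // indices/operands, an access
+  //   affine.load %oldMemRef[%i] : memref<16xf32>
+  // is reconstructed at this point as
+  //   affine.load %newMemRef[%i floordiv 4, %i mod 4] : memref<4x4xf32>.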
+  state.operands.reserve(op->getNumOperands() + extraIndices.size());
+  // Insert the non-memref operands.
+  state.operands.append(op->operand_begin(),
+                        op->operand_begin() + memRefOperandPos);
+  // Insert the new memref value.
+  state.operands.push_back(newMemRef);
+
+  // Insert the new memref map operands.
+  state.operands.append(newMapOperands.begin(), newMapOperands.end());
+
+  // Insert the remaining operands unmodified.
+  state.operands.append(op->operand_begin() + memRefOperandPos + 1 +
+                            oldMapNumInputs,
+                        op->operand_end());
+
+  // Result types don't change. Both memref's are of the same elemental type.
+  state.types.reserve(op->getNumResults());
+  for (auto result : op->getResults())
+    state.types.push_back(result.getType());
+
+  // Add attribute for 'newMap', other Attributes do not change.
+  auto newMapAttr = AffineMapAttr::get(newMap);
+  for (auto namedAttr : op->getAttrs()) {
+    if (namedAttr.getName() == oldMapAttrPair.getName())
+      state.attributes.push_back({namedAttr.getName(), newMapAttr});
+    else
+      state.attributes.push_back(namedAttr);
+  }
+
+  // Create the new operation.
+  auto *repOp = builder.createOperation(state);
+  op->replaceAllUsesWith(repOp);
+  op->erase();
+
+  return success();
+}
+
+LogicalResult mlir::replaceAllMemRefUsesWith(
+    Value oldMemRef, Value newMemRef, ArrayRef<Value> extraIndices,
+    AffineMap indexRemap, ArrayRef<Value> extraOperands,
+    ArrayRef<Value> symbolOperands, Operation *domOpFilter,
+    Operation *postDomOpFilter, bool allowNonDereferencingOps,
+    bool replaceInDeallocOp) {
+  unsigned newMemRefRank = newMemRef.getType().cast<MemRefType>().getRank();
+  (void)newMemRefRank; // unused in opt mode
+  unsigned oldMemRefRank = oldMemRef.getType().cast<MemRefType>().getRank();
+  (void)oldMemRefRank;
+  if (indexRemap) {
+    assert(indexRemap.getNumSymbols() == symbolOperands.size() &&
+           "symbol operand count mismatch");
+    assert(indexRemap.getNumInputs() ==
+           extraOperands.size() + oldMemRefRank + symbolOperands.size());
+    assert(indexRemap.getNumResults() + extraIndices.size() == newMemRefRank);
+  } else {
+    assert(oldMemRefRank + extraIndices.size() == newMemRefRank);
+  }
+
+  // Assert same elemental type.
+  assert(oldMemRef.getType().cast<MemRefType>().getElementType() ==
+         newMemRef.getType().cast<MemRefType>().getElementType());
+
+  std::unique_ptr<DominanceInfo> domInfo;
+  std::unique_ptr<PostDominanceInfo> postDomInfo;
+  if (domOpFilter)
+    domInfo = std::make_unique<DominanceInfo>(
+        domOpFilter->getParentOfType<FuncOp>());
+
+  if (postDomOpFilter)
+    postDomInfo = std::make_unique<PostDominanceInfo>(
+        postDomOpFilter->getParentOfType<FuncOp>());
+
+  // Walk all uses of old memref; collect ops to perform replacement. We use a
+  // DenseSet since an operation could potentially have multiple uses of a
+  // memref (although rare), and the replacement later is going to erase ops.
+  DenseSet<Operation *> opsToReplace;
+  for (auto *op : oldMemRef.getUsers()) {
+    // Skip this use if it's not dominated by domOpFilter.
+    if (domOpFilter && !domInfo->dominates(domOpFilter, op))
+      continue;
+
+    // Skip this use if it's not post-dominated by postDomOpFilter.
+    if (postDomOpFilter && !postDomInfo->postDominates(postDomOpFilter, op))
+      continue;
+
+    // Skip dealloc's - no replacement is necessary, and a memref replacement
+    // at other uses doesn't hurt these dealloc's.
+    if (isa<memref::DeallocOp>(op) && !replaceInDeallocOp)
+      continue;
+
+    // Check if the memref was used in a non-dereferencing context. It is fine
+    // for the memref to be used in a non-dereferencing way outside of the
+    // region where this replacement is happening.
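+    // Typical non-dereferencing users (illustrative examples): a
+    // memref.dealloc, or a call that passes the memref along; such ops hold
+    // the memref as an operand without accessing its elements through an
+    // affine map.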
+    if (!isa<AffineMapAccessInterface>(*op)) {
+      if (!allowNonDereferencingOps) {
+        LLVM_DEBUG(llvm::dbgs()
+                   << "Memref replacement failed: non-dereferencing memref op: \n"
+                   << *op << '\n');
+        return failure();
+      }
+      // Non-dereferencing ops with the MemRefsNormalizable trait are
+      // supported for replacement.
+      if (!op->hasTrait<OpTrait::MemRefsNormalizable>()) {
+        LLVM_DEBUG(llvm::dbgs() << "Memref replacement failed: use without a "
+                                   "memrefs normalizable trait: \n"
+                                << *op << '\n');
+        return failure();
+      }
+    }
+
+    // We'll first collect and then replace --- since replacement erases the op
+    // that has the use, and that op could be postDomFilter or domFilter itself!
+    opsToReplace.insert(op);
+  }
+
+  for (auto *op : opsToReplace) {
+    if (failed(replaceAllMemRefUsesWith(
+            oldMemRef, newMemRef, op, extraIndices, indexRemap, extraOperands,
+            symbolOperands, allowNonDereferencingOps)))
+      llvm_unreachable("memref replacement guaranteed to succeed here");
+  }
+
+  return success();
+}
+
+/// Given an operation, inserts one or more single-result affine apply
+/// operations, the results of which are exclusively used by this operation.
+/// The operands of these newly created affine apply ops are guaranteed to be
+/// loop iterators or terminal symbols of a function.
+///
+/// Before
+///
+/// affine.for %i = 0 to #map(%N)
+///   %idx = affine.apply (d0) -> (d0 mod 2) (%i)
+///   "send"(%idx, %A, ...)
+///   "compute"(%idx)
+///
+/// After
+///
+/// affine.for %i = 0 to #map(%N)
+///   %idx = affine.apply (d0) -> (d0 mod 2) (%i)
+///   "send"(%idx, %A, ...)
+///   %idx_ = affine.apply (d0) -> (d0 mod 2) (%i)
+///   "compute"(%idx_)
+///
+/// This allows applying different transformations on send and compute (e.g.,
+/// different shifts/delays).
+///
+/// Returns the affine.apply operations created, in the output argument
+/// `sliceOps`. No slice is created (and `sliceOps` is left empty) either if
+/// none of opInst's operands were the result of an affine.apply (and thus
+/// there was no affine computation slice to create), or if all of the
+/// affine.apply ops supplying operands to opInst had no uses other than
+/// opInst itself.
+void mlir::createAffineComputationSlice(
+    Operation *opInst, SmallVectorImpl<AffineApplyOp> *sliceOps) {
+  // Collect all operands that are results of affine apply ops.
+  SmallVector<Value, 4> subOperands;
+  subOperands.reserve(opInst->getNumOperands());
+  for (auto operand : opInst->getOperands())
+    if (isa_and_nonnull<AffineApplyOp>(operand.getDefiningOp()))
+      subOperands.push_back(operand);
+
+  // Gather sequence of AffineApplyOps reachable from 'subOperands'.
+  SmallVector<Operation *, 4> affineApplyOps;
+  getReachableAffineApplyOps(subOperands, affineApplyOps);
+  // Skip transforming if there are no affine maps to compose.
+  if (affineApplyOps.empty())
+    return;
+
+  // Check if all uses of the affine apply ops lie only in this op, in
+  // which case there would be nothing to do.
+  bool localized = true;
+  for (auto *op : affineApplyOps) {
+    for (auto result : op->getResults()) {
+      for (auto *user : result.getUsers()) {
+        if (user != opInst) {
+          localized = false;
+          break;
+        }
+      }
+    }
+  }
+  if (localized)
+    return;
+
+  OpBuilder builder(opInst);
+  SmallVector<Value, 4> composedOpOperands(subOperands);
+  auto composedMap = builder.getMultiDimIdentityMap(composedOpOperands.size());
+  fullyComposeAffineMapAndOperands(&composedMap, &composedOpOperands);
+
+  // Create an affine.apply for each of the map results.
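+  // E.g. (illustrative): if the composed map is (d0, d1) -> (d0 + d1, d0 mod 2),
+  // two single-result affine.apply ops are materialized right before opInst:
+  //   %s0 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%a, %b)
+  //   %s1 = affine.apply affine_map<(d0, d1) -> (d0 mod 2)>(%a, %b)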
+  sliceOps->reserve(composedMap.getNumResults());
+  for (auto resultExpr : composedMap.getResults()) {
+    auto singleResMap = AffineMap::get(composedMap.getNumDims(),
+                                       composedMap.getNumSymbols(), resultExpr);
+    sliceOps->push_back(builder.create<AffineApplyOp>(
+        opInst->getLoc(), singleResMap, composedOpOperands));
+  }
+
+  // Construct the new operands that include the results from the composed
+  // affine apply op above instead of the existing ones (subOperands). So, they
+  // differ from opInst's operands only for those operands in 'subOperands',
+  // which are replaced by the corresponding ones from 'sliceOps'.
+  SmallVector<Value, 4> newOperands(opInst->getOperands());
+  for (unsigned i = 0, e = newOperands.size(); i < e; i++) {
+    // Replace the subOperands from among the new operands.
+    unsigned j, f;
+    for (j = 0, f = subOperands.size(); j < f; j++) {
+      if (newOperands[i] == subOperands[j])
+        break;
+    }
+    if (j < subOperands.size()) {
+      newOperands[i] = (*sliceOps)[j];
+    }
+  }
+  for (unsigned idx = 0, e = newOperands.size(); idx < e; idx++) {
+    opInst->setOperand(idx, newOperands[idx]);
+  }
+}
+
+/// Enum to set patterns of affine expr in a tiled-layout map.
+/// TileFloorDiv: <dim expr> div <tile size>
+/// TileMod: <dim expr> mod <tile size>
+/// TileNone: None of the above
+/// Example:
+///   #tiled_2d_128x256 = affine_map<(d0, d1)
+///                -> (d0 div 128, d1 div 256, d0 mod 128, d1 mod 256)>
+///   "d0 div 128" and "d1 div 256" ==> TileFloorDiv
+///   "d0 mod 128" and "d1 mod 256" ==> TileMod
+enum TileExprPattern { TileFloorDiv, TileMod, TileNone };
+
+/// Check if `map` is a tiled layout. In a tiled layout, specific k dimensions
+/// being floordiv'ed by respective tile sizes appear in a mod with the same
+/// tile sizes, and no other expression involves those k dimensions. This
+/// function stores a vector of tuples (`tileSizePos`) including the AffineExpr
+/// for each tile size and the positions of the corresponding `floordiv` and
+/// `mod`. If `map` is not a tiled layout, an empty vector is returned.
+static LogicalResult getTileSizePos(
+    AffineMap map,
+    SmallVectorImpl<std::tuple<AffineExpr, unsigned, unsigned>> &tileSizePos) {
+  // Create `floordivExprs`, which is a vector of tuples including the LHS and
+  // RHS of each `floordiv` and its position in the `map` output.
+  // Example: #tiled_2d_128x256 = affine_map<(d0, d1)
+  //                -> (d0 div 128, d1 div 256, d0 mod 128, d1 mod 256)>
+  // In this example, `floordivExprs` includes {d0, 128, 0} and {d1, 256, 1}.
+  SmallVector<std::tuple<AffineExpr, AffineExpr, unsigned>, 4> floordivExprs;
+  unsigned pos = 0;
+  for (AffineExpr expr : map.getResults()) {
+    if (expr.getKind() == AffineExprKind::FloorDiv) {
+      AffineBinaryOpExpr binaryExpr = expr.cast<AffineBinaryOpExpr>();
+      if (binaryExpr.getRHS().isa<AffineConstantExpr>())
+        floordivExprs.emplace_back(
+            std::make_tuple(binaryExpr.getLHS(), binaryExpr.getRHS(), pos));
+    }
+    pos++;
+  }
+  // Not a tiled layout if `floordivExprs` is empty.
+  if (floordivExprs.empty()) {
+    tileSizePos = SmallVector<std::tuple<AffineExpr, unsigned, unsigned>>{};
+    return success();
+  }
+
+  // Check if the LHS of each `floordiv` is used in the LHS of a `mod`. If not,
+  // `map` is not a tiled layout.
+  for (std::tuple<AffineExpr, AffineExpr, unsigned> fexpr : floordivExprs) {
+    AffineExpr floordivExprLHS = std::get<0>(fexpr);
+    AffineExpr floordivExprRHS = std::get<1>(fexpr);
+    unsigned floordivPos = std::get<2>(fexpr);
+
+    // Walk each affine expr of the `map` output except `fexpr`, and check if
+    // the LHS and RHS of `fexpr` are used in the LHS and RHS of a `mod`. If
+    // the LHS of `fexpr` is used in any other expr, the map is not a tiled
+    // layout.
+    // Examples of non-tiled layouts:
+    //   affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2 floordiv 256)>
+    //   affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2 mod 128)>
+    //   affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2 mod 256,
+    //                               d2 mod 256)>
+    bool found = false;
+    pos = 0;
+    for (AffineExpr expr : map.getResults()) {
+      bool notTiled = false;
+      if (pos != floordivPos) {
+        expr.walk([&](AffineExpr e) {
+          if (e == floordivExprLHS) {
+            if (expr.getKind() == AffineExprKind::Mod) {
+              AffineBinaryOpExpr binaryExpr = expr.cast<AffineBinaryOpExpr>();
+              // Check if the LHS and RHS of the `mod` are the same as those of
+              // the `floordiv`.
+              if (floordivExprLHS == binaryExpr.getLHS() &&
+                  floordivExprRHS == binaryExpr.getRHS()) {
+                // Save the tile size (RHS of the `mod`) and the positions of
+                // the `floordiv` and `mod` if no matching `mod` has been found
+                // yet.
+                if (!found) {
+                  tileSizePos.emplace_back(
+                      std::make_tuple(binaryExpr.getRHS(), floordivPos, pos));
+                  found = true;
+                } else {
+                  // Not a tiled layout: multiple `mod`s with the same LHS.
+                  // e.g. affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256,
+                  //                                  d2 mod 256, d2 mod 256)>
+                  notTiled = true;
+                }
+              } else {
+                // Not a tiled layout: the RHS of the `mod` differs from that
+                // of the `floordiv`.
+                // e.g. affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256,
+                //                                  d2 mod 128)>
+                notTiled = true;
+              }
+            } else {
+              // Not a tiled layout: the LHS is the same, but the expression is
+              // not a `mod`.
+              // e.g. affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256,
+              //                                  d2 floordiv 256)>
+              notTiled = true;
+            }
+          }
+        });
+      }
+      if (notTiled) {
+        tileSizePos = SmallVector<std::tuple<AffineExpr, unsigned, unsigned>>{};
+        return success();
+      }
+      pos++;
+    }
+  }
+  return success();
+}
+
+/// Check if the `dim` dimension of a memrefType with `layoutMap` becomes
+/// dynamic after normalization. Dimensions whose map output involves a dynamic
+/// input dimension become dynamic dimensions. Returns true if `dim` is such a
+/// dynamic dimension.
+///
+/// Example:
+/// #map0 = affine_map<(d0, d1) -> (d0, d1 floordiv 32, d1 mod 32)>
+///
+/// If d1 is a dynamic dimension, the 2nd and 3rd dimensions of the map output
+/// are dynamic: memref<4x?xf32, #map0> ==> memref<4x?x?xf32>
+static bool
+isNormalizedMemRefDynamicDim(unsigned dim, AffineMap layoutMap,
+                             SmallVectorImpl<unsigned> &inMemrefTypeDynDims,
+                             MLIRContext *context) {
+  bool isDynamicDim = false;
+  AffineExpr expr = layoutMap.getResults()[dim];
+  // Check if the affine expr of this dimension includes a dynamic dimension of
+  // the input memrefType.
+  expr.walk([&inMemrefTypeDynDims, &isDynamicDim, &context](AffineExpr e) {
+    if (e.isa<AffineDimExpr>()) {
+      for (unsigned dm : inMemrefTypeDynDims) {
+        if (e == getAffineDimExpr(dm, context)) {
+          isDynamicDim = true;
+        }
+      }
+    }
+  });
+  return isDynamicDim;
+}
+
+/// Create an affine expr to calculate the dimension size for a tiled-layout
+/// map.
+static AffineExpr createDimSizeExprForTiledLayout(AffineExpr oldMapOutput,
+                                                  TileExprPattern pat) {
+  // Create the map output for the patterns:
+  //   "floordiv <tile size>" ==> "ceildiv <tile size>"
+  //   "mod <tile size>" ==> "<tile size>"
+  AffineExpr newMapOutput;
+  AffineBinaryOpExpr binaryExpr = nullptr;
+  switch (pat) {
+  case TileExprPattern::TileMod:
+    binaryExpr = oldMapOutput.cast<AffineBinaryOpExpr>();
+    newMapOutput = binaryExpr.getRHS();
+    break;
+  case TileExprPattern::TileFloorDiv:
+    binaryExpr = oldMapOutput.cast<AffineBinaryOpExpr>();
+    newMapOutput = getAffineBinaryOpExpr(
+        AffineExprKind::CeilDiv, binaryExpr.getLHS(), binaryExpr.getRHS());
+    break;
+  default:
+    newMapOutput = oldMapOutput;
+  }
+  return newMapOutput;
+}
+
+/// Create new maps to calculate each dimension size of `newMemRefType`, and
+/// create `newDynamicSizes` from them by using AffineApplyOp.
+///
+/// Steps for normalizing dynamic memrefs for a tiled layout map
+/// Example:
+///    #map0 = affine_map<(d0, d1) -> (d0, d1 floordiv 32, d1 mod 32)>
+///    %0 = dim %arg0, %c1 : memref<4x?xf32>
+///    %1 = alloc(%0) : memref<4x?xf32, #map0>
+///
+/// (Before this function)
+/// 1. Check if `map`(#map0) is a tiled layout using `getTileSizePos()`. Only
+///    a single layout map is supported.
+///
+/// 2. Create the normalized memrefType using `isNormalizedMemRefDynamicDim()`.
+///    It is memref<4x?x?xf32> in the above example.
+///
+/// (In this function)
+/// 3. Create new maps to calculate each dimension of the normalized
+///    memrefType using `createDimSizeExprForTiledLayout()`. In a tiled layout,
+///    the dimension size can be calculated by replacing
+///    "floordiv <tile size>" with "ceildiv <tile size>" and
+///    "mod <tile size>" with "<tile size>".
+///    - New maps in the above example:
+///      #map0 = affine_map<(d0, d1) -> (d0)>
+///      #map1 = affine_map<(d0, d1) -> (d1 ceildiv 32)>
+///      #map2 = affine_map<(d0, d1) -> (32)>
+///
+/// 4. Create AffineApplyOps to apply the new maps. The outputs of the
+///    AffineApplyOps are used in the dynamicSizes of the new AllocOp.
+///    %0 = dim %arg0, %c1 : memref<4x?xf32>
+///    %c4 = arith.constant 4 : index
+///    %1 = affine.apply #map1(%c4, %0)
+///    %2 = affine.apply #map2(%c4, %0)
+static void createNewDynamicSizes(MemRefType oldMemRefType,
+                                  MemRefType newMemRefType, AffineMap map,
+                                  memref::AllocOp *allocOp, OpBuilder b,
+                                  SmallVectorImpl<Value> &newDynamicSizes) {
+  // Create the new inputs for the AffineApplyOps.
+  SmallVector<Value, 4> inAffineApply;
+  ArrayRef<int64_t> oldMemRefShape = oldMemRefType.getShape();
+  unsigned dynIdx = 0;
+  for (unsigned d = 0; d < oldMemRefType.getRank(); ++d) {
+    if (oldMemRefShape[d] < 0) {
+      // Use the dynamicSizes of the allocOp for dynamic dimensions.
+      inAffineApply.emplace_back(allocOp->dynamicSizes()[dynIdx]);
+      dynIdx++;
+    } else {
+      // Create a ConstantOp for each static dimension.
+      Attribute constantAttr =
+          b.getIntegerAttr(b.getIndexType(), oldMemRefShape[d]);
+      inAffineApply.emplace_back(
+          b.create<arith::ConstantOp>(allocOp->getLoc(), constantAttr));
+    }
+  }
+
+  // Create a new map to calculate each dimension size of the new memref for
+  // each original map output, but only for the dynamic dimensions of
+  // `newMemRefType`.
+  unsigned newDimIdx = 0;
+  ArrayRef<int64_t> newMemRefShape = newMemRefType.getShape();
+  SmallVector<std::tuple<AffineExpr, unsigned, unsigned>> tileSizePos;
+  (void)getTileSizePos(map, tileSizePos);
+  for (AffineExpr expr : map.getResults()) {
+    if (newMemRefShape[newDimIdx] < 0) {
+      // Create new maps to calculate each dimension size of the new memref.
+      enum TileExprPattern pat = TileExprPattern::TileNone;
+      for (auto pos : tileSizePos) {
+        if (newDimIdx == std::get<1>(pos))
+          pat = TileExprPattern::TileFloorDiv;
+        else if (newDimIdx == std::get<2>(pos))
+          pat = TileExprPattern::TileMod;
+      }
+      AffineExpr newMapOutput = createDimSizeExprForTiledLayout(expr, pat);
+      AffineMap newMap =
+          AffineMap::get(map.getNumInputs(), map.getNumSymbols(), newMapOutput);
+      Value affineApp =
+          b.create<AffineApplyOp>(allocOp->getLoc(), newMap, inAffineApply);
+      newDynamicSizes.emplace_back(affineApp);
+    }
+    newDimIdx++;
+  }
+}
+
+// TODO: Currently works for static memrefs with a single layout map.
+LogicalResult mlir::normalizeMemRef(memref::AllocOp *allocOp) {
+  MemRefType memrefType = allocOp->getType();
+  OpBuilder b(*allocOp);
+
+  // Fetch a new memref type after normalizing the old memref to have an
+  // identity map layout.
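+  // E.g. (illustrative):
+  //   memref<4x8xf32, affine_map<(d0, d1) -> (d0, d1 floordiv 2, d1 mod 2)>>
+  // normalizes to memref<4x4x2xf32> with the identity layout.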
+  MemRefType newMemRefType =
+      normalizeMemRefType(memrefType, b, allocOp->symbolOperands().size());
+  if (newMemRefType == memrefType)
+    // Either memrefType already had an identity map or the map couldn't be
+    // transformed to an identity map.
+    return failure();
+
+  Value oldMemRef = allocOp->getResult();
+
+  SmallVector<Value, 4> symbolOperands(allocOp->symbolOperands());
+  AffineMap layoutMap = memrefType.getLayout().getAffineMap();
+  memref::AllocOp newAlloc;
+  // Check if `layoutMap` is a tiled layout. Only a single layout map is
+  // supported for normalizing dynamic memrefs.
+  SmallVector<std::tuple<AffineExpr, unsigned, unsigned>> tileSizePos;
+  (void)getTileSizePos(layoutMap, tileSizePos);
+  if (newMemRefType.getNumDynamicDims() > 0 && !tileSizePos.empty()) {
+    MemRefType oldMemRefType = oldMemRef.getType().cast<MemRefType>();
+    SmallVector<Value, 4> newDynamicSizes;
+    createNewDynamicSizes(oldMemRefType, newMemRefType, layoutMap, allocOp, b,
+                          newDynamicSizes);
+    // Add the new dynamic sizes in the new AllocOp.
+    newAlloc =
+        b.create<memref::AllocOp>(allocOp->getLoc(), newMemRefType,
+                                  newDynamicSizes, allocOp->alignmentAttr());
+  } else {
+    newAlloc = b.create<memref::AllocOp>(allocOp->getLoc(), newMemRefType,
+                                         allocOp->alignmentAttr());
+  }
+  // Replace all uses of the old memref.
+  if (failed(replaceAllMemRefUsesWith(oldMemRef, /*newMemRef=*/newAlloc,
+                                      /*extraIndices=*/{},
+                                      /*indexRemap=*/layoutMap,
+                                      /*extraOperands=*/{},
+                                      /*symbolOperands=*/symbolOperands,
+                                      /*domOpFilter=*/nullptr,
+                                      /*postDomOpFilter=*/nullptr,
+                                      /*allowNonDereferencingOps=*/true))) {
+    // If it failed (due to escapes for example), bail out.
+    newAlloc.erase();
+    return failure();
+  }
+  // Replace any uses of the original alloc op and erase it. All remaining uses
+  // have to be dealloc's; RAMUW above would've failed otherwise.
+  assert(llvm::all_of(oldMemRef.getUsers(), [](Operation *op) {
+    return isa<memref::DeallocOp>(op);
+  }));
+  oldMemRef.replaceAllUsesWith(newAlloc);
+  allocOp->erase();
+  return success();
+}
+
+MemRefType mlir::normalizeMemRefType(MemRefType memrefType, OpBuilder b,
+                                     unsigned numSymbolicOperands) {
+  unsigned rank = memrefType.getRank();
+  if (rank == 0)
+    return memrefType;
+
+  if (memrefType.getLayout().isIdentity()) {
+    // Either no map is associated with this memref or this memref has
+    // a trivial (identity) map.
+    return memrefType;
+  }
+  AffineMap layoutMap = memrefType.getLayout().getAffineMap();
+
+  // We don't do any checks for one-to-one'ness; we assume that it is
+  // one-to-one.
+
+  // Normalize only static memrefs and dynamic memrefs with a tiled-layout map
+  // for now.
+  // TODO: Normalize the other types of dynamic memrefs.
+  SmallVector<std::tuple<AffineExpr, unsigned, unsigned>> tileSizePos;
+  (void)getTileSizePos(layoutMap, tileSizePos);
+  if (memrefType.getNumDynamicDims() > 0 && tileSizePos.empty())
+    return memrefType;
+
+  // We have a single map that is not an identity map. Create a new memref
+  // with the right shape and an identity layout map.
+  ArrayRef<int64_t> shape = memrefType.getShape();
+  // FlatAffineConstraints may later on use symbolicOperands.
+  FlatAffineConstraints fac(rank, numSymbolicOperands);
+  SmallVector<unsigned, 4> memrefTypeDynDims;
+  for (unsigned d = 0; d < rank; ++d) {
+    // Use the constraint system only for static dimensions.
+    if (shape[d] > 0) {
+      fac.addBound(FlatAffineConstraints::LB, d, 0);
+      fac.addBound(FlatAffineConstraints::UB, d, shape[d] - 1);
+    } else {
+      memrefTypeDynDims.emplace_back(d);
+    }
+  }
+  // We compose this map with the original index (logical) space to derive
+  // the upper bounds for the new index space.
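+  // For instance (illustrative): composing
+  // (d0, d1) -> (d0, d1 floordiv 2, d1 mod 2) with 0 <= d0 <= 3 and
+  // 0 <= d1 <= 7 yields upper bounds 3, 3, and 1 for the three new
+  // dimensions, i.e. the normalized shape 4x4x2.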
+  unsigned newRank = layoutMap.getNumResults();
+  if (failed(fac.composeMatchingMap(layoutMap)))
+    return memrefType;
+  // TODO: Handle semi-affine maps.
+  // Project out the old data dimensions.
+  fac.projectOut(newRank, fac.getNumIds() - newRank - fac.getNumLocalIds());
+  SmallVector<int64_t, 4> newShape(newRank);
+  for (unsigned d = 0; d < newRank; ++d) {
+    // Check if each dimension of the normalized memrefType is dynamic.
+    bool isDynDim = isNormalizedMemRefDynamicDim(
+        d, layoutMap, memrefTypeDynDims, b.getContext());
+    if (isDynDim) {
+      newShape[d] = -1;
+    } else {
+      // The lower bound for the shape is always zero.
+      auto ubConst = fac.getConstantBound(FlatAffineConstraints::UB, d);
+      // For a static memref and an affine map with no symbols, this is
+      // always bounded.
+      assert(ubConst.hasValue() && "should always have an upper bound");
+      if (ubConst.getValue() < 0)
+        // This is due to an invalid map that maps to a negative space.
+        return memrefType;
+      // If a dimension of the new memrefType is dynamic, its value is -1.
+      newShape[d] = ubConst.getValue() + 1;
+    }
+  }
+
+  // Create the new memref type after trivializing the old layout map.
+  MemRefType newMemRefType =
+      MemRefType::Builder(memrefType)
+          .setShape(newShape)
+          .setLayout(AffineMapAttr::get(b.getMultiDimIdentityMap(newRank)));
+
+  return newMemRefType;
+}
diff --git a/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp b/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp
--- a/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/GPU/MemoryPromotion.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/Dialect/GPU/GPUDialect.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
@@ -19,7 +20,6 @@
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/IR/ImplicitLocOpBuilder.h"
 #include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/LoopUtils.h"
 
 using namespace mlir;
 using namespace mlir::gpu;
diff --git a/mlir/lib/Dialect/Linalg/Transforms/CodegenStrategy.cpp b/mlir/lib/Dialect/Linalg/Transforms/CodegenStrategy.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/CodegenStrategy.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/CodegenStrategy.cpp
@@ -13,6 +13,7 @@
 
 #include "mlir/Dialect/Linalg/Transforms/CodegenStrategy.h"
 
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/Linalg/Passes.h"
 #include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
 #include "mlir/Dialect/SCF/Transforms.h"
@@ -20,7 +21,6 @@
 #include "mlir/Dialect/Vector/VectorTransforms.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-#include "mlir/Transforms/LoopUtils.h"
 #include "mlir/Transforms/Passes.h"
 
 using namespace mlir;
diff --git a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
@@ -12,6 +12,7 @@
 
 #include "mlir/Dialect/Linalg/Transforms/HoistPadding.h"
 #include "mlir/Analysis/SliceAnalysis.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
@@ -24,7 +25,6 @@
 #include "mlir/IR/AsmState.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/Dominance.h"
-#include "mlir/Transforms/LoopUtils.h"
#include "llvm/ADT/StringRef.h" #include "llvm/Support/Debug.h" diff --git a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp @@ -15,6 +15,7 @@ #include "mlir/Analysis/SliceAnalysis.h" #include "mlir/Dialect/Affine/Analysis/AffineStructures.h" #include "mlir/Dialect/Affine/IR/AffineValueMap.h" +#include "mlir/Dialect/Affine/LoopUtils.h" #include "mlir/Dialect/Affine/Utils.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/Linalg/Transforms/Transforms.h" @@ -27,7 +28,6 @@ #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/Dominance.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" -#include "mlir/Transforms/LoopUtils.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Debug.h" diff --git a/mlir/lib/Dialect/Linalg/Transforms/LinalgStrategyPasses.cpp b/mlir/lib/Dialect/Linalg/Transforms/LinalgStrategyPasses.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/LinalgStrategyPasses.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/LinalgStrategyPasses.cpp @@ -16,6 +16,8 @@ #include "PassDetail.h" #include "mlir/Analysis/SliceAnalysis.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/LoopUtils.h" +#include "mlir/Dialect/Affine/Utils.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/Linalg/Passes.h" #include "mlir/Dialect/Linalg/Transforms/Hoisting.h" @@ -29,9 +31,7 @@ #include "mlir/Pass/PassManager.h" #include "mlir/Support/LLVM.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" -#include "mlir/Transforms/LoopUtils.h" #include "mlir/Transforms/Passes.h" -#include "mlir/Transforms/Utils.h" using namespace mlir; using namespace mlir::vector; diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp --- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp @@ -16,6 +16,7 @@ #include "mlir/Dialect/Affine/Analysis/AffineStructures.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Affine/IR/AffineValueMap.h" +#include "mlir/Dialect/Affine/LoopUtils.h" #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" @@ -30,7 +31,6 @@ #include "mlir/IR/Matchers.h" #include "mlir/IR/OpImplementation.h" #include "mlir/Pass/Pass.h" -#include "mlir/Transforms/LoopUtils.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/Debug.h" diff --git a/mlir/lib/Dialect/MemRef/Transforms/NormalizeMemRefs.cpp b/mlir/lib/Dialect/MemRef/Transforms/NormalizeMemRefs.cpp --- a/mlir/lib/Dialect/MemRef/Transforms/NormalizeMemRefs.cpp +++ b/mlir/lib/Dialect/MemRef/Transforms/NormalizeMemRefs.cpp @@ -13,9 +13,9 @@ #include "PassDetail.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/Utils.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/MemRef/Transforms/Passes.h" -#include "mlir/Transforms/Utils.h" #include "llvm/ADT/SmallSet.h" #include "llvm/Support/Debug.h" diff --git a/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt b/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt --- a/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt @@ -6,6 +6,7 @@ LoopPipelining.cpp LoopRangeFolding.cpp LoopSpecialization.cpp + ParallelLoopCollapsing.cpp ParallelLoopFusion.cpp ParallelLoopTiling.cpp StructuralTypeConversions.cpp diff --git 
rename from mlir/lib/Transforms/ParallelLoopCollapsing.cpp
rename to mlir/lib/Dialect/SCF/Transforms/ParallelLoopCollapsing.cpp
--- a/mlir/lib/Transforms/ParallelLoopCollapsing.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/ParallelLoopCollapsing.cpp
@@ -7,9 +7,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "PassDetail.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
+#include "mlir/Dialect/SCF/Passes.h"
 #include "mlir/Dialect/SCF/SCF.h"
-#include "mlir/Transforms/LoopUtils.h"
-#include "mlir/Transforms/Passes.h"
 #include "mlir/Transforms/RegionUtils.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -20,7 +20,7 @@
 
 namespace {
 struct ParallelLoopCollapsing
-    : public ParallelLoopCollapsingBase<ParallelLoopCollapsing> {
+    : public SCFParallelLoopCollapsingBase<ParallelLoopCollapsing> {
   void runOnOperation() override {
     Operation *module = getOperation();
 
diff --git a/mlir/lib/Interfaces/LoopLikeInterface.cpp b/mlir/lib/Interfaces/LoopLikeInterface.cpp
--- a/mlir/lib/Interfaces/LoopLikeInterface.cpp
+++ b/mlir/lib/Interfaces/LoopLikeInterface.cpp
@@ -7,12 +7,95 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Interfaces/LoopLikeInterface.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Support/Debug.h"
 
 using namespace mlir;
 
+#define DEBUG_TYPE "loop-like"
+
 //===----------------------------------------------------------------------===//
 // LoopLike Interfaces
 //===----------------------------------------------------------------------===//
 
 /// Include the definitions of the loop-like interfaces.
 #include "mlir/Interfaces/LoopLikeInterface.cpp.inc"
+
+//===----------------------------------------------------------------------===//
+// LoopLike Utilities
+//===----------------------------------------------------------------------===//
+
+// Checks whether the given op can be hoisted by checking that
+// - the op and any of its contained operations do not depend on SSA values
+//   defined inside of the loop (by means of calling definedOutside).
+// - the op has no side effects.
+static bool canBeHoisted(Operation *op,
+                         function_ref<bool(Value)> definedOutside) {
+  // Check that dependencies are defined outside of loop.
+  if (!llvm::all_of(op->getOperands(), definedOutside))
+    return false;
+  // Check whether this op is side-effect free. If we already know that there
+  // can be no side-effects because the surrounding op has claimed so, we can
+  // (and have to) skip this step.
+  if (auto memInterface = dyn_cast<MemoryEffectOpInterface>(op)) {
+    if (!memInterface.hasNoEffect())
+      return false;
+    // If the operation doesn't have side effects and it doesn't recursively
+    // have side effects, it can always be hoisted.
+    if (!op->hasTrait<OpTrait::HasRecursiveSideEffects>())
+      return true;
+
+    // Otherwise, if the operation doesn't provide the memory effect interface
+    // and it doesn't have recursive side effects we treat it conservatively as
+    // side-effecting.
+  } else if (!op->hasTrait<OpTrait::HasRecursiveSideEffects>()) {
+    return false;
+  }
+
+  // Recurse into the regions for this op and check whether the contained ops
+  // can be hoisted.
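+  // E.g. (illustrative): an scf.if whose blocks contain only side-effect-free
+  // arith ops on loop-invariant values is hoistable as a whole; a single
+  // memref.store anywhere inside one of its regions makes it non-hoistable.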
+ for (auto ®ion : op->getRegions()) { + for (auto &block : region) { + for (auto &innerOp : block) + if (!canBeHoisted(&innerOp, definedOutside)) + return false; + } + } + return true; +} + +LogicalResult mlir::moveLoopInvariantCode(LoopLikeOpInterface looplike) { + auto &loopBody = looplike.getLoopBody(); + + // We use two collections here as we need to preserve the order for insertion + // and this is easiest. + SmallPtrSet willBeMovedSet; + SmallVector opsToMove; + + // Helper to check whether an operation is loop invariant wrt. SSA properties. + auto isDefinedOutsideOfBody = [&](Value value) { + auto *definingOp = value.getDefiningOp(); + return (definingOp && !!willBeMovedSet.count(definingOp)) || + looplike.isDefinedOutsideOfLoop(value); + }; + + // Do not use walk here, as we do not want to go into nested regions and hoist + // operations from there. These regions might have semantics unknown to this + // rewriting. If the nested regions are loops, they will have been processed. + for (auto &block : loopBody) { + for (auto &op : block.without_terminator()) { + if (canBeHoisted(&op, isDefinedOutsideOfBody)) { + opsToMove.push_back(&op); + willBeMovedSet.insert(&op); + } + } + } + + // For all instructions that we found to be invariant, move outside of the + // loop. + LogicalResult result = looplike.moveOutOfLoop(opsToMove); + LLVM_DEBUG(looplike.print(llvm::dbgs() << "\n\nModified loop:\n")); + return result; +} diff --git a/mlir/lib/Transforms/CMakeLists.txt b/mlir/lib/Transforms/CMakeLists.txt --- a/mlir/lib/Transforms/CMakeLists.txt +++ b/mlir/lib/Transforms/CMakeLists.txt @@ -5,12 +5,8 @@ CSE.cpp Inliner.cpp LocationSnapshot.cpp - LoopCoalescing.cpp - LoopFusion.cpp LoopInvariantCodeMotion.cpp OpStats.cpp - ParallelLoopCollapsing.cpp - PipelineDataTransfer.cpp SCCP.cpp StripDebugInfo.cpp SymbolDCE.cpp @@ -20,18 +16,13 @@ ${MLIR_MAIN_INCLUDE_DIR}/mlir/Transforms DEPENDS - MLIRStandardOpsIncGen MLIRTransformsPassIncGen LINK_LIBS PUBLIC - MLIRAffine MLIRAnalysis MLIRCopyOpInterface MLIRLoopLikeInterface - MLIRMemRef - MLIRSCF MLIRPass MLIRSupport MLIRTransformUtils - MLIRVector ) diff --git a/mlir/lib/Transforms/CSE.cpp b/mlir/lib/Transforms/CSE.cpp --- a/mlir/lib/Transforms/CSE.cpp +++ b/mlir/lib/Transforms/CSE.cpp @@ -15,7 +15,6 @@ #include "mlir/IR/Dominance.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/Passes.h" -#include "mlir/Transforms/Utils.h" #include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/ScopedHashTable.h" diff --git a/mlir/lib/Transforms/LoopInvariantCodeMotion.cpp b/mlir/lib/Transforms/LoopInvariantCodeMotion.cpp --- a/mlir/lib/Transforms/LoopInvariantCodeMotion.cpp +++ b/mlir/lib/Transforms/LoopInvariantCodeMotion.cpp @@ -11,13 +11,10 @@ //===----------------------------------------------------------------------===// #include "PassDetail.h" -#include "mlir/Transforms/Passes.h" - #include "mlir/IR/Builders.h" -#include "mlir/IR/BuiltinOps.h" #include "mlir/Interfaces/LoopLikeInterface.h" #include "mlir/Interfaces/SideEffectInterfaces.h" -#include "mlir/Transforms/LoopUtils.h" +#include "mlir/Transforms/Passes.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -34,80 +31,6 @@ }; } // namespace -// Checks whether the given op can be hoisted by checking that -// - the op and any of its contained operations do not depend on SSA values -// defined inside of the loop (by means of calling definedOutside). -// - the op has no side-effects. 
If sideEffecting is Never, sideeffects of this -// op and its nested ops are ignored. -static bool canBeHoisted(Operation *op, - function_ref definedOutside) { - // Check that dependencies are defined outside of loop. - if (!llvm::all_of(op->getOperands(), definedOutside)) - return false; - // Check whether this op is side-effect free. If we already know that there - // can be no side-effects because the surrounding op has claimed so, we can - // (and have to) skip this step. - if (auto memInterface = dyn_cast(op)) { - if (!memInterface.hasNoEffect()) - return false; - // If the operation doesn't have side effects and it doesn't recursively - // have side effects, it can always be hoisted. - if (!op->hasTrait()) - return true; - - // Otherwise, if the operation doesn't provide the memory effect interface - // and it doesn't have recursive side effects we treat it conservatively as - // side-effecting. - } else if (!op->hasTrait()) { - return false; - } - - // Recurse into the regions for this op and check whether the contained ops - // can be hoisted. - for (auto ®ion : op->getRegions()) { - for (auto &block : region) { - for (auto &innerOp : block) - if (!canBeHoisted(&innerOp, definedOutside)) - return false; - } - } - return true; -} - -LogicalResult mlir::moveLoopInvariantCode(LoopLikeOpInterface looplike) { - auto &loopBody = looplike.getLoopBody(); - - // We use two collections here as we need to preserve the order for insertion - // and this is easiest. - SmallPtrSet willBeMovedSet; - SmallVector opsToMove; - - // Helper to check whether an operation is loop invariant wrt. SSA properties. - auto isDefinedOutsideOfBody = [&](Value value) { - auto *definingOp = value.getDefiningOp(); - return (definingOp && !!willBeMovedSet.count(definingOp)) || - looplike.isDefinedOutsideOfLoop(value); - }; - - // Do not use walk here, as we do not want to go into nested regions and hoist - // operations from there. These regions might have semantics unknown to this - // rewriting. If the nested regions are loops, they will have been processed. - for (auto &block : loopBody) { - for (auto &op : block.without_terminator()) { - if (canBeHoisted(&op, isDefinedOutsideOfBody)) { - opsToMove.push_back(&op); - willBeMovedSet.insert(&op); - } - } - } - - // For all instructions that we found to be invariant, move outside of the - // loop. - auto result = looplike.moveOutOfLoop(opsToMove); - LLVM_DEBUG(looplike.print(llvm::dbgs() << "\n\nModified loop:\n")); - return result; -} - void LoopInvariantCodeMotion::runOnOperation() { // Walk through all loops in a function in innermost-loop-first order. 
This // way, we first LICM from the inner loop, and place the ops in diff --git a/mlir/lib/Transforms/PassDetail.h b/mlir/lib/Transforms/PassDetail.h --- a/mlir/lib/Transforms/PassDetail.h +++ b/mlir/lib/Transforms/PassDetail.h @@ -13,23 +13,8 @@ #include "mlir/Transforms/Passes.h" namespace mlir { -class AffineDialect; - -// Forward declaration from Dialect.h -template -void registerDialect(DialectRegistry ®istry); - -namespace arith { -class ArithmeticDialect; -} // namespace arith - -namespace memref { -class MemRefDialect; -} // namespace memref - #define GEN_PASS_CLASSES #include "mlir/Transforms/Passes.h.inc" - } // namespace mlir #endif // TRANSFORMS_PASSDETAIL_H_ diff --git a/mlir/lib/Transforms/Utils/CMakeLists.txt b/mlir/lib/Transforms/Utils/CMakeLists.txt --- a/mlir/lib/Transforms/Utils/CMakeLists.txt +++ b/mlir/lib/Transforms/Utils/CMakeLists.txt @@ -3,25 +3,12 @@ FoldUtils.cpp GreedyPatternRewriteDriver.cpp InliningUtils.cpp - LoopFusionUtils.cpp - LoopUtils.cpp RegionUtils.cpp - Utils.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Transforms - DEPENDS - MLIRStandardOpsIncGen - LINK_LIBS PUBLIC - MLIRAffine - MLIRArithmetic MLIRAnalysis - MLIRAffineAnalysis - MLIRMemRef - MLIRSCF - MLIRPass MLIRRewrite - MLIRStandard ) diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -13,7 +13,6 @@ #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/FunctionInterfaces.h" #include "mlir/Rewrite/PatternApplicator.h" -#include "mlir/Transforms/Utils.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" diff --git a/mlir/lib/Transforms/Utils/Utils.cpp b/mlir/lib/Transforms/Utils/Utils.cpp deleted file mode 100644 --- a/mlir/lib/Transforms/Utils/Utils.cpp +++ /dev/null @@ -1,767 +0,0 @@ -//===- Utils.cpp ---- Misc utilities for code and data transformation -----===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements miscellaneous transformation routines for non-loop IR -// structures. -// -//===----------------------------------------------------------------------===// - -#include "mlir/Transforms/Utils.h" -#include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h" -#include "mlir/Dialect/Affine/Analysis/AffineStructures.h" -#include "mlir/Dialect/Affine/Analysis/Utils.h" -#include "mlir/Dialect/Affine/IR/AffineOps.h" -#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/BuiltinOps.h" -#include "mlir/IR/Dominance.h" -#include "mlir/Support/MathExtras.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/TypeSwitch.h" - -#define DEBUG_TYPE "transforms-utils" - -using namespace mlir; - -// Perform the replacement in `op`. 
-LogicalResult mlir::replaceAllMemRefUsesWith(Value oldMemRef, Value newMemRef, - Operation *op, - ArrayRef extraIndices, - AffineMap indexRemap, - ArrayRef extraOperands, - ArrayRef symbolOperands, - bool allowNonDereferencingOps) { - unsigned newMemRefRank = newMemRef.getType().cast().getRank(); - (void)newMemRefRank; // unused in opt mode - unsigned oldMemRefRank = oldMemRef.getType().cast().getRank(); - (void)oldMemRefRank; // unused in opt mode - if (indexRemap) { - assert(indexRemap.getNumSymbols() == symbolOperands.size() && - "symbolic operand count mismatch"); - assert(indexRemap.getNumInputs() == - extraOperands.size() + oldMemRefRank + symbolOperands.size()); - assert(indexRemap.getNumResults() + extraIndices.size() == newMemRefRank); - } else { - assert(oldMemRefRank + extraIndices.size() == newMemRefRank); - } - - // Assert same elemental type. - assert(oldMemRef.getType().cast().getElementType() == - newMemRef.getType().cast().getElementType()); - - SmallVector usePositions; - for (const auto &opEntry : llvm::enumerate(op->getOperands())) { - if (opEntry.value() == oldMemRef) - usePositions.push_back(opEntry.index()); - } - - // If memref doesn't appear, nothing to do. - if (usePositions.empty()) - return success(); - - if (usePositions.size() > 1) { - // TODO: extend it for this case when needed (rare). - assert(false && "multiple dereferencing uses in a single op not supported"); - return failure(); - } - - unsigned memRefOperandPos = usePositions.front(); - - OpBuilder builder(op); - // The following checks if op is dereferencing memref and performs the access - // index rewrites. - auto affMapAccInterface = dyn_cast(op); - if (!affMapAccInterface) { - if (!allowNonDereferencingOps) { - // Failure: memref used in a non-dereferencing context (potentially - // escapes); no replacement in these cases unless allowNonDereferencingOps - // is set. - return failure(); - } - op->setOperand(memRefOperandPos, newMemRef); - return success(); - } - // Perform index rewrites for the dereferencing op and then replace the op - NamedAttribute oldMapAttrPair = - affMapAccInterface.getAffineMapAttrForMemRef(oldMemRef); - AffineMap oldMap = oldMapAttrPair.getValue().cast().getValue(); - unsigned oldMapNumInputs = oldMap.getNumInputs(); - SmallVector oldMapOperands( - op->operand_begin() + memRefOperandPos + 1, - op->operand_begin() + memRefOperandPos + 1 + oldMapNumInputs); - - // Apply 'oldMemRefOperands = oldMap(oldMapOperands)'. - SmallVector oldMemRefOperands; - SmallVector affineApplyOps; - oldMemRefOperands.reserve(oldMemRefRank); - if (oldMap != builder.getMultiDimIdentityMap(oldMap.getNumDims())) { - for (auto resultExpr : oldMap.getResults()) { - auto singleResMap = AffineMap::get(oldMap.getNumDims(), - oldMap.getNumSymbols(), resultExpr); - auto afOp = builder.create(op->getLoc(), singleResMap, - oldMapOperands); - oldMemRefOperands.push_back(afOp); - affineApplyOps.push_back(afOp); - } - } else { - oldMemRefOperands.assign(oldMapOperands.begin(), oldMapOperands.end()); - } - - // Construct new indices as a remap of the old ones if a remapping has been - // provided. The indices of a memref come right after it, i.e., - // at position memRefOperandPos + 1. 
- SmallVector remapOperands; - remapOperands.reserve(extraOperands.size() + oldMemRefRank + - symbolOperands.size()); - remapOperands.append(extraOperands.begin(), extraOperands.end()); - remapOperands.append(oldMemRefOperands.begin(), oldMemRefOperands.end()); - remapOperands.append(symbolOperands.begin(), symbolOperands.end()); - - SmallVector remapOutputs; - remapOutputs.reserve(oldMemRefRank); - - if (indexRemap && - indexRemap != builder.getMultiDimIdentityMap(indexRemap.getNumDims())) { - // Remapped indices. - for (auto resultExpr : indexRemap.getResults()) { - auto singleResMap = AffineMap::get( - indexRemap.getNumDims(), indexRemap.getNumSymbols(), resultExpr); - auto afOp = builder.create(op->getLoc(), singleResMap, - remapOperands); - remapOutputs.push_back(afOp); - affineApplyOps.push_back(afOp); - } - } else { - // No remapping specified. - remapOutputs.assign(remapOperands.begin(), remapOperands.end()); - } - - SmallVector newMapOperands; - newMapOperands.reserve(newMemRefRank); - - // Prepend 'extraIndices' in 'newMapOperands'. - for (Value extraIndex : extraIndices) { - assert(extraIndex.getDefiningOp()->getNumResults() == 1 && - "single result op's expected to generate these indices"); - assert((isValidDim(extraIndex) || isValidSymbol(extraIndex)) && - "invalid memory op index"); - newMapOperands.push_back(extraIndex); - } - - // Append 'remapOutputs' to 'newMapOperands'. - newMapOperands.append(remapOutputs.begin(), remapOutputs.end()); - - // Create new fully composed AffineMap for new op to be created. - assert(newMapOperands.size() == newMemRefRank); - auto newMap = builder.getMultiDimIdentityMap(newMemRefRank); - // TODO: Avoid creating/deleting temporary AffineApplyOps here. - fullyComposeAffineMapAndOperands(&newMap, &newMapOperands); - newMap = simplifyAffineMap(newMap); - canonicalizeMapAndOperands(&newMap, &newMapOperands); - // Remove any affine.apply's that became dead as a result of composition. - for (Value value : affineApplyOps) - if (value.use_empty()) - value.getDefiningOp()->erase(); - - OperationState state(op->getLoc(), op->getName()); - // Construct the new operation using this memref. - state.operands.reserve(op->getNumOperands() + extraIndices.size()); - // Insert the non-memref operands. - state.operands.append(op->operand_begin(), - op->operand_begin() + memRefOperandPos); - // Insert the new memref value. - state.operands.push_back(newMemRef); - - // Insert the new memref map operands. - state.operands.append(newMapOperands.begin(), newMapOperands.end()); - - // Insert the remaining operands unmodified. - state.operands.append(op->operand_begin() + memRefOperandPos + 1 + - oldMapNumInputs, - op->operand_end()); - - // Result types don't change. Both memref's are of the same elemental type. - state.types.reserve(op->getNumResults()); - for (auto result : op->getResults()) - state.types.push_back(result.getType()); - - // Add attribute for 'newMap', other Attributes do not change. - auto newMapAttr = AffineMapAttr::get(newMap); - for (auto namedAttr : op->getAttrs()) { - if (namedAttr.getName() == oldMapAttrPair.getName()) - state.attributes.push_back({namedAttr.getName(), newMapAttr}); - else - state.attributes.push_back(namedAttr); - } - - // Create the new operation. 
- auto *repOp = builder.createOperation(state); - op->replaceAllUsesWith(repOp); - op->erase(); - - return success(); -} - -LogicalResult mlir::replaceAllMemRefUsesWith( - Value oldMemRef, Value newMemRef, ArrayRef extraIndices, - AffineMap indexRemap, ArrayRef extraOperands, - ArrayRef symbolOperands, Operation *domOpFilter, - Operation *postDomOpFilter, bool allowNonDereferencingOps, - bool replaceInDeallocOp) { - unsigned newMemRefRank = newMemRef.getType().cast().getRank(); - (void)newMemRefRank; // unused in opt mode - unsigned oldMemRefRank = oldMemRef.getType().cast().getRank(); - (void)oldMemRefRank; - if (indexRemap) { - assert(indexRemap.getNumSymbols() == symbolOperands.size() && - "symbol operand count mismatch"); - assert(indexRemap.getNumInputs() == - extraOperands.size() + oldMemRefRank + symbolOperands.size()); - assert(indexRemap.getNumResults() + extraIndices.size() == newMemRefRank); - } else { - assert(oldMemRefRank + extraIndices.size() == newMemRefRank); - } - - // Assert same elemental type. - assert(oldMemRef.getType().cast().getElementType() == - newMemRef.getType().cast().getElementType()); - - std::unique_ptr domInfo; - std::unique_ptr postDomInfo; - if (domOpFilter) - domInfo = - std::make_unique(domOpFilter->getParentOfType()); - - if (postDomOpFilter) - postDomInfo = std::make_unique( - postDomOpFilter->getParentOfType()); - - // Walk all uses of old memref; collect ops to perform replacement. We use a - // DenseSet since an operation could potentially have multiple uses of a - // memref (although rare), and the replacement later is going to erase ops. - DenseSet opsToReplace; - for (auto *op : oldMemRef.getUsers()) { - // Skip this use if it's not dominated by domOpFilter. - if (domOpFilter && !domInfo->dominates(domOpFilter, op)) - continue; - - // Skip this use if it's not post-dominated by postDomOpFilter. - if (postDomOpFilter && !postDomInfo->postDominates(postDomOpFilter, op)) - continue; - - // Skip dealloc's - no replacement is necessary, and a memref replacement - // at other uses doesn't hurt these dealloc's. - if (isa(op) && !replaceInDeallocOp) - continue; - - // Check if the memref was used in a non-dereferencing context. It is fine - // for the memref to be used in a non-dereferencing way outside of the - // region where this replacement is happening. - if (!isa(*op)) { - if (!allowNonDereferencingOps) { - LLVM_DEBUG(llvm::dbgs() - << "Memref replacement failed: non-deferencing memref op: \n" - << *op << '\n'); - return failure(); - } - // Non-dereferencing ops with the MemRefsNormalizable trait are - // supported for replacement. - if (!op->hasTrait()) { - LLVM_DEBUG(llvm::dbgs() << "Memref replacement failed: use without a " - "memrefs normalizable trait: \n" - << *op << '\n'); - return failure(); - } - } - - // We'll first collect and then replace --- since replacement erases the op - // that has the use, and that op could be postDomFilter or domFilter itself! - opsToReplace.insert(op); - } - - for (auto *op : opsToReplace) { - if (failed(replaceAllMemRefUsesWith( - oldMemRef, newMemRef, op, extraIndices, indexRemap, extraOperands, - symbolOperands, allowNonDereferencingOps))) - llvm_unreachable("memref replacement guaranteed to succeed here"); - } - - return success(); -} - -/// Given an operation, inserts one or more single result affine -/// apply operations, results of which are exclusively used by this operation -/// operation. 
The operands of these newly created affine apply ops are -/// guaranteed to be loop iterators or terminal symbols of a function. -/// -/// Before -/// -/// affine.for %i = 0 to #map(%N) -/// %idx = affine.apply (d0) -> (d0 mod 2) (%i) -/// "send"(%idx, %A, ...) -/// "compute"(%idx) -/// -/// After -/// -/// affine.for %i = 0 to #map(%N) -/// %idx = affine.apply (d0) -> (d0 mod 2) (%i) -/// "send"(%idx, %A, ...) -/// %idx_ = affine.apply (d0) -> (d0 mod 2) (%i) -/// "compute"(%idx_) -/// -/// This allows applying different transformations on send and compute (for eg. -/// different shifts/delays). -/// -/// Returns nullptr either if none of opInst's operands were the result of an -/// affine.apply and thus there was no affine computation slice to create, or if -/// all the affine.apply op's supplying operands to this opInst did not have any -/// uses besides this opInst; otherwise returns the list of affine.apply -/// operations created in output argument `sliceOps`. -void mlir::createAffineComputationSlice( - Operation *opInst, SmallVectorImpl *sliceOps) { - // Collect all operands that are results of affine apply ops. - SmallVector subOperands; - subOperands.reserve(opInst->getNumOperands()); - for (auto operand : opInst->getOperands()) - if (isa_and_nonnull(operand.getDefiningOp())) - subOperands.push_back(operand); - - // Gather sequence of AffineApplyOps reachable from 'subOperands'. - SmallVector affineApplyOps; - getReachableAffineApplyOps(subOperands, affineApplyOps); - // Skip transforming if there are no affine maps to compose. - if (affineApplyOps.empty()) - return; - - // Check if all uses of the affine apply op's lie only in this op op, in - // which case there would be nothing to do. - bool localized = true; - for (auto *op : affineApplyOps) { - for (auto result : op->getResults()) { - for (auto *user : result.getUsers()) { - if (user != opInst) { - localized = false; - break; - } - } - } - } - if (localized) - return; - - OpBuilder builder(opInst); - SmallVector composedOpOperands(subOperands); - auto composedMap = builder.getMultiDimIdentityMap(composedOpOperands.size()); - fullyComposeAffineMapAndOperands(&composedMap, &composedOpOperands); - - // Create an affine.apply for each of the map results. - sliceOps->reserve(composedMap.getNumResults()); - for (auto resultExpr : composedMap.getResults()) { - auto singleResMap = AffineMap::get(composedMap.getNumDims(), - composedMap.getNumSymbols(), resultExpr); - sliceOps->push_back(builder.create( - opInst->getLoc(), singleResMap, composedOpOperands)); - } - - // Construct the new operands that include the results from the composed - // affine apply op above instead of existing ones (subOperands). So, they - // differ from opInst's operands only for those operands in 'subOperands', for - // which they will be replaced by the corresponding one from 'sliceOps'. - SmallVector newOperands(opInst->getOperands()); - for (unsigned i = 0, e = newOperands.size(); i < e; i++) { - // Replace the subOperands from among the new operands. - unsigned j, f; - for (j = 0, f = subOperands.size(); j < f; j++) { - if (newOperands[i] == subOperands[j]) - break; - } - if (j < subOperands.size()) { - newOperands[i] = (*sliceOps)[j]; - } - } - for (unsigned idx = 0, e = newOperands.size(); idx < e; idx++) { - opInst->setOperand(idx, newOperands[idx]); - } -} - -/// Enum to set patterns of affine expr in tiled-layout map. 
-/// Enum to set patterns of affine expr in tiled-layout map.
-/// TileFloorDiv: <dim expr> div <tile size>
-/// TileMod: <dim expr> mod <tile size>
-/// TileNone: None of the above
-/// Example:
-/// #tiled_2d_128x256 = affine_map<(d0, d1)
-///                 -> (d0 div 128, d1 div 256, d0 mod 128, d1 mod 256)>
-/// "d0 div 128" and "d1 div 256" ==> TileFloorDiv
-/// "d0 mod 128" and "d1 mod 256" ==> TileMod
-enum TileExprPattern { TileFloorDiv, TileMod, TileNone };
-
-/// Check if `map` is a tiled layout. In a tiled layout, specific k dimensions
-/// being floordiv'ed by respective tile sizes appear in a mod with the same
-/// tile sizes, and no other expression involves those k dimensions. This
-/// function stores a vector of tuples (`tileSizePos`) holding the AffineExpr
-/// for the tile size and the positions of the corresponding `floordiv` and
-/// `mod`. If `map` is not a tiled layout, an empty vector is returned.
-static LogicalResult getTileSizePos(
-    AffineMap map,
-    SmallVectorImpl<std::tuple<AffineExpr, unsigned, unsigned>> &tileSizePos) {
-  // Create `floordivExprs`, a vector of tuples holding the LHS and RHS of each
-  // `floordiv` and its position in the `map` output.
-  // Example: #tiled_2d_128x256 = affine_map<(d0, d1)
-  //   -> (d0 div 128, d1 div 256, d0 mod 128, d1 mod 256)>
-  // In this example, `floordivExprs` includes {d0, 128, 0} and {d1, 256, 1}.
-  SmallVector<std::tuple<AffineExpr, AffineExpr, unsigned>, 4> floordivExprs;
-  unsigned pos = 0;
-  for (AffineExpr expr : map.getResults()) {
-    if (expr.getKind() == AffineExprKind::FloorDiv) {
-      AffineBinaryOpExpr binaryExpr = expr.cast<AffineBinaryOpExpr>();
-      if (binaryExpr.getRHS().isa<AffineConstantExpr>())
-        floordivExprs.emplace_back(
-            std::make_tuple(binaryExpr.getLHS(), binaryExpr.getRHS(), pos));
-    }
-    pos++;
-  }
-  // Not a tiled layout if `floordivExprs` is empty.
-  if (floordivExprs.empty()) {
-    tileSizePos = SmallVector<std::tuple<AffineExpr, unsigned, unsigned>>{};
-    return success();
-  }
-
-  // Check if the LHS of each `floordiv` is used in the LHS of a `mod`. If not,
-  // `map` is not a tiled layout.
-  for (std::tuple<AffineExpr, AffineExpr, unsigned> fexpr : floordivExprs) {
-    AffineExpr floordivExprLHS = std::get<0>(fexpr);
-    AffineExpr floordivExprRHS = std::get<1>(fexpr);
-    unsigned floordivPos = std::get<2>(fexpr);
-
-    // Walk each affine expr of the `map` output except `fexpr`, and check if
-    // the LHS and RHS of `fexpr` are used in the LHS and RHS of a `mod`. If
-    // the LHS of `fexpr` is used in any other expr, the map is not a tiled
-    // layout. Examples of non-tiled layouts:
-    //   affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2 floordiv 256)>
-    //   affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2 mod 128)>
-    //   affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2 mod 256,
-    //                               d2 mod 256)>
-    bool found = false;
-    pos = 0;
-    for (AffineExpr expr : map.getResults()) {
-      bool notTiled = false;
-      if (pos != floordivPos) {
-        expr.walk([&](AffineExpr e) {
-          if (e == floordivExprLHS) {
-            if (expr.getKind() == AffineExprKind::Mod) {
-              AffineBinaryOpExpr binaryExpr = expr.cast<AffineBinaryOpExpr>();
-              // Check if the LHS and RHS of the `mod` are the same as those of
-              // the floordiv.
-              if (floordivExprLHS == binaryExpr.getLHS() &&
-                  floordivExprRHS == binaryExpr.getRHS()) {
-                // Save the tile size (RHS of `mod`), and the positions of the
-                // `floordiv` and `mod`, if a matching `mod` has not been found
-                // yet.
-                if (!found) {
-                  tileSizePos.emplace_back(
-                      std::make_tuple(binaryExpr.getRHS(), floordivPos, pos));
-                  found = true;
-                } else {
-                  // Non-tiled layout: multiple `mod`s with the same LHS.
-                  // e.g. affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256,
-                  //                                  d2 mod 256, d2 mod 256)>
-                  notTiled = true;
-                }
-              } else {
-                // Non-tiled layout: RHS of the `mod` differs from that of the
-                // `floordiv`.
-                // e.g. affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256,
-                //                                  d2 mod 128)>
-                notTiled = true;
-              }
-            } else {
-              // Non-tiled layout: the LHS is reused, but not in a `mod`.
-              // e.g. affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256,
-              //                                  d2 floordiv 256)>
-              notTiled = true;
-            }
-          }
-        });
-      }
-      if (notTiled) {
-        tileSizePos = SmallVector<std::tuple<AffineExpr, unsigned, unsigned>>{};
-        return success();
-      }
-      pos++;
-    }
-  }
-  return success();
-}
-
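To make the tuple encoding concrete, here is what the helper computes for the doc comment's running example (illustrative only; `layoutMap` is assumed to hold the map in the comment below):

```c++
// For layoutMap = affine_map<(d0, d1) -> (d0, d1 floordiv 32, d1 mod 32)>,
// getTileSizePos records exactly one tuple {32, 1, 2}: the tile-size expr 32,
// the floordiv at result position 1, and the matching mod at result
// position 2. The untiled result d0 at position 0 is left alone.
SmallVector<std::tuple<AffineExpr, unsigned, unsigned>> tileSizePos;
(void)getTileSizePos(layoutMap, tileSizePos);
assert(tileSizePos.size() == 1 && "expected a single (floordiv, mod) pair");
```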
-/// Check if the `dim` dimension of a memrefType with `layoutMap` becomes
-/// dynamic after normalization. Dimensions that include dynamic dimensions in
-/// the map output will become dynamic dimensions. Return true if `dim` is a
-/// dynamic dimension.
-///
-/// Example:
-/// #map0 = affine_map<(d0, d1) -> (d0, d1 floordiv 32, d1 mod 32)>
-///
-/// If d1 is a dynamic dimension, the 2nd and 3rd dimensions of the map output
-/// are dynamic:
-/// memref<4x?xf32, #map0> ==> memref<4x?x?xf32>
-static bool
-isNormalizedMemRefDynamicDim(unsigned dim, AffineMap layoutMap,
-                             SmallVectorImpl<unsigned> &inMemrefTypeDynDims,
-                             MLIRContext *context) {
-  bool isDynamicDim = false;
-  AffineExpr expr = layoutMap.getResults()[dim];
-  // Check if the affine expr of this dimension includes a dynamic dimension of
-  // the input memrefType.
-  expr.walk([&inMemrefTypeDynDims, &isDynamicDim, &context](AffineExpr e) {
-    if (e.isa<AffineDimExpr>()) {
-      for (unsigned dm : inMemrefTypeDynDims) {
-        if (e == getAffineDimExpr(dm, context))
-          isDynamicDim = true;
-      }
-    }
-  });
-  return isDynamicDim;
-}
-
-/// Create an affine expr to calculate the dimension size for a tiled-layout
-/// map.
-static AffineExpr createDimSizeExprForTiledLayout(AffineExpr oldMapOutput,
-                                                  TileExprPattern pat) {
-  // Create the map output for the patterns:
-  //   "floordiv <tile size>" ==> "ceildiv <tile size>"
-  //   "mod <tile size>"      ==> "<tile size>"
-  AffineExpr newMapOutput;
-  AffineBinaryOpExpr binaryExpr = nullptr;
-  switch (pat) {
-  case TileExprPattern::TileMod:
-    binaryExpr = oldMapOutput.cast<AffineBinaryOpExpr>();
-    newMapOutput = binaryExpr.getRHS();
-    break;
-  case TileExprPattern::TileFloorDiv:
-    binaryExpr = oldMapOutput.cast<AffineBinaryOpExpr>();
-    newMapOutput = getAffineBinaryOpExpr(
-        AffineExprKind::CeilDiv, binaryExpr.getLHS(), binaryExpr.getRHS());
-    break;
-  default:
-    newMapOutput = oldMapOutput;
-  }
-  return newMapOutput;
-}
-
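A worked instance of the rewrite above, assuming an `MLIRContext *ctx` is available (purely illustrative):

```c++
// For the tiled pair (d1 floordiv 32, d1 mod 32), the dimension sizes of the
// normalized memref are (d1 ceildiv 32) tiles, each of extent 32.
AffineExpr d1 = getAffineDimExpr(1, ctx);
AffineExpr numTiles = createDimSizeExprForTiledLayout(
    d1.floorDiv(32), TileExprPattern::TileFloorDiv);
// numTiles == d1 ceildiv 32
AffineExpr tileExtent =
    createDimSizeExprForTiledLayout(d1 % 32, TileExprPattern::TileMod);
// tileExtent == 32
```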
-/// Create new maps to calculate each dimension size of `newMemRefType`, and
-/// create `newDynamicSizes` from them by using AffineApplyOp.
-///
-/// Steps for normalizing dynamic memrefs for a tiled layout map.
-/// Example:
-///    #map0 = affine_map<(d0, d1) -> (d0, d1 floordiv 32, d1 mod 32)>
-///    %0 = dim %arg0, %c1 : memref<4x?xf32>
-///    %1 = alloc(%0) : memref<4x?xf32, #map0>
-///
-/// (Before this function)
-/// 1. Check if `map` (#map0) is a tiled layout using `getTileSizePos()`. Only
-///    a single layout map is supported.
-///
-/// 2. Create a normalized memrefType using `isNormalizedMemRefDynamicDim()`.
-///    It is memref<4x?x?xf32> in the above example.
-///
-/// (In this function)
-/// 3. Create new maps to calculate each dimension of the normalized
-///    memrefType using `createDimSizeExprForTiledLayout()`. In a tiled
-///    layout, the dimension size can be calculated by replacing
-///    "floordiv <tile size>" with "ceildiv <tile size>" and
-///    "mod <tile size>" with "<tile size>".
-///    - New maps in the above example:
-///      #map0 = affine_map<(d0, d1) -> (d0)>
-///      #map1 = affine_map<(d0, d1) -> (d1 ceildiv 32)>
-///      #map2 = affine_map<(d0, d1) -> (32)>
-///
-/// 4. Create AffineApplyOps to apply the new maps. The outputs of the
-///    AffineApplyOps are used in the dynamicSizes of the new AllocOp.
-///    %0 = dim %arg0, %c1 : memref<4x?xf32>
-///    %c4 = arith.constant 4 : index
-///    %1 = affine.apply #map1(%c4, %0)
-///    %2 = affine.apply #map2(%c4, %0)
-static void createNewDynamicSizes(MemRefType oldMemRefType,
-                                  MemRefType newMemRefType, AffineMap map,
-                                  memref::AllocOp *allocOp, OpBuilder b,
-                                  SmallVectorImpl<Value> &newDynamicSizes) {
-  // Create new inputs for the AffineApplyOps.
-  SmallVector<Value, 4> inAffineApply;
-  ArrayRef<int64_t> oldMemRefShape = oldMemRefType.getShape();
-  unsigned dynIdx = 0;
-  for (unsigned d = 0; d < oldMemRefType.getRank(); ++d) {
-    if (oldMemRefShape[d] < 0) {
-      // Use the dynamicSizes of the allocOp for the dynamic dimension.
-      inAffineApply.emplace_back(allocOp->dynamicSizes()[dynIdx]);
-      dynIdx++;
-    } else {
-      // Create a ConstantOp for the static dimension.
-      Attribute constantAttr =
-          b.getIntegerAttr(b.getIndexType(), oldMemRefShape[d]);
-      inAffineApply.emplace_back(
-          b.create<arith::ConstantOp>(allocOp->getLoc(), constantAttr));
-    }
-  }
-
-  // Create a new map to calculate each dimension size of the new memref for
-  // each original map output, only for the dynamic dimensions of
-  // `newMemRefType`.
-  unsigned newDimIdx = 0;
-  ArrayRef<int64_t> newMemRefShape = newMemRefType.getShape();
-  SmallVector<std::tuple<AffineExpr, unsigned, unsigned>> tileSizePos;
-  (void)getTileSizePos(map, tileSizePos);
-  for (AffineExpr expr : map.getResults()) {
-    if (newMemRefShape[newDimIdx] < 0) {
-      // Create new maps to calculate each dimension size of the new memref.
-      enum TileExprPattern pat = TileExprPattern::TileNone;
-      for (auto pos : tileSizePos) {
-        if (newDimIdx == std::get<1>(pos))
-          pat = TileExprPattern::TileFloorDiv;
-        else if (newDimIdx == std::get<2>(pos))
-          pat = TileExprPattern::TileMod;
-      }
-      AffineExpr newMapOutput = createDimSizeExprForTiledLayout(expr, pat);
-      AffineMap newMap =
-          AffineMap::get(map.getNumInputs(), map.getNumSymbols(), newMapOutput);
-      Value affineApp =
-          b.create<AffineApplyOp>(allocOp->getLoc(), newMap, inAffineApply);
-      newDynamicSizes.emplace_back(affineApp);
-    }
-    newDimIdx++;
-  }
-}
-
-// TODO: Currently works for static memrefs with a single layout map.
-LogicalResult mlir::normalizeMemRef(memref::AllocOp *allocOp) {
-  MemRefType memrefType = allocOp->getType();
-  OpBuilder b(*allocOp);
-
-  // Fetch a new memref type after normalizing the old memref to have an
-  // identity map layout.
-  MemRefType newMemRefType =
-      normalizeMemRefType(memrefType, b, allocOp->symbolOperands().size());
-  if (newMemRefType == memrefType)
-    // Either memrefType already had an identity map or the map couldn't be
-    // transformed to an identity map.
-    return failure();
-
-  Value oldMemRef = allocOp->getResult();
-
-  SmallVector<Value, 4> symbolOperands(allocOp->symbolOperands());
-  AffineMap layoutMap = memrefType.getLayout().getAffineMap();
-  memref::AllocOp newAlloc;
-  // Check if `layoutMap` is a tiled layout. Only a single layout map is
-  // supported for normalizing dynamic memrefs.
-  SmallVector<std::tuple<AffineExpr, unsigned, unsigned>> tileSizePos;
-  (void)getTileSizePos(layoutMap, tileSizePos);
-  if (newMemRefType.getNumDynamicDims() > 0 && !tileSizePos.empty()) {
-    MemRefType oldMemRefType = oldMemRef.getType().cast<MemRefType>();
-    SmallVector<Value, 4> newDynamicSizes;
-    createNewDynamicSizes(oldMemRefType, newMemRefType, layoutMap, allocOp, b,
-                          newDynamicSizes);
-    // Add the new dynamic sizes in the new AllocOp.
-    newAlloc =
-        b.create<memref::AllocOp>(allocOp->getLoc(), newMemRefType,
-                                  newDynamicSizes, allocOp->alignmentAttr());
-  } else {
-    newAlloc = b.create<memref::AllocOp>(allocOp->getLoc(), newMemRefType,
-                                         allocOp->alignmentAttr());
-  }
-  // Replace all uses of the old memref.
-  if (failed(replaceAllMemRefUsesWith(oldMemRef, /*newMemRef=*/newAlloc,
-                                      /*extraIndices=*/{},
-                                      /*indexRemap=*/layoutMap,
-                                      /*extraOperands=*/{},
-                                      /*symbolOperands=*/symbolOperands,
-                                      /*domOpFilter=*/nullptr,
-                                      /*postDomOpFilter=*/nullptr,
-                                      /*allowNonDereferencingOps=*/true))) {
-    // If it failed (due to escapes for example), bail out.
-    newAlloc.erase();
-    return failure();
-  }
-  // Replace any uses of the original alloc op and erase it. All remaining uses
-  // have to be dealloc's; RAMUW above would've failed otherwise.
-  assert(llvm::all_of(oldMemRef.getUsers(), [](Operation *op) {
-    return isa<memref::DeallocOp>(op);
-  }));
-  oldMemRef.replaceAllUsesWith(newAlloc);
-  allocOp->erase();
-  return success();
-}
-
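The net effect of `normalizeMemRef`, shown on a small static example (an illustration, not taken from this diff; `allocOp` is assumed to be a `memref::AllocOp` in scope):

```c++
// Given:  %0 = memref.alloc() : memref<64xf32,
//                 affine_map<(d0) -> (d0 floordiv 4, d0 mod 4)>>
// After normalizeMemRef(&allocOp) succeeds, the alloc and all of its
// dereferencing users are rewritten against the identity layout:
//         %0 = memref.alloc() : memref<16x4xf32>
if (failed(normalizeMemRef(&allocOp)))
  return; // Layout was already identity, or a use blocked the replacement.
```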
-MemRefType mlir::normalizeMemRefType(MemRefType memrefType, OpBuilder b,
-                                     unsigned numSymbolicOperands) {
-  unsigned rank = memrefType.getRank();
-  if (rank == 0)
-    return memrefType;
-
-  if (memrefType.getLayout().isIdentity()) {
-    // Either no map is associated with this memref or this memref has
-    // a trivial (identity) map.
-    return memrefType;
-  }
-  AffineMap layoutMap = memrefType.getLayout().getAffineMap();
-
-  // We don't do any checks for one-to-one'ness; we assume that it is
-  // one-to-one.
-
-  // Normalize only static memrefs and dynamic memrefs with a tiled-layout map
-  // for now.
-  // TODO: Normalize the other types of dynamic memrefs.
-  SmallVector<std::tuple<AffineExpr, unsigned, unsigned>> tileSizePos;
-  (void)getTileSizePos(layoutMap, tileSizePos);
-  if (memrefType.getNumDynamicDims() > 0 && tileSizePos.empty())
-    return memrefType;
-
-  // We have a single map that is not an identity map. Create a new memref
-  // with the right shape and an identity layout map.
-  ArrayRef<int64_t> shape = memrefType.getShape();
-  // FlatAffineConstraints may later on use symbolicOperands.
-  FlatAffineConstraints fac(rank, numSymbolicOperands);
-  SmallVector<unsigned, 4> memrefTypeDynDims;
-  for (unsigned d = 0; d < rank; ++d) {
-    // Use the constraint system only for static dimensions.
-    if (shape[d] > 0) {
-      fac.addBound(FlatAffineConstraints::LB, d, 0);
-      fac.addBound(FlatAffineConstraints::UB, d, shape[d] - 1);
-    } else {
-      memrefTypeDynDims.emplace_back(d);
-    }
-  }
-  // We compose this map with the original index (logical) space to derive
-  // the upper bounds for the new index space.
-  unsigned newRank = layoutMap.getNumResults();
-  if (failed(fac.composeMatchingMap(layoutMap)))
-    return memrefType;
-  // TODO: Handle semi-affine maps.
-  // Project out the old data dimensions.
-  fac.projectOut(newRank, fac.getNumIds() - newRank - fac.getNumLocalIds());
-  SmallVector<int64_t, 4> newShape(newRank);
-  for (unsigned d = 0; d < newRank; ++d) {
-    // Check whether each dimension of the normalized memrefType is dynamic.
-    bool isDynDim = isNormalizedMemRefDynamicDim(
-        d, layoutMap, memrefTypeDynDims, b.getContext());
-    if (isDynDim) {
-      newShape[d] = -1;
-    } else {
-      // The lower bound for the shape is always zero.
-      auto ubConst = fac.getConstantBound(FlatAffineConstraints::UB, d);
-      // For a static memref and an affine map with no symbols, this is
-      // always bounded.
-      assert(ubConst.hasValue() && "should always have an upper bound");
-      if (ubConst.getValue() < 0)
-        // This is due to an invalid map that maps to a negative space.
-        return memrefType;
-      // If this dimension of the new memrefType were dynamic, its value
-      // would be -1.
-      newShape[d] = ubConst.getValue() + 1;
-    }
-  }
-
-  // Create the new memref type after trivializing the old layout map.
-  MemRefType newMemRefType =
-      MemRefType::Builder(memrefType)
-          .setShape(newShape)
-          .setLayout(AffineMapAttr::get(b.getMultiDimIdentityMap(newRank)));
-
-  return newMemRefType;
-}
diff --git a/mlir/test/lib/Dialect/Affine/CMakeLists.txt b/mlir/test/lib/Dialect/Affine/CMakeLists.txt
--- a/mlir/test/lib/Dialect/Affine/CMakeLists.txt
+++ b/mlir/test/lib/Dialect/Affine/CMakeLists.txt
@@ -3,7 +3,11 @@
   TestAffineDataCopy.cpp
   TestAffineLoopUnswitching.cpp
   TestAffineLoopParametricTiling.cpp
+  TestLoopFusion.cpp
+  TestLoopMapping.cpp
+  TestLoopParametricTiling.cpp
   TestLoopPermutation.cpp
+  TestLoopUnrolling.cpp
   TestVectorizationUtils.cpp

   EXCLUDE_FROM_LIBMLIR
diff --git a/mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp b/mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp
--- a/mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp
+++ b/mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp
@@ -13,10 +13,10 @@

 #include "mlir/Dialect/Affine/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-#include "mlir/Transforms/LoopUtils.h"
 #include "mlir/Transforms/Passes.h"

 #define PASS_NAME "test-affine-data-copy"
diff --git a/mlir/test/lib/Dialect/Affine/TestAffineLoopParametricTiling.cpp b/mlir/test/lib/Dialect/Affine/TestAffineLoopParametricTiling.cpp
--- a/mlir/test/lib/Dialect/Affine/TestAffineLoopParametricTiling.cpp
+++ b/mlir/test/lib/Dialect/Affine/TestAffineLoopParametricTiling.cpp
@@ -12,8 +12,8 @@
 //===----------------------------------------------------------------------===//

 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/Affine/Passes.h"
-#include "mlir/Transforms/LoopUtils.h"

 using namespace mlir;
diff --git a/mlir/test/lib/Transforms/TestLoopFusion.cpp b/mlir/test/lib/Dialect/Affine/TestLoopFusion.cpp
rename from mlir/test/lib/Transforms/TestLoopFusion.cpp
rename to mlir/test/lib/Dialect/Affine/TestLoopFusion.cpp
--- a/mlir/test/lib/Transforms/TestLoopFusion.cpp
+++ b/mlir/test/lib/Dialect/Affine/TestLoopFusion.cpp
@@ -12,11 +12,10 @@

 #include "mlir/Dialect/Affine/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/LoopFusionUtils.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/LoopFusionUtils.h"
-#include "mlir/Transforms/LoopUtils.h"
-#include "mlir/Transforms/Passes.h"

 #define DEBUG_TYPE "test-loop-fusion"
diff --git a/mlir/test/lib/Transforms/TestLoopMapping.cpp b/mlir/test/lib/Dialect/Affine/TestLoopMapping.cpp
rename from mlir/test/lib/Transforms/TestLoopMapping.cpp
rename to mlir/test/lib/Dialect/Affine/TestLoopMapping.cpp
--- a/mlir/test/lib/Transforms/TestLoopMapping.cpp
+++ b/mlir/test/lib/Dialect/Affine/TestLoopMapping.cpp
@@ -12,11 +12,10 @@
 //===----------------------------------------------------------------------===//

 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/SCF/SCF.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/LoopUtils.h"
-#include "mlir/Transforms/Passes.h"

 #include "llvm/ADT/SetVector.h"
diff --git a/mlir/test/lib/Transforms/TestLoopParametricTiling.cpp b/mlir/test/lib/Dialect/Affine/TestLoopParametricTiling.cpp
rename from mlir/test/lib/Transforms/TestLoopParametricTiling.cpp
rename to mlir/test/lib/Dialect/Affine/TestLoopParametricTiling.cpp
--- a/mlir/test/lib/Transforms/TestLoopParametricTiling.cpp
+++ b/mlir/test/lib/Dialect/Affine/TestLoopParametricTiling.cpp
@@ -10,11 +10,10 @@
 //
 //===----------------------------------------------------------------------===//

+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/SCF/SCF.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/LoopUtils.h"
-#include "mlir/Transforms/Passes.h"

 using namespace mlir;
diff --git a/mlir/test/lib/Dialect/Affine/TestLoopPermutation.cpp b/mlir/test/lib/Dialect/Affine/TestLoopPermutation.cpp
--- a/mlir/test/lib/Dialect/Affine/TestLoopPermutation.cpp
+++ b/mlir/test/lib/Dialect/Affine/TestLoopPermutation.cpp
@@ -12,9 +12,8 @@

 #include "mlir/Dialect/Affine/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/LoopUtils.h"
-#include "mlir/Transforms/Passes.h"

 #define PASS_NAME "test-loop-permutation"
diff --git a/mlir/test/lib/Transforms/TestLoopUnrolling.cpp b/mlir/test/lib/Dialect/Affine/TestLoopUnrolling.cpp
rename from mlir/test/lib/Transforms/TestLoopUnrolling.cpp
rename to mlir/test/lib/Dialect/Affine/TestLoopUnrolling.cpp
--- a/mlir/test/lib/Transforms/TestLoopUnrolling.cpp
+++ b/mlir/test/lib/Dialect/Affine/TestLoopUnrolling.cpp
@@ -10,13 +10,12 @@
 //
 //===----------------------------------------------------------------------===//

+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/Dialect/SCF/SCF.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/LoopUtils.h"
-#include "mlir/Transforms/Passes.h"

 using namespace mlir;
diff --git a/mlir/test/lib/Dialect/Affine/TestVectorizationUtils.cpp b/mlir/test/lib/Dialect/Affine/TestVectorizationUtils.cpp
--- a/mlir/test/lib/Dialect/Affine/TestVectorizationUtils.cpp
+++ b/mlir/test/lib/Dialect/Affine/TestVectorizationUtils.cpp
@@ -14,6 +14,7 @@
 #include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
 #include "mlir/Dialect/Affine/Analysis/NestedMatcher.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Vector/VectorOps.h"
 #include "mlir/Dialect/Vector/VectorUtils.h"
@@ -21,7 +22,6 @@
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Diagnostics.h"
 #include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/LoopUtils.h"
 #include "mlir/Transforms/Passes.h"
 #include "llvm/ADT/STLExtras.h"
diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt
--- a/mlir/test/lib/Transforms/CMakeLists.txt
+++ b/mlir/test/lib/Transforms/CMakeLists.txt
@@ -2,10 +2,6 @@
 add_mlir_library(MLIRTestTransforms
   TestConstantFold.cpp
   TestInlining.cpp
-  TestLoopFusion.cpp
-  TestLoopMapping.cpp
-  TestLoopParametricTiling.cpp
-  TestLoopUnrolling.cpp

   EXCLUDE_FROM_LIBMLIR
diff --git a/mlir/test/lib/Transforms/TestConstantFold.cpp b/mlir/test/lib/Transforms/TestConstantFold.cpp
--- a/mlir/test/lib/Transforms/TestConstantFold.cpp
+++ b/mlir/test/lib/Transforms/TestConstantFold.cpp
@@ -9,7 +9,6 @@
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/FoldUtils.h"
 #include "mlir/Transforms/Passes.h"
-#include "mlir/Transforms/Utils.h"

 using namespace mlir;
diff --git a/mlir/unittests/Transforms/CMakeLists.txt b/mlir/unittests/Transforms/CMakeLists.txt
--- a/mlir/unittests/Transforms/CMakeLists.txt
+++ b/mlir/unittests/Transforms/CMakeLists.txt
@@ -4,4 +4,5 @@
 )
 target_link_libraries(MLIRTransformsTests
   PRIVATE
+  MLIRParser
   MLIRTransforms)