diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineLoopInvariantCodeMotion.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineLoopInvariantCodeMotion.cpp
--- a/mlir/lib/Dialect/Affine/Transforms/AffineLoopInvariantCodeMotion.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/AffineLoopInvariantCodeMotion.cpp
@@ -26,6 +26,7 @@
 #include "mlir/IR/AffineMap.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/Matchers.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/SmallPtrSet.h"
@@ -49,7 +50,6 @@
 
 /// Loop invariant code motion (LICM) pass.
 /// TODO: The pass is missing zero-trip tests.
-/// TODO: Check for the presence of side effects before hoisting.
 /// TODO: This code should be removed once the new LICM pass can handle its
 /// uses.
 struct LoopInvariantCodeMotion
@@ -92,13 +92,11 @@
     if (!areAllOpsInTheBlockListInvariant(parOp.getLoopBody(), indVar,
                                           iterArgs, opsWithUsers, opsToHoist))
       return false;
-  } else if (isa<AffineDmaStartOp, AffineDmaWaitOp>(op)) {
-    // TODO: Support DMA ops.
-    // FIXME: This should be fixed to not special-case these affine DMA ops but
-    // instead rely on side effects.
-    return false;
-  } else if (op.getNumRegions() > 0) {
-    // We can't handle region-holding ops we don't know about.
+  } else if (!isMemoryEffectFree(&op) &&
+             !isa<AffineReadOpInterface, AffineWriteOpInterface,
+                  AffinePrefetchOp>(&op)) {
+    // Check for side-effecting ops. Affine read/write ops are handled
+    // separately below.
     return false;
   } else if (!matchPattern(&op, m_Constant())) {
     // Register op in the set of ops that have users.
diff --git a/mlir/test/Dialect/Affine/affine-loop-invariant-code-motion.mlir b/mlir/test/Dialect/Affine/affine-loop-invariant-code-motion.mlir
--- a/mlir/test/Dialect/Affine/affine-loop-invariant-code-motion.mlir
+++ b/mlir/test/Dialect/Affine/affine-loop-invariant-code-motion.mlir
@@ -867,4 +867,49 @@
   // CHECK-NEXT: }
   // CHECK-NEXT: }
   return
-}
\ No newline at end of file
+}
+
+// Side-effecting ops shouldn't be hoisted.
+
+// CHECK-LABEL: func @side_effecting_ops
+func.func @side_effecting_ops() {
+  %cst = arith.constant 0.0 : f32
+  %m0 = memref.alloc(): memref<1x512x16x16xf32>
+  %0 = gpu.wait async
+  affine.for %arg783 = 0 to 14 {
+    affine.for %arg784 = 0 to 14 {
+      affine.parallel (%arg785) = (0) to (512) {
+        affine.for %arg786 = 0 to 1 {
+          affine.for %arg787 = 0 to 1 {
+            affine.for %arg788 = 0 to 1 {
+              %m1 = memref.alloc() : memref<1xf32, 3>
+              %m2 = memref.alloc() : memref<1xf32, 3>
+              affine.store %cst, %m1[0] : memref<1xf32, 3>
+              affine.store %cst, %m2[0] : memref<1xf32, 3>
+              %memref_2897, %asyncToken_2898 = gpu.alloc async [%0] () : memref<1x512x16x16xf32>
+              %2432 = gpu.memcpy async [%0] %memref_2897, %m0 : memref<1x512x16x16xf32>, memref<1x512x16x16xf32>
+              affine.for %arg789 = 0 to 16 {
+                affine.for %arg790 = 0 to 16 {
+                  affine.store %cst, %memref_2897[0, %arg785 + %arg788, %arg789, %arg790] : memref<1x512x16x16xf32>
+                }
+              }
+              memref.dealloc %m2 : memref<1xf32, 3>
+              memref.dealloc %m1 : memref<1xf32, 3>
+              %2433 = gpu.memcpy async [%0] %m0, %memref_2897 : memref<1x512x16x16xf32>, memref<1x512x16x16xf32>
+              %2434 = gpu.dealloc async [%asyncToken_2898] %memref_2897 : memref<1x512x16x16xf32>
+            }
+          }
+        }
+      }
+    }
+  }
+  // CHECK: affine.for %{{.*}} = 0 to 1
+  // CHECK-NEXT: affine.for %{{.*}} = 0 to 1
+  // CHECK: memref.alloc
+  // CHECK: memref.alloc
+  // CHECK: gpu.memcpy
+  // CHECK: affine.for %{{.*}} = 0 to 16
+  // CHECK: affine.for %{{.*}} = 0 to 16
+  // CHECK: memref.dealloc
+  return
+}