diff --git a/mlir/include/mlir/Transforms/LoopFusionUtils.h b/mlir/include/mlir/Transforms/LoopFusionUtils.h --- a/mlir/include/mlir/Transforms/LoopFusionUtils.h +++ b/mlir/include/mlir/Transforms/LoopFusionUtils.h @@ -51,6 +51,12 @@ unsigned dstLoopDepth, ComputationSliceState *srcSlice); + +/// Fuses 'srcForOp' into 'dstForOp' with destination loop block insertion point +/// and source slice loop bounds specified in 'srcSlice'. +void fuseLoops(AffineForOp srcForOp, AffineForOp dstForOp, + ComputationSliceState *srcSlice); + /// LoopNestStats aggregates various per-loop statistics (eg. loop trip count /// and operation count) for a loop nest up until (and including) the innermost /// loop body. diff --git a/mlir/lib/Transforms/Utils/LoopFusionUtils.cpp b/mlir/lib/Transforms/Utils/LoopFusionUtils.cpp --- a/mlir/lib/Transforms/Utils/LoopFusionUtils.cpp +++ b/mlir/lib/Transforms/Utils/LoopFusionUtils.cpp @@ -24,6 +24,7 @@ #include "mlir/IR/Builders.h" #include "mlir/IR/Function.h" #include "mlir/IR/Operation.h" +#include "mlir/Transforms/LoopUtils.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/Debug.h" @@ -246,6 +247,33 @@ return FusionResult::Success; } +/// Fuses 'srcForOp' into 'dstForOp' with destination loop block insertion point +/// and source slice loop bounds specified in 'srcSlice'. +void mlir::fuseLoops(AffineForOp srcForOp, AffineForOp dstForOp, + ComputationSliceState *srcSlice) { + // Clone 'srcForOp' into 'dstForOp' at 'srcSlice->insertPoint'. + OpBuilder b(srcSlice->insertPoint->getBlock(), srcSlice->insertPoint); + BlockAndValueMapping mapper; + b.clone(*srcForOp.getOperation(), mapper); + // Update 'sliceLoopNest' upper and lower bounds from computed 'srcSlice'. + SmallVector sliceLoops; + for (unsigned i = 0, e = srcSlice->ivs.size() ; i < e; ++i) { + auto loopIV = mapper.lookupOrNull(srcSlice->ivs[i]); + if (!loopIV) continue; + auto forOp = getForInductionVarOwner(loopIV); + sliceLoops.push_back(forOp); + if (AffineMap lbMap = srcSlice->lbs[i]) + forOp.setLowerBound(srcSlice->lbOperands[i], lbMap); + if (AffineMap ubMap = srcSlice->ubs[i]) + forOp.setUpperBound(srcSlice->ubOperands[i], ubMap); + } + + // Promote any single iteration slice loops. + for (auto forOp : sliceLoops) { + promoteIfSingleIteration(forOp); + } +} + /// Collect loop nest statistics (eg. loop trip count and operation count) /// in 'stats' for loop nest rooted at 'forOp'. Returns true on success, /// returns false otherwise. diff --git a/mlir/test/Transforms/loop-fusion-transformation.mlir b/mlir/test/Transforms/loop-fusion-transformation.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Transforms/loop-fusion-transformation.mlir @@ -0,0 +1,107 @@ +// RUN: mlir-opt %s -test-loop-fusion -test-loop-fusion-transformation -split-input-file -canonicalize | FileCheck %s + +// ----- + +// CHECK-LABEL: func @slice_depth1_loop_nest() { +func @slice_depth1_loop_nest() { + %0 = alloc() : memref<100xf32> + %cst = constant 7.000000e+00 : f32 + affine.for %i0 = 0 to 16 { + affine.store %cst, %0[%i0] : memref<100xf32> + } + affine.for %i1 = 0 to 5 { + %1 = affine.load %0[%i1] : memref<100xf32> + } + // CHECK: affine.for %[[IV0:.*]] = 0 to 5 { + // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%[[IV0]]] : memref<100xf32> + // CHECK-NEXT: %{{.*}} = affine.load %{{.*}}[%[[IV0]]] : memref<100xf32> + // CHECK-NEXT: } + // CHECK-NEXT: return + return +} + +// ----- + +// CHECK-LABEL: func @should_fuse_reduction_to_pointwise() { +func @should_fuse_reduction_to_pointwise() { + %a = alloc() : memref<10x10xf32> + %b = alloc() : memref<10xf32> + %c = alloc() : memref<10xf32> + + %cf7 = constant 7.0 : f32 + + affine.for %i0 = 0 to 10 { + affine.for %i1 = 0 to 10 { + %v0 = affine.load %b[%i0] : memref<10xf32> + %v1 = affine.load %a[%i0, %i1] : memref<10x10xf32> + %v3 = addf %v0, %v1 : f32 + affine.store %v3, %b[%i0] : memref<10xf32> + } + } + affine.for %i2 = 0 to 10 { + %v4 = affine.load %b[%i2] : memref<10xf32> + affine.store %v4, %c[%i2] : memref<10xf32> + } + + // Match on the fused loop nest. + // Should fuse in entire inner loop on %i1 from source loop nest, as %i1 + // is not used in the access function of the store/load on %b. + // CHECK: affine.for %{{.*}} = 0 to 10 { + // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 { + // CHECK-NEXT: %{{.*}} = affine.load %{{.*}}[%{{.*}}] : memref<10xf32> + // CHECK-NEXT: %{{.*}} = affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32> + // CHECK-NEXT: %{{.*}} = addf %{{.*}}, %{{.*}} : f32 + // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32> + // CHECK-NEXT: } + // CHECK-NEXT: %{{.*}} = affine.load %{{.*}}[%{{.*}}] : memref<10xf32> + // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32> + // CHECK-NEXT: } + // CHECK-NEXT: return + return +} + +// ----- + +// CHECK-LABEL: func @should_fuse_avoiding_dependence_cycle() { +func @should_fuse_avoiding_dependence_cycle() { + %a = alloc() : memref<10xf32> + %b = alloc() : memref<10xf32> + %c = alloc() : memref<10xf32> + + %cf7 = constant 7.0 : f32 + + // Set up the following dependences: + // 1) loop0 -> loop1 on memref '%{{.*}}' + // 2) loop0 -> loop2 on memref '%{{.*}}' + // 3) loop1 -> loop2 on memref '%{{.*}}' + affine.for %i0 = 0 to 10 { + %v0 = affine.load %a[%i0] : memref<10xf32> + affine.store %cf7, %b[%i0] : memref<10xf32> + } + affine.for %i1 = 0 to 10 { + affine.store %cf7, %a[%i1] : memref<10xf32> + %v1 = affine.load %c[%i1] : memref<10xf32> + } + affine.for %i2 = 0 to 10 { + %v2 = affine.load %b[%i2] : memref<10xf32> + affine.store %cf7, %c[%i2] : memref<10xf32> + } + // Fusing loop first loop into last would create a cycle: + // {1} <--> {0, 2} + // However, we can avoid the dependence cycle if we first fuse loop0 into + // loop1: + // {0, 1) --> {2} + // Then fuse this loop nest with loop2: + // {0, 1, 2} + // + // CHECK: affine.for %{{.*}} = 0 to 10 { + // CHECK-NEXT: %{{.*}} = affine.load %{{.*}}[%{{.*}}] : memref<10xf32> + // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32> + // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32> + // CHECK-NEXT: %{{.*}} = affine.load %{{.*}}[%{{.*}}] : memref<10xf32> + // CHECK-NEXT: %{{.*}} = affine.load %{{.*}}[%{{.*}}] : memref<10xf32> + // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32> + // CHECK-NEXT: } + // CHECK-NEXT: return + return +} diff --git a/mlir/test/lib/Transforms/TestLoopFusion.cpp b/mlir/test/lib/Transforms/TestLoopFusion.cpp --- a/mlir/test/lib/Transforms/TestLoopFusion.cpp +++ b/mlir/test/lib/Transforms/TestLoopFusion.cpp @@ -41,6 +41,11 @@ llvm::cl::desc("Enable testing of loop fusion slice computation"), llvm::cl::cat(clOptionsCategory)); +static llvm::cl::opt clTestLoopFusionTransformation( + "test-loop-fusion-transformation", + llvm::cl::desc("Enable testing of loop fusion transformation"), + llvm::cl::cat(clOptionsCategory)); + namespace { struct TestLoopFusion : public FunctionPass { @@ -69,11 +74,9 @@ // Run fusion dependence check on 'loops[i]' and 'loops[j]' at loop depths // in range ['loopDepth' + 1, 'maxLoopDepth']. // Emits a remark on 'loops[i]' if a fusion-preventing dependence exists. -static void testDependenceCheck(SmallVector &loops, unsigned i, - unsigned j, unsigned loopDepth, +static bool testDependenceCheck(AffineForOp srcForOp, AffineForOp dstForOp, + unsigned i, unsigned j, unsigned loopDepth, unsigned maxLoopDepth) { - AffineForOp srcForOp = loops[i]; - AffineForOp dstForOp = loops[j]; mlir::ComputationSliceState sliceUnion; for (unsigned d = loopDepth + 1; d <= maxLoopDepth; ++d) { FusionResult result = @@ -84,6 +87,7 @@ << i << " into loop nest " << j << " at depth " << loopDepth; } } + return false; } // Returns the index of 'op' in its block. @@ -121,11 +125,9 @@ // Computes fusion slice union on 'loops[i]' and 'loops[j]' at loop depths // in range ['loopDepth' + 1, 'maxLoopDepth']. // Emits a string representation of the slice union as a remark on 'loops[j]'. -static void testSliceComputation(SmallVector &loops, unsigned i, - unsigned j, unsigned loopDepth, +static bool testSliceComputation(AffineForOp forOpA, AffineForOp forOpB, + unsigned i, unsigned j, unsigned loopDepth, unsigned maxLoopDepth) { - AffineForOp forOpA = loops[i]; - AffineForOp forOpB = loops[j]; for (unsigned d = loopDepth + 1; d <= maxLoopDepth; ++d) { mlir::ComputationSliceState sliceUnion; FusionResult result = mlir::canFuseLoops(forOpA, forOpB, d, &sliceUnion); @@ -135,31 +137,77 @@ << " : " << getSliceStr(sliceUnion) << ")"; } } + return false; } -void TestLoopFusion::runOnFunction() { - // Gather all AffineForOps by loop depth. - DenseMap> depthToLoops; - for (auto &block : getFunction()) { - gatherLoops(&block, /*currLoopDepth=*/0, depthToLoops); +static bool testLoopFusionTransformation(AffineForOp forOpA, AffineForOp forOpB, + unsigned i, unsigned j, + unsigned loopDepth, + unsigned maxLoopDepth) { + for (unsigned d = loopDepth + 1; d <= maxLoopDepth; ++d) { + mlir::ComputationSliceState sliceUnion; + FusionResult result = mlir::canFuseLoops(forOpA, forOpB, d, &sliceUnion); + if (result.value == FusionResult::Success) { + mlir::fuseLoops(forOpA, forOpB, &sliceUnion); + // Note: 'forOpA' is removed to simplify test output. A proper loop + // fusion pass should check the data dependence graph and run memref + // region analysis to ensure removing 'forOpA' is safe. + forOpA.erase(); + return true; + } } + return false; +} - // Run tests on all combinations of src/dst loop nests in 'depthToLoops'. +using LoopFunc = function_ref; + +// Run tests on all combinations of src/dst loop nests in 'depthToLoops'. +static bool iterateLoops( + DenseMap> &depthToLoops, + LoopFunc fn) { + bool changed = false; for (auto &depthAndLoops : depthToLoops) { unsigned loopDepth = depthAndLoops.first; auto &loops = depthAndLoops.second; unsigned numLoops = loops.size(); for (unsigned j = 0; j < numLoops; ++j) { for (unsigned k = 0; k < numLoops; ++k) { - if (j == k) - continue; - if (clTestDependenceCheck) - testDependenceCheck(loops, j, k, loopDepth, depthToLoops.size()); - if (clTestSliceComputation) - testSliceComputation(loops, j, k, loopDepth, depthToLoops.size()); + if (j != k) + changed |= fn(loops[j], loops[k], j, k, loopDepth, + depthToLoops.size()); + } + } + } + return changed; +} + +void TestLoopFusion::runOnFunction() { + DenseMap> depthToLoops; + if (clTestLoopFusionTransformation) { + // Run loop fusion until a fixed point is reached. + bool changed = true; + while (changed) { + depthToLoops.clear(); + // Gather all AffineForOps by loop depth. + for (auto &block : getFunction()) { + gatherLoops(&block, /*currLoopDepth=*/0, depthToLoops); } + // Trye to fuse all combinations of src/dst loop nests in 'depthToLoops'. + changed = iterateLoops(depthToLoops, testLoopFusionTransformation); } + return; } + + // Gather all AffineForOps by loop depth. + for (auto &block : getFunction()) { + gatherLoops(&block, /*currLoopDepth=*/0, depthToLoops); + } + // Run tests on all combinations of src/dst loop nests in 'depthToLoops'. + if (clTestDependenceCheck) + iterateLoops(depthToLoops, testDependenceCheck); + if (clTestSliceComputation) + iterateLoops(depthToLoops, testSliceComputation); } static PassRegistration