Index: mlir/include/mlir/Dialect/SCF/SCFOps.td =================================================================== --- mlir/include/mlir/Dialect/SCF/SCFOps.td +++ mlir/include/mlir/Dialect/SCF/SCFOps.td @@ -346,6 +346,8 @@ unsigned getNumLoops() { return step().size(); } unsigned getNumReductions() { return initVals().size(); } }]; + + let hasCanonicalizer = 1; } def ReduceOp : SCF_Op<"reduce", [HasParent<"ParallelOp">]> { Index: mlir/lib/Dialect/SCF/SCF.cpp =================================================================== --- mlir/lib/Dialect/SCF/SCF.cpp +++ mlir/lib/Dialect/SCF/SCF.cpp @@ -10,6 +10,7 @@ #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/IR/AffineExpr.h" #include "mlir/IR/AffineMap.h" +#include "mlir/IR/BlockAndValueMapping.h" #include "mlir/IR/Builders.h" #include "mlir/IR/Function.h" #include "mlir/IR/Matchers.h" @@ -18,6 +19,8 @@ #include "mlir/IR/PatternMatch.h" #include "mlir/IR/StandardTypes.h" #include "mlir/IR/Value.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Support/LogicalResult.h" #include "mlir/Support/MathExtras.h" using namespace mlir; @@ -706,6 +709,61 @@ return dyn_cast(containingOp); } +namespace { +// Collapse loop dimensions that perform a single iteration. +struct CollapseSingleIterationLoops : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(ParallelOp op, + PatternRewriter &rewriter) const override { + BlockAndValueMapping mapping; + // Compute new loop bounds that omit all single-iteration loop dimensions. + SmallVector newLowerBounds; + SmallVector newUpperBounds; + SmallVector newSteps; + newLowerBounds.reserve(op.lowerBound().size()); + newUpperBounds.reserve(op.upperBound().size()); + newSteps.reserve(op.step().size()); + for (auto bounds : llvm::zip(op.lowerBound(), op.upperBound(), op.step(), + op.getBody()->getArguments())) { + auto lowerBound = dyn_cast_or_null( + std::get<0>(bounds).getDefiningOp()); + auto upperBound = dyn_cast_or_null( + std::get<1>(bounds).getDefiningOp()); + auto step = dyn_cast_or_null( + std::get<2>(bounds).getDefiningOp()); + // Replace the loop induction variable by the lower bound if the loop + // performs a single iteration. Otherwise, copy the loop bounds. + if (lowerBound && upperBound && step && + (upperBound.getValue() - lowerBound.getValue()) <= step.getValue()) { + mapping.map(std::get<3>(bounds), std::get<0>(bounds)); + } else { + newLowerBounds.push_back(std::get<0>(bounds)); + newUpperBounds.push_back(std::get<1>(bounds)); + newSteps.push_back(std::get<2>(bounds)); + } + } + // Exit if all or none of the loop dimensions performs a single iteration. + if (newLowerBounds.size() == 0 || + newLowerBounds.size() == op.lowerBound().size()) + return failure(); + // Replace the parallel loop by lower-dimensional parallel loop. + auto newOp = + rewriter.create(op.getLoc(), newLowerBounds, newUpperBounds, + newSteps, op.initVals(), nullptr); + rewriter.cloneRegionBefore(op.region(), newOp.region(), + newOp.region().begin(), mapping); + rewriter.replaceOp(op, newOp.getResults()); + return success(); + } +}; +} // namespace + +void ParallelOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} + //===----------------------------------------------------------------------===// // ReduceOp //===----------------------------------------------------------------------===// Index: mlir/lib/Dialect/SCF/Transforms/ParallelLoopTiling.cpp =================================================================== --- mlir/lib/Dialect/SCF/Transforms/ParallelLoopTiling.cpp +++ mlir/lib/Dialect/SCF/Transforms/ParallelLoopTiling.cpp @@ -30,8 +30,8 @@ /// scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3) /// step (%arg4*tileSize[0], /// %arg5*tileSize[1]) -/// scf.parallel (%j0, %j1) = (0, 0) to (min(tileSize[0], %arg2-%i0) -/// min(tileSize[1], %arg3-%i1)) +/// scf.parallel (%j0, %j1) = (0, 0) to (min(%arg4*tileSize[0], %arg2-%i0) +/// min(%arg5*tileSize[1], %arg3-%i1)) /// step (%arg4, %arg5) /// /// where the uses of %i0 and %i1 in the loop body are replaced by @@ -76,12 +76,33 @@ // Create the inner loop with adjusted bounds. SmallVector newBounds; newBounds.reserve(op.upperBound().size()); - for (auto bounds : llvm::zip(tileSizeConstants, outerLoop.upperBound(), - outerLoop.getInductionVars())) { - newBounds.push_back(b.create( - op.getLoc(), b.getIndexType(), minMap, - ValueRange{std::get<0>(bounds), std::get<1>(bounds), - std::get<2>(bounds)})); + for (auto bounds : + llvm::zip(outerLoop.lowerBound(), outerLoop.upperBound(), + outerLoop.step(), outerLoop.getInductionVars())) { + auto lowerBound = + dyn_cast_or_null(std::get<0>(bounds).getDefiningOp()); + auto upperBound = + dyn_cast_or_null(std::get<1>(bounds).getDefiningOp()); + // Access the step and the tile size via the multiplication operation + // computing the newStep. + auto step = dyn_cast_or_null( + std::get<2>(bounds).getDefiningOp()->getOperand(0).getDefiningOp()); + auto tileSize = cast( + std::get<2>(bounds).getDefiningOp()->getOperand(1).getDefiningOp()); + // If we statically know the size of the outer loops iteration and it is + // divisible by the tiling factor, we can use a static bound for the inner + // loop. Otherwise, we have to dynamically compute the bound for each + // iteration of the outer loop. + if (lowerBound && upperBound && step && + (upperBound.getValue() - lowerBound.getValue()) % + (step.getValue() * tileSize.getValue()) == + 0) + newBounds.push_back(std::get<2>(bounds)); + else + newBounds.push_back(b.create( + op.getLoc(), b.getIndexType(), minMap, + ValueRange{std::get<2>(bounds), std::get<1>(bounds), + std::get<3>(bounds)})); } auto innerLoop = b.create( op.getLoc(), SmallVector(newBounds.size(), zero), newBounds, @@ -131,7 +152,9 @@ getInnermostNestedLoops(&block, mostNestedParallelOps); } for (ParallelOp pLoop : mostNestedParallelOps) { - tileParallelLoop(pLoop, tileSizes); + // FIXME: Add reduction support. + if (pLoop.getNumReductions() == 0) + tileParallelLoop(pLoop, tileSizes); } } }; Index: mlir/test/Dialect/SCF/canonicalize.mlir =================================================================== --- /dev/null +++ mlir/test/Dialect/SCF/canonicalize.mlir @@ -0,0 +1,31 @@ +// RUN: mlir-opt %s -pass-pipeline='func(canonicalize)' | FileCheck %s + +func @single_iteration(%A: memref) { + %c0 = constant 0 : index + %c1 = constant 1 : index + %c2 = constant 2 : index + %c3 = constant 3 : index + %c6 = constant 6 : index + %c7 = constant 7 : index + %c10 = constant 10 : index + scf.parallel (%i0, %i1, %i2) = (%c0, %c3, %c7) to (%c1, %c6, %c10) step (%c1, %c2, %c3) { + %c42 = constant 42 : i32 + store %c42, %A[%i0, %i1, %i2] : memref + scf.yield + } + return +} + +// CHECK-LABEL: func @single_iteration( +// CHECK-SAME: [[ARG0:%.*]]: memref) { +// CHECK: [[C0:%.*]] = constant 0 : index +// CHECK: [[C2:%.*]] = constant 2 : index +// CHECK: [[C3:%.*]] = constant 3 : index +// CHECK: [[C6:%.*]] = constant 6 : index +// CHECK: [[C7:%.*]] = constant 7 : index +// CHECK: [[C42:%.*]] = constant 42 : i32 +// CHECK: scf.parallel ([[V0:%.*]]) = ([[C3]]) to ([[C6]]) step ([[C2]]) { +// CHECK: store [[C42]], [[ARG0]]{{\[}}[[C0]], [[V0]], [[C7]]] : memref +// CHECK: scf.yield +// CHECK: } +// CHECK: return Index: mlir/test/Dialect/SCF/parallel-loop-tiling.mlir =================================================================== --- mlir/test/Dialect/SCF/parallel-loop-tiling.mlir +++ mlir/test/Dialect/SCF/parallel-loop-tiling.mlir @@ -15,22 +15,50 @@ // CHECK: #map0 = affine_map<(d0, d1, d2) -> (d0, d1 - d2)> // CHECK-LABEL: func @parallel_loop( -// CHECK-SAME: [[VAL_0:%.*]]: index, [[VAL_1:%.*]]: index, [[VAL_2:%.*]]: index, [[VAL_3:%.*]]: index, [[VAL_4:%.*]]: index, [[VAL_5:%.*]]: index, [[VAL_6:%.*]]: memref, [[VAL_7:%.*]]: memref, [[VAL_8:%.*]]: memref, [[VAL_9:%.*]]: memref) { -// CHECK: [[VAL_10:%.*]] = constant 0 : index -// CHECK: [[VAL_11:%.*]] = constant 1 : index -// CHECK: [[VAL_12:%.*]] = constant 4 : index -// CHECK: [[VAL_13:%.*]] = muli [[VAL_4]], [[VAL_11]] : index -// CHECK: [[VAL_14:%.*]] = muli [[VAL_5]], [[VAL_12]] : index -// CHECK: scf.parallel ([[VAL_15:%.*]], [[VAL_16:%.*]]) = ([[VAL_0]], [[VAL_1]]) to ([[VAL_2]], [[VAL_3]]) step ([[VAL_13]], [[VAL_14]]) { -// CHECK: [[VAL_17:%.*]] = affine.min #map0([[VAL_11]], [[VAL_2]], [[VAL_15]]) -// CHECK: [[VAL_18:%.*]] = affine.min #map0([[VAL_12]], [[VAL_3]], [[VAL_16]]) -// CHECK: scf.parallel ([[VAL_19:%.*]], [[VAL_20:%.*]]) = ([[VAL_10]], [[VAL_10]]) to ([[VAL_17]], [[VAL_18]]) step ([[VAL_4]], [[VAL_5]]) { -// CHECK: [[VAL_21:%.*]] = addi [[VAL_19]], [[VAL_15]] : index -// CHECK: [[VAL_22:%.*]] = addi [[VAL_20]], [[VAL_16]] : index -// CHECK: [[VAL_23:%.*]] = load [[VAL_7]]{{\[}}[[VAL_21]], [[VAL_22]]] : memref -// CHECK: [[VAL_24:%.*]] = load [[VAL_8]]{{\[}}[[VAL_21]], [[VAL_22]]] : memref -// CHECK: [[VAL_25:%.*]] = addf [[VAL_23]], [[VAL_24]] : f32 -// CHECK: store [[VAL_25]], [[VAL_9]]{{\[}}[[VAL_21]], [[VAL_22]]] : memref +// CHECK-SAME: [[ARG1:%.*]]: index, [[ARG2:%.*]]: index, [[ARG3:%.*]]: index, [[ARG4:%.*]]: index, [[ARG5:%.*]]: index, [[ARG6:%.*]]: index, [[ARG7:%.*]]: memref, [[ARG8:%.*]]: memref, [[ARG9:%.*]]: memref, [[ARG10:%.*]]: memref) { +// CHECK: [[C0:%.*]] = constant 0 : index +// CHECK: [[C1:%.*]] = constant 1 : index +// CHECK: [[C4:%.*]] = constant 4 : index +// CHECK: [[V1:%.*]] = muli [[ARG5]], [[C1]] : index +// CHECK: [[V2:%.*]] = muli [[ARG6]], [[C4]] : index +// CHECK: scf.parallel ([[V3:%.*]], [[V4:%.*]]) = ([[ARG1]], [[ARG2]]) to ([[ARG3]], [[ARG4]]) step ([[V1]], [[V2]]) { +// CHECK: [[V5:%.*]] = affine.min #map0([[V1]], [[ARG3]], [[V3]]) +// CHECK: [[V6:%.*]] = affine.min #map0([[V2]], [[ARG4]], [[V4]]) +// CHECK: scf.parallel ([[V7:%.*]], [[V8:%.*]]) = ([[C0]], [[C0]]) to ([[V5]], [[V6]]) step ([[ARG5]], [[ARG6]]) { +// CHECK: [[V9:%.*]] = addi [[V7]], [[V3]] : index +// CHECK: [[V10:%.*]] = addi [[V8]], [[V4]] : index +// CHECK: [[V11:%.*]] = load [[ARG8]]{{\[}}[[V9]], [[V10]]] : memref +// CHECK: [[V12:%.*]] = load [[ARG9]]{{\[}}[[V9]], [[V10]]] : memref +// CHECK: [[V13:%.*]] = addf [[V11]], [[V12]] : f32 +// CHECK: store [[V13]], [[ARG10]]{{\[}}[[V9]], [[V10]]] : memref +// CHECK: } +// CHECK: } +// CHECK: return + +// ----- + +func @static_loop_with_step() { + %c0 = constant 0 : index + %c3 = constant 3 : index + %c24 = constant 24 : index + scf.parallel (%i0, %i1) = (%c0, %c0) to (%c24, %c24) step (%c3, %c3) { + } + return +} + +// CHECK-LABEL: func @static_loop_with_step() { +// CHECK: [[C0:%.*]] = constant 0 : index +// CHECK: [[C3:%.*]] = constant 3 : index +// CHECK: [[C24:%.*]] = constant 24 : index +// CHECK: [[C0_1:%.*]] = constant 0 : index +// CHECK: [[C1:%.*]] = constant 1 : index +// CHECK: [[C4:%.*]] = constant 4 : index +// CHECK: [[V1:%.*]] = muli [[C3]], [[C1]] : index +// CHECK: [[V2:%.*]] = muli [[C3]], [[C4]] : index +// CHECK: scf.parallel ([[V3:%.*]], [[V4:%.*]]) = ([[C0]], [[C0]]) to ([[C24]], [[C24]]) step ([[V1]], [[V2]]) { +// CHECK: scf.parallel ([[V5:%.*]], [[V6:%.*]]) = ([[C0_1]], [[C0_1]]) to ([[V1]], [[V2]]) step ([[C3]], [[C3]]) { +// CHECK: = addi [[V5]], [[V3]] : index +// CHECK: = addi [[V6]], [[V4]] : index // CHECK: } // CHECK: } // CHECK: return @@ -51,31 +79,33 @@ } // CHECK-LABEL: func @tile_nested_innermost() { -// CHECK: [[VAL_24:%.*]] = constant 2 : index -// CHECK: [[VAL_25:%.*]] = constant 0 : index -// CHECK: [[VAL_26:%.*]] = constant 1 : index -// CHECK: scf.parallel ([[VAL_27:%.*]], [[VAL_28:%.*]]) = ([[VAL_25]], [[VAL_25]]) to ([[VAL_24]], [[VAL_24]]) step ([[VAL_26]], [[VAL_26]]) { -// CHECK: [[VAL_29:%.*]] = constant 0 : index -// CHECK: [[VAL_30:%.*]] = constant 1 : index -// CHECK: [[VAL_31:%.*]] = constant 4 : index -// CHECK: [[VAL_32:%.*]] = muli [[VAL_26]], [[VAL_30]] : index -// CHECK: [[VAL_33:%.*]] = muli [[VAL_26]], [[VAL_31]] : index -// CHECK: scf.parallel ([[VAL_34:%.*]], [[VAL_35:%.*]]) = ([[VAL_25]], [[VAL_25]]) to ([[VAL_24]], [[VAL_24]]) step ([[VAL_32]], [[VAL_33]]) { -// CHECK: [[VAL_36:%.*]] = affine.min #map0([[VAL_30]], [[VAL_24]], [[VAL_34]]) -// CHECK: [[VAL_37:%.*]] = affine.min #map0([[VAL_31]], [[VAL_24]], [[VAL_35]]) -// CHECK: scf.parallel ([[VAL_38:%.*]], [[VAL_39:%.*]]) = ([[VAL_29]], [[VAL_29]]) to ([[VAL_36]], [[VAL_37]]) step ([[VAL_26]], [[VAL_26]]) { +// CHECK: [[C2:%.*]] = constant 2 : index +// CHECK: [[C0:%.*]] = constant 0 : index +// CHECK: [[C1:%.*]] = constant 1 : index +// CHECK: scf.parallel ([[V1:%.*]], [[V2:%.*]]) = ([[C0]], [[C0]]) to ([[C2]], [[C2]]) step ([[C1]], [[C1]]) { +// CHECK: [[C0_1:%.*]] = constant 0 : index +// CHECK: [[C1_1:%.*]] = constant 1 : index +// CHECK: [[C4:%.*]] = constant 4 : index +// CHECK: [[V3:%.*]] = muli [[C1]], [[C1_1]] : index +// CHECK: [[V4:%.*]] = muli [[C1]], [[C4]] : index +// CHECK: scf.parallel ([[V5:%.*]], [[V6:%.*]]) = ([[C0]], [[C0]]) to ([[C2]], [[C2]]) step ([[V3]], [[V4]]) { +// CHECK: [[V7:%.*]] = affine.min #map0([[V4]], [[C2]], [[V6]]) +// CHECK: scf.parallel ([[V8:%.*]], [[V9:%.*]]) = ([[C0_1]], [[C0_1]]) to ([[V3]], [[V7]]) step ([[C1]], [[C1]]) { +// CHECK: = addi [[V8]], [[V5]] : index +// CHECK: = addi [[V9]], [[V6]] : index // CHECK: } // CHECK: } // CHECK: } -// CHECK: [[VAL_40:%.*]] = constant 0 : index -// CHECK: [[VAL_41:%.*]] = constant 1 : index -// CHECK: [[VAL_42:%.*]] = constant 4 : index -// CHECK: [[VAL_43:%.*]] = muli [[VAL_26]], [[VAL_41]] : index -// CHECK: [[VAL_44:%.*]] = muli [[VAL_26]], [[VAL_42]] : index -// CHECK: scf.parallel ([[VAL_45:%.*]], [[VAL_46:%.*]]) = ([[VAL_25]], [[VAL_25]]) to ([[VAL_24]], [[VAL_24]]) step ([[VAL_43]], [[VAL_44]]) { -// CHECK: [[VAL_47:%.*]] = affine.min #map0([[VAL_41]], [[VAL_24]], [[VAL_45]]) -// CHECK: [[VAL_48:%.*]] = affine.min #map0([[VAL_42]], [[VAL_24]], [[VAL_46]]) -// CHECK: scf.parallel ([[VAL_49:%.*]], [[VAL_50:%.*]]) = ([[VAL_40]], [[VAL_40]]) to ([[VAL_47]], [[VAL_48]]) step ([[VAL_26]], [[VAL_26]]) { +// CHECK: [[C0_2:%.*]] = constant 0 : index +// CHECK: [[C1_2:%.*]] = constant 1 : index +// CHECK: [[C4_1:%.*]] = constant 4 : index +// CHECK: [[V10:%.*]] = muli [[C1]], [[C1_2]] : index +// CHECK: [[V11:%.*]] = muli [[C1]], [[C4_1]] : index +// CHECK: scf.parallel ([[V12:%.*]], [[V13:%.*]]) = ([[C0]], [[C0]]) to ([[C2]], [[C2]]) step ([[V10]], [[V11]]) { +// CHECK: [[V14:%.*]] = affine.min #map0([[V11]], [[C2]], [[V13]]) +// CHECK: scf.parallel ([[V15:%.*]], [[V16:%.*]]) = ([[C0_2]], [[C0_2]]) to ([[V10]], [[V14]]) step ([[C1]], [[C1]]) { +// CHECK: = addi [[V15]], [[V12]] : index +// CHECK: = addi [[V16]], [[V13]] : index // CHECK: } // CHECK: } // CHECK: return Index: mlir/test/Transforms/parallel-loop-collapsing.mlir =================================================================== --- mlir/test/Transforms/parallel-loop-collapsing.mlir +++ mlir/test/Transforms/parallel-loop-collapsing.mlir @@ -16,6 +16,8 @@ %c12 = constant 12 : index %c13 = constant 13 : index %c14 = constant 14 : index + %c15 = constant 15 : index + %c26 = constant 26 : index scf.parallel (%i0, %i1, %i2, %i3, %i4) = (%c0, %c3, %c6, %c9, %c12) to (%c2, %c5, %c8, %c11, %c14) step (%c1, %c4, %c7, %c10, %c13) { @@ -26,24 +28,18 @@ // CHECK-LABEL: func @parallel_many_dims() { // CHECK: [[C6:%.*]] = constant 6 : index -// CHECK: [[C7:%.*]] = constant 7 : index // CHECK: [[C9:%.*]] = constant 9 : index // CHECK: [[C10:%.*]] = constant 10 : index // CHECK: [[C12:%.*]] = constant 12 : index -// CHECK: [[C13:%.*]] = constant 13 : index -// CHECK: [[C3:%.*]] = constant 3 : index // CHECK: [[C0:%.*]] = constant 0 : index // CHECK: [[C1:%.*]] = constant 1 : index // CHECK: [[C2:%.*]] = constant 2 : index -// CHECK: scf.parallel ([[NEW_I0:%.*]], [[NEW_I1:%.*]], [[NEW_I2:%.*]]) = ([[C0]], [[C0]], [[C0]]) to ([[C2]], [[C1]], [[C1]]) step ([[C1]], [[C1]], [[C1]]) { +// CHECK: [[C3:%.*]] = constant 3 : index +// CHECK: scf.parallel ([[NEW_I0:%.*]]) = ([[C0]]) to ([[C2]]) step ([[C1]]) { // CHECK: [[I0:%.*]] = remi_signed [[NEW_I0]], [[C2]] : index -// CHECK: [[VAL_16:%.*]] = muli [[NEW_I1]], [[C13]] : index -// CHECK: [[I4:%.*]] = addi [[VAL_16]], [[C12]] : index -// CHECK: [[VAL_18:%.*]] = muli [[NEW_I0]], [[C10]] : index -// CHECK: [[I3:%.*]] = addi [[VAL_18]], [[C9]] : index -// CHECK: [[VAL_20:%.*]] = muli [[NEW_I2]], [[C7]] : index -// CHECK: [[I2:%.*]] = addi [[VAL_20]], [[C6]] : index -// CHECK: "magic.op"([[I0]], [[C3]], [[I2]], [[I3]], [[I4]]) : (index, index, index, index, index) -> index +// CHECK: [[V18:%.*]] = muli [[NEW_I0]], [[C10]] : index +// CHECK: [[I3:%.*]] = addi [[V18]], [[C9]] : index +// CHECK: "magic.op"([[I0]], [[C3]], [[C6]], [[I3]], [[C12]]) : (index, index, index, index, index) -> index // CHECK: scf.yield // CHECK-NEXT: } // CHECK-NEXT: return