diff --git a/mlir/include/mlir/Analysis/AffineAnalysis.h b/mlir/include/mlir/Analysis/AffineAnalysis.h --- a/mlir/include/mlir/Analysis/AffineAnalysis.h +++ b/mlir/include/mlir/Analysis/AffineAnalysis.h @@ -15,6 +15,7 @@ #ifndef MLIR_ANALYSIS_AFFINE_ANALYSIS_H #define MLIR_ANALYSIS_AFFINE_ANALYSIS_H +#include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/IR/Value.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" @@ -27,6 +28,25 @@ class FlatAffineConstraints; class Operation; +/// A description of a (parallelizable) reduction in an affine loop. +struct LoopReduction { + /// Reduction kind. + AtomicRMWKind kind; + + /// Position of the iteration argument that acts as accumulator. + unsigned iterArgPosition; + + /// The value being reduced. + Value value; +}; + +/// Returns true if `forOp' is a parallel loop. If `parallelReductions` is +/// provided, populates it with descriptors of the parallelizable reductions and +/// treats them as not preventing parallelization. +bool isLoopParallel( + AffineForOp forOp, + SmallVectorImpl *parallelReductions = nullptr); + /// Returns in `affineApplyOps`, the sequence of those AffineApplyOp /// Operations that are reachable via a search starting from `operands` and /// ending at those operands that are not the result of an AffineApplyOp. diff --git a/mlir/include/mlir/Analysis/Utils.h b/mlir/include/mlir/Analysis/Utils.h --- a/mlir/include/mlir/Analysis/Utils.h +++ b/mlir/include/mlir/Analysis/Utils.h @@ -354,9 +354,6 @@ Optional getMemoryFootprintBytes(AffineForOp forOp, int memorySpace = -1); -/// Returns true if `forOp' is a parallel loop. -bool isLoopParallel(AffineForOp forOp); - /// Simplify the integer set by simplifying the underlying affine expressions by /// flattening and some simple inference. Also, drop any duplicate constraints. /// Returns the simplified integer set. This method runs in time linear in the diff --git a/mlir/include/mlir/Dialect/Affine/Passes.td b/mlir/include/mlir/Dialect/Affine/Passes.td --- a/mlir/include/mlir/Dialect/Affine/Passes.td +++ b/mlir/include/mlir/Dialect/Affine/Passes.td @@ -123,6 +123,9 @@ Option<"maxNested", "max-nested", "unsigned", /*default=*/"-1u", "Maximum number of nested parallel loops to produce. " "Defaults to unlimited (UINT_MAX).">, + Option<"parallelReductions", "parallel-reductions", "bool", + /*default=*/"false", + "Whether to parallelize reduction loops. Defaults to false."> ]; } diff --git a/mlir/include/mlir/Dialect/Affine/Utils.h b/mlir/include/mlir/Dialect/Affine/Utils.h --- a/mlir/include/mlir/Dialect/Affine/Utils.h +++ b/mlir/include/mlir/Dialect/Affine/Utils.h @@ -24,12 +24,17 @@ class AffineIfOp; class AffineParallelOp; struct LogicalResult; +struct LoopReduction; class Operation; /// Replaces parallel affine.for op with 1-d affine.parallel op. -/// mlir::isLoopParallel detect the parallel affine.for ops. +/// mlir::isLoopParallel detects the parallel affine.for ops. +/// Parallelizes the specified reductions. Parallelization will fail in presence +/// of loop iteration arguments that are not listed in `parallelReductions`. /// There is no cost model currently used to drive this parallelization. -void affineParallelize(AffineForOp forOp); +LogicalResult +affineParallelize(AffineForOp forOp, + ArrayRef parallelReductions = {}); /// Hoists out affine.if/else to as high as possible, i.e., past all invariant /// affine.fors/parallel's. Returns success if any hoisting happened; folded` is diff --git a/mlir/lib/Analysis/AffineAnalysis.cpp b/mlir/lib/Analysis/AffineAnalysis.cpp --- a/mlir/lib/Analysis/AffineAnalysis.cpp +++ b/mlir/lib/Analysis/AffineAnalysis.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "mlir/Analysis/AffineAnalysis.h" +#include "mlir/Analysis/SliceAnalysis.h" #include "mlir/Analysis/Utils.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Affine/IR/AffineValueMap.h" @@ -21,6 +22,8 @@ #include "mlir/IR/IntegerSet.h" #include "mlir/Support/MathExtras.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -30,6 +33,131 @@ using llvm::dbgs; +/// Returns true if `value` (transitively) depends on iteration arguments of the +/// given `forOp`. +static bool dependsOnIterArgs(Value value, AffineForOp forOp) { + // Compute the backward slice of the value. + SetVector slice; + getBackwardSlice(value, &slice, + [&](Operation *op) { return !forOp->isAncestor(op); }); + + // Check that none of the operands of the operations in the backward slice are + // loop iteration arguments, and neither is the value itself. + auto argRange = forOp.getRegionIterArgs(); + llvm::SmallPtrSet iterArgs(argRange.begin(), argRange.end()); + if (iterArgs.contains(value)) + return true; + + for (Operation *op : slice) + for (Value operand : op->getOperands()) + if (iterArgs.contains(operand)) + return true; + + return false; +} + +/// Get the value that is being reduced by `pos`-th reduction in the loop if +/// such a reduction can be performed by affine parallel loops. This assumes +/// floating-point operations are commutative. On success, `kind` will be the +/// reduction kind suitable for use in affine parallel loop builder. If the +/// reduction is not supported, returns null. +static Value getSupportedReduction(AffineForOp forOp, unsigned pos, + AtomicRMWKind &kind) { + auto yieldOp = cast(forOp.getBody()->back()); + Value yielded = yieldOp.operands()[pos]; + Operation *definition = yielded.getDefiningOp(); + if (!definition) + return nullptr; + if (!forOp.getRegionIterArgs()[pos].hasOneUse()) + return nullptr; + + Optional maybeKind = + TypeSwitch>(definition) + .Case([](Operation *) { return AtomicRMWKind::addf; }) + .Case([](Operation *) { return AtomicRMWKind::mulf; }) + .Case([](Operation *) { return AtomicRMWKind::addi; }) + .Case([](Operation *) { return AtomicRMWKind::muli; }) + .Default([](Operation *) -> Optional { + // TODO: AtomicRMW supports other kinds of reductions this is + // currently not detecting, add those when the need arises. + return llvm::None; + }); + if (!maybeKind) + return nullptr; + + kind = *maybeKind; + if (definition->getOperand(0) == forOp.getRegionIterArgs()[pos] && + !dependsOnIterArgs(definition->getOperand(1), forOp)) + return definition->getOperand(1); + if (definition->getOperand(1) == forOp.getRegionIterArgs()[pos] && + !dependsOnIterArgs(definition->getOperand(0), forOp)) + return definition->getOperand(0); + + return nullptr; +} + +/// Returns true if `forOp' is a parallel loop. If `parallelReductions` is +/// provided, populates it with descriptors of the parallelizable reductions and +/// treats them as not preventing parallelization. +bool mlir::isLoopParallel(AffineForOp forOp, + SmallVectorImpl *parallelReductions) { + unsigned numIterArgs = forOp.getNumIterOperands(); + + // Loop is not parallel if it has SSA loop-carried dependences and reduction + // detection is not requested. + if (numIterArgs > 0 && !parallelReductions) + return false; + + // Find supported reductions of requested. + if (parallelReductions) { + parallelReductions->reserve(forOp.getNumIterOperands()); + for (unsigned i = 0; i < numIterArgs; ++i) { + AtomicRMWKind kind; + if (Value value = getSupportedReduction(forOp, i, kind)) + parallelReductions->emplace_back(LoopReduction{kind, i, value}); + } + + // Return later to allow for identifying all parallel reductions even if the + // loop is not parallel. + if (parallelReductions->size() != numIterArgs) + return false; + } + + // Collect all load and store ops in loop nest rooted at 'forOp'. + SmallVector loadAndStoreOpInsts; + auto walkResult = forOp.walk([&](Operation *opInst) -> WalkResult { + if (isa(opInst)) + loadAndStoreOpInsts.push_back(opInst); + else if (!isa(opInst) && + !MemoryEffectOpInterface::hasNoEffect(opInst)) + return WalkResult::interrupt(); + + return WalkResult::advance(); + }); + + // Stop early if the loop has unknown ops with side effects. + if (walkResult.wasInterrupted()) + return false; + + // Dep check depth would be number of enclosing loops + 1. + unsigned depth = getNestingDepth(forOp) + 1; + + // Check dependences between all pairs of ops in 'loadAndStoreOpInsts'. + for (auto *srcOpInst : loadAndStoreOpInsts) { + MemRefAccess srcAccess(srcOpInst); + for (auto *dstOpInst : loadAndStoreOpInsts) { + MemRefAccess dstAccess(dstOpInst); + FlatAffineConstraints dependenceConstraints; + DependenceResult result = checkMemrefAccessDependence( + srcAccess, dstAccess, depth, &dependenceConstraints, + /*dependenceComponents=*/nullptr); + if (result.value != DependenceResult::NoDependence) + return false; + } + } + return true; +} + /// Returns the sequence of AffineApplyOp Operations operation in /// 'affineApplyOps', which are reachable via a search starting from 'operands', /// and ending at operands which are not defined by AffineApplyOps. diff --git a/mlir/lib/Analysis/Utils.cpp b/mlir/lib/Analysis/Utils.cpp --- a/mlir/lib/Analysis/Utils.cpp +++ b/mlir/lib/Analysis/Utils.cpp @@ -1268,49 +1268,6 @@ }); } -/// Returns true if 'forOp' is parallel. -bool mlir::isLoopParallel(AffineForOp forOp) { - // Loop is not parallel if it has SSA loop-carried dependences. - // TODO: Conditionally support reductions and other loop-carried dependences - // that could be handled in the context of a parallel loop. - if (forOp.getNumIterOperands() > 0) - return false; - - // Collect all load and store ops in loop nest rooted at 'forOp'. - SmallVector loadAndStoreOpInsts; - auto walkResult = forOp.walk([&](Operation *opInst) -> WalkResult { - if (isa(opInst)) - loadAndStoreOpInsts.push_back(opInst); - else if (!isa(opInst) && - !MemoryEffectOpInterface::hasNoEffect(opInst)) - return WalkResult::interrupt(); - - return WalkResult::advance(); - }); - - // Stop early if the loop has unknown ops with side effects. - if (walkResult.wasInterrupted()) - return false; - - // Dep check depth would be number of enclosing loops + 1. - unsigned depth = getNestingDepth(forOp) + 1; - - // Check dependences between all pairs of ops in 'loadAndStoreOpInsts'. - for (auto *srcOpInst : loadAndStoreOpInsts) { - MemRefAccess srcAccess(srcOpInst); - for (auto *dstOpInst : loadAndStoreOpInsts) { - MemRefAccess dstAccess(dstOpInst); - FlatAffineConstraints dependenceConstraints; - DependenceResult result = checkMemrefAccessDependence( - srcAccess, dstAccess, depth, &dependenceConstraints, - /*dependenceComponents=*/nullptr); - if (result.value != DependenceResult::NoDependence) - return false; - } - } - return true; -} - IntegerSet mlir::simplifyIntegerSet(IntegerSet set) { FlatAffineConstraints fac(set); if (fac.isEmpty()) diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp --- a/mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "PassDetail.h" +#include "mlir/Analysis/AffineAnalysis.h" #include "mlir/Analysis/AffineStructures.h" #include "mlir/Analysis/LoopAnalysis.h" #include "mlir/Analysis/Utils.h" @@ -33,6 +34,15 @@ struct AffineParallelize : public AffineParallelizeBase { void runOnFunction() override; }; + +/// Descriptor of a potentially parallelizable loop. +struct ParallelizationCandidate { + ParallelizationCandidate(AffineForOp l, SmallVector &&r) + : loop(l), reductions(std::move(r)) {} + + AffineForOp loop; + SmallVector reductions; +}; } // namespace void AffineParallelize::runOnFunction() { @@ -41,14 +51,16 @@ // The walker proceeds in post-order, but we need to process outer loops first // to control the number of outer parallel loops, so push candidate loops to // the front of a deque. - std::deque parallelizableLoops; + std::deque parallelizableLoops; f.walk([&](AffineForOp loop) { - if (isLoopParallel(loop)) - parallelizableLoops.push_front(loop); + SmallVector reductions; + if (isLoopParallel(loop, parallelReductions ? &reductions : nullptr)) + parallelizableLoops.emplace_back(loop, std::move(reductions)); }); - for (AffineForOp loop : parallelizableLoops) { + for (const ParallelizationCandidate &candidate : parallelizableLoops) { unsigned numParentParallelOps = 0; + AffineForOp loop = candidate.loop; for (Operation *op = loop->getParentOp(); op != nullptr && !op->hasTrait(); op = op->getParentOp()) { @@ -56,8 +68,15 @@ ++numParentParallelOps; } - if (numParentParallelOps < maxNested) - affineParallelize(loop); + if (numParentParallelOps < maxNested) { + if (failed(affineParallelize(loop, candidate.reductions))) { + LLVM_DEBUG(llvm::dbgs() << "[" DEBUG_TYPE "] failed to parallelize\n" + << loop); + } + } else { + LLVM_DEBUG(llvm::dbgs() << "[" DEBUG_TYPE "] too many nested loops\n" + << loop); + } } } diff --git a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp --- a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp @@ -12,16 +12,16 @@ //===----------------------------------------------------------------------===// #include "PassDetail.h" +#include "mlir/Analysis/AffineAnalysis.h" #include "mlir/Analysis/LoopAnalysis.h" #include "mlir/Analysis/NestedMatcher.h" -#include "mlir/Analysis/Utils.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Affine/Utils.h" #include "mlir/Dialect/Vector/VectorOps.h" #include "mlir/Dialect/Vector/VectorUtils.h" #include "mlir/IR/BlockAndValueMapping.h" -#include "llvm/Support/Debug.h" #include "mlir/Support/LLVM.h" +#include "llvm/Support/Debug.h" using namespace mlir; using namespace vector; diff --git a/mlir/lib/Dialect/Affine/Utils/CMakeLists.txt b/mlir/lib/Dialect/Affine/Utils/CMakeLists.txt --- a/mlir/lib/Dialect/Affine/Utils/CMakeLists.txt +++ b/mlir/lib/Dialect/Affine/Utils/CMakeLists.txt @@ -6,5 +6,6 @@ LINK_LIBS PUBLIC MLIRAffine + MLIRAnalysis MLIRTransformUtils ) diff --git a/mlir/lib/Dialect/Affine/Utils/Utils.cpp b/mlir/lib/Dialect/Affine/Utils/Utils.cpp --- a/mlir/lib/Dialect/Affine/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Affine/Utils/Utils.cpp @@ -12,6 +12,8 @@ //===----------------------------------------------------------------------===// #include "mlir/Dialect/Affine/Utils.h" +#include "mlir/Analysis/AffineAnalysis.h" +#include "mlir/Analysis/Utils.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/IR/BlockAndValueMapping.h" #include "mlir/IR/BuiltinOps.h" @@ -130,8 +132,17 @@ } /// Replace affine.for with a 1-d affine.parallel and clone the former's body -/// into the latter while remapping values. -void mlir::affineParallelize(AffineForOp forOp) { +/// into the latter while remapping values. Parallelizes the specified +/// reductions. Parallelization will fail in presence of loop iteration +/// arguments that are not listed in `parallelReductions`. +LogicalResult +mlir::affineParallelize(AffineForOp forOp, + ArrayRef parallelReductions) { + // Fail early if there are iter arguments that are not reductions. + unsigned numReductions = parallelReductions.size(); + if (numReductions != forOp.getNumIterOperands()) + return failure(); + Location loc = forOp.getLoc(); OpBuilder outsideBuilder(forOp); @@ -148,7 +159,7 @@ if (needsMax || needsMin) { if (forOp->getParentOp() && !forOp->getParentOp()->hasTrait()) - return; + return failure(); identityMap = AffineMap::getMultiDimIdentityMap(1, loc->getContext()); } @@ -168,12 +179,46 @@ } // Creating empty 1-D affine.parallel op. + auto reducedValues = llvm::to_vector<4>(llvm::map_range( + parallelReductions, [](const LoopReduction &red) { return red.value; })); + auto reductionKinds = llvm::to_vector<4>(llvm::map_range( + parallelReductions, [](const LoopReduction &red) { return red.kind; })); AffineParallelOp newPloop = outsideBuilder.create( - loc, llvm::None, llvm::None, lowerBoundMap, lowerBoundOperands, - upperBoundMap, upperBoundOperands); - // Steal the body of the old affine for op and erase it. + loc, ValueRange(reducedValues).getTypes(), reductionKinds, lowerBoundMap, + lowerBoundOperands, upperBoundMap, upperBoundOperands); + // Steal the body of the old affine for op. newPloop.region().takeBody(forOp.region()); + Operation *yieldOp = &newPloop.getBody()->back(); + + // Handle the initial values of reductions because the parallel loop always + // starts from the neutral value. + SmallVector newResults; + newResults.reserve(numReductions); + for (unsigned i = 0; i < numReductions; ++i) { + Value init = forOp.getIterOperands()[i]; + // This works because we are only handling single-op reductions at the + // moment. A switch on reduction kind or a mechanism to collect operations + // participating in the reduction will be necessary for multi-op reductions. + Operation *reductionOp = yieldOp->getOperand(i).getDefiningOp(); + assert(reductionOp && "yielded value is expected to be produced by an op"); + outsideBuilder.getInsertionBlock()->getOperations().splice( + outsideBuilder.getInsertionPoint(), newPloop.getBody()->getOperations(), + reductionOp); + reductionOp->setOperands({init, newPloop->getResult(i)}); + forOp->getResult(i).replaceAllUsesWith(reductionOp->getResult(0)); + } + + // Update the loop terminator to yield reduced values bypassing the reduction + // operation itself (now moved outside of the loop) and erase the block + // arguments that correspond to reductions. Note that the loop always has one + // "main" induction variable whenc coming from a non-parallel for. + unsigned numIVs = 1; + yieldOp->setOperands(reducedValues); + newPloop.getBody()->eraseArguments( + llvm::to_vector<4>(llvm::seq(numIVs, numReductions + numIVs))); + forOp.erase(); + return success(); } // Returns success if any hoisting happened. diff --git a/mlir/test/Dialect/Affine/parallelize.mlir b/mlir/test/Dialect/Affine/parallelize.mlir --- a/mlir/test/Dialect/Affine/parallelize.mlir +++ b/mlir/test/Dialect/Affine/parallelize.mlir @@ -1,5 +1,6 @@ -// RUN: mlir-opt %s -allow-unregistered-dialect -affine-parallelize| FileCheck %s +// RUN: mlir-opt %s -allow-unregistered-dialect -affine-parallelize | FileCheck %s // RUN: mlir-opt %s -allow-unregistered-dialect -affine-parallelize='max-nested=1' | FileCheck --check-prefix=MAX-NESTED %s +// RUN: mlir-opt %s -allow-unregistered-dialect -affine-parallelize='parallel-reductions=1' | FileCheck --check-prefix=REDUCE %s // CHECK-LABEL: func @reduce_window_max() { func @reduce_window_max() { @@ -159,24 +160,34 @@ return } -// CHECK-LABEL: @unsupported_iter_args -func @unsupported_iter_args(%in: memref<10xf32>) { +// CHECK-LABEL: @iter_args +// REDUCE-LABEL: @iter_args +func @iter_args(%in: memref<10xf32>) { + // REDUCE: %[[init:.*]] = constant %cst = constant 0.000000e+00 : f32 // CHECK-NOT: affine.parallel + // REDUCE: %[[reduced:.*]] = affine.parallel (%{{.*}}) = (0) to (10) reduce ("addf") %final_red = affine.for %i = 0 to 10 iter_args(%red_iter = %cst) -> (f32) { + // REDUCE: %[[red_value:.*]] = affine.load %ld = affine.load %in[%i] : memref<10xf32> + // REDUCE-NOT: addf %add = addf %red_iter, %ld : f32 + // REDUCE: affine.yield %[[red_value]] affine.yield %add : f32 } + // REDUCE: addf %[[init]], %[[reduced]] return } -// CHECK-LABEL: @unsupported_nested_iter_args -func @unsupported_nested_iter_args(%in: memref<20x10xf32>) { +// CHECK-LABEL: @nested_iter_args +// REDUCE-LABEL: @nested_iter_args +func @nested_iter_args(%in: memref<20x10xf32>) { %cst = constant 0.000000e+00 : f32 // CHECK: affine.parallel affine.for %i = 0 to 20 { - // CHECK: affine.for + // CHECK-NOT: affine.parallel + // REDUCE: affine.parallel + // REDUCE: reduce ("addf") %final_red = affine.for %j = 0 to 10 iter_args(%red_iter = %cst) -> (f32) { %ld = affine.load %in[%i, %j] : memref<20x10xf32> %add = addf %red_iter, %ld : f32 @@ -185,3 +196,43 @@ } return } + +// REDUCE-LABEL: @strange_butterfly +func @strange_butterfly() { + %cst1 = constant 0.0 : f32 + %cst2 = constant 1.0 : f32 + // REDUCE-NOT: affine.parallel + affine.for %i = 0 to 10 iter_args(%it1 = %cst1, %it2 = %cst2) -> (f32, f32) { + %0 = addf %it1, %it2 : f32 + affine.yield %0, %0 : f32, f32 + } + return +} + +// An iter arg is used more than once. This is not a simple reduction and +// should not be parallelized. +// REDUCE-LABEL: @repeated_use +func @repeated_use() { + %cst1 = constant 0.0 : f32 + // REDUCE-NOT: affine.parallel + affine.for %i = 0 to 10 iter_args(%it1 = %cst1) -> (f32) { + %0 = addf %it1, %it1 : f32 + affine.yield %0 : f32 + } + return +} + +// An iter arg is used in the chain of operations defining the value being +// reduced, this is not a simple reduction and should not be parallelized. +// REDUCE-LABEL: @use_in_backward_slice +func @use_in_backward_slice() { + %cst1 = constant 0.0 : f32 + %cst2 = constant 1.0 : f32 + // REDUCE-NOT: affine.parallel + affine.for %i = 0 to 10 iter_args(%it1 = %cst1, %it2 = %cst2) -> (f32, f32) { + %0 = "test.some_modification"(%it2) : (f32) -> f32 + %1 = addf %it1, %0 : f32 + affine.yield %1, %1 : f32, f32 + } + return +}