diff --git a/mlir/include/mlir/Dialect/Async/Passes.h b/mlir/include/mlir/Dialect/Async/Passes.h --- a/mlir/include/mlir/Dialect/Async/Passes.h +++ b/mlir/include/mlir/Dialect/Async/Passes.h @@ -19,9 +19,9 @@ std::unique_ptr createAsyncParallelForPass(); -std::unique_ptr createAsyncParallelForPass(bool asyncDispatch, - int32_t numWorkerThreads, - int32_t targetBlockSize); +std::unique_ptr +createAsyncParallelForPass(bool asyncDispatch, int32_t numWorkerThreads, + bool useCostModel, int32_t minTaskSize = 512 * 1024); std::unique_ptr> createAsyncToAsyncRuntimePass(); diff --git a/mlir/include/mlir/Dialect/Async/Passes.td b/mlir/include/mlir/Dialect/Async/Passes.td --- a/mlir/include/mlir/Dialect/Async/Passes.td +++ b/mlir/include/mlir/Dialect/Async/Passes.td @@ -29,7 +29,12 @@ Option<"minTaskSize", "min-task-size", "int32_t", /*default=*/"1000", - "The minimum task size for sharding parallel operation."> + "The minimum task size for sharding parallel operation.">, + + Option<"useCostModel", "use-cost-model", + "bool", /*default=*/"false", + "Use experimental cost model to determine the parallelism granularity."> + ]; let dependentDialects = [ diff --git a/mlir/lib/Dialect/Async/Transforms/AsyncParallelFor.cpp b/mlir/lib/Dialect/Async/Transforms/AsyncParallelFor.cpp --- a/mlir/lib/Dialect/Async/Transforms/AsyncParallelFor.cpp +++ b/mlir/lib/Dialect/Async/Transforms/AsyncParallelFor.cpp @@ -10,6 +10,7 @@ // //===----------------------------------------------------------------------===// +#include "CostModel.h" #include "PassDetail.h" #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h" #include "mlir/Dialect/Async/IR/Async.h" @@ -93,9 +94,10 @@ AsyncParallelForPass() = default; AsyncParallelForPass(bool asyncDispatch, int32_t numWorkerThreads, - int32_t minTaskSize) { + bool useCostModel, int32_t minTaskSize) { this->asyncDispatch = asyncDispatch; this->numWorkerThreads = numWorkerThreads; + this->useCostModel = useCostModel; this->minTaskSize = minTaskSize; } @@ -115,6 +117,7 @@ private: bool asyncDispatch; int32_t numWorkerThreads; + bool useCostModel; int32_t minTaskSize; }; @@ -672,6 +675,11 @@ auto dispatch = [&](OpBuilder &nestedBuilder, Location loc) { ImplicitLocOpBuilder nb(loc, nestedBuilder); + // Create a parallel compute function that takes a block id and computes the + // parallel operation body for a subset of iteration space. + ParallelComputeFunction parallelComputeFunction = + createParallelComputeFunction(op, rewriter); + // With large number of threads the value of creating many compute blocks // is reduced because the problem typically becomes memory bound. For small // number of threads it helps with stragglers. @@ -693,16 +701,20 @@ // blockSize = min(tripCount, // max(ceil_div(tripCount, maxComputeBlocks), // ceil_div(minTaskSize, bodySize))) + Value bodySize; + if (useCostModel) { + CostModel helper(b, *op); + Cost cost = helper.estimateCost(op.getLoopBody()); + bodySize = helper.costToNanoseconds(cost); + } else { + bodySize = b.create(32); + } Value bs0 = b.create(tripCount, maxComputeBlocks); - Value bs1 = b.create(bs0, minTaskSizeCst); - Value blockSize = b.create(tripCount, bs1); + Value bs1 = b.create(minTaskSizeCst, bodySize); + Value bs2 = b.create(bs0, bs1); + Value blockSize = b.create(tripCount, bs2); Value blockCount = b.create(tripCount, blockSize); - // Create a parallel compute function that takes a block id and computes the - // parallel operation body for a subset of iteration space. - ParallelComputeFunction parallelComputeFunction = - createParallelComputeFunction(op, rewriter); - // Dispatch parallel compute function using async recursive work splitting, // or by submitting compute task sequentially from a caller thread. if (asyncDispatch) { @@ -742,7 +754,8 @@ std::unique_ptr mlir::createAsyncParallelForPass(bool asyncDispatch, int32_t numWorkerThreads, + bool useCostModel, int32_t minTaskSize) { return std::make_unique(asyncDispatch, numWorkerThreads, - minTaskSize); + useCostModel, minTaskSize); } diff --git a/mlir/lib/Dialect/Async/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Async/Transforms/CMakeLists.txt --- a/mlir/lib/Dialect/Async/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Async/Transforms/CMakeLists.txt @@ -3,6 +3,7 @@ AsyncRuntimeRefCounting.cpp AsyncRuntimeRefCountingOpt.cpp AsyncToAsyncRuntime.cpp + CostModel.cpp PassDetail.cpp ADDITIONAL_HEADER_DIRS @@ -15,10 +16,13 @@ MLIRArithmetic MLIRAsync MLIRIR + MLIRMath + MLIRMemref MLIRPass MLIRSCF MLIRSCFToStandard MLIRStandard MLIRTransforms MLIRTransformUtils + MLIRVector ) diff --git a/mlir/lib/Dialect/Async/Transforms/CostModel.h b/mlir/lib/Dialect/Async/Transforms/CostModel.h new file mode 100644 --- /dev/null +++ b/mlir/lib/Dialect/Async/Transforms/CostModel.h @@ -0,0 +1,78 @@ +//===- CostModel.h - Declaration of the cost model. --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===//// +// +// This file declares a cost model that drives deciding the parallelization +// granularity when lowering parallel operations into async tasks. +// +//===----------------------------------------------------------------------===// + +#ifndef DIALECT_ASYNC_TRANSFORMS_COSTMODEL_H_ +#define DIALECT_ASYNC_TRANSFORMS_COSTMODEL_H_ + +#include "mlir/IR/ImplicitLocOpBuilder.h" + +namespace mlir { + +// Cost measuring unit. +// +// For CPU roughly corresponds to cycles. For RAM roughly corresponds to bytes. +using InverseThroughput = Value; + +// Approximate cost of executing an op on a modern CPU. +// +// Encapsulates the leaf nodes of the IR that is being emitted as the cost +// estimation progresses. The leaf nodes correspond to CPU and RAM +// inverse-throughputs. +class Cost { +private: + InverseThroughput ram; + InverseThroughput cpu; + ImplicitLocOpBuilder builder; + +public: + Cost(ImplicitLocOpBuilder &builder, size_t ramCost = 0, size_t cpuCost = 0); + + Cost(ImplicitLocOpBuilder &builder, InverseThroughput &ramCost, + InverseThroughput &cpuCost); + + InverseThroughput &getRAM(); + InverseThroughput &getCPU(); + + Cost &operator*=(const size_t &rhs); + Cost operator*(const size_t &rhs); + Cost &operator+=(const Cost &rhs); + Cost operator+(const Cost &rhs); +}; + +// Estimates execution time for an op on a modern CPU. +// +// Errs on the lower end, but is not strictly a lower bound estimate. Targeting +// being within an order of magnitude of the correct value. +class CostModel { +private: + ImplicitLocOpBuilder builder; + Operation &rootOp; + +public: + CostModel(ImplicitLocOpBuilder &builder, Operation &rootOp) + : builder(builder), rootOp(rootOp) {} + + ImplicitLocOpBuilder &getBuilder(); + bool isBeforeRoot(Operation &op); + bool isBeforeRoot(Value &value); + Optional getIterations(Value lowerBound, Value upperBound, Value step); + Cost newCost(size_t ramCost, size_t cpuCost); + Cost newCost(Value ramCost, Value cpuCost); + Cost zeroCost(); + Cost estimateCost(Operation &op); + Cost estimateCost(Region ®ion); + Value costToNanoseconds(Cost cost); +}; + +} // namespace mlir +#endif // DIALECT_ASYNC_TRANSFORMS_COSTMODEL_H_ diff --git a/mlir/lib/Dialect/Async/Transforms/CostModel.cpp b/mlir/lib/Dialect/Async/Transforms/CostModel.cpp new file mode 100644 --- /dev/null +++ b/mlir/lib/Dialect/Async/Transforms/CostModel.cpp @@ -0,0 +1,232 @@ +//===- CostModel.cpp - Implementation of the cost model. --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a cost model that drives deciding the parallelization +// granularity when lowering parallel operations into async tasks. +// +//===----------------------------------------------------------------------===// + +#include "CostModel.h" +#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h" +#include "mlir/Dialect/Math/IR/Math.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/SCF/SCF.h" +#include "mlir/Dialect/Vector/VectorOps.h" + +namespace mlir { + +Cost::Cost(ImplicitLocOpBuilder &builder, size_t ramCost, size_t cpuCost) + : builder(builder), ram(builder.create(ramCost)), + cpu(builder.create(cpuCost)) {} + +Cost::Cost(ImplicitLocOpBuilder &builder, InverseThroughput &ramCost, + InverseThroughput &cpuCost) + : builder(builder), ram(ramCost), cpu(cpuCost) {} + +InverseThroughput &Cost::getRAM() { return ram; } +InverseThroughput &Cost::getCPU() { return cpu; } + +Cost &Cost::operator*=(const size_t &rhs) { + auto factor = builder.create(rhs); + auto scale = [&](InverseThroughput resource) { + return builder.create(resource, factor); + }; + ram = scale(ram); + cpu = scale(cpu); + return *this; +} + +Cost Cost::operator*(const size_t &rhs) { + Cost ret = *this; + ret *= rhs; + return ret; +} + +Cost &Cost::operator+=(const Cost &rhs) { + ram = builder.create(ram, rhs.ram); + cpu = builder.create(cpu, rhs.cpu); + return *this; // return the result by reference +} + +Cost Cost::operator+(const Cost &rhs) { + Cost ret = *this; + ret += rhs; + return ret; +} + +Cost estimateCostVector(CostModel &helper, Operation &op) { + + auto newCost = [&](Value value) { + if (auto type = value.getType().dyn_cast()) { + if (type.hasStaticShape()) { + return helper.newCost(/* ram */ type.getNumElements() * + type.getElementTypeBitWidth() / 8, + /* cpu */ 0); + } + } + return helper.zeroCost(); + }; + + if (auto loadOp = dyn_cast(op)) { + return newCost(loadOp.getResult()); + } + + if (auto storeOp = dyn_cast(op)) { + return newCost(storeOp.getOperand(0)); + } + + return helper.zeroCost(); +} + +Cost estimateCostMemref(CostModel &helper, Operation &op) { + + auto newCost = [&](Value value) { + return helper.newCost(/* ram */ value.getType().getIntOrFloatBitWidth() / 8, + /* cpu */ 0); + }; + + if (auto loadOp = dyn_cast(op)) { + return newCost(loadOp.getResult()); + } + + if (auto storeOp = dyn_cast(op)) { + return newCost(storeOp.getOperand(0)); + } + + return helper.zeroCost(); +} + +Cost estimateCostArith(CostModel &helper, Operation &op) { + + if (isa(op)) { + return helper.newCost(/* ram */ 0, /* cpu */ 3); + } + + return helper.newCost(/* ram */ 0, /* cpu */ 1); +} + +Cost estimateCostMath(CostModel &helper, Operation &op) { + + if (isa(op)) { + return helper.newCost(/* ram */ 0, /* cpu */ 100); + } + + return helper.newCost(/* ram */ 0, /* cpu */ 1); +} + +Cost estimateCostSCF(CostModel &helper, Operation &op) { + + auto scaleCost = [&](Value iterations, Cost cost) { + return helper.newCost( + helper.getBuilder().create(cost.getRAM(), iterations), + helper.getBuilder().create(cost.getCPU(), iterations)); + }; + + if (auto forOp = dyn_cast(op)) { + Cost cost = helper.estimateCost(forOp.getLoopBody()); + if (auto iterations = helper.getIterations( + forOp.lowerBound(), forOp.upperBound(), forOp.step())) { + return scaleCost(*iterations, cost); + } + return cost; + } + + if (auto parallelOp = dyn_cast(op)) { + Cost cost = helper.estimateCost(parallelOp.getLoopBody()); + for (auto &inductionVariableDomain : llvm::enumerate( + llvm::zip(parallelOp.lowerBound(), parallelOp.upperBound(), + parallelOp.step()))) { + Value lb, ub, step; + std::tie(lb, ub, step) = inductionVariableDomain.value(); + + if (auto iterations = helper.getIterations(lb, ub, step)) { + cost = scaleCost(*iterations, cost); + } + } + return cost; + } + + return helper.zeroCost(); +} + +ImplicitLocOpBuilder &CostModel::getBuilder() { return builder; } + +bool CostModel::isBeforeRoot(Operation &op) { + return op.getBlock() == rootOp.getBlock() && op.isBeforeInBlock(&rootOp); +} + +bool CostModel::isBeforeRoot(Value &value) { + return isBeforeRoot(*value.getDefiningOp()); +} + +Optional CostModel::getIterations(Value lowerBound, Value upperBound, + Value step) { + if (isBeforeRoot(lowerBound) && isBeforeRoot(upperBound) && + isBeforeRoot(step)) { + return (Value)builder.create( + builder.create(upperBound, lowerBound), step); + } + return llvm::None; +} + +Cost CostModel::newCost(size_t ramCost, size_t cpuCost) { + return Cost(builder, ramCost, cpuCost); +} + +Cost CostModel::newCost(Value ramCost, Value cpuCost) { + return Cost(builder, ramCost, cpuCost); +} + +Cost CostModel::zeroCost() { return Cost(builder, 0, 0); } + +Cost CostModel::estimateCost(Operation &op) { + MLIRContext *ctx = op.getContext(); + if (op.getDialect() == ctx->getLoadedDialect("scf")) { + return estimateCostSCF(*this, op); + } + if (op.getDialect() == ctx->getLoadedDialect("memref")) { + return estimateCostMemref(*this, op); + } + if (op.getDialect() == ctx->getLoadedDialect("vector")) { + return estimateCostVector(*this, op); + } + if (op.getDialect() == ctx->getLoadedDialect("arith")) { + return estimateCostArith(*this, op); + } + if (op.getDialect() == ctx->getLoadedDialect("math")) { + return estimateCostMath(*this, op); + } + return zeroCost(); +} + +Cost CostModel::estimateCost(Region ®ion) { + Cost cost = zeroCost(); + for (auto &op : region.getOps()) { + cost += estimateCost(op); + } + return cost; +} + +Value CostModel::costToNanoseconds(Cost cost) { + // Assume that RAM throughput is 16 GB/s and that CPU runs at 3 GHz. + auto ramRuntime = builder.create( + cost.getRAM(), builder.create(16)); + auto cpuRuntime = builder.create( + cost.getCPU(), builder.create(3)); + auto max = [&](Value x, Value y) { + return builder.create(x, y); + }; + return max(max(ramRuntime, cpuRuntime), + builder.create(1)); +} + +} // namespace mlir diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -2014,6 +2014,8 @@ ":Async", ":AsyncPassIncGen", ":IR", + ":MathDialect", + ":MemRefDialect", ":Pass", ":SCFDialect", ":SCFToStandard", @@ -2022,6 +2024,7 @@ ":TransformUtils", ":Transforms", ":TransformsPassIncGen", + ":VectorOps", "//llvm:Core", "//llvm:Support", ],