diff --git a/mlir/include/mlir/Dialect/Async/Passes.td b/mlir/include/mlir/Dialect/Async/Passes.td
--- a/mlir/include/mlir/Dialect/Async/Passes.td
+++ b/mlir/include/mlir/Dialect/Async/Passes.td
@@ -27,9 +27,9 @@
            "int32_t", /*default=*/"8",
            "The number of available workers to execute async operations.">,
-    Option<"targetBlockSize", "target-block-size",
+    Option<"minTaskSize", "min-task-size",
            "int32_t", /*default=*/"1000",
-           "The target block size for sharding parallel operation.">
+           "The minimum task size for sharding parallel operation.">
   ];
 
   let dependentDialects = ["async::AsyncDialect", "scf::SCFDialect"];
 }
diff --git a/mlir/lib/Dialect/Async/Transforms/AsyncParallelFor.cpp b/mlir/lib/Dialect/Async/Transforms/AsyncParallelFor.cpp
--- a/mlir/lib/Dialect/Async/Transforms/AsyncParallelFor.cpp
+++ b/mlir/lib/Dialect/Async/Transforms/AsyncParallelFor.cpp
@@ -92,10 +92,10 @@
   AsyncParallelForPass() = default;
 
   AsyncParallelForPass(bool asyncDispatch, int32_t numWorkerThreads,
-                       int32_t targetBlockSize) {
+                       int32_t minTaskSize) {
     this->asyncDispatch = asyncDispatch;
     this->numWorkerThreads = numWorkerThreads;
-    this->targetBlockSize = targetBlockSize;
+    this->minTaskSize = minTaskSize;
   }
 
   void runOnOperation() override;
@@ -104,9 +104,9 @@
 struct AsyncParallelForRewrite : public OpRewritePattern<scf::ParallelOp> {
 public:
   AsyncParallelForRewrite(MLIRContext *ctx, bool asyncDispatch,
-                          int32_t numWorkerThreads, int32_t targetBlockSize)
+                          int32_t numWorkerThreads, int32_t minTaskSize)
       : OpRewritePattern(ctx), asyncDispatch(asyncDispatch),
-        numWorkerThreads(numWorkerThreads), targetBlockSize(targetBlockSize) {}
+        numWorkerThreads(numWorkerThreads), minTaskSize(minTaskSize) {}
 
   LogicalResult matchAndRewrite(scf::ParallelOp op,
                                 PatternRewriter &rewriter) const override;
@@ -114,7 +114,7 @@
 private:
   bool asyncDispatch;
   int32_t numWorkerThreads;
-  int32_t targetBlockSize;
+  int32_t minTaskSize;
 };
 
 struct ParallelComputeFunctionType {
@@ -564,7 +564,7 @@
 // Dispatch parallel compute functions by submitting all async compute tasks
 // from a simple for loop in the caller thread.
 static void
-doSequantialDispatch(ImplicitLocOpBuilder &b, PatternRewriter &rewriter,
+doSequentialDispatch(ImplicitLocOpBuilder &b, PatternRewriter &rewriter,
                      ParallelComputeFunction &parallelComputeFunction,
                      scf::ParallelOp op, Value blockSize, Value blockCount,
                      const SmallVector<Value> &tripCounts) {
@@ -684,15 +684,15 @@
       std::max(1, static_cast<int>(numWorkerThreads * overshardingFactor)));
 
   // Target block size from the pass parameters.
-  Value targetComputeBlock = b.create<ConstantIndexOp>(targetBlockSize);
+  Value minTaskSizeCst = b.create<ConstantIndexOp>(minTaskSize);
 
   // Compute parallel block size from the parallel problem size:
   //   blockSize = min(tripCount,
   //                   max(ceil_div(tripCount, maxComputeBlocks),
-  //                       targetComputeBlock))
+  //                       ceil_div(minTaskSize, bodySize)))
   Value bs0 = b.create<SignedCeilDivIOp>(tripCount, maxComputeBlocks);
-  Value bs1 = b.create<CmpIOp>(CmpIPredicate::sge, bs0, targetComputeBlock);
-  Value bs2 = b.create<SelectOp>(bs1, bs0, targetComputeBlock);
+  Value bs1 = b.create<CmpIOp>(CmpIPredicate::sge, bs0, minTaskSizeCst);
+  Value bs2 = b.create<SelectOp>(bs1, bs0, minTaskSizeCst);
   Value bs3 = b.create<CmpIOp>(CmpIPredicate::sle, tripCount, bs2);
   Value blockSize0 = b.create<SelectOp>(bs3, tripCount, bs2);
   Value blockCount0 = b.create<SignedCeilDivIOp>(tripCount, blockSize0);
@@ -712,7 +712,7 @@
     doAsyncDispatch(b, rewriter, parallelComputeFunction, op, blockSize,
                     blockCount, tripCounts);
   } else {
-    doSequantialDispatch(b, rewriter, parallelComputeFunction, op, blockSize,
+    doSequentialDispatch(b, rewriter, parallelComputeFunction, op, blockSize,
                          blockCount, tripCounts);
   }
 
@@ -733,7 +733,7 @@
 
   RewritePatternSet patterns(ctx);
   patterns.add<AsyncParallelForRewrite>(ctx, asyncDispatch, numWorkerThreads,
-                                        targetBlockSize);
+                                        minTaskSize);
 
   if (failed(applyPatternsAndFoldGreedily(getOperation(), std::move(patterns))))
     signalPassFailure();
@@ -743,9 +743,9 @@
   return std::make_unique<AsyncParallelForPass>();
 }
 
-std::unique_ptr<Pass>
-mlir::createAsyncParallelForPass(bool asyncDispatch, int32_t numWorkerThreads,
-                                 int32_t targetBlockSize) {
+std::unique_ptr<Pass> mlir::createAsyncParallelForPass(bool asyncDispatch,
+                                                       int32_t numWorkerThreads,
+                                                       int32_t minTaskSize) {
   return std::make_unique<AsyncParallelForPass>(asyncDispatch, numWorkerThreads,
-                                                targetBlockSize);
+                                                minTaskSize);
 }
diff --git a/mlir/test/Integration/Dialect/Async/CPU/test-async-parallel-for-1d.mlir b/mlir/test/Integration/Dialect/Async/CPU/test-async-parallel-for-1d.mlir
--- a/mlir/test/Integration/Dialect/Async/CPU/test-async-parallel-for-1d.mlir
+++ b/mlir/test/Integration/Dialect/Async/CPU/test-async-parallel-for-1d.mlir
@@ -31,7 +31,7 @@
 
 // RUN:   mlir-opt %s -async-parallel-for="async-dispatch=false \
 // RUN:                                    num-workers=20 \
-// RUN:                                    target-block-size=1" \
+// RUN:                                    min-task-size=1" \
 // RUN:               -async-to-async-runtime \
 // RUN:               -async-runtime-ref-counting \
 // RUN:               -async-runtime-ref-counting-opt \
diff --git a/mlir/test/Integration/Dialect/Async/CPU/test-async-parallel-for-2d.mlir b/mlir/test/Integration/Dialect/Async/CPU/test-async-parallel-for-2d.mlir
--- a/mlir/test/Integration/Dialect/Async/CPU/test-async-parallel-for-2d.mlir
+++ b/mlir/test/Integration/Dialect/Async/CPU/test-async-parallel-for-2d.mlir
@@ -29,7 +29,7 @@
 
 // RUN:   mlir-opt %s -async-parallel-for="async-dispatch=false \
 // RUN:                                    num-workers=20 \
-// RUN:                                    target-block-size=1" \
+// RUN:                                    min-task-size=1" \
 // RUN:               -async-to-async-runtime \
 // RUN:               -async-runtime-ref-counting \
 // RUN:               -async-runtime-ref-counting-opt \