diff --git a/mlir/include/mlir/Dialect/SCF/Transforms/Passes.h b/mlir/include/mlir/Dialect/SCF/Transforms/Passes.h --- a/mlir/include/mlir/Dialect/SCF/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/SCF/Transforms/Passes.h @@ -37,7 +37,7 @@ /// Creates a pass that transforms a single ParallelLoop over N induction /// variables into another ParallelLoop over less than N induction variables. -std::unique_ptr createParallelLoopCollapsingPass(); +std::unique_ptr createTestSCFParallelLoopCollapsingPass(); /// Creates a loop fusion pass which fuses parallel loops. std::unique_ptr createParallelLoopFusionPass(); diff --git a/mlir/include/mlir/Dialect/SCF/Transforms/Passes.td b/mlir/include/mlir/Dialect/SCF/Transforms/Passes.td --- a/mlir/include/mlir/Dialect/SCF/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/SCF/Transforms/Passes.td @@ -50,9 +50,38 @@ let constructor = "mlir::createParallelLoopFusionPass()"; } -def SCFParallelLoopCollapsing : Pass<"scf-parallel-loop-collapsing"> { - let summary = "Collapse parallel loops to use less induction variables"; - let constructor = "mlir::createParallelLoopCollapsingPass()"; +def TestSCFParallelLoopCollapsing : Pass<"test-scf-parallel-loop-collapsing"> { + let summary = "Test parallel loops collapsing transformation"; + let constructor = "mlir::createTestSCFParallelLoopCollapsingPass()"; + let description = [{ + This pass is purely for testing the scf::collapseParallelLoops + transformation. The transformation does not have opinions on how a + parallel loop should be collapsed, so this pass is structured for the + common case on GPUs of collapsing to a 3d parallel loop. 3 lists can be + provided to collapsed-indices-{0,1,2} to represent how the loop should be + collapsed and must reference evrey iterator in the original parallel loop. + + ```mlir + # Before: + scf.parallel (%arg0, %arg1) + = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) { + "test.sink"(%5, %3) : (index, index) -> () + scf.yield + } + + # After: + scf.parallel (%arg0) = (%c0) to (%c4) step (%c1) { + %0 = arith.remsi %arg0, %c2 : index + %1 = arith.divsi %arg0, %c2 : index + %2 = arith.muli %0, %c7 : index + %3 = arith.addi %2, %c3 : index + %4 = arith.muli %1, %c7 : index + %5 = arith.addi %4, %c3 : index + "test.sink"(%5, %3) : (index, index) -> () + } + ``` + }]; + let options = [ ListOption<"clCollapsedIndices0", "collapsed-indices-0", "unsigned", "Which loop indices to combine 0th loop index">, diff --git a/mlir/lib/Dialect/SCF/Transforms/ParallelLoopCollapsing.cpp b/mlir/lib/Dialect/SCF/Transforms/ParallelLoopCollapsing.cpp --- a/mlir/lib/Dialect/SCF/Transforms/ParallelLoopCollapsing.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/ParallelLoopCollapsing.cpp @@ -11,11 +11,12 @@ #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/SCF/Utils/Utils.h" #include "mlir/Transforms/RegionUtils.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" namespace mlir { -#define GEN_PASS_DEF_SCFPARALLELLOOPCOLLAPSING +#define GEN_PASS_DEF_TESTSCFPARALLELLOOPCOLLAPSING #include "mlir/Dialect/SCF/Transforms/Passes.h.inc" } // namespace mlir @@ -24,27 +25,84 @@ using namespace mlir; namespace { -struct ParallelLoopCollapsing - : public impl::SCFParallelLoopCollapsingBase { +struct TestSCFParallelLoopCollapsing + : public impl::TestSCFParallelLoopCollapsingBase< + TestSCFParallelLoopCollapsing> { void runOnOperation() override { Operation *module = getOperation(); + // The common case for GPU dialect will be simplifying the ParallelOp to 3 + // arguments, so we do that here to simplify things. + llvm::SmallVector, 3> combinedLoops; + + // Gather the input args into the format required by + // `collapseParallelLoops`. + if (!clCollapsedIndices0.empty()) + combinedLoops.push_back(clCollapsedIndices0); + if (!clCollapsedIndices1.empty()) { + if (clCollapsedIndices0.empty()) { + llvm::errs() + << "collapsed-indices-1 specified but not collapsed-indices-0"; + signalPassFailure(); + return; + } + combinedLoops.push_back(clCollapsedIndices1); + } + if (!clCollapsedIndices2.empty()) { + if (clCollapsedIndices1.empty()) { + llvm::errs() + << "collapsed-indices-2 specified but not collapsed-indices-1"; + signalPassFailure(); + return; + } + combinedLoops.push_back(clCollapsedIndices2); + } + + if (combinedLoops.empty()) { + llvm::errs() << "No collapsed-indices were specified. This pass is only " + "for testing and does not automatically collapse all " + "parallel loops or similar."; + signalPassFailure(); + return; + } + + // Confirm that the specified loops are [0,N) by testing that N values exist + // with the maximum value being N-1. + llvm::SmallSet flattenedCombinedLoops; + unsigned maxCollapsedIndex = 0; + for (auto &loops : combinedLoops) { + for (auto &loop : loops) { + flattenedCombinedLoops.insert(loop); + maxCollapsedIndex = std::max(maxCollapsedIndex, loop); + } + } + + if (maxCollapsedIndex != flattenedCombinedLoops.size() - 1 || + !flattenedCombinedLoops.contains(maxCollapsedIndex)) { + llvm::errs() + << "collapsed-indices arguments must include all values [0,N)."; + signalPassFailure(); + return; + } + + // Only apply the transformation on parallel loops where the specified + // transformation is valid, but do NOT early abort in the case of invalid + // loops. module->walk([&](scf::ParallelOp op) { - // The common case for GPU dialect will be simplifying the ParallelOp to 3 - // arguments, so we do that here to simplify things. - llvm::SmallVector, 3> combinedLoops; - if (!clCollapsedIndices0.empty()) - combinedLoops.push_back(clCollapsedIndices0); - if (!clCollapsedIndices1.empty()) - combinedLoops.push_back(clCollapsedIndices1); - if (!clCollapsedIndices2.empty()) - combinedLoops.push_back(clCollapsedIndices2); + if (flattenedCombinedLoops.size() != op.getNumLoops()) { + op.emitOpError("has ") + << op.getNumLoops() + << " iter args while this limited functionality testing pass was " + "configured only for loops with exactly " + << flattenedCombinedLoops.size() << " iter args."; + return; + } collapseParallelLoops(op, combinedLoops); }); } }; } // namespace -std::unique_ptr mlir::createParallelLoopCollapsingPass() { - return std::make_unique(); +std::unique_ptr mlir::createTestSCFParallelLoopCollapsingPass() { + return std::make_unique(); } diff --git a/mlir/test/Transforms/invalid-parallel-loop-collapsing.mlir b/mlir/test/Transforms/invalid-parallel-loop-collapsing.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Transforms/invalid-parallel-loop-collapsing.mlir @@ -0,0 +1,34 @@ +// First test various sets of invalid arguments +// RUN: not mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(test-scf-parallel-loop-collapsing))' 2>&1 | FileCheck %s --check-prefix=CL0 +// CL0: No collapsed-indices were specified. This pass is only for testing and does not automatically collapse all parallel loops or similar + +// RUN: not mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(test-scf-parallel-loop-collapsing{collapsed-indices-1=1}))' 2>&1 | FileCheck %s --check-prefix=CL1 +// CL1: collapsed-indices-1 specified but not collapsed-indices-0 + +// RUN: not mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(test-scf-parallel-loop-collapsing{collapsed-indices-0=1 collapsed-indices-2=2}))' 2>&1 | FileCheck %s --check-prefix=CL2 +// CL2: collapsed-indices-2 specified but not collapsed-indices-1 + +// RUN: not mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(test-scf-parallel-loop-collapsing{collapsed-indices-0=1 collapsed-indices-1=2}))' 2>&1 | FileCheck %s --check-prefix=NON-ZERO +// NON-ZERO: collapsed-indices arguments must include all values [0,N). + +// RUN: not mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(test-scf-parallel-loop-collapsing{collapsed-indices-0=0 collapsed-indices-1=2}))' 2>&1 | FileCheck %s --check-prefix=NON-CONTIGUOUS +// NON-CONTIGUOUS: collapsed-indices arguments must include all values [0,N). + + +// Then test for invalid combinations of argument+input-ir +// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(test-scf-parallel-loop-collapsing{collapsed-indices-0=0,1}))' -verify-diagnostics +func.func @too_few_iters(%arg0: index, %arg1: index, %arg2: index) { + // expected-error @+1 {{op has 1 iter args while this limited functionality testing pass was configured only for loops with exactly 2 iter args.}} + scf.parallel (%arg3) = (%arg0) to (%arg1) step (%arg2) { + scf.yield + } + return +} + +func.func @too_many_iters(%arg0: index, %arg1: index, %arg2: index) { + // expected-error @+1 {{op has 3 iter args while this limited functionality testing pass was configured only for loops with exactly 2 iter args.}} + scf.parallel (%arg3, %arg4, %arg5) = (%arg0, %arg0, %arg0) to (%arg1, %arg1, %arg1) step (%arg2, %arg2, %arg2) { + scf.yield + } + return +} diff --git a/mlir/test/Transforms/parallel-loop-collapsing.mlir b/mlir/test/Transforms/parallel-loop-collapsing.mlir --- a/mlir/test/Transforms/parallel-loop-collapsing.mlir +++ b/mlir/test/Transforms/parallel-loop-collapsing.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(scf-parallel-loop-collapsing{collapsed-indices-0=0,3 collapsed-indices-1=1,4 collapsed-indices-2=2}, canonicalize))' | FileCheck %s +// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(test-scf-parallel-loop-collapsing{collapsed-indices-0=0,3 collapsed-indices-1=1,4 collapsed-indices-2=2}, canonicalize))' | FileCheck %s // CHECK-LABEL: func @parallel_many_dims() { func.func @parallel_many_dims() { diff --git a/mlir/test/Transforms/single-parallel-loop-collapsing.mlir b/mlir/test/Transforms/single-parallel-loop-collapsing.mlir --- a/mlir/test/Transforms/single-parallel-loop-collapsing.mlir +++ b/mlir/test/Transforms/single-parallel-loop-collapsing.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(scf-parallel-loop-collapsing{collapsed-indices-0=0,1}, canonicalize))' | FileCheck %s +// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(test-scf-parallel-loop-collapsing{collapsed-indices-0=0,1}, canonicalize))' | FileCheck %s func.func @collapse_to_single() { %c0 = arith.constant 3 : index