diff --git a/mlir/include/mlir/Dialect/Affine/Passes.h b/mlir/include/mlir/Dialect/Affine/Passes.h
--- a/mlir/include/mlir/Dialect/Affine/Passes.h
+++ b/mlir/include/mlir/Dialect/Affine/Passes.h
@@ -36,6 +36,10 @@
 std::unique_ptr<OperationPass<FuncOp>>
 createAffineLoopInvariantCodeMotionPass();
 
+/// Creates a pass to convert all parallel affine.for's into 1-d affine.parallel
+/// ops.
+std::unique_ptr<OperationPass<FuncOp>> createAffineParallelizePass();
+
 /// Performs packing (or explicit copying) of accessed memref regions into
 /// buffers in the specified faster memory space through either pointwise copies
 /// or DMA operations.
diff --git a/mlir/include/mlir/Dialect/Affine/Passes.td b/mlir/include/mlir/Dialect/Affine/Passes.td
--- a/mlir/include/mlir/Dialect/Affine/Passes.td
+++ b/mlir/include/mlir/Dialect/Affine/Passes.td
@@ -112,6 +112,11 @@
   ];
 }
 
+def AffineParallelize : FunctionPass<"affine-parallelize"> {
+  let summary = "Convert affine.for ops into 1-D affine.parallel";
+  let constructor = "mlir::createAffineParallelizePass()";
+}
+
 def SimplifyAffineStructures : FunctionPass<"simplify-affine-structures"> {
   let summary = "Simplify affine expressions in maps/sets and normalize "
                 "memrefs";
diff --git a/mlir/include/mlir/Dialect/Affine/Utils.h b/mlir/include/mlir/Dialect/Affine/Utils.h
--- a/mlir/include/mlir/Dialect/Affine/Utils.h
+++ b/mlir/include/mlir/Dialect/Affine/Utils.h
@@ -15,9 +15,16 @@
 
 namespace mlir {
 
+class AffineForOp;
 class AffineIfOp;
+class AffineParallelOp;
 struct LogicalResult;
 
+/// Replaces `forOp` with a 1-d affine.parallel op. The caller is responsible
+/// for checking that the loop is parallel, e.g. with mlir::isLoopParallel.
+/// There is no cost model currently used to drive this parallelization.
+void affineParallelize(AffineForOp forOp);
+
 /// Hoists out affine.if/else to as high as possible, i.e., past all invariant
 /// affine.fors/parallel's. Returns success if any hoisting happened; folded` is
 /// set to true if the op was folded or erased. 
This hoisting could lead to
diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp
new file mode 100644
--- /dev/null
+++ b/mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp
@@ -0,0 +1,59 @@
+//===- AffineParallelize.cpp - AffineParallelize Pass ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a parallelizer for affine loop nests that is able to
+// perform inner or outer loop parallelization.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PassDetail.h"
+#include "mlir/Analysis/AffineStructures.h"
+#include "mlir/Analysis/LoopAnalysis.h"
+#include "mlir/Analysis/Utils.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
+#include "mlir/Dialect/Affine/Passes.h"
+#include "mlir/Dialect/Affine/Passes.h.inc"
+#include "mlir/Dialect/Affine/Utils.h"
+#include "mlir/Transforms/LoopUtils.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "affine-parallel"
+
+using namespace mlir;
+
+namespace {
+/// Converts the parallel affine.for ops of a function into 1-D affine.parallel
+/// ops. Loops whose bound maps have more than one result (min/max bounds) are
+/// left untouched.
+struct AffineParallelize : public AffineParallelizeBase<AffineParallelize> {
+  void runOnFunction() override;
+};
+} // namespace
+
+void AffineParallelize::runOnFunction() {
+  FuncOp f = getFunction();
+  // Collect the candidates first and rewrite them afterwards so that the IR is
+  // not mutated while it is being walked.
+  SmallVector<AffineForOp, 8> parallelizableLoops;
+  f.walk([&](AffineForOp loop) {
+    // affineParallelize forwards the bound maps unchanged and builds a 1-D op,
+    // which has room for a single lower and a single upper bound expression.
+    if (loop.getLowerBoundMap().getNumResults() != 1 ||
+        loop.getUpperBoundMap().getNumResults() != 1)
+      return;
+    if (isLoopParallel(loop))
+      parallelizableLoops.push_back(loop);
+  });
+  for (AffineForOp loop : parallelizableLoops)
+    affineParallelize(loop);
+}
+
+std::unique_ptr<OperationPass<FuncOp>> mlir::createAffineParallelizePass() {
+  return std::make_unique<AffineParallelize>();
+}
diff --git a/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt
--- a/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_mlir_dialect_library(MLIRAffineTransforms
   AffineDataCopyGeneration.cpp
   AffineLoopInvariantCodeMotion.cpp
+  AffineParallelize.cpp
   LoopTiling.cpp
   LoopUnroll.cpp
   LoopUnrollAndJam.cpp
@@ -17,6 +18,7 @@
 
   LINK_LIBS PUBLIC
   MLIRAffineOps
+  MLIRAffineUtils
   MLIREDSC
   MLIRIR
   MLIRPass
diff --git a/mlir/lib/Dialect/Affine/Utils/Utils.cpp b/mlir/lib/Dialect/Affine/Utils/Utils.cpp
--- a/mlir/lib/Dialect/Affine/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/Utils.cpp
@@ -129,6 +129,30 @@
   return hoistedIfOp;
 }
 
+/// Replaces `forOp` with a 1-d affine.parallel op that has the same bounds and
+/// step. The affine.parallel op is created right before `forOp`, the body
+/// region of `forOp` is moved (not cloned) into it, and `forOp` is erased, so
+/// `forOp` must not be used after this call. No legality check is performed
+/// here: the caller must ensure that the loop is parallel (e.g. via
+/// mlir::isLoopParallel) and that each bound map has a single result, since
+/// the maps are forwarded unchanged.
+void mlir::affineParallelize(AffineForOp forOp) {
+  Location loc = forOp.getLoc();
+  OpBuilder outsideBuilder(forOp);
+  // Forward the step explicitly: the bounds-only builder has no way to know
+  // it, so an affine.for with a non-unit step would otherwise silently change
+  // its iteration space.
+  int64_t step = forOp.getStep();
+  // Creating empty 1-D affine.parallel op.
+  AffineParallelOp newPloop = outsideBuilder.create<AffineParallelOp>(
+      loc, forOp.getLowerBoundMap(), forOp.getLowerBoundOperands(),
+      forOp.getUpperBoundMap(), forOp.getUpperBoundOperands(),
+      llvm::makeArrayRef(step));
+  // Steal the body of the old affine for op and erase it.
+  newPloop.region().takeBody(forOp.region());
+  forOp.erase();
+}
+
 // Returns success if any hoisting happened.
LogicalResult mlir::hoistAffineIfOp(AffineIfOp ifOp, bool *folded) { // Apply canonicalization patterns and folding - this is necessary for the diff --git a/mlir/test/Dialect/Affine/parallelism-detection.mlir b/mlir/test/Dialect/Affine/parallelism-detection.mlir deleted file mode 100644 --- a/mlir/test/Dialect/Affine/parallelism-detection.mlir +++ /dev/null @@ -1,47 +0,0 @@ -// RUN: mlir-opt -allow-unregistered-dialect %s -test-detect-parallel -split-input-file -verify-diagnostics | FileCheck %s - -// CHECK-LABEL: func @loop_nest_3d_outer_two_parallel -func @loop_nest_3d_outer_two_parallel(%N : index) { - %0 = alloc() : memref<1024 x 1024 x vector<64xf32>> - %1 = alloc() : memref<1024 x 1024 x vector<64xf32>> - %2 = alloc() : memref<1024 x 1024 x vector<64xf32>> - affine.for %i = 0 to %N { - // expected-remark@-1 {{parallel loop}} - affine.for %j = 0 to %N { - // expected-remark@-1 {{parallel loop}} - affine.for %k = 0 to %N { - // expected-remark@-1 {{sequential loop}} - %5 = affine.load %0[%i, %k] : memref<1024x1024xvector<64xf32>> - %6 = affine.load %1[%k, %j] : memref<1024x1024xvector<64xf32>> - %7 = affine.load %2[%i, %j] : memref<1024x1024xvector<64xf32>> - %8 = mulf %5, %6 : vector<64xf32> - %9 = addf %7, %8 : vector<64xf32> - affine.store %9, %2[%i, %j] : memref<1024x1024xvector<64xf32>> - } - } - } - return -} - -// ----- - -// CHECK-LABEL: unknown_op_conservative -func @unknown_op_conservative() { - affine.for %i = 0 to 10 { - // expected-remark@-1 {{sequential loop}} - "unknown"() : () -> () - } - return -} - -// ----- - -// CHECK-LABEL: non_affine_load -func @non_affine_load() { - %0 = alloc() : memref<100 x f32> - affine.for %i = 0 to 100 { - // expected-remark@-1 {{sequential loop}} - load %0[%i] : memref<100 x f32> - } - return -} diff --git a/mlir/test/Dialect/Affine/parallelize.mlir b/mlir/test/Dialect/Affine/parallelize.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/Affine/parallelize.mlir @@ -0,0 +1,118 @@ +// RUN: mlir-opt %s 
-allow-unregistered-dialect -affine-parallelize| FileCheck %s + +// For multiple nested for-loops. +// CHECK-DAG: [[MAP5:#map[0-9]+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0 + d1, d2 * 2 + d3, d4 * 2 + d5, d6 + d7)> +// CHECK-LABEL: func @reduce_window_max() { +func @reduce_window_max() { + %cst = constant 0.000000e+00 : f32 + %0 = alloc() : memref<1x8x8x64xf32> + %1 = alloc() : memref<1x18x18x64xf32> + affine.for %arg0 = 0 to 1 { + affine.for %arg1 = 0 to 8 { + affine.for %arg2 = 0 to 8 { + affine.for %arg3 = 0 to 64 { + affine.store %cst, %0[%arg0, %arg1, %arg2, %arg3] : memref<1x8x8x64xf32> + } + } + } + } + affine.for %arg0 = 0 to 1 { + affine.for %arg1 = 0 to 8 { + affine.for %arg2 = 0 to 8 { + affine.for %arg3 = 0 to 64 { + affine.for %arg4 = 0 to 1 { + affine.for %arg5 = 0 to 3 { + affine.for %arg6 = 0 to 3 { + affine.for %arg7 = 0 to 1 { + %2 = affine.load %0[%arg0, %arg1, %arg2, %arg3] : memref<1x8x8x64xf32> + %3 = affine.load %1[%arg0 + %arg4, %arg1 * 2 + %arg5, %arg2 * 2 + %arg6, %arg3 + %arg7] : memref<1x18x18x64xf32> + %4 = cmpf "ogt", %2, %3 : f32 + %5 = select %4, %2, %3 : f32 + affine.store %5, %0[%arg0, %arg1, %arg2, %arg3] : memref<1x8x8x64xf32> + } + } + } + } + } + } + } + } + return +} + +// CHECK: %[[cst:.*]] = constant 0.000000e+00 : f32 +// CHECK: %[[v0:.*]] = alloc() : memref<1x8x8x64xf32> +// CHECK: %[[v1:.*]] = alloc() : memref<1x18x18x64xf32> +// CHECK: affine.parallel (%[[arg0:.*]]) = (0) to (1) { +// CHECK: affine.parallel (%[[arg1:.*]]) = (0) to (8) { +// CHECK: affine.parallel (%[[arg2:.*]]) = (0) to (8) { +// CHECK: affine.parallel (%[[arg3:.*]]) = (0) to (64) { +// CHECK: affine.store %[[cst]], %[[v0]][%[[arg0]], %[[arg1]], %[[arg2]], %[[arg3]]] : memref<1x8x8x64xf32> +// CHECK: } +// CHECK: } +// CHECK: } +// CHECK: } +// CHECK: affine.parallel (%[[a0:.*]]) = (0) to (1) { +// CHECK: affine.parallel (%[[a1:.*]]) = (0) to (8) { +// CHECK: affine.parallel (%[[a2:.*]]) = (0) to (8) { +// CHECK: affine.parallel 
(%[[a3:.*]]) = (0) to (64) { +// CHECK: affine.parallel (%[[a4:.*]]) = (0) to (1) { +// CHECK: affine.for %[[a5:.*]] = 0 to 3 { +// CHECK: affine.for %[[a6:.*]] = 0 to 3 { +// CHECK: affine.parallel (%[[a7:.*]]) = (0) to (1) { +// CHECK: %[[lhs:.*]] = affine.load %[[v0]][%[[a0]], %[[a1]], %[[a2]], %[[a3]]] : memref<1x8x8x64xf32> +// CHECK: %[[rhs:.*]] = affine.load %[[v1]][%[[a0]] + %[[a4]], %[[a1]] * 2 + %[[a5]], %[[a2]] * 2 + %[[a6]], %[[a3]] + %[[a7]]] : memref<1x18x18x64xf32> +// CHECK: %[[res:.*]] = cmpf "ogt", %[[lhs]], %[[rhs]] : f32 +// CHECK: %[[sel:.*]] = select %[[res]], %[[lhs]], %[[rhs]] : f32 +// CHECK: affine.store %[[sel]], %[[v0]][%[[a0]], %[[a1]], %[[a2]], %[[a3]]] : memref<1x8x8x64xf32> +// CHECK: } +// CHECK: } +// CHECK: } +// CHECK: } +// CHECK: } +// CHECK: } +// CHECK: } +// CHECK: } +// CHECK: } + +func @loop_nest_3d_outer_two_parallel(%N : index) { + %0 = alloc() : memref<1024 x 1024 x vector<64xf32>> + %1 = alloc() : memref<1024 x 1024 x vector<64xf32>> + %2 = alloc() : memref<1024 x 1024 x vector<64xf32>> + affine.for %i = 0 to %N { + affine.for %j = 0 to %N { + %7 = affine.load %2[%i, %j] : memref<1024x1024xvector<64xf32>> + affine.for %k = 0 to %N { + %5 = affine.load %0[%i, %k] : memref<1024x1024xvector<64xf32>> + %6 = affine.load %1[%k, %j] : memref<1024x1024xvector<64xf32>> + %8 = mulf %5, %6 : vector<64xf32> + %9 = addf %7, %8 : vector<64xf32> + affine.store %9, %2[%i, %j] : memref<1024x1024xvector<64xf32>> + } + } + } + return +} + +// CHECK: affine.parallel (%[[arg1:.*]]) = (0) to (symbol(%arg0)) { +// CHECK-NEXT: affine.parallel (%[[arg2:.*]]) = (0) to (symbol(%arg0)) { +// CHECK: affine.for %[[arg3:.*]] = 0 to %arg0 { + +// CHECK-LABEL: unknown_op_conservative +func @unknown_op_conservative() { + affine.for %i = 0 to 10 { +// CHECK: affine.for %[[arg1:.*]] = 0 to 10 { + "unknown"() : () -> () + } + return +} + +// CHECK-LABEL: non_affine_load +func @non_affine_load() { + %0 = alloc() : memref<100 x f32> + affine.for %i = 0 to 
100 { +// CHECK: affine.for %{{.*}} = 0 to 100 { + load %0[%i] : memref<100 x f32> + } + return +} diff --git a/mlir/test/lib/Dialect/Affine/CMakeLists.txt b/mlir/test/lib/Dialect/Affine/CMakeLists.txt --- a/mlir/test/lib/Dialect/Affine/CMakeLists.txt +++ b/mlir/test/lib/Dialect/Affine/CMakeLists.txt @@ -3,7 +3,6 @@ TestAffineDataCopy.cpp TestAffineLoopUnswitching.cpp TestLoopPermutation.cpp - TestParallelismDetection.cpp TestVectorizationUtils.cpp EXCLUDE_FROM_LIBMLIR diff --git a/mlir/test/lib/Dialect/Affine/TestParallelismDetection.cpp b/mlir/test/lib/Dialect/Affine/TestParallelismDetection.cpp deleted file mode 100644 --- a/mlir/test/lib/Dialect/Affine/TestParallelismDetection.cpp +++ /dev/null @@ -1,47 +0,0 @@ -//===- ParallelismDetection.cpp - Parallelism Detection pass ------------*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements a pass to detect parallel affine 'affine.for' ops. -// -//===----------------------------------------------------------------------===// - -#include "mlir/Analysis/Utils.h" -#include "mlir/Dialect/Affine/IR/AffineOps.h" -#include "mlir/IR/Builders.h" -#include "mlir/Pass/Pass.h" - -using namespace mlir; - -namespace { - -struct TestParallelismDetection - : public PassWrapper { - void runOnFunction() override; -}; - -} // end anonymous namespace - -// Walks the function and emits a note for all 'affine.for' ops detected as -// parallel. 
-void TestParallelismDetection::runOnFunction() { - FuncOp f = getFunction(); - OpBuilder b(f.getBody()); - f.walk([&](AffineForOp forOp) { - if (isLoopParallel(forOp)) - forOp.emitRemark("parallel loop"); - else - forOp.emitRemark("sequential loop"); - }); -} - -namespace mlir { -void registerTestParallelismDetection() { - PassRegistration pass( - "test-detect-parallel", "Test parallelism detection "); -} -} // namespace mlir diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp --- a/mlir/tools/mlir-opt/mlir-opt.cpp +++ b/mlir/tools/mlir-opt/mlir-opt.cpp @@ -62,7 +62,6 @@ void registerTestMemRefDependenceCheck(); void registerTestMemRefStrideCalculation(); void registerTestOpaqueLoc(); -void registerTestParallelismDetection(); void registerTestPreparationPassWithAllowedMemrefResults(); void registerTestGpuParallelLoopMappingPass(); void registerTestSCFUtilsPass(); @@ -137,7 +136,6 @@ registerTestMemRefDependenceCheck(); registerTestMemRefStrideCalculation(); registerTestOpaqueLoc(); - registerTestParallelismDetection(); registerTestPreparationPassWithAllowedMemrefResults(); registerTestGpuParallelLoopMappingPass(); registerTestSCFUtilsPass();