diff --git a/mlir/include/mlir/Dialect/Affine/Analysis/AffineAnalysis.h b/mlir/include/mlir/Dialect/Affine/Analysis/AffineAnalysis.h
--- a/mlir/include/mlir/Dialect/Affine/Analysis/AffineAnalysis.h
+++ b/mlir/include/mlir/Dialect/Affine/Analysis/AffineAnalysis.h
@@ -53,8 +53,10 @@
     SmallVectorImpl<LoopReduction> *parallelReductions = nullptr);
 
 /// Returns true if `forOp' doesn't have memory dependences preventing
-/// parallelization. This function doesn't check iter_args and should be used
-/// only as a building block for full parallel-checking functions.
+/// parallelization. Memrefs that are allocated inside `forOp` do not impact its
+/// dependences and parallelism. This function does not check iter_args (for
+/// values other than memref types) and should be used only as a building block
+/// for complete parallelism-checking functions.
 bool isLoopMemoryParallel(AffineForOp forOp);
 
 /// Returns in `affineApplyOps`, the sequence of those AffineApplyOp
diff --git a/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp
--- a/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp
+++ b/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp
@@ -17,12 +17,10 @@
 #include "mlir/Dialect/Affine/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
-#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/IR/AffineExprVisitor.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/IntegerSet.h"
-#include "mlir/Support/MathExtras.h"
-#include "llvm/ADT/DenseMap.h"
+#include "mlir/Interfaces/ViewLikeInterface.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
@@ -114,18 +112,51 @@
   return isLoopMemoryParallel(forOp);
 }
 
-/// Returns true if `forOp' doesn't have memory dependences preventing
-/// parallelization. This function doesn't check iter_args and should be used
-/// only as a building block for full parallel-checking functions.
+/// Returns true if `op` is an alloc-like op, i.e., one allocating memrefs.
+static bool isAllocLikeOp(Operation *op) {
+  auto memEffects = dyn_cast<MemoryEffectOpInterface>(op);
+  return memEffects && memEffects.hasEffect<MemoryEffects::Allocate>();
+}
+
+/// Returns true if `v` is allocated locally to `enclosingOp` -- i.e., it is
+/// allocated by an operation nested within `enclosingOp`.
+static bool isLocallyDefined(Value v, Operation *enclosingOp) {
+  Operation *defOp = v.getDefiningOp();
+  if (!defOp)
+    return false;
+
+  if (isAllocLikeOp(defOp) && enclosingOp->isProperAncestor(defOp))
+    return true;
+
+  // Aliasing ops.
+  auto viewOp = dyn_cast<ViewLikeOpInterface>(defOp);
+  return viewOp && isLocallyDefined(viewOp.getViewSource(), enclosingOp);
+}
+
 bool mlir::isLoopMemoryParallel(AffineForOp forOp) {
+  // Any memref-typed iteration arguments are treated as serializing.
+  if (llvm::any_of(forOp.getResultTypes(),
+                   [](Type type) { return type.isa<BaseMemRefType>(); }))
+    return false;
+
   // Collect all load and store ops in loop nest rooted at 'forOp'.
   SmallVector<Operation *, 8> loadAndStoreOps;
   auto walkResult = forOp.walk([&](Operation *op) -> WalkResult {
-    if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op))
-      loadAndStoreOps.push_back(op);
-    else if (!isa<AffineForOp, AffineYieldOp, AffineIfOp>(op) &&
-             !MemoryEffectOpInterface::hasNoEffect(op))
+    if (auto readOp = dyn_cast<AffineReadOpInterface>(op)) {
+      // Memrefs that are allocated inside `forOp` need not be considered.
+      if (!isLocallyDefined(readOp.getMemRef(), forOp))
+        loadAndStoreOps.push_back(op);
+    } else if (auto writeOp = dyn_cast<AffineWriteOpInterface>(op)) {
+      // Filter out stores the same way as above.
+      if (!isLocallyDefined(writeOp.getMemRef(), forOp))
+        loadAndStoreOps.push_back(op);
+    } else if (!isa<AffineForOp, AffineYieldOp, AffineIfOp>(op) &&
+               !isAllocLikeOp(op) &&
+               !MemoryEffectOpInterface::hasNoEffect(op)) {
+      // Alloc-like ops inside `forOp` are fine (they don't impact parallelism)
+      // as long as they don't escape the loop (which has been checked above).
       return WalkResult::interrupt();
+    }
 
     return WalkResult::advance();
   });
diff --git a/mlir/test/Dialect/Affine/parallelize.mlir b/mlir/test/Dialect/Affine/parallelize.mlir
--- a/mlir/test/Dialect/Affine/parallelize.mlir
+++ b/mlir/test/Dialect/Affine/parallelize.mlir
@@ -269,3 +269,57 @@
   }
   return
 }
+
+// Test in the presence of locally allocated memrefs.
+
+// CHECK: func @local_alloc
+func @local_alloc() {
+  %cst = arith.constant 0.0 : f32
+  affine.for %i = 0 to 100 {
+    %m = memref.alloc() : memref<1xf32>
+    %ma = memref.alloca() : memref<1xf32>
+    affine.store %cst, %m[0] : memref<1xf32>
+  }
+  // CHECK: affine.parallel
+  return
+}
+
+// CHECK: func @local_alloc_cast
+func @local_alloc_cast() {
+  %cst = arith.constant 0.0 : f32
+  affine.for %i = 0 to 100 {
+    %m = memref.alloc() : memref<128xf32>
+    affine.for %j = 0 to 128 {
+      affine.store %cst, %m[%j] : memref<128xf32>
+    }
+    affine.for %j = 0 to 128 {
+      affine.store %cst, %m[0] : memref<128xf32>
+    }
+    %r = memref.reinterpret_cast %m to offset: [0], sizes: [8, 16],
+           strides: [16, 1] : memref<128xf32> to memref<8x16xf32>
+    affine.for %j = 0 to 8 {
+      affine.store %cst, %r[%j, %j] : memref<8x16xf32>
+    }
+  }
+  // CHECK:      affine.parallel
+  // CHECK:        affine.parallel
+  // CHECK:        }
+  // CHECK:        affine.for
+  // CHECK:        }
+  // CHECK:        affine.parallel
+  // CHECK:        }
+  // CHECK:      }
+
+  return
+}
+
+// CHECK-LABEL: @iter_arg_memrefs
+func @iter_arg_memrefs(%in: memref<10xf32>) {
+  %mi = memref.alloc() : memref<f32>
+  // Loop-carried memrefs are treated as serializing the loop.
+  // CHECK: affine.for
+  %mo = affine.for %i = 0 to 10 iter_args(%m_arg = %mi) -> (memref<f32>) {
+    affine.yield %m_arg : memref<f32>
+  }
+  return
+}
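
For illustration, a minimal sketch of what this change enables (hypothetical
names @scratchpad, %buf, %out; assuming the same -affine-parallelize pass that
parallelize.mlir exercises). Previously, the memref.alloc below made the walk
in isLoopMemoryParallel() interrupt, since it carries a
MemoryEffects::Allocate effect and is neither an affine load nor store, so the
loop was conservatively kept serial. With this patch, the alloc and all
accesses to the local buffer are filtered out, leaving only the store to
%out[%i], which carries no loop-carried dependence:

  func @scratchpad(%out: memref<100xf32>) {
    %cst = arith.constant 1.0 : f32
    affine.for %i = 0 to 100 {
      // Allocated inside the loop: ignored by the dependence check.
      %buf = memref.alloc() : memref<1xf32>
      affine.store %cst, %buf[0] : memref<1xf32>
      %v = affine.load %buf[0] : memref<1xf32>
      // Each iteration writes a distinct element of %out.
      affine.store %v, %out[%i] : memref<100xf32>
    }
    return
  }

Running mlir-opt with -affine-parallelize over such input should now rewrite
the affine.for into an affine.parallel, mirroring the @local_alloc test above.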