diff --git a/mlir/include/mlir/Dialect/Affine/Analysis/AffineAnalysis.h b/mlir/include/mlir/Dialect/Affine/Analysis/AffineAnalysis.h
--- a/mlir/include/mlir/Dialect/Affine/Analysis/AffineAnalysis.h
+++ b/mlir/include/mlir/Dialect/Affine/Analysis/AffineAnalysis.h
@@ -55,7 +55,8 @@
 /// Returns true if `forOp' doesn't have memory dependences preventing
 /// parallelization. This function doesn't check iter_args and should be used
-/// only as a building block for full parallel-checking functions.
+/// only as a building block for full parallel-checking functions. Memrefs that
+/// are allocated inside `forOp` do not impact its dependences and parallelism.
 bool isLoopMemoryParallel(AffineForOp forOp);
 
 /// Returns in `affineApplyOps`, the sequence of those AffineApplyOp
diff --git a/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp
--- a/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp
+++ b/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp
@@ -17,13 +17,12 @@
 #include "mlir/Dialect/Affine/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
-#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/IR/AffineExprVisitor.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/IntegerSet.h"
 #include "mlir/Support/MathExtras.h"
-#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
@@ -115,6 +114,40 @@
   return isLoopMemoryParallel(forOp);
 }
 
+/// Returns true if `op` is an alloc-like op, i.e., one allocating memrefs.
+static bool isAllocLikeOp(Operation *op) {
+  auto memEffects = dyn_cast<MemoryEffectOpInterface>(op);
+  return memEffects && memEffects.hasEffect<MemoryEffects::Allocate>();
+}
+
+/// Returns true if `v` is allocated locally to `enclosingOp` -- i.e., it is
+/// allocated by an operation nested within `enclosingOp`.
+static bool isLocallyDefined(Value v, Operation *enclosingOp) {
+  Operation *defOp = v.getDefiningOp();
+  if (!defOp)
+    return false;
+
+  if (isAllocLikeOp(defOp) && enclosingOp->isProperAncestor(defOp))
+    return true;
+
+  // Aliasing ops.
+  // TODO: cover all known ops via an appropriate op interface. Common ones
+  // listed here for now.
+  if (auto castOp = dyn_cast<memref::CollapseShapeOp>(defOp))
+    return isLocallyDefined(castOp.src(), enclosingOp);
+  if (auto castOp = dyn_cast<memref::ExpandShapeOp>(defOp))
+    return isLocallyDefined(castOp.src(), enclosingOp);
+  if (auto castOp = dyn_cast<memref::CastOp>(defOp))
+    return isLocallyDefined(castOp.source(), enclosingOp);
+  if (auto castOp = dyn_cast<memref::SubViewOp>(defOp))
+    return isLocallyDefined(castOp.source(), enclosingOp);
+  if (auto castOp = dyn_cast<memref::ViewOp>(defOp))
+    return isLocallyDefined(castOp.source(), enclosingOp);
+  if (auto castOp = dyn_cast<memref::ReinterpretCastOp>(defOp))
+    return isLocallyDefined(castOp.source(), enclosingOp);
+  return false;
+}
+
 /// Returns true if `forOp' doesn't have memory dependences preventing
 /// parallelization. This function doesn't check iter_args and should be used
 /// only as a building block for full parallel-checking functions.
@@ -122,11 +155,25 @@
   // Collect all load and store ops in loop nest rooted at 'forOp'.
   SmallVector<Operation *, 8> loadAndStoreOps;
   auto walkResult = forOp.walk([&](Operation *op) -> WalkResult {
-    if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op))
-      loadAndStoreOps.push_back(op);
-    else if (!isa<AffineForOp, AffineYieldOp, AffineIfOp>(op) &&
-             !MemoryEffectOpInterface::hasNoEffect(op))
+    if (auto readOp = dyn_cast<AffineReadOpInterface>(op)) {
+      // Memrefs that are allocated inside `forOp` need not be considered. Note
+      // that such local memrefs may still escape or could be passed around as
+      // iter args but checking for that is, strictly speaking, the concern of
+      // `isLoopParallel`: it's not a memory dependence.
+      if (!isLocallyDefined(readOp.getMemRef(), forOp))
+        loadAndStoreOps.push_back(op);
+    } else if (auto writeOp = dyn_cast<AffineWriteOpInterface>(op)) {
+      // Filter out stores the same way as above.
+      if (!isLocallyDefined(writeOp.getMemRef(), forOp))
+        loadAndStoreOps.push_back(op);
+    } else if (!isa<AffineForOp, AffineYieldOp, AffineIfOp>(op) &&
+               !isAllocLikeOp(op) &&
+               !MemoryEffectOpInterface::hasNoEffect(op)) {
+      // Alloc-like ops inside `forOp` are fine (they don't impact parallelism)
+      // as long as they don't escape the loop. When they escape or cross loop
+      // iterations, it's still not a memory dependence.
       return WalkResult::interrupt();
+    }
     return WalkResult::advance();
   });
diff --git a/mlir/test/Dialect/Affine/parallelize.mlir b/mlir/test/Dialect/Affine/parallelize.mlir
--- a/mlir/test/Dialect/Affine/parallelize.mlir
+++ b/mlir/test/Dialect/Affine/parallelize.mlir
@@ -269,3 +269,46 @@
   }
   return
 }
+
+// Test in the presence of locally allocated memrefs.
+
+// CHECK: func @local_alloc
+func @local_alloc() {
+  %cst = arith.constant 0.0 : f32
+  affine.for %i = 0 to 100 {
+    %m = memref.alloc() : memref<1xf32>
+    %ma = memref.alloca() : memref<1xf32>
+    affine.store %cst, %m[0] : memref<1xf32>
+  }
+  // CHECK: affine.parallel
+  return
+}
+
+// CHECK: func @local_alloc_cast
+func @local_alloc_cast() {
+  %cst = arith.constant 0.0 : f32
+  affine.for %i = 0 to 100 {
+    %m = memref.alloc() : memref<128xf32>
+    affine.for %j = 0 to 128 {
+      affine.store %cst, %m[%j] : memref<128xf32>
+    }
+    affine.for %j = 0 to 128 {
+      affine.store %cst, %m[0] : memref<128xf32>
+    }
+    %r = memref.reinterpret_cast %m to offset: [0], sizes: [8, 16],
+      strides: [16, 1] : memref<128xf32> to memref<8x16xf32>
+    affine.for %j = 0 to 8 {
+      affine.store %cst, %r[%j, %j] : memref<8x16xf32>
+    }
+  }
+  // CHECK: affine.parallel
+  // CHECK: affine.parallel
+  // CHECK: }
+  // CHECK: affine.for
+  // CHECK: }
+  // CHECK: affine.parallel
+  // CHECK: }
+  // CHECK: }
+
+  return
+}