A producer loop that is a reduction into a scalar value is inserted into the consumer as a single iteration. The producer loop needs to be inserted with its original bounds to be correct.
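For illustration, a minimal sketch of the kind of input affected (this is not the patch's test case; the names, shapes, and bounds are made up): the producer accumulates into a scalar held in a memref, and the consumer reads the accumulated value.

```mlir
// Hypothetical input: the producer reduces %in into %sum (a scalar held in a
// memref<1xf32>), and the consumer reads the accumulated value.
affine.for %i = 0 to 128 {
  %v   = affine.load %in[%i] : memref<128xf32>
  %acc = affine.load %sum[0] : memref<1xf32>
  %add = arith.addf %acc, %v : f32
  affine.store %add, %sum[0] : memref<1xf32>
}
affine.for %j = 0 to 128 {
  %r = affine.load %sum[0] : memref<1xf32>
  %o = arith.mulf %r, %r : f32
  affine.store %o, %out[%j] : memref<128xf32>
}
```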
Before I go for an in-depth review, one question on the approach:
If the fusion was incorrect, was the dependence analysis at fault? If that's the case, the dependence analysis needs to be enhanced as a generic solution. I see some pattern matching being done, and that made me wonder whether matching is a robust enough solution.
@vinayaka-polymage Thanks for taking a look! Perhaps I should have worded it differently. The result of the entire fusion is what I am referring to here. While it is a valid fusion candidate, what is incorrect is not using the entire slice bounds. A similar fixing of bounds happens when there is a read-read dependency; see https://github.com/llvm/llvm-project/blob/3e6ba89055c80e6360b7605464520711b30084a6/mlir/lib/Dialect/Affine/Analysis/Utils.cpp#L1107. My change is along those lines and keeps the original bounds when the producer is a memref-based reduction. The pattern check verifies that the producer is a memref-based reduction, as suggested in https://discourse.llvm.org/t/detecting-reduction-loops-in-affine-dialect/2527.
Thanks @sumesh13 for the clarification. Sorry for the late response here; I am unavailable (traveling) until the 12th. I will check this thread after that.
I would like to help, but I'm having a hard time guessing what the wrong code looks like and what this is actually fixing. Something that could help, and that is very common in LLVM, is to pre-commit a test matching the wrong output and then introduce the fix, so that we can clearly see how the test evolves. Could you please do that? It would be very helpful.
Thanks,
Diego
@dcaballe Thanks for the suggestion. I have added the example that shows the incorrect fusion. You can see how the initialization of the reduction is merged into the reduction body because the bounds of the slice are set to 1.
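Roughly, and assuming the hypothetical input sketched above (again, not the actual test case added in the patch), the bad slice bounds produce something along these lines, with the producer reduction inserted as a single iteration rather than with its original range:

```mlir
// Hedged sketch of the incorrect fused form: the producer's bounds collapse
// to a single iteration inside the consumer, so only one element of the
// reduction is computed per consumer iteration instead of the full range.
affine.for %j = 0 to 128 {
  affine.for %i = 0 to 1 {   // wrong: should keep the original 0 to 128 bounds
    %v   = affine.load %in[%i] : memref<128xf32>
    %acc = affine.load %sum[0] : memref<1xf32>
    %add = arith.addf %acc, %v : f32
    affine.store %add, %sum[0] : memref<1xf32>
  }
  %r = affine.load %sum[0] : memref<1xf32>
  %o = arith.mulf %r, %r : f32
  affine.store %o, %out[%j] : memref<128xf32>
}
```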
@dcaballe Sorry I couldn't ping about this earlier; I was on vacation myself. Did you need anything else for the review?
I think we need feedback from @bondhugula or @vinayaka-polymage. IIRC, some of the utilities that this patch modifies are used in different places and I think they assume that reductions are represented by SSA values through iter_args. @bondhugula had some thoughts about converting reductions back and forth from SSA to memory representation based on the requirements of each analysis/transformation. I'm missing some context here.
I'm also wondering if we should use affine analysis to detect memory reductions instead of just pattern-matching the IR; that might be less error-prone.
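For context on the representation question: a reduction carried as SSA values through iter_args looks roughly like the snippet below, whereas the case this patch targets keeps the accumulator in a memref (as in the sketches above), which is why it has to be pattern-matched rather than read off the loop's results.

```mlir
// SSA form of the same reduction: the accumulator is carried via iter_args,
// so utilities that assume this form can see the reduction directly in the
// loop's region arguments and results.
%sum = affine.for %i = 0 to 128 iter_args(%acc = %init) -> (f32) {
  %v   = affine.load %in[%i] : memref<128xf32>
  %add = arith.addf %acc, %v : f32
  affine.yield %add : f32
}
```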