This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contentst

-
mlir/
-
lib/Dialect/Affine/Transforms/
-
Dialect/
-
Affine/
-
Transforms/
4/4
AffineLoopNormalize.cpp
-
test/Dialect/Affine/
-
Dialect/
-
Affine/
-
affine-loop-normalize.mlir

Differential D92233

[MLIR][Affine] Add affine.for normalization support
ClosedPublic

Authored by navdeepkk on Nov 27 2020, 7:57 AM.

Download Raw Diff

Details

Reviewers

bondhugula
nicolasvasilache

Commits

rGdc930e5f2f91: [MLIR][Affine] Add affine.for normalization support

Summary

Add support to normalize affine.for ops i.e., convert the lower bound to zero
and loop step to one. The Upper bound is set to the trip count of the loop.
The exact value of loopIV is calculated just inside the body of affine.for.
Currently loops with lower bounds having single result are supported. No such
restriction exists on upper bounds.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

navdeepkk created this revision.Nov 27 2020, 7:57 AM

Herald added a project: Restricted Project. · View Herald TranscriptNov 27 2020, 7:57 AM

Herald added subscribers: teijeong, rdzhabarov, tatianashp and 14 others. · View Herald Transcript

navdeepkk requested review of this revision.Nov 27 2020, 7:57 AM

Herald added a reviewer: nicolasvasilache. · View Herald TranscriptNov 27 2020, 7:57 AM

Herald added subscribers: stephenneuendorffer, nicolasvasilache. · View Herald Transcript

Harbormaster completed remote builds in B80351: Diff 308060.Nov 27 2020, 8:17 AM

This looks great! Thanks.

mlir/lib/Dialect/Affine/Transforms/AffineLoopNormalize.cpp
82–88	has only a single iteration
84	Please add a line to say what `normalized` is.
84	-> with a single result.
97	You may want to mention that loops with a max lower bound can't be normalized without additional support (like `affine.execute_region`'s). (Actually those loops whose max's could be hoisted to the outermost position - outside any loops - could still be normalized.) You can just mention this in a comment.

This revision is now accepted and ready to land.Dec 3 2020, 5:49 AM

Address comments on diff 308060

Harbormaster completed remote builds in B81220: Diff 309760.Dec 5 2020, 10:41 PM

bondhugula accepted this revision.Dec 7 2020, 1:26 AM

Closed by commit rGdc930e5f2f91: [MLIR][Affine] Add affine.for normalization support (authored by navdeepkk, committed by bondhugula). · Explain WhyDec 7 2020, 8:38 AM

This revision was automatically updated to reflect the committed changes.

bondhugula added a commit: rGdc930e5f2f91: [MLIR][Affine] Add affine.for normalization support.

Hey guys,
I found a problem with this pass; if you feed it the following loop:

func @main(%arg0: memref<10xf32>, %x: f32) {
  %c0 = constant 0 : index
  %c10 = constant 0 : index
  affine.for %i = affine_map<(d0) -> (d0)>(%c0) to affine_map<(d0) -> (d0)>(%c10) {
    affine.store %x, %arg0[%i] : memref<10xf32>
  }
  return
}

then it will be collapsed to a zero-iteration loop:

#map0 = affine_map<(d0, d1) -> (0)>
#map1 = affine_map<(d0, d1) -> (d1 + d0)>
module  {
  func @main(%arg0: memref<10xf32>, %arg1: f32) {
    %c0 = constant 0 : index
    %c0_0 = constant 0 : index
    affine.for %arg2 = 0 to #map0(%c0_0, %c0) {
      %0 = affine.apply #map1(%c0_0, %arg2)
      affine.store %arg1, %arg0[%0] : memref<10xf32>
    }
    return
  }
}

As a workaround it is possible to run canonicalization before normalization.
Should I report it somewhere like bugzilla (it doesn't seem to be very active though)?

In D92233#2488280, @sgrechanik wrote:
Hey guys,
I found a problem with this pass; if you feed it the following loop:
func @main(%arg0: memref<10xf32>, %x: f32) {
  %c0 = constant 0 : index
  %c10 = constant 0 : index
  affine.for %i = affine_map<(d0) -> (d0)>(%c0) to affine_map<(d0) -> (d0)>(%c10) {
    affine.store %x, %arg0[%i] : memref<10xf32>
  }
  return
}
then it will be collapsed to a zero-iteration loop:
#map0 = affine_map<(d0, d1) -> (0)>
#map1 = affine_map<(d0, d1) -> (d1 + d0)>
module  {
  func @main(%arg0: memref<10xf32>, %arg1: f32) {
    %c0 = constant 0 : index
    %c0_0 = constant 0 : index
    affine.for %arg2 = 0 to #map0(%c0_0, %c0) {
      %0 = affine.apply #map1(%c0_0, %arg2)
      affine.store %arg1, %arg0[%0] : memref<10xf32>
    }
    return
  }
}
As a workaround it is possible to run canonicalization before normalization.
Should I report it somewhere like bugzilla (it doesn't seem to be very active though)?

Hi sgrechanik,
Thanks for pointing this pointing this out. What you are saying is correct, but your test case needs a slight correction. %c10 needs to be 10 not 0. Right now we can use --canoniclaize before running this pass. Although I will try to incorporate this in the pass itself.

Revision Contents

Path

Size

mlir/

lib/

Dialect/

Affine/

Transforms/

AffineLoopNormalize.cpp

98 lines

test/

Dialect/

Affine/

affine-loop-normalize.mlir

159 lines

Diff 309930

mlir/lib/Dialect/Affine/Transforms/AffineLoopNormalize.cpp

Show First 20 Lines • Show All 73 Lines • ▼ Show 20 Lines	void mlir::normalizeAffineParallel(AffineParallelOp op) {
auto newLowerMap = AffineMap::get(		auto newLowerMap = AffineMap::get(
/dimCount=/0, /symbolCount=/0, lbExprs, op.getContext());		/dimCount=/0, /symbolCount=/0, lbExprs, op.getContext());
op.setLowerBounds({}, newLowerMap);		op.setLowerBounds({}, newLowerMap);
auto newUpperMap = AffineMap::get(ranges.getNumDims(), ranges.getNumSymbols(),		auto newUpperMap = AffineMap::get(ranges.getNumDims(), ranges.getNumSymbols(),
ubExprs, op.getContext());		ubExprs, op.getContext());
op.setUpperBounds(ranges.getOperands(), newUpperMap);		op.setUpperBounds(ranges.getOperands(), newUpperMap);
}		}

/// Normalization transformations for affine.for ops. For now, it only removes		/// Normalizes affine.for ops. If the affine.for op has only a single iteration
/// single iteration loops. We may want to consider separating redundant loop		/// only then it is simply promoted, else it is normalized in the traditional
/// elimitation from loop bound normalization, if needed in the future.		/// way, by converting the lower bound to zero and loop step to one. The upper
		bondhugulaUnsubmitted Done Reply Inline Actions Please add a line to say what `normalized` is. bondhugula: Please add a line to say what `normalized` is.
		bondhugulaUnsubmitted Done Reply Inline Actions -> with a single result. bondhugula: -> with a single result.
		/// bound is set to the trip count of the loop. For now, original loops must
		/// have lower bound with a single result only. There is no such restriction on
		/// upper bounds.
static void normalizeAffineFor(AffineForOp op) {		static void normalizeAffineFor(AffineForOp op) {
		bondhugulaUnsubmitted Done Reply Inline Actions has only a single iteration bondhugula: has only a single iteration
if (succeeded(promoteIfSingleIteration(op)))		if (succeeded(promoteIfSingleIteration(op)))
return;		return;

// TODO: Normalize loop bounds.		// Check if the forop is already normalized.
		if (op.hasConstantLowerBound() && (op.getConstantLowerBound() == 0) &&
		(op.getStep() == 1))
		return;

		// Check if the lower bound has a single result only. Loops with a max lower
		bondhugulaUnsubmitted Done Reply Inline Actions You may want to mention that loops with a max lower bound can't be normalized without additional support (like `affine.execute_region`'s). (Actually those loops whose max's could be hoisted to the outermost position - outside any loops - could still be normalized.) You can just mention this in a comment. bondhugula: You may want to mention that loops with a max lower bound can't be normalized without…
		// bound can't be normalized without additional support like
		// affine.execute_region's. If the lower bound does not have a single result
		// then skip this op.
		if (op.getLowerBoundMap().getNumResults() != 1)
		return;

		Location loc = op.getLoc();
		OpBuilder opBuilder(op);
		int64_t origLoopStep = op.getStep();

		// Calculate upperBound for normalized loop.
		SmallVector<Value, 4> ubOperands;
		AffineBound lb = op.getLowerBound();
		AffineBound ub = op.getUpperBound();
		ubOperands.reserve(ub.getNumOperands() + lb.getNumOperands());
		AffineMap origLbMap = lb.getMap();
		AffineMap origUbMap = ub.getMap();

		// Add dimension operands from upper/lower bound.
		for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
		ubOperands.push_back(ub.getOperand(j));
		for (unsigned j = 0, e = origLbMap.getNumDims(); j < e; ++j)
		ubOperands.push_back(lb.getOperand(j));

		// Add symbol operands from upper/lower bound.
		for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
		ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
		for (unsigned j = 0, e = origLbMap.getNumSymbols(); j < e; ++j)
		ubOperands.push_back(lb.getOperand(origLbMap.getNumDims() + j));

		// Add original result expressions from lower/upper bound map.
		SmallVector<AffineExpr, 1> origLbExprs(origLbMap.getResults().begin(),
		origLbMap.getResults().end());
		SmallVector<AffineExpr, 2> origUbExprs(origUbMap.getResults().begin(),
		origUbMap.getResults().end());
		SmallVector<AffineExpr, 4> newUbExprs;

		// The original upperBound can have more than one result. For the new
		// upperBound of this loop, take difference of all possible combinations of
		// the ub results and lb result and ceildiv with the loop step. For e.g.,
		//
		// affine.for %i1 = 0 to min affine_map<(d0)[] -> (d0 + 32, 1024)>(%i0)
		// will have an upperBound map as,
		// affine_map<(d0)[] -> (((d0 + 32) - 0) ceildiv 1, (1024 - 0) ceildiv
		// 1)>(%i0)
		//
		// Insert all combinations of upper/lower bound results.
		for (unsigned i = 0, e = origUbExprs.size(); i < e; ++i) {
		newUbExprs.push_back(
		(origUbExprs[i] - origLbExprs[0]).ceilDiv(origLoopStep));
		}

		// Construct newUbMap.
		AffineMap newUbMap =
		AffineMap::get(origLbMap.getNumDims() + origUbMap.getNumDims(),
		origLbMap.getNumSymbols() + origUbMap.getNumSymbols(),
		newUbExprs, opBuilder.getContext());

		// Normalize the loop.
		op.setUpperBound(ubOperands, newUbMap);
		op.setLowerBound({}, opBuilder.getConstantAffineMap(0));
		op.setStep(1);

		// Calculate the Value of new loopIV. Create affine.apply for the value of
		// the loopIV in normalized loop.
		opBuilder.setInsertionPointToStart(op.getBody());
		SmallVector<Value, 4> lbOperands(lb.getOperands().begin(),
		lb.getOperands().begin() +
		lb.getMap().getNumDims());
		// Add an extra dim operand for loopIV.
		lbOperands.push_back(op.getInductionVar());
		// Add symbol operands from lower bound.
		for (unsigned j = 0, e = origLbMap.getNumSymbols(); j < e; ++j)
		lbOperands.push_back(lb.getOperand(origLbMap.getNumDims() + j));

		AffineExpr origIVExpr = opBuilder.getAffineDimExpr(lb.getMap().getNumDims());
		AffineExpr newIVExpr = origIVExpr * origLoopStep + origLbMap.getResult(0);
		AffineMap ivMap = AffineMap::get(origLbMap.getNumDims() + 1,
		origLbMap.getNumSymbols(), newIVExpr);
		Operation *newIV = opBuilder.create<AffineApplyOp>(loc, ivMap, lbOperands);
		op.getInductionVar().replaceAllUsesExcept(newIV->getResult(0),
		SmallPtrSet<Operation *, 1>{newIV});
}		}

namespace {		namespace {

/// Normalize affine.parallel ops so that lower bounds are 0 and steps are 1.		/// Normalize affine.parallel ops so that lower bounds are 0 and steps are 1.
/// As currently implemented, this pass cannot fail, but it might skip over ops		/// As currently implemented, this pass cannot fail, but it might skip over ops
/// that are already in a normalized form.		/// that are already in a normalized form.
struct AffineLoopNormalizePass		struct AffineLoopNormalizePass
Show All 17 Lines

mlir/test/Dialect/Affine/affine-loop-normalize.mlir

Show All 36 Lines	func @single_iteration_loop(%in: memref<1xf32>, %out: memref<1xf32>) {
}		}
return		return
}		}

// CHECK-NOT: affine.for		// CHECK-NOT: affine.for
// CHECK: affine.load		// CHECK: affine.load
// CHECK-NEXT: affine.store		// CHECK-NEXT: affine.store
// CHECK-NEXT: return		// CHECK-NEXT: return

		// -----

		// CHECK-DAG: [[$IV0:#map[0-9]+]] = affine_map<(d0) -> (d0 * 2 + 2)>
		// CHECK-DAG: [[$IV1:#map[0-9]+]] = affine_map<(d0) -> (d0 * 3)>

		// CHECK-LABEL: func @simple_loop_nest()
		// CHECK-NEXT: affine.for %[[I:.*]] = 0 to 15 {
		// CHECK-NEXT: %[[IIV:.*]] = affine.apply [[$IV0]](%[[I]])
		// CHECK-NEXT: affine.for %[[II:.*]] = 0 to 11 {
		// CHECK-NEXT: %[[IIIV:.*]] = affine.apply [[$IV1]](%[[II]])
		// CHECK-NEXT: "test.foo"(%[[IIV]], %[[IIIV]])
		// CHECK-NEXT: }
		// CHECK-NEXT: }
		// CHECK-NEXT: return
		// CHECK-NEXT: }
		func @simple_loop_nest(){
		affine.for %i0 = 2 to 32 step 2 {
		affine.for %i1 = 0 to 32 step 3 {
		"test.foo"(%i0, %i1) : (index, index) -> ()
		}
		}
		return
		}

		// -----

		// CHECK-DAG: [[$IV00:#map[0-9]+]] = affine_map<(d0) -> (d0 * 32 + 2)>
		// CHECK-DAG: [[$IV11:#map[0-9]+]] = affine_map<(d0) -> (d0 * 2)>
		// CHECK-DAG: [[$UB00:#map[0-9]+]] = affine_map<()[s0] -> ((s0 - 2) ceildiv 32)>
		// CHECK-DAG: [[$UB11:#map[0-9]+]] = affine_map<()[s0] -> (s0 ceildiv 2)>

		// CHECK-LABEL: func @loop_with_unknown_upper_bound
		// CHECK-SAME: (%[[ARG0:.]]: memref<?x?xf32>, %[[ARG1:.]]: index)
		// CHECK-NEXT: %{{.*}} = constant 0 : index
		// CHECK-NEXT: %[[DIM:.*]] = dim %arg0, %c0 : memref<?x?xf32>
		// CHECK-NEXT: affine.for %[[I:.*]] = 0 to [[$UB00]]()[%[[DIM]]] {
		// CHECK-NEXT: %[[IIV:.*]] = affine.apply [[$IV00]](%[[I]])
		// CHECK-NEXT: affine.for %[[II:.*]] = 0 to [[$UB11]]()[%[[ARG1]]] {
		// CHECK-NEXT: %[[IIIV:.*]] = affine.apply [[$IV11]](%[[II]])
		// CHECK-NEXT: "test.foo"(%[[IIV]], %[[IIIV]])
		// CHECK-NEXT: }
		// CHECK-NEXT: }
		// CHECK-NEXT: return
		// CHECK-NEXT: }
		func @loop_with_unknown_upper_bound(%arg0: memref<?x?xf32>, %arg1: index) {
		%c0 = constant 0 : index
		%0 = dim %arg0, %c0 : memref<?x?xf32>
		affine.for %i0 = 2 to %0 step 32 {
		affine.for %i1 = 0 to %arg1 step 2 {
		"test.foo"(%i0, %i1) : (index, index) -> ()
		}
		}
		return
		}

		// -----

		// CHECK-DAG: [[$OUTERIV:#map[0-9]+]] = affine_map<(d0) -> (d0 * 32 + 2)>
		// CHECK-DAG: [[$INNERIV:#map[0-9]+]] = affine_map<(d0) -> (d0 + 2)>
		// CHECK-DAG: [[$OUTERUB:#map[0-9]+]] = affine_map<()[s0] -> ((s0 - 2) ceildiv 32)>
		// CHECK-DAG: [[$INNERUB:#map[0-9]+]] = affine_map<(d0) -> (d0 - 2, 510)>

		// CHECK-LABEL: func @loop_with_multiple_upper_bounds
		// CHECK-SAME: (%[[ARG0:.]]: memref<?x?xf32>, %[[ARG1:.]]: index)
		// CHECK-NEXT: %{{.*}} = constant 0 : index
		// CHECK-NEXT: %[[DIM:.*]] = dim %arg0, %c0 : memref<?x?xf32>
		// CHECK-NEXT: affine.for %[[I:.*]] = 0 to [[$OUTERUB]]()[%[[DIM]]] {
		// CHECK-NEXT: %[[IIV:.*]] = affine.apply [[$OUTERIV]](%[[I]])
		// CHECK-NEXT: affine.for %[[II:.*]] = 0 to min [[$INNERUB]](%[[ARG1]]) {
		// CHECK-NEXT: %[[IIIV:.*]] = affine.apply [[$INNERIV]](%[[II]])
		// CHECK-NEXT: "test.foo"(%[[IIV]], %[[IIIV]])
		// CHECK-NEXT: }
		// CHECK-NEXT: }
		// CHECK-NEXT: return
		// CHECK-NEXT: }
		func @loop_with_multiple_upper_bounds(%arg0: memref<?x?xf32>, %arg1 : index) {
		%c0 = constant 0 : index
		%0 = dim %arg0, %c0 : memref<?x?xf32>
		affine.for %i0 = 2 to %0 step 32{
		affine.for %i1 = 2 to min affine_map<(d0)[] -> (d0, 512)>(%arg1) {
		"test.foo"(%i0, %i1) : (index, index) -> ()
		}
		}
		return
		}

		// -----

		// CHECK-DAG: [[$INTERUB:#map[0-9]+]] = affine_map<()[s0] -> (s0 ceildiv 32)>
		// CHECK-DAG: [[$INTERIV:#map[0-9]+]] = affine_map<(d0) -> (d0 * 32)>
		// CHECK-DAG: [[$INTRAUB:#map[0-9]+]] = affine_map<(d0, d1)[s0] -> (32, -d0 + s0)>
		// CHECK-DAG: [[$INTRAIV:#map[0-9]+]] = affine_map<(d0, d1) -> (d1 + d0)>

		// CHECK-LABEL: func @tiled_matmul
		// CHECK-SAME: (%[[ARG0:.]]: memref<1024x1024xf32>, %[[ARG1:.]]: memref<1024x1024xf32>, %[[ARG2:.*]]: memref<1024x1024xf32>)
		// CHECK-NEXT: %{{.*}} = constant 0 : index
		// CHECK-NEXT: %{{.*}} = constant 1 : index
		// CHECK-NEXT: %[[DIM0:.]] = dim %[[ARG0]], %{{.}}
		// CHECK-NEXT: %[[DIM1:.]] = dim %[[ARG1]], %{{.}}
		// CHECK-NEXT: %[[DIM2:.]] = dim %[[ARG0]], %{{.}}
		// CHECK-NEXT: affine.for %[[I:.*]] = 0 to [[$INTERUB]]()[%[[DIM0]]] {
		// CHECK-NEXT: %[[IIV:.*]] = affine.apply [[$INTERIV]](%[[I]])
		// CHECK-NEXT: affine.for %[[J:.*]] = 0 to [[$INTERUB]]()[%[[DIM1]]] {
		// CHECK-NEXT: %[[JIV:.*]] = affine.apply [[$INTERIV]](%[[J]])
		// CHECK-NEXT: affine.for %[[K:.*]] = 0 to [[$INTERUB]]()[%[[DIM2]]] {
		// CHECK-NEXT: %[[KIV:.*]] = affine.apply [[$INTERIV]](%[[K]])
		// CHECK-NEXT: affine.for %[[II:.*]] = 0 to min [[$INTRAUB]](%[[IIV]], %[[IIV]])[%[[DIM0]]] {
		// CHECK-NEXT: %[[IIIV:.*]] = affine.apply [[$INTRAIV]](%[[IIV]], %[[II]])
		// CHECK-NEXT: affine.for %[[JJ:.*]] = 0 to min [[$INTRAUB]](%[[JIV]], %[[JIV]])[%[[DIM1]]] {
		// CHECK-NEXT: %[[JJIV:.*]] = affine.apply [[$INTRAIV]](%[[JIV]], %[[JJ]])
		// CHECK-NEXT: affine.for %[[KK:.*]] = 0 to min [[$INTRAUB]](%[[KIV]], %[[KIV]])[%[[DIM2]]] {
		// CHECK-NEXT: %[[KKIV:.*]] = affine.apply [[$INTRAIV]](%[[KIV]], %[[KK]])
		// CHECK-NEXT: %{{.*}} = affine.load %[[ARG0]][%[[IIIV]], %[[KKIV]]] : memref<1024x1024xf32>
		// CHECK-NEXT: %{{.*}} = affine.load %[[ARG1]][%[[KKIV]], %[[JJIV]]] : memref<1024x1024xf32>
		// CHECK-NEXT: %{{.*}} = affine.load %[[ARG2]][%[[IIIV]], %[[JJIV]]] : memref<1024x1024xf32>
		// CHECK-NEXT: %{{.*}} = mulf %9, %10 : f32
		// CHECK-NEXT: %{{.*}} = addf %11, %12 : f32
		// CHECK-NEXT: affine.store %{{.*}}, %[[ARG2]][%6, %7] : memref<1024x1024xf32>
		// CHECK-NEXT: }
		// CHECK-NEXT: }
		// CHECK-NEXT: }
		// CHECK-NEXT: }
		// CHECK-NEXT: }
		// CHECK-NEXT: }
		// CHECK-NEXT: return
		// CHECK-NEXT: }
		#map0 = affine_map<(d0, d1) -> (d0, d1)>
		#map1 = affine_map<(d0) -> (d0)>
		#map2 = affine_map<(d0)[s0] -> (d0 + 32, s0)>
		#map3 = affine_map<() -> (0)>
		#map4 = affine_map<()[s0] -> (s0)>

		func @tiled_matmul(%0: memref<1024x1024xf32>, %1: memref<1024x1024xf32>, %2: memref<1024x1024xf32>) {
		%c0 = constant 0 : index
		%c1 = constant 1 : index
		%3 = dim %0, %c0 : memref<1024x1024xf32>
		%4 = dim %1, %c1 : memref<1024x1024xf32>
		%5 = dim %0, %c1 : memref<1024x1024xf32>
		affine.for %arg0 = 0 to %3 step 32 {
		affine.for %arg1 = 0 to %4 step 32 {
		affine.for %arg2 = 0 to %5 step 32 {
		affine.for %arg3 = #map1(%arg0) to min #map2(%arg0)[%3] {
		affine.for %arg4 = #map1(%arg1) to min #map2(%arg1)[%4] {
		affine.for %arg5 = #map1(%arg2) to min #map2(%arg2)[%5] {
		%6 = affine.load %0[%arg3, %arg5] : memref<1024x1024xf32>
		%7 = affine.load %1[%arg5, %arg4] : memref<1024x1024xf32>
		%8 = affine.load %2[%arg3, %arg4] : memref<1024x1024xf32>
		%9 = mulf %6, %7 : f32
		%10 = addf %8, %9 : f32
		affine.store %10, %2[%arg3, %arg4] : memref<1024x1024xf32>
		}
		}
		}
		}
		}
		}
		return
		}