diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineLoopNormalize.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineLoopNormalize.cpp --- a/mlir/lib/Dialect/Affine/Transforms/AffineLoopNormalize.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/AffineLoopNormalize.cpp @@ -79,14 +79,104 @@ op.setUpperBounds(ranges.getOperands(), newUpperMap); } -/// Normalization transformations for affine.for ops. For now, it only removes -/// single iteration loops. We may want to consider separating redundant loop -/// elimitation from loop bound normalization, if needed in the future. +/// Normalizes affine.for ops. If the affine.for op has only a single iteration +/// only then it is simply promoted, else it is normalized in the traditional +/// way, by converting the lower bound to zero and loop step to one. The upper +/// bound is set to the trip count of the loop. For now, original loops must +/// have lower bound with a single result only. There is no such restriction on +/// upper bounds. static void normalizeAffineFor(AffineForOp op) { if (succeeded(promoteIfSingleIteration(op))) return; - // TODO: Normalize loop bounds. + // Check if the forop is already normalized. + if (op.hasConstantLowerBound() && (op.getConstantLowerBound() == 0) && + (op.getStep() == 1)) + return; + + // Check if the lower bound has a single result only. Loops with a max lower + // bound can't be normalized without additional support like + // affine.execute_region's. If the lower bound does not have a single result + // then skip this op. + if (op.getLowerBoundMap().getNumResults() != 1) + return; + + Location loc = op.getLoc(); + OpBuilder opBuilder(op); + int64_t origLoopStep = op.getStep(); + + // Calculate upperBound for normalized loop. 
+ SmallVector<Value, 4> ubOperands; + AffineBound lb = op.getLowerBound(); + AffineBound ub = op.getUpperBound(); + ubOperands.reserve(ub.getNumOperands() + lb.getNumOperands()); + AffineMap origLbMap = lb.getMap(); + AffineMap origUbMap = ub.getMap(); + + // Add dimension operands from upper/lower bound. + for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j) + ubOperands.push_back(ub.getOperand(j)); + for (unsigned j = 0, e = origLbMap.getNumDims(); j < e; ++j) + ubOperands.push_back(lb.getOperand(j)); + + // Add symbol operands from upper/lower bound. + for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j) + ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j)); + for (unsigned j = 0, e = origLbMap.getNumSymbols(); j < e; ++j) + ubOperands.push_back(lb.getOperand(origLbMap.getNumDims() + j)); + + // Add original result expressions from lower/upper bound map. + SmallVector<AffineExpr, 4> origLbExprs(origLbMap.getResults().begin(), + origLbMap.getResults().end()); + SmallVector<AffineExpr, 4> origUbExprs(origUbMap.getResults().begin(), + origUbMap.getResults().end()); + SmallVector<AffineExpr, 4> newUbExprs; + + // The original upperBound can have more than one result. For the new + // upperBound of this loop, take difference of all possible combinations of + // the ub results and lb result and ceildiv with the loop step. For e.g., + // + // affine.for %i1 = 0 to min affine_map<(d0)[] -> (d0 + 32, 1024)>(%i0) + // will have an upperBound map as, + // affine_map<(d0)[] -> (((d0 + 32) - 0) ceildiv 1, (1024 - 0) ceildiv + // 1)>(%i0) + // + // Insert all combinations of upper/lower bound results. + for (unsigned i = 0, e = origUbExprs.size(); i < e; ++i) { + newUbExprs.push_back( + (origUbExprs[i] - origLbExprs[0]).ceilDiv(origLoopStep)); + } + + // Construct newUbMap. + AffineMap newUbMap = + AffineMap::get(origLbMap.getNumDims() + origUbMap.getNumDims(), + origLbMap.getNumSymbols() + origUbMap.getNumSymbols(), + newUbExprs, opBuilder.getContext()); + + // Normalize the loop. 
+ op.setUpperBound(ubOperands, newUbMap); + op.setLowerBound({}, opBuilder.getConstantAffineMap(0)); + op.setStep(1); + + // Calculate the Value of new loopIV. Create affine.apply for the value of + // the loopIV in normalized loop. + opBuilder.setInsertionPointToStart(op.getBody()); + SmallVector<Value, 4> lbOperands(lb.getOperands().begin(), + lb.getOperands().begin() + + lb.getMap().getNumDims()); + // Add an extra dim operand for loopIV. + lbOperands.push_back(op.getInductionVar()); + // Add symbol operands from lower bound. + for (unsigned j = 0, e = origLbMap.getNumSymbols(); j < e; ++j) + lbOperands.push_back(lb.getOperand(origLbMap.getNumDims() + j)); + + AffineExpr origIVExpr = opBuilder.getAffineDimExpr(lb.getMap().getNumDims()); + AffineExpr newIVExpr = origIVExpr * origLoopStep + origLbMap.getResult(0); + AffineMap ivMap = AffineMap::get(origLbMap.getNumDims() + 1, + origLbMap.getNumSymbols(), newIVExpr); + Operation *newIV = opBuilder.create<AffineApplyOp>(loc, ivMap, lbOperands); + op.getInductionVar().replaceAllUsesExcept(newIV->getResult(0), + SmallPtrSet<Operation *, 1>{newIV}); } namespace { diff --git a/mlir/test/Dialect/Affine/affine-loop-normalize.mlir b/mlir/test/Dialect/Affine/affine-loop-normalize.mlir --- a/mlir/test/Dialect/Affine/affine-loop-normalize.mlir +++ b/mlir/test/Dialect/Affine/affine-loop-normalize.mlir @@ -42,3 +42,162 @@ // CHECK: affine.load // CHECK-NEXT: affine.store // CHECK-NEXT: return + +// ----- + +// CHECK-DAG: [[$IV0:#map[0-9]+]] = affine_map<(d0) -> (d0 * 2 + 2)> +// CHECK-DAG: [[$IV1:#map[0-9]+]] = affine_map<(d0) -> (d0 * 3)> + +// CHECK-LABEL: func @simple_loop_nest() +// CHECK-NEXT: affine.for %[[I:.*]] = 0 to 15 { +// CHECK-NEXT: %[[IIV:.*]] = affine.apply [[$IV0]](%[[I]]) +// CHECK-NEXT: affine.for %[[II:.*]] = 0 to 11 { +// CHECK-NEXT: %[[IIIV:.*]] = affine.apply [[$IV1]](%[[II]]) +// CHECK-NEXT: "test.foo"(%[[IIV]], %[[IIIV]]) +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: return +// CHECK-NEXT: } +func @simple_loop_nest(){ + affine.for %i0 = 
2 to 32 step 2 { + affine.for %i1 = 0 to 32 step 3 { + "test.foo"(%i0, %i1) : (index, index) -> () + } + } + return +} + +// ----- + +// CHECK-DAG: [[$IV00:#map[0-9]+]] = affine_map<(d0) -> (d0 * 32 + 2)> +// CHECK-DAG: [[$IV11:#map[0-9]+]] = affine_map<(d0) -> (d0 * 2)> +// CHECK-DAG: [[$UB00:#map[0-9]+]] = affine_map<()[s0] -> ((s0 - 2) ceildiv 32)> +// CHECK-DAG: [[$UB11:#map[0-9]+]] = affine_map<()[s0] -> (s0 ceildiv 2)> + +// CHECK-LABEL: func @loop_with_unknown_upper_bound +// CHECK-SAME: (%[[ARG0:.*]]: memref<?x?xf32>, %[[ARG1:.*]]: index) +// CHECK-NEXT: %{{.*}} = constant 0 : index +// CHECK-NEXT: %[[DIM:.*]] = dim %arg0, %c0 : memref<?x?xf32> +// CHECK-NEXT: affine.for %[[I:.*]] = 0 to [[$UB00]]()[%[[DIM]]] { +// CHECK-NEXT: %[[IIV:.*]] = affine.apply [[$IV00]](%[[I]]) +// CHECK-NEXT: affine.for %[[II:.*]] = 0 to [[$UB11]]()[%[[ARG1]]] { +// CHECK-NEXT: %[[IIIV:.*]] = affine.apply [[$IV11]](%[[II]]) +// CHECK-NEXT: "test.foo"(%[[IIV]], %[[IIIV]]) +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: return +// CHECK-NEXT: } +func @loop_with_unknown_upper_bound(%arg0: memref<?x?xf32>, %arg1: index) { + %c0 = constant 0 : index + %0 = dim %arg0, %c0 : memref<?x?xf32> + affine.for %i0 = 2 to %0 step 32 { + affine.for %i1 = 0 to %arg1 step 2 { + "test.foo"(%i0, %i1) : (index, index) -> () + } + } + return +} + +// ----- + +// CHECK-DAG: [[$OUTERIV:#map[0-9]+]] = affine_map<(d0) -> (d0 * 32 + 2)> +// CHECK-DAG: [[$INNERIV:#map[0-9]+]] = affine_map<(d0) -> (d0 + 2)> +// CHECK-DAG: [[$OUTERUB:#map[0-9]+]] = affine_map<()[s0] -> ((s0 - 2) ceildiv 32)> +// CHECK-DAG: [[$INNERUB:#map[0-9]+]] = affine_map<(d0) -> (d0 - 2, 510)> + +// CHECK-LABEL: func @loop_with_multiple_upper_bounds +// CHECK-SAME: (%[[ARG0:.*]]: memref<?x?xf32>, %[[ARG1:.*]]: index) +// CHECK-NEXT: %{{.*}} = constant 0 : index +// CHECK-NEXT: %[[DIM:.*]] = dim %arg0, %c0 : memref<?x?xf32> +// CHECK-NEXT: affine.for %[[I:.*]] = 0 to [[$OUTERUB]]()[%[[DIM]]] { +// CHECK-NEXT: %[[IIV:.*]] = affine.apply [[$OUTERIV]](%[[I]]) +// CHECK-NEXT: affine.for 
%[[II:.*]] = 0 to min [[$INNERUB]](%[[ARG1]]) { +// CHECK-NEXT: %[[IIIV:.*]] = affine.apply [[$INNERIV]](%[[II]]) +// CHECK-NEXT: "test.foo"(%[[IIV]], %[[IIIV]]) +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: return +// CHECK-NEXT: } +func @loop_with_multiple_upper_bounds(%arg0: memref<?x?xf32>, %arg1 : index) { + %c0 = constant 0 : index + %0 = dim %arg0, %c0 : memref<?x?xf32> + affine.for %i0 = 2 to %0 step 32{ + affine.for %i1 = 2 to min affine_map<(d0)[] -> (d0, 512)>(%arg1) { + "test.foo"(%i0, %i1) : (index, index) -> () + } + } + return +} + +// ----- + +// CHECK-DAG: [[$INTERUB:#map[0-9]+]] = affine_map<()[s0] -> (s0 ceildiv 32)> +// CHECK-DAG: [[$INTERIV:#map[0-9]+]] = affine_map<(d0) -> (d0 * 32)> +// CHECK-DAG: [[$INTRAUB:#map[0-9]+]] = affine_map<(d0, d1)[s0] -> (32, -d0 + s0)> +// CHECK-DAG: [[$INTRAIV:#map[0-9]+]] = affine_map<(d0, d1) -> (d1 + d0)> + +// CHECK-LABEL: func @tiled_matmul +// CHECK-SAME: (%[[ARG0:.*]]: memref<1024x1024xf32>, %[[ARG1:.*]]: memref<1024x1024xf32>, %[[ARG2:.*]]: memref<1024x1024xf32>) +// CHECK-NEXT: %{{.*}} = constant 0 : index +// CHECK-NEXT: %{{.*}} = constant 1 : index +// CHECK-NEXT: %[[DIM0:.*]] = dim %[[ARG0]], %{{.*}} +// CHECK-NEXT: %[[DIM1:.*]] = dim %[[ARG1]], %{{.*}} +// CHECK-NEXT: %[[DIM2:.*]] = dim %[[ARG0]], %{{.*}} +// CHECK-NEXT: affine.for %[[I:.*]] = 0 to [[$INTERUB]]()[%[[DIM0]]] { +// CHECK-NEXT: %[[IIV:.*]] = affine.apply [[$INTERIV]](%[[I]]) +// CHECK-NEXT: affine.for %[[J:.*]] = 0 to [[$INTERUB]]()[%[[DIM1]]] { +// CHECK-NEXT: %[[JIV:.*]] = affine.apply [[$INTERIV]](%[[J]]) +// CHECK-NEXT: affine.for %[[K:.*]] = 0 to [[$INTERUB]]()[%[[DIM2]]] { +// CHECK-NEXT: %[[KIV:.*]] = affine.apply [[$INTERIV]](%[[K]]) +// CHECK-NEXT: affine.for %[[II:.*]] = 0 to min [[$INTRAUB]](%[[IIV]], %[[IIV]])[%[[DIM0]]] { +// CHECK-NEXT: %[[IIIV:.*]] = affine.apply [[$INTRAIV]](%[[IIV]], %[[II]]) +// CHECK-NEXT: affine.for %[[JJ:.*]] = 0 to min [[$INTRAUB]](%[[JIV]], %[[JIV]])[%[[DIM1]]] { +// CHECK-NEXT: %[[JJIV:.*]] = affine.apply 
[[$INTRAIV]](%[[JIV]], %[[JJ]]) +// CHECK-NEXT: affine.for %[[KK:.*]] = 0 to min [[$INTRAUB]](%[[KIV]], %[[KIV]])[%[[DIM2]]] { +// CHECK-NEXT: %[[KKIV:.*]] = affine.apply [[$INTRAIV]](%[[KIV]], %[[KK]]) +// CHECK-NEXT: %{{.*}} = affine.load %[[ARG0]][%[[IIIV]], %[[KKIV]]] : memref<1024x1024xf32> +// CHECK-NEXT: %{{.*}} = affine.load %[[ARG1]][%[[KKIV]], %[[JJIV]]] : memref<1024x1024xf32> +// CHECK-NEXT: %{{.*}} = affine.load %[[ARG2]][%[[IIIV]], %[[JJIV]]] : memref<1024x1024xf32> +// CHECK-NEXT: %{{.*}} = mulf %9, %10 : f32 +// CHECK-NEXT: %{{.*}} = addf %11, %12 : f32 +// CHECK-NEXT: affine.store %{{.*}}, %[[ARG2]][%6, %7] : memref<1024x1024xf32> +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: return +// CHECK-NEXT: } +#map0 = affine_map<(d0, d1) -> (d0, d1)> +#map1 = affine_map<(d0) -> (d0)> +#map2 = affine_map<(d0)[s0] -> (d0 + 32, s0)> +#map3 = affine_map<() -> (0)> +#map4 = affine_map<()[s0] -> (s0)> + +func @tiled_matmul(%0: memref<1024x1024xf32>, %1: memref<1024x1024xf32>, %2: memref<1024x1024xf32>) { + %c0 = constant 0 : index + %c1 = constant 1 : index + %3 = dim %0, %c0 : memref<1024x1024xf32> + %4 = dim %1, %c1 : memref<1024x1024xf32> + %5 = dim %0, %c1 : memref<1024x1024xf32> + affine.for %arg0 = 0 to %3 step 32 { + affine.for %arg1 = 0 to %4 step 32 { + affine.for %arg2 = 0 to %5 step 32 { + affine.for %arg3 = #map1(%arg0) to min #map2(%arg0)[%3] { + affine.for %arg4 = #map1(%arg1) to min #map2(%arg1)[%4] { + affine.for %arg5 = #map1(%arg2) to min #map2(%arg2)[%5] { + %6 = affine.load %0[%arg3, %arg5] : memref<1024x1024xf32> + %7 = affine.load %1[%arg5, %arg4] : memref<1024x1024xf32> + %8 = affine.load %2[%arg3, %arg4] : memref<1024x1024xf32> + %9 = mulf %6, %7 : f32 + %10 = addf %8, %9 : f32 + affine.store %10, %2[%arg3, %arg4] : memref<1024x1024xf32> + } + } + } + } + } + } + return +}