diff --git a/mlir/lib/Dialect/Affine/Utils/Utils.cpp b/mlir/lib/Dialect/Affine/Utils/Utils.cpp --- a/mlir/lib/Dialect/Affine/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Affine/Utils/Utils.cpp @@ -550,6 +550,7 @@ ubExprs, op.getContext()); op.setUpperBounds(ranges.getOperands(), newUpperMap); } + LogicalResult mlir::affine::normalizeAffineFor(AffineForOp op, bool promoteSingleIter) { if (promoteSingleIter && succeeded(promoteIfSingleIteration(op))) @@ -571,86 +572,54 @@ OpBuilder opBuilder(op); int64_t origLoopStep = op.getStep(); - AffineBound lb = op.getLowerBound(); - AffineMap originalLbMap = lb.getMap(); - SmallVector origLbOperands; - llvm::append_range(origLbOperands, lb.getOperands()); - - AffineBound ub = op.getUpperBound(); - AffineMap originalUbMap = ub.getMap(); - SmallVector origUbOperands; - llvm::append_range(origUbOperands, ub.getOperands()); - - // Calculate upperBound for normalized loop. - SmallVector ubOperands; - ubOperands.reserve(ub.getNumOperands() + lb.getNumOperands()); - - // Add dimension operands from upper/lower bound. - for (unsigned j = 0, e = originalUbMap.getNumDims(); j < e; ++j) - ubOperands.push_back(ub.getOperand(j)); - for (unsigned j = 0, e = originalLbMap.getNumDims(); j < e; ++j) - ubOperands.push_back(lb.getOperand(j)); - - // Add symbol operands from upper/lower bound. - for (unsigned j = 0, e = originalUbMap.getNumSymbols(); j < e; ++j) - ubOperands.push_back(ub.getOperand(originalUbMap.getNumDims() + j)); - for (unsigned j = 0, e = originalLbMap.getNumSymbols(); j < e; ++j) - ubOperands.push_back(lb.getOperand(originalLbMap.getNumDims() + j)); - - // Add original result expressions from lower/upper bound map. - SmallVector origLbExprs(originalLbMap.getResults().begin(), - originalLbMap.getResults().end()); - SmallVector origUbExprs(originalUbMap.getResults().begin(), - originalUbMap.getResults().end()); - SmallVector newUbExprs; - - // The original upperBound can have more than one result. For the new - // upperBound of this loop, take difference of all possible combinations of - // the ub results and lb result and ceildiv with the loop step. For e.g., - // - // affine.for %i1 = 0 to min affine_map<(d0)[] -> (d0 + 32, 1024)>(%i0) - // will have an upperBound map as, - // affine_map<(d0)[] -> (((d0 + 32) - 0) ceildiv 1, (1024 - 0) ceildiv - // 1)>(%i0) - // - // Insert all combinations of upper/lower bound results. - for (unsigned i = 0, e = origUbExprs.size(); i < e; ++i) { - newUbExprs.push_back( - (origUbExprs[i] - origLbExprs[0]).ceilDiv(origLoopStep)); - } - - // Construct newUbMap. - AffineMap newUbMap = AffineMap::get( - originalLbMap.getNumDims() + originalUbMap.getNumDims(), - originalLbMap.getNumSymbols() + originalUbMap.getNumSymbols(), newUbExprs, - opBuilder.getContext()); - canonicalizeMapAndOperands(&newUbMap, &ubOperands); - - SmallVector lbOperands(lb.getOperands().begin(), - lb.getOperands().begin() + - originalLbMap.getNumDims()); - - // Normalize the loop. - op.setUpperBound(ubOperands, newUbMap); + // Construct the new upper bound value map. + AffineMap oldLbMap = op.getLowerBoundMap(); + // The upper bound can have multiple results. To use + // AffineValueMap::difference, we need to have the same number of results in + // both lower and upper bound maps. So, we just create a value map for the + // lower bound with the only available lower bound result repeated to pad up + // to the number of upper bound results. + SmallVector lbExprs(op.getUpperBoundMap().getNumResults(), + op.getLowerBoundMap().getResult(0)); + AffineValueMap lbMap(oldLbMap, op.getLowerBoundOperands()); + AffineMap paddedLbMap = + AffineMap::get(oldLbMap.getNumDims(), oldLbMap.getNumSymbols(), lbExprs, + op.getContext()); + AffineValueMap paddedLbValueMap(paddedLbMap, op.getLowerBoundOperands()); + AffineValueMap ubValueMap(op.getUpperBoundMap(), op.getUpperBoundOperands()); + AffineValueMap newUbValueMap; + // Compute the `upper bound - lower bound`. + AffineValueMap::difference(ubValueMap, paddedLbValueMap, &newUbValueMap); + (void)newUbValueMap.canonicalize(); + + // Scale down the upper bound value map by the loop step. + unsigned numResult = newUbValueMap.getNumResults(); + SmallVector scaleDownExprs(numResult); + for (unsigned i = 0; i < numResult; ++i) + scaleDownExprs[i] = opBuilder.getAffineDimExpr(i).ceilDiv(origLoopStep); + // `scaleDownMap` is (d0, d1, ..., d_n) -> (d0 / step, d1 / step, ..., d_n / + // step). Where `n` is the number of results in the upper bound map. + AffineMap scaleDownMap = + AffineMap::get(numResult, 0, scaleDownExprs, op.getContext()); + AffineMap newUbMap = scaleDownMap.compose(newUbValueMap.getAffineMap()); + + // Set the newly create upper bound map and operands. + op.setUpperBound(newUbValueMap.getOperands(), newUbMap); op.setLowerBound({}, opBuilder.getConstantAffineMap(0)); op.setStep(1); // Calculate the Value of new loopIV. Create affine.apply for the value of // the loopIV in normalized loop. opBuilder.setInsertionPointToStart(op.getBody()); - // Add an extra dim operand for loopIV. - lbOperands.push_back(op.getInductionVar()); - // Add symbol operands from lower bound. - for (unsigned j = 0, e = originalLbMap.getNumSymbols(); j < e; ++j) - lbOperands.push_back(origLbOperands[originalLbMap.getNumDims() + j]); - - AffineExpr origIVExpr = - opBuilder.getAffineDimExpr(originalLbMap.getNumDims()); - AffineExpr newIVExpr = origIVExpr * origLoopStep + originalLbMap.getResult(0); - AffineMap ivMap = AffineMap::get(originalLbMap.getNumDims() + 1, - originalLbMap.getNumSymbols(), newIVExpr); - canonicalizeMapAndOperands(&ivMap, &lbOperands); - Operation *newIV = opBuilder.create(loc, ivMap, lbOperands); + // Construct an affine.apply op mapping the new IV to the old IV. + AffineMap scaleIvMap = + AffineMap::get(1, 0, -opBuilder.getAffineDimExpr(0) * origLoopStep); + AffineValueMap scaleIvValueMap(scaleIvMap, ValueRange{op.getInductionVar()}); + AffineValueMap newIvToOldIvMap; + AffineValueMap::difference(lbMap, scaleIvValueMap, &newIvToOldIvMap); + (void)newIvToOldIvMap.canonicalize(); + auto newIV = opBuilder.create( + loc, newIvToOldIvMap.getAffineMap(), newIvToOldIvMap.getOperands()); op.getInductionVar().replaceAllUsesExcept(newIV->getResult(0), newIV); return success(); } diff --git a/mlir/test/Dialect/Affine/affine-loop-normalize.mlir b/mlir/test/Dialect/Affine/affine-loop-normalize.mlir --- a/mlir/test/Dialect/Affine/affine-loop-normalize.mlir +++ b/mlir/test/Dialect/Affine/affine-loop-normalize.mlir @@ -88,7 +88,7 @@ // CHECK-LABEL: func @loop_with_unknown_upper_bound // CHECK-SAME: (%[[ARG0:.*]]: memref, %[[ARG1:.*]]: index) -// CHECK-NEXT: %{{.*}} = arith.constant 0 : index +// CHECK-NEXT: arith.constant 0 : index // CHECK-NEXT: %[[DIM:.*]] = memref.dim %arg0, %c0 : memref // CHECK-NEXT: affine.for %[[I:.*]] = 0 to [[$UB00]]()[%[[DIM]]] { // CHECK-NEXT: %[[IIV:.*]] = affine.apply [[$IV00]](%[[I]]) @@ -119,7 +119,7 @@ // CHECK-LABEL: func @loop_with_multiple_upper_bounds // CHECK-SAME: (%[[ARG0:.*]]: memref, %[[ARG1:.*]]: index) -// CHECK-NEXT: %{{.*}} = arith.constant 0 : index +// CHECK-NEXT: arith.constant 0 : index // CHECK-NEXT: %[[DIM:.*]] = memref.dim %arg0, %c0 : memref // CHECK-NEXT: affine.for %[[I:.*]] = 0 to [[$OUTERUB]]()[%[[DIM]]] { // CHECK-NEXT: %[[IIV:.*]] = affine.apply [[$OUTERIV]](%[[I]]) @@ -146,12 +146,12 @@ // CHECK-DAG: [[$INTERUB:#map[0-9]*]] = affine_map<()[s0] -> (s0 ceildiv 32)> // CHECK-DAG: [[$INTERIV:#map[0-9]*]] = affine_map<(d0) -> (d0 * 32)> // CHECK-DAG: [[$INTRAUB:#map[0-9]*]] = affine_map<(d0)[s0] -> (32, -d0 + s0)> -// CHECK-DAG: [[$INTRAIV:#map[0-9]*]] = affine_map<(d0, d1) -> (d1 + d0)> +// CHECK-DAG: [[$INTRAIV:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 + d1)> // CHECK-LABEL: func @tiled_matmul // CHECK-SAME: (%[[ARG0:.*]]: memref<1024x1024xf32>, %[[ARG1:.*]]: memref<1024x1024xf32>, %[[ARG2:.*]]: memref<1024x1024xf32>) -// CHECK-NEXT: %{{.*}} = arith.constant 0 : index -// CHECK-NEXT: %{{.*}} = arith.constant 1 : index +// CHECK-NEXT: arith.constant 0 : index +// CHECK-NEXT: arith.constant 1 : index // CHECK-NEXT: %[[DIM0:.*]] = memref.dim %[[ARG0]], %{{.*}} // CHECK-NEXT: %[[DIM1:.*]] = memref.dim %[[ARG1]], %{{.*}} // CHECK-NEXT: %[[DIM2:.*]] = memref.dim %[[ARG0]], %{{.*}} @@ -167,11 +167,11 @@ // CHECK-NEXT: %[[JJIV:.*]] = affine.apply [[$INTRAIV]](%[[JIV]], %[[JJ]]) // CHECK-NEXT: affine.for %[[KK:.*]] = 0 to min [[$INTRAUB]](%[[KIV]])[%[[DIM2]]] { // CHECK-NEXT: %[[KKIV:.*]] = affine.apply [[$INTRAIV]](%[[KIV]], %[[KK]]) -// CHECK-NEXT: %{{.*}} = affine.load %[[ARG0]][%[[IIIV]], %[[KKIV]]] : memref<1024x1024xf32> -// CHECK-NEXT: %{{.*}} = affine.load %[[ARG1]][%[[KKIV]], %[[JJIV]]] : memref<1024x1024xf32> -// CHECK-NEXT: %{{.*}} = affine.load %[[ARG2]][%[[IIIV]], %[[JJIV]]] : memref<1024x1024xf32> -// CHECK-NEXT: %{{.*}} = arith.mulf -// CHECK-NEXT: %{{.*}} = arith.addf +// CHECK-NEXT: affine.load %[[ARG0]][%[[IIIV]], %[[KKIV]]] : memref<1024x1024xf32> +// CHECK-NEXT: affine.load %[[ARG1]][%[[KKIV]], %[[JJIV]]] : memref<1024x1024xf32> +// CHECK-NEXT: affine.load %[[ARG2]][%[[IIIV]], %[[JJIV]]] : memref<1024x1024xf32> +// CHECK-NEXT: arith.mulf +// CHECK-NEXT: arith.addf // CHECK-NEXT: affine.store %{{.*}}, %[[ARG2]]{{.*}} : memref<1024x1024xf32> // CHECK-NEXT: } // CHECK-NEXT: } @@ -223,7 +223,7 @@ scf.for %j = %c0 to %c1 step %c1 { // CHECK: affine.for %[[ARG0:.*]] = affine.for %i = %c0 to %c1 { - // CHECK-NEXT: %{{.*}} = affine.apply #map{{.*}}(%[[ARG0]]) + // CHECK-NEXT: affine.apply #map{{.*}}(%[[ARG0]]) } } return @@ -292,3 +292,34 @@ } return } + +// ----- + +// CHECK: [[$MAP:#map[0-9]*]] = affine_map<(d0) -> (d0 * 64)> +// CHECK: [[$MAP1:#map[0-9]*]] = affine_map<(d0) -> (2, (-d0 + 1024) ceildiv 32)> +// CHECK: [[$MAP2:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 + d1 * 32)> +// CHECK: [[$MAP3:#map[0-9]*]] = affine_map<(d0, d1) -> (32, d0 - d1 + 64, -d1 + 1024)> +// CHECK: [[$MAP4:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 + d1)> +#map0 = affine_map<(d0) -> (d0)> +#map1 = affine_map<(d0) -> (d0 + 64, 1024)> +#map2 = affine_map<(d0, d1) -> (d1 + 32, d0 + 64, 1024)> +// CHECK-LABEL: @muli_level_tiled_matmul() +func.func @muli_level_tiled_matmul() { + // CHECK-NEXT: %[[BUF:.*]] = memref.alloc() : memref<1024xf16> + %0 = memref.alloc() : memref<1024xf16> + affine.for %arg0 = 0 to 1024 step 64 { + // CHECK-NEXT: affine.for %[[ARG0:.*]] = 0 to 16 { + // CHECK-NEXT: %[[IV0:.*]] = affine.apply [[$MAP]](%[[ARG0]]) + affine.for %arg3 = #map0(%arg0) to min #map1(%arg0) step 32 { + // CHECK-NEXT: affine.for %[[ARG1:.*]] = 0 to min [[$MAP1]](%[[IV0]]) { + // CHECK-NEXT: %[[IV1:.*]] = affine.apply [[$MAP2]](%[[IV0]], %[[ARG1]]) + affine.for %arg6 = #map0(%arg3) to min #map2(%arg0, %arg3) { + // CHECK-NEXT: affine.for %[[ARG2:.*]] = 0 to min [[$MAP3]](%[[IV0]], %[[IV1]]) { + // CHECK-NEXT: %[[IV2:.*]] = affine.apply [[$MAP4]](%[[IV1]], %[[ARG2]]) + // CHECK-NEXT: affine.load %[[BUF]][%[[IV2]]] : memref<1024xf16> + %3 = affine.load %0[%arg6] : memref<1024xf16> + } + } + } + return +}