diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/HoistPadding.h b/mlir/include/mlir/Dialect/Linalg/Transforms/HoistPadding.h --- a/mlir/include/mlir/Dialect/Linalg/Transforms/HoistPadding.h +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/HoistPadding.h @@ -16,12 +16,16 @@ namespace linalg { class PadTensorOp; +class LinalgOp; /// Mechanically hoist padding operations on tensors by `numLoops` into a new, /// generally larger tensor. This achieves packing of multiple padding ops into /// a larger tensor. On success, `padTensorOp` is replaced by the cloned version /// in the packing loop so the caller can continue reasoning about the padding -/// operation. +/// operation. If `transposeVector` is non-empty, hoist padding introduces a +/// GenericOp operation to transpose the padded tensor before inserting it into +/// the packed tensor. +/// /// /// Example in pseudo-mlir: /// ======================= @@ -60,7 +64,9 @@ /// } /// ``` FailureOr hoistPaddingOnTensors(PadTensorOp opToHoist, int numLoops, - PadTensorOp &hoistedOp); + ArrayRef transposeVector, + PadTensorOp &hoistedOp, + SmallVectorImpl &transposeOps); } // namespace linalg } // namespace mlir diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h --- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h @@ -492,6 +492,11 @@ /// defining the given OpOperand. using PaddingHoistComputationFunction = std::function; +/// Callback returning the interchange vector used to transpose the result of +/// the hoisted pad tensor operation defining the given OpOperand. +using PaddingTransposeComputationFunction = + std::function(OpOperand &)>; + struct LinalgPaddingOptions { /// Callback returning the padding value to use for a given OpOperand or /// failure for no padding. Padding operations are introduced if @@ -527,6 +532,17 @@ paddingHoistComputationFunction = std::move(fun); return *this; } + + /// Callback returning the interchange vector used to transpose the result of + /// the hoisted pad tensor operation defining the given OpOperand. + PaddingTransposeComputationFunction paddingTransposeComputationFunction = + nullptr; + + LinalgPaddingOptions &setPaddingTransposeComputationFunction( + PaddingTransposeComputationFunction fun) { + paddingTransposeComputationFunction = std::move(fun); + return *this; + } }; struct LinalgTilingAndFusionOptions { diff --git a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp @@ -143,6 +143,63 @@ } } +/// Returns the transposed `tensorType` if `transposeVector` is non-empty. Fail +/// if `transposeVector` is not a permutation or does not match the tensor rank. +static FailureOr +computeTransposedType(RankedTensorType tensorType, + ArrayRef transposeVector) { + if (transposeVector.empty()) + return tensorType; + if (!isPermutation(transposeVector) || + transposeVector.size() != tensorType.getRank()) + return failure(); + + SmallVector transposedShape(tensorType.getShape().begin(), + tensorType.getShape().end()); + applyPermutationToVector(transposedShape, transposeVector); + return RankedTensorType::get(transposedShape, tensorType.getElementType()); +} + +/// Returns a GenericOp that tansposes `inputTensor` into `outputTensor` using +/// `transposeVector`. +static LinalgOp makeTransposeOp(OpBuilder &b, Location loc, Value inputTensor, + Value outputTensor, + ArrayRef transposeVector) { + auto resultTensorType = outputTensor.getType().cast(); + Type elementType = resultTensorType.getElementType(); + + assert(isPermutation(transposeVector) && + "expect transpose vector to be a permutation"); + assert(transposeVector.size() == resultTensorType.getRank() && + "expect transpose vector size to match result tensor rank"); + + // Compute the transpose and the indentity indexing maps. + SmallVector indexingMaps = { + inversePermutation(AffineMap::getPermutationMap( + SmallVector(transposeVector.begin(), transposeVector.end()), + b.getContext())), + AffineMap::getMultiDimIdentityMap(transposeVector.size(), + b.getContext())}; + SmallVector iteratorTypes(transposeVector.size(), + getParallelIteratorTypeName()); + + // Create a GenericOp to transpose `inputTensor` into `outputTensor`. + auto transposeOp = b.create( + loc, resultTensorType, inputTensor, outputTensor, + b.getAffineMapArrayAttr(indexingMaps), b.getStrArrayAttr(iteratorTypes), + /*doc=*/nullptr, + /*library_call=*/nullptr); + Region &body = transposeOp.getRegion(); + body.push_back(new Block()); + body.front().addArguments({elementType, elementType}); + + // Create the body of the transpose operation. + OpBuilder::InsertionGuard g(b); + b.setInsertionPointToEnd(&body.front()); + b.create(loc, transposeOp.getRegion().front().getArgument(0)); + return transposeOp; +} + HoistingAnalysis::HoistingAnalysis(PadTensorOp padTensorOp, int numLoops) { valid = false; @@ -373,9 +430,9 @@ ValueRange{ivVal, lbVal, stepVal}); } -FailureOr mlir::linalg::hoistPaddingOnTensors(PadTensorOp opToHoist, - int numLoops, - PadTensorOp &hoistedOp) { +FailureOr mlir::linalg::hoistPaddingOnTensors( + PadTensorOp opToHoist, int numLoops, ArrayRef transposeVector, + PadTensorOp &hoistedOp, SmallVectorImpl &transposeOps) { LLVM_DEBUG(DBGS() << "Try to hoist " << *(opToHoist) << " by " << numLoops << " loops\n"); HoistingAnalysis analysis(opToHoist, numLoops); @@ -396,14 +453,20 @@ RankedTensorType paddedTensorType = opToHoist.getResultType(); int paddedRank = paddedTensorType.getRank(); - // Create the packed tensor into which we amortize + // Compute the type of the transposed padded tensor. + FailureOr transposedTensorType = + computeTransposedType(paddedTensorType, transposeVector); + if (failed(transposedTensorType)) + return failure(); + + // Create the packed tensor into which we amortize // padding. SmallVector packedShape(nPackedLoops, ShapedType::kDynamicSize); // TODO: go grab dims when necessary, for now PadTensorOp returns a static // tensor. - llvm::append_range(packedShape, paddedTensorType.getShape()); - auto packedTensorType = - RankedTensorType::get(packedShape, paddedTensorType.getElementType()); + llvm::append_range(packedShape, transposedTensorType->getShape()); + auto packedTensorType = RankedTensorType::get( + packedShape, transposedTensorType->getElementType()); Value packedTensor = b.create( loc, dynamicTensorSizes, packedTensorType.getShape(), packedTensorType.getElementType()); @@ -413,9 +476,10 @@ // The implementation proceeds in a stack-like fashion: // 1. Iteratively clone and step into the loops, pushing the `packedTensor` // deeper in the stack. - // 2. Create a InsertSliceOp at the top of the stack. - // 3. Iteratively pop and yield the result of the InsertSliceOp across - // the cloned loops. + // 2. Create a GenericOp if `transposeVector` is non-empty. + // 3. Create a InsertSliceOp at the top of the stack. + // 4. Iteratively pop and yield the result of the InsertSliceOp across + // the cloned loops. SmallVector clonedLoopIvs, leadingPackedTensorIndexings; clonedLoopIvs.reserve(nPackedLoops); leadingPackedTensorIndexings.reserve(nPackedLoops); @@ -455,14 +519,13 @@ packedTensor = clonedForOp.getRegionIterArgs().front(); } - // Stack step 2. create InsertSliceOp at the top of the stack. // offsets = [clonedLoopIvs, 0 .. 0]. SmallVector offsets(leadingPackedTensorIndexings.begin(), leadingPackedTensorIndexings.end()); offsets.append(paddedRank, b.getIndexAttr(0)); - // sizes = [1 .. 1, paddedShape]. + // sizes = [1 .. 1, transposedShape]. SmallVector sizes(nPackedLoops, b.getIndexAttr(1)); - for (int64_t sz : paddedTensorType.getShape()) { + for (int64_t sz : transposedTensorType->getShape()) { // TODO: go grab dims when necessary, for now PadTensorOp returns a static // tensor. assert(!ShapedType::isDynamic(sz) && "padded tensor needs static sizes"); @@ -472,11 +535,21 @@ SmallVector strides(nPackedLoops + paddedRank, b.getIndexAttr(1)); - Value inserted = - b.create(loc, bvm.lookup(opToHoist.result()), - packedTensor, offsets, sizes, strides); + // Stack step 2. create GenericOp if `transposeVector` is non-empty. + Value paddedTensor = bvm.lookup(opToHoist.result()); + if (!transposeVector.empty()) { + Value outputTensor = b.create( + loc, *transposedTensorType, packedTensor, offsets, sizes, strides); + transposeOps.push_back( + makeTransposeOp(b, loc, paddedTensor, outputTensor, transposeVector)); + paddedTensor = transposeOps.back()->getResult(0); + } - // Stack step 3. iteratively pop the stack and propagate the yield. + // Stack step 3. create InsertSliceOp at the top of the stack. + Value inserted = b.create( + loc, paddedTensor, packedTensor, offsets, sizes, strides); + + // Stack step 4. iteratively pop the stack and propagate the yield. Value valueToYield = inserted; for (Value iv : llvm::reverse(clonedLoopIvs)) { auto forOp = scf::getForInductionVarOwner(iv); @@ -498,12 +571,22 @@ // offsets = [originalLoopIvs, 0 .. 0]. offsets.assign(loopIterationCounts.begin(), loopIterationCounts.end()); offsets.append(paddedRank, b.getIndexAttr(0)); - // sizes = [1 .. 1, paddedShape] (definedabove). + // sizes = [1 .. 1, transposedShape] (definedabove). // strides = [1 .. 1] (defined above) packedTensor = scf::getForInductionVarOwner(clonedLoopIvs.front())->getResult(0); Value newResult = b.create( - loc, opToHoist.getResultType(), packedTensor, offsets, sizes, strides); + loc, *transposedTensorType, packedTensor, offsets, sizes, strides); + + // Transpose the packed tensor back to the original storage order. + if (!transposeVector.empty()) { + Value initTensor = + b.create(loc, ValueRange{}, paddedTensorType.getShape(), + paddedTensorType.getElementType()); + transposeOps.push_back( + makeTransposeOp(b, loc, newResult, initTensor, transposeVector)); + newResult = transposeOps.back()->getResult(0); + } // Make the newly cloned `opToHoist` available to the caller. hoistedOp = cast(bvm.lookup(opToHoist.result()).getDefiningOp()); diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp @@ -532,16 +532,25 @@ if (!padTensorOp || en.value() == 0) continue; PadTensorOp hoistedOp; - FailureOr newResult = - hoistPaddingOnTensors(padTensorOp, en.value(), hoistedOp); + SmallVector transposeOps; + SmallVector transposeVector = + options.paddingTransposeComputationFunction(opOperand); + + FailureOr newResult = hoistPaddingOnTensors( + padTensorOp, en.value(), transposeVector, hoistedOp, transposeOps); if (failed(newResult)) continue; rewriter.replaceOp(padTensorOp, newResult.getValue()); + + // Do not apply hoist padding to the newly introduced transpose operations. + for (LinalgOp transposeOp : transposeOps) + filter.replaceLinalgTransformationFilter(rewriter, transposeOp); } // Replace the original operation to pad. rewriter.replaceOp(linalgOp, newResults.getValue()); filter.replaceLinalgTransformationFilter(rewriter, paddedOp); + return paddedOp; } diff --git a/mlir/test/Dialect/Linalg/hoist-padding.mlir b/mlir/test/Dialect/Linalg/hoist-padding.mlir --- a/mlir/test/Dialect/Linalg/hoist-padding.mlir +++ b/mlir/test/Dialect/Linalg/hoist-padding.mlir @@ -1,4 +1,5 @@ // RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matvec pad hoist-paddings=1,1,0 run-enable-pass=false" -cse -canonicalize -split-input-file | FileCheck %s --check-prefix=MATVEC +// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matvec pad hoist-paddings=1,1,0 transpose-paddings=10,0,0 run-enable-pass=false" -cse -canonicalize -split-input-file | FileCheck %s --check-prefix=TRANSP // RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul pad hoist-paddings=1,2,1 run-enable-pass=false" -cse -canonicalize -split-input-file | FileCheck %s --check-prefix=MATMUL // MATVEC-DAG: #[[DIV4:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 4)> @@ -30,7 +31,7 @@ // MATVEC-DAG: %[[T4:.*]] = tensor.extract_slice %[[T0]][%[[IDX0]] %2 = tensor.extract_slice %arg1[%arg3] [4] [1] : tensor<12xf32> to tensor<4xf32> %3 = linalg.pad_tensor %2 nofold low[%c0] high[%c0] { - ^bb0(%arg5: index): + ^bb0(%arg5: index): linalg.yield %cst : f32 } : tensor<4xf32> to tensor<4xf32> @@ -81,11 +82,11 @@ %3 = tensor.extract_slice %arg1[%arg3] [%1] [1] : tensor<12xf32> to tensor %4 = affine.apply #map1(%1) %5 = linalg.pad_tensor %2 low[%c0, %c0] high[%c0, %4] { - ^bb0(%arg5: index, %arg6: index): + ^bb0(%arg5: index, %arg6: index): linalg.yield %cst : f32 } : tensor<24x?xf32> to tensor<24x5xf32> %6 = linalg.pad_tensor %3 low[%c0] high[%4] { - ^bb0(%arg5: index): + ^bb0(%arg5: index): linalg.yield %cst : f32 } : tensor to tensor<5xf32> @@ -141,11 +142,11 @@ %4 = tensor.extract_slice %arg1[%arg3] [%2] [1] : tensor to tensor %5 = affine.apply #map1(%2) %6 = linalg.pad_tensor %3 low[%c0, %c0] high[%c0, %5] { - ^bb0(%arg5: index, %arg6: index): + ^bb0(%arg5: index, %arg6: index): linalg.yield %cst : f32 } : tensor<24x?xf32> to tensor<24x4xf32> %7 = linalg.pad_tensor %4 nofold low[%c0] high[%5] { - ^bb0(%arg5: index): + ^bb0(%arg5: index): linalg.yield %cst : f32 } : tensor to tensor<4xf32> @@ -177,7 +178,7 @@ // MATVEC: %[[T1:.*]] = linalg.pad_tensor %[[T0]] %2 = tensor.extract_slice %arg1[%arg3] [4] [1] : tensor<12xf32> to tensor<4xf32> %3 = linalg.pad_tensor %2 nofold low[%c0] high[%c0] { - ^bb0(%arg5: index): + ^bb0(%arg5: index): %5 = arith.index_cast %arg3 : index to i32 %6 = arith.sitofp %5 : i32 to f32 linalg.yield %6 : f32 @@ -214,7 +215,7 @@ %2 = tensor.extract_slice %arg1[%arg3] [4] [1] : tensor<12xf32> to tensor<4xf32> %3 = tensor.extract %arg1[%arg3] : tensor<12xf32> %4 = linalg.pad_tensor %2 nofold low[%c0] high[%c0] { - ^bb0(%arg5: index): + ^bb0(%arg5: index): linalg.yield %3 : f32 } : tensor<4xf32> to tensor<4xf32> @@ -251,7 +252,7 @@ %2 = tensor.extract_slice %arg1[%arg4] [4] [1] : tensor<12xf32> to tensor<4xf32> %3 = arith.index_cast %arg3 : i32 to index %4 = linalg.pad_tensor %2 nofold low[%3] high[%3] { - ^bb0(%arg6: index): + ^bb0(%arg6: index): linalg.yield %cst : f32 } : tensor<4xf32> to tensor<4xf32> @@ -288,7 +289,7 @@ %2 = tensor.extract_slice %arg1[%arg4] [4] [1] : tensor<12xf32> to tensor<4xf32> %3 = memref.load %arg3[%c0] : memref %4 = linalg.pad_tensor %2 nofold low[%3] high[%3] { - ^bb0(%arg6: index): + ^bb0(%arg6: index): linalg.yield %cst : f32 } : tensor<4xf32> to tensor<4xf32> @@ -328,7 +329,7 @@ scf.yield %6 : index } %4 = linalg.pad_tensor %2 nofold low[%3] high[%3] { - ^bb0(%arg6: index): + ^bb0(%arg6: index): linalg.yield %cst : f32 } : tensor<4xf32> to tensor<4xf32> @@ -373,7 +374,7 @@ // Check the fused and padded fill op does not prevent hoisting. %4 = linalg.pad_tensor %2 nofold low[%c0, %c0] high[%3, %c0] { - ^bb0(%arg5: index, %arg6: index): + ^bb0(%arg5: index, %arg6: index): linalg.yield %cst : f32 } : tensor to tensor<5x24xf32> %5 = linalg.fill(%cst, %4) : f32, tensor<5x24xf32> -> tensor<5x24xf32> @@ -394,18 +395,18 @@ %10 = tensor.extract_slice %arg1[%arg5, 0] [3, 24] [1, 1] : tensor<6x24xf32> to tensor<3x24xf32> %11 = tensor.extract_slice %arg6[0, 0] [%1, 24] [1, 1] : tensor to tensor %12 = linalg.pad_tensor %9 nofold low[%c0, %c0] high[%3, %c0] { - ^bb0(%arg7: index, %arg8: index): + ^bb0(%arg7: index, %arg8: index): linalg.yield %cst : f32 } : tensor to tensor<5x3xf32> %13 = linalg.pad_tensor %10 nofold low[%c0, %c0] high[%c0, %c0] { - ^bb0(%arg7: index, %arg8: index): + ^bb0(%arg7: index, %arg8: index): linalg.yield %cst : f32 } : tensor<3x24xf32> to tensor<3x24xf32> // Check the output padding is not hoisted. // MATMUL: %[[T8:.*]] = linalg.pad_tensor %14 = linalg.pad_tensor %11 nofold low[%c0, %c0] high[%3, %c0] { - ^bb0(%arg7: index, %arg8: index): + ^bb0(%arg7: index, %arg8: index): linalg.yield %cst : f32 } : tensor to tensor<5x24xf32> @@ -421,3 +422,59 @@ } return %0 : tensor<12x24xf32> } + +// ----- + +#map0 = affine_map<(d0)[s0] -> (4, -d0 + s0)> +#map1 = affine_map<(d0) -> (-d0 + 4)> + +// TRANSP: transpose +// TRANSP-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x?xf32> +func @transpose(%arg0: tensor<24x?xf32>, + %arg1: tensor, + %arg2: tensor<24xf32>) -> tensor<24xf32> { + %cst = arith.constant 0.000000e+00 : f32 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %0 = tensor.dim %arg0, %c1 : tensor<24x?xf32> + + // Transpose the padded matrix. + // TRANSP: %[[T0:.*]] = scf.for %[[PIV0:[0-9a-z]+]] = {{.*}}iter_args(%[[T1:.*]] = + // TRANSP: %[[T2:.*]] = linalg.pad_tensor + // TRANSP: %[[T3:.*]] = tensor.extract_slice %[[T1]] + // TRANSP: %[[T4:.*]] = linalg.generic + // TRANSP-SAME: ins(%[[T2]] : tensor<24x4xf32> + // TRANSP-SAME: outs(%[[T3]] : tensor<4x24xf32> + // TRANSP: %[[T5:.*]] = tensor.insert_slice %[[T4]] into %[[T1]] + // TRANSP: scf.yield %[[T5:.*]] + + // TRANSP: scf.for %[[IV0:[0-9a-zA-Z]*]] = + %1 = scf.for %arg3 = %c0 to %0 step %c4 iter_args(%arg4 = %arg2) -> (tensor<24xf32>) { + %2 = affine.min #map0(%arg3)[%0] + %3 = tensor.extract_slice %arg0[0, %arg3] [24, %2] [1, 1] : tensor<24x?xf32> to tensor<24x?xf32> + + // Index the packed vector and transpose back. + // TRANSP: %[[T6:.*]] = tensor.extract_slice %[[T0]] + // TRANSP: %[[T7:.*]] = linalg.init_tensor + // TRANSP: %[[T8:.*]] = linalg.generic + // TRANSP-SAME: ins(%[[T6]] : tensor<4x24xf32> + // TRANSP-SAME: outs(%[[T7]] : tensor<24x4xf32> + %4 = tensor.extract_slice %arg1[%arg3] [%2] [1] : tensor to tensor + %5 = affine.apply #map1(%2) + %6 = linalg.pad_tensor %3 low[%c0, %c0] high[%c0, %5] { + ^bb0(%arg5: index, %arg6: index): // no predecessors + linalg.yield %cst : f32 + } : tensor<24x?xf32> to tensor<24x4xf32> + %7 = linalg.pad_tensor %4 nofold low[%c0] high[%5] { + ^bb0(%arg5: index): // no predecessors + linalg.yield %cst : f32 + } : tensor to tensor<4xf32> + + // Check matvec uses the packed input vector. + // TRANSP: = linalg.matvec ins(%[[T8]] + %8 = linalg.matvec ins(%6, %7 : tensor<24x4xf32>, tensor<4xf32>) outs(%arg4 : tensor<24xf32>) -> tensor<24xf32> + scf.yield %8 : tensor<24xf32> + } + return %1 : tensor<24xf32> +} diff --git a/mlir/test/lib/Dialect/Linalg/TestLinalgCodegenStrategy.cpp b/mlir/test/lib/Dialect/Linalg/TestLinalgCodegenStrategy.cpp --- a/mlir/test/lib/Dialect/Linalg/TestLinalgCodegenStrategy.cpp +++ b/mlir/test/lib/Dialect/Linalg/TestLinalgCodegenStrategy.cpp @@ -109,6 +109,10 @@ *this, "hoist-paddings", llvm::cl::desc("Operand hoisting depths when test-pad-pattern."), llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated}; + ListOption transposePaddings{ + *this, "transpose-paddings", + llvm::cl::desc("Transpose paddings when test-pad-pattern."), + llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated}; Option generalize{*this, "generalize", llvm::cl::desc("Generalize named operations."), llvm::cl::init(false)}; @@ -257,9 +261,24 @@ ? hoistPaddings[opOperand.getOperandNumber()] : 0; }; + auto transposeFunc = [&](OpOperand &opOperand) { + SmallVector transposeVector = {}; + if (opOperand.getOperandNumber() >= transposePaddings.size()) + return transposeVector; + llvm::transform(transposePaddings[opOperand.getOperandNumber()], + std::back_inserter(transposeVector), [](char value) { + std::stringstream str; + int64_t index; + str << value; + str >> index; + return index; + }); + return transposeVector; + }; paddingOptions.setPaddingValueComputationFunction(getNeutralOfLinalgOp); paddingOptions.setPaddingNoFoldComputationFunction(packFunc); paddingOptions.setPaddingHoistComputationFunction(hoistingFunc); + paddingOptions.setPaddingTransposeComputationFunction(transposeFunc); // Compute input padding values only an return failure for output operands. if (padInputsOnly) {